Also plot total downloads.

This commit is contained in:
Julien Palard 2023-06-15 15:30:46 +02:00
parent e566171100
commit a42a00c553
Signed by: mdk
GPG Key ID: 0EFC1AC1006886F8
6 changed files with 49 additions and 57 deletions

View File

@ -1,5 +1,18 @@
# Analysis of version adoptions on PyPI
## pypi.org downloads by version
![](python-versions-lines.png)
![](python-versions-stacked.png)
## pypi.org downloads total
![](pypi-download-counts.png)
## Where does the data come from?
We get
[publicly available PyPI download statistics](https://bigquery.cloud.google.com/table/bigquery-public-data:pypi.downloads)
on Google BigQuery using [pypinfo](https://github.com/ofek/pypinfo/).
@ -20,15 +33,3 @@ Then there are two main invocations; first, fetch the data using:
Then plot it using:
python python-versions.py
## Data
### Percentage of pypi.org downloads
![](python-versions-pct.png)
### Number of pypi.org downloads
![](python-versions.png)

BIN
pypi-download-counts.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

BIN
python-versions-lines.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 84 KiB

View File

Before

Width:  |  Height:  |  Size: 83 KiB

After

Width:  |  Height:  |  Size: 83 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 70 KiB

View File

@ -107,45 +107,10 @@ def fetch_main():
)
def mean_date(a: datetime, b: datetime) -> datetime:
    """Return the point in time halfway between *a* and *b*.

    Computed as ``a`` plus half of the ``b - a`` interval, so the result
    is the midpoint regardless of which of the two arguments is earlier.
    """
    return a + (b - a) / 2
def plot_main():
    """Plot average daily PyPI downloads per Python version.

    Reads the per-version rows from ``DB().fetch_python_version()``, drops
    rows whose download count is at most 1/20th of the largest one, turns
    each remaining count into a per-day average for the month containing
    the middle of the row's period, and saves the resulting chart as
    ``python-versions.png``.

    Nothing is plotted or written when the database returns no rows.
    """
    db = DB()
    # Materialise the rows: they are walked twice below (once for max(),
    # once to build the series), which would silently produce an empty
    # plot if the DB handed back a one-shot iterator.
    versions = list(db.fetch_python_version())
    if not versions:
        # max() would raise ValueError on an empty sequence; there is
        # nothing to draw anyway.
        return

    biggest_value = max(version["download_count"] for version in versions)
    threshold = biggest_value / 20

    # python_version -> [list of dates, list of daily download averages]
    by_version = defaultdict(lambda: [[], []])
    for row in versions:
        if row["download_count"] <= threshold:
            continue
        mid_of_month = mean_date(row["start_date"], row["end_date"])
        _, number_of_days = calendar.monthrange(mid_of_month.year, mid_of_month.month)
        dates, daily_downloads = by_version[row["python_version"]]
        dates.append(mid_of_month)
        daily_downloads.append(row["download_count"] / number_of_days)

    plt.style.use("tableau-colorblind10")
    plt.figure(figsize=(10, 10 * 2 / 3))
    # cycle() is already an iterator; line styles rotate across the
    # smoothed curves only.
    fmt = cycle(["-", "--", ":", "-."])
    for version, (x, y) in by_version.items():
        if version is None:
            continue
        if len(x) <= 2:
            # Too few points for a quadratic spline: plot them as-is.
            plt.plot(x, y, label=version)
            continue
        # NOTE(review): make_interp_spline presumably needs x in increasing
        # order - this relies on the DB returning rows chronologically;
        # confirm against the query.
        smooth_x = np.linspace(date2num(min(x)), date2num(max(x)), 200)
        spline = make_interp_spline([date2num(d) for d in x], y, k=2)
        smooth_y = spline(smooth_x)
        # NOTE(review): plot_date appears to be deprecated in recent
        # Matplotlib releases - verify before upgrading.
        plt.plot_date(smooth_x, smooth_y, label=version, fmt=next(fmt))
    plt.xlabel("Date")
    plt.ylabel("PyPI daily downloads")
    plt.legend()
    plt.savefig("python-versions.png")
HIDE = {"1.17", "2.4", "2.5", "2.6", "3.2", "3.3", "3.4"}
def plot_pct():
def plot():
def by_version(version_string):
try:
minor, major = version_string.split(".")
@ -164,24 +129,29 @@ def plot_pct():
)
versions["download_count"] = pd.to_numeric(versions["download_count"])
versions["Python version"].fillna("Other", inplace=True)
versions = versions.merge(
versions.groupby("start_date").agg(monthly_downloads=("download_count", "sum")),
on="start_date",
download_counts = versions.groupby("start_date").agg(
monthly_downloads=("download_count", "sum")
)
plot_download_counts(download_counts)
versions = versions.merge(download_counts, on="start_date")
versions["pct"] = versions.download_count / versions.monthly_downloads
versions["date"] = pd.to_datetime(versions.start_date).dt.to_period('M')
versions["date"] = pd.to_datetime(versions.start_date).dt.to_period("M")
versions.set_index(["Python version", "date"], inplace=True)
to_plot = versions.pct.unstack(0, fill_value=0)
to_plot.sort_values(
by="Python version", ascending=False, axis=1, inplace=True, key=by_versions
)
pd.options.display.float_format = '{:.2%}'.format
pd.options.display.float_format = "{:.2%}".format
pd.options.display.max_rows = 999
print(to_plot)
for version in HIDE:
del to_plot[version]
del to_plot["Other"]
plt.style.use("tableau-colorblind10")
plot_lines(to_plot)
plot_stacked(to_plot)
def plot_stacked(to_plot):
ax = to_plot.plot.area(
stacked=True,
figsize=(10, 10 * 2 / 3),
@ -190,7 +160,28 @@ def plot_pct():
ylabel="%",
)
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))
plt.savefig("python-versions-pct.png")
plt.savefig("python-versions-stacked.png")
def plot_lines(to_plot):
    """Plot *to_plot* via its ``.plot()`` method and save it to disk.

    The y axis is rendered with a percent formatter where ``1`` means
    100%, and the current figure is written to
    ``python-versions-lines.png``.
    """
    plot_options = {
        "figsize": (10, 10 * 2 / 3),
        "title": "% of PyPI download by Python version",
        "legend": "reverse",
        "ylabel": "%",
    }
    axes = to_plot.plot(**plot_options)
    percent_formatter = mtick.PercentFormatter(xmax=1)
    axes.yaxis.set_major_formatter(percent_formatter)
    plt.savefig("python-versions-lines.png")
def plot_download_counts(to_plot):
    """Plot *to_plot* via its ``.plot()`` method and save it to disk.

    The current figure is written to ``pypi-download-counts.png``.
    """
    plot_options = {
        "figsize": (10, 10 * 2 / 3),
        "title": "PyPI number of downloads",
        "legend": "reverse",
        "xlabel": "date",
    }
    # The returned axes object is not needed: savefig() below acts on the
    # current pyplot figure.
    to_plot.plot(**plot_options)
    plt.savefig("pypi-download-counts.png")
def parse_args():
@ -207,5 +198,5 @@ if __name__ == "__main__":
args = parse_args()
if args.fetch:
fetch_main()
plot_pct()
plot_main()
plt.style.use("tableau-colorblind10")
plot()