Also plot total downloads.

This commit is contained in:
Julien Palard 2023-06-15 15:30:46 +02:00
parent e566171100
commit a42a00c553
Signed by: mdk
GPG Key ID: 0EFC1AC1006886F8
6 changed files with 49 additions and 57 deletions

View File

@ -1,5 +1,18 @@
# Analysis of version adoptions on PyPI # Analysis of version adoptions on PyPI
## pypi.org downloads by version
![](python-versions-lines.png)
![](python-versions-stacked.png)
## pypi.org downloads total
![](pypi-download-counts.png)
## Where does the data come from?
We get We get
[publicly available PyPI download statistics](https://bigquery.cloud.google.com/table/bigquery-public-data:pypi.downloads) [publicly available PyPI download statistics](https://bigquery.cloud.google.com/table/bigquery-public-data:pypi.downloads)
on Google BigQuery using [pypinfo](https://github.com/ofek/pypinfo/). on Google BigQuery using [pypinfo](https://github.com/ofek/pypinfo/).
@ -20,15 +33,3 @@ Then there's two main invocations, first fetch the data using:
Then plot it using: Then plot it using:
python python-versions.py python python-versions.py
## Data
### Percentage of pypi.org downloads
![](python-versions-pct.png)
### Number of pypi.org downloads
![](python-versions.png)

BIN
pypi-download-counts.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

BIN
python-versions-lines.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 84 KiB

View File

Before

Width:  |  Height:  |  Size: 83 KiB

After

Width:  |  Height:  |  Size: 83 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 70 KiB

View File

@ -107,45 +107,10 @@ def fetch_main():
) )
def mean_date(a: datetime, b: datetime) -> datetime:
    """Return the point in time halfway between *a* and *b*.

    The argument order does not matter: the offset ``(b - a) / 2`` is
    negative when *b* precedes *a*, so the result is the same midpoint.
    """
    return a + (b - a) / 2
def plot_main():
    """Plot average daily PyPI downloads per Python version.

    Rows come from ``DB().fetch_python_version()``. Rows whose download count
    is at most 1/20th of the largest count are dropped. Each remaining row is
    converted to a per-day average for the month containing the midpoint of
    its ``start_date``/``end_date``, and the chart is written to
    ``python-versions.png``.
    """
    db = DB()
    # Materialize once: the rows are walked twice below (max, then the loop),
    # which would silently yield nothing on the second pass for an iterator.
    versions = list(db.fetch_python_version())
    # default=0 keeps an empty database from raising ValueError in max().
    biggest_value = max(
        (version["download_count"] for version in versions), default=0
    )
    threshold = biggest_value / 20

    # python_version -> [dates, average downloads per day]
    by_version = defaultdict(lambda: [[], []])
    for row in versions:
        if row["download_count"] <= threshold:
            continue
        mid_of_month = mean_date(row["start_date"], row["end_date"])
        _, number_of_days = calendar.monthrange(
            mid_of_month.year, mid_of_month.month
        )
        dates, daily_downloads = by_version[row["python_version"]]
        dates.append(mid_of_month)
        daily_downloads.append(row["download_count"] / number_of_days)

    plt.style.use("tableau-colorblind10")
    plt.figure(figsize=(10, 10 * 2 / 3))
    # cycle() is already an iterator; only smoothed series consume a style.
    line_styles = cycle(["-", "--", ":", "-."])
    for version, (x, y) in by_version.items():
        if version is None:
            continue
        if len(x) <= 2:
            # Too few points for the quadratic (k=2) spline: plot them raw.
            plt.plot(x, y, label=version)
            continue
        smooth_x = np.linspace(date2num(min(x)), date2num(max(x)), 200)
        spline = make_interp_spline([date2num(d) for d in x], y, k=2)
        smooth_y = spline(smooth_x)
        plt.plot_date(smooth_x, smooth_y, label=version, fmt=next(line_styles))
    plt.xlabel("Date")
    plt.ylabel("PyPI daily downloads")
    plt.legend()
    plt.savefig("python-versions.png")
HIDE = {"1.17", "2.4", "2.5", "2.6", "3.2", "3.3", "3.4"} HIDE = {"1.17", "2.4", "2.5", "2.6", "3.2", "3.3", "3.4"}
def plot_pct(): def plot():
def by_version(version_string): def by_version(version_string):
try: try:
minor, major = version_string.split(".") minor, major = version_string.split(".")
@ -164,24 +129,29 @@ def plot_pct():
) )
versions["download_count"] = pd.to_numeric(versions["download_count"]) versions["download_count"] = pd.to_numeric(versions["download_count"])
versions["Python version"].fillna("Other", inplace=True) versions["Python version"].fillna("Other", inplace=True)
versions = versions.merge( download_counts = versions.groupby("start_date").agg(
versions.groupby("start_date").agg(monthly_downloads=("download_count", "sum")), monthly_downloads=("download_count", "sum")
on="start_date",
) )
plot_download_counts(download_counts)
versions = versions.merge(download_counts, on="start_date")
versions["pct"] = versions.download_count / versions.monthly_downloads versions["pct"] = versions.download_count / versions.monthly_downloads
versions["date"] = pd.to_datetime(versions.start_date).dt.to_period('M') versions["date"] = pd.to_datetime(versions.start_date).dt.to_period("M")
versions.set_index(["Python version", "date"], inplace=True) versions.set_index(["Python version", "date"], inplace=True)
to_plot = versions.pct.unstack(0, fill_value=0) to_plot = versions.pct.unstack(0, fill_value=0)
to_plot.sort_values( to_plot.sort_values(
by="Python version", ascending=False, axis=1, inplace=True, key=by_versions by="Python version", ascending=False, axis=1, inplace=True, key=by_versions
) )
pd.options.display.float_format = '{:.2%}'.format pd.options.display.float_format = "{:.2%}".format
pd.options.display.max_rows = 999 pd.options.display.max_rows = 999
print(to_plot) print(to_plot)
for version in HIDE: for version in HIDE:
del to_plot[version] del to_plot[version]
del to_plot["Other"] del to_plot["Other"]
plt.style.use("tableau-colorblind10") plot_lines(to_plot)
plot_stacked(to_plot)
def plot_stacked(to_plot):
ax = to_plot.plot.area( ax = to_plot.plot.area(
stacked=True, stacked=True,
figsize=(10, 10 * 2 / 3), figsize=(10, 10 * 2 / 3),
@ -190,7 +160,28 @@ def plot_pct():
ylabel="%", ylabel="%",
) )
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1)) ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))
plt.savefig("python-versions-pct.png") plt.savefig("python-versions-stacked.png")
def plot_lines(to_plot):
    """Save a line chart of *to_plot* as ``python-versions-lines.png``.

    *to_plot* is plotted with the pandas ``.plot()`` accessor, one line per
    column. The y axis is formatted as a percentage where a value of 1 is
    shown as 100%.
    """
    ax = to_plot.plot(
        figsize=(10, 10 * 2 / 3),
        title="% of PyPI download by Python version",
        legend="reverse",
        ylabel="%",
    )
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))
    # Save the figure that owns these axes rather than pyplot's implicit
    # "current figure", so the output cannot pick up another open figure.
    ax.figure.savefig("python-versions-lines.png")
def plot_download_counts(to_plot):
    """Save a line chart of *to_plot* as ``pypi-download-counts.png``.

    *to_plot* is plotted with the pandas ``.plot()`` accessor, one line per
    column, with the x axis labelled "date".
    """
    ax = to_plot.plot(
        figsize=(10, 10 * 2 / 3),
        title="PyPI number of downloads",
        legend="reverse",
        xlabel="date",
    )
    # Save the figure that owns these axes rather than pyplot's implicit
    # "current figure", so the output cannot pick up another open figure.
    ax.figure.savefig("pypi-download-counts.png")
def parse_args(): def parse_args():
@ -207,5 +198,5 @@ if __name__ == "__main__":
args = parse_args() args = parse_args()
if args.fetch: if args.fetch:
fetch_main() fetch_main()
plot_pct() plt.style.use("tableau-colorblind10")
plot_main() plot()