Also plot total downloads.
This commit is contained in:
parent
e566171100
commit
a42a00c553
25
README.md
25
README.md
|
@ -1,5 +1,18 @@
|
|||
# Analysis of version adoptions on PyPI
|
||||
|
||||
## pypi.org downloads by version
|
||||
|
||||
![](python-versions-lines.png)
|
||||
![](python-versions-stacked.png)
|
||||
|
||||
|
||||
## pypi.org downloads total
|
||||
|
||||
![](pypi-download-counts.png)
|
||||
|
||||
|
||||
## Where does the data come from?
|
||||
|
||||
We get
|
||||
[publicly available PyPI download statistics](https://bigquery.cloud.google.com/table/bigquery-public-data:pypi.downloads)
|
||||
on Google BigQuery using [pypinfo](https://github.com/ofek/pypinfo/).
|
||||
|
@ -20,15 +33,3 @@ Then there's two main invocations, first fetch the data using:
|
|||
Then plot it using:
|
||||
|
||||
python python-versions.py
|
||||
|
||||
|
||||
## Data
|
||||
|
||||
### Percentage of pypi.org downloads
|
||||
|
||||
![](python-versions-pct.png)
|
||||
|
||||
|
||||
### Number of pypi.org downloads
|
||||
|
||||
![](python-versions.png)
|
||||
|
|
Binary file not shown.
After Width: | Height: | Size: 34 KiB |
Binary file not shown.
After Width: | Height: | Size: 84 KiB |
Before Width: | Height: | Size: 83 KiB After Width: | Height: | Size: 83 KiB |
Binary file not shown.
Before Width: | Height: | Size: 70 KiB |
|
@ -107,45 +107,10 @@ def fetch_main():
|
|||
)
|
||||
|
||||
|
||||
def mean_date(a: datetime, b: datetime) -> datetime:
    """Return the point in time halfway between ``a`` and ``b``.

    The result is ``a`` shifted by half of the signed difference ``b - a``,
    so the order of the arguments does not matter for the midpoint itself.
    """
    half_span = (b - a) / 2
    return a + half_span
|
||||
|
||||
|
||||
def plot_main():
    """Plot download rates per Python version into ``python-versions.png``.

    Rows are read from ``DB().fetch_python_version()``. A row is ignored when
    its ``download_count`` is at most one twentieth of the largest
    ``download_count`` seen. Every remaining row yields one point for its
    ``python_version``: x is the midpoint between ``start_date`` and
    ``end_date``, y is ``download_count`` divided by the number of days in
    the midpoint's calendar month.

    Versions with more than two points are drawn as a ``k=2`` interpolating
    spline sampled at 200 evenly spaced positions; versions with fewer points
    are drawn directly. Rows whose version is ``None`` are not drawn.
    """
    database = DB()
    series = defaultdict(lambda: [[], []])
    rows = database.fetch_python_version()

    # Rows at or below 5% of the largest monthly count are left out.
    cutoff = max(entry["download_count"] for entry in rows) / 20
    for entry in rows:
        if entry["download_count"] <= cutoff:
            continue
        midpoint = mean_date(entry["start_date"], entry["end_date"])
        days_in_month = calendar.monthrange(midpoint.year, midpoint.month)[1]
        dates, daily = series[entry["python_version"]]
        dates.append(midpoint)
        # Normalise the monthly total to a per-day figure.
        daily.append(entry["download_count"] / days_in_month)

    plt.style.use("tableau-colorblind10")
    plt.figure(figsize=(10, 10 * 2 / 3))
    line_styles = cycle(["-", "--", ":", "-."])
    for version, (dates, daily) in series.items():
        if version is None:
            continue
        if len(dates) > 2:
            # Enough points for a degree-2 spline: draw a smoothed curve.
            sample_x = np.linspace(date2num(min(dates)), date2num(max(dates)), 200)
            curve = make_interp_spline([date2num(d) for d in dates], daily, k=2)
            sample_y = curve(sample_x)
            plt.plot_date(sample_x, sample_y, label=version, fmt=next(line_styles))
        else:
            # Too few points to interpolate: plot the raw values.
            plt.plot(dates, daily, label=version)

    plt.xlabel("Date")
    plt.ylabel("PyPI daily downloads")
    plt.legend()
    plt.savefig("python-versions.png")
|
||||
|
||||
|
||||
# Version labels whose columns are deleted from the pivoted ``to_plot`` table
# before the charts are drawn.
# NOTE(review): presumably these are versions with a negligible share (and
# "1.17" looks like a bogus reported value) — confirm against the data.
HIDE = {"1.17", "2.4", "2.5", "2.6", "3.2", "3.3", "3.4"}
|
||||
|
||||
|
||||
def plot_pct():
|
||||
def plot():
|
||||
def by_version(version_string):
|
||||
try:
|
||||
minor, major = version_string.split(".")
|
||||
|
@ -164,24 +129,29 @@ def plot_pct():
|
|||
)
|
||||
versions["download_count"] = pd.to_numeric(versions["download_count"])
|
||||
versions["Python version"].fillna("Other", inplace=True)
|
||||
versions = versions.merge(
|
||||
versions.groupby("start_date").agg(monthly_downloads=("download_count", "sum")),
|
||||
on="start_date",
|
||||
download_counts = versions.groupby("start_date").agg(
|
||||
monthly_downloads=("download_count", "sum")
|
||||
)
|
||||
plot_download_counts(download_counts)
|
||||
versions = versions.merge(download_counts, on="start_date")
|
||||
versions["pct"] = versions.download_count / versions.monthly_downloads
|
||||
versions["date"] = pd.to_datetime(versions.start_date).dt.to_period('M')
|
||||
versions["date"] = pd.to_datetime(versions.start_date).dt.to_period("M")
|
||||
versions.set_index(["Python version", "date"], inplace=True)
|
||||
to_plot = versions.pct.unstack(0, fill_value=0)
|
||||
to_plot.sort_values(
|
||||
by="Python version", ascending=False, axis=1, inplace=True, key=by_versions
|
||||
)
|
||||
pd.options.display.float_format = '{:.2%}'.format
|
||||
pd.options.display.float_format = "{:.2%}".format
|
||||
pd.options.display.max_rows = 999
|
||||
print(to_plot)
|
||||
for version in HIDE:
|
||||
del to_plot[version]
|
||||
del to_plot["Other"]
|
||||
plt.style.use("tableau-colorblind10")
|
||||
plot_lines(to_plot)
|
||||
plot_stacked(to_plot)
|
||||
|
||||
|
||||
def plot_stacked(to_plot):
|
||||
ax = to_plot.plot.area(
|
||||
stacked=True,
|
||||
figsize=(10, 10 * 2 / 3),
|
||||
|
@ -190,7 +160,28 @@ def plot_pct():
|
|||
ylabel="%",
|
||||
)
|
||||
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))
|
||||
plt.savefig("python-versions-pct.png")
|
||||
plt.savefig("python-versions-stacked.png")
|
||||
|
||||
|
||||
def plot_lines(to_plot):
    """Draw ``to_plot`` as a line chart and save it as ``python-versions-lines.png``.

    ``to_plot`` is plotted through its own ``.plot`` method; the y axis is then
    formatted as percentages, with a value of 1 shown as 100%.
    """
    chart_options = {
        "figsize": (10, 10 * 2 / 3),
        "title": "% of PyPI download by Python version",
        "legend": "reverse",
        "ylabel": "%",
    }
    axes = to_plot.plot(**chart_options)
    percent_ticks = mtick.PercentFormatter(xmax=1)
    axes.yaxis.set_major_formatter(percent_ticks)
    plt.savefig("python-versions-lines.png")
|
||||
|
||||
|
||||
def plot_download_counts(to_plot):
    """Draw ``to_plot`` as a line chart and save it as ``pypi-download-counts.png``.

    ``to_plot`` is plotted through its own ``.plot`` method with a fixed
    figure size, title, reversed legend and an x axis labelled ``date``.
    """
    chart_options = {
        "figsize": (10, 10 * 2 / 3),
        "title": "PyPI number of downloads",
        "legend": "reverse",
        "xlabel": "date",
    }
    to_plot.plot(**chart_options)
    plt.savefig("pypi-download-counts.png")
|
||||
|
||||
|
||||
def parse_args():
|
||||
|
@ -207,5 +198,5 @@ if __name__ == "__main__":
|
|||
args = parse_args()
|
||||
if args.fetch:
|
||||
fetch_main()
|
||||
plot_pct()
|
||||
plot_main()
|
||||
plt.style.use("tableau-colorblind10")
|
||||
plot()
|
||||
|
|
Loading…
Reference in New Issue