Also plot total downloads.
This commit is contained in:
parent
e566171100
commit
a42a00c553
25
README.md
25
README.md
|
@ -1,5 +1,18 @@
|
||||||
# Analysis of version adoptions on PyPI
|
# Analysis of version adoptions on PyPI
|
||||||
|
|
||||||
|
## pypi.org downloads by version
|
||||||
|
|
||||||
|
![](python-versions-lines.png)
|
||||||
|
![](python-versions-stacked.png)
|
||||||
|
|
||||||
|
|
||||||
|
## pypi.org downloads total
|
||||||
|
|
||||||
|
![](pypi-download-counts.png)
|
||||||
|
|
||||||
|
|
||||||
|
## Where does the data come from?
|
||||||
|
|
||||||
We get
|
We get
|
||||||
[publicly available PyPI download statistics](https://bigquery.cloud.google.com/table/bigquery-public-data:pypi.downloads)
|
[publicly available PyPI download statistics](https://bigquery.cloud.google.com/table/bigquery-public-data:pypi.downloads)
|
||||||
on Google BigQuery using [pypinfo](https://github.com/ofek/pypinfo/).
|
on Google BigQuery using [pypinfo](https://github.com/ofek/pypinfo/).
|
||||||
|
@ -20,15 +33,3 @@ Then there are two main invocations; first fetch the data using:
|
||||||
Then plot it using:
|
Then plot it using:
|
||||||
|
|
||||||
python python-versions.py
|
python python-versions.py
|
||||||
|
|
||||||
|
|
||||||
## Data
|
|
||||||
|
|
||||||
### Percentage of pypi.org downloads
|
|
||||||
|
|
||||||
![](python-versions-pct.png)
|
|
||||||
|
|
||||||
|
|
||||||
### Number of pypi.org downloads
|
|
||||||
|
|
||||||
![](python-versions.png)
|
|
||||||
|
|
Binary file not shown.
After Width: | Height: | Size: 34 KiB |
Binary file not shown.
After Width: | Height: | Size: 84 KiB |
Before Width: | Height: | Size: 83 KiB After Width: | Height: | Size: 83 KiB |
Binary file not shown.
Before Width: | Height: | Size: 70 KiB |
|
@ -107,45 +107,10 @@ def fetch_main():
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def mean_date(a: datetime, b: datetime) -> datetime:
|
|
||||||
return a + (b - a) / 2
|
|
||||||
|
|
||||||
|
|
||||||
def plot_main():
|
|
||||||
db = DB()
|
|
||||||
by_version = defaultdict(lambda: [[], []])
|
|
||||||
versions = db.fetch_python_version()
|
|
||||||
biggest_value = max(version["download_count"] for version in versions)
|
|
||||||
for row in versions:
|
|
||||||
if row["download_count"] <= biggest_value / 20:
|
|
||||||
continue
|
|
||||||
mid_of_month = mean_date(row["start_date"], row["end_date"])
|
|
||||||
_, number_of_days = calendar.monthrange(mid_of_month.year, mid_of_month.month)
|
|
||||||
by_version[row["python_version"]][0].append(mid_of_month)
|
|
||||||
by_version[row["python_version"]][1].append(row["download_count"] / number_of_days)
|
|
||||||
plt.style.use("tableau-colorblind10")
|
|
||||||
plt.figure(figsize=(10, 10 * 2 / 3))
|
|
||||||
fmt = iter(cycle(["-", "--", ":", "-."]))
|
|
||||||
for version, (x, y) in by_version.items():
|
|
||||||
if version is None:
|
|
||||||
continue
|
|
||||||
if len(x) <= 2:
|
|
||||||
plt.plot(x, y, label=version)
|
|
||||||
continue
|
|
||||||
smooth_x = np.linspace(date2num(min(x)), date2num(max(x)), 200)
|
|
||||||
spline = make_interp_spline([date2num(d) for d in x], y, k=2)
|
|
||||||
smooth_y = spline(smooth_x)
|
|
||||||
plt.plot_date(smooth_x, smooth_y, label=version, fmt=next(fmt))
|
|
||||||
plt.xlabel("Date")
|
|
||||||
plt.ylabel("PyPI daily downloads")
|
|
||||||
plt.legend()
|
|
||||||
plt.savefig("python-versions.png")
|
|
||||||
|
|
||||||
|
|
||||||
HIDE = {"1.17", "2.4", "2.5", "2.6", "3.2", "3.3", "3.4"}
|
HIDE = {"1.17", "2.4", "2.5", "2.6", "3.2", "3.3", "3.4"}
|
||||||
|
|
||||||
|
|
||||||
def plot_pct():
|
def plot():
|
||||||
def by_version(version_string):
|
def by_version(version_string):
|
||||||
try:
|
try:
|
||||||
minor, major = version_string.split(".")
|
minor, major = version_string.split(".")
|
||||||
|
@ -164,24 +129,29 @@ def plot_pct():
|
||||||
)
|
)
|
||||||
versions["download_count"] = pd.to_numeric(versions["download_count"])
|
versions["download_count"] = pd.to_numeric(versions["download_count"])
|
||||||
versions["Python version"].fillna("Other", inplace=True)
|
versions["Python version"].fillna("Other", inplace=True)
|
||||||
versions = versions.merge(
|
download_counts = versions.groupby("start_date").agg(
|
||||||
versions.groupby("start_date").agg(monthly_downloads=("download_count", "sum")),
|
monthly_downloads=("download_count", "sum")
|
||||||
on="start_date",
|
|
||||||
)
|
)
|
||||||
|
plot_download_counts(download_counts)
|
||||||
|
versions = versions.merge(download_counts, on="start_date")
|
||||||
versions["pct"] = versions.download_count / versions.monthly_downloads
|
versions["pct"] = versions.download_count / versions.monthly_downloads
|
||||||
versions["date"] = pd.to_datetime(versions.start_date).dt.to_period('M')
|
versions["date"] = pd.to_datetime(versions.start_date).dt.to_period("M")
|
||||||
versions.set_index(["Python version", "date"], inplace=True)
|
versions.set_index(["Python version", "date"], inplace=True)
|
||||||
to_plot = versions.pct.unstack(0, fill_value=0)
|
to_plot = versions.pct.unstack(0, fill_value=0)
|
||||||
to_plot.sort_values(
|
to_plot.sort_values(
|
||||||
by="Python version", ascending=False, axis=1, inplace=True, key=by_versions
|
by="Python version", ascending=False, axis=1, inplace=True, key=by_versions
|
||||||
)
|
)
|
||||||
pd.options.display.float_format = '{:.2%}'.format
|
pd.options.display.float_format = "{:.2%}".format
|
||||||
pd.options.display.max_rows = 999
|
pd.options.display.max_rows = 999
|
||||||
print(to_plot)
|
print(to_plot)
|
||||||
for version in HIDE:
|
for version in HIDE:
|
||||||
del to_plot[version]
|
del to_plot[version]
|
||||||
del to_plot["Other"]
|
del to_plot["Other"]
|
||||||
plt.style.use("tableau-colorblind10")
|
plot_lines(to_plot)
|
||||||
|
plot_stacked(to_plot)
|
||||||
|
|
||||||
|
|
||||||
|
def plot_stacked(to_plot):
|
||||||
ax = to_plot.plot.area(
|
ax = to_plot.plot.area(
|
||||||
stacked=True,
|
stacked=True,
|
||||||
figsize=(10, 10 * 2 / 3),
|
figsize=(10, 10 * 2 / 3),
|
||||||
|
@ -190,7 +160,28 @@ def plot_pct():
|
||||||
ylabel="%",
|
ylabel="%",
|
||||||
)
|
)
|
||||||
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))
|
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))
|
||||||
plt.savefig("python-versions-pct.png")
|
plt.savefig("python-versions-stacked.png")
|
||||||
|
|
||||||
|
|
||||||
|
def plot_lines(to_plot):
|
||||||
|
ax = to_plot.plot(
|
||||||
|
figsize=(10, 10 * 2 / 3),
|
||||||
|
title="% of PyPI download by Python version",
|
||||||
|
legend="reverse",
|
||||||
|
ylabel="%",
|
||||||
|
)
|
||||||
|
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))
|
||||||
|
plt.savefig("python-versions-lines.png")
|
||||||
|
|
||||||
|
|
||||||
|
def plot_download_counts(to_plot):
|
||||||
|
ax = to_plot.plot(
|
||||||
|
figsize=(10, 10 * 2 / 3),
|
||||||
|
title="PyPI number of downloads",
|
||||||
|
legend="reverse",
|
||||||
|
xlabel="date",
|
||||||
|
)
|
||||||
|
plt.savefig("pypi-download-counts.png")
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
|
@ -207,5 +198,5 @@ if __name__ == "__main__":
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
if args.fetch:
|
if args.fetch:
|
||||||
fetch_main()
|
fetch_main()
|
||||||
plot_pct()
|
plt.style.use("tableau-colorblind10")
|
||||||
plot_main()
|
plot()
|
||||||
|
|
Loading…
Reference in New Issue