python-versions/python-versions.py

203 lines
6.1 KiB
Python
Raw Permalink Normal View History

2021-10-03 13:39:45 +00:00
"""Module to fetch and graph adoption of Python releases.
"""
2022-08-19 11:25:27 +00:00
import argparse
2021-10-03 13:39:45 +00:00
import calendar
import sqlite3
from datetime import datetime, timedelta, date
from collections import defaultdict
2021-12-24 14:52:20 +00:00
from itertools import cycle, count
2021-10-03 13:39:45 +00:00
2021-12-07 23:24:09 +00:00
import pandas as pd
2021-10-03 13:39:45 +00:00
from pypinfo.fields import PythonVersion
from pypinfo.core import build_query, create_client, create_config, parse_query_result
from pypinfo.db import get_credentials
import matplotlib.pyplot as plt
2021-10-23 16:17:21 +00:00
from matplotlib.dates import date2num
2023-04-03 09:30:51 +00:00
import matplotlib.ticker as mtick
2021-10-23 16:17:21 +00:00
from scipy.interpolate import make_interp_spline
import numpy as np
2021-10-03 13:39:45 +00:00
class DB:
    """SQLite store of PyPI download counts per Python version and date range.

    Rows live in the ``python_version`` table with the columns ``start_date``,
    ``end_date``, ``python_version`` (may be NULL) and ``download_count``.
    Instances can be used as context managers to close the connection.
    """

    def __init__(self, path="python-versions.sqlite"):
        """Open the database at *path* and make sure the schema exists.

        *path* defaults to the file name this class has always used, so
        ``DB()`` keeps working unchanged; ``":memory:"`` gives a throwaway
        database.
        """
        # Explicit converter for the ``[date]`` column tags used in
        # fetch_python_version(); it parses the same ISO text as sqlite3's
        # built-in date converter, which is deprecated since Python 3.12.
        sqlite3.register_converter(
            "date", lambda raw: date.fromisoformat(raw.decode())
        )
        self.connection = sqlite3.connect(
            path,
            # No implicit transactions: each statement is committed as it runs.
            isolation_level=None,
            detect_types=sqlite3.PARSE_COLNAMES,
        )
        self.connection.row_factory = sqlite3.Row
        self.migrate()

    def close(self):
        """Close the underlying SQLite connection."""
        self.connection.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def migrate(self):
        """Create the ``python_version`` table if it does not exist yet."""
        self.connection.execute(
            """CREATE TABLE IF NOT EXISTS python_version (
            "id" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
            "start_date" TEXT NOT NULL,
            "end_date" TEXT NOT NULL,
            "python_version" TEXT NULL,
            "download_count" INT NOT NULL);"""
        )

    def store_python_version(
        self, start_date, end_date, python_version, download_count
    ):
        """Insert one row: *download_count* for *python_version* in the range.

        The dates may be ``date`` objects or ISO strings; they are converted
        to text here instead of relying on sqlite3's implicit date adapter
        (deprecated since Python 3.12).
        """
        self.connection.execute(
            "INSERT INTO python_version (start_date, end_date, python_version, download_count) VALUES (?, ?, ?, ?)",
            (str(start_date), str(end_date), python_version, download_count),
        )

    def have_data_for_dates(self, start_date, end_date) -> bool:
        """Return True if at least one row exists for exactly this date range."""
        row_count = self.connection.execute(
            "SELECT COUNT(1) FROM python_version WHERE start_date = ? AND end_date = ?",
            (str(start_date), str(end_date)),
        ).fetchone()[0]
        return row_count > 0

    def fetch_python_version(self):
        """Return every stored row ordered by ``start_date``.

        Each row holds start date, end date, Python version and download
        count, in that order; the two dates come back as ``date`` objects.
        """
        return self.connection.execute(
            """
            SELECT start_date as "start_date [date]",
            end_date as "end_date [date]",
            python_version,
            download_count
            FROM python_version
            ORDER BY start_date"""
        ).fetchall()
def query_python_versions(start_date: str, end_date: str) -> list[tuple[str, int]]:
    """Run the pypinfo per-Python-version query for the given date range.

    The query is built with an empty project name and the ``PythonVersion``
    field, then executed with a 120 second timeout on the result. Every
    result row is returned converted to a tuple.
    """
    sql = build_query(
        "", [PythonVersion], start_date=start_date, end_date=end_date
    )
    with create_client(get_credentials()) as bigquery:
        job = bigquery.query(sql, job_config=create_config())
        return list(map(tuple, job.result(timeout=120)))
def fetch_main():
    """Fetch and store every month of statistics that is not stored yet.

    Months are visited from December down to January, for the current year
    back to 2017. A month is skipped when the first day of the following
    month is still in the future or when the database already has rows for
    that exact date range.
    """
    storage = DB()
    today = date.today()
    # There's no data before 2017.
    for year in range(today.year, 2016, -1):
        for month in range(12, 0, -1):
            month_start = date(year, month, 1)
            # First day of the following month (rolls over into next year
            # after December).
            month_end = date(year + month // 12, month % 12 + 1, 1)
            if month_end > today or storage.have_data_for_dates(
                month_start, month_end
            ):
                continue
            print(f"Querying BigTable in [{month_start}; {month_end}]")
            rows = query_python_versions(str(month_start), str(month_end))
            for python_version, download_count in rows:
                storage.store_python_version(
                    month_start, month_end, python_version, download_count
                )
# Version labels whose columns plot() removes from the table before drawing
# the line and stacked charts.
HIDE = {
    "1.17",
    "2.4",
    "2.5",
    "2.6",
    "3.2",
    "3.3",
    "3.4",
}
2023-06-15 13:30:46 +00:00
def plot():
    """Render every chart from the download statistics stored in the database.

    Loads all rows, saves the total-downloads chart, converts the counts into
    each version's share of the downloads sharing its start date, prints the
    resulting date-by-version table, then saves the line and stacked-area
    charts with the versions listed in ``HIDE`` and the "Other" bucket left
    out.
    """

    def by_version(version_string):
        """Return a numeric ``(major, minor)`` sort key for an "X.Y" string.

        Anything that is not exactly two numeric components (for example
        "Other") gets ``(0, 0)``.
        """
        try:
            major, minor = version_string.split(".")
            return float(major), float(minor)
        except ValueError:
            return 0, 0

    def by_versions(version_strings):
        """Vectorized sort key: apply ``by_version`` to every label."""
        return version_strings.map(by_version)

    db = DB()
    versions = pd.DataFrame(
        db.fetch_python_version(),
        columns=["start_date", "end_date", "Python version", "download_count"],
        dtype="str",
    )
    versions["download_count"] = pd.to_numeric(versions["download_count"])
    # Rows stored without a version are grouped under "Other". Assign the
    # result back: calling fillna(inplace=True) on a column selection does not
    # update the frame once pandas Copy-on-Write is active.
    versions["Python version"] = versions["Python version"].fillna("Other")

    download_counts = versions.groupby("start_date").agg(
        monthly_downloads=("download_count", "sum")
    )
    plot_download_counts(download_counts)

    versions = versions.merge(download_counts, on="start_date")
    versions["pct"] = versions.download_count / versions.monthly_downloads
    versions["date"] = pd.to_datetime(versions.start_date).dt.to_period("M")
    versions = versions.set_index(["Python version", "date"])

    # One row per month, one column per version, newest version first.
    to_plot = versions.pct.unstack(0, fill_value=0)
    to_plot = to_plot.sort_values(
        by="Python version", ascending=False, axis=1, key=by_versions
    )

    # Scope the display settings to this print instead of changing them for
    # the rest of the process.
    with pd.option_context(
        "display.float_format", "{:.2%}".format, "display.max_rows", 999
    ):
        print(to_plot)

    # errors="ignore": a hidden version (or "Other") that never occurs in the
    # stored data must not abort the plotting with a KeyError.
    to_plot = to_plot.drop(columns=[*HIDE, "Other"], errors="ignore")
    plot_lines(to_plot)
    plot_stacked(to_plot)
def plot_stacked(to_plot):
    """Draw *to_plot* as a stacked area chart with a percent y axis.

    The figure is written to ``python-versions-stacked.png``.
    """
    width = 10
    axes = to_plot.plot(
        kind="area",
        stacked=True,
        figsize=(width, width * 2 / 3),
        title="% of PyPI download by Python version",
        legend="reverse",
        ylabel="%",
    )
    # Values are fractions of 1, so 1.0 is rendered as 100%.
    percent = mtick.PercentFormatter(xmax=1)
    axes.yaxis.set_major_formatter(percent)
    plt.savefig("python-versions-stacked.png")
def plot_lines(to_plot):
    """Draw *to_plot* as a line chart with a percent y axis.

    The figure is written to ``python-versions-lines.png``.
    """
    width = 10
    axes = to_plot.plot.line(
        figsize=(width, width * 2 / 3),
        title="% of PyPI download by Python version",
        legend="reverse",
        ylabel="%",
    )
    # Values are fractions of 1, so 1.0 is rendered as 100%.
    percent = mtick.PercentFormatter(xmax=1)
    axes.yaxis.set_major_formatter(percent)
    plt.savefig("python-versions-lines.png")
def plot_download_counts(to_plot):
    """Draw *to_plot* as a line chart and save it.

    The figure is written to ``pypi-download-counts.png``.
    """
    width = 10
    to_plot.plot.line(
        figsize=(width, width * 2 / 3),
        title="PyPI number of downloads",
        legend="reverse",
        xlabel="date",
    )
    plt.savefig("pypi-download-counts.png")
2021-12-07 23:24:09 +00:00
2022-08-19 11:25:27 +00:00
def parse_args(argv=None):
    """Parse the command line and return the resulting namespace.

    *argv* is the list of arguments to parse; when it is None (the default)
    argparse reads ``sys.argv[1:]``, which is what the script entry point
    relies on. The namespace has a boolean ``fetch`` attribute set by the
    ``--fetch`` flag.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--fetch",
        action="store_true",
        help="Fetch more data instead of just displaying them",
    )
    return parser.parse_args(argv)
2021-10-03 13:39:45 +00:00
if __name__ == "__main__":
    cli_options = parse_args()
    # Fetching is opt-in; the charts are drawn from whatever is stored.
    if cli_options.fetch:
        fetch_main()
    plt.style.use("tableau-colorblind10")
    plot()