python-versions/python-versions.py

203 lines
6.1 KiB
Python

"""Module to fetch and graph adoption of Python releases.
"""
import argparse
import calendar
import sqlite3
from datetime import datetime, timedelta, date
from collections import defaultdict
from itertools import cycle, count
import pandas as pd
from pypinfo.fields import PythonVersion
from pypinfo.core import build_query, create_client, create_config, parse_query_result
from pypinfo.db import get_credentials
import matplotlib.pyplot as plt
from matplotlib.dates import date2num
import matplotlib.ticker as mtick
from scipy.interpolate import make_interp_spline
import numpy as np
class DB:
def __init__(self):
self.connection = sqlite3.connect(
"python-versions.sqlite",
isolation_level=None,
detect_types=sqlite3.PARSE_COLNAMES,
)
self.connection.row_factory = sqlite3.Row
self.migrate()
def migrate(self):
self.connection.execute(
"""CREATE TABLE IF NOT EXISTS python_version (
"id" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
"start_date" TEXT NOT NULL,
"end_date" TEXT NOT NULL,
"python_version" TEXT NULL,
"download_count" INT NOT NULL);"""
)
def store_python_version(
self, start_date, end_date, python_version, download_count
):
self.connection.execute(
"INSERT INTO python_version (start_date, end_date, python_version, download_count) VALUES (?, ?, ?, ?)",
(start_date, end_date, python_version, download_count),
)
def have_data_for_dates(self, start_date, end_date) -> bool:
return (
self.connection.execute(
"SELECT COUNT(1) FROM python_version WHERE start_date = ? AND end_date = ?",
(start_date, end_date),
).fetchone()[0]
> 0
)
def fetch_python_version(self):
return self.connection.execute(
"""
SELECT start_date as "start_date [date]",
end_date as "end_date [date]",
python_version,
download_count
FROM python_version
ORDER BY start_date"""
).fetchall()
def query_python_versions(start_date: str, end_date: str) -> list[tuple[str, int]]:
built_query = build_query(
"",
[PythonVersion],
start_date=start_date,
end_date=end_date,
)
with create_client(get_credentials()) as client:
query_job = client.query(built_query, job_config=create_config())
query_rows = query_job.result(timeout=120)
return [tuple(row) for row in query_rows]
def fetch_main():
db = DB()
today = date.today()
for year_in_the_past in count():
year = today.year - year_in_the_past
if year < 2017:
# There's no data before 2017.
return
for month in reversed(range(1, 13)):
start_date = date(year, month, 1)
end_date = start_date.replace(
day=calendar.monthrange(year, month)[1]
) + timedelta(days=1)
if end_date > today:
continue
if db.have_data_for_dates(start_date, end_date):
continue
print(f"Querying BigTable in [{start_date}; {end_date}]")
results = query_python_versions(str(start_date), str(end_date))
for python_version, download_count in results:
db.store_python_version(
start_date, end_date, python_version, download_count
)
HIDE = {"1.17", "2.4", "2.5", "2.6", "3.2", "3.3", "3.4"}
def plot():
def by_version(version_string):
try:
minor, major = version_string.split(".")
return float(minor), float(major)
except ValueError:
return 0, 0
def by_versions(version_strings):
return version_strings.map(by_version)
db = DB()
versions = pd.DataFrame(
db.fetch_python_version(),
columns=["start_date", "end_date", "Python version", "download_count"],
dtype="str",
)
versions["download_count"] = pd.to_numeric(versions["download_count"])
versions["Python version"].fillna("Other", inplace=True)
download_counts = versions.groupby("start_date").agg(
monthly_downloads=("download_count", "sum")
)
plot_download_counts(download_counts)
versions = versions.merge(download_counts, on="start_date")
versions["pct"] = versions.download_count / versions.monthly_downloads
versions["date"] = pd.to_datetime(versions.start_date).dt.to_period("M")
versions.set_index(["Python version", "date"], inplace=True)
to_plot = versions.pct.unstack(0, fill_value=0)
to_plot.sort_values(
by="Python version", ascending=False, axis=1, inplace=True, key=by_versions
)
pd.options.display.float_format = "{:.2%}".format
pd.options.display.max_rows = 999
print(to_plot)
for version in HIDE:
del to_plot[version]
del to_plot["Other"]
plot_lines(to_plot)
plot_stacked(to_plot)
def plot_stacked(to_plot):
ax = to_plot.plot.area(
stacked=True,
figsize=(10, 10 * 2 / 3),
title="% of PyPI download by Python version",
legend="reverse",
ylabel="%",
)
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))
plt.savefig("python-versions-stacked.png")
def plot_lines(to_plot):
ax = to_plot.plot(
figsize=(10, 10 * 2 / 3),
title="% of PyPI download by Python version",
legend="reverse",
ylabel="%",
)
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))
plt.savefig("python-versions-lines.png")
def plot_download_counts(to_plot):
ax = to_plot.plot(
figsize=(10, 10 * 2 / 3),
title="PyPI number of downloads",
legend="reverse",
xlabel="date",
)
plt.savefig("pypi-download-counts.png")
def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--fetch",
action="store_true",
help="Fetch more data instead of just displaying them",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
if args.fetch:
fetch_main()
plt.style.use("tableau-colorblind10")
plot()