203 lines
6.1 KiB
Python
203 lines
6.1 KiB
Python
"""Module to fetch and graph adoption of Python releases.
|
|
"""
|
|
|
|
import argparse
|
|
import calendar
|
|
import sqlite3
|
|
from datetime import datetime, timedelta, date
|
|
from collections import defaultdict
|
|
from itertools import cycle, count
|
|
|
|
import pandas as pd
|
|
from pypinfo.fields import PythonVersion
|
|
from pypinfo.core import build_query, create_client, create_config, parse_query_result
|
|
from pypinfo.db import get_credentials
|
|
import matplotlib.pyplot as plt
|
|
from matplotlib.dates import date2num
|
|
import matplotlib.ticker as mtick
|
|
from scipy.interpolate import make_interp_spline
|
|
import numpy as np
|
|
|
|
|
|
class DB:
|
|
def __init__(self):
|
|
self.connection = sqlite3.connect(
|
|
"python-versions.sqlite",
|
|
isolation_level=None,
|
|
detect_types=sqlite3.PARSE_COLNAMES,
|
|
)
|
|
self.connection.row_factory = sqlite3.Row
|
|
self.migrate()
|
|
|
|
def migrate(self):
|
|
self.connection.execute(
|
|
"""CREATE TABLE IF NOT EXISTS python_version (
|
|
"id" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
|
|
"start_date" TEXT NOT NULL,
|
|
"end_date" TEXT NOT NULL,
|
|
"python_version" TEXT NULL,
|
|
"download_count" INT NOT NULL);"""
|
|
)
|
|
|
|
def store_python_version(
|
|
self, start_date, end_date, python_version, download_count
|
|
):
|
|
self.connection.execute(
|
|
"INSERT INTO python_version (start_date, end_date, python_version, download_count) VALUES (?, ?, ?, ?)",
|
|
(start_date, end_date, python_version, download_count),
|
|
)
|
|
|
|
def have_data_for_dates(self, start_date, end_date) -> bool:
|
|
return (
|
|
self.connection.execute(
|
|
"SELECT COUNT(1) FROM python_version WHERE start_date = ? AND end_date = ?",
|
|
(start_date, end_date),
|
|
).fetchone()[0]
|
|
> 0
|
|
)
|
|
|
|
def fetch_python_version(self):
|
|
return self.connection.execute(
|
|
"""
|
|
SELECT start_date as "start_date [date]",
|
|
end_date as "end_date [date]",
|
|
python_version,
|
|
download_count
|
|
FROM python_version
|
|
ORDER BY start_date"""
|
|
).fetchall()
|
|
|
|
|
|
def query_python_versions(start_date: str, end_date: str) -> list[tuple[str, int]]:
|
|
built_query = build_query(
|
|
"",
|
|
[PythonVersion],
|
|
start_date=start_date,
|
|
end_date=end_date,
|
|
)
|
|
|
|
with create_client(get_credentials()) as client:
|
|
query_job = client.query(built_query, job_config=create_config())
|
|
query_rows = query_job.result(timeout=120)
|
|
return [tuple(row) for row in query_rows]
|
|
|
|
|
|
def fetch_main():
|
|
db = DB()
|
|
today = date.today()
|
|
for year_in_the_past in count():
|
|
year = today.year - year_in_the_past
|
|
if year < 2017:
|
|
# There's no data before 2017.
|
|
return
|
|
for month in reversed(range(1, 13)):
|
|
start_date = date(year, month, 1)
|
|
end_date = start_date.replace(
|
|
day=calendar.monthrange(year, month)[1]
|
|
) + timedelta(days=1)
|
|
if end_date > today:
|
|
continue
|
|
if db.have_data_for_dates(start_date, end_date):
|
|
continue
|
|
print(f"Querying BigTable in [{start_date}; {end_date}]")
|
|
results = query_python_versions(str(start_date), str(end_date))
|
|
for python_version, download_count in results:
|
|
db.store_python_version(
|
|
start_date, end_date, python_version, download_count
|
|
)
|
|
|
|
|
|
HIDE = {"1.17", "2.4", "2.5", "2.6", "3.2", "3.3", "3.4"}
|
|
|
|
|
|
def plot():
|
|
def by_version(version_string):
|
|
try:
|
|
minor, major = version_string.split(".")
|
|
return float(minor), float(major)
|
|
except ValueError:
|
|
return 0, 0
|
|
|
|
def by_versions(version_strings):
|
|
return version_strings.map(by_version)
|
|
|
|
db = DB()
|
|
versions = pd.DataFrame(
|
|
db.fetch_python_version(),
|
|
columns=["start_date", "end_date", "Python version", "download_count"],
|
|
dtype="str",
|
|
)
|
|
versions["download_count"] = pd.to_numeric(versions["download_count"])
|
|
versions["Python version"].fillna("Other", inplace=True)
|
|
download_counts = versions.groupby("start_date").agg(
|
|
monthly_downloads=("download_count", "sum")
|
|
)
|
|
plot_download_counts(download_counts)
|
|
versions = versions.merge(download_counts, on="start_date")
|
|
versions["pct"] = versions.download_count / versions.monthly_downloads
|
|
versions["date"] = pd.to_datetime(versions.start_date).dt.to_period("M")
|
|
versions.set_index(["Python version", "date"], inplace=True)
|
|
to_plot = versions.pct.unstack(0, fill_value=0)
|
|
to_plot.sort_values(
|
|
by="Python version", ascending=False, axis=1, inplace=True, key=by_versions
|
|
)
|
|
pd.options.display.float_format = "{:.2%}".format
|
|
pd.options.display.max_rows = 999
|
|
print(to_plot)
|
|
for version in HIDE:
|
|
del to_plot[version]
|
|
del to_plot["Other"]
|
|
plot_lines(to_plot)
|
|
plot_stacked(to_plot)
|
|
|
|
|
|
def plot_stacked(to_plot):
|
|
ax = to_plot.plot.area(
|
|
stacked=True,
|
|
figsize=(10, 10 * 2 / 3),
|
|
title="% of PyPI download by Python version",
|
|
legend="reverse",
|
|
ylabel="%",
|
|
)
|
|
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))
|
|
plt.savefig("python-versions-stacked.png")
|
|
|
|
|
|
def plot_lines(to_plot):
|
|
ax = to_plot.plot(
|
|
figsize=(10, 10 * 2 / 3),
|
|
title="% of PyPI download by Python version",
|
|
legend="reverse",
|
|
ylabel="%",
|
|
)
|
|
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))
|
|
plt.savefig("python-versions-lines.png")
|
|
|
|
|
|
def plot_download_counts(to_plot):
|
|
ax = to_plot.plot(
|
|
figsize=(10, 10 * 2 / 3),
|
|
title="PyPI number of downloads",
|
|
legend="reverse",
|
|
xlabel="date",
|
|
)
|
|
plt.savefig("pypi-download-counts.png")
|
|
|
|
|
|
def parse_args():
|
|
parser = argparse.ArgumentParser(description=__doc__)
|
|
parser.add_argument(
|
|
"--fetch",
|
|
action="store_true",
|
|
help="Fetch more data instead of just displaying them",
|
|
)
|
|
return parser.parse_args()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
args = parse_args()
|
|
if args.fetch:
|
|
fetch_main()
|
|
plt.style.use("tableau-colorblind10")
|
|
plot()
|