formations/python-avancé/exercises/BANO.ipynb

252 lines
5.6 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "4200dd32",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import warnings\n",
"import matplotlib.pyplot as plt\n",
"from Levenshtein import distance\n",
"warnings.simplefilter(\"ignore\")\n",
"%matplotlib notebook"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fe6a15c4",
"metadata": {},
"outputs": [],
"source": [
"# Passing `names` makes pandas read the first line of the file as data, not as a header.\n",
"bano = pd.read_csv(\n",
"    \"https://bano.openstreetmap.fr/data/bano-78.csv\",\n",
"    names=[\"id\", \"numero\", \"voie\", \"code_post\", \"nom_comm\", \"source\", \"lat\", \"lon\"],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "18f66986",
"metadata": {},
"outputs": [],
"source": [
"bano"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "40f775d7",
"metadata": {},
"outputs": [],
"source": [
"bano.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "181f584b",
"metadata": {},
"outputs": [],
"source": [
"# Draw every address as a small dot, using the same scale on both axes.\n",
"fig1, ax1 = plt.subplots()\n",
"ax1.set_aspect(\"equal\")\n",
"plot = ax1.scatter(bano[\"lon\"], bano[\"lat\"], s=0.1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41b10d5d",
"metadata": {},
"outputs": [],
"source": [
"bano.source"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "67014ee9",
"metadata": {},
"outputs": [],
"source": [
"# Store the `source` column as a categorical so every distinct value gets an integer code.\n",
"bano[\"source\"] = pd.Categorical(bano[\"source\"])\n",
"fig1, ax1 = plt.subplots()\n",
"ax1.set_aspect(\"equal\")\n",
"# Colour each address by the integer code of its `source` category.\n",
"source_codes = bano[\"source\"].cat.codes\n",
"plot = ax1.scatter(bano[\"lon\"], bano[\"lat\"], s=1, c=source_codes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "47d5416a",
"metadata": {},
"outputs": [],
"source": [
"bano.voie.str.split().str[0].value_counts().head(15).index"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8814511d",
"metadata": {},
"outputs": [],
"source": [
"fig1, ax1 = plt.subplots()\n",
"ax1.set_aspect('equal')\n",
"# First word of every street name, computed once for the whole frame.\n",
"voie_kind = bano.voie.str.split().str[0]\n",
"# One scatter layer per kind, for the 15 most frequent first words.\n",
"for kind in voie_kind.value_counts().head(15).index:\n",
"    # Exact match on the first word: a prefix test would also put \"Ruelle ...\" under \"Rue\".\n",
"    # Comparing with == gives False (not NaN) for missing names, so the mask stays boolean.\n",
"    to_plot = bano[voie_kind == kind]\n",
"    ax1.scatter(to_plot.lon, to_plot.lat, s=.1, label=kind)\n",
"ax1.legend(markerscale=10, fontsize='small', bbox_to_anchor=(1.05, .95), loc='upper left', )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d442c4b",
"metadata": {},
"outputs": [],
"source": [
"bano.numero.max()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "da44b40f",
"metadata": {},
"outputs": [],
"source": [
"pd.to_numeric(bano.numero, errors='coerce').max()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2cb15832",
"metadata": {},
"outputs": [],
"source": [
"bano[\"numero_as_number\"] = pd.to_numeric(bano[\"numero\"], errors='coerce')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd103772",
"metadata": {},
"outputs": [],
"source": [
"bano[bano.numero == \"78250\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b4da24e3",
"metadata": {},
"outputs": [],
"source": [
"bano[(bano.numero_as_number > 3000) & (bano.numero_as_number < 5000)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f81334da",
"metadata": {},
"outputs": [],
"source": [
"bano[~bano.code_post.astype(str).str.startswith(\"78\")]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c33bb852",
"metadata": {},
"outputs": [],
"source": [
"road_name_counts = bano.voie.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c0eb639",
"metadata": {},
"outputs": [],
"source": [
"road_name_counts"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2abc0c2d",
"metadata": {},
"outputs": [],
"source": [
"road_name_counts[road_name_counts == 1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "39170999",
"metadata": {},
"outputs": [],
"source": [
"epsilon = 0.01  # half-width of the search box, in the same units as the lat/lon columns\n",
"for suspicious_name in road_name_counts[road_name_counts == 1].index:\n",
"    # The name was counted exactly once, so the first matching row is the only one.\n",
"    lat, lon = bano.loc[bano.voie == suspicious_name, [\"lat\", \"lon\"]].to_numpy()[0]\n",
"    # Addresses strictly inside the square box centred on the suspicious one.\n",
"    neighbors = bano[\n",
"        (lat - epsilon < bano.lat) & (bano.lat < lat + epsilon)\n",
"        & (lon - epsilon < bano.lon) & (bano.lon < lon + epsilon)\n",
"    ]\n",
"    # distance() expects strings, so missing names are dropped; the name itself is not a candidate.\n",
"    neighbors_names = set(neighbors.voie.dropna()) - {suspicious_name}\n",
"    if not neighbors_names:\n",
"        continue\n",
"    # Score each candidate once; ties on distance are broken by name so the output is reproducible.\n",
"    best_distance, proposed_name = min((distance(name, suspicious_name), name) for name in neighbors_names)\n",
"    if best_distance < 3:\n",
"        print(f\"Proposing to replace {suspicious_name!r} by {proposed_name!r}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b7dc66c4",
"metadata": {},
"outputs": [],
"source": [
"# Number of rows per value of `nom_comm`, most frequent first.\n",
"nom_comm_counts = bano[\"nom_comm\"].value_counts()\n",
"print(nom_comm_counts)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}