252 lines
5.6 KiB
Plaintext
252 lines
5.6 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "4200dd32",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import warnings\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"from Levenshtein import distance\n",
|
|
"warnings.simplefilter(\"ignore\")\n",
|
|
"%matplotlib notebook"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "fe6a15c4",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Read `numero` as text: later cells compare it with the string \"78250\" and\n",
  "# coerce it with pd.to_numeric, so leaving the dtype to inference (which may\n",
  "# yield a mix of ints and strings in one column) would make those lookups unreliable.\n",
  "bano = pd.read_csv(\n",
  "    \"https://bano.openstreetmap.fr/data/bano-78.csv\",\n",
  "    names=[\"id\", \"numero\", \"voie\", \"code_post\", \"nom_comm\", \"source\", \"lat\", \"lon\"],\n",
  "    dtype={\"numero\": str},\n",
  ")"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "18f66986",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"bano"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "40f775d7",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"bano.describe()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "181f584b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"fig1, ax1 = plt.subplots()\n",
|
|
"ax1.set_aspect('equal')\n",
|
|
"plot = ax1.scatter(bano.lon, bano.lat, s=.1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "41b10d5d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"bano.source"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "67014ee9",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"bano.source = pd.Categorical(bano.source)\n",
|
|
"fig1, ax1 = plt.subplots()\n",
|
|
"ax1.set_aspect('equal')\n",
|
|
"plot = ax1.scatter(bano.lon, bano.lat, s=1, c=bano.source.cat.codes)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "47d5416a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"bano.voie.str.split().str[0].value_counts().head(15).index"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "8814511d",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Group addresses by the first word of the street name (its \"kind\").\n",
  "# Rows are matched on that first word itself: a prefix test on the full name\n",
  "# would also select names whose first word merely begins with it\n",
  "# (e.g. a 'Ruelle ...' under 'Rue') and yields NaN for rows with a missing name.\n",
  "first_words = bano.voie.str.split().str[0]\n",
  "\n",
  "fig1, ax1 = plt.subplots()\n",
  "ax1.set_aspect('equal')\n",
  "for kind in first_words.value_counts().head(15).index:\n",
  "    to_plot = bano[first_words == kind]\n",
  "    ax1.scatter(to_plot.lon, to_plot.lat, s=.1, label=kind)\n",
  "# Markers are tiny (s=.1), so enlarge them in the legend and park it outside the axes.\n",
  "ax1.legend(markerscale=10, fontsize='small', bbox_to_anchor=(1.05, .95), loc='upper left', )"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "4d442c4b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"bano.numero.max()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "da44b40f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"pd.to_numeric(bano.numero, errors='coerce').max()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2cb15832",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"bano[\"numero_as_number\"] = pd.to_numeric(bano[\"numero\"], errors='coerce')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "cd103772",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"bano[bano.numero == \"78250\"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b4da24e3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"bano[(bano.numero_as_number > 3000) & (bano.numero_as_number < 5000)]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "f81334da",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"bano[~bano.code_post.astype(str).str.startswith(\"78\")]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c33bb852",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"road_name_counts = bano.voie.value_counts()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "6c0eb639",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"road_name_counts"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2abc0c2d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"road_name_counts[road_name_counts == 1]"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "39170999",
 "metadata": {},
 "outputs": [],
 "source": [
  "# For every street name that appears on a single row, look at the other names\n",
  "# found in a small box around it and propose the closest one (by Levenshtein\n",
  "# distance) as a likely correction.\n",
  "epsilon = 0.01  # half-width of the search box, in lat/lon units (presumably degrees)\n",
  "max_edit_distance = 3  # exclusive bound: only strictly closer names are proposed\n",
  "\n",
  "# Index the coordinates of the first row of each name once, instead of\n",
  "# re-scanning the whole frame for every candidate.\n",
  "coords_by_name = bano.drop_duplicates('voie').set_index('voie')[['lat', 'lon']]\n",
  "\n",
  "for suspicious_name in road_name_counts[road_name_counts == 1].index:\n",
  "    lat, lon = coords_by_name.loc[suspicious_name]\n",
  "    in_box = (\n",
  "        (lat - epsilon < bano.lat) & (bano.lat < lat + epsilon)\n",
  "        & (lon - epsilon < bano.lon) & (bano.lon < lon + epsilon)\n",
  "    )\n",
  "    # Missing names cannot be compared by edit distance, so drop them.\n",
  "    neighbors_names = set(bano.loc[in_box, 'voie'].dropna()) - {suspicious_name}\n",
  "    if not neighbors_names:\n",
  "        continue\n",
  "    # Ranking (distance, name) pairs computes each distance once and breaks ties\n",
  "    # alphabetically rather than by set iteration order.\n",
  "    best_distance, proposed_name = min(\n",
  "        (distance(name, suspicious_name), name) for name in neighbors_names\n",
  "    )\n",
  "    if best_distance < max_edit_distance:\n",
  "        print(f\"Proposing to replace {suspicious_name!r} by {proposed_name!r}\")"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "b7dc66c4",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Number of addresses per commune; shown as the cell's result, like the\n",
  "# road-name counts above, rather than through print().\n",
  "nom_comm_counts = bano.nom_comm.value_counts()\n",
  "nom_comm_counts"
 ]
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.4"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|