323 lines
11 KiB
Python
323 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
Author: freezed <freezed@users.noreply.github.com> 2018-07-24
|
|
Version: 0.1
|
|
Licence: `GNU GPL v3` GNU GPL v3: http://www.gnu.org/licenses/
|
|
|
|
Call Open Food Facts API to populate a local MariaDB/MySQL database with product data
|
|
This DB will serve a CLI client which gives alternative products with better
|
|
nutrition grade.
|
|
"""
|
|
import json
|
|
import requests
|
|
from config import FIELD_KEPT, API_URL_CAT
|
|
|
|
|
|
def get_product(code, from_file=False):
    """
    Call Open Food Facts API to get data of a single product

    The product is read either from the local sample file
    ``sample/product-<code>.json`` (``from_file=True``) or from the Open Food
    Facts API. Only the fields listed in ``FIELD_KEPT['product']`` are kept,
    together with the ``code`` and the public ``url`` of the product. A kept
    field that is absent from the product data is stored as ``False``.

    :code: product barcode, any value accepted by ``int()``
    :from_file: if True, read the local JSON sample instead of calling the API
    :return: dict of kept fields, or False when the code is not numeric, the
        sample file is missing, the API does not answer 200 or the product
        is flagged as not found (falsy ``status`` in the JSON)

    :Tests OFFLINE:
    >>> get_product('not-a-number', True)
    File load error : sample/product-not-a-number.json
    False
    """
    err_file = "File load error : {}"
    filename = 'sample/product-{}.json'.format(str(code))

    # Validation only: the code is used as given, never the converted int
    try:
        int(code)
    except (TypeError, ValueError):
        print(err_file.format(filename))
        return False

    if from_file:
        from os import path

        # File does not exist
        if not path.isfile(filename):
            print(err_file.format(filename))
            return False

        with open(filename, "r", encoding="utf-8") as json_file:
            product_json = json.load(json_file)

    else:
        response = requests.get(
            "https://fr.openfoodfacts.org/api/v0/product/{}.json".format(code)
        )

        # Check the HTTP status first: an error page may not be valid JSON
        if response.status_code != 200:
            return False

        product_json = json.loads(response.text)

    # The JSON `status` is falsy when the product is unknown
    if not product_json.get('status'):
        return False

    product_kept = {
        'code': code,
        'url': "https://fr.openfoodfacts.org/product/{}/".format(code)
    }
    product_fields = product_json.get('product', {})

    for field in FIELD_KEPT['product']:
        # Missing fields are stored as False, like `get_category()` does
        product_kept[field] = product_fields.get(field, False)

    return product_kept
|
|
|
|
|
|
def _keep_category_fields(products):
    """
    Reduces raw products to the fields listed in FIELD_KEPT['category']

    A field absent from a product is stored as False.
    """
    return [
        {field: product.get(field, False) for field in FIELD_KEPT['category']}
        for product in products
    ]


def get_category(name, from_file=False):
    """
    Call Open Food Facts API to get data of products in a single category

    With ``from_file=True`` the data comes from the local sample file
    ``sample/category-<name>.json`` and no network request is made: only the
    products stored in that file are returned. Otherwise the first page is
    requested over the API, then every following page until the number of
    pages computed from ``count`` and ``page_size`` is reached.

    :name: category name, used in the file name or in the API URL
    :from_file: if True, read the local JSON sample instead of calling the API
    :return: Dict filled with products & kept fields
        (``{'category': <name>, 'products': [<dict>, ...]}``), or False when
        the sample file is missing or the category has no product

    :Tests ONLINE:
    >>> get_category('1664')  # doctest: +SKIP
    False

    >>> prod_bles = get_category('blés')  # doctest: +SKIP
    >>> prod_bles['category'] == 'blés'  # doctest: +SKIP
    True

    :Tests OFFLINE:
    >>> get_category('wrong_file', True)
    File load error : sample/category-wrong_file.json
    False
    """
    if from_file:
        from os import path

        filename = 'sample/category-{}.json'.format(str(name))

        # File does not exist
        if not path.isfile(filename):
            print("File load error : {}".format(filename))
            return False

        with open(filename, "r", encoding="utf-8") as json_file:
            cat_json = json.load(json_file)

    # Requests over API
    else:
        page = 1
        response = requests.get(API_URL_CAT.format(str(name), page))
        cat_json = json.loads(response.text)

    # Nothing to stage for an empty category
    if cat_json['count'] <= 0:
        return False

    # Defines the dict that will be returned, filled with the 1st page
    staging_data = {
        'category': str(name),
        'products': _keep_category_fields(cat_json['products'])
    }

    # A local sample is a single page: never fall back on the network
    if from_file:
        return staging_data

    # Counts pages of this category (a partial last page counts as one)
    full_pages, remainder = divmod(cat_json['count'], cat_json['page_size'])
    total_pages = int(full_pages)

    if int(remainder) > 0:
        total_pages += 1

    # Gets data for all other pages
    while page < total_pages:
        # Requests next page over API
        page += 1
        response = requests.get(API_URL_CAT.format(str(name), page))
        cat_json = json.loads(response.text)

        staging_data['products'].extend(
            _keep_category_fields(cat_json['products'])
        )

        print("\t\t[…finish page {}/{} - {} ids]".format(
            page, total_pages, len(staging_data['products'])
        ))

    return staging_data
|
|
|
|
|
|
def false_to_null(sql_list):
    """
    Replacing nutrition_score="False" by nutrition_score=NULL

    Every quoted ``False`` value (``"False"`` or ``'False'``) found in a
    request is replaced by the SQL keyword ``NULL``. Values that merely
    contain the text ``False`` are left untouched.

    :sql_list: list() of SQL requests, modified in place
    :return: the same list() object, for convenience

    :Tests:
    >>> false_to_null(['SELECT "Biscuit", "False", id;'])
    ['SELECT "Biscuit", NULL, id;']

    >>> false_to_null(['SELECT "Falsetto", "False";'])
    ['SELECT "Falsetto", NULL;']
    """
    for idx, request in enumerate(sql_list):
        if "False" in request:
            # Match the quotes too, so only standalone values are replaced
            sql_list[idx] = (
                request
                .replace('"False"', 'NULL')
                .replace("'False'", 'NULL')
            )

    return sql_list
|
|
|
|
|
|
def pick_category(cat_list):
    """
    Picks only one category to associate the product in the local DB

    The shortest tag of the list is taken (the first one when several tags
    share the shortest length) and returned without its language prefix.
    A tag without any prefix is returned as it is.
    For improvement it is a good place to add more work here, like selecting
    by language prefix.

    :cat_list: list() of category tags, like 'en:biscuits'
    :return: the picked category name, or False if the list is empty

    :Tests:
    >>> pick_category(['en:sugary-snacks', 'en:biscuits-and-cakes',
    ...                'en:biscuits'])
    'biscuits'

    >>> pick_category(['biscuits'])
    'biscuits'

    >>> pick_category([])
    False
    """
    if not cat_list:
        return False

    # min() keeps the first of equally short tags
    shortest_tag = min(cat_list, key=len)
    tag_parts = shortest_tag.split(":")

    # Element 1 is the name following the language prefix, when there is one
    if len(tag_parts) > 1:
        return tag_parts[1]

    return tag_parts[0]
|
|
|
|
|
|
def _sql_escape(value):
    """
    Escapes backslashes and quotes of `value` for use in a quoted SQL string

    NOTE(review): relies on backslash escapes, which are the MySQL/MariaDB
    default; to be confirmed if the server runs with NO_BACKSLASH_ESCAPES.
    """
    return (
        str(value)
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace("'", "\\'")
    )


def sql_generator(staging_data):
    """
    Uses `staging_data` to generate SQL INSERT requests.

    The first request inserts the category, the following ones insert the
    product(s) attached to that category. Text values are escaped before
    being written in the requests, and values equal to "False" are replaced
    by NULL with `false_to_null()`.

    :staging_data: dict() created with `get_product()` or `get_category()`
    :return: list() of SQL requests, or False if `staging_data` is not a
        dict() holding a 'category' or a 'product_name' key

    :Tests:
    >>> sql_generator(False) is False
    True

    >>> bisc = {'count': 4377,'category':'biscuits','products':[{'_id':'8480000141323','categories_tags':['en:sugary-snacks','en:biscuits-and-cakes','en:biscuits'],'nutrition_grades':'e','product_name':'Galletas María Dorada Hacendado','url':'https://fr-en.openfoodfacts.org/product/8480000141323/galletas-maria-dorada-hacendado'},{'_id':'3593551174971','categories_tags':['en:sugary-snacks','en:biscuits-and-cakes','en:biscuits'],'nutrition_grades':'False','product_name':'Les Broyés du Poitou','url':'https://fr-en.openfoodfacts.org/product/3593551174971/les-broyes-du-poitou-les-mousquetaires'}]}

    >>> sql_list_bisc = sql_generator(bisc)
    >>> sql_list_bisc[0]
    "INSERT INTO category (`name`) VALUES ('biscuits');"

    >>> sql_list_bisc[1]
    'INSERT INTO product (`name`, `code`, `url`, `nutrition_grades`, `category_id`) SELECT "Galletas María Dorada Hacendado", "8480000141323", "https://fr-en.openfoodfacts.org/product/8480000141323/galletas-maria-dorada-hacendado", "e", id AS category_id FROM category WHERE name = "biscuits";'

    >>> sql_list_bisc[2]
    'INSERT INTO product (`name`, `code`, `url`, `nutrition_grades`, `category_id`) SELECT "Les Broyés du Poitou", "3593551174971", "https://fr-en.openfoodfacts.org/product/3593551174971/les-broyes-du-poitou-les-mousquetaires", NULL, id AS category_id FROM category WHERE name = "biscuits";'

    >>> oreo = {'categories_tags':['en:sugary-snacks','en:biscuits-and-cakes','en:biscuits','en:chocolate-biscuits','es:sandwich-cookies'],'code':'8410000810004','nutrition_grades':'e','product_name':'Biscuit Oreo', 'url':'https://fr.openfoodfacts.org/product/8410000810004/'}
    >>> sql_list_oreo = sql_generator(oreo)
    >>> sql_list_oreo[0]
    "INSERT INTO category (`name`) VALUES ('biscuits');"

    >>> sql_list_oreo[1]
    'INSERT INTO product (`name`, `code`, `url`, `nutrition_grades`, `category_id`) SELECT "Biscuit Oreo", "8410000810004", "https://fr.openfoodfacts.org/product/8410000810004/", "e", id AS category_id FROM category WHERE name = "biscuits";'

    >>> oreo_nutri_null = {'categories_tags':['en:sugary-snacks','en:biscuits-and-cakes','en:biscuits','en:chocolate-biscuits','es:sandwich-cookies'],'code':'8410000810004','nutrition_grades':'False','product_name':'Biscuit Oreo', 'url':'https://fr.openfoodfacts.org/product/8410000810004/'}
    >>> sql_list_oreo_nutri_null = sql_generator(oreo_nutri_null)

    >>> sql_list_oreo_nutri_null[1]
    'INSERT INTO product (`name`, `code`, `url`, `nutrition_grades`, `category_id`) SELECT "Biscuit Oreo", "8410000810004", "https://fr.openfoodfacts.org/product/8410000810004/", NULL, id AS category_id FROM category WHERE name = "biscuits";'
    """
    insert_cat = "INSERT INTO category (`name`) VALUES ('{}');"
    insert_prod = (
        'INSERT INTO product '
        '(`name`, `code`, `url`, `nutrition_grades`, `category_id`) '
        'SELECT "{name}", "{code}", "{url}", "{nutri}", id AS category_id '
        'FROM category '
        'WHERE name = "{cat}";'
    )

    # `False` is what `get_product()`/`get_category()` return on failure
    if not isinstance(staging_data, dict):
        return False

    # Data of a whole category: products use the `_id` key for their code
    if 'category' in staging_data:
        used_category = staging_data['category']
        products = [
            (val['product_name'], val['_id'], val['url'],
             val['nutrition_grades'])
            for val in staging_data['products']
        ]

    # Data of a single product: its category is picked among its tags
    elif 'product_name' in staging_data:
        used_category = pick_category(staging_data['categories_tags'])
        products = [(
            staging_data['product_name'],
            staging_data['code'],
            staging_data['url'],
            staging_data['nutrition_grades']
        )]

    else:
        return False

    escaped_category = _sql_escape(used_category)

    # insert category
    sql_list = [insert_cat.format(escaped_category)]

    # insert products
    for name, code, url, nutri in products:
        sql_list.append(
            insert_prod.format(
                name=_sql_escape(name),
                code=_sql_escape(code),
                url=_sql_escape(url),
                nutri=_sql_escape(nutri),
                cat=escaped_category
            )
        )

    return false_to_null(sql_list)
|
|
|
|
|
|
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in the docstrings
    from doctest import testmod

    testmod()
|