2018-07-24 14:27:37 +00:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Author : freezed < freezed @users.noreply.github.com > 2018 - 07 - 24
Version : 0.1
Licence : ` GNU GPL v3 ` GNU GPL v3 : http : / / www . gnu . org / licenses /
2018-08-09 11:20:03 +00:00
Call Open Food Facts API to populate a local MariaDB / MySQL database with product data
2018-07-24 14:27:37 +00:00
This DB will serve an CLI client which gives alternative products with better
nurition grade .
"""
import json
import requests
2018-08-08 20:08:18 +00:00
from config import FIELD_KEPT , API_URL_CAT
2018-07-24 18:40:31 +00:00
2018-08-02 07:47:27 +00:00
def get_product ( code , from_file = False ) :
2018-07-24 14:27:37 +00:00
"""
2018-08-09 11:20:03 +00:00
Call Open Food Facts API to get data of a single product
2018-07-24 14:27:37 +00:00
2018-07-31 13:17:36 +00:00
: Tests ONLINE :
2018-08-02 07:47:27 +00:00
# >>> prod_beurre = get_product('3017760000109')
# >>> prod_oreo = get_product('8410000810004')
# >>> prod_false is False
# True
# >>> prod_string is False
# True
2018-07-31 13:17:36 +00:00
: Tests OFFLINE :
2018-08-02 07:47:27 +00:00
>> > prod_beurre = get_product ( ' 3017760000109 ' , True )
>> > prod_oreo = get_product ( ' 8410000810004 ' , True )
>> > prod_false = get_product ( ' 1664 ' , True )
2018-08-02 09:41:58 +00:00
File load error : sample / product - 1664. json
2018-08-02 07:47:27 +00:00
>> > prod_string = get_product ( ' string ' , True )
2018-08-02 09:41:58 +00:00
File load error : sample / product - string . json
2018-08-02 07:47:27 +00:00
2018-07-25 17:22:01 +00:00
>> > print ( prod_beurre [ ' product_name ' ] )
2018-07-24 14:27:37 +00:00
Le Véritable Petit Beurre
2018-07-25 17:22:01 +00:00
>> > print ( prod_beurre [ ' nutrition_grades ' ] )
2018-07-24 14:27:37 +00:00
e
2018-07-25 17:22:01 +00:00
>> > print ( prod_beurre [ ' categories_tags ' ] )
2018-08-01 16:11:04 +00:00
[ ' en:sugary-snacks ' , ' en:biscuits-and-cakes ' , \
' en:biscuits ' , ' fr:petits-beurres ' ]
2018-07-25 17:22:01 +00:00
2018-07-25 22:10:24 +00:00
>> > print ( prod_oreo [ ' code ' ] )
8410000810004
2018-08-02 05:54:51 +00:00
>> > print ( prod_oreo [ ' url ' ] )
https : / / fr . openfoodfacts . org / product / 8410000810004 /
2018-07-25 17:22:01 +00:00
>> > print ( prod_oreo [ ' product_name ' ] )
Biscuit Oreo
>> > print ( prod_oreo [ ' nutrition_grades ' ] )
e
>> > print ( prod_oreo [ ' categories_tags ' ] )
2018-08-01 16:11:04 +00:00
[ ' en:sugary-snacks ' , ' en:biscuits-and-cakes ' , ' en:biscuits ' , \
' en:chocolate-biscuits ' , ' es:sandwich-cookies ' ]
2018-07-24 14:27:37 +00:00
"""
2018-08-02 07:47:27 +00:00
ERR_FILE = " File load error : {} "
2018-08-02 09:41:58 +00:00
filename = ' sample/product- {} .json ' . format ( str ( code ) )
2018-08-02 07:47:27 +00:00
2018-07-25 21:24:35 +00:00
try :
int ( code )
2018-07-25 17:22:01 +00:00
2018-07-25 22:10:24 +00:00
except ValueError : # as except_detail:
2018-07-25 21:24:35 +00:00
# print("Exception: «{}»".format(except_detail))
2018-08-02 07:47:27 +00:00
print ( ERR_FILE . format ( filename ) )
2018-07-24 18:40:31 +00:00
2018-07-25 21:24:35 +00:00
else :
2018-08-02 07:47:27 +00:00
if from_file :
from os import path
# File did not exists
if path . isfile ( filename ) is False :
print ( ERR_FILE . format ( filename ) )
status = 404
product_json = { ' status ' : 0 }
else :
with open ( filename , " r " ) as json_file :
product_json = json . loads ( json_file . read ( ) )
status = 200
else :
response = requests . get (
" https://fr.openfoodfacts.org/api/v0/product/ {} .json " . format ( code )
)
product_json = json . loads ( response . text )
status = response . status_code
if product_json [ ' status ' ] and status == 200 :
2018-08-02 05:54:51 +00:00
product_kept = {
' code ' : code ,
' url ' : " https://fr.openfoodfacts.org/product/ {} / " . format ( code )
}
2018-07-25 21:24:35 +00:00
2018-07-31 13:17:36 +00:00
for field in FIELD_KEPT [ ' product ' ] :
2018-07-25 22:10:24 +00:00
product_kept [ field ] = product_json [ ' product ' ] [ field ]
2018-07-25 21:24:35 +00:00
2018-07-25 22:10:24 +00:00
return product_kept
2018-07-25 21:24:35 +00:00
else :
return False
2018-07-24 18:40:31 +00:00
2018-07-30 06:13:36 +00:00
def get_category ( name , from_file = False ) :
2018-07-26 13:42:21 +00:00
"""
2018-08-09 11:20:03 +00:00
Call Open Food Facts API to get data of products in a single category
2018-07-26 13:42:21 +00:00
2018-07-31 13:17:36 +00:00
: return : Dict filled with products & kept fields
2018-07-26 13:42:21 +00:00
First try , TODO :
2018-07-31 13:17:36 +00:00
- work offline with local JSON
2018-07-26 13:42:21 +00:00
- need to get all the products of a category
2018-07-31 13:17:36 +00:00
: Tests ONLINE :
2018-07-26 13:42:21 +00:00
>> > prod_false = get_category ( ' 1664 ' )
>> > prod_false
False
2018-08-08 20:08:18 +00:00
>> > prod_bles = get_category ( ' blés ' )
2018-07-31 13:17:36 +00:00
: Tests OFFLINE :
2018-08-08 20:08:18 +00:00
# >>> prod_bles = get_category('biscuits', True)
>> > prod_bles [ ' category ' ] == ' biscuits '
2018-08-01 16:11:04 +00:00
True
2018-08-08 20:08:18 +00:00
>> > ' count ' in prod_bles
2018-07-26 13:42:21 +00:00
True
2018-08-08 20:08:18 +00:00
>> > ' product_name ' in prod_bles [ ' products ' ] [ 0 ]
2018-07-26 13:42:21 +00:00
True
2018-08-08 20:08:18 +00:00
>> > ' nutrition_grades ' in prod_bles [ ' products ' ] [ 0 ]
2018-07-26 13:42:21 +00:00
True
2018-08-08 20:08:18 +00:00
>> > ' categories_tags ' in prod_bles [ ' products ' ] [ 0 ]
2018-07-26 13:42:21 +00:00
True
2018-07-30 06:13:36 +00:00
>> > get_category ( ' wrong_file ' , True )
2018-08-02 09:41:58 +00:00
File load error : sample / category - wrong_file . json
2018-07-30 06:13:36 +00:00
False
2018-07-31 13:17:36 +00:00
2018-08-08 20:08:18 +00:00
# >>> pprint.pprint(prod_bles)
2018-07-26 13:42:21 +00:00
"""
2018-07-30 06:13:36 +00:00
if from_file :
from os import path
2018-07-26 13:42:21 +00:00
2018-08-02 09:41:58 +00:00
filename = ' sample/category- {} .json ' . format ( str ( name ) )
2018-07-30 06:13:36 +00:00
# File did not exists
if path . isfile ( filename ) is False :
print ( " File load error : {} " . format ( filename ) )
2018-07-31 13:17:36 +00:00
status = 404
2018-08-08 20:08:18 +00:00
cat_json = { ' count ' : 0 }
2018-07-30 06:13:36 +00:00
else :
with open ( filename , " r " ) as json_file :
2018-08-08 20:08:18 +00:00
cat_json = json . loads ( json_file . read ( ) )
2018-07-30 06:13:36 +00:00
status = 200
2018-08-08 20:08:18 +00:00
# Requests over API
2018-07-30 06:13:36 +00:00
else :
2018-08-08 20:08:18 +00:00
page = 1
response = requests . get ( API_URL_CAT . format ( str ( name ) , page ) )
cat_json = json . loads ( response . text )
2018-07-30 06:13:36 +00:00
status = response . status_code
2018-07-26 13:42:21 +00:00
2018-08-08 20:08:18 +00:00
# Gets data
if cat_json [ ' count ' ] > 0 :
# Defines dict it will be returned
staging_data = {
# 'count': cat_json['count'],
2018-08-01 16:11:04 +00:00
' category ' : str ( name ) ,
2018-07-31 13:17:36 +00:00
' products ' : [ ]
2018-08-08 20:08:18 +00:00
}
# Counts pages of this category
total_pages = int ( cat_json [ ' count ' ] / / cat_json [ ' page_size ' ] )
2018-07-26 13:42:21 +00:00
2018-08-08 20:08:18 +00:00
if int ( cat_json [ ' count ' ] % cat_json [ ' page_size ' ] ) > 0 :
total_pages + = 1
# Loops on data from 1st page
for idx , product_fields in enumerate ( cat_json [ ' products ' ] ) :
staging_data [ ' products ' ] . append ( dict ( ) )
2018-07-31 13:17:36 +00:00
for field in FIELD_KEPT [ ' category ' ] :
if field in product_fields :
2018-08-08 20:08:18 +00:00
staging_data [ ' products ' ] [ idx ] [ field ] = product_fields [ field ]
2018-07-31 13:17:36 +00:00
else :
2018-08-08 20:08:18 +00:00
staging_data [ ' products ' ] [ idx ] [ field ] = False
# Gets data for all other pages
while page < total_pages :
# Requests next page over API
page + = 1
response = requests . get ( API_URL_CAT . format ( str ( name ) , page ) )
cat_json = json . loads ( response . text )
idx = len ( staging_data [ ' products ' ] )
for product_fields in cat_json [ ' products ' ] :
staging_data [ ' products ' ] . append ( dict ( ) )
for field in FIELD_KEPT [ ' category ' ] :
2018-08-09 11:20:03 +00:00
2018-08-08 20:08:18 +00:00
if field in product_fields :
staging_data [ ' products ' ] [ idx ] [ field ] = product_fields [ field ]
else :
staging_data [ ' products ' ] [ idx ] [ field ] = False
idx + = 1
print ( " \t \t […finish page {} / {} - {} ids] " . format ( page , total_pages , idx ) )
2018-07-26 13:42:21 +00:00
2018-08-08 20:08:18 +00:00
return staging_data
2018-07-26 13:42:21 +00:00
else :
return False
2018-08-04 13:58:31 +00:00
def false_to_null ( sql_list ) :
""" Replacing nutrition_score= " False " by nutrition_score=NULL """
for idx , request in enumerate ( sql_list ) :
if " False " in request :
sql_list [ idx ] = " {} NULL {} " . format (
request [ : request . find ( ' False ' ) - 1 ] ,
request [ request . find ( ' False ' ) + 6 : ]
)
return sql_list
2018-08-01 14:32:44 +00:00
def pick_category ( cat_list ) :
"""
Picks only one category to associate the product in the local DB
One of the shortest tag ( without langage prefix ) is taken .
For improvement it is a good place to adds more work here , like selecting
by langage prefix .
: Tests :
2018-08-01 16:11:04 +00:00
>> > pick_category ( [ ' en:sugary-snacks ' , ' en:biscuits-and-cakes ' , \
' en:biscuits ' ] )
2018-08-01 14:32:44 +00:00
' biscuits '
"""
if len ( cat_list ) > 1 :
# get idx of the shortest tag
flip_list = [ ( len ( cat ) , idx ) for idx , cat in enumerate ( cat_list ) ]
flip_list . sort ( )
shortest_tag_idx = flip_list [ 0 ] [ 1 ]
return cat_list [ shortest_tag_idx ] . split ( " : " ) [ 1 ]
elif len ( cat_list ) == 1 :
return cat_list [ 0 ] . split ( " : " ) [ 1 ]
else :
return False
2018-08-02 08:02:41 +00:00
def sql_generator ( staging_data ) :
"""
Uses ` staging_data ` to generate SQL INSERT requests .
: staging_data : dict ( ) created with ` get_product ( ) ` or ` get_category ( ) `
: return : list ( ) of SQL requests
: Tests :
2018-08-04 14:26:34 +00:00
>> > sql_generator ( False ) is False
True
2018-08-04 13:58:31 +00:00
>> > bisc = { ' count ' : 4377 , ' category ' : ' biscuits ' , ' products ' : [ { ' _id ' : ' 8480000141323 ' , ' categories_tags ' : [ ' en:sugary-snacks ' , ' en:biscuits-and-cakes ' , ' en:biscuits ' ] , ' nutrition_grades ' : ' e ' , ' product_name ' : ' Galletas María Dorada Hacendado ' , ' url ' : ' https://fr-en.openfoodfacts.org/product/8480000141323/galletas-maria-dorada-hacendado ' } , { ' _id ' : ' 3593551174971 ' , ' categories_tags ' : [ ' en:sugary-snacks ' , ' en:biscuits-and-cakes ' , ' en:biscuits ' ] , ' nutrition_grades ' : ' False ' , ' product_name ' : ' Les Broyés du Poitou ' , ' url ' : ' https://fr-en.openfoodfacts.org/product/3593551174971/les-broyes-du-poitou-les-mousquetaires ' } ] }
2018-08-02 08:02:41 +00:00
>> > sql_list_bisc = sql_generator ( bisc )
>> > sql_list_bisc [ 0 ]
2018-08-04 13:58:31 +00:00
" INSERT INTO category (`name`) VALUES ( ' biscuits ' ); "
2018-08-02 08:02:41 +00:00
>> > sql_list_bisc [ 1 ]
2018-08-04 13:58:31 +00:00
' INSERT INTO product (`name`, `code`, `url`, `nutrition_grades`, `category_id`) SELECT " Galletas María Dorada Hacendado " , " 8480000141323 " , " https://fr-en.openfoodfacts.org/product/8480000141323/galletas-maria-dorada-hacendado " , " e " , id AS category_id FROM category WHERE name = " biscuits " ; '
2018-08-02 08:02:41 +00:00
>> > sql_list_bisc [ 2 ]
2018-08-04 13:58:31 +00:00
' INSERT INTO product (`name`, `code`, `url`, `nutrition_grades`, `category_id`) SELECT " Les Broyés du Poitou " , " 3593551174971 " , " https://fr-en.openfoodfacts.org/product/3593551174971/les-broyes-du-poitou-les-mousquetaires " , NULL, id AS category_id FROM category WHERE name = " biscuits " ; '
2018-08-02 08:02:41 +00:00
>> > oreo = { ' categories_tags ' : [ ' en:sugary-snacks ' , ' en:biscuits-and-cakes ' , ' en:biscuits ' , ' en:chocolate-biscuits ' , ' es:sandwich-cookies ' ] , ' code ' : ' 8410000810004 ' , ' nutrition_grades ' : ' e ' , ' product_name ' : ' Biscuit Oreo ' , ' url ' : ' https://fr.openfoodfacts.org/product/8410000810004/ ' }
>> > sql_list_oreo = sql_generator ( oreo )
>> > sql_list_oreo [ 0 ]
2018-08-04 13:58:31 +00:00
" INSERT INTO category (`name`) VALUES ( ' biscuits ' ); "
2018-08-02 08:02:41 +00:00
>> > sql_list_oreo [ 1 ]
2018-08-04 13:58:31 +00:00
' INSERT INTO product (`name`, `code`, `url`, `nutrition_grades`, `category_id`) SELECT " Biscuit Oreo " , " 8410000810004 " , " https://fr.openfoodfacts.org/product/8410000810004/ " , " e " , id AS category_id FROM category WHERE name = " biscuits " ; '
>> > oreo_nutri_null = { ' categories_tags ' : [ ' en:sugary-snacks ' , ' en:biscuits-and-cakes ' , ' en:biscuits ' , ' en:chocolate-biscuits ' , ' es:sandwich-cookies ' ] , ' code ' : ' 8410000810004 ' , ' nutrition_grades ' : ' False ' , ' product_name ' : ' Biscuit Oreo ' , ' url ' : ' https://fr.openfoodfacts.org/product/8410000810004/ ' }
>> > sql_list_oreo_nutri_null = sql_generator ( oreo_nutri_null )
>> > sql_list_oreo_nutri_null [ 1 ]
' INSERT INTO product (`name`, `code`, `url`, `nutrition_grades`, `category_id`) SELECT " Biscuit Oreo " , " 8410000810004 " , " https://fr.openfoodfacts.org/product/8410000810004/ " , NULL, id AS category_id FROM category WHERE name = " biscuits " ; '
2018-08-02 08:02:41 +00:00
"""
sql_list = [ ]
2018-08-03 15:16:09 +00:00
insert_cat = " INSERT INTO category (`name`) VALUES ( ' {} ' ); "
2018-08-04 13:58:31 +00:00
insert_prod = """ INSERT INTO product (`name`, `code`, `url`, `nutrition_grades`, `category_id`) \
SELECT " {name} " , " {code} " , " {url} " , " {nutri} " , id AS category_id \
2018-08-02 08:02:41 +00:00
FROM category \
2018-08-03 15:16:09 +00:00
WHERE name = " {cat} " ; """
2018-08-02 08:02:41 +00:00
2018-08-04 14:26:34 +00:00
if staging_data is not False and ' category ' in staging_data . keys ( ) :
2018-08-02 08:02:41 +00:00
used_category = staging_data [ ' category ' ]
# insert category
sql_list . append ( insert_cat . format ( used_category ) )
# insert products
for idx , val in enumerate ( staging_data [ ' products ' ] ) :
sql_list . append (
insert_prod . format (
2018-08-04 13:58:31 +00:00
code = val [ ' _id ' ] ,
url = val [ ' url ' ] ,
name = val [ ' product_name ' ] ,
nutri = val [ ' nutrition_grades ' ] ,
2018-08-02 08:02:41 +00:00
cat = used_category
)
)
2018-08-04 14:26:34 +00:00
elif staging_data is not False and ' product_name ' in staging_data . keys ( ) :
2018-08-02 08:02:41 +00:00
used_category = pick_category ( staging_data [ ' categories_tags ' ] )
# insert category
sql_list . append ( insert_cat . format ( used_category ) )
sql_list . append (
insert_prod . format (
code = staging_data [ ' code ' ] ,
url = staging_data [ ' url ' ] ,
name = staging_data [ ' product_name ' ] ,
nutri = staging_data [ ' nutrition_grades ' ] ,
cat = used_category
)
)
else :
sql_list = False
2018-08-04 13:58:31 +00:00
if sql_list is not False :
sql_list = false_to_null ( sql_list )
2018-08-02 08:02:41 +00:00
return sql_list
2018-07-24 14:27:37 +00:00
if __name__ == " __main__ " :
import doctest
doctest . testmod ( )