416 lines
14 KiB
SQL
416 lines
14 KiB
SQL
-- ---------------------------------------------
|
|
-- Creates a local DB from the hudge CSV file --
|
|
-- ---------------------------------------------
|
|
|
|
DROP DATABASE IF EXISTS offlocal;
|
|
CREATE DATABASE offlocal CHARACTER SET 'utf8';
|
|
USE offlocal;
|
|
|
|
CREATE TABLE original_data(
|
|
`code` BIGINT UNSIGNED NOT NULL,
|
|
`url` TEXT(500),
|
|
`creator` VARCHAR(5),
|
|
`created_t` INT,
|
|
`created_datetime` VARCHAR(5),
|
|
`last_modified_t` INT,
|
|
`last_modified_datetime` VARCHAR(5),
|
|
`product_name` VARCHAR(200),
|
|
`generic_name` VARCHAR(5),
|
|
`quantity` VARCHAR(255),
|
|
`packaging` VARCHAR(5),
|
|
`packaging_tags` VARCHAR(5),
|
|
`brands` VARCHAR(255),
|
|
`brands_tags` VARCHAR(5),
|
|
`categories` VARCHAR(5),
|
|
`categories_tags` VARCHAR(5),
|
|
`categories_fr` TEXT(800),
|
|
`origins` VARCHAR(5),
|
|
`origins_tags` VARCHAR(5),
|
|
`manufacturing_places` VARCHAR(5),
|
|
`manufacturing_places_tags` VARCHAR(5),
|
|
`labels` VARCHAR(5),
|
|
`labels_tags` VARCHAR(5),
|
|
`labels_fr` TEXT,
|
|
`emb_codes` VARCHAR(5),
|
|
`emb_codes_tags` VARCHAR(5),
|
|
`first_packaging_code_geo` VARCHAR(5),
|
|
`cities` VARCHAR(5),
|
|
`cities_tags` VARCHAR(5),
|
|
`purchase_places` VARCHAR(100),
|
|
`stores` VARCHAR(255),
|
|
`countries` VARCHAR(5),
|
|
`countries_tags` VARCHAR(5),
|
|
`countries_fr` TEXT,
|
|
`ingredients_text` TEXT,
|
|
`allergens` VARCHAR(5),
|
|
`allergens_fr` VARCHAR(5),
|
|
`traces` VARCHAR(5),
|
|
`traces_tags` VARCHAR(5),
|
|
`traces_fr` VARCHAR(5),
|
|
`serving_size` VARCHAR(255),
|
|
`serving_quantity` TINYINT UNSIGNED,
|
|
`no_nutriments` VARCHAR(5),
|
|
`additives_n` VARCHAR(5),
|
|
`additives` VARCHAR(5),
|
|
`additives_tags` VARCHAR(5),
|
|
`additives_fr` TEXT,
|
|
`ingredients_from_palm_oil_n` TINYINT UNSIGNED,
|
|
`ingredients_from_palm_oil` VARCHAR(5),
|
|
`ingredients_from_palm_oil_tags` VARCHAR(5),
|
|
`ingredients_that_may_be_from_palm_oil_n` VARCHAR(5),
|
|
`ingredients_that_may_be_from_palm_oil` VARCHAR(5),
|
|
`ingredients_that_may_be_from_palm_oil_tags` VARCHAR(5),
|
|
`nutrition_grade_uk` VARCHAR(5),
|
|
`nutrition_grade_fr` VARCHAR(40),
|
|
`pnns_groups_1` VARCHAR(5),
|
|
`pnns_groups_2` VARCHAR(5),
|
|
`states` VARCHAR(5),
|
|
`states_tags` VARCHAR(5),
|
|
`states_fr` TEXT,
|
|
`main_category` VARCHAR(5),
|
|
`main_category_fr` VARCHAR(100),
|
|
`image_url` VARCHAR(5),
|
|
`image_small_url` VARCHAR(5),
|
|
`image_ingredients_url` VARCHAR(5),
|
|
`image_ingredients_small_url` VARCHAR(5),
|
|
`image_nutrition_url` VARCHAR(5),
|
|
`image_nutrition_small_url` VARCHAR(5),
|
|
`energy_100g` SMALLINT,
|
|
`energy-from-fat_100g` VARCHAR(5),
|
|
`fat_100g` DECIMAL(4,2),
|
|
`saturated-fat_100g` VARCHAR(5),
|
|
`butyric-acid_100g` VARCHAR(5),
|
|
`caproic-acid_100g` VARCHAR(5),
|
|
`caprylic-acid_100g` VARCHAR(5),
|
|
`capric-acid_100g` VARCHAR(5),
|
|
`lauric-acid_100g` VARCHAR(5),
|
|
`myristic-acid_100g` VARCHAR(5),
|
|
`palmitic-acid_100g` VARCHAR(5),
|
|
`stearic-acid_100g` VARCHAR(5),
|
|
`arachidic-acid_100g` VARCHAR(5),
|
|
`behenic-acid_100g` VARCHAR(5),
|
|
`lignoceric-acid_100g` VARCHAR(5),
|
|
`cerotic-acid_100g` VARCHAR(5),
|
|
`montanic-acid_100g` VARCHAR(5),
|
|
`melissic-acid_100g` VARCHAR(5),
|
|
`monounsaturated-fat_100g` VARCHAR(5),
|
|
`polyunsaturated-fat_100g` VARCHAR(5),
|
|
`omega-3-fat_100g` VARCHAR(5),
|
|
`alpha-linolenic-acid_100g` VARCHAR(5),
|
|
`eicosapentaenoic-acid_100g` VARCHAR(5),
|
|
`docosahexaenoic-acid_100g` VARCHAR(5),
|
|
`omega-6-fat_100g` VARCHAR(5),
|
|
`linoleic-acid_100g` VARCHAR(5),
|
|
`arachidonic-acid_100g` VARCHAR(5),
|
|
`gamma-linolenic-acid_100g` VARCHAR(5),
|
|
`dihomo-gamma-linolenic-acid_100g` VARCHAR(5),
|
|
`omega-9-fat_100g` VARCHAR(5),
|
|
`oleic-acid_100g` VARCHAR(5),
|
|
`elaidic-acid_100g` VARCHAR(5),
|
|
`gondoic-acid_100g` VARCHAR(5),
|
|
`mead-acid_100g` VARCHAR(5),
|
|
`erucic-acid_100g` VARCHAR(5),
|
|
`nervonic-acid_100g` VARCHAR(5),
|
|
`trans-fat_100g` VARCHAR(5),
|
|
`cholesterol_100g` VARCHAR(5),
|
|
`carbohydrates_100g` VARCHAR(5),
|
|
`sugars_100g` DECIMAL(4,2),
|
|
`sucrose_100g` VARCHAR(5),
|
|
`glucose_100g` VARCHAR(5),
|
|
`fructose_100g` VARCHAR(5),
|
|
`lactose_100g` VARCHAR(5),
|
|
`maltose_100g` VARCHAR(5),
|
|
`maltodextrins_100g` VARCHAR(5),
|
|
`starch_100g` VARCHAR(5),
|
|
`polyols_100g` VARCHAR(5),
|
|
`fiber_100g` VARCHAR(5),
|
|
`proteins_100g` VARCHAR(5),
|
|
`casein_100g` VARCHAR(5),
|
|
`serum-proteins_100g` VARCHAR(5),
|
|
`nucleotides_100g` VARCHAR(5),
|
|
`salt_100g` DECIMAL(4,2),
|
|
`sodium_100g` DECIMAL(4,2),
|
|
`alcohol_100g` DECIMAL(4,2),
|
|
`vitamin-a_100g` VARCHAR(5),
|
|
`beta-carotene_100g` VARCHAR(5),
|
|
`vitamin-d_100g` VARCHAR(5),
|
|
`vitamin-e_100g` VARCHAR(5),
|
|
`vitamin-k_100g` VARCHAR(5),
|
|
`vitamin-c_100g` VARCHAR(5),
|
|
`vitamin-b1_100g` VARCHAR(5),
|
|
`vitamin-b2_100g` VARCHAR(5),
|
|
`vitamin-pp_100g` VARCHAR(5),
|
|
`vitamin-b6_100g` VARCHAR(5),
|
|
`vitamin-b9_100g` VARCHAR(5),
|
|
`folates_100g` VARCHAR(5),
|
|
`vitamin-b12_100g` VARCHAR(5),
|
|
`biotin_100g` VARCHAR(5),
|
|
`pantothenic-acid_100g` VARCHAR(5),
|
|
`silica_100g` VARCHAR(5),
|
|
`bicarbonate_100g` VARCHAR(5),
|
|
`potassium_100g` VARCHAR(5),
|
|
`chloride_100g` VARCHAR(5),
|
|
`calcium_100g` VARCHAR(5),
|
|
`phosphorus_100g` VARCHAR(5),
|
|
`iron_100g` VARCHAR(5),
|
|
`magnesium_100g` VARCHAR(5),
|
|
`zinc_100g` VARCHAR(5),
|
|
`copper_100g` VARCHAR(5),
|
|
`manganese_100g` VARCHAR(5),
|
|
`fluoride_100g` VARCHAR(5),
|
|
`selenium_100g` VARCHAR(5),
|
|
`chromium_100g` VARCHAR(5),
|
|
`molybdenum_100g` VARCHAR(5),
|
|
`iodine_100g` VARCHAR(5),
|
|
`caffeine_100g` VARCHAR(5),
|
|
`taurine_100g` VARCHAR(5),
|
|
`ph_100g` VARCHAR(5),
|
|
`fruits-vegetables-nuts_100g` VARCHAR(5),
|
|
`fruits-vegetables-nuts-estimate_100g` VARCHAR(5),
|
|
`collagen-meat-protein-ratio_100g` VARCHAR(5),
|
|
`cocoa_100g` VARCHAR(5),
|
|
`chlorophyl_100g` VARCHAR(5),
|
|
`carbon-footprint_100g` VARCHAR(5),
|
|
`nutrition-score-fr_100g` TINYINT UNSIGNED,
|
|
`nutrition-score-uk_100g` VARCHAR(5),
|
|
`glycemic-index_100g` VARCHAR(5),
|
|
`water-hardness_100g` VARCHAR(5),
|
|
`choline_100g` VARCHAR(5),
|
|
`phylloquinone_100g` VARCHAR(5),
|
|
`beta-glucan_100g` VARCHAR(5),
|
|
`inositol_100g` VARCHAR(5),
|
|
`carnitine_100g` VARCHAR(5)
|
|
);
|
|
|
|
-- ------------------------------------------
|
|
-- Loads data from CSV file --
|
|
-- a smaller one could be used for testing --
|
|
-- ------------------------------------------
|
|
|
|
-- LOAD DATA LOCAL INFILE 'small-sample.csv'
|
|
LOAD DATA LOCAL INFILE 'fr.openfoodfacts.org.products.csv'
|
|
INTO TABLE original_data
|
|
IGNORE 1 LINES;
|
|
|
|
-- ----------------
|
|
-- Shows DB size --
|
|
-- ----------------
|
|
|
|
SELECT table_schema "Full database",
|
|
CONCAT(ROUND(SUM(data_length + index_length) / 1024 / 1024, 1)," MB") "Size"
|
|
FROM information_schema.tables
|
|
WHERE table_schema = "offlocal"
|
|
GROUP BY table_schema
|
|
|
|
UNION
|
|
|
|
SELECT "Rows in 'original_data table'", COUNT(*)
|
|
FROM offlocal.original_data;
|
|
|
|
|
|
-- ------------------------------------------------
|
|
-- Deletes all data not associated with 'France' --
|
|
-- ------------------------------------------------
|
|
|
|
DELETE FROM original_data
|
|
WHERE countries_fr NOT LIKE 'France%'
|
|
OR countries_fr NOT LIKE '%France'
|
|
OR countries_fr NOT LIKE '%France%';
|
|
|
|
|
|
-- ------------------------
|
|
-- Drops useless columns --
|
|
-- ------------------------
|
|
|
|
ALTER TABLE original_data
|
|
DROP COLUMN `creator`,
|
|
DROP COLUMN `created_datetime`,
|
|
DROP COLUMN `last_modified_datetime`,
|
|
DROP COLUMN `generic_name`,
|
|
DROP COLUMN `packaging_tags`,
|
|
DROP COLUMN `brands_tags`,
|
|
DROP COLUMN `categories`,
|
|
DROP COLUMN `categories_tags`,
|
|
DROP COLUMN `origins_tags`,
|
|
DROP COLUMN `manufacturing_places_tags`,
|
|
DROP COLUMN `labels`,
|
|
DROP COLUMN `labels_tags`,
|
|
DROP COLUMN `emb_codes`,
|
|
DROP COLUMN `emb_codes_tags`,
|
|
DROP COLUMN `first_packaging_code_geo`,
|
|
DROP COLUMN `cities`,
|
|
DROP COLUMN `cities_tags`,
|
|
DROP COLUMN `countries`,
|
|
DROP COLUMN `countries_tags`,
|
|
DROP COLUMN `countries_fr`,
|
|
DROP COLUMN `allergens`,
|
|
DROP COLUMN `allergens_fr`,
|
|
DROP COLUMN `traces`,
|
|
DROP COLUMN `traces_tags`,
|
|
DROP COLUMN `traces_fr`,
|
|
DROP COLUMN `no_nutriments`,
|
|
DROP COLUMN `additives_n`,
|
|
DROP COLUMN `additives`,
|
|
DROP COLUMN `additives_tags`,
|
|
DROP COLUMN `ingredients_from_palm_oil`,
|
|
DROP COLUMN `ingredients_from_palm_oil_tags`,
|
|
DROP COLUMN `ingredients_that_may_be_from_palm_oil`,
|
|
DROP COLUMN `nutrition_grade_uk`,
|
|
DROP COLUMN `pnns_groups_1`,
|
|
DROP COLUMN `pnns_groups_2`,
|
|
DROP COLUMN `states`,
|
|
DROP COLUMN `states_tags`,
|
|
DROP COLUMN `main_category`,
|
|
DROP COLUMN `image_url`,
|
|
DROP COLUMN `image_small_url`,
|
|
DROP COLUMN `image_ingredients_url`,
|
|
DROP COLUMN `image_ingredients_small_url`,
|
|
DROP COLUMN `image_nutrition_url`,
|
|
DROP COLUMN `image_nutrition_small_url`,
|
|
DROP COLUMN `energy-from-fat_100g`,
|
|
DROP COLUMN `saturated-fat_100g`,
|
|
DROP COLUMN `butyric-acid_100g`,
|
|
DROP COLUMN `caproic-acid_100g`,
|
|
DROP COLUMN `caprylic-acid_100g`,
|
|
DROP COLUMN `capric-acid_100g`,
|
|
DROP COLUMN `lauric-acid_100g`,
|
|
DROP COLUMN `myristic-acid_100g`,
|
|
DROP COLUMN `palmitic-acid_100g`,
|
|
DROP COLUMN `stearic-acid_100g`,
|
|
DROP COLUMN `arachidic-acid_100g`,
|
|
DROP COLUMN `behenic-acid_100g`,
|
|
DROP COLUMN `lignoceric-acid_100g`,
|
|
DROP COLUMN `cerotic-acid_100g`,
|
|
DROP COLUMN `montanic-acid_100g`,
|
|
DROP COLUMN `melissic-acid_100g`,
|
|
DROP COLUMN `monounsaturated-fat_100g`,
|
|
DROP COLUMN `polyunsaturated-fat_100g`,
|
|
DROP COLUMN `omega-3-fat_100g`,
|
|
DROP COLUMN `alpha-linolenic-acid_100g`,
|
|
DROP COLUMN `eicosapentaenoic-acid_100g`,
|
|
DROP COLUMN `docosahexaenoic-acid_100g`,
|
|
DROP COLUMN `omega-6-fat_100g`,
|
|
DROP COLUMN `linoleic-acid_100g`,
|
|
DROP COLUMN `arachidonic-acid_100g`,
|
|
DROP COLUMN `gamma-linolenic-acid_100g`,
|
|
DROP COLUMN `dihomo-gamma-linolenic-acid_100g`,
|
|
DROP COLUMN `omega-9-fat_100g`,
|
|
DROP COLUMN `oleic-acid_100g`,
|
|
DROP COLUMN `elaidic-acid_100g`,
|
|
DROP COLUMN `gondoic-acid_100g`,
|
|
DROP COLUMN `mead-acid_100g`,
|
|
DROP COLUMN `erucic-acid_100g`,
|
|
DROP COLUMN `nervonic-acid_100g`,
|
|
DROP COLUMN `trans-fat_100g`,
|
|
DROP COLUMN `cholesterol_100g`,
|
|
DROP COLUMN `carbohydrates_100g`,
|
|
DROP COLUMN `sucrose_100g`,
|
|
DROP COLUMN `glucose_100g`,
|
|
DROP COLUMN `fructose_100g`,
|
|
DROP COLUMN `lactose_100g`,
|
|
DROP COLUMN `maltose_100g`,
|
|
DROP COLUMN `maltodextrins_100g`,
|
|
DROP COLUMN `starch_100g`,
|
|
DROP COLUMN `polyols_100g`,
|
|
DROP COLUMN `fiber_100g`,
|
|
DROP COLUMN `proteins_100g`,
|
|
DROP COLUMN `casein_100g`,
|
|
DROP COLUMN `serum-proteins_100g`,
|
|
DROP COLUMN `nucleotides_100g`,
|
|
DROP COLUMN `vitamin-a_100g`,
|
|
DROP COLUMN `beta-carotene_100g`,
|
|
DROP COLUMN `vitamin-d_100g`,
|
|
DROP COLUMN `vitamin-e_100g`,
|
|
DROP COLUMN `vitamin-k_100g`,
|
|
DROP COLUMN `vitamin-c_100g`,
|
|
DROP COLUMN `vitamin-b1_100g`,
|
|
DROP COLUMN `vitamin-b2_100g`,
|
|
DROP COLUMN `vitamin-pp_100g`,
|
|
DROP COLUMN `vitamin-b6_100g`,
|
|
DROP COLUMN `vitamin-b9_100g`,
|
|
DROP COLUMN `folates_100g`,
|
|
DROP COLUMN `vitamin-b12_100g`,
|
|
DROP COLUMN `biotin_100g`,
|
|
DROP COLUMN `pantothenic-acid_100g`,
|
|
DROP COLUMN `silica_100g`,
|
|
DROP COLUMN `bicarbonate_100g`,
|
|
DROP COLUMN `potassium_100g`,
|
|
DROP COLUMN `chloride_100g`,
|
|
DROP COLUMN `calcium_100g`,
|
|
DROP COLUMN `phosphorus_100g`,
|
|
DROP COLUMN `iron_100g`,
|
|
DROP COLUMN `magnesium_100g`,
|
|
DROP COLUMN `zinc_100g`,
|
|
DROP COLUMN `copper_100g`,
|
|
DROP COLUMN `manganese_100g`,
|
|
DROP COLUMN `fluoride_100g`,
|
|
DROP COLUMN `selenium_100g`,
|
|
DROP COLUMN `chromium_100g`,
|
|
DROP COLUMN `molybdenum_100g`,
|
|
DROP COLUMN `iodine_100g`,
|
|
DROP COLUMN `caffeine_100g`,
|
|
DROP COLUMN `taurine_100g`,
|
|
DROP COLUMN `ph_100g`,
|
|
DROP COLUMN `fruits-vegetables-nuts_100g`,
|
|
DROP COLUMN `fruits-vegetables-nuts-estimate_100g`,
|
|
DROP COLUMN `collagen-meat-protein-ratio_100g`,
|
|
DROP COLUMN `cocoa_100g`,
|
|
DROP COLUMN `chlorophyl_100g`,
|
|
DROP COLUMN `carbon-footprint_100g`,
|
|
DROP COLUMN `nutrition-score-uk_100g`,
|
|
DROP COLUMN `glycemic-index_100g`,
|
|
DROP COLUMN `water-hardness_100g`,
|
|
DROP COLUMN `choline_100g`,
|
|
DROP COLUMN `phylloquinone_100g`,
|
|
DROP COLUMN `beta-glucan_100g`,
|
|
DROP COLUMN `inositol_100g`,
|
|
DROP COLUMN `carnitine_100g`;
|
|
|
|
|
|
-- ----------------
|
|
-- Shows DB size --
|
|
-- ----------------
|
|
|
|
SELECT table_schema "Lowered database",
|
|
CONCAT(ROUND(SUM(data_length + index_length) / 1024 / 1024, 1)," MB") "Size"
|
|
FROM information_schema.tables
|
|
WHERE table_schema = "offlocal"
|
|
GROUP BY table_schema
|
|
|
|
UNION
|
|
|
|
SELECT "Rows in 'original_data' table", COUNT(*)
|
|
FROM offlocal.original_data;
|
|
|
|
|
|
-- ----------------------
|
|
-- Fields ideal length --
|
|
-- ----------------------
|
|
|
|
SELECT
|
|
MAX(CHAR_LENGTH(`code`)) AS 'code',
|
|
MAX(CHAR_LENGTH(`url`)) AS 'url',
|
|
MAX(CHAR_LENGTH(`created_t`)) AS 'crea',
|
|
MAX(CHAR_LENGTH(`last_modified_t`)) AS 'last',
|
|
MAX(CHAR_LENGTH(`product_name`)) AS 'prod',
|
|
MAX(CHAR_LENGTH(`quantity`)) AS 'qtity',
|
|
MAX(CHAR_LENGTH(`categories_fr`)) AS 'cat',
|
|
MAX(CHAR_LENGTH(`labels_fr`)) AS 'lab',
|
|
MAX(CHAR_LENGTH(`purchase_places`)) AS 'purc',
|
|
MAX(CHAR_LENGTH(`stores`)) AS 'stor',
|
|
MAX(CHAR_LENGTH(`ingredients_text`)) AS 'ingr',
|
|
MAX(CHAR_LENGTH(`serving_size`)) AS 's_siz',
|
|
MAX(CHAR_LENGTH(`serving_quantity`)) AS 's_qua',
|
|
MAX(CHAR_LENGTH(`additives_fr`)) AS 'add',
|
|
MAX(CHAR_LENGTH(`nutrition_grade_fr`)) AS 'nut_gr',
|
|
MAX(CHAR_LENGTH(`states_fr`)) AS 'state',
|
|
MAX(CHAR_LENGTH(`main_category_fr`)) AS 'm_cat',
|
|
MAX(CHAR_LENGTH(`energy_100g`)) AS 'energy',
|
|
MAX(CHAR_LENGTH(`fat_100g`)) AS 'fat',
|
|
MAX(CHAR_LENGTH(`sugars_100g`)) AS 'sugars',
|
|
MAX(CHAR_LENGTH(`salt_100g`)) AS 'salt',
|
|
MAX(CHAR_LENGTH(`sodium_100g`)) AS 'sodium',
|
|
MAX(CHAR_LENGTH(`alcohol_100g`)) AS 'alcohol',
|
|
MAX(CHAR_LENGTH(`nutrition-score-fr_100g`)) AS 'nut-sc'
|
|
FROM original_data;
|