ocp5/legacy-sql/create-db-offlocal.sql

416 lines
14 KiB
SQL

-- ---------------------------------------------
-- Creates a local DB from the hudge CSV file --
-- ---------------------------------------------
DROP DATABASE IF EXISTS offlocal;
CREATE DATABASE offlocal CHARACTER SET 'utf8';
USE offlocal;
CREATE TABLE original_data(
`code` BIGINT UNSIGNED NOT NULL,
`url` TEXT(500),
`creator` VARCHAR(5),
`created_t` INT,
`created_datetime` VARCHAR(5),
`last_modified_t` INT,
`last_modified_datetime` VARCHAR(5),
`product_name` VARCHAR(200),
`generic_name` VARCHAR(5),
`quantity` VARCHAR(255),
`packaging` VARCHAR(5),
`packaging_tags` VARCHAR(5),
`brands` VARCHAR(255),
`brands_tags` VARCHAR(5),
`categories` VARCHAR(5),
`categories_tags` VARCHAR(5),
`categories_fr` TEXT(800),
`origins` VARCHAR(5),
`origins_tags` VARCHAR(5),
`manufacturing_places` VARCHAR(5),
`manufacturing_places_tags` VARCHAR(5),
`labels` VARCHAR(5),
`labels_tags` VARCHAR(5),
`labels_fr` TEXT,
`emb_codes` VARCHAR(5),
`emb_codes_tags` VARCHAR(5),
`first_packaging_code_geo` VARCHAR(5),
`cities` VARCHAR(5),
`cities_tags` VARCHAR(5),
`purchase_places` VARCHAR(100),
`stores` VARCHAR(255),
`countries` VARCHAR(5),
`countries_tags` VARCHAR(5),
`countries_fr` TEXT,
`ingredients_text` TEXT,
`allergens` VARCHAR(5),
`allergens_fr` VARCHAR(5),
`traces` VARCHAR(5),
`traces_tags` VARCHAR(5),
`traces_fr` VARCHAR(5),
`serving_size` VARCHAR(255),
`serving_quantity` TINYINT UNSIGNED,
`no_nutriments` VARCHAR(5),
`additives_n` VARCHAR(5),
`additives` VARCHAR(5),
`additives_tags` VARCHAR(5),
`additives_fr` TEXT,
`ingredients_from_palm_oil_n` TINYINT UNSIGNED,
`ingredients_from_palm_oil` VARCHAR(5),
`ingredients_from_palm_oil_tags` VARCHAR(5),
`ingredients_that_may_be_from_palm_oil_n` VARCHAR(5),
`ingredients_that_may_be_from_palm_oil` VARCHAR(5),
`ingredients_that_may_be_from_palm_oil_tags` VARCHAR(5),
`nutrition_grade_uk` VARCHAR(5),
`nutrition_grade_fr` VARCHAR(40),
`pnns_groups_1` VARCHAR(5),
`pnns_groups_2` VARCHAR(5),
`states` VARCHAR(5),
`states_tags` VARCHAR(5),
`states_fr` TEXT,
`main_category` VARCHAR(5),
`main_category_fr` VARCHAR(100),
`image_url` VARCHAR(5),
`image_small_url` VARCHAR(5),
`image_ingredients_url` VARCHAR(5),
`image_ingredients_small_url` VARCHAR(5),
`image_nutrition_url` VARCHAR(5),
`image_nutrition_small_url` VARCHAR(5),
`energy_100g` SMALLINT,
`energy-from-fat_100g` VARCHAR(5),
`fat_100g` DECIMAL(4,2),
`saturated-fat_100g` VARCHAR(5),
`butyric-acid_100g` VARCHAR(5),
`caproic-acid_100g` VARCHAR(5),
`caprylic-acid_100g` VARCHAR(5),
`capric-acid_100g` VARCHAR(5),
`lauric-acid_100g` VARCHAR(5),
`myristic-acid_100g` VARCHAR(5),
`palmitic-acid_100g` VARCHAR(5),
`stearic-acid_100g` VARCHAR(5),
`arachidic-acid_100g` VARCHAR(5),
`behenic-acid_100g` VARCHAR(5),
`lignoceric-acid_100g` VARCHAR(5),
`cerotic-acid_100g` VARCHAR(5),
`montanic-acid_100g` VARCHAR(5),
`melissic-acid_100g` VARCHAR(5),
`monounsaturated-fat_100g` VARCHAR(5),
`polyunsaturated-fat_100g` VARCHAR(5),
`omega-3-fat_100g` VARCHAR(5),
`alpha-linolenic-acid_100g` VARCHAR(5),
`eicosapentaenoic-acid_100g` VARCHAR(5),
`docosahexaenoic-acid_100g` VARCHAR(5),
`omega-6-fat_100g` VARCHAR(5),
`linoleic-acid_100g` VARCHAR(5),
`arachidonic-acid_100g` VARCHAR(5),
`gamma-linolenic-acid_100g` VARCHAR(5),
`dihomo-gamma-linolenic-acid_100g` VARCHAR(5),
`omega-9-fat_100g` VARCHAR(5),
`oleic-acid_100g` VARCHAR(5),
`elaidic-acid_100g` VARCHAR(5),
`gondoic-acid_100g` VARCHAR(5),
`mead-acid_100g` VARCHAR(5),
`erucic-acid_100g` VARCHAR(5),
`nervonic-acid_100g` VARCHAR(5),
`trans-fat_100g` VARCHAR(5),
`cholesterol_100g` VARCHAR(5),
`carbohydrates_100g` VARCHAR(5),
`sugars_100g` DECIMAL(4,2),
`sucrose_100g` VARCHAR(5),
`glucose_100g` VARCHAR(5),
`fructose_100g` VARCHAR(5),
`lactose_100g` VARCHAR(5),
`maltose_100g` VARCHAR(5),
`maltodextrins_100g` VARCHAR(5),
`starch_100g` VARCHAR(5),
`polyols_100g` VARCHAR(5),
`fiber_100g` VARCHAR(5),
`proteins_100g` VARCHAR(5),
`casein_100g` VARCHAR(5),
`serum-proteins_100g` VARCHAR(5),
`nucleotides_100g` VARCHAR(5),
`salt_100g` DECIMAL(4,2),
`sodium_100g` DECIMAL(4,2),
`alcohol_100g` DECIMAL(4,2),
`vitamin-a_100g` VARCHAR(5),
`beta-carotene_100g` VARCHAR(5),
`vitamin-d_100g` VARCHAR(5),
`vitamin-e_100g` VARCHAR(5),
`vitamin-k_100g` VARCHAR(5),
`vitamin-c_100g` VARCHAR(5),
`vitamin-b1_100g` VARCHAR(5),
`vitamin-b2_100g` VARCHAR(5),
`vitamin-pp_100g` VARCHAR(5),
`vitamin-b6_100g` VARCHAR(5),
`vitamin-b9_100g` VARCHAR(5),
`folates_100g` VARCHAR(5),
`vitamin-b12_100g` VARCHAR(5),
`biotin_100g` VARCHAR(5),
`pantothenic-acid_100g` VARCHAR(5),
`silica_100g` VARCHAR(5),
`bicarbonate_100g` VARCHAR(5),
`potassium_100g` VARCHAR(5),
`chloride_100g` VARCHAR(5),
`calcium_100g` VARCHAR(5),
`phosphorus_100g` VARCHAR(5),
`iron_100g` VARCHAR(5),
`magnesium_100g` VARCHAR(5),
`zinc_100g` VARCHAR(5),
`copper_100g` VARCHAR(5),
`manganese_100g` VARCHAR(5),
`fluoride_100g` VARCHAR(5),
`selenium_100g` VARCHAR(5),
`chromium_100g` VARCHAR(5),
`molybdenum_100g` VARCHAR(5),
`iodine_100g` VARCHAR(5),
`caffeine_100g` VARCHAR(5),
`taurine_100g` VARCHAR(5),
`ph_100g` VARCHAR(5),
`fruits-vegetables-nuts_100g` VARCHAR(5),
`fruits-vegetables-nuts-estimate_100g` VARCHAR(5),
`collagen-meat-protein-ratio_100g` VARCHAR(5),
`cocoa_100g` VARCHAR(5),
`chlorophyl_100g` VARCHAR(5),
`carbon-footprint_100g` VARCHAR(5),
`nutrition-score-fr_100g` TINYINT UNSIGNED,
`nutrition-score-uk_100g` VARCHAR(5),
`glycemic-index_100g` VARCHAR(5),
`water-hardness_100g` VARCHAR(5),
`choline_100g` VARCHAR(5),
`phylloquinone_100g` VARCHAR(5),
`beta-glucan_100g` VARCHAR(5),
`inositol_100g` VARCHAR(5),
`carnitine_100g` VARCHAR(5)
);
-- ------------------------------------------
-- Loads data from CSV file --
-- a smaller one could be used for testing --
-- ------------------------------------------
-- LOAD DATA LOCAL INFILE 'small-sample.csv'
LOAD DATA LOCAL INFILE 'fr.openfoodfacts.org.products.csv'
INTO TABLE original_data
IGNORE 1 LINES;
-- ----------------
-- Shows DB size --
-- ----------------
SELECT table_schema "Full database",
CONCAT(ROUND(SUM(data_length + index_length) / 1024 / 1024, 1)," MB") "Size"
FROM information_schema.tables
WHERE table_schema = "offlocal"
GROUP BY table_schema
UNION
SELECT "Rows in 'original_data table'", COUNT(*)
FROM offlocal.original_data;
-- ------------------------------------------------
-- Deletes all data not associated with 'France' --
-- ------------------------------------------------
DELETE FROM original_data
WHERE countries_fr NOT LIKE 'France%'
OR countries_fr NOT LIKE '%France'
OR countries_fr NOT LIKE '%France%';
-- ------------------------
-- Drops useless columns --
-- ------------------------
ALTER TABLE original_data
DROP COLUMN `creator`,
DROP COLUMN `created_datetime`,
DROP COLUMN `last_modified_datetime`,
DROP COLUMN `generic_name`,
DROP COLUMN `packaging_tags`,
DROP COLUMN `brands_tags`,
DROP COLUMN `categories`,
DROP COLUMN `categories_tags`,
DROP COLUMN `origins_tags`,
DROP COLUMN `manufacturing_places_tags`,
DROP COLUMN `labels`,
DROP COLUMN `labels_tags`,
DROP COLUMN `emb_codes`,
DROP COLUMN `emb_codes_tags`,
DROP COLUMN `first_packaging_code_geo`,
DROP COLUMN `cities`,
DROP COLUMN `cities_tags`,
DROP COLUMN `countries`,
DROP COLUMN `countries_tags`,
DROP COLUMN `countries_fr`,
DROP COLUMN `allergens`,
DROP COLUMN `allergens_fr`,
DROP COLUMN `traces`,
DROP COLUMN `traces_tags`,
DROP COLUMN `traces_fr`,
DROP COLUMN `no_nutriments`,
DROP COLUMN `additives_n`,
DROP COLUMN `additives`,
DROP COLUMN `additives_tags`,
DROP COLUMN `ingredients_from_palm_oil`,
DROP COLUMN `ingredients_from_palm_oil_tags`,
DROP COLUMN `ingredients_that_may_be_from_palm_oil`,
DROP COLUMN `nutrition_grade_uk`,
DROP COLUMN `pnns_groups_1`,
DROP COLUMN `pnns_groups_2`,
DROP COLUMN `states`,
DROP COLUMN `states_tags`,
DROP COLUMN `main_category`,
DROP COLUMN `image_url`,
DROP COLUMN `image_small_url`,
DROP COLUMN `image_ingredients_url`,
DROP COLUMN `image_ingredients_small_url`,
DROP COLUMN `image_nutrition_url`,
DROP COLUMN `image_nutrition_small_url`,
DROP COLUMN `energy-from-fat_100g`,
DROP COLUMN `saturated-fat_100g`,
DROP COLUMN `butyric-acid_100g`,
DROP COLUMN `caproic-acid_100g`,
DROP COLUMN `caprylic-acid_100g`,
DROP COLUMN `capric-acid_100g`,
DROP COLUMN `lauric-acid_100g`,
DROP COLUMN `myristic-acid_100g`,
DROP COLUMN `palmitic-acid_100g`,
DROP COLUMN `stearic-acid_100g`,
DROP COLUMN `arachidic-acid_100g`,
DROP COLUMN `behenic-acid_100g`,
DROP COLUMN `lignoceric-acid_100g`,
DROP COLUMN `cerotic-acid_100g`,
DROP COLUMN `montanic-acid_100g`,
DROP COLUMN `melissic-acid_100g`,
DROP COLUMN `monounsaturated-fat_100g`,
DROP COLUMN `polyunsaturated-fat_100g`,
DROP COLUMN `omega-3-fat_100g`,
DROP COLUMN `alpha-linolenic-acid_100g`,
DROP COLUMN `eicosapentaenoic-acid_100g`,
DROP COLUMN `docosahexaenoic-acid_100g`,
DROP COLUMN `omega-6-fat_100g`,
DROP COLUMN `linoleic-acid_100g`,
DROP COLUMN `arachidonic-acid_100g`,
DROP COLUMN `gamma-linolenic-acid_100g`,
DROP COLUMN `dihomo-gamma-linolenic-acid_100g`,
DROP COLUMN `omega-9-fat_100g`,
DROP COLUMN `oleic-acid_100g`,
DROP COLUMN `elaidic-acid_100g`,
DROP COLUMN `gondoic-acid_100g`,
DROP COLUMN `mead-acid_100g`,
DROP COLUMN `erucic-acid_100g`,
DROP COLUMN `nervonic-acid_100g`,
DROP COLUMN `trans-fat_100g`,
DROP COLUMN `cholesterol_100g`,
DROP COLUMN `carbohydrates_100g`,
DROP COLUMN `sucrose_100g`,
DROP COLUMN `glucose_100g`,
DROP COLUMN `fructose_100g`,
DROP COLUMN `lactose_100g`,
DROP COLUMN `maltose_100g`,
DROP COLUMN `maltodextrins_100g`,
DROP COLUMN `starch_100g`,
DROP COLUMN `polyols_100g`,
DROP COLUMN `fiber_100g`,
DROP COLUMN `proteins_100g`,
DROP COLUMN `casein_100g`,
DROP COLUMN `serum-proteins_100g`,
DROP COLUMN `nucleotides_100g`,
DROP COLUMN `vitamin-a_100g`,
DROP COLUMN `beta-carotene_100g`,
DROP COLUMN `vitamin-d_100g`,
DROP COLUMN `vitamin-e_100g`,
DROP COLUMN `vitamin-k_100g`,
DROP COLUMN `vitamin-c_100g`,
DROP COLUMN `vitamin-b1_100g`,
DROP COLUMN `vitamin-b2_100g`,
DROP COLUMN `vitamin-pp_100g`,
DROP COLUMN `vitamin-b6_100g`,
DROP COLUMN `vitamin-b9_100g`,
DROP COLUMN `folates_100g`,
DROP COLUMN `vitamin-b12_100g`,
DROP COLUMN `biotin_100g`,
DROP COLUMN `pantothenic-acid_100g`,
DROP COLUMN `silica_100g`,
DROP COLUMN `bicarbonate_100g`,
DROP COLUMN `potassium_100g`,
DROP COLUMN `chloride_100g`,
DROP COLUMN `calcium_100g`,
DROP COLUMN `phosphorus_100g`,
DROP COLUMN `iron_100g`,
DROP COLUMN `magnesium_100g`,
DROP COLUMN `zinc_100g`,
DROP COLUMN `copper_100g`,
DROP COLUMN `manganese_100g`,
DROP COLUMN `fluoride_100g`,
DROP COLUMN `selenium_100g`,
DROP COLUMN `chromium_100g`,
DROP COLUMN `molybdenum_100g`,
DROP COLUMN `iodine_100g`,
DROP COLUMN `caffeine_100g`,
DROP COLUMN `taurine_100g`,
DROP COLUMN `ph_100g`,
DROP COLUMN `fruits-vegetables-nuts_100g`,
DROP COLUMN `fruits-vegetables-nuts-estimate_100g`,
DROP COLUMN `collagen-meat-protein-ratio_100g`,
DROP COLUMN `cocoa_100g`,
DROP COLUMN `chlorophyl_100g`,
DROP COLUMN `carbon-footprint_100g`,
DROP COLUMN `nutrition-score-uk_100g`,
DROP COLUMN `glycemic-index_100g`,
DROP COLUMN `water-hardness_100g`,
DROP COLUMN `choline_100g`,
DROP COLUMN `phylloquinone_100g`,
DROP COLUMN `beta-glucan_100g`,
DROP COLUMN `inositol_100g`,
DROP COLUMN `carnitine_100g`;
-- ----------------
-- Shows DB size --
-- ----------------
SELECT table_schema "Lowered database",
CONCAT(ROUND(SUM(data_length + index_length) / 1024 / 1024, 1)," MB") "Size"
FROM information_schema.tables
WHERE table_schema = "offlocal"
GROUP BY table_schema
UNION
SELECT "Rows in 'original_data' table", COUNT(*)
FROM offlocal.original_data;
-- ----------------------
-- Fields ideal length --
-- ----------------------
SELECT
MAX(CHAR_LENGTH(`code`)) AS 'code',
MAX(CHAR_LENGTH(`url`)) AS 'url',
MAX(CHAR_LENGTH(`created_t`)) AS 'crea',
MAX(CHAR_LENGTH(`last_modified_t`)) AS 'last',
MAX(CHAR_LENGTH(`product_name`)) AS 'prod',
MAX(CHAR_LENGTH(`quantity`)) AS 'qtity',
MAX(CHAR_LENGTH(`categories_fr`)) AS 'cat',
MAX(CHAR_LENGTH(`labels_fr`)) AS 'lab',
MAX(CHAR_LENGTH(`purchase_places`)) AS 'purc',
MAX(CHAR_LENGTH(`stores`)) AS 'stor',
MAX(CHAR_LENGTH(`ingredients_text`)) AS 'ingr',
MAX(CHAR_LENGTH(`serving_size`)) AS 's_siz',
MAX(CHAR_LENGTH(`serving_quantity`)) AS 's_qua',
MAX(CHAR_LENGTH(`additives_fr`)) AS 'add',
MAX(CHAR_LENGTH(`nutrition_grade_fr`)) AS 'nut_gr',
MAX(CHAR_LENGTH(`states_fr`)) AS 'state',
MAX(CHAR_LENGTH(`main_category_fr`)) AS 'm_cat',
MAX(CHAR_LENGTH(`energy_100g`)) AS 'energy',
MAX(CHAR_LENGTH(`fat_100g`)) AS 'fat',
MAX(CHAR_LENGTH(`sugars_100g`)) AS 'sugars',
MAX(CHAR_LENGTH(`salt_100g`)) AS 'salt',
MAX(CHAR_LENGTH(`sodium_100g`)) AS 'sodium',
MAX(CHAR_LENGTH(`alcohol_100g`)) AS 'alcohol',
MAX(CHAR_LENGTH(`nutrition-score-fr_100g`)) AS 'nut-sc'
FROM original_data;