From 42618ac95cee57bfb901ccc1a12b8c1824a335ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gigandet?= Date: Tue, 10 Dec 2024 12:12:04 +0100 Subject: [PATCH] fix: improve parsing of 'category (type 1, type 2..)' ingredients (#10999) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR to better handle things like "vegetal oil (palm, rapeseed)": - instead of turning "vegetal oil (palm, rapeseed)" to "palm vegetal oil", "rapeseed vegetal oil", we now turn it to "vegetal oil (palm vegetal oil, rapeseed vegetal oil)", as keeping a parent ingredient is better for ingredient percent estimation - improved the definition of all the variations of "huile et stéarine végétales non hydrogénées (colza, palme)" to have better coverage - added support for percentages like "huiles végétales 54% (colza, palme)" Work in progress, some tests will need to be updated. --------- Co-authored-by: Open Food Facts Bot Co-authored-by: Pierre Slamich Co-authored-by: Alex Garel Co-authored-by: Alex Garel --- .github/labeler.yml | 33 +- lib/ProductOpener/Ingredients.pm | 305 +++++++++++------- stop_words.txt | 2 + .../ingredients/en-category-types.json | 122 ++++--- .../fr-infinite-loop-allergens.json | 100 +++--- .../ingredients/fr-marmelade.json | 162 +++++----- .../ingredients/fr-percents-origins-2.json | 33 +- .../ingredients/ru-russian-oil.json | 109 ++++--- .../ingredients_preparsing/10.json | 2 +- .../ingredients_preparsing/106.json | 2 +- .../ingredients_preparsing/108.json | 2 +- .../ingredients_preparsing/11.json | 2 +- .../ingredients_preparsing/110.json | 2 +- .../ingredients_preparsing/111.json | 2 +- .../ingredients_preparsing/112.json | 2 +- .../ingredients_preparsing/12.json | 2 +- .../ingredients_preparsing/121.json | 2 +- .../ingredients_preparsing/13.json | 2 +- .../ingredients_preparsing/14.json | 2 +- .../ingredients_preparsing/177.json | 2 +- .../ingredients_preparsing/178.json | 2 +- .../ingredients_preparsing/179.json | 
2 +- .../ingredients_preparsing/180.json | 2 +- .../ingredients_preparsing/181.json | 2 +- .../ingredients_preparsing/182.json | 2 +- .../ingredients_preparsing/183.json | 2 +- .../ingredients_preparsing/184.json | 2 +- .../ingredients_preparsing/185.json | 2 +- .../ingredients_preparsing/186.json | 2 +- .../ingredients_preparsing/187.json | 2 +- .../ingredients_preparsing/188.json | 2 +- .../ingredients_preparsing/189.json | 2 +- .../ingredients_preparsing/190.json | 2 +- .../ingredients_preparsing/191.json | 2 +- .../ingredients_preparsing/192.json | 2 +- .../ingredients_preparsing/193.json | 2 +- .../ingredients_preparsing/194.json | 2 +- .../ingredients_preparsing/195.json | 2 +- .../ingredients_preparsing/196.json | 2 +- .../ingredients_preparsing/20.json | 2 +- .../ingredients_preparsing/205.json | 2 +- .../ingredients_preparsing/206.json | 2 +- .../ingredients_preparsing/207.json | 2 +- .../ingredients_preparsing/208.json | 2 +- .../ingredients_preparsing/21.json | 2 +- .../ingredients_preparsing/211.json | 2 +- .../ingredients_preparsing/212.json | 2 +- .../ingredients_preparsing/213.json | 2 +- .../ingredients_preparsing/214.json | 2 +- .../ingredients_preparsing/215.json | 2 +- .../ingredients_preparsing/216.json | 2 +- .../ingredients_preparsing/217.json | 2 +- .../ingredients_preparsing/218.json | 2 +- .../ingredients_preparsing/219.json | 2 +- .../ingredients_preparsing/22.json | 2 +- .../ingredients_preparsing/220.json | 2 +- .../ingredients_preparsing/222.json | 2 +- .../ingredients_preparsing/224.json | 2 +- .../ingredients_preparsing/225.json | 6 + .../ingredients_preparsing/226.json | 6 + .../ingredients_preparsing/227.json | 6 + .../ingredients_preparsing/228.json | 6 + .../ingredients_preparsing/229.json | 6 + .../ingredients_preparsing/23.json | 2 +- .../ingredients_preparsing/24.json | 2 +- .../ingredients_preparsing/6.json | 2 +- .../ingredients_preparsing/7.json | 2 +- .../ingredients_preparsing/77.json | 2 +- 
.../ingredients_preparsing/78.json | 2 +- .../ingredients_preparsing/8.json | 2 +- .../ingredients_preparsing/85.json | 2 +- .../ingredients_preparsing/9.json | 2 +- .../ingredients_preparsing/98.json | 2 +- tests/unit/ingredients_preparsing.t | 39 ++- tests/unit/ingredients_processing.t | 14 +- tests/unit/ingredients_tags.t | 9 +- 76 files changed, 666 insertions(+), 412 deletions(-) create mode 100644 tests/unit/expected_test_results/ingredients_preparsing/225.json create mode 100644 tests/unit/expected_test_results/ingredients_preparsing/226.json create mode 100644 tests/unit/expected_test_results/ingredients_preparsing/227.json create mode 100644 tests/unit/expected_test_results/ingredients_preparsing/228.json create mode 100644 tests/unit/expected_test_results/ingredients_preparsing/229.json diff --git a/.github/labeler.yml b/.github/labeler.yml index babefa64c92c8..05ff365ea71b1 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -505,21 +505,6 @@ Data import: - any-glob-to-any-file: 'cgi/generate_sample_import_file.pl' # https://openfoodfacts.github.io/openfoodfacts-server/dev/ref-perl-pod/ProductOpener/Ingredients.html -🥗 Ingredients: -- changed-files: - - any-glob-to-any-file: 'lib/ProductOpener/Ingredients.pm' - - any-glob-to-any-file: 'taxonomies/food/ingredients.txt' - - any-glob-to-any-file: 'tests/unit/ingredients.t' - - any-glob-to-any-file: 'tests/unit/ingredients_analysis.t' - - any-glob-to-any-file: 'tests/unit/ingredients_clean.t' - - any-glob-to-any-file: 'tests/unit/ingredients_nesting.t' - - any-glob-to-any-file: 'tests/unit/ingredients_parsing.t' - - any-glob-to-any-file: 'tests/unit/ingredients_parsing_todo.t' - - any-glob-to-any-file: 'tests/unit/ingredients_percent.t' - - any-glob-to-any-file: 'tests/unit/ingredients_processing.t' - - any-glob-to-any-file: 'tests/unit/ingredients_tags.t' - - any-glob-to-any-file: 'scripts/test_ingredient_parser.pl' - # We want to improve the analysis of ingredient list to extract ingredients and 
their properties, across languages. # This is helpful to determine if a product is vegan, vegetarian, contains palm oil, is kosher/halal, the exact Nutri-Score, how much environmental impact it has… # https://wiki.openfoodfacts.org/Ingredients_Extraction_and_Analysis @@ -538,7 +523,23 @@ Data import: - any-glob-to-any-file: 'scripts/extract_individual_ingredients.pl' - any-glob-to-any-file: 'scripts/aggregate_ingredients.pl' - any-glob-to-any-file: 'lib/ProductOpener/Ingredients.pm' - + - any-glob-to-any-file: 'tests/unit/ingredients_parsing.t' + - any-glob-to-any-file: 'lib/ProductOpener/Ingredients.pm' + - any-glob-to-any-file: 'taxonomies/food/ingredients.txt' + - any-glob-to-any-file: 'tests/unit/ingredients.t' + - any-glob-to-any-file: 'tests/unit/ingredients_analysis.t' + - any-glob-to-any-file: 'tests/unit/ingredients_clean.t' + - any-glob-to-any-file: 'tests/unit/ingredients_nesting.t' + - any-glob-to-any-file: 'tests/unit/ingredients_parsing_todo.t' + - any-glob-to-any-file: 'tests/unit/ingredients_percent.t' + - any-glob-to-any-file: 'tests/unit/ingredients_processing.t' + - any-glob-to-any-file: 'tests/unit/ingredients_tags.t' + - any-glob-to-any-file: 'scripts/test_ingredient_parser.pl' + - any-glob-to-any-file: 'tests/unit/expected_test_results/ingredients/en-category-types.json' + - any-glob-to-any-file: 'tests/unit/expected_test_results/ingredients/fr-infinite-loop-allergens.json' + - any-glob-to-any-file: 'tests/unit/expected_test_results/ingredients/fr-marmelade.json' + - any-glob-to-any-file: 'tests/unit/expected_test_results/ingredients/fr-percents-origins-2.json' + - any-glob-to-any-file: 'tests/unit/expected_test_results/ingredients/ru-russian-oil.json' # Labels are all claims present on product packages. 
# https://wiki.openfoodfacts.org/Labels # Tracking issue: diff --git a/lib/ProductOpener/Ingredients.pm b/lib/ProductOpener/Ingredients.pm index 604851118e571..3f66071f994b0 100644 --- a/lib/ProductOpener/Ingredients.pm +++ b/lib/ProductOpener/Ingredients.pm @@ -174,6 +174,10 @@ my $separators_except_comma = qr/(;|:|$middle_dot|\[|\{|\(|\N{U+FF08}|( $dashes my $separators = qr/($stops\s|$commas|$separators_except_comma)/i; +# Symbols to indicate labels like organic, fairtrade etc. +my @symbols = ('\*\*\*', '\*\*', '\*', '°°°', '°°', '°', '\(1\)', '\(2\)', '¹', '²'); +my $symbols_regexp = join('|', @symbols); + # do not add sub ( ) in the regexps below as it would change which parts gets matched in $1, $2 etc. in other regexps that use those regexps # put the longest strings first, so that we can match "possible traces" before "traces" my %may_contain_regexps = ( @@ -533,6 +537,84 @@ my %of_finished_product = ( sv => " sylt", ); +=head1 FUNCTIONS + +=head2 init_percent_or_quantity_regexps($ingredients_lc) - initialize regular expressions needed for ingredients parsing + +This function creates regular expressions that match quantities or percent of an ingredient, +including localized strings like "minimum" + +=cut + +# prepared with +my %prepared_with = ( + en => "(?:made|prepared|produced) with", + da => "fremstillet af", + es => "elabora con", + fr => "(?:(?:é|e)labor(?:é|e)|fabriqu(?:é|e)|pr(?:é|e)par(?:é|e)|produit)(?:e)?(?:s)? 
(?:avec|à partir)", + hr => "(?:proizvedeno od|sadrži)", + nl => "bereid met", + sv => "är", +); + +my %min_regexp = ( + en => "min|min\.|minimum", + ca => "min|min\.|mín|mín\.|mínim|minim", + es => "min|min\.|mín|mín\.|mínimo|minimo|minimum", + fr => "min|min\.|mini|minimum", + hr => "min|min\.|mini|minimum", + pl => "min|min\.|minimum", +); + +my %max_regexp = ( + en => "max|max\.|maximum", + ca => "max|max\.|màxim", + es => "max|max\.|máximo", + fr => "max|max\.|maxi|maximum", + hr => "max|max\.|maxi|maximum", + pl => "max|max\.|maximum", +); + +# Words that can be ignored after a percent +# e.g. 50% du poids total, 30% of the total weight +# groups need to be non-capturing: prefixed with (?: + +my %ignore_strings_after_percent = ( + en => "of (?:the )?(?:total weight|grain is wholegrain rye)", + es => "(?:en el chocolate(?: con leche)?)", + fi => "jauhojen määrästä", + fr => "(?:dans le chocolat(?: (?:blanc|noir|au lait))?)|(?:du poids total|du poids)", + sv => "fetthalt", +); + +my %percent_or_quantity_regexps = (); + +sub init_percent_or_quantity_regexps($ingredients_lc) { + + if (not exists $percent_or_quantity_regexps{$ingredients_lc}) { + + my $prepared_with = $prepared_with{$ingredients_lc} || '', + + my $min_regexp = $min_regexp{$ingredients_lc} || ''; + + my $max_regexp = $max_regexp{$ingredients_lc} || ''; + + my $ignore_strings_after_percent = $ignore_strings_after_percent{$ingredients_lc} || ''; + + # Regular expression to find percent or quantities + # $percent_or_quantity_regexp has 2 capturing group: one for the number, and one for the % sign or the unit + $percent_or_quantity_regexps{$ingredients_lc} = '(?:' . "(?:$prepared_with )" . ' )?' # optional produced with + . '(?:>|' . $max_regexp . '|<|' . $min_regexp . '|\s|\.|:)*' # optional maximum, minimum, and separators + . '(?:\d+(?:[,.]\d+)?\s*-\s*?)?' # number+hyphens, first part (10-) of "10-12%" + . '(\d+(?:(?:\,|\.)\d+)?)\s*' # number, possibly with a dot or comma + . 
'(\%|g|gr|mg|kg|ml|cl|dl|l)\s*' # % or unit + . '(?:' . $min_regexp . '|' . $max_regexp . '|' # optional minimum, optional maximum + . $ignore_strings_after_percent . '|\s|\)|\]|\}|\*)*'; # strings that can be ignored + } + + return; +} + # Labels that we want to recognize in the ingredients # e.g. "fraises issues de l'agriculture biologique" @@ -555,8 +637,6 @@ my %labels_regexps = (); # Needs to be called after Tags.pm has loaded taxonomies -=head1 FUNCTIONS - =head2 init_labels_regexps () - initialize regular expressions needed for ingredients parsing This function creates regular expressions that match all variations of labels @@ -728,47 +808,6 @@ sub extract_ingredients_from_image ($product_ref, $id, $ocr_engine, $results_ref return; } -# prepared with -my %prepared_with = ( - en => "(?:made|prepared|produced) with", - da => "fremstillet af", - es => "elabora con", - fr => "(?:(?:é|e)labor(?:é|e)|fabriqu(?:é|e)|pr(?:é|e)par(?:é|e)|produit)(?:e)?(?:s)? (?:avec|à partir)", - hr => "(?:proizvedeno od|sadrži)", - nl => "bereid met", - sv => "är", -); - -my %min_regexp = ( - en => "min|min\.|minimum", - ca => "min|min\.|mín|mín\.|mínim|minim", - es => "min|min\.|mín|mín\.|mínimo|minimo|minimum", - fr => "min|min\.|mini|minimum", - hr => "min|min\.|mini|minimum", - pl => "min|min\.|minimum", -); - -my %max_regexp = ( - en => "max|max\.|maximum", - ca => "max|max\.|màxim", - es => "max|max\.|máximo", - fr => "max|max\.|maxi|maximum", - hr => "max|max\.|maxi|maximum", - pl => "max|max\.|maximum", -); - -# Words that can be ignored after a percent -# e.g. 
50% du poids total, 30% of the total weight -# groups need to be non-capturing: prefixed with (?: - -my %ignore_strings_after_percent = ( - en => "of (?:the )?(?:total weight|grain is wholegrain rye)", - es => "(?:en el chocolate(?: con leche)?)", - fi => "jauhojen määrästä", - fr => "(?:dans le chocolat(?: (?:blanc|noir|au lait))?)|(?:du poids total|du poids)", - sv => "fetthalt", -); - =head2 has_specific_ingredient_property ( product_ref, searched_ingredient_id, property ) Check if the specific ingredients structure (extracted from the end of the ingredients list and product labels) @@ -1891,28 +1930,12 @@ sub parse_ingredients_text_service ($product_ref, $updated_product_fields_ref, $ my $and = $and{$ingredients_lc} || " and "; - my $prepared_with = $prepared_with{$ingredients_lc} || '', - - my $min_regexp = $min_regexp{$ingredients_lc} || ''; - - my $max_regexp = $max_regexp{$ingredients_lc} || ''; - - my $ignore_strings_after_percent = $ignore_strings_after_percent{$ingredients_lc} || ''; - - # Regular expression to find percent or quantities - # $percent_or_quantity_regexp has 2 capturing group: one for the number, and one for the % sign or the unit - my $percent_or_quantity_regexp = '(?:' . "(?:$prepared_with )" . ' )?' # optional produced with - . '(?:>|' . $max_regexp . '|<|' . $min_regexp . '|\s|\.|:)*' # optional maximum, minimum, and separators - . '(?:\d+(?:[,.]\d+)?\s*-\s*?)?' # number+hyphens, first part (10-) of "10-12%" - . '(\d+(?:(?:\,|\.)\d+)?)\s*' # number, possibly with a dot or comma - . '(\%|g|gr|mg|kg|ml|cl|dl|l)\s*' # % or unit - . '(?:' . $min_regexp . '|' . $max_regexp . '|' # optional minimum, optional maximum - . $ignore_strings_after_percent . 
'|\s|\)|\]|\}|\*)*'; # strings that can be ignored - my $per = $per{$ingredients_lc} || ' per '; my $of_finished_product = $of_finished_product{$ingredients_lc} || ''; my $per_100g_regexp = "(${per}|\/)${one_hundred_grams_or_ml}(?:$of_finished_product)?"; + my $percent_or_quantity_regexp = $percent_or_quantity_regexps{$ingredients_lc}; + # Extract phrases related to specific ingredients at the end of the ingredients list $text = parse_specific_ingredients_from_text($product_ref, $text, $percent_or_quantity_regexp, $per_100g_regexp); @@ -4548,7 +4571,20 @@ sub normalize_fr_a_de_b ($a, $b) { } } -=head2 normalize_a_of_b ( $lc, $a, $b, $of_bool, $alternate_names_ref ) +# This function removes labels like "organic" from ingredients, so that we can check if they exist +# with canonicalize_taxonomy_tag. The labels can be parsed out when doing ingredients analysis. + +sub remove_parsable_labels ($ingredients_lc, $ingredient) { + if ($ingredients_lc eq "en") { + $ingredient =~ s/(?:organic |fair trade )*//ig; + } + elsif ($ingredients_lc eq "fr") { + $ingredient =~ s/(?: bio| biologique| équitable|s|\s|' . $symbols_regexp . ')//ig; + } + return $ingredient; +} + +=head2 normalize_a_of_b ( $lc, $a, $b, $of_bool, $alternate_names_ref = undef ) This function is called by normalize_enumeration() @@ -4590,18 +4626,18 @@ string, comma-joined category and type, example: 'palm vegetal oil' or 'sunflowe =cut -sub normalize_a_of_b ($lc, $a, $b, $of_bool, $alternate_names_ref = undef) { +sub normalize_a_of_b ($ingredients_lc, $a, $b, $of_bool, $alternate_names_ref = undef) { $a =~ s/\s+$//; $b =~ s/^\s+//; my $a_of_b; - if (($lc eq "en") or ($lc eq "hr")) { + if (($ingredients_lc eq "en") or ($ingredients_lc eq "hr")) { # start by "with" (example: "mlijeko (s 1.0% mliječne masti)"), in which case it $b should be added after $a # start by "with etc." 
should be added at the end of the previous ingredient my %with = (hr => '(s | sa )',); - my $with = $with{$lc} || " will not match "; + my $with = $with{$ingredients_lc} || " will not match "; if ($b =~ /^$with/i) { $a_of_b = $a . " " . $b; } @@ -4609,10 +4645,10 @@ sub normalize_a_of_b ($lc, $a, $b, $of_bool, $alternate_names_ref = undef) { $a_of_b = $b . " " . $a; } } - elsif ($lc eq "es") { + elsif ($ingredients_lc eq "es") { $a_of_b = $a . " de " . $b; } - elsif ($lc eq "fr") { + elsif ($ingredients_lc eq "fr") { $b =~ s/^(de |d')//; if (($b =~ /^(a|e|i|o|u|y|h)/i) && ($of_bool == 1)) { @@ -4625,11 +4661,11 @@ sub normalize_a_of_b ($lc, $a, $b, $of_bool, $alternate_names_ref = undef) { $a_of_b = $a . " " . $b; } } - elsif (($lc eq "de") or ($lc eq "ru") or ($lc eq "pl")) { + elsif (($ingredients_lc eq "de") or ($ingredients_lc eq "ru") or ($ingredients_lc eq "pl")) { $a_of_b = $a . " " . $b; } else { - die("unsupported language in normalize_a_of_b: $lc, $a, $b"); + die("unsupported language in normalize_a_of_b: $ingredients_lc, $a, $b"); } # If we have alternate categories, check if $a_of_b is an existing taxonomy entry, @@ -4638,7 +4674,11 @@ sub normalize_a_of_b ($lc, $a, $b, $of_bool, $alternate_names_ref = undef) { if (defined $alternate_names_ref) { my $name_exists; - canonicalize_taxonomy_tag($lc, "ingredients", $a_of_b, \$name_exists); + # remove labels like "organic", "fairtrade": they can be parsed out when doing ingredients analysis + # TODO: use the labels regexps instead + my $a_of_b_copy = remove_parsable_labels($ingredients_lc, $a_of_b); + canonicalize_taxonomy_tag($ingredients_lc, "ingredients", $a_of_b_copy, \$name_exists); + print STDERR "a: $a - b: $b - $a_of_b: $a_of_b - a_of_b_copy: $a_of_b_copy: - $name_exists\n"; if (not $name_exists) { foreach my $alternate_name (@{$alternate_names_ref}) { @@ -4646,7 +4686,10 @@ sub normalize_a_of_b ($lc, $a, $b, $of_bool, $alternate_names_ref = undef) { = $alternate_name; # make a copy so that we can 
modify it without changing the array entry $alternate_name_copy =~ s//$b/; my $alternate_name_exists; - canonicalize_taxonomy_tag($lc, "ingredients", $alternate_name_copy, \$alternate_name_exists); + canonicalize_taxonomy_tag($ingredients_lc, "ingredients", $alternate_name_copy, + \$alternate_name_exists); + print STDERR + "alternate_name: $alternate_name - alternate_name_copy: $alternate_name_copy: - $alternate_name_exists\n"; if ($alternate_name_exists) { $a_of_b = $alternate_name_copy; last; @@ -4658,7 +4701,7 @@ sub normalize_a_of_b ($lc, $a, $b, $of_bool, $alternate_names_ref = undef) { return $a_of_b; } -=head2 normalize_enumeration ($lc, $category, $types, $of_bool, $alternate_names_ref = undef) +=head2 normalize_enumeration ($ingredients_lc, $category, $types, $of_bool, $alternate_names_ref = undef, $do_not_output_parent = undef) This function is called by develop_ingredients_categories_and_types() @@ -4668,7 +4711,7 @@ Some ingredients are specified by an ingredient "category" (e.g. 
"oil") and a "t This function combines the category to all elements of the types string $category = "Vegetal oil" and $types = "palm, sunflower and olive" will return -"palm vegetal oil, sunflower vegetal oil, olive vegetal oil" +"vegetal oil (palm vegetal oil, sunflower vegetal oil, olive vegetal oil)" =head3 Arguments @@ -4678,21 +4721,41 @@ language abbreviation (en for English, for example) =head4 category -string, as defined in %ingredients_categories_and_types, example: 'Vegetal oil' for 'Vegetal oil (sunflower, olive and palm)' +string, as matched from definition in %ingredients_categories_and_types, example: 'Vegetal oil' for 'Vegetal oil (sunflower, olive and palm)' =head4 types -string, as defined in %ingredients_categories_and_types, example: 'sunflower, olive and palm' for 'Vegetal oil (sunflower, olive and palm)' +string, as matched from definition in %ingredients_categories_and_types, example: 'sunflower, olive and palm' for 'Vegetal oil (sunflower, olive and palm)' + +=head4 $of_bool - indicate if we want to construct entries like " of " + +e.g. in French we combine "huile" and "olive" to "huile d'olive" +but we combine "poivron" and "rouge" to "poivron rouge". + +=head4 $alternate_names_ref + +Reference to an array of alternate names for the category + +=head4 $do_not_output_parent - indicate if we want to output the parent ingredient + +e.g. 
for "carbonates d'ammonium et de sodium", we want only "carbonates d'ammonium, carbonates de sodium" +and not "carbonates (carbonates d'ammonium, carbonates de sodium)" as "carbonates" is another additive =head3 Return value =head4 Transformed ingredients list text -string, comma-joined category with all elements of the types, example: 'sunflower vegetal oil, olive vegetal oil, palm vegetal oil' +string, with the type + a list of comma-joined category with all elements of the types +example: 'vegetal oils (sunflower vegetal oil, olive vegetal oil, palm vegetal oil)' =cut -sub normalize_enumeration ($lc, $category, $types, $of_bool, $alternate_names_ref = undef) { +sub normalize_enumeration ( + $ingredients_lc, $category, $types, $of_bool, + $alternate_names_ref = undef, + $do_not_output_parent = undef + ) +{ $log->debug("normalize_enumeration", {category => $category, types => $types}) if $log->is_debug(); # If there is a trailing space, save it and output it @@ -4702,12 +4765,27 @@ sub normalize_enumeration ($lc, $category, $types, $of_bool, $alternate_names_re } # do not match anything if we don't have a translation for "and" - my $and = $and{$lc} || " will not match "; + my $and = $and{$ingredients_lc} || " will not match "; my @list = split(/$obrackets|$cbrackets|\/| \/ | $dashes |$commas |$commas|$and/i, $types); - return - join(", ", map {normalize_a_of_b($lc, $category, $_, $of_bool, $alternate_names_ref)} @list) . $trailing_space; + # If we have a percent or quantity, we output it only for the parent + my $category_without_percent_or_quantity = $category; + my $percent_or_quantity_regexp = $percent_or_quantity_regexps{$ingredients_lc}; + $category_without_percent_or_quantity =~ s/$percent_or_quantity_regexp//ig; + + my $list = join( + ", ", + map { + normalize_a_of_b($ingredients_lc, $category_without_percent_or_quantity, $_, $of_bool, $alternate_names_ref) + } @list + ); + + unless ($do_not_output_parent) { + $list = $category . " (" . $list . 
")"; + } + + return $list . $trailing_space; } # iodure et hydroxide de potassium @@ -4716,12 +4794,12 @@ sub normalize_fr_a_et_b_de_c ($a, $b, $c) { return normalize_fr_a_de_b($a, $c) . ", " . normalize_fr_a_de_b($b, $c); } -sub normalize_additives_enumeration ($lc, $enumeration) { +sub normalize_additives_enumeration ($ingredients_lc, $enumeration) { $log->debug("normalize_additives_enumeration", {enumeration => $enumeration}) if $log->is_debug(); # do not match anything if we don't have a translation for "and" - my $and = $and{$lc} || " will not match "; + my $and = $and{$ingredients_lc} || " will not match "; my @list = split(/$obrackets|$cbrackets|\/| \/ | $dashes |$commas |$commas|$and/i, $enumeration); @@ -5802,16 +5880,8 @@ my %ingredients_categories_and_types = ( # huiles { categories => [ - "huile", - "huile végétale", - "huiles végétales", - "matière grasse", - "matières grasses", - "matière grasse végétale", - "matières grasses végétales", - "graisse", - "graisse végétale", - "graisses végétales", + # allow multiple types of oils in the category (e.g. "huiles et graisses"), with modifiers (e.g. "végétale") + '(?:(?: et )?(?:huile|graisse|stéarine|matière\s? 
grasse)s?)+(?: (?:végétale|(?:partiellement |totalement |non(?:-| |))hydrogénée?)s?)*', ], types => [ "arachide", "avocat", "carthame", "chanvre", @@ -5821,7 +5891,14 @@ my %ingredients_categories_and_types = ( "olive vierge", "olive extra vierge", "olive vierge extra", "palme", "palmiste", "pépins de raisin", "sal", "sésame", "soja", "tournesol", "tournesol oléique", - ] + ], + alternate_names => [ + "huile de ", + "huile d'", + "matière grasse de ", + "graisse de ", + "stéarine de " + ], }, # (natural) extract { @@ -5887,7 +5964,10 @@ my %ingredients_categories_and_types = ( types => [ "aluminium", "ammonium", "calcium", "cuivre", "fer", "magnésium", "manganèse", "potassium", "sodium", "zinc", - ] + ], + # avoid turning "carbonates d'ammonium et de sodium" into "carbonates (carbonates d'ammonium, carbonates de sodium)" + # as "carbonates" is an additive + do_not_output_parent => 1, }, # peppers {categories => ["piment", "poivron"], types => ["vert", "jaune", "rouge",], of_bool => 0,}, @@ -5922,7 +6002,7 @@ my %ingredients_categories_and_types = ( "voćni", ] }, - # falvouring + # flavouring { categories => ["prirodna aroma", "prirodne arome",], types => ["citrusa sa ostalim prirodnim aromama", "limuna", "mente", "mente s drugim prirodnim aromama",] @@ -6023,7 +6103,7 @@ my %ingredients_categories_and_types = ( ru => [ # oils { - categories => ["масло", "масло растительное",], + categories => ['масло(?: растительное)?',], types => [ "Подсолнечное", "Пальмовое", "Рапсовое", "Кокосовое", "горчицы", "Соевое", "Пальмоядровое", "Оливковое", "пальм", @@ -6033,16 +6113,16 @@ my %ingredients_categories_and_types = ( ); -# Symbols to indicate labels like organic, fairtrade etc. 
-my @symbols = ('\*\*\*', '\*\*', '\*', '°°°', '°°', '°', '\(1\)', '\(2\)', '¹', '²'); -my $symbols_regexp = join('|', @symbols); - sub develop_ingredients_categories_and_types ($ingredients_lc, $text) { $log->debug("develop_ingredients_categories_and_types", {ingredients_lc => $ingredients_lc, text => $text}) if $log->is_debug(); if (defined $ingredients_categories_and_types{$ingredients_lc}) { + my $percent_or_quantity_regexp = $percent_or_quantity_regexps{$ingredients_lc}; + # Make the 2 capture groups (for number and for % or unit, starting with (\d and (\%) non capturing + $percent_or_quantity_regexp =~ s/\(\\/\(?:\\/g; + foreach my $categories_and_types_ref (@{$ingredients_categories_and_types{$ingredients_lc}}) { my $category_regexp = ""; foreach my $category (@{$categories_and_types_ref->{categories}}) { @@ -6051,7 +6131,6 @@ sub develop_ingredients_categories_and_types ($ingredients_lc, $text) { if ($unaccented_category ne $category) { $category_regexp .= '|' . $unaccented_category . '|' . $unaccented_category . 's'; } - } $category_regexp =~ s/^\|//; @@ -6066,6 +6145,9 @@ sub develop_ingredients_categories_and_types ($ingredients_lc, $text) { $category_regexp = '(?:' . $category_regexp . ')(?:' . $symbols_regexp . ')*'; } + # Also match % after the category (e.g. "vegetal oil 45% (palm, rapeseed)") + $category_regexp .= '\s*(?:' . $percent_or_quantity_regexp . ')?'; + my $type_regexp = ""; foreach my $type (@{$categories_and_types_ref->{types}}) { $type_regexp .= '|' . $type . '|' . $type . 
's'; @@ -6076,6 +6158,8 @@ sub develop_ingredients_categories_and_types ($ingredients_lc, $text) { } $type_regexp =~ s/^\|//; + #$log->debug("develop_ingredients_categories_and_types", { category_regexp => $category_regexp, type_regexp => $type_regexp}) if $log->is_debug(); + my $of_bool = 1; if (defined $categories_and_types_ref->{of_bool}) { $of_bool = $categories_and_types_ref->{of_bool}; @@ -6109,19 +6193,20 @@ sub develop_ingredients_categories_and_types ($ingredients_lc, $text) { or ($ingredients_lc eq "pl")) { # vegetable oil (palm, sunflower and olive) -> palm vegetable oil, sunflower vegetable oil, olive vegetable oil + # Note: not using the /x modifier to put spaces in the regexp, as it doesn't work if the interpolated variables contain spaces themselves... $text - =~ s/($category_regexp)(?::|\(|\[| | $of )+((($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, |$and|$of|$and_of|$and_or)+)+($type_regexp)($symbols_regexp|\s)*)\b(\s?(\)|\]))?/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names})/ieg; + =~ s/($category_regexp)(?::|\(|\[| | $of )+((($type_regexp)($symbols_regexp|\s)*(\s|\/|\s\/\s|\s-\s|,|,\s|$and|$of|$and_of|$and_or)+)+($type_regexp)($symbols_regexp|\s)*)\b(\s?(\)|\]))?/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names},$categories_and_types_ref->{do_not_output_parent})/ieg; # vegetable oil (palm) -> palm vegetable oil $text - =~ s/($category_regexp)\s?(?:\(|\[)\s?($type_regexp)\b(\s?(\)|\]))/normalize_enumeration($ingredients_lc,$1,$2,$of_bool,$categories_and_types_ref->{alternate_names})/ieg; + =~ s/($category_regexp)\s?(?:\(|\[)\s?($type_regexp)\b(\s?(\)|\]))/normalize_enumeration($ingredients_lc,$1,$2,$of_bool,$categories_and_types_ref->{alternate_names},$categories_and_types_ref->{do_not_output_parent})/ieg; # vegetable oil: palm $text - =~ 
s/($category_regexp)\s?(?::)\s?($type_regexp)(?=$separators|.|$)/normalize_enumeration($ingredients_lc,$1,$2,$of_bool,$categories_and_types_ref->{alternate_names})/ieg; + =~ s/($category_regexp)\s?(?::)\s?($type_regexp)(?=$separators|.|$)/normalize_enumeration($ingredients_lc,$1,$2,$of_bool,$categories_and_types_ref->{alternate_names},$categories_and_types_ref->{do_not_output_parent})/ieg; # ječmeni i pšenični slad (barley and wheat malt) -> ječmeni slad, pšenični slad $text - =~ s/((?:(?:$type_regexp)(?: |\/| \/ | - |,|, |$and|$of|$and_of|$and_or)+)+(?:$type_regexp))\s*($category_regexp)/normalize_enumeration($ingredients_lc,$2,$1,$of_bool,$categories_and_types_ref->{alternate_names})/ieg; + =~ s/((?:(?:$type_regexp)(?: |\/| \/ | - |,|, |$and|$of|$and_of|$and_or)+)+(?:$type_regexp))\s*($category_regexp)/normalize_enumeration($ingredients_lc,$2,$1,$of_bool,$categories_and_types_ref->{alternate_names},$categories_and_types_ref->{do_not_output_parent})/ieg; } elsif ($ingredients_lc eq "fr") { # arôme naturel de pomme avec d'autres âromes @@ -6140,20 +6225,20 @@ sub develop_ingredients_categories_and_types ($ingredients_lc, $text) { # require a " et " and/or " de " at the end of the enumeration # $text - =~ s/($category_regexp)(?::| | de | d')+((($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, | et | de | et de | et d'| d')+)*($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, )*( et | de | et de | et d'| d'| d'autres | et d'autres )( |\/| \/ | - |,|, )*($type_regexp)($symbols_regexp|\s)*)\b/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names})/ieg; + =~ s/($category_regexp)(?::| | de | d')+((($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, | et | de | et de | et d'| d')+)*($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, )*( et | de | et de | et d'| d'| d'autres | et d'autres )( |\/| \/ | - |,|, )*($type_regexp)($symbols_regexp|\s)*)\b/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, 
$categories_and_types_ref->{alternate_names},$categories_and_types_ref->{do_not_output_parent})/ieg; # Huiles végétales (palme, colza et tournesol) $text - =~ s/($category_regexp)(?:\(|\[)(?:de |d')?((($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, | et | de | et de | et d'| d')+)+($type_regexp)($symbols_regexp|\s)*)\b(\s?(\)|\]))/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names})/ieg; + =~ s/($category_regexp)(?:\(|\[)(?:de |d')?((($type_regexp)($symbols_regexp|\s)*( |\/| \/ | - |,|, | et | de | et de | et d'| d')+)+($type_regexp)($symbols_regexp|\s)*)\b(\s?(\)|\]))/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names},$categories_and_types_ref->{do_not_output_parent})/ieg; $text =~ s/fer_élémentaire/fer élémentaire/ig; # huile végétale (colza) $text - =~ s/($category_regexp)\s?(?:\(|\[)\s?($type_regexp)\b(\s?(\)|\]))/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names})/ieg; + =~ s/($category_regexp)\s?(?:\(|\[)\s?($type_regexp)\b(\s?(\)|\]))/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names}, $categories_and_types_ref->{do_not_output_parent})/ieg; # huile végétale : colza, $text - =~ s/($category_regexp)\s?(?::)\s?($type_regexp)(?=$separators|.|$)/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names})/ieg; + =~ s/($category_regexp)\s?(?::)\s?($type_regexp)(?=$separators|.|$)/normalize_enumeration($ingredients_lc,$1,$2,$of_bool, $categories_and_types_ref->{alternate_names}, $categories_and_types_ref->{do_not_output_parent})/ieg; } } @@ -6261,6 +6346,8 @@ sub preparse_ingredients_text ($ingredients_lc, $text) { init_origins_regexps(); } + init_percent_or_quantity_regexps($ingredients_lc); + my $and = $and{$ingredients_lc} || " and "; my $and_without_spaces = $and; $and_without_spaces =~ s/^ //; diff --git a/stop_words.txt 
b/stop_words.txt index 8b7e67bb3b9cb..1fa509739073c 100644 --- a/stop_words.txt +++ b/stop_words.txt @@ -11,6 +11,8 @@ AgriBalyse AGS Alimentarius Allergènes +ammonium +d'ammonium Anses ANSES api diff --git a/tests/unit/expected_test_results/ingredients/en-category-types.json b/tests/unit/expected_test_results/ingredients/en-category-types.json index 51183491e9574..1dc7d27c5406e 100644 --- a/tests/unit/expected_test_results/ingredients/en-category-types.json +++ b/tests/unit/expected_test_results/ingredients/en-category-types.json @@ -1,51 +1,64 @@ { "ingredients" : [ { - "ciqual_food_code" : "17130", - "ecobalyse_code" : "rapeseed-oil", - "from_palm_oil" : "no", - "id" : "en:rapeseed-oil", + "from_palm_oil" : "maybe", + "id" : "en:vegetable-oil-and-fat", + "ingredients" : [ + { + "ciqual_food_code" : "17130", + "ecobalyse_code" : "rapeseed-oil", + "from_palm_oil" : "no", + "id" : "en:rapeseed-oil", + "is_in_taxonomy" : 1, + "percent_estimate" : 62.5, + "percent_max" : 100, + "percent_min" : 25, + "text" : "Rapsöl", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "ciqual_proxy_food_code" : "16129", + "ecobalyse_code" : "refined-palm-oil", + "from_palm_oil" : "yes", + "id" : "en:palm-fat", + "is_in_taxonomy" : 1, + "percent_estimate" : 18.75, + "percent_max" : 50, + "percent_min" : 0, + "text" : "Palmfett", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "ciqual_food_code" : "16110", + "from_palm_oil" : "no", + "id" : "en:shea-butter", + "is_in_taxonomy" : 1, + "percent_estimate" : 9.375, + "percent_max" : 33.3333333333333, + "percent_min" : 0, + "text" : "Sheafett", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "from_palm_oil" : "no", + "id" : "en:sunflower-fat", + "is_in_taxonomy" : 1, + "percent_estimate" : 9.375, + "percent_max" : 25, + "percent_min" : 0, + "text" : "Sonnenblumenfett", + "vegan" : "yes", + "vegetarian" : "yes" + } + ], "is_in_taxonomy" : 1, - "percent_estimate" : 62.5, + "percent_estimate" : 100, "percent_max" : 100, - 
"percent_min" : 25, - "text" : "Rapsöl", - "vegan" : "yes", - "vegetarian" : "yes" - }, - { - "ciqual_proxy_food_code" : "16129", - "ecobalyse_code" : "refined-palm-oil", - "from_palm_oil" : "yes", - "id" : "en:palm-fat", - "is_in_taxonomy" : 1, - "percent_estimate" : 18.75, - "percent_max" : 50, - "percent_min" : 0, - "text" : "Palmfett", - "vegan" : "yes", - "vegetarian" : "yes" - }, - { - "ciqual_food_code" : "16110", - "from_palm_oil" : "no", - "id" : "en:shea-butter", - "is_in_taxonomy" : 1, - "percent_estimate" : 9.375, - "percent_max" : 33.3333333333333, - "percent_min" : 0, - "text" : "Sheafett", - "vegan" : "yes", - "vegetarian" : "yes" - }, - { - "from_palm_oil" : "no", - "id" : "en:sunflower-fat", - "is_in_taxonomy" : 1, - "percent_estimate" : 9.375, - "percent_max" : 25, - "percent_min" : 0, - "text" : "Sonnenblumenfett", + "percent_min" : 100, + "text" : "pflanzliche Öle und Fette", "vegan" : "yes", "vegetarian" : "yes" } @@ -61,9 +74,9 @@ "en:vegetarian" ], "ingredients_hierarchy" : [ - "en:rapeseed-oil", - "en:oil-and-fat", "en:vegetable-oil-and-fat", + "en:oil-and-fat", + "en:rapeseed-oil", "en:palm-fat", "en:palm-oil-and-fat", "en:shea-butter", @@ -71,12 +84,13 @@ "en:sunflower-fat" ], "ingredients_lc" : "de", - "ingredients_n" : 4, + "ingredients_n" : 5, "ingredients_n_tags" : [ - "4", + "5", "1-10" ], "ingredients_original_tags" : [ + "en:vegetable-oil-and-fat", "en:rapeseed-oil", "en:palm-fat", "en:shea-butter", @@ -84,9 +98,9 @@ ], "ingredients_percent_analysis" : 1, "ingredients_tags" : [ - "en:rapeseed-oil", - "en:oil-and-fat", "en:vegetable-oil-and-fat", + "en:oil-and-fat", + "en:rapeseed-oil", "en:palm-fat", "en:palm-oil-and-fat", "en:shea-butter", @@ -99,14 +113,16 @@ "ingredients_with_unspecified_percent_n" : 4, "ingredients_with_unspecified_percent_sum" : 100, "ingredients_without_ciqual_codes" : [ - "en:sunflower-fat" + "en:sunflower-fat", + "en:vegetable-oil-and-fat" ], - "ingredients_without_ciqual_codes_n" : 1, + 
"ingredients_without_ciqual_codes_n" : 2, "ingredients_without_ecobalyse_ids" : [ "en:shea-butter", - "en:sunflower-fat" + "en:sunflower-fat", + "en:vegetable-oil-and-fat" ], - "ingredients_without_ecobalyse_ids_n" : 2, + "ingredients_without_ecobalyse_ids_n" : 3, "known_ingredients_n" : 8, "lc" : "de", "nutriments" : { diff --git a/tests/unit/expected_test_results/ingredients/fr-infinite-loop-allergens.json b/tests/unit/expected_test_results/ingredients/fr-infinite-loop-allergens.json index c366f98a9af43..127746f930e89 100644 --- a/tests/unit/expected_test_results/ingredients/fr-infinite-loop-allergens.json +++ b/tests/unit/expected_test_results/ingredients/fr-infinite-loop-allergens.json @@ -23,35 +23,46 @@ "vegetarian" : "yes" }, { - "ciqual_proxy_food_code" : "16129", - "ecobalyse_code" : "refined-palm-oil", - "from_palm_oil" : "yes", - "id" : "en:palm-fat", + "from_palm_oil" : "maybe", + "id" : "en:vegetable-fat", + "ingredients" : [ + { + "ciqual_food_code" : "16129", + "ecobalyse_code" : "refined-palm-oil", + "from_palm_oil" : "yes", + "id" : "en:palm-oil", + "is_in_taxonomy" : 1, + "percent_estimate" : 6.25, + "text" : "huile de palme", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "from_palm_oil" : "yes", + "id" : "en:palm-kernel-oil", + "is_in_taxonomy" : 1, + "percent_estimate" : 6.25, + "text" : "huile de palmiste", + "vegan" : "yes", + "vegetarian" : "yes" + } + ], "is_in_taxonomy" : 1, "percent_estimate" : 12.5, - "text" : "graisse végétale de palme", - "vegan" : "yes", - "vegetarian" : "yes" - }, - { - "from_palm_oil" : "yes", - "id" : "en:palm-kernel-fat", - "is_in_taxonomy" : 1, - "percent_estimate" : 6.25, - "text" : "graisse végétale de palmiste", + "text" : "graisse végétale", "vegan" : "yes", "vegetarian" : "yes" }, { "id" : "fr:beurre-de-cacao1", "is_in_taxonomy" : 0, - "percent_estimate" : 3.125, + "percent_estimate" : 6.25, "text" : "beurre de cacao1" }, { "id" : "fr:pate-de-cacao1", "is_in_taxonomy" : 0, - "percent_estimate" : 
1.5625, + "percent_estimate" : 3.125, "text" : "pâte de cacao1" }, { @@ -60,7 +71,7 @@ "id" : "en:skimmed-milk-powder", "is_in_taxonomy" : 1, "percent" : 3, - "percent_estimate" : 1.5625, + "percent_estimate" : 3, "text" : "LAIT* écrémé en poudre", "vegan" : "no", "vegetarian" : "yes" @@ -71,7 +82,7 @@ "from_palm_oil" : "no", "id" : "en:sunflower-oil", "is_in_taxonomy" : 1, - "percent_estimate" : 0, + "percent_estimate" : 0.0625, "text" : "huile de tournesol", "vegan" : "yes", "vegetarian" : "yes" @@ -82,20 +93,20 @@ { "id" : "en:e322", "is_in_taxonomy" : 1, - "percent_estimate" : 0, + "percent_estimate" : 0.03125, "text" : "lécithines", "vegan" : "maybe", "vegetarian" : "maybe" } ], "is_in_taxonomy" : 1, - "percent_estimate" : 0, + "percent_estimate" : 0.03125, "text" : "émulsifiant" }, { "id" : "en:vanilla-flavouring", "is_in_taxonomy" : 1, - "percent_estimate" : 0, + "percent_estimate" : 0.015625, "text" : "arômes de vanille", "vegan" : "maybe", "vegetarian" : "maybe" @@ -105,7 +116,7 @@ "id" : "en:cocoa", "is_in_taxonomy" : 1, "percent" : 30, - "percent_estimate" : 0, + "percent_estimate" : 0.015625, "text" : "Cacao", "vegan" : "yes", "vegetarian" : "yes" @@ -117,8 +128,8 @@ "en:skimmed-milk-powder" ], "en:palm-oil" : [ - "en:palm-fat", - "en:palm-kernel-fat" + "en:palm-oil", + "en:palm-kernel-oil" ], "en:vegan-status-unknown" : [ "fr:beurre-de-cacao1", @@ -141,12 +152,9 @@ "en:whole-milk-powder", "en:dairy", "en:milk-powder", - "en:palm-fat", + "en:vegetable-fat", "en:oil-and-fat", "en:vegetable-oil-and-fat", - "en:palm-oil-and-fat", - "en:palm-kernel-fat", - "en:palm-kernel-oil-and-fat", "fr:beurre-de-cacao1", "fr:pate-de-cacao1", "en:skimmed-milk-powder", @@ -157,19 +165,22 @@ "en:flavouring", "en:cocoa", "en:plant", + "en:palm-oil", + "en:palm-oil-and-fat", + "en:palm-kernel-oil", + "en:palm-kernel-oil-and-fat", "en:e322" ], "ingredients_lc" : "fr", - "ingredients_n" : 12, + "ingredients_n" : 13, "ingredients_n_tags" : [ - "12", + "13", "11-20" ], 
"ingredients_original_tags" : [ "en:sugar", "en:whole-milk-powder", - "en:palm-fat", - "en:palm-kernel-fat", + "en:vegetable-fat", "fr:beurre-de-cacao1", "fr:pate-de-cacao1", "en:skimmed-milk-powder", @@ -177,6 +188,8 @@ "en:emulsifier", "en:vanilla-flavouring", "en:cocoa", + "en:palm-oil", + "en:palm-kernel-oil", "en:e322" ], "ingredients_percent_analysis" : -1, @@ -187,12 +200,9 @@ "en:whole-milk-powder", "en:dairy", "en:milk-powder", - "en:palm-fat", + "en:vegetable-fat", "en:oil-and-fat", "en:vegetable-oil-and-fat", - "en:palm-oil-and-fat", - "en:palm-kernel-fat", - "en:palm-kernel-oil-and-fat", "fr:beurre-de-cacao1", "fr:pate-de-cacao1", "en:skimmed-milk-powder", @@ -203,33 +213,39 @@ "en:flavouring", "en:cocoa", "en:plant", + "en:palm-oil", + "en:palm-oil-and-fat", + "en:palm-kernel-oil", + "en:palm-kernel-oil-and-fat", "en:e322" ], "ingredients_text" : "Sucre, LAIT* entier en poudre 25%, graisse végétale (palme, palmiste), beurre de cacao1, pâte de cacao1, LAIT* écrémé en poudre 3%, huile de tournesol, émulsifiant: lécithines, arômes de vanille. Traces éventuelles de fruits à coque et de céréales contenant du gluten. Cacao: 30% minimum dans le chocolat au lait. 
*Lait: origine UE et/ou non UE (Royaume-Uni)", "ingredients_with_specified_percent_n" : 3, "ingredients_with_specified_percent_sum" : 58, "ingredients_with_unspecified_percent_n" : 8, - "ingredients_with_unspecified_percent_sum" : 73.4375, + "ingredients_with_unspecified_percent_sum" : 71.984375, "ingredients_without_ciqual_codes" : [ "en:e322", "en:emulsifier", - "en:palm-kernel-fat", + "en:palm-kernel-oil", "en:vanilla-flavouring", + "en:vegetable-fat", "fr:beurre-de-cacao1", "fr:pate-de-cacao1" ], - "ingredients_without_ciqual_codes_n" : 6, + "ingredients_without_ciqual_codes_n" : 7, "ingredients_without_ecobalyse_ids" : [ "en:cocoa", "en:e322", "en:emulsifier", - "en:palm-kernel-fat", + "en:palm-kernel-oil", "en:vanilla-flavouring", + "en:vegetable-fat", "fr:beurre-de-cacao1", "fr:pate-de-cacao1" ], - "ingredients_without_ecobalyse_ids_n" : 7, - "known_ingredients_n" : 21, + "ingredients_without_ecobalyse_ids_n" : 8, + "known_ingredients_n" : 22, "lc" : "fr", "misc_tags" : [ "en:some-ingredients-with-specified-percent" diff --git a/tests/unit/expected_test_results/ingredients/fr-marmelade.json b/tests/unit/expected_test_results/ingredients/fr-marmelade.json index 5047b84e68f33..ec23c9e717f0a 100644 --- a/tests/unit/expected_test_results/ingredients/fr-marmelade.json +++ b/tests/unit/expected_test_results/ingredients/fr-marmelade.json @@ -177,57 +177,68 @@ "vegetarian" : "yes" }, { - "from_palm_oil" : "no", - "id" : "en:illipe-oil", + "from_palm_oil" : "maybe", + "id" : "en:vegetable-fat", + "ingredients" : [ + { + "from_palm_oil" : "no", + "id" : "en:illipe-oil", + "is_in_taxonomy" : 1, + "percent_estimate" : 0.778125, + "text" : "huile de illipe", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "from_palm_oil" : "no", + "id" : "en:mango-kernel-oil", + "is_in_taxonomy" : 1, + "percent_estimate" : 0.3890625, + "text" : "huile de mangue", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "from_palm_oil" : "no", + "id" : "en:shorea-robusta-seed-oil", + 
"is_in_taxonomy" : 1, + "percent_estimate" : 0.19453125, + "text" : "huile de sal", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "ciqual_food_code" : "16110", + "from_palm_oil" : "no", + "id" : "en:shea-butter", + "is_in_taxonomy" : 1, + "percent_estimate" : 0.097265625, + "text" : "huile de karité", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "ciqual_food_code" : "16129", + "ecobalyse_code" : "refined-palm-oil", + "from_palm_oil" : "yes", + "id" : "en:palm-oil", + "is_in_taxonomy" : 1, + "percent_estimate" : 0.0972656249999999, + "text" : "huile de palme", + "vegan" : "yes", + "vegetarian" : "yes" + } + ], "is_in_taxonomy" : 1, "percent_estimate" : 1.55625, - "text" : "graisses végétales d'illipe", - "vegan" : "yes", - "vegetarian" : "yes" - }, - { - "from_palm_oil" : "no", - "id" : "en:mango-kernel-oil", - "is_in_taxonomy" : 1, - "percent_estimate" : 0.778124999999999, - "text" : "graisses végétales de mangue", - "vegan" : "yes", - "vegetarian" : "yes" - }, - { - "from_palm_oil" : "no", - "id" : "en:shorea-robusta-seed-oil", - "is_in_taxonomy" : 1, - "percent_estimate" : 0.3890625, - "text" : "graisses végétales de sal", - "vegan" : "yes", - "vegetarian" : "yes" - }, - { - "ciqual_food_code" : "16110", - "from_palm_oil" : "no", - "id" : "en:shea-butter", - "is_in_taxonomy" : 1, - "percent_estimate" : 0.194531250000001, - "text" : "graisses végétales de karité", - "vegan" : "yes", - "vegetarian" : "yes" - }, - { - "ciqual_proxy_food_code" : "16129", - "ecobalyse_code" : "refined-palm-oil", - "from_palm_oil" : "yes", - "id" : "en:palm-fat", - "is_in_taxonomy" : 1, - "percent_estimate" : 0.0972656250000004, - "text" : "graisses végétales de palme", + "text" : "graisses végétales", "vegan" : "yes", "vegetarian" : "yes" }, { "id" : "en:flavouring", "is_in_taxonomy" : 1, - "percent_estimate" : 0.0486328124999993, + "percent_estimate" : 0.778124999999999, "text" : "arôme", "vegan" : "maybe", "vegetarian" : "maybe" @@ -239,20 +250,20 @@ 
"ciqual_food_code" : "42200", "id" : "en:soya-lecithin", "is_in_taxonomy" : 1, - "percent_estimate" : 0.0243164062499996, + "percent_estimate" : 0.3890625, "text" : "lécithine de soja", "vegan" : "yes", "vegetarian" : "yes" } ], "is_in_taxonomy" : 1, - "percent_estimate" : 0.0243164062499996, + "percent_estimate" : 0.3890625, "text" : "émulsifiant" }, { "id" : "en:lactose-and-milk-proteins", "is_in_taxonomy" : 1, - "percent_estimate" : 0.0243164062500014, + "percent_estimate" : 0.389062500000001, "text" : "lactose et protéines de lait", "vegan" : "no", "vegetarian" : "yes" @@ -380,7 +391,7 @@ "en:egg" ], "en:palm-oil" : [ - "en:palm-fat" + "en:palm-oil" ], "en:vegan-status-unknown" : [ "en:sodium-citrate" @@ -439,14 +450,7 @@ "en:plant", "en:cocoa", "en:cocoa-butter", - "en:illipe-oil", "en:vegetable-fat", - "en:mango-kernel-oil", - "en:vegetable-oil", - "en:shorea-robusta-seed-oil", - "en:shea-butter", - "en:palm-fat", - "en:palm-oil-and-fat", "en:lactose-and-milk-proteins", "en:protein", "en:animal-protein", @@ -467,12 +471,19 @@ "en:sodium-citrate", "en:minerals", "en:sodium", - "en:e415" + "en:e415", + "en:illipe-oil", + "en:mango-kernel-oil", + "en:vegetable-oil", + "en:shorea-robusta-seed-oil", + "en:shea-butter", + "en:palm-oil", + "en:palm-oil-and-fat" ], "ingredients_lc" : "fr", - "ingredients_n" : 41, + "ingredients_n" : 42, "ingredients_n_tags" : [ - "41", + "42", "41-50" ], "ingredients_original_tags" : [ @@ -499,11 +510,7 @@ "en:sugar", "en:cocoa-paste", "en:cocoa-butter", - "en:illipe-oil", - "en:mango-kernel-oil", - "en:shorea-robusta-seed-oil", - "en:shea-butter", - "en:palm-fat", + "en:vegetable-fat", "en:flavouring", "en:emulsifier", "en:lactose-and-milk-proteins", @@ -516,6 +523,11 @@ "en:e333", "en:sodium-citrate", "en:e415", + "en:illipe-oil", + "en:mango-kernel-oil", + "en:shorea-robusta-seed-oil", + "en:shea-butter", + "en:palm-oil", "en:soya-lecithin" ], "ingredients_percent_analysis" : -1, @@ -564,14 +576,7 @@ "en:plant", "en:cocoa", 
"en:cocoa-butter", - "en:illipe-oil", "en:vegetable-fat", - "en:mango-kernel-oil", - "en:vegetable-oil", - "en:shorea-robusta-seed-oil", - "en:shea-butter", - "en:palm-fat", - "en:palm-oil-and-fat", "en:lactose-and-milk-proteins", "en:protein", "en:animal-protein", @@ -592,7 +597,14 @@ "en:sodium-citrate", "en:minerals", "en:sodium", - "en:e415" + "en:e415", + "en:illipe-oil", + "en:mango-kernel-oil", + "en:vegetable-oil", + "en:shorea-robusta-seed-oil", + "en:shea-butter", + "en:palm-oil", + "en:palm-oil-and-fat" ], "ingredients_text" : "Marmelade d'oranges 41% (sirop de glucose-fructose, sucre, pulpe d'orange 4.5%, jus d'orange concentré 1.4% (équivalent jus d'orange 7.8%), pulpe d'orange concentrée 0.6% (équivalent pulpe d'orange 2.6%), gélifiant (pectines), acidifiant (acide citrique), correcteurs d'acidité (citrate de calcium, citrate de sodium), arôme naturel d'orange, épaississant (gomme xanthane)), chocolat 24.9% (sucre, pâte de cacao, beurre de cacao, graisses végétales (illipe, mangue, sal, karité et palme en proportions variables), arôme, émulsifiant (lécithine de soja), lactose et protéines de lait), farine de blé, sucre, oeufs, sirop de glucose-fructose, huile de colza, poudre à lever (carbonate acide d'ammonium, diphosphate disodique, carbonate acide de sodium), sel, émulsifiant (lécithine de soja).", "ingredients_with_specified_percent_n" : 1, @@ -620,9 +632,10 @@ "en:raising-agent", "en:shorea-robusta-seed-oil", "en:sodium-citrate", - "en:thickener" + "en:thickener", + "en:vegetable-fat" ], - "ingredients_without_ciqual_codes_n" : 21, + "ingredients_without_ciqual_codes_n" : 22, "ingredients_without_ecobalyse_ids" : [ "en:acid", "en:acidity-regulator", @@ -651,9 +664,10 @@ "en:shorea-robusta-seed-oil", "en:sodium-citrate", "en:soya-lecithin", - "en:thickener" + "en:thickener", + "en:vegetable-fat" ], - "ingredients_without_ecobalyse_ids_n" : 28, + "ingredients_without_ecobalyse_ids_n" : 29, "known_ingredients_n" : 73, "lc" : "fr", "misc_tags" : [ 
diff --git a/tests/unit/expected_test_results/ingredients/fr-percents-origins-2.json b/tests/unit/expected_test_results/ingredients/fr-percents-origins-2.json index 6d872cdc35027..03f4a3c106b7f 100644 --- a/tests/unit/expected_test_results/ingredients/fr-percents-origins-2.json +++ b/tests/unit/expected_test_results/ingredients/fr-percents-origins-2.json @@ -4,12 +4,22 @@ "id" : "en:emulsifier", "ingredients" : [ { - "id" : "en:sunflower-lecithin", + "id" : "en:e322", + "ingredients" : [ + { + "id" : "en:sunflower-lecithin", + "is_in_taxonomy" : 1, + "percent_estimate" : 50, + "text" : "lécithines de tournesol", + "vegan" : "yes", + "vegetarian" : "yes" + } + ], "is_in_taxonomy" : 1, "percent_estimate" : 50, - "text" : "lécithines de tournesol", - "vegan" : "yes", - "vegetarian" : "yes" + "text" : "lécithines", + "vegan" : "maybe", + "vegetarian" : "maybe" } ], "is_in_taxonomy" : 1, @@ -89,14 +99,14 @@ "en:oil-and-fat", "en:fat", "en:milkfat", - "en:sunflower-lecithin", "en:e322", + "en:sunflower-lecithin", "en:e322i" ], "ingredients_lc" : "fr", - "ingredients_n" : 6, + "ingredients_n" : 7, "ingredients_n_tags" : [ - "6", + "7", "1-10" ], "ingredients_original_tags" : [ @@ -105,6 +115,7 @@ "en:wheat-flour", "en:sugar", "en:butterfat", + "en:e322", "en:sunflower-lecithin" ], "ingredients_percent_analysis" : -1, @@ -124,8 +135,8 @@ "en:oil-and-fat", "en:fat", "en:milkfat", - "en:sunflower-lecithin", "en:e322", + "en:sunflower-lecithin", "en:e322i" ], "ingredients_text" : "émulsifiant : lécithines (tournesol), arôme)(UE), farine de blé 33% (France), sucre, beurre concentré* 6,5% (France)", @@ -134,18 +145,20 @@ "ingredients_with_unspecified_percent_n" : 3, "ingredients_with_unspecified_percent_sum" : 75, "ingredients_without_ciqual_codes" : [ + "en:e322", "en:emulsifier", "en:flavouring", "en:sunflower-lecithin" ], - "ingredients_without_ciqual_codes_n" : 3, + "ingredients_without_ciqual_codes_n" : 4, "ingredients_without_ecobalyse_ids" : [ "en:butterfat", + 
"en:e322", "en:emulsifier", "en:flavouring", "en:sunflower-lecithin" ], - "ingredients_without_ecobalyse_ids_n" : 4, + "ingredients_without_ecobalyse_ids_n" : 5, "known_ingredients_n" : 18, "lc" : "fr", "misc_tags" : [ diff --git a/tests/unit/expected_test_results/ingredients/ru-russian-oil.json b/tests/unit/expected_test_results/ingredients/ru-russian-oil.json index 736d4c1a513e0..ea6d266fb95c7 100644 --- a/tests/unit/expected_test_results/ingredients/ru-russian-oil.json +++ b/tests/unit/expected_test_results/ingredients/ru-russian-oil.json @@ -1,41 +1,67 @@ { "ingredients" : [ { - "ciqual_food_code" : "17440", - "ecobalyse_code" : "sunflower-oil", - "from_palm_oil" : "no", - "id" : "en:sunflower-oil", + "from_palm_oil" : "maybe", + "id" : "en:vegetable-oil", + "ingredients" : [ + { + "ciqual_food_code" : "17440", + "ecobalyse_code" : "sunflower-oil", + "from_palm_oil" : "no", + "id" : "en:sunflower-oil", + "is_in_taxonomy" : 1, + "percent_estimate" : 50, + "percent_max" : 100, + "percent_min" : 25, + "text" : "масло растительное подсолнечное", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "ciqual_food_code" : "17420", + "from_palm_oil" : "no", + "id" : "en:soya-oil", + "is_in_taxonomy" : 1, + "percent_estimate" : 25, + "percent_max" : 50, + "percent_min" : 0, + "text" : "масло растительное соевое", + "vegan" : "yes", + "vegetarian" : "yes" + } + ], "is_in_taxonomy" : 1, - "percent_estimate" : 66.6666666666667, + "percent_estimate" : 75, "percent_max" : 100, - "percent_min" : 33.3333333333333, - "text" : "масло растительное подсолнечное", + "percent_min" : 50, + "text" : "масло растительное", "vegan" : "yes", "vegetarian" : "yes" }, { - "ciqual_food_code" : "17420", - "from_palm_oil" : "no", - "id" : "en:soya-oil", + "from_palm_oil" : "maybe", + "id" : "en:oil", + "ingredients" : [ + { + "ciqual_food_code" : "17420", + "from_palm_oil" : "no", + "id" : "en:soya-oil", + "is_in_taxonomy" : 1, + "percent_estimate" : 25, + "percent_max" : 50, + "percent_min" : 
0, + "text" : "Масло соевое", + "vegan" : "yes", + "vegetarian" : "yes" + } + ], "is_in_taxonomy" : 1, - "percent_estimate" : 16.6666666666667, + "percent_estimate" : 25, "percent_max" : 50, "percent_min" : 0, - "text" : "масло растительное соевое", - "vegan" : "yes", - "vegetarian" : "yes" - }, - { - "ciqual_food_code" : "17420", - "from_palm_oil" : "no", - "id" : "en:soya-oil", - "is_in_taxonomy" : 1, - "percent_estimate" : 16.6666666666667, - "percent_max" : 33.3333333333333, - "percent_min" : 0, - "text" : "Масло соевое", - "vegan" : "yes", - "vegetarian" : "yes" + "text" : "Масло", + "vegan" : "maybe", + "vegetarian" : "maybe" } ], "ingredients_analysis" : {}, @@ -45,29 +71,33 @@ "en:vegetarian" ], "ingredients_hierarchy" : [ - "en:sunflower-oil", + "en:vegetable-oil", "en:oil-and-fat", "en:vegetable-oil-and-fat", - "en:vegetable-oil", + "en:oil", + "en:sunflower-oil", "en:soya-oil" ], "ingredients_lc" : "ru", - "ingredients_n" : 3, + "ingredients_n" : 5, "ingredients_n_tags" : [ - "3", + "5", "1-10" ], "ingredients_original_tags" : [ + "en:vegetable-oil", + "en:oil", "en:sunflower-oil", "en:soya-oil", "en:soya-oil" ], "ingredients_percent_analysis" : 1, "ingredients_tags" : [ - "en:sunflower-oil", + "en:vegetable-oil", "en:oil-and-fat", "en:vegetable-oil-and-fat", - "en:vegetable-oil", + "en:oil", + "en:sunflower-oil", "en:soya-oil" ], "ingredients_text" : "масло растительное (подсолнечное, соевое), Масло (соевое)", @@ -75,13 +105,18 @@ "ingredients_with_specified_percent_sum" : 0, "ingredients_with_unspecified_percent_n" : 3, "ingredients_with_unspecified_percent_sum" : 100, - "ingredients_without_ciqual_codes" : [], - "ingredients_without_ciqual_codes_n" : 0, + "ingredients_without_ciqual_codes" : [ + "en:oil", + "en:vegetable-oil" + ], + "ingredients_without_ciqual_codes_n" : 2, "ingredients_without_ecobalyse_ids" : [ - "en:soya-oil" + "en:oil", + "en:soya-oil", + "en:vegetable-oil" ], - "ingredients_without_ecobalyse_ids_n" : 1, - "known_ingredients_n" : 
5, + "ingredients_without_ecobalyse_ids_n" : 3, + "known_ingredients_n" : 6, "lc" : "ru", "nutriments" : { "fruits-vegetables-legumes-estimate-from-ingredients_100g" : 0, diff --git a/tests/unit/expected_test_results/ingredients_preparsing/10.json b/tests/unit/expected_test_results/ingredients_preparsing/10.json index 2c3cd00f9f6b7..cbbf323635824 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/10.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/10.json @@ -2,5 +2,5 @@ "id" : "10", "ingredients_text" : "Huiles végétales de palme et d'olive", "lc" : "fr", - "preparsed_ingredients_text" : "Huiles végétales de palme, Huiles végétales d'olive" + "preparsed_ingredients_text" : "Huiles végétales (huile de palme, huile de olive)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/106.json b/tests/unit/expected_test_results/ingredients_preparsing/106.json index 11d9c296acddb..8f03f14c9b3be 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/106.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/106.json @@ -2,5 +2,5 @@ "id" : "106", "ingredients_text" : "graisse végétale bio (colza)", "lc" : "fr", - "preparsed_ingredients_text" : "graisse végétale bio de colza" + "preparsed_ingredients_text" : "graisse végétale bio (huile de colza)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/108.json b/tests/unit/expected_test_results/ingredients_preparsing/108.json index e0831dc8ea94c..7de16818eb494 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/108.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/108.json @@ -2,5 +2,5 @@ "id" : "108", "ingredients_text" : "huile biologique (tournesol, olive)", "lc" : "fr", - "preparsed_ingredients_text" : "huile biologique de tournesol, huile biologique d'olive" + "preparsed_ingredients_text" : "huile biologique (huile de tournesol, huile de olive)" } diff --git 
a/tests/unit/expected_test_results/ingredients_preparsing/11.json b/tests/unit/expected_test_results/ingredients_preparsing/11.json index ce8c621ff33e2..1e98643f168a9 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/11.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/11.json @@ -2,5 +2,5 @@ "id" : "11", "ingredients_text" : "Huiles végétales de palme, de colza et de tournesol", "lc" : "fr", - "preparsed_ingredients_text" : "Huiles végétales de palme, Huiles végétales de colza, Huiles végétales de tournesol" + "preparsed_ingredients_text" : "Huiles végétales (huile de palme, huile de colza, huile de tournesol)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/110.json b/tests/unit/expected_test_results/ingredients_preparsing/110.json index e4c888eeba0d1..fa3269ff360f4 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/110.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/110.json @@ -2,5 +2,5 @@ "id" : "110", "ingredients_text" : "huiles biologiques (tournesol, olive)", "lc" : "fr", - "preparsed_ingredients_text" : "huiles biologiques de tournesol, huiles biologiques d'olive" + "preparsed_ingredients_text" : "huiles biologiques (huile de tournesol, huile de olive)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/111.json b/tests/unit/expected_test_results/ingredients_preparsing/111.json index cfb13880c6510..203936f3b30fe 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/111.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/111.json @@ -2,5 +2,5 @@ "id" : "111", "ingredients_text" : "huiles (tournesol*, olive). * : bio", "lc" : "fr", - "preparsed_ingredients_text" : "huiles de tournesol Bio, huiles d'olive." + "preparsed_ingredients_text" : "huiles (huile de tournesol Bio, huile de olive)." 
} diff --git a/tests/unit/expected_test_results/ingredients_preparsing/112.json b/tests/unit/expected_test_results/ingredients_preparsing/112.json index b03e9d12ae74f..5539cdc19312b 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/112.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/112.json @@ -2,5 +2,5 @@ "id" : "112", "ingredients_text" : "huiles* (tournesol*, olive vierge extra), sel marin. *issus de l'agriculture biologique.", "lc" : "fr", - "preparsed_ingredients_text" : "huiles Bio de tournesol Bio, huiles Bio d'olive vierge extra, sel marin." + "preparsed_ingredients_text" : "huiles Bio (huile de tournesol Bio, huile de olive vierge extra), sel marin." } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/12.json b/tests/unit/expected_test_results/ingredients_preparsing/12.json index 04596bc56aa52..386b63761e7a9 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/12.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/12.json @@ -2,5 +2,5 @@ "id" : "12", "ingredients_text" : "Huiles végétales de palme, de colza, de tournesol", "lc" : "fr", - "preparsed_ingredients_text" : "Huiles végétales de palme, Huiles végétales de colza, Huiles végétales de tournesol" + "preparsed_ingredients_text" : "Huiles végétales (huile de palme, huile de colza, huile de tournesol)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/121.json b/tests/unit/expected_test_results/ingredients_preparsing/121.json index a3cc90e4d4dcf..875fb7a7140a7 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/121.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/121.json @@ -2,5 +2,5 @@ "id" : "121", "ingredients_text" : "vegetable oil (coconut & rapeseed)", "lc" : "en", - "preparsed_ingredients_text" : "coconut vegetable oil, rapeseed vegetable oil" + "preparsed_ingredients_text" : "vegetable oil (coconut vegetable oil, rapeseed vegetable oil)" } diff --git 
a/tests/unit/expected_test_results/ingredients_preparsing/13.json b/tests/unit/expected_test_results/ingredients_preparsing/13.json index 21e4b9e340d95..794caad7fe2e6 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/13.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/13.json @@ -2,5 +2,5 @@ "id" : "13", "ingredients_text" : "Huiles végétales de palme, de colza et d'olive en proportion variable", "lc" : "fr", - "preparsed_ingredients_text" : "Huiles végétales de palme, Huiles végétales de colza, Huiles végétales d'olive" + "preparsed_ingredients_text" : "Huiles végétales (huile de palme, huile de colza, huile de olive)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/14.json b/tests/unit/expected_test_results/ingredients_preparsing/14.json index 63e7792f88793..6a5235ddeae32 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/14.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/14.json @@ -2,5 +2,5 @@ "id" : "14", "ingredients_text" : "Huiles végétales de palme, de colza et d'olive", "lc" : "fr", - "preparsed_ingredients_text" : "Huiles végétales de palme, Huiles végétales de colza, Huiles végétales d'olive" + "preparsed_ingredients_text" : "Huiles végétales (huile de palme, huile de colza, huile de olive)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/177.json b/tests/unit/expected_test_results/ingredients_preparsing/177.json index f6aa708b8e26f..9f2ac7d2046ad 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/177.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/177.json @@ -2,5 +2,5 @@ "id" : "177", "ingredients_text" : "huiles végétales (palme, olive et tournesol)", "lc" : "fr", - "preparsed_ingredients_text" : "huiles végétales de palme, huiles végétales d'olive, huiles végétales de tournesol" + "preparsed_ingredients_text" : "huiles végétales (huile de palme, huile de olive, huile de tournesol)" } diff --git 
a/tests/unit/expected_test_results/ingredients_preparsing/178.json b/tests/unit/expected_test_results/ingredients_preparsing/178.json index bd8b1ca55fc09..7609d6ddb61a8 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/178.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/178.json @@ -2,5 +2,5 @@ "id" : "178", "ingredients_text" : "huile végétale : colza", "lc" : "fr", - "preparsed_ingredients_text" : "huile végétale de colza" + "preparsed_ingredients_text" : "huile végétale (huile de colza)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/179.json b/tests/unit/expected_test_results/ingredients_preparsing/179.json index 908c9239a9151..4d56c16e2da25 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/179.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/179.json @@ -2,5 +2,5 @@ "id" : "179", "ingredients_text" : "huile végétale : colza, fraises", "lc" : "fr", - "preparsed_ingredients_text" : "huile végétale de colza, fraises" + "preparsed_ingredients_text" : "huile végétale (huile de colza), fraises" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/180.json b/tests/unit/expected_test_results/ingredients_preparsing/180.json index ccd8430a4082e..b01c1027fe546 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/180.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/180.json @@ -2,5 +2,5 @@ "id" : "180", "ingredients_text" : "huile végétale : colza et tomates", "lc" : "fr", - "preparsed_ingredients_text" : "huile végétale de colza et tomates" + "preparsed_ingredients_text" : "huile végétale (huile de colza) et tomates" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/181.json b/tests/unit/expected_test_results/ingredients_preparsing/181.json index 6b496a0e3b402..af888059f4760 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/181.json +++ 
b/tests/unit/expected_test_results/ingredients_preparsing/181.json @@ -2,5 +2,5 @@ "id" : "181", "ingredients_text" : "vegetable oil: sunflower", "lc" : "en", - "preparsed_ingredients_text" : "sunflower vegetable oil" + "preparsed_ingredients_text" : "vegetable oil (sunflower vegetable oil)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/182.json b/tests/unit/expected_test_results/ingredients_preparsing/182.json index 2d5f69fc2d4f8..5499b5c0cb526 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/182.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/182.json @@ -2,5 +2,5 @@ "id" : "182", "ingredients_text" : "vegetable oil (palm)", "lc" : "en", - "preparsed_ingredients_text" : "palm vegetable oil" + "preparsed_ingredients_text" : "vegetable oil (palm vegetable oil)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/183.json b/tests/unit/expected_test_results/ingredients_preparsing/183.json index 12d06b3da3976..6dd38591e2205 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/183.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/183.json @@ -2,5 +2,5 @@ "id" : "183", "ingredients_text" : "vegetable oils (palm, olive)", "lc" : "en", - "preparsed_ingredients_text" : "palm vegetable oils, olive vegetable oils" + "preparsed_ingredients_text" : "vegetable oils (palm vegetable oils, olive vegetable oils)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/184.json b/tests/unit/expected_test_results/ingredients_preparsing/184.json index 8ddcd79b6f7b1..38a0ee7cb56e0 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/184.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/184.json @@ -2,5 +2,5 @@ "id" : "184", "ingredients_text" : "organic vegetable oils (sunflower, colza and rapeseed)", "lc" : "en", - "preparsed_ingredients_text" : "sunflower organic vegetable oils, colza organic vegetable oils, rapeseed 
organic vegetable oils" + "preparsed_ingredients_text" : "organic vegetable oils (sunflower organic vegetable oils, colza organic vegetable oils, rapeseed organic vegetable oils)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/185.json b/tests/unit/expected_test_results/ingredients_preparsing/185.json index a6ee2ab5dbb5e..ba0b14cb5f991 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/185.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/185.json @@ -2,5 +2,5 @@ "id" : "185", "ingredients_text" : "vegetable oils : sunflower, colza and strawberry", "lc" : "en", - "preparsed_ingredients_text" : "sunflower vegetable oils, colza vegetable oils and strawberry" + "preparsed_ingredients_text" : "vegetable oils (sunflower vegetable oils, colza vegetable oils) and strawberry" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/186.json b/tests/unit/expected_test_results/ingredients_preparsing/186.json index a3c6440e90e56..4392d3c001e5a 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/186.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/186.json @@ -2,5 +2,5 @@ "id" : "186", "ingredients_text" : "oleje roślinne (słonecznikowy)", "lc" : "pl", - "preparsed_ingredients_text" : "oleje roślinne słonecznikowy" + "preparsed_ingredients_text" : "oleje roślinne (oleje roślinne słonecznikowy)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/187.json b/tests/unit/expected_test_results/ingredients_preparsing/187.json index 58a2a841ae609..e64a061126632 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/187.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/187.json @@ -2,5 +2,5 @@ "id" : "187", "ingredients_text" : "oleje roślinne: słonecznikowy", "lc" : "pl", - "preparsed_ingredients_text" : "oleje roślinne słonecznikowy" + "preparsed_ingredients_text" : "oleje roślinne (oleje roślinne słonecznikowy)" } diff --git 
a/tests/unit/expected_test_results/ingredients_preparsing/188.json b/tests/unit/expected_test_results/ingredients_preparsing/188.json index 6c57a92152d2c..33829e6be952c 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/188.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/188.json @@ -2,5 +2,5 @@ "id" : "188", "ingredients_text" : "oleje roślinne (słonecznikowy, rzepakowy)", "lc" : "pl", - "preparsed_ingredients_text" : "oleje roślinne słonecznikowy, oleje roślinne rzepakowy" + "preparsed_ingredients_text" : "oleje roślinne (oleje roślinne słonecznikowy, oleje roślinne rzepakowy)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/189.json b/tests/unit/expected_test_results/ingredients_preparsing/189.json index b2d368f36a1da..56e2d583accf7 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/189.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/189.json @@ -2,5 +2,5 @@ "id" : "189", "ingredients_text" : "oleje roślinne (sojowy, słonecznikowy, kokosowy, rzepakowy) w zmiennych proporcjach", "lc" : "pl", - "preparsed_ingredients_text" : "oleje roślinne sojowy, oleje roślinne słonecznikowy, oleje roślinne kokosowy, oleje roślinne rzepakowy" + "preparsed_ingredients_text" : "oleje roślinne (oleje roślinne sojowy, oleje roślinne słonecznikowy, oleje roślinne kokosowy, oleje roślinne rzepakowy)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/190.json b/tests/unit/expected_test_results/ingredients_preparsing/190.json index 264a4e67f5b96..378451ca43314 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/190.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/190.json @@ -2,5 +2,5 @@ "id" : "190", "ingredients_text" : "tłuszcze roślinne (palmowy nieutwardzony, shea)", "lc" : "pl", - "preparsed_ingredients_text" : "tłuszcze roślinne palmowy nieutwardzony, tłuszcze roślinne shea" + "preparsed_ingredients_text" : "tłuszcze roślinne 
(tłuszcze roślinne palmowy nieutwardzony, tłuszcze roślinne shea)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/191.json b/tests/unit/expected_test_results/ingredients_preparsing/191.json index e43c2c5f0452c..cf8d058bfea79 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/191.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/191.json @@ -2,5 +2,5 @@ "id" : "191", "ingredients_text" : "tłuszcze roślinne (kokosowy i palmowy) w zmiennych proporcjach", "lc" : "pl", - "preparsed_ingredients_text" : "tłuszcze roślinne kokosowy, tłuszcze roślinne palmowy" + "preparsed_ingredients_text" : "tłuszcze roślinne (tłuszcze roślinne kokosowy, tłuszcze roślinne palmowy)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/192.json b/tests/unit/expected_test_results/ingredients_preparsing/192.json index ad195858f9513..4be73f074cafc 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/192.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/192.json @@ -2,5 +2,5 @@ "id" : "192", "ingredients_text" : "mięso (wołowe, wieprzowe, cielęce)", "lc" : "pl", - "preparsed_ingredients_text" : "mięso wołowe, mięso wieprzowe, mięso cielęce" + "preparsed_ingredients_text" : "mięso (mięso wołowe, mięso wieprzowe, mięso cielęce)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/193.json b/tests/unit/expected_test_results/ingredients_preparsing/193.json index 83eb7af7be14a..f28ebf0776e2d 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/193.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/193.json @@ -2,5 +2,5 @@ "id" : "193", "ingredients_text" : "przeciery z (jabłek, bananów, marchwi)", "lc" : "pl", - "preparsed_ingredients_text" : "przeciery z jabłek, przeciery z bananów, przeciery z marchwi" + "preparsed_ingredients_text" : "przeciery z (przeciery z jabłek, przeciery z bananów, przeciery z marchwi)" } diff --git 
a/tests/unit/expected_test_results/ingredients_preparsing/194.json b/tests/unit/expected_test_results/ingredients_preparsing/194.json index 57a33b472d72e..f2631301b92fa 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/194.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/194.json @@ -2,5 +2,5 @@ "id" : "194", "ingredients_text" : "масло (Подсолнечное)", "lc" : "ru", - "preparsed_ingredients_text" : "масло Подсолнечное" + "preparsed_ingredients_text" : "масло (масло Подсолнечное)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/195.json b/tests/unit/expected_test_results/ingredients_preparsing/195.json index dbcf4e74bcbb2..a6aab75a64d3d 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/195.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/195.json @@ -2,5 +2,5 @@ "id" : "195", "ingredients_text" : "Масло (подсолнечное)", "lc" : "ru", - "preparsed_ingredients_text" : "Масло подсолнечное" + "preparsed_ingredients_text" : "Масло (Масло подсолнечное)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/196.json b/tests/unit/expected_test_results/ingredients_preparsing/196.json index ef493384fcdb2..7bd83951bcd04 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/196.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/196.json @@ -2,5 +2,5 @@ "id" : "196", "ingredients_text" : "масло растительное (подсолнечное, соевое)", "lc" : "ru", - "preparsed_ingredients_text" : "масло растительное подсолнечное, масло растительное соевое" + "preparsed_ingredients_text" : "масло растительное (масло растительное подсолнечное, масло растительное соевое)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/20.json b/tests/unit/expected_test_results/ingredients_preparsing/20.json index d3b67e3bd90a7..9be98163c0afa 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/20.json +++ 
b/tests/unit/expected_test_results/ingredients_preparsing/20.json @@ -2,5 +2,5 @@ "id" : "20", "ingredients_text" : "Marmelade d'oranges 41% (sirop de glucose-fructose, sucre, pulpe d'orange 4.5%, jus d'orange concentré 1.4% (équivalent jus d'orange 7.8%), pulpe d'orange concentrée 0.6% (équivalent pulpe d'orange 2.6%), gélifiant (pectines), acidifiant (acide citrique), correcteurs d'acidité (citrate de calcium, citrate de sodium), arôme naturel d'orange, épaississant (gomme xanthane)), chocolat 24.9% (sucre, pâte de cacao, beurre de cacao, graisses végétales (illipe, mangue, sal, karité et palme en proportions variables), arôme, émulsifiant (lécithine de soja), lactose et protéines de lait), farine de blé, sucre, oeufs, sirop de glucose-fructose, huile de colza, poudre à lever (carbonate acide d'ammonium, diphosphate disodique, carbonate acide de sodium), sel, émulsifiant (lécithine de soja).", "lc" : "fr", - "preparsed_ingredients_text" : "Marmelade d'oranges 41% (sirop de glucose-fructose, sucre, pulpe d'orange 4.5%, jus d'orange concentré 1.4% (équivalent jus d'orange 7.8%), pulpe d'orange concentrée 0.6% (équivalent pulpe d'orange 2.6%), gélifiant (pectines), acidifiant (acide citrique), correcteurs d'acidité (citrate de calcium, citrate de sodium), arôme naturel d'orange, épaississant (gomme xanthane)), chocolat 24.9% (sucre, pâte de cacao, beurre de cacao, graisses végétales d'illipe, graisses végétales de mangue, graisses végétales de sal, graisses végétales de karité, graisses végétales de palme, arôme, émulsifiant (lécithine de soja), lactose et protéines de lait), farine de blé, sucre, oeufs, sirop de glucose-fructose, huile de colza, poudre à lever (carbonate acide d'ammonium, diphosphate disodique, carbonate acide de sodium), sel, émulsifiant (lécithine de soja)." 
+ "preparsed_ingredients_text" : "Marmelade d'oranges 41% (sirop de glucose-fructose, sucre, pulpe d'orange 4.5%, jus d'orange concentré 1.4% (équivalent jus d'orange 7.8%), pulpe d'orange concentrée 0.6% (équivalent pulpe d'orange 2.6%), gélifiant (pectines), acidifiant (acide citrique), correcteurs d'acidité (citrate de calcium, citrate de sodium), arôme naturel d'orange, épaississant (gomme xanthane)), chocolat 24.9% (sucre, pâte de cacao, beurre de cacao, graisses végétales (huile de illipe, huile de mangue, huile de sal, huile de karité, huile de palme), arôme, émulsifiant (lécithine de soja), lactose et protéines de lait), farine de blé, sucre, oeufs, sirop de glucose-fructose, huile de colza, poudre à lever (carbonate acide d'ammonium, diphosphate disodique, carbonate acide de sodium), sel, émulsifiant (lécithine de soja)." } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/205.json b/tests/unit/expected_test_results/ingredients_preparsing/205.json index bf0e8b0bae6b6..1034b5f99323d 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/205.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/205.json @@ -2,5 +2,5 @@ "id" : "205", "ingredients_text" : "Piments (vert, rouge, jaune)", "lc" : "fr", - "preparsed_ingredients_text" : "Piments vert, Piments rouge, Piments jaune" + "preparsed_ingredients_text" : "Piments (Piments vert, Piments rouge, Piments jaune)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/206.json b/tests/unit/expected_test_results/ingredients_preparsing/206.json index 879310a6c3e59..f274e2082ac87 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/206.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/206.json @@ -2,5 +2,5 @@ "id" : "206", "ingredients_text" : "pflanzliches Fett (Kokosnuss, Palmkern)", "lc" : "de", - "preparsed_ingredients_text" : "Kokosnussfett, Palmkernfett" + "preparsed_ingredients_text" : "pflanzliches Fett 
(Kokosnussfett, Palmkernfett)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/207.json b/tests/unit/expected_test_results/ingredients_preparsing/207.json index e7384ea6a8ead..cdd31b4c90a20 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/207.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/207.json @@ -2,5 +2,5 @@ "id" : "207", "ingredients_text" : "pflanzliche Öle und Fette (Raps, Palm, Shea, Sonnenblumen)", "lc" : "de", - "preparsed_ingredients_text" : "Rapsöl, Palmfett, Sheafett, Sonnenblumenfett" + "preparsed_ingredients_text" : "pflanzliche Öle und Fette (Rapsöl, Palmfett, Sheafett, Sonnenblumenfett)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/208.json b/tests/unit/expected_test_results/ingredients_preparsing/208.json index 89b9c81c22cd4..44cb87bf16016 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/208.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/208.json @@ -2,5 +2,5 @@ "id" : "208", "ingredients_text" : "Huiles végétales de palme, de colza et de tournesol", "lc" : "fr", - "preparsed_ingredients_text" : "Huiles végétales de palme, Huiles végétales de colza, Huiles végétales de tournesol" + "preparsed_ingredients_text" : "Huiles végétales (huile de palme, huile de colza, huile de tournesol)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/21.json b/tests/unit/expected_test_results/ingredients_preparsing/21.json index 2a09a1f993984..f4db6b9f4e368 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/21.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/21.json @@ -2,5 +2,5 @@ "id" : "21", "ingredients_text" : "graisses végétales (illipe, mangue, sal, karité et palme en proportions variables)", "lc" : "fr", - "preparsed_ingredients_text" : "graisses végétales d'illipe, graisses végétales de mangue, graisses végétales de sal, graisses végétales de karité, graisses végétales de 
palme" + "preparsed_ingredients_text" : "graisses végétales (huile de illipe, huile de mangue, huile de sal, huile de karité, huile de palme)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/211.json b/tests/unit/expected_test_results/ingredients_preparsing/211.json index 6633263b9cd2f..3cf2a335b5fb8 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/211.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/211.json @@ -2,5 +2,5 @@ "id" : "211", "ingredients_text" : "huile végétale (colza)", "lc" : "fr", - "preparsed_ingredients_text" : "huile végétale de colza" + "preparsed_ingredients_text" : "huile végétale (huile de colza)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/212.json b/tests/unit/expected_test_results/ingredients_preparsing/212.json index 99a9cf3a6e0d4..9aba99f8159d2 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/212.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/212.json @@ -2,5 +2,5 @@ "id" : "212", "ingredients_text" : "huile végétale : colza", "lc" : "fr", - "preparsed_ingredients_text" : "huile végétale de colza" + "preparsed_ingredients_text" : "huile végétale (huile de colza)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/213.json b/tests/unit/expected_test_results/ingredients_preparsing/213.json index ad000e8a4fc8e..dcc6668dfc663 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/213.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/213.json @@ -2,5 +2,5 @@ "id" : "213", "ingredients_text" : "ječmeni i pšenični slad", "lc" : "hr", - "preparsed_ingredients_text" : "ječmeni slad, pšenični slad" + "preparsed_ingredients_text" : "slad (ječmeni slad, pšenični slad)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/214.json b/tests/unit/expected_test_results/ingredients_preparsing/214.json index 0c3bda643c598..2a20c02cc084a 100644 --- 
a/tests/unit/expected_test_results/ingredients_preparsing/214.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/214.json @@ -2,5 +2,5 @@ "id" : "214", "ingredients_text" : "ječmeni, ječmeni i pšenični slad", "lc" : "hr", - "preparsed_ingredients_text" : "ječmeni slad, ječmeni slad, pšenični slad" + "preparsed_ingredients_text" : "slad (ječmeni slad, ječmeni slad, pšenični slad)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/215.json b/tests/unit/expected_test_results/ingredients_preparsing/215.json index 6346738056162..0a9c910bf1079 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/215.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/215.json @@ -2,5 +2,5 @@ "id" : "215", "ingredients_text" : "Pasterizirano mlijeko (s 1.0% mliječne masti)", "lc" : "hr", - "preparsed_ingredients_text" : "Pasterizirano mlijeko s 1.0% mliječne masti" + "preparsed_ingredients_text" : "Pasterizirano mlijeko (mlijeko s 1.0% mliječne masti)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/216.json b/tests/unit/expected_test_results/ingredients_preparsing/216.json index 3034bd0b1c901..2e80fca316cd0 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/216.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/216.json @@ -2,5 +2,5 @@ "id" : "216", "ingredients_text" : "Vegetal oil (sunflower, olive and palm)", "lc" : "en", - "preparsed_ingredients_text" : "sunflower Vegetal oil, olive Vegetal oil, palm Vegetal oil" + "preparsed_ingredients_text" : "Vegetal oil (sunflower Vegetal oil, olive Vegetal oil, palm Vegetal oil)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/217.json b/tests/unit/expected_test_results/ingredients_preparsing/217.json index 6ef630346929b..1b56020d3ca80 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/217.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/217.json @@ -2,5 +2,5 @@ 
"id" : "217", "ingredients_text" : "vegetable oil (palm)", "lc" : "en", - "preparsed_ingredients_text" : "palm vegetable oil" + "preparsed_ingredients_text" : "vegetable oil (palm vegetable oil)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/218.json b/tests/unit/expected_test_results/ingredients_preparsing/218.json index 9978164f46913..17b87596c1442 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/218.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/218.json @@ -2,5 +2,5 @@ "id" : "218", "ingredients_text" : "vegetable oil: palm", "lc" : "en", - "preparsed_ingredients_text" : "palm vegetable oil" + "preparsed_ingredients_text" : "vegetable oil (palm vegetable oil)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/219.json b/tests/unit/expected_test_results/ingredients_preparsing/219.json index 869c4754f8ee5..bc7a161250f36 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/219.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/219.json @@ -2,5 +2,5 @@ "id" : "219", "ingredients_text" : "protéines végétales (soja, blé)", "lc" : "fr", - "preparsed_ingredients_text" : "protéine de soja, protéine de blé" + "preparsed_ingredients_text" : "protéines végétales (protéine de soja, protéine de blé)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/22.json b/tests/unit/expected_test_results/ingredients_preparsing/22.json index c29b69e676eca..862aed6a5cec6 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/22.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/22.json @@ -2,5 +2,5 @@ "id" : "22", "ingredients_text" : "graisses végétales (illipe, mangue, palme)", "lc" : "fr", - "preparsed_ingredients_text" : "graisses végétales d'illipe, graisses végétales de mangue, graisses végétales de palme" + "preparsed_ingredients_text" : "graisses végétales (huile de illipe, huile de mangue, huile de palme)" } diff 
--git a/tests/unit/expected_test_results/ingredients_preparsing/220.json b/tests/unit/expected_test_results/ingredients_preparsing/220.json index ae44a714abfd5..687c2b6c01992 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/220.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/220.json @@ -2,5 +2,5 @@ "id" : "220", "ingredients_text" : "pflanzliche Proteine (Erbsen, Sonnenblumen)", "lc" : "de", - "preparsed_ingredients_text" : "Erbsenprotein, Sonnenblumenprotein" + "preparsed_ingredients_text" : "pflanzliche Proteine (Erbsenprotein, Sonnenblumenprotein)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/222.json b/tests/unit/expected_test_results/ingredients_preparsing/222.json index bc489ca92a861..daee9cdbfa11b 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/222.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/222.json @@ -2,5 +2,5 @@ "id" : "222", "ingredients_text" : "Huile de palme, noisettes et tournesol", "lc" : "fr", - "preparsed_ingredients_text" : "Huile de palme, Huile de noisettes, Huile de tournesol" + "preparsed_ingredients_text" : "Huile (huile de palme, huile de noisettes, huile de tournesol)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/224.json b/tests/unit/expected_test_results/ingredients_preparsing/224.json index d6f476a00a1e9..5423dd2959621 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/224.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/224.json @@ -2,5 +2,5 @@ "id" : "224", "ingredients_text" : "arôme naturel de citron, citron vert et d'autres agrumes", "lc" : "fr", - "preparsed_ingredients_text" : "arôme naturel de citron, arôme naturel de citron vert, arôme naturel d'agrumes" + "preparsed_ingredients_text" : "arôme naturel (arôme naturel de citron, arôme naturel de citron vert, arôme naturel d'agrumes)" } diff --git 
a/tests/unit/expected_test_results/ingredients_preparsing/225.json b/tests/unit/expected_test_results/ingredients_preparsing/225.json new file mode 100644 index 0000000000000..4a2c0a546c87a --- /dev/null +++ b/tests/unit/expected_test_results/ingredients_preparsing/225.json @@ -0,0 +1,6 @@ +{ + "id" : "225", + "ingredients_text" : "Huiles végétales (colza, palme)", + "lc" : "fr", + "preparsed_ingredients_text" : "Huiles végétales (huile de colza, huile de palme)" +} diff --git a/tests/unit/expected_test_results/ingredients_preparsing/226.json b/tests/unit/expected_test_results/ingredients_preparsing/226.json new file mode 100644 index 0000000000000..d85e0e075d4ae --- /dev/null +++ b/tests/unit/expected_test_results/ingredients_preparsing/226.json @@ -0,0 +1,6 @@ +{ + "id" : "226", + "ingredients_text" : "Huiles végétales 54.5% (colza, palme)", + "lc" : "fr", + "preparsed_ingredients_text" : "Huiles végétales 54.5% (huile de colza, huile de palme)" +} diff --git a/tests/unit/expected_test_results/ingredients_preparsing/227.json b/tests/unit/expected_test_results/ingredients_preparsing/227.json new file mode 100644 index 0000000000000..ef1a255590a9c --- /dev/null +++ b/tests/unit/expected_test_results/ingredients_preparsing/227.json @@ -0,0 +1,6 @@ +{ + "id" : "227", + "ingredients_text" : "Huiles végétales non hydrogénées (colza, palme)", + "lc" : "fr", + "preparsed_ingredients_text" : "Huiles végétales non hydrogénées (huile de colza, huile de palme)" +} diff --git a/tests/unit/expected_test_results/ingredients_preparsing/228.json b/tests/unit/expected_test_results/ingredients_preparsing/228.json new file mode 100644 index 0000000000000..6072a2195565c --- /dev/null +++ b/tests/unit/expected_test_results/ingredients_preparsing/228.json @@ -0,0 +1,6 @@ +{ + "id" : "228", + "ingredients_text" : "Huiles végétales bio (olive, palme, tournesol)", + "lc" : "fr", + "preparsed_ingredients_text" : "Huiles végétales bio (huile de olive, huile de palme, huile de tournesol)" +} 
diff --git a/tests/unit/expected_test_results/ingredients_preparsing/229.json b/tests/unit/expected_test_results/ingredients_preparsing/229.json new file mode 100644 index 0000000000000..d8e37be1069c8 --- /dev/null +++ b/tests/unit/expected_test_results/ingredients_preparsing/229.json @@ -0,0 +1,6 @@ +{ + "id" : "229", + "ingredients_text" : "Масло (Пальмовое), масло растительное (подсолнечное, соевое)", + "lc" : "ru", + "preparsed_ingredients_text" : "Масло (Масло Пальмовое), масло растительное (масло растительное подсолнечное, масло растительное соевое)" +} diff --git a/tests/unit/expected_test_results/ingredients_preparsing/23.json b/tests/unit/expected_test_results/ingredients_preparsing/23.json index d7f6284a349f7..9d5ed9a58c536 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/23.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/23.json @@ -2,5 +2,5 @@ "id" : "23", "ingredients_text" : "graisses végétales (illipe)", "lc" : "fr", - "preparsed_ingredients_text" : "graisses végétales d'illipe" + "preparsed_ingredients_text" : "graisses végétales (huile de illipe)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/24.json b/tests/unit/expected_test_results/ingredients_preparsing/24.json index 6c61262239cb4..0015264fa0cab 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/24.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/24.json @@ -2,5 +2,5 @@ "id" : "24", "ingredients_text" : "graisses végétales (illipe et sal)", "lc" : "fr", - "preparsed_ingredients_text" : "graisses végétales d'illipe, graisses végétales de sal" + "preparsed_ingredients_text" : "graisses végétales (huile de illipe, huile de sal)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/6.json b/tests/unit/expected_test_results/ingredients_preparsing/6.json index ed1fb9c7eff84..8d6e5877963d4 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/6.json +++ 
b/tests/unit/expected_test_results/ingredients_preparsing/6.json @@ -2,5 +2,5 @@ "id" : "6", "ingredients_text" : "Huile (palme)", "lc" : "fr", - "preparsed_ingredients_text" : "Huile de palme" + "preparsed_ingredients_text" : "Huile (huile de palme)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/7.json b/tests/unit/expected_test_results/ingredients_preparsing/7.json index 8744a6c172f4c..68ce87cd65204 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/7.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/7.json @@ -2,5 +2,5 @@ "id" : "7", "ingredients_text" : "Huile (palme, colza)", "lc" : "fr", - "preparsed_ingredients_text" : "Huile de palme, Huile de colza" + "preparsed_ingredients_text" : "Huile (huile de palme, huile de colza)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/77.json b/tests/unit/expected_test_results/ingredients_preparsing/77.json index 3c0a5b48907e3..5d59661da3a88 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/77.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/77.json @@ -2,5 +2,5 @@ "id" : "77", "ingredients_text" : "arôme naturel de citron-citron vert et d'autres agrumes", "lc" : "fr", - "preparsed_ingredients_text" : "arôme naturel de citron, arôme naturel de citron vert, arôme naturel d'agrumes" + "preparsed_ingredients_text" : "arôme naturel (arôme naturel de citron, arôme naturel de citron vert, arôme naturel d'agrumes)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/78.json b/tests/unit/expected_test_results/ingredients_preparsing/78.json index ac852977a62c8..22e452a1d312b 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/78.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/78.json @@ -2,5 +2,5 @@ "id" : "78", "ingredients_text" : "arômes naturels de citron et de limette", "lc" : "fr", - "preparsed_ingredients_text" : "arômes naturels de citron, arômes 
naturels de limette" + "preparsed_ingredients_text" : "arômes naturels (arômes naturels de citron, arômes naturels de limette)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/8.json b/tests/unit/expected_test_results/ingredients_preparsing/8.json index 945d751f82cb1..e079704bbb88b 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/8.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/8.json @@ -2,5 +2,5 @@ "id" : "8", "ingredients_text" : "Huile (palme et colza)", "lc" : "fr", - "preparsed_ingredients_text" : "Huile de palme, Huile de colza" + "preparsed_ingredients_text" : "Huile (huile de palme, huile de colza)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/85.json b/tests/unit/expected_test_results/ingredients_preparsing/85.json index 8151c108065cc..abbd100bb2b88 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/85.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/85.json @@ -2,5 +2,5 @@ "id" : "85", "ingredients_text" : "huile végétale de tournesol et/ou colza", "lc" : "fr", - "preparsed_ingredients_text" : "huile végétale de tournesol, huile végétale de colza" + "preparsed_ingredients_text" : "huile végétale (huile de tournesol, huile de colza)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/9.json b/tests/unit/expected_test_results/ingredients_preparsing/9.json index 83c8636e99105..6e941781083d8 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/9.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/9.json @@ -2,5 +2,5 @@ "id" : "9", "ingredients_text" : "Huiles végétales de palme et de colza", "lc" : "fr", - "preparsed_ingredients_text" : "Huiles végétales de palme, Huiles végétales de colza" + "preparsed_ingredients_text" : "Huiles végétales (huile de palme, huile de colza)" } diff --git a/tests/unit/expected_test_results/ingredients_preparsing/98.json 
b/tests/unit/expected_test_results/ingredients_preparsing/98.json index 2fe0d4dfff0aa..4c6cd525d75bf 100644 --- a/tests/unit/expected_test_results/ingredients_preparsing/98.json +++ b/tests/unit/expected_test_results/ingredients_preparsing/98.json @@ -2,5 +2,5 @@ "id" : "98", "ingredients_text" : "matière grasse végétale (palme) raffinée", "lc" : "fr", - "preparsed_ingredients_text" : "matière grasse végétale de palme raffinée" + "preparsed_ingredients_text" : "matière grasse végétale (huile de palme) raffinée" } diff --git a/tests/unit/ingredients_preparsing.t b/tests/unit/ingredients_preparsing.t index 9cd4627d38502..c84146d0541a1 100755 --- a/tests/unit/ingredients_preparsing.t +++ b/tests/unit/ingredients_preparsing.t @@ -20,10 +20,16 @@ is(normalize_a_of_b("en", "oil", "olive", 1), "olive oil"); is(normalize_a_of_b("es", "aceta", "oliva", 1), "aceta de oliva"); is(normalize_a_of_b("fr", "huile végétale", "olive", 1), "huile végétale d'olive"); -is(normalize_enumeration("en", "phosphates", "calcium and sodium", 1), "calcium phosphates, sodium phosphates"); -is(normalize_enumeration("en", "vegetal oil", "sunflower, palm", 1), "sunflower vegetal oil, palm vegetal oil"); -is(normalize_enumeration("fr", "huile", "colza, tournesol et olive", 1), - "huile de colza, huile de tournesol, huile d'olive"); +is(normalize_enumeration("en", "phosphates", "calcium and sodium", 1), + "phosphates (calcium phosphates, sodium phosphates)"); +is( + normalize_enumeration("en", "vegetal oil", "sunflower, palm", 1), + "vegetal oil (sunflower vegetal oil, palm vegetal oil)" +); +is( + normalize_enumeration("fr", "huile", "colza, tournesol et olive", 1), + "huile (huile de colza, huile de tournesol, huile d'olive)" +); is(separate_additive_class("fr", "colorant", " ", "", "naturel"), "colorant "); is(separate_additive_class("fr", "colorant", " ", "", "carmins"), "colorant : "); @@ -1200,6 +1206,31 @@ my @tests = ( id => '224', lc => 'fr', ingredients_text => 'arôme naturel de citron, 
citron vert et d\'autres agrumes' + }, + { + id => '225', + lc => 'fr', + ingredients_text => 'Huiles végétales (colza, palme)', + }, + { + id => '226', + lc => 'fr', + ingredients_text => 'Huiles végétales 54.5% (colza, palme)', + }, + { + id => '227', + lc => 'fr', + ingredients_text => 'Huiles végétales non hydrogénées (colza, palme)', + }, + { + id => '228', + lc => 'fr', + ingredients_text => 'Huiles végétales bio (olive, palme, tournesol)', + }, + { + id => '229', + lc => 'ru', + ingredients_text => 'Масло (Пальмовое), масло растительное (подсолнечное, соевое)', } ); diff --git a/tests/unit/ingredients_processing.t b/tests/unit/ingredients_processing.t index 6965af01a2a32..7b89fd40034e8 100755 --- a/tests/unit/ingredients_processing.t +++ b/tests/unit/ingredients_processing.t @@ -1837,10 +1837,16 @@ my @tests = ( {lc => "hr", ingredients_text => "Pasterizirano mlijeko (s 1.0% mliječne masti)"}, [ { - 'id' => 'en:milk-with-1-0-milk-fat', - 'is_in_taxonomy' => 1, - 'processing' => 'en:pasteurised', - 'text' => 'mlijeko s 1.0% mliječne masti' + id => "en:pasteurised-milk", + ingredients => [ + { + id => "en:milk-with-1-0-milk-fat", + is_in_taxonomy => 1, + text => "mlijeko s 1.0% mlije\x{10d}ne masti" + } + ], + is_in_taxonomy => 1, + text => "Pasterizirano mlijeko" } ] ], diff --git a/tests/unit/ingredients_tags.t b/tests/unit/ingredients_tags.t index df342b2ff7b04..9a713a7b42942 100755 --- a/tests/unit/ingredients_tags.t +++ b/tests/unit/ingredients_tags.t @@ -187,7 +187,7 @@ my @tests = ( ["en:colour", "en:e162", "en:e160c", "en:e100"], ], - [{lc => "fr", ingredients_text => "graisse végétale bio (colza)"}, ["en:colza-oil"]], + [{lc => "fr", ingredients_text => "graisse végétale bio (colza)"}, ["en:vegetable-fat", "en:colza-oil"]], [{lc => "fr", ingredients_text => "lait cru de lapin"}, ["fr:lait-cru-de-lapin"]], [ @@ -199,7 +199,10 @@ my @tests = ( ["en:unrefined-cane-sugar", "en:banana", "en:tomato", "en:unrefined-sugar"] ], - [{lc => "en", ingredients_text 
=> "vegetable oil (coconut & rapeseed)"}, ["en:coconut-oil", "en:rapeseed-oil"]], + [ + {lc => "en", ingredients_text => "vegetable oil (coconut & rapeseed)"}, + ["en:vegetable-oil", "en:coconut-oil", "en:rapeseed-oil"] + ], [{lc => "fr", ingredients_text => "amidon de blé. traces de _céleri_."}, ["en:wheat-starch"]], @@ -316,7 +319,7 @@ my @tests = ( # Russian oil [ {lc => "ru", ingredients_text => "масло растительное (подсолнечное, соевое), Масло (Пальмовое)"}, - ["en:sunflower-oil", "en:soya-oil", "en:palm-oil"] + ["en:vegetable-oil", "en:oil", "en:sunflower-oil", "en:soya-oil", "en:palm-oil"] ], [{lc => "fr", ingredients_text => "Banane coupée et cuite au naturel"}, ["en:banana"],], [