From 730f62161e66f0692ea81c531462d94d0a14c362 Mon Sep 17 00:00:00 2001 From: benbenben2 <110821832+benbenben2@users.noreply.github.com> Date: Thu, 30 Nov 2023 13:28:02 +0100 Subject: [PATCH] feat: parse origin of ingredients for Japanese (#9125) --- lib/ProductOpener/Ingredients.pm | 80 +- taxonomies/additives.txt | 2 +- taxonomies/countries.txt | 2 +- taxonomies/ingredients.txt | 65 +- taxonomies/ingredients_processing.txt | 7 +- taxonomies/origins.txt | 3622 +++++++++++++++++ .../ingredients/en-origin-and.json | 56 + .../ingredients/fr-origin-and.json | 58 + .../ingredients/ja-origin-and.json | 78 + .../ingredients/ja-origins.json | 242 ++ .../ingredients/ja-parenthesis.json | 27 +- tests/unit/ingredients.t | 43 + 12 files changed, 4236 insertions(+), 46 deletions(-) create mode 100644 tests/unit/expected_test_results/ingredients/en-origin-and.json create mode 100644 tests/unit/expected_test_results/ingredients/fr-origin-and.json create mode 100644 tests/unit/expected_test_results/ingredients/ja-origin-and.json create mode 100644 tests/unit/expected_test_results/ingredients/ja-origins.json diff --git a/lib/ProductOpener/Ingredients.pm b/lib/ProductOpener/Ingredients.pm index 52781dd890d16..a5df551afc664 100644 --- a/lib/ProductOpener/Ingredients.pm +++ b/lib/ProductOpener/Ingredients.pm @@ -144,8 +144,9 @@ use Data::DeepAccess qw(deep_get deep_exists); # U+204D "⁍" (Black Rightwards Bullet) # U+2219 "∙" (Bullet Operator ) # U+22C5 "⋅" (Dot Operator) +# U+30FB "・" (Katakana Middle Dot) my $middle_dot - = qr/(?: \N{U+00B7} |\N{U+2022}|\N{U+2023}|\N{U+25E6}|\N{U+2043}|\N{U+204C}|\N{U+204D}|\N{U+2219}|\N{U+22C5})/i; + = qr/(?: \N{U+00B7} |\N{U+2022}|\N{U+2023}|\N{U+25E6}|\N{U+2043}|\N{U+204C}|\N{U+204D}|\N{U+2219}|\N{U+22C5}|\N{U+30FB})/i; # Unicode category 'Punctuation, Dash', SWUNG DASH and MINUS SIGN my $dashes = qr/(?:\p{Pd}|\N{U+2053}|\N{U+2212})/i; @@ -466,6 +467,7 @@ my %and_or = ( fr => " et | ou | et/ou | et / ou ", is => " og | eða | og/eða | og / eða ", it => " e | o | e/o | e / o", + ja => "又は", # or nl => " en/of | en / of ", nb => " og | eller | og/eller | og / eller ", pl => " i | oraz | lub | albo ", @@ -1898,15 +1900,13 @@ sub parse_ingredients_text_service ($product_ref, $updated_product_fields_ref) { # e.g. (Contains milk.) -> Contains milk. $between =~ s/(\s|\.)+$//; - $debug_ingredients and $log->debug("found sub-ingredients", {between => $between, after => $after}) + $debug_ingredients and $log->debug("parse_ingredients_text - sub-ingredients found: $between") if $log->is_debug(); # percent followed by a separator, assume the percent applies to the parent (e.g. tomatoes) # tomatoes (64%, origin: Spain) # tomatoes (145g per 100g of finished product) - if (($between =~ $separators) and ($` =~ /^$percent_or_quantity_regexp$/i)) { - $percent_or_quantity_value = $1; $percent_or_quantity_unit = $2; # remove what is before the first separator @@ -1924,10 +1924,20 @@ sub parse_ingredients_text_service ($product_ref, $updated_product_fields_ref) { # sel marin (France, Italie) # -> if we have origins, put "origins:" before - if ( ($between =~ $separators) - and (exists_taxonomy_tag("origins", canonicalize_taxonomy_tag($ingredients_lc, "origins", $`)))) + if ( + ( + ($between =~ /$separators|$and/) + and ( + exists_taxonomy_tag( + "origins", canonicalize_taxonomy_tag($ingredients_lc, "origins", $`) + ) + ) + ) + or ($between =~ /産|製造/) + ) { - $between =~ s/^(.*?$separators)/origins:$1/; + # prepend "origins:" in the beginning of the text, that will be reused below + $between = "origins:" . $between; } $debug_ingredients and $log->debug( @@ -1940,59 +1950,82 @@ sub parse_ingredients_text_service ($product_ref, $updated_product_fields_ref) { } ) if $log->is_debug(); - # : is in $separators but we want to keep "origine : France" or "min : 23%" if ( ($between =~ $separators) and ($` !~ /\s*(origin|origins|origine|alkuperä|ursprung)\s*/i) and ($between !~ /^$percent_or_quantity_regexp$/i)) { $between_level = $level + 1; - $debug_ingredients and $log->debug("between contains a separator", {between => $between}) - if $log->is_debug(); + $log->debug( + "parse_ingredients_text - sub-ingredients: between contains a separator and is not origin nor has percent", + {between => $between} + ) if $log->is_debug(); } else { # no separator found : 34% ? or single ingredient - $debug_ingredients - and $log->debug("between does not contain a separator", {between => $between}) - if $log->is_debug(); + $log->debug( + "parse_ingredients_text - sub-ingredients: between does not contain a separator or is origin or is percent", + {between => $between} + ) if $log->is_debug(); if ($between =~ /^$percent_or_quantity_regexp(?:$per_100g_regexp)?$/i) { $percent_or_quantity_value = $1; $percent_or_quantity_unit = $2; - $debug_ingredients - and $log->debug( - "between is a percent", + $log->debug( + "parse_ingredients_text - sub-ingredients: between is a percent", { between => $between, percent_or_quantity_value => $percent_or_quantity_value, percent_or_quantity_unit => $percent_or_quantity_unit } - ) if $log->is_debug(); + ) if $log->is_debug(); $between = ''; } else { # label? (organic) # origin? (origine : France) + $log->debug("parse_ingredients_text - sub-ingredients: label? origin? ($between)") + if $log->is_debug(); # try to remove the origin and store it as property if ($between - =~ /\s*(de origine|d'origine|origine|origin|origins|alkuperä|ursprung|oorsprong)\s?:?\s?\b(.*)$/i + =~ /\s*(?:de origine|d'origine|origine|origin|origins|alkuperä|ursprung|oorsprong)\s?:?\s?\b(.*)$/i ) { + $log->debug("parse_ingredients_text - sub-ingredients: contains origin in $between") + if $log->is_debug(); + $between = ''; - my $origin_string = $2; + # rm first occurence (origin:) + my $origin_string = $1; + + # rm additional parenthesis and its content that are sub-ingredient of origing (not parsed for now) + # example: "トマト (輸入又は国産 (未満 5%))"" (i.e., "Tomatoes (imported or domestically produced (less than 5%)))"") + $origin_string =~ s/\s*\([^)]*\)//g; + + if ($ingredients_lc eq 'ja') { + # rm all occurences at the end of words (ブラジル産、エチオピア産) + $origin_string =~ s/(産|製造)//g; + # remove "and more" その他 + $origin_string =~ s/(?: and )?その他//g; + } + # d'origine végétale -> not a geographic origin, add en:vegan if ($origin_string =~ /vegetal|végétal/i) { $vegan = "en:yes"; $vegetarian = "en:yes"; } else { + $origin = join(",", map {canonicalize_taxonomy_tag($ingredients_lc, "origins", $_)} - split(/,/, $origin_string)); + split(/$commas|$and/, $origin_string)); } } else { + $log->debug( + "parse_ingredients_text - sub-ingredients: origin not explicitly written in: $between" + ) if $log->is_debug(); # origins: Fraise (France) my $originid = canonicalize_taxonomy_tag($ingredients_lc, "origins", $between); @@ -2003,6 +2036,9 @@ sub parse_ingredients_text_service ($product_ref, $updated_product_fields_ref) { $log->debug("between is an origin", {between => $between, origin => $origin}) if $log->is_debug(); $between = ''; + $log->debug( + "parse_ingredients_text - sub-ingredients: between is an origin: $between") + if $log->is_debug(); } # put origins first because the country can be associated with the label "Made in ..." # Skip too short entries (1 or 2 letters) to avoid false positives @@ -2545,6 +2581,10 @@ sub parse_ingredients_text_service ($product_ref, $updated_product_fields_ref) { 'it' => ['^in proporzion[ei] variabil[ei]$',], + 'ja' => [ + 'その他', # etc. + ], + 'nb' => ['^Pakket i beskyttende atmosfære$',], 'nl' => [ diff --git a/taxonomies/additives.txt b/taxonomies/additives.txt index 81b8149566a3f..b9cfd61b46854 100644 --- a/taxonomies/additives.txt +++ b/taxonomies/additives.txt @@ -2045,7 +2045,7 @@ fr:E160a, carotènes mélangés, carotène, γ-Carotène, gamma-carotène, Alpha hr:E160a, karoten, bojilo karoteni, karoteni hu:E160a, Karotinok it:E160a, Carotene, Carotina, Caroteni -ja:E160a, カロチン, カロテン +ja:E160a, カロチン, カロテン, カロテン色素 lt:E160a, Karotinas lv:E160a, E160a food additive mt:E160a, E160a food additive diff --git a/taxonomies/countries.txt b/taxonomies/countries.txt index 1e03d5afd8e97..d15e72dfc1ef4 100755 --- a/taxonomies/countries.txt +++ b/taxonomies/countries.txt @@ -21480,7 +21480,7 @@ io:Japonia is:Japan it:Giappone iu:ᓃᑉᐊᓐ -ja:日本, 日本国 +ja:日本, 日本国, 国, 国内, 国産 jbo:pongu'e jv:Jepang ka:იაპონია diff --git a/taxonomies/ingredients.txt b/taxonomies/ingredients.txt index a4c199fd5a387..d8fe41329a4ae 100644 --- a/taxonomies/ingredients.txt +++ b/taxonomies/ingredients.txt @@ -143,6 +143,7 @@ wikidata:en:Q421576 wikipedia:en:https://en.wikipedia.org/wiki/Enilconazole # ingredient/fr:imazalil has 23 products @2019-05-29 +en:frying fr:friture # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # @@ -331,7 +332,7 @@ hy:Կարոտին id:Karotena io:Karotino it:Carotene -ja:カロテン +ja:カロテン, カロテン色素 kk:Каротин ko:카로틴 lt:karotinas @@ -10480,6 +10481,7 @@ fr:bœuf, boeuf, Boeufs hr:goveđa, govedi, govedina hu:marha it:manzo +ja:ビーフ nb:storfekjøtt pl:Wołowina, wołowe, wołowa, wołowy pt:Carne bovina @@ -13021,6 +13023,7 @@ bg:свински бульон de:Schweinefleischbrühe es:caldo de cerdo fr:bouillon de porc +ja:ポークブイヨン # "en", + ingredients_text => "Tomatoes (France and Italy)", + } + ], + # Origins : French - X from Y [ "fr-origin-ingredient-origin-and-origin", @@ -506,6 +514,14 @@ Origin of peaches: Spain. Origin of some unknown ingredient: France. origin of A } ], + [ + "fr-origin-and", + { + lc => "fr", + ingredients_text => "Pomme de Terre (France et Italie)", + } + ], + [ "en-vitamin", { @@ -530,6 +546,33 @@ Origin of peaches: Spain. Origin of some unknown ingredient: France. origin of A ingredients_text => "砂糖、小麦粉、全粉乳、カカオマス、ショートニング、植物油脂、ココアバター、小麦全粒粉、小麦ふすま、食塩、小麦胚芽 / 加工デンプン、乳化剤(大豆由来)、膨脹剤、香料", } ], + # origins + [ + "ja-origins", + { + lc => "ja", + ingredients_text => "塩(国産), +クレームフレーシュ(国内製造), +肉(オーストラリア), +オリーブ油(ブラジル産、エチオピア産), +白ワインビネガー(オーストラリア又はフィンランド又はその他), +麦芽(国内製造又は韓国製造), +糖類(外国製造又は国内製造), +ココア(輸入又は国産 (5%未満)), +えだまめ(北海道産). +パンの実(三陸産), +クレメンタイン(九州産)" + } + ], + + [ + "ja-origin-and", + { + lc => "ja", + ingredients_text => "トマト(ときがわ町])", + } + ], + # U+00B7 "·" (Middle Dot) is a character found in ingredient forsome countries (Catalan) [ "ca-middle-dot",