From 782434ee6f06cc325421328b24d2925f6f8918d7 Mon Sep 17 00:00:00 2001
From: benbenben2 <110821832+benbenben2@users.noreply.github.com>
Date: Wed, 8 Jan 2025 17:50:24 +0100
Subject: [PATCH 1/6] partial devs
---
lib/ProductOpener/FoodProducts.pm | 2 +
lib/ProductOpener/TaxonomiesEnhancer.pm | 105 ++++++++++++++++++++++++
tests/unit/taxonomies_enhancer.t | 45 ++++++++++
3 files changed, 152 insertions(+)
create mode 100644 lib/ProductOpener/TaxonomiesEnhancer.pm
create mode 100644 tests/unit/taxonomies_enhancer.t
diff --git a/lib/ProductOpener/FoodProducts.pm b/lib/ProductOpener/FoodProducts.pm
index ce57fec8f911d..775fa0d6c8f75 100644
--- a/lib/ProductOpener/FoodProducts.pm
+++ b/lib/ProductOpener/FoodProducts.pm
@@ -58,6 +58,7 @@ use ProductOpener::FoodGroups qw/compute_food_groups/;
use ProductOpener::Nutriscore qw/:all/;
use ProductOpener::EnvironmentalScore qw/compute_environmental_score/;
use ProductOpener::ForestFootprint qw/compute_forest_footprint/;
+use ProductOpener::TaxonomiesEnhancer qw/detect_taxonomy_translation_from_text/;
use Log::Any qw($log);
@@ -83,6 +84,7 @@ sub specific_processes_for_food_product ($product_ref) {
extract_ingredients_from_text($product_ref);
extract_additives_from_text($product_ref);
detect_allergens_from_text($product_ref);
+ detect_taxonomy_translation_from_text($product_ref);
# Category analysis
# Food category rules for sweetened/sugared beverages
diff --git a/lib/ProductOpener/TaxonomiesEnhancer.pm b/lib/ProductOpener/TaxonomiesEnhancer.pm
new file mode 100644
index 0000000000000..abcdae7b6a92f
--- /dev/null
+++ b/lib/ProductOpener/TaxonomiesEnhancer.pm
@@ -0,0 +1,105 @@
+# This file is part of Product Opener.
+#
+# Product Opener
+# Copyright (C) 2011-2023 Association Open Food Facts
+# Contact: contact@openfoodfacts.org
+# Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France
+#
+# Product Opener is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+=encoding UTF-8
+
+=head1 NAME
+
+ProductOpener::TaxonomiesEnhancer - analyze ingredients and other fields to enrich the taxonomies
+
+=head1 SYNOPSIS
+
+C<ProductOpener::TaxonomiesEnhancer> analyzes ingredients and other fields
+to enrich the taxonomies.
+
+ use ProductOpener::TaxonomiesEnhancer qw/:all/;
+
+ [..]
+
+ detect_taxonomy_translation_from_text($product_ref);
+
+=head1 DESCRIPTION
+
+[..]
+
+=cut
+
+package ProductOpener::TaxonomiesEnhancer;
+
+use ProductOpener::PerlStandards;
+use Exporter qw< import >;
+
+BEGIN {
+ use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS);
+ @EXPORT_OK = qw(
+
+ &detect_taxonomy_translation_from_text
+ ); # symbols to export on request
+ %EXPORT_TAGS = (all => [@EXPORT_OK]);
+}
+
+use vars @EXPORT_OK;
+# use experimental 'smartmatch';
+# use Encode;
+# use Clone qw(clone);
+# use LWP::UserAgent;
+# use Encode;
+# use JSON::MaybeXS;
+use Log::Any qw($log);
+# use List::MoreUtils qw(uniq);
+# use Data::DeepAccess qw(deep_get deep_exists);
+
+# use ProductOpener::Store qw/get_string_id_for_lang unac_string_perl/;
+# use ProductOpener::Config qw/:all/;
+# use ProductOpener::Users qw/:all/;
+# use ProductOpener::Tags qw/:all/;
+# use ProductOpener::Products qw/remove_fields/;
+# use ProductOpener::URL qw/:all/;
+# use ProductOpener::Images qw/extract_text_from_image/;
+# use ProductOpener::Lang qw/$lc %Lang lang/;
+# use ProductOpener::Units qw/normalize_quantity/;
+# use ProductOpener::Food qw/is_fat_oil_nuts_seeds_for_nutrition_score/;
+use ProductOpener::Ingredients qw/parse_ingredients_text_service/;
+
+
+
+=head2 detect_taxonomy_translation_from_text ( product_ref )
+
+This function extracts data for each language from the provided product reference.
+It then detects failed extractions (missing stop words) and identifies missing translations.
+
+=head3 Arguments
+
+=head4 product_ref
+
+A reference to the product data, which is expected to be a hash reference containing the necessary information.
+
+=head3 Return value
+
+This function does not return any value. It performs the extraction and detection internally.
+
+=cut
+
+sub detect_taxonomy_translation_from_text ($product_ref) {    # placeholder: only traces the call, $product_ref is not read yet
+ $log->debug("detect_taxonomy_translation_from_text - start") if $log->is_debug();
+ print STDERR "detect_taxonomy_translation_from_text - start\n";    # NOTE(review): unconditional STDERR trace, looks like leftover debugging - confirm
+}
+
+1;
diff --git a/tests/unit/taxonomies_enhancer.t b/tests/unit/taxonomies_enhancer.t
new file mode 100644
index 0000000000000..23250ebb41f6f
--- /dev/null
+++ b/tests/unit/taxonomies_enhancer.t
@@ -0,0 +1,45 @@
+#!/usr/bin/perl -w
+
+use Modern::Perl '2017';
+use utf8;
+
+use Test2::V0;
+use Log::Any::Adapter 'TAP';
+
+use ProductOpener::TaxonomiesEnhancer qw/detect_taxonomy_translation_from_text/;
+
+
+# example based on 0036595328366
+# should detect stopwords
+my $product_ref = {
+ ingredients_text_cs => "69% pšeničná mouka , pitná voda, řepkový olej , stabilizátor: glycerol; pšeničný lepek , regulátor kyselosti : kyselina jablečná; jedlá sůl , emulgátor : mono - a diglyceridy mastných kyselin ; dextróza , kypřící látka : uhličitany sodné ; konzervanty : propionan vápenatý , sorban draselný ; látka zlepšující mouku : L-cystein. Skladujte v suchu a chraňte před teplem.",
+ ingredients_text_hr => "69% pšenično brašno, voda , repičino ulje , stabilizator. glicerol; pšenični gluten, regulator kiselosti : jabučna kiselina ; kuhinjska sol , emulgator : mono - i digliceridi masnih kiselina ; dekstroza, tvar za rahljenje : natrijevi karbonati; konzervansi : kalcijev propionat , kalijev sorbat ; tvar za tretiranje brašna : L-cistein. Čuvati na suhom mjestu.",
+ ingredients_text_hu => "69% búzaliszt , ivóvíz , repceolaj , stabilizátor: glicerin; búzaglutén , savanyúságot szabályozó anyag : almasav ; étkezési só , emulgeálószer: zsírsavak mono - és digliceridjei ; dextróz , térfogatnövelő szer : nátrium-karbonátok ; tartósítószerek : kalcium-propionát , kálium-szorbát ; lisztkezelő szer : L-Cisztein.",
+ ingredients_text_pl => "69% mąka pszenna , woda , olej rzepakowy , stabilizator: glicerol; gluten pszenny , regulator kwasowości : kwas jabłkowy ; sól , emuglator : mono - i diglicerydy kwasów tłuszczowych; glukoza , substancja spulchniająca: węglany sodu ; substancje konserwujące: propionian wapnia , sorbinian potasu ; środek do przetwarzania mąki: L-cysteina.",
+ ingredients_text_ro => "69% făină de grâu , apă , ulei de rapiță , stabilizator: glicerol; gluten din grâu, corector de aciditate : acid malic ; sare , emulsifiant : mono - şi digliceride ale acizilor graşi; dextroză , agent de afanare : carbonați de sodiu ; conservanți : propionat de calciu, sorbat de potasiu; agent de tratare a făinii : L-cisteină.",
+ ingredients_text_sk => "69% pšeničná múka , pitná voda, repkový olej , stabilizátor: glycerol; pšeničný glutén, regulátor kyslosti : kyselina jablčná ; jedlá soľ , emulgátor : mono - a diglyceridy mastných kyselín ; dextróza , kypriaca látka : uhličitany sodné ; konzervačné látky : propionan vápenatý , sorban draselný ; múku upravujúca látka : L-cystein.",
+ ingredients_text_sl => "69% pšenična moka , voda , olje oljne ogrščice, stabilizator: glicerol; pšenični gluten, sredstvo za uravnavanje kislosti: jabolčna kislina ; nejodirana sol, emulgator : mono - in diglicerid! maščobnih kislin ; dekstroza, sredstvo za vzhajanje : natrijevi karbonati; konzervansa : kalcijev propionat , kalijev sorbat ; sredstvo za obdelavo moke : L-cistein. Uporabno najmanj do: glej odtis na zadnji strani embalaže.",
+};
+detect_taxonomy_translation_from_text($product_ref);
+
+# example based on 20201845
+# should suggest translations
+# problem with english: some app translated in english from other languages. NOT producer translation.
+# for example: App translation (infood) probably based on RO:
+# ingredients_text_en => "water, wine vinegar, mustard seeds, [mustard husks], table salt, [acidifying]: citric acid, [natural flavors of cloves], cinnamon, ginger and tarragon, antioxidant: potassium metabisulphite, spice mixture",
+# versus Producer translation:
+# ingredients_text_en => "water, spirit vinegar, mustard seeds, husks of mustard seeds, salt, acidity regulator: citric acid, natural flavorings, antioxidant: potassium metabisulphite, turmeric",
+# in square brackets are unknown ingredients on the product
+# ingredients_text_es => "Agua, vinagre de alcohol, 24,5% semillas de mostaza, [cáscara de semillas de mostaza], sal, acidulante: [ácido citico]; aromas, antioxidante: metabisulfito potásico; especia.",
+# ingredients_text_hr => "Voda, alkoholni ocat, 24,5% sjemenke gorušice, [7,5% ljuske gorušice], kuhinjska sol, kiselina: limunska kiselina; arome, antioksidans: kalijev metabisulfit; začin.",
+# ingredients_text_ro => "apă, oțet din vin, [semințe de muştar], [coji de muştar], sare de masă, acidifiant: acid citric, [arome naturale de cuişoare], scorțișoară, ghimbir și tarhon, antioxidant: metabisulfit de potasiu, amestec de condimente.",
+# RO has more ingredients
+# ES has a typo ácido citico -> Ácido cítrico
+# Reuse the variable declared for the first example: declaring it again with "my"
+# in the same scope raises a "masks earlier declaration" warning under -w.
+$product_ref = {
+	ingredients_text_es => "Agua, vinagre de alcohol, 24,5% semillas de mostaza, [cáscara de semillas de mostaza], sal, acidulante: [ácido citico]; aromas, antioxidante: metabisulfito potásico; especia.",
+	ingredients_text_hr => "Voda, alkoholni ocat, 24,5% sjemenke gorušice, [7,5% ljuske gorušice], kuhinjska sol, kiselina: limunska kiselina; arome, antioksidans: kalijev metabisulfit; začin.",
+	ingredients_text_ro => "apă, oțet din vin, [semințe de muştar], [coji de muştar], sare de masă, acidifiant: acid citric, [arome naturale de cuişoare], scorțișoară, ghimbir și tarhon, antioxidant: metabisulfit de potasiu, amestec de condimente.",
+};
+
+
+done_testing();
From 64e3a77402fb8a0c32d7599cdbef8f1c06617b4e Mon Sep 17 00:00:00 2001
From: benbenben2 <110821832+benbenben2@users.noreply.github.com>
Date: Sat, 18 Jan 2025 21:52:11 +0100
Subject: [PATCH 2/6] taxonomy-translation-enhancer first draft
---
cpanfile | 3 +
lib/ProductOpener/FoodProducts.pm | 2 -
lib/ProductOpener/Products.pm | 5 +
lib/ProductOpener/TaxonomiesEnhancer.pm | 751 +++++++++++++++++++++++-
tests/unit/taxonomies_enhancer.t | 576 +++++++++++++++++-
5 files changed, 1274 insertions(+), 63 deletions(-)
diff --git a/cpanfile b/cpanfile
index 5d070bdefae10..3526b2e69144b 100644
--- a/cpanfile
+++ b/cpanfile
@@ -110,6 +110,9 @@ requires 'Module::Load';
# To measure the time taken by requests
requires 'Time::Monotonic';
+# To measure similarity between words and find possible typo
+requires 'Text::Levenshtein';
+
on 'test' => sub {
requires 'Test2::V0';
requires 'Mock::Quick';
diff --git a/lib/ProductOpener/FoodProducts.pm b/lib/ProductOpener/FoodProducts.pm
index 775fa0d6c8f75..ce57fec8f911d 100644
--- a/lib/ProductOpener/FoodProducts.pm
+++ b/lib/ProductOpener/FoodProducts.pm
@@ -58,7 +58,6 @@ use ProductOpener::FoodGroups qw/compute_food_groups/;
use ProductOpener::Nutriscore qw/:all/;
use ProductOpener::EnvironmentalScore qw/compute_environmental_score/;
use ProductOpener::ForestFootprint qw/compute_forest_footprint/;
-use ProductOpener::TaxonomiesEnhancer qw/detect_taxonomy_translation_from_text/;
use Log::Any qw($log);
@@ -84,7 +83,6 @@ sub specific_processes_for_food_product ($product_ref) {
extract_ingredients_from_text($product_ref);
extract_additives_from_text($product_ref);
detect_allergens_from_text($product_ref);
- detect_taxonomy_translation_from_text($product_ref);
# Category analysis
# Food category rules for sweetened/sugared beverages
diff --git a/lib/ProductOpener/Products.pm b/lib/ProductOpener/Products.pm
index f441518b62509..e7efb99b54474 100644
--- a/lib/ProductOpener/Products.pm
+++ b/lib/ProductOpener/Products.pm
@@ -146,6 +146,7 @@ use ProductOpener::Units qw/normalize_product_quantity_and_serving_size/;
# may be moved to another module at some point
use ProductOpener::Packaging qw/analyze_and_combine_packaging_data/;
use ProductOpener::DataQuality qw/check_quality/;
+use ProductOpener::TaxonomiesEnhancer qw/check_ingredients_between_languages/;
# Specific to the product type
use ProductOpener::FoodProducts qw/specific_processes_for_food_product/;
@@ -3692,6 +3693,10 @@ sub analyze_and_enrich_product_data ($product_ref, $response_ref) {
ProductOpener::DataQuality::check_quality($product_ref);
+ if (defined $taxonomy_fields{'ingredients'}) {
+ check_ingredients_between_languages($product_ref);
+ }
+
# Sort misc_tags in order to have a consistent order
if (defined $product_ref->{misc_tags}) {
$product_ref->{misc_tags} = [sort @{$product_ref->{misc_tags}}];
diff --git a/lib/ProductOpener/TaxonomiesEnhancer.pm b/lib/ProductOpener/TaxonomiesEnhancer.pm
index abcdae7b6a92f..1979b200088b5 100644
--- a/lib/ProductOpener/TaxonomiesEnhancer.pm
+++ b/lib/ProductOpener/TaxonomiesEnhancer.pm
@@ -33,7 +33,7 @@ analyze ingredients and other fields to enrich the taxonomies
[..]
- detect_taxonomy_translation_from_text($product_ref);
+ check_ingredients_between_languages($product_ref);
=head1 DESCRIPTION
@@ -43,44 +43,628 @@ analyze ingredients and other fields to enrich the taxonomies
package ProductOpener::TaxonomiesEnhancer;
-use ProductOpener::PerlStandards;
-use Exporter qw< import >;
-
BEGIN {
use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS);
@EXPORT_OK = qw(
-
- &detect_taxonomy_translation_from_text
- ); # symbols to export on request
+ &check_ingredients_between_languages
+ );
%EXPORT_TAGS = (all => [@EXPORT_OK]);
}
+use Exporter qw< import >;
+use List::Util qw(any);
+use Log::Log4perl qw(get_logger);
+use Text::Levenshtein qw(distance);
use vars @EXPORT_OK;
-# use experimental 'smartmatch';
-# use Encode;
-# use Clone qw(clone);
-# use LWP::UserAgent;
-# use Encode;
-# use JSON::MaybeXS;
-use Log::Any qw($log);
-# use List::MoreUtils qw(uniq);
-# use Data::DeepAccess qw(deep_get deep_exists);
-
-# use ProductOpener::Store qw/get_string_id_for_lang unac_string_perl/;
-# use ProductOpener::Config qw/:all/;
-# use ProductOpener::Users qw/:all/;
-# use ProductOpener::Tags qw/:all/;
-# use ProductOpener::Products qw/remove_fields/;
-# use ProductOpener::URL qw/:all/;
-# use ProductOpener::Images qw/extract_text_from_image/;
-# use ProductOpener::Lang qw/$lc %Lang lang/;
-# use ProductOpener::Units qw/normalize_quantity/;
-# use ProductOpener::Food qw/is_fat_oil_nuts_seeds_for_nutrition_score/;
+
use ProductOpener::Ingredients qw/parse_ingredients_text_service/;
+use ProductOpener::PerlStandards;
+use ProductOpener::Tags qw/add_tag get_taxonomy_tag_synonyms is_a/;
+
+# Configure Log4perl when this module is loaded: root logger at INFO, printed to STDERR. NOTE(review): init() presumably replaces any Log4perl setup done by the host application - confirm
+Log::Log4perl->init(\<<'EOL');
+# log4perl.logger = DEBUG, Screen
+log4perl.logger = INFO, Screen
+log4perl.appender.Screen = Log::Log4perl::Appender::Screen
+log4perl.appender.Screen.stderr = 1
+log4perl.appender.Screen.layout = Log::Log4perl::Layout::PatternLayout
+log4perl.appender.Screen.layout.ConversionPattern = %d %p %m %n
+EOL
+
+my $log = get_logger();    # file-scoped logger used by the subs of this module
+
+=head2 flatten_ingredients ( ingredients )
+
+Turns a tree of parsed ingredients into a flat list, in depth-first order:
+each ingredient is immediately followed by its (recursively flattened) sub-ingredients.
+
+Underscores are stripped from the C<id> and C<text> of every visited ingredient;
+this is done in place, on the structure that was passed in.
+
+=head3 Arguments
+
+=head4 ingredients
+
+Array reference of ingredient hash references. An ingredient may carry an
+C<ingredients> key holding a nested array reference of sub-ingredients.
+
+=head3 Return value
+
+A list of new hash references, each holding only the C<id>, C<is_in_taxonomy>
+and C<text> of one ingredient.
+
+=cut
+
+sub flatten_ingredients {
+	my ($ingredients) = @_;
+	my @flattened;
+
+	foreach my $node (@{$ingredients}) {
+		# Strip the underscores directly on the node that was passed in
+		foreach my $field (qw(id text)) {
+			$node->{$field} =~ s/_//g;
+		}
+
+		# Keep only the three fields the comparison code needs
+		my %entry = map {($_ => $node->{$_})} qw(id is_in_taxonomy text);
+		push @flattened, \%entry;
+
+		# Leaf ingredient: nothing to descend into
+		next unless exists $node->{ingredients};
+
+		push @flattened, flatten_ingredients($node->{ingredients});
+	}
+
+	return @flattened;
+}
+
+=head2 parse_ingredients_for_language ( ingredients_hash, key )
+
+Parses the ingredient text of one language and stores the flattened result
+back into the same hash.
+
+=head3 Arguments
+
+=head4 ingredients_hash
+
+A hash reference holding the ingredient texts, keyed by
+C<ingredients_text_LANG> (e.g. "ingredients_text_cs").
+This is a working copy; it is modified by this function.
+
+=head4 key
+
+The language-specific key to process (e.g. "ingredients_text_cs").
+
+=head3 Return value
+
+None. On return, the flattened ingredient list (an array reference) is stored
+under the bare language code (e.g. "cs"), and the text entry for that language
+has been removed, together with the temporary C<ingredients>, C<ingredients_lc>
+and C<ingredients_text> keys used to call the parser.
+
+=cut
+
+sub parse_ingredients_for_language {
+	my ($ingredients_hash, $key) = @_;
+
+	# Extract language code from key (for example, 'ingredients_text_cs' -> 'cs')
+	my $lang = ($key =~ s/^ingredients_text_//r);
+	my $text_key = "ingredients_text_" . $lang;
+
+	# The parser reads the generic ingredients_lc / ingredients_text fields,
+	# so expose the requested language through them (lower-cased).
+	# A missing text is parsed as an empty string instead of triggering an "uninitialized" warning.
+	$ingredients_hash->{"ingredients_lc"} = $lang;
+	$ingredients_hash->{"ingredients_text"} = lc($ingredients_hash->{$text_key} // '');
+
+	parse_ingredients_text_service($ingredients_hash, {}, []);
+
+	# For simplicity, flatten the parsed ingredient list (from sub list to single level list)
+	my @flat_ingredients = flatten_ingredients($ingredients_hash->{"ingredients"});
+	$ingredients_hash->{$lang} = \@flat_ingredients;
+
+	# Drop the temporary parser fields and the raw text; deleting an absent key is a no-op
+	delete @{$ingredients_hash}{"ingredients", "ingredients_lc", "ingredients_text", $text_key};
+
+	return;
+}
+
+=head2 not_enough_known_ingredients ( ingredients1, ingredients2 )
+
+Tells whether two ingredient lists share too few ingredient ids to be worth comparing.
+
+=head3 Arguments
+
+=head4 ingredients1
+
+An array reference of ingredients in the first language (reference).
+
+=head4 ingredients2
+
+An array reference of ingredients in the second language (to analyze).
+
+=head3 Return value
+
+Returns 1 when either list is empty, or when the reference list has more than one
+ingredient and fewer than half of its ids are also found in the second list.
+Returns 0 otherwise.
+
+=cut
+
+sub not_enough_known_ingredients {
+	my ($ingredients1, $ingredients2) = @_;
+
+	unless (@{$ingredients1} && @{$ingredients2}) {
+		$log->debug(
+			"check_ingredients_between_languages > not_enough_known_ingredients - one of the ingredients list is empty"
+		) if $log->is_debug();
+		return 1;
+	}
+
+	# A single-ingredient reference is never rejected by the threshold below
+	return 0 if @{$ingredients1} <= 1;
+
+	my $min_known_percentage = 0.5;
+
+	# Number of reference ingredients whose id also appears in the analyzed list
+	my $known_count = grep {
+		my $reference_id = $_->{id};
+		any {$_->{id} eq $reference_id} @{$ingredients2}
+	} @{$ingredients1};
+
+	# The reference list is known to be non-empty at this point
+	my $known_percentage = $known_count / @{$ingredients1};
+	return 0 unless $known_percentage < $min_known_percentage;
+
+	$log->debug(
+		"check_ingredients_between_languages > not_enough_known_ingredients - too much unknown ingredient between ingredients1 and ingredients2"
+	) if $log->is_debug();
+	return 1;
+}
+
+=head2 detect_missing_stop_words_before_list ( ingredients1, ingredients2, lang1, lang2, missing_stop_words_before )
+
+Looks for text that was parsed as an (unknown) ingredient in front of the real
+ingredient list of the analyzed language, which suggests a missing stop word.
+
+The check only runs when the first ingredient of the reference list is known and
+the first ingredient of the analyzed list is unknown. The analyzed list is then
+scanned for the id of the first reference ingredient; the entry just before that
+match is the candidate stop word.
+
+=head3 Arguments
+
+=head4 ingredients1
+
+An array reference of ingredients in the first language (reference).
+
+=head4 ingredients2
+
+An array reference of ingredients in the second language (to analyze).
+
+=head4 lang1
+
+A string representing the language code for the first language (used for logging only).
+
+=head4 lang2
+
+A string representing the language code for the second language.
+
+=head4 missing_stop_words_before
+
+A hash reference, keyed by language code, receiving the id of the candidate stop word.
+Only the first candidate found for a given language is kept.
+
+=head3 Return value
+
+None. The result is stored in the C<missing_stop_words_before> hash reference.
+
+=cut
+
+sub detect_missing_stop_words_before_list {
+	my ($ingredients1, $ingredients2, $lang1, $lang2, $missing_stop_words_before) = @_;
+
+	$log->debug(
+		"check_ingredients_between_languages > detect_missing_stop_words_before_list - start, lang1: $lang1, lang2: $lang2"
+	) if $log->is_debug();
+
+	# Return if first ingredient in ingredients1 is unknown or first ingredient in ingredients2 is known
+	if (!$ingredients1->[0]{is_in_taxonomy} or $ingredients2->[0]{is_in_taxonomy}) {
+		$log->debug(
+			"check_ingredients_between_languages > detect_missing_stop_words_before_list - first ingredient in ingredients1 ($ingredients1->[0]{id}) is unknown (is_in_taxonomy => $ingredients1->[0]{is_in_taxonomy}) or first ingredient in ingredients2 is known (is_in_taxonomy => $ingredients2->[0]{is_in_taxonomy})"
+		) if $log->is_debug();
+		return;
+	}
+
+	# Walk ingredients2 until the first ingredient of ingredients1 is found,
+	# remembering the entry seen just before it
+	my $previous_ingredients_object;
+	foreach my $i (0 .. $#$ingredients2) {
+		$log->debug(
+			"check_ingredients_between_languages > detect_missing_stop_words_before_list - search for first known ingredient in ingredients1: $ingredients2->[$i]{text}"
+		) if $log->is_debug();
+
+		if ($ingredients2->[$i]{id} ne $ingredients1->[0]{id}) {
+			$previous_ingredients_object = $ingredients2->[$i];
+			next;
+		}
+
+		# Match found. Record the preceding entry, but only if there is one (match not at index 0)
+		# and nothing was recorded for this language yet. Entries are keyed by language code,
+		# so the "already recorded" test must use the same key.
+		if (defined $previous_ingredients_object and not exists $missing_stop_words_before->{$lang2}) {
+			$log->debug(
+				"check_ingredients_between_languages > detect_missing_stop_words_before_list - adding stopword before, first time: $previous_ingredients_object->{id}"
+			) if $log->is_debug();
+			$missing_stop_words_before->{$lang2} = $previous_ingredients_object->{id};
+		}
+		last;
+	}
+
+	return;
+}
+
+=head2 get_ingredient_index ( ingredients, ingredient_id )
+
+Gives the position of the first ingredient carrying a given id.
+
+=head3 Arguments
+
+=head4 ingredients
+
+An array reference of ingredients, where each ingredient is a hash reference with an C<id> key.
+
+=head4 ingredient_id
+
+The id to look for (compared as a string).
+
+=head3 Return value
+
+The zero-based index of the first matching ingredient, or -1 when no ingredient has that id.
+
+=cut
+
+sub get_ingredient_index {
+	my ($ingredients, $ingredient_id) = @_;
+
+	for my $position (0 .. $#{$ingredients}) {
+		# First match wins
+		return $position if $ingredients->[$position]{id} eq $ingredient_id;
+	}
+
+	# Not found
+	return -1;
+}
+
+=head2 detect_missing_stop_words_after_list ( ingredients1, ingredients2, lang1, lang2, missing_stop_words_after )
+
+This function detects missing stop words after the last known ingredient in a list of ingredients.
+
+=head3 Arguments
+
+=head4 ingredients1
+
+An array reference of ingredients in the first language (reference).
+
+=head4 ingredients2
+
+An array reference of ingredients in the second language (to analyze).
+
+=head4 lang1
+A string representing the language code for the first language.
+=head4 lang2
-=head2 detect_taxonomy_translation_from_text ( product_ref )
+A string representing the language code for the second language.
+
+=head4 missing_stop_words_after
+
+A hash reference to store the missing stop words after the last known ingredient.
+
+=head3 Return value
+
+This function does not return a value but modifies the `missing_stop_words_after` hash reference to store the missing stop words.
+
+=cut
+
+# Detects a possible missing stop word after the end of the ingredient list of lang2.
+#
+# ingredients1 (lang1) is the reference list, ingredients2 (lang2) the analyzed one.
+# When both lists mostly agree position by position and the analyzed list is longer,
+# the first extra entry of ingredients2 is recorded in $missing_stop_words_after->{$lang2}
+# if it is unknown. When several reference languages propose a candidate,
+# the one appearing earliest in ingredients2 is kept.
+# Nothing is returned; the result is stored in the $missing_stop_words_after hash reference.
+sub detect_missing_stop_words_after_list {
+	my ($ingredients1, $ingredients2, $lang1, $lang2, $missing_stop_words_after) = @_;
+
+	$log->debug(
+		"check_ingredients_between_languages > detect_missing_stop_words_after_list - start, lang1: $lang1, lang2: $lang2"
+	) if $log->is_debug();
+
+	my $reference_count = scalar @{$ingredients1};
+	my $analyzed_count = scalar @{$ingredients2};
+
+	# An empty reference list gives nothing to compare against
+	# (and $ingredients1->[-1] below would not be a valid element)
+	return if $reference_count == 0;
+
+	# Count the positions, present in both lists, where both ingredients are known but differ
+	my $translation_difference_count = 0;
+	my $translation_difference_accepted_percentage = 0.5;
+	my $last_common_index = ($reference_count < $analyzed_count ? $reference_count : $analyzed_count) - 1;
+	for my $i (0 .. $last_common_index) {
+		if (   $ingredients1->[$i]{is_in_taxonomy}
+			&& $ingredients2->[$i]{is_in_taxonomy}
+			&& $ingredients1->[$i]{id} ne $ingredients2->[$i]{id})
+		{
+			$translation_difference_count += 1;
+		}
+	}
+
+	# The ratio is computed once, on the number of reference ingredients, and the same
+	# figure is used for the decision and for the log message
+	my $translation_difference_ratio = $translation_difference_count / $reference_count;
+	if ($translation_difference_ratio > $translation_difference_accepted_percentage) {
+		$log->debug(
+			"check_ingredients_between_languages > detect_missing_stop_words_after_list - too much difference between languages to raise warning. diff/total > tolerance: $translation_difference_count / $reference_count = "
+				. $translation_difference_ratio . " > "
+				. $translation_difference_accepted_percentage)
+			if $log->is_debug();
+		return;
+	}
+
+	# The entry of ingredients2 located right after the length of ingredients1 is a possible
+	# stop word if it is unknown and the last ingredient of ingredients1 is known
+	return unless $analyzed_count > $reference_count;
+	return unless $ingredients1->[-1]{is_in_taxonomy};
+
+	my $unknown_ingredient_object = $ingredients2->[$reference_count];
+	return if $unknown_ingredient_object->{is_in_taxonomy};
+
+	$log->debug(
+		"check_ingredients_between_languages > detect_missing_stop_words_after_list - should push $unknown_ingredient_object->{id}"
+	) if $log->is_debug();
+
+	if (exists $missing_stop_words_after->{$lang2}) {
+		my $index_existing_value = get_ingredient_index($ingredients2, $missing_stop_words_after->{$lang2});
+		my $index_new_value = get_ingredient_index($ingredients2, $unknown_ingredient_object->{id});
+
+		$log->debug(
+			"check_ingredients_between_languages > detect_missing_stop_words_after_list - adding stopword after, not first time: previously: $missing_stop_words_after->{$lang2} (index: $index_existing_value), newly: $unknown_ingredient_object->{id} ($index_new_value), check index"
+		) if $log->is_debug();
+
+		# Keep the candidate that comes first in the analyzed list
+		if ($index_new_value < $index_existing_value) {
+			$missing_stop_words_after->{$lang2} = $unknown_ingredient_object->{id};
+		}
+	}
+	else {
+		$log->debug(
+			"check_ingredients_between_languages > detect_missing_stop_words_after_list - adding stopword after, first time: $unknown_ingredient_object->{id}"
+		) if $log->is_debug();
+		$missing_stop_words_after->{$lang2} = $unknown_ingredient_object->{id};
+	}
+
+	return;
+}
+
+=head2 remove_duplicates ( @array )
+
+Drops repeated elements from a list while keeping the order of first appearance.
+
+=head3 Arguments
+
+=head4 @array
+
+The list of elements to filter. Elements are compared through their string value.
+
+=head3 Return value
+
+The list of distinct elements, each one at the position of its first occurrence.
+
+=cut
+
+sub remove_duplicates {
+	my (@array) = @_;
+
+	my %seen;
+
+	# An element passes the filter only the first time its string value is met
+	return grep {!$seen{$_}++} @array;
+}
+
+=head2 find_smallest_value_key ( hashmap )
+
+Picks the key associated with the lowest numeric value of a hash.
+Ties are broken in favour of the key that sorts first as a string.
+
+=head3 Arguments
+
+=head4 hashmap
+
+A hash reference where the keys are strings and the values are numeric.
+
+=head3 Return value
+
+The selected key, or undef when the hash has no key.
+
+=cut
+
+sub find_smallest_value_key {
+	my ($hashmap) = @_;
+
+	my ($best_key, $best_value);
+
+	foreach my $key (keys %$hashmap) {
+		$log->debug(
+			"check_ingredients_between_languages > find_smallest_value_key - next key: $key. Distance: $hashmap->{$key}"
+		) if $log->is_debug();
+
+		my $candidate_value = $hashmap->{$key};
+
+		# A candidate replaces the current best when it is the first one seen,
+		# strictly smaller, or equal with a key that sorts before the current one
+		my $is_better
+			= !defined $best_value
+			|| $candidate_value < $best_value
+			|| ($candidate_value == $best_value && $key lt $best_key);
+		next unless $is_better;
+
+		($best_key, $best_value) = ($key, $candidate_value);
+	}
+
+	return $best_key;
+}
+
+=head2 get_sorted_strings ( string1, string2 )
+
+Orders a pair of strings using plain string comparison.
+
+=head3 Arguments
+
+=head4 string1
+
+First string of the pair.
+
+=head4 string2
+
+Second string of the pair.
+
+=head3 Return value
+
+A two-element list: the string that sorts first, then the other one.
+
+=cut
+
+sub get_sorted_strings {
+	my ($string1, $string2) = @_;
+
+	# Keep the given order only when string1 strictly sorts before string2
+	return $string1 lt $string2
+		? ($string1, $string2)
+		: ($string2, $string1);
+}
+
+=head2 detect_missing_ingredients ( ingredients1, ingredients2, lang1, lang2, missing_ingredients, ingredients_typo, mismatch_in_taxonomy )
+
+Compares two ingredient lists position by position and reports, for every position
+where the reference ingredient is known:
+
+=over
+
+=item * a missing translation, when the analyzed ingredient is unknown and the reference
+ingredient has no synonym at all in the analyzed language;
+
+=item * a probable typo, when the analyzed ingredient is unknown but is close
+(normalized Levenshtein distance of at most 0.4) to one of the synonyms of the reference ingredient;
+
+=item * a taxonomy mismatch, when both ingredients are known, have different ids,
+and neither is a parent of the other.
+
+=back
+
+Only the positions that exist in both lists are compared.
+
+=head3 Arguments
+
+=head4 ingredients1
+
+An array reference of ingredients in the first language (reference).
+
+=head4 ingredients2
+
+An array reference of ingredients in the second language (to analyze).
+
+=head4 lang1
+
+A string representing the language code for the first language (used for logging only).
+
+=head4 lang2
+
+A string representing the language code for the second language.
+
+=head4 missing_ingredients
+
+A hash reference: id of the unknown ingredient => id of the reference ingredient.
+When several reference ids are proposed for the same unknown ingredient,
+the one that sorts first as a string is kept.
+
+=head4 ingredients_typo
+
+A hash reference: id of the unknown ingredient => "lang2:closest-synonym"
+(lower-cased, whitespace replaced by hyphens). Only the first suggestion found for an id is kept.
+
+=head4 mismatch_in_taxonomy
+
+A hash reference whose key and value are the two "text-id:id" descriptions of the
+mismatching ingredients, in string order, with whitespace replaced by hyphens.
+
+=head3 Return value
+
+None. The results are stored in the C<missing_ingredients>, C<ingredients_typo>
+and C<mismatch_in_taxonomy> hash references.
+
+=cut
+
+sub detect_missing_ingredients {
+	my ($ingredients1, $ingredients2, $lang1, $lang2, $missing_ingredients, $ingredients_typo, $mismatch_in_taxonomy)
+		= @_;
+
+	$log->debug(
+		"check_ingredients_between_languages > detect_missing_ingredients - start, lang1 is $lang1, lang2 is $lang2")
+		if $log->is_debug();
+
+	# Stop at the end of the shorter list: reading $ingredients2->[$i]{...} past its end
+	# would autovivify empty entries into the list (which is shared with the other
+	# language comparisons) and report ingredients that do not exist
+	my $last_common_index = $#{$ingredients1} < $#{$ingredients2} ? $#{$ingredients1} : $#{$ingredients2};
+
+	foreach my $i (0 .. $last_common_index) {
+		my $reference_ingredient = $ingredients1->[$i];
+		my $analyzed_ingredient = $ingredients2->[$i];
+
+		# Nothing can be deduced from a position whose reference ingredient is unknown
+		next unless $reference_ingredient->{is_in_taxonomy};
+
+		if (!$analyzed_ingredient->{is_in_taxonomy}) {
+			$log->debug(
+				"check_ingredients_between_languages > detect_missing_ingredients - $lang1:$reference_ingredient->{text} is in the taxonomy but not $analyzed_ingredient->{id}"
+			) if $log->is_debug();
+
+			my @synonyms = get_taxonomy_tag_synonyms($lang2, "ingredients", $reference_ingredient->{id});
+
+			if (!@synonyms) {
+				# No translation at all in lang2: the unknown ingredient is a candidate translation
+				if (exists $missing_ingredients->{$analyzed_ingredient->{id}}) {
+					my ($id_a, $id_b) = get_sorted_strings($missing_ingredients->{$analyzed_ingredient->{id}},
+						$reference_ingredient->{id});
+
+					$log->debug(
+						"check_ingredients_between_languages > detect_missing_ingredients - adding missing ingredient, additional time: $id_a"
+					) if $log->is_debug();
+					$missing_ingredients->{$analyzed_ingredient->{id}} = $id_a;
+				}
+				else {
+					$log->debug(
+						"check_ingredients_between_languages > detect_missing_ingredients - adding missing ingredient, first time: $analyzed_ingredient->{id}"
+					) if $log->is_debug();
+					$missing_ingredients->{$analyzed_ingredient->{id}} = $reference_ingredient->{id};
+				}
+			}
+			# prevent to divide by zero
+			elsif (length($analyzed_ingredient->{text}) > 0) {
+				my @unique_synonyms = remove_duplicates(@synonyms);
+				$log->debug("check_ingredients_between_languages > detect_missing_ingredients - retrieved "
+						. scalar(@unique_synonyms)
+						. " unique synonyms: "
+						. join(", ", @unique_synonyms))
+					if $log->is_debug();
+
+				# Levenshtein distance for each synonym, normalized by the length of the unknown text
+				# acceptance of 40%, for example in Croatian: secer -> šećer
+				my %synonym_distance;
+				foreach my $synonym (@unique_synonyms) {
+					my $lev_distance = distance($analyzed_ingredient->{text}, $synonym);
+					$synonym_distance{$synonym} = $lev_distance / length($analyzed_ingredient->{text});
+					$log->debug(
+						"check_ingredients_between_languages > detect_missing_ingredients - levenshtein synonyms distance between the ingredient $analyzed_ingredient->{text} and the synonym $synonym is $lev_distance"
+					) if $log->is_debug();
+				}
+
+				my $key_for_smallest_levenshtein_value = find_smallest_value_key(\%synonym_distance);
+				next unless defined $key_for_smallest_levenshtein_value;
+
+				$log->debug(
+					"check_ingredients_between_languages > detect_missing_ingredients - levenshtein synonyms smallest distance: $key_for_smallest_levenshtein_value"
+				) if $log->is_debug();
+
+				my $smallest_levenshtein_value = $synonym_distance{$key_for_smallest_levenshtein_value};
+				next unless $smallest_levenshtein_value <= 0.4;
+
+				$log->debug(
+					"check_ingredients_between_languages > detect_missing_ingredients: the key with the smallest value is '$key_for_smallest_levenshtein_value' and its value is $smallest_levenshtein_value, which is equal to or less than the threshold."
+				) if $log->is_debug();
+
+				# Suggestions are stored by ingredient id, so the "already recorded" test uses the id as well
+				unless (exists $ingredients_typo->{$analyzed_ingredient->{id}}) {
+					$key_for_smallest_levenshtein_value =~ s/\s+/-/g;
+					$log->debug(
+						"check_ingredients_between_languages > detect_missing_ingredients - adding it to ingredients_typo, first time $analyzed_ingredient->{text}"
+					) if $log->is_debug();
+					$ingredients_typo->{$analyzed_ingredient->{id}}
+						= $lang2 . ":" . lc($key_for_smallest_levenshtein_value);
+				}
+			}
+		}
+
+		# Check if both are in the taxonomy but with a different id
+		elsif ($reference_ingredient->{id} ne $analyzed_ingredient->{id}) {
+			$log->debug(
+				"check_ingredients_between_languages > detect_missing_ingredients - different id between ingredients $reference_ingredient->{id} and $analyzed_ingredient->{id}"
+			) if $log->is_debug();
+
+			# Ignore if ids are different but have a child/parent relation
+			next if is_a("ingredients", $reference_ingredient->{id}, $analyzed_ingredient->{id});
+			next if is_a("ingredients", $analyzed_ingredient->{id}, $reference_ingredient->{id});
+
+			$log->debug(
+				"check_ingredients_between_languages > detect_missing_ingredients - different id between ingredients and no relation between them"
+			) if $log->is_debug();
+
+			my $text_and_id1 = $reference_ingredient->{text} . "-id:" . $reference_ingredient->{id};
+			my $text_and_id2 = $analyzed_ingredient->{text} . "-id:" . $analyzed_ingredient->{id};
+
+			# Sorting the pair makes (a, b) and (b, a) land on the same entry
+			my ($text_and_id_a, $text_and_id_b) = get_sorted_strings($text_and_id1, $text_and_id2);
+
+			$text_and_id_a =~ s/\s+/-/g;
+			unless (exists $mismatch_in_taxonomy->{$text_and_id_a}) {
+				$text_and_id_b =~ s/\s+/-/g;
+				$mismatch_in_taxonomy->{$text_and_id_a} = $text_and_id_b;
+			}
+		}
+	}
+
+	return;
+}
+
+=head2 check_ingredients_between_languages ( product_ref )
This function extracts data for each language from the provided product reference.
It then detects failed extractions (missing stop words) and identifies missing translations.
@@ -97,9 +681,114 @@ This function does not return any value. It performs the extraction and detectio
=cut
-sub detect_taxonomy_translation_from_text ($product_ref) {
- $log->debug("detect_taxonomy_translation_from_text - start") if $log->is_debug();
- print STDERR "detect_taxonomy_translation_from_text - start\n";
+sub check_ingredients_between_languages {
+	my ($product_ref) = @_;
+
+	$log->debug("check_ingredients_between_languages - start " . ($product_ref->{code} // '')) if $log->is_debug();
+	# Drop tags left by a previous run; deleting a missing key is a no-op, so no exists() guard is needed
+	delete $product_ref->{"taxonomies_enhancer_tags"};
+
+	# Work on a copy of the ingredients_text_xx fields so that $product_ref itself is not modified
+	my %ingredients_hash;
+	foreach my $key (keys %{$product_ref}) {
+		# ingredients_text_fi yes, ingredients_text_with_allergens_fi no
+		if ($key =~ /^ingredients_text_[a-z]{2}$/) {
+			$ingredients_hash{$key} = $product_ref->{$key};
+			$log->debug("Added key: $key with value: $product_ref->{$key}") if $log->is_debug();
+		}
+	}
+
+	# Parse ingredients for each language; the key list is computed once, before the parser may add entries to the hash
+	foreach my $key (keys %ingredients_hash) {
+		parse_ingredients_for_language(\%ingredients_hash, $key);
+	}
+	# keep only lang code and remove any "allergens", "labels", "labels_lc", "labels_tags", "labels_hierarchy" that might be in the hashmap
+	foreach my $key (keys %ingredients_hash) {
+		delete $ingredients_hash{$key} unless $key =~ /^[a-z]{2}$/;
+	}
+
+	my %missing_stop_words_after;
+	my %missing_stop_words_before;
+	my %missing_ingredients;
+	my %ingredients_typo;
+	my %mismatch_in_taxonomy;
+	# Languages are visited in sorted order: detect_missing_ingredients only records some findings the first time it sees them, so hash order would otherwise leak into the result
+	foreach my $lang1 (sort keys %ingredients_hash) {
+		foreach my $lang2 (sort keys %ingredients_hash) {
+			$log->debug(
+				"check_ingredients_between_languages - next iteration lang1 (ref): $lang1 and lang2 (analyzed): $lang2"
+			) if $log->is_debug();
+
+			# Reminder: ingredients1 is the reference and missing stop words or missing ingredients are searched into ingredients2 only
+			next
+				if $lang1 eq $lang2
+				|| not_enough_known_ingredients($ingredients_hash{$lang1}, $ingredients_hash{$lang2});
+
+			if (@{$ingredients_hash{$lang2}} > @{$ingredients_hash{$lang1}}) {
+				detect_missing_stop_words_before_list(
+					$ingredients_hash{$lang1},
+					$ingredients_hash{$lang2},
+					$lang1, $lang2, \%missing_stop_words_before
+				);
+
+				# If a stop word before has been found, there cannot be a 1 to 1 ingredients mapping with other language, hence, no need to call the function
+				if (!$missing_stop_words_before{$lang2}) {
+					detect_missing_stop_words_after_list(
+						$ingredients_hash{$lang1},
+						$ingredients_hash{$lang2},
+						$lang1, $lang2, \%missing_stop_words_after
+					);
+				}
+
+			}
+
+			if (@{$ingredients_hash{$lang1}} == @{$ingredients_hash{$lang2}}) {
+				detect_missing_ingredients(
+					$ingredients_hash{$lang1},
+					$ingredients_hash{$lang2},
+					$lang1, $lang2, \%missing_ingredients, \%ingredients_typo, \%mismatch_in_taxonomy
+				);
+			}
+		}
+	}
+
+	foreach my $lang (sort keys %missing_stop_words_before) {
+		$log->debug(
+			"check_ingredients_between_languages - detected: en:possible-stop-word-before-$missing_stop_words_before{$lang}"
+		) if $log->is_debug();
+		add_tag($product_ref, "taxonomies_enhancer", "en:possible-stop-word-before-$missing_stop_words_before{$lang}");
+	}
+	foreach my $lang (sort keys %missing_stop_words_after) {
+		$log->debug(
+			"check_ingredients_between_languages - detected: en:possible-stop-word-after-$missing_stop_words_after{$lang}"
+		) if $log->is_debug();
+		add_tag($product_ref, "taxonomies_enhancer", "en:possible-stop-word-after-$missing_stop_words_after{$lang}");
+	}
+	foreach my $new_ingredient_id (sort keys %missing_ingredients) {
+		$log->debug(
+			"check_ingredients_between_languages - detected: en:ingredients-$new_ingredient_id-is-new-translation-for-$missing_ingredients{$new_ingredient_id}"
+		) if $log->is_debug();
+		add_tag($product_ref, "taxonomies_enhancer",
+			"en:ingredients-$new_ingredient_id-is-new-translation-for-$missing_ingredients{$new_ingredient_id}");
+	}
+	foreach my $ingredient_with_typo (sort keys %ingredients_typo) {
+		$log->debug(
+			"check_ingredients_between_languages - detected: en:ingredients-$ingredient_with_typo-is-possible-typo-for-$ingredients_typo{$ingredient_with_typo}"
+		) if $log->is_debug();
+		add_tag($product_ref, "taxonomies_enhancer",
+			"en:ingredients-$ingredient_with_typo-is-possible-typo-for-$ingredients_typo{$ingredient_with_typo}");
+	}
+	# Ignore when two or more discrepancies are found: it is probably an old ingredient list in one lang compared with a new list in another lang, example 8014190017627
+	if (scalar(keys %mismatch_in_taxonomy) < 2) {
+		foreach my $ingredient_id1 (sort keys %mismatch_in_taxonomy) {
+			$log->debug(
+				"check_ingredients_between_languages - detected: en:ingredients-taxonomy-between-$ingredient_id1-and-$mismatch_in_taxonomy{$ingredient_id1}-should-be-same-id"
+			) if $log->is_debug();
+			add_tag($product_ref, "taxonomies_enhancer",
+				"en:ingredients-taxonomy-between-$ingredient_id1-and-$mismatch_in_taxonomy{$ingredient_id1}-should-be-same-id"
+			);
+		}
+	}
}
1;
diff --git a/tests/unit/taxonomies_enhancer.t b/tests/unit/taxonomies_enhancer.t
index 23250ebb41f6f..a86f32132c207 100644
--- a/tests/unit/taxonomies_enhancer.t
+++ b/tests/unit/taxonomies_enhancer.t
@@ -1,45 +1,561 @@
#!/usr/bin/perl -w
-use Modern::Perl '2017';
-use utf8;
+use Data::Dumper;
+$Data::Dumper::Terse = 1; # rm variable name
+$Data::Dumper::Indent = 1;
+$Data::Dumper::Sortkeys = 1;
use Test2::V0;
-use Log::Any::Adapter 'TAP';
-use ProductOpener::TaxonomiesEnhancer qw/detect_taxonomy_translation_from_text/;
+# use Modern::Perl '2017';
+# use utf8;
+# use Log::Any::Adapter 'TAP';
+use ProductOpener::Tags qw/has_tag/;
+use ProductOpener::TaxonomiesEnhancer qw/check_ingredients_between_languages/;
+
+# TESTS
+my $single_ingredient_ref = {ingredients_text_hr => "sredsvo za rahljenje",};    # own lexical: $product_ref is declared with "my" by the next test case
+check_ingredients_between_languages($single_ingredient_ref);
+ok(!exists $single_ingredient_ref->{"taxonomies_enhancer_tags"}, 'single unknown ingredient should be ignored')
+	or diag Dumper $single_ingredient_ref;
-# example based on 0036595328366
-# should detect stopwords
my $product_ref = {
- ingredients_text_cs => "69% pšeničná mouka , pitná voda, řepkový olej , stabilizátor: glycerol; pšeničný lepek , regulátor kyselosti : kyselina jablečná; jedlá sůl , emulgátor : mono - a diglyceridy mastných kyselin ; dextróza , kypřící látka : uhličitany sodné ; konzervanty : propionan vápenatý , sorban draselný ; látka zlepšující mouku : L-cystein. Skladujte v suchu a chraňte před teplem.",
- ingredients_text_hr => "69% pšenično brašno, voda , repičino ulje , stabilizator. glicerol; pšenični gluten, regulator kiselosti : jabučna kiselina ; kuhinjska sol , emulgator : mono - i digliceridi masnih kiselina ; dekstroza, tvar za rahljenje : natrijevi karbonati; konzervansi : kalcijev propionat , kalijev sorbat ; tvar za tretiranje brašna : L-cistein. Čuvati na suhom mjestu.",
- ingredients_text_hu => "69% búzaliszt , ivóvíz , repceolaj , stabilizátor: glicerin; búzaglutén , savanyúságot szabályozó anyag : almasav ; étkezési só , emulgeálószer: zsírsavak mono - és digliceridjei ; dextróz , térfogatnövelő szer : nátrium-karbonátok ; tartósítószerek : kalcium-propionát , kálium-szorbát ; lisztkezelő szer : L-Cisztein.",
- ingredients_text_pl => "69% mąka pszenna , woda , olej rzepakowy , stabilizator: glicerol; gluten pszenny , regulator kwasowości : kwas jabłkowy ; sól , emuglator : mono - i diglicerydy kwasów tłuszczowych; glukoza , substancja spulchniająca: węglany sodu ; substancje konserwujące: propionian wapnia , sorbinian potasu ; środek do przetwarzania mąki: L-cysteina.",
- ingredients_text_ro => "69% făină de grâu , apă , ulei de rapiță , stabilizator: glicerol; gluten din grâu, corector de aciditate : acid malic ; sare , emulsifiant : mono - şi digliceride ale acizilor graşi; dextroză , agent de afanare : carbonați de sodiu ; conservanți : propionat de calciu, sorbat de potasiu; agent de tratare a făinii : L-cisteină.",
- ingredients_text_sk => "69% pšeničná múka , pitná voda, repkový olej , stabilizátor: glycerol; pšeničný glutén, regulátor kyslosti : kyselina jablčná ; jedlá soľ , emulgátor : mono - a diglyceridy mastných kyselín ; dextróza , kypriaca látka : uhličitany sodné ; konzervačné látky : propionan vápenatý , sorban draselný ; múku upravujúca látka : L-cystein.",
- ingredients_text_sl => "69% pšenična moka , voda , olje oljne ogrščice, stabilizator: glicerol; pšenični gluten, sredstvo za uravnavanje kislosti: jabolčna kislina ; nejodirana sol, emulgator : mono - in diglicerid! maščobnih kislin ; dekstroza, sredstvo za vzhajanje : natrijevi karbonati; konzervansa : kalcijev propionat , kalijev sorbat ; sredstvo za obdelavo moke : L-cistein. Uporabno najmanj do: glej odtis na zadnji strani embalaže.",
+ ingredients_text_hu =>
+ "69% búzaliszt, ivóvíz, repceolaj, stabilizátor: glicerin; búzaglutén, savanyúságot szabályozó anyag: almasav; étkezési só, emulgeálószer: zsírsavak mono - és digliceridjei; dextróz, térfogatnövelő szer: nátrium-karbonátok; tartósítószerek: kalcium-propionát, kálium-szorbát; lisztkezelő szer: L-Cisztein.",
+ ingredients_text_it => "",
+};
+check_ingredients_between_languages($product_ref);
+ok(!exists $product_ref->{"taxonomies_enhancer_tags"}, 'empty list should be ignored') or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_hr => "69% pšenično brašno, voda",
+ ingredients_text_hu =>
+ "69% búzaliszt, ivóvíz, repceolaj, stabilizátor: glicerin; búzaglutén, savanyúságot szabályozó anyag: almasav; étkezési só, emulgeálószer: zsírsavak mono - és digliceridjei; dextróz, térfogatnövelő szer: nátrium-karbonátok; tartósítószerek: kalcium-propionát, kálium-szorbát; lisztkezelő szer: L-Cisztein.",
+};
+check_ingredients_between_languages($product_ref);
+ok(!exists $product_ref->{"taxonomies_enhancer_tags"}, 'truncated list should be ignored') or diag Dumper $product_ref;
+
+# TESTS STOP WORDS BEFORE INGREDIENTS LIST
+$product_ref = {
+ ingredients_text_hu =>
+ "69% búzaliszt, ivóvíz, repceolaj, stabilizátor: glicerin; búzaglutén, savanyúságot szabályozó anyag: almasav; étkezési só, emulgeálószer: zsírsavak mono - és digliceridjei; dextróz, térfogatnövelő szer: nátrium-karbonátok; tartósítószerek: kalcium-propionát, kálium-szorbát; lisztkezelő szer: L-Cisztein.",
+ ingredients_text_sk =>
+ "some unknown words for ingredient: 69% pšeničná múka, pitná voda, repkový olej, stabilizátor: glycerol; pšeničný glutén, regulátor kyslosti: kyselina jablčná; jedlá soľ, emulgátor: mono - a diglyceridy mastných kyselín; dextróza, kypriaca látka: uhličitany sodné; konzervačné látky: propionan vápenatý, sorban draselný; múku upravujúca látka: L-cystein.",
+};
+check_ingredients_between_languages($product_ref);
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:possible-stop-word-before-sk:some-unknown-words-for-ingredient"),
+ 'sk has one stop word before')
+ or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_hu =>
+ "69% búzaliszt, ivóvíz, repceolaj, stabilizátor: glicerin; búzaglutén, savanyúságot szabályozó anyag: almasav; étkezési só, emulgeálószer: zsírsavak mono - és digliceridjei; dextróz, térfogatnövelő szer: nátrium-karbonátok; tartósítószerek: kalcium-propionát, kálium-szorbát; lisztkezelő szer: L-Cisztein.",
+ ingredients_text_sk =>
+ "product name or something. some unknown words for ingredient: 69% pšeničná múka, pitná voda, repkový olej, stabilizátor: glycerol; pšeničný glutén, regulátor kyslosti: kyselina jablčná; jedlá soľ, emulgátor: mono - a diglyceridy mastných kyselín; dextróza, kypriaca látka: uhličitany sodné; konzervačné látky: propionan vápenatý, sorban draselný; múku upravujúca látka: L-cystein.",
+};
+check_ingredients_between_languages($product_ref);
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:possible-stop-word-before-sk:some-unknown-words-for-ingredient"),
+ 'sk has one stop word before and only one')
+ or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_hr =>
+ "some unknown words for ingredient in hr: 69% pšenično brašno, voda, repičino ulje, stabilizator. glicerol; pšenični gluten, regulator kiselosti: jabučna kiselina; kuhinjska sol, emulgator: mono - i digliceridi masnih kiselina; dekstroza, tvar za rahljenje: natrijevi karbonati; konzervansi: kalcijev propionat, kalijev sorbat; tvar za tretiranje brašna: L-cistein. Čuvati na suhom mjestu.",
+ ingredients_text_hu =>
+ "69% búzaliszt, ivóvíz, repceolaj, stabilizátor: glicerin; búzaglutén, savanyúságot szabályozó anyag: almasav; étkezési só, emulgeálószer: zsírsavak mono - és digliceridjei; dextróz, térfogatnövelő szer: nátrium-karbonátok; tartósítószerek: kalcium-propionát, kálium-szorbát; lisztkezelő szer: L-Cisztein.",
+ ingredients_text_sk =>
+ "some unknown words for ingredient: 69% pšeničná múka, pitná voda, repkový olej, stabilizátor: glycerol; pšeničný glutén, regulátor kyslosti: kyselina jablčná; jedlá soľ, emulgátor: mono - a diglyceridy mastných kyselín; dextróza, kypriaca látka: uhličitany sodné; konzervačné látky: propionan vápenatý, sorban draselný; múku upravujúca látka: L-cystein.",
+};
+check_ingredients_between_languages($product_ref);
+ok(
+ has_tag(
+ $product_ref, "taxonomies_enhancer",
+ "en:possible-stop-word-before-hr:some-unknown-words-for-ingredient-in-hr"
+ ),
+ 'hr has one stop word before and it is not influenced by other lang stop word before'
+) or diag Dumper $product_ref;
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:possible-stop-word-before-sk:some-unknown-words-for-ingredient"),
+ 'sk has one stop word before and it is not influenced by other lang stop word before')
+ or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_hr =>
+ "69% pšenično brašno, voda, repičino ulje, stabilizator. glicerol; pšenični gluten, regulator kiselosti: jabučna kiselina; kuhinjska sol, emulgator: mono - i digliceridi masnih kiselina; dekstroza, tvar za rahljenje: natrijevi karbonati; konzervansi: kalcijev propionat, kalijev sorbat; tvar za tretiranje brašna: L-cistein. Čuvati na suhom mjestu.",
+ ingredients_text_hu =>
+ "69% búzaliszt, ivóvíz, repceolaj, stabilizátor: glicerin; búzaglutén, savanyúságot szabályozó anyag: almasav; étkezési só, emulgeálószer: zsírsavak mono - és digliceridjei; dextróz, térfogatnövelő szer: nátrium-karbonátok; tartósítószerek: kalcium-propionát, kálium-szorbát; lisztkezelő szer: L-Cisztein.",
+ ingredients_text_sk =>
+ "some unknown words for ingredient: 69% pšeničná múka, pitná voda, repkový olej, stabilizátor: glycerol; pšeničný glutén, regulátor kyslosti: kyselina jablčná; jedlá soľ, emulgátor: mono - a diglyceridy mastných kyselín; dextróza, kypriaca látka: uhličitany sodné; konzervačné látky: propionan vápenatý, sorban draselný; múku upravujúca látka: L-cystein.",
+};
+check_ingredients_between_languages($product_ref);
+ok(
+ has_tag($product_ref, "taxonomies_enhancer", "en:possible-stop-word-before-sk:some-unknown-words-for-ingredient"),
+ 'sk has one stop word before and it is not influenced by having 2 languages without stopwords'
+) or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_hr =>
+ "some unknown words for ingredient in hr: 69% pšenično brašno, voda, repičino ulje, stabilizator. glicerol; pšenični gluten, regulator kiselosti: jabučna kiselina; kuhinjska sol, emulgator: mono - i digliceridi masnih kiselina; dekstroza, tvar za rahljenje: natrijevi karbonati; konzervansi: kalcijev propionat, kalijev sorbat; tvar za tretiranje brašna: L-cistein. Čuvati na suhom mjestu.",
+ ingredients_text_hu =>
+ "69% búzaliszt, ivóvíz, repceolaj, stabilizátor: glicerin; búzaglutén, savanyúságot szabályozó anyag: almasav; étkezési só, emulgeálószer: zsírsavak mono - és digliceridjei; dextróz, térfogatnövelő szer: nátrium-karbonátok; tartósítószerek: kalcium-propionát, kálium-szorbát; lisztkezelő szer: L-Cisztein.",
+ ingredients_text_sk =>
+ "product name or something. some unknown words for ingredient: 69% pšeničná múka, pitná voda, repkový olej, stabilizátor: glycerol; pšeničný glutén, regulátor kyslosti: kyselina jablčná; jedlá soľ, emulgátor: mono - a diglyceridy mastných kyselín; dextróza, kypriaca látka: uhličitany sodné; konzervačné látky: propionan vápenatý, sorban draselný; múku upravujúca látka: L-cystein.",
+};
+check_ingredients_between_languages($product_ref);
+ok(
+ has_tag(
+ $product_ref, "taxonomies_enhancer",
+ "en:possible-stop-word-before-hr:some-unknown-words-for-ingredient-in-hr"
+ ),
+ 'hr has one stop word before and it is not influenced by other lang having a stop word before and a word before that stop word'
+) or diag Dumper $product_ref;
+ok(
+ has_tag($product_ref, "taxonomies_enhancer", "en:possible-stop-word-before-sk:some-unknown-words-for-ingredient"),
+ 'sk has one stop word before and only one and it is not influenced by other lang having a stop word before'
+) or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_hr =>
+ "some unknown words for ingredient in hr: 69% pšenično brašno, voda, repičino ulje, stabilizator. glicerol; pšenični gluten, regulator kiselosti: jabučna kiselina; kuhinjska sol, emulgator: mono - i digliceridi masnih kiselina; dekstroza, tvar za rahljenje: natrijevi karbonati; konzervansi: kalcijev propionat, kalijev sorbat; tvar za tretiranje brašna: L-cistein. Čuvati na suhom mjestu.",
+ ingredients_text_hu =>
+ "69% not-in-taxonomy, ivóvíz, repceolaj, stabilizátor: glicerin; búzaglutén, savanyúságot szabályozó anyag: almasav; étkezési só, emulgeálószer: zsírsavak mono - és digliceridjei; dextróz, térfogatnövelő szer: nátrium-karbonátok; tartósítószerek: kalcium-propionát, kálium-szorbát; lisztkezelő szer: L-Cisztein.",
+};
+check_ingredients_between_languages($product_ref);
+ok(
+ !exists $product_ref->{"taxonomies_enhancer_tags"},
+ 'stopword before but first ingredients is unknown in the reference language, hence it should be ignored + stopword after is not detected because ingredients taken in the order cannot be paired due to the additional stop word before'
+) or diag Dumper $product_ref;
+
+# TESTS STOP WORDS AFTER INGREDIENTS LIST
+$product_ref = {
+ ingredients_text_hu =>
+ "69% búzaliszt, ivóvíz, repceolaj, stabilizátor: glicerin; búzaglutén, savanyúságot szabályozó anyag: almasav; étkezési só, zsírsavak mono - és digliceridjei; dextróz, térfogatnövelő szer: nátrium-karbonátok; tartósítószerek: kalcium-propionát, kálium-szorbát; lisztkezelő szer: L-Cisztein.",
+ ingredients_text_pl =>
+ "69% mąka pszenna, woda, olej rzepakowy, stabilizator: glicerol; gluten pszenny, regulator kwasowości: kwas jabłkowy; sól, mono - i diglicerydy kwasów tłuszczowych; glukoza, substancja spulchniająca: węglany sodu; substancje konserwujące: propionian wapnia, sorbinian potasu ; środek do przetwarzania mąki: L-cysteina.",
+};
+check_ingredients_between_languages($product_ref);
+ok(!exists $product_ref->{"taxonomies_enhancer_tags"}, 'same list length, no missing stop words')
+ or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_hr =>
+ "69% pšenično brašno, voda, repičino ulje, stabilizator. glicerol; pšenični gluten, regulator kiselosti: jabučna kiselina; kuhinjska sol, emulgator: mono - i digliceridi masnih kiselina; dekstroza, tvar za rahljenje: natrijevi karbonati; konzervansi: kalcijev propionat, kalijev sorbat; tvar za tretiranje brašna: L-cistein. Čuvati na suhom mjestu.",
+ ingredients_text_hu =>
+ "69% búzaliszt, ivóvíz, repceolaj, stabilizátor: glicerin; búzaglutén, savanyúságot szabályozó anyag: almasav; étkezési só, emulgeálószer: zsírsavak mono - és digliceridjei; dextróz, térfogatnövelő szer: nátrium-karbonátok; tartósítószerek: kalcium-propionát, kálium-szorbát; lisztkezelő szer: L-Cisztein.",
+};
+check_ingredients_between_languages($product_ref);
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:possible-stop-word-after-hr:čuvati-na-suhom-mjestu"),
+ 'hr has one stop word')
+ or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_hr =>
+ "ingredient-in-hr: 69% pšenično brašno, voda, repičino ulje, stabilizator. glicerol; pšenični gluten, regulator kiselosti: jabučna kiselina; kuhinjska sol, emulgator: mono - i digliceridi masnih kiselina; dekstroza, tvar za rahljenje: natrijevi karbonati; konzervansi: kalcijev propionat, kalijev sorbat; tvar za tretiranje brašna: L-cistein. Čuvati na suhom mjestu.",
+ ingredients_text_hu =>
+ "69% búzaliszt, ivóvíz, repceolaj, stabilizátor: glicerin; búzaglutén, savanyúságot szabályozó anyag: almasav; étkezési só, emulgeálószer: zsírsavak mono - és digliceridjei; dextróz, térfogatnövelő szer: nátrium-karbonátok; tartósítószerek: kalcium-propionát, kálium-szorbát; lisztkezelő szer: L-Cisztein.",
+};
+check_ingredients_between_languages($product_ref);
+ok(
+ has_tag($product_ref, "taxonomies_enhancer", "en:possible-stop-word-before-hr:ingredient-in-hr"),
+ 'if 1 stop word before and 1 stop word after, then only stop word before should be seen'
+) or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_cs =>
+ "69% pšeničná mouka, pitná voda, řepkový olej, stabilizátor: glycerol; pšeničný lepek, regulátor kyselosti: kyselina jablečná; jedlá sůl, emulgátor: mono - a diglyceridy mastných kyselin; dextróza, kypřící látka: uhličitany sodné; konzervanty: propionan vápenatý, sorban draselný; látka zlepšující mouku: L-cystein. Skladujte v suchu a chraňte před teplem.",
+ ingredients_text_hr =>
+ "69% pšenično brašno, voda, repičino ulje, stabilizator. glicerol; pšenični gluten, regulator kiselosti: jabučna kiselina; kuhinjska sol, emulgator: mono - i digliceridi masnih kiselina; dekstroza, tvar za rahljenje: natrijevi karbonati; konzervansi: kalcijev propionat, kalijev sorbat; tvar za tretiranje brašna: L-cistein. Čuvati na suhom mjestu.",
+ ingredients_text_hu =>
+ "69% búzaliszt, ivóvíz, repceolaj, stabilizátor: glicerin; búzaglutén, savanyúságot szabályozó anyag: almasav; étkezési só, emulgeálószer: zsírsavak mono - és digliceridjei; dextróz, térfogatnövelő szer: nátrium-karbonátok; tartósítószerek: kalcium-propionát, kálium-szorbát; lisztkezelő szer: L-Cisztein.",
+ ingredients_text_pl =>
+ "69% mąka pszenna, woda, olej rzepakowy, stabilizator: glicerol; gluten pszenny, regulator kwasowości: kwas jabłkowy; sól, emuglator: mono - i diglicerydy kwasów tłuszczowych; glukoza, substancja spulchniająca: węglany sodu; substancje konserwujące: propionian wapnia, sorbinian potasu ; środek do przetwarzania mąki: L-cysteina.",
+ ingredients_text_ro =>
+ "69% făină de grâu, apă, ulei de rapiță, stabilizator: glicerol; gluten din grâu, corector de aciditate: acid malic; sare, emulsifiant: mono - şi digliceride ale acizilor graşi; dextroză, agent de afanare: carbonați de sodiu ; conservanți: propionat de calciu, sorbat de potasiu; agent de tratare a făinii: L-cisteină.",
+ ingredients_text_sk =>
+ "69% pšeničná múka, pitná voda, repkový olej, stabilizátor: glycerol; pšeničný glutén, regulátor kyslosti: kyselina jablčná; jedlá soľ, emulgátor: mono - a diglyceridy mastných kyselín; dextróza, kypriaca látka: uhličitany sodné; konzervačné látky: propionan vápenatý, sorban draselný; múku upravujúca látka: L-cystein.",
+};
+check_ingredients_between_languages($product_ref);
+ok(
+ has_tag(
+ $product_ref, "taxonomies_enhancer",
+ "en:possible-stop-word-after-cs:skladujte-v-suchu-a-chraňte-před-teplem"
+ ),
+ 'cs has one stop word'
+) or diag Dumper $product_ref;
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:possible-stop-word-after-hr:čuvati-na-suhom-mjestu"),
+ 'hr has one stop word as well')
+ or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_sk =>
+ "69% pšeničná múka, pitná voda, repkový olej, stabilizátor: glycerol; pšeničný glutén, regulátor kyslosti: kyselina jablčná; jedlá soľ, emulgátor: mono - a diglyceridy mastných kyselín; dextróza, kypriaca látka: uhličitany sodné; konzervačné látky: propionan vápenatý, sorban draselný; múku upravujúca látka: L-cystein.",
+ ingredients_text_sl =>
+ "69% pšenična moka, voda, olje oljne ogrščice, stabilizator: glicerol; pšenični gluten, sredstvo za uravnavanje kislosti: jabolčna kislina ; nejodirana sol, emulgator: mono - in diglicerid! maščobnih kislin; dekstroza, sredstvo za vzhajanje: natrijevi karbonati; konzervansa: kalcijev propionat , kalijev sorbat; sredstvo za obdelavo moke: L-cistein. Uporabno najmanj do: glej odtis na zadnji strani embalaže.",
+};
+check_ingredients_between_languages($product_ref);
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:possible-stop-word-after-sl:uporabno-najmanj-do"),
+ 'sl has one stop word')
+ or diag Dumper $product_ref;
+ok(
+ !has_tag(
+ $product_ref, "taxonomies_enhancer", "en:possible-stop-word-after-sl:glej-odtis-na-zadnji-strani-embalaže"
+ ),
+ 'sl has only one stop word, second word is not reported'
+) or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_sk =>
+ "69% pšeničná múka, pitná voda, repkový olej, stabilizátor: glycerol; pšeničný glutén, regulátor kyslosti: kyselina jablčná; jedlá soľ, emulgátor: mono - a diglyceridy mastných kyselín; dextróza, kypriaca látka: uhličitany sodné; konzervačné látky: propionan vápenatý, sorban draselný; múku upravujúca látka: L-cystein.",
+ ingredients_text_hr =>
+ "69% pšenično brašno, voda, repičino ulje, stabilizator. glicerol; pšenični gluten, regulator kiselosti: jabučna kiselina; kuhinjska sol, emulgator: mono - i digliceridi masnih kiselina; dekstroza, tvar za rahljenje: natrijevi karbonati; konzervansi: kalcijev propionat, kalijev sorbat; tvar za tretiranje brašna: L-cistein. Čuvati na suhom mjestu.",
+ ingredients_text_sl =>
+ "69% pšenična moka, voda, olje oljne ogrščice, stabilizator: glicerol; pšenični gluten, sredstvo za uravnavanje kislosti: jabolčna kislina ; nejodirana sol, emulgator: mono - in diglicerid! maščobnih kislin; dekstroza, sredstvo za vzhajanje: natrijevi karbonati; konzervansa: kalcijev propionat , kalijev sorbat; sredstvo za obdelavo moke: L-cistein. Uporabno najmanj do: glej odtis na zadnji strani embalaže.",
+};
+check_ingredients_between_languages($product_ref);
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:possible-stop-word-after-hr:čuvati-na-suhom-mjestu"),
+ 'hr is one of the 2 stop words with sl')
+ or diag Dumper $product_ref;
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:possible-stop-word-after-sl:uporabno-najmanj-do"),
+ 'sl is one of the 2 stop words with hr')
+ or diag Dumper $product_ref;
+ok(
+ !has_tag(
+ $product_ref, "taxonomies_enhancer", "en:possible-stop-word-after-sl:glej-odtis-na-zadnji-strani-embalaže"
+ ),
+ 'sl has only one stop word with hr, second word for sl is not reported'
+) or diag Dumper $product_ref;
+
+# TEST UNKNOWN INGREDIENTS
+$product_ref = {
+ ingredients_text_hr => "Secer.",
+ ingredients_text_en => "Sugar",
+};
+check_ingredients_between_languages($product_ref);
+ok(
+ has_tag($product_ref, "taxonomies_enhancer", "en:ingredients-hr:secer-is-possible-typo-for-hr:šećer"),
+ 'typo should be fetched if both language are having single word although percentage of unknown ingredients is below the threshold'
+) or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_hr => "Sol, secer, jagoda.",
+ ingredients_text_en => "Salt, sugar, strawberry.",
+};
+check_ingredients_between_languages($product_ref);
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:ingredients-hr:secer-is-possible-typo-for-hr:šećer"),
+ 'typo should be fetched')
+ or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_hr => "Sol, jaggery, jagoda.",
+ ingredients_text_en => "Salt, jaggery, strawberry.",
+};
+check_ingredients_between_languages($product_ref);
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:ingredients-hr:jaggery-is-new-translation-for-en:jaggery"),
+ 'suggest Croatian translation for existing ingredient in English')
+ or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_en => "Salt, sugar, strawberry.",
+ ingredients_text_hr => "Sol, secer, jagoda.",
+ ingredients_text_pl => "sól, cukier, truskawka.",
};
-detect_taxonomy_translation_from_text($product_ref);
+check_ingredients_between_languages($product_ref);
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:ingredients-hr:secer-is-possible-typo-for-hr:šećer"),
+ 'typo should be fetched')
+ or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_en => "Salt, invert sugar solution, strawberry.",
+ ingredients_text_hr => "Sol, newword, jagoda.",
+ ingredients_text_pl => "sól, roztwór cukru inwertowanego, truskawka.",
+};
+check_ingredients_between_languages($product_ref);
+ok(
+ has_tag(
+ $product_ref, "taxonomies_enhancer",
+ "en:ingredients-hr:newword-is-new-translation-for-en:invert-sugar-solution"
+ ),
+ 'new translation for invert sugar solution in hr but known in 2 differents lang'
+) or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_en => "Salt, invert sugar solution, strawberry.",
+ ingredients_text_hr => "Sol, mrkva, jagoda.",
+};
+check_ingredients_between_languages($product_ref);
+ok(
+ has_tag(
+ $product_ref,
+ "taxonomies_enhancer",
+ "en:ingredients-taxonomy-between-invert-sugar-solution-id:en:invert-sugar-solution-and-mrkva-id:en:carrot-should-be-same-id"
+ ),
+ 'different ids between lang hr and en leading to taxonomy mismatch warning'
+) or diag Dumper $product_ref;
+$product_ref = {
+ ingredients_text_en => "Salt, invert sugar solution, strawberry.",
+ ingredients_text_hr => "Sol, mrkva, jagoda.",
+ ingredients_text_pl => "sól, roztwór cukru inwertowanego, truskawka.",
+};
+check_ingredients_between_languages($product_ref);
+ok(
+ !exists $product_ref->{"taxonomies_enhancer_tags"},
+ 'more than 1 taxonomy mismatch found does not raise anything to prevent false positive for example ingredient list has been updated'
+) or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_fi =>
+ "Sianliha, suola, maussteet, sokeri, dekstroosi, säilöntäaineet: E250, E252; kuusen savu. Pakattu suojakaasuun. Suolapitoisuus: 5,0%, voimakassuolainen. 100g: aan tuotetta käytetty 128g lihaa.",
+ ingredients_text_sv =>
+ "Svinkött, salt, kryddor, socker, dextros, konserveringsmedel: E250, E252; granrök. Förpackat i en skyddande atmosfär. Salthalt: 5,0%, kraftigt saltat. Till 100g vara har används 128g kött.",
+};
+check_ingredients_between_languages($product_ref);
+ok(!exists $product_ref->{"taxonomies_enhancer_tags"}, 'fi-sv, last word of ingredient1 should be known to push tag')
+ or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_fi => "Sianliha",
+ ingredients_text_sv => "Svinkött",
+ taxonomies_enhancer_tags => ["no-matter-what"]
+};
+check_ingredients_between_languages($product_ref);
+ok(!exists $product_ref->{"taxonomies_enhancer_tags"}, 'fi-sv, nothing to do, make sure that tags field is removed')
+ or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_de =>
+ "Tomatenfruchtfleisch 40,9%, Wasser, Tomatenmarkkonzentrat 14%, Zwiebeln 12,5%, Sonnenblumenöl, Karotten 3,5%, Salz, natürliche Aromen, Zucker, Basilikum 0,2%, Knoblauch",
+ ingredients_text_en =>
+ "tomato pulp, onions, somenflower oil, carrots 3,5%, salt, natural flavors, sugar, bailic 0,2%, garlic,",
+ ingredients_text_fr =>
+ "Pulpe de tomate 40,9 %, eau, concentré de concentré de tomate 14 %, oignons, huile de tournesol, carottes 3,5 %, sel, arômes naturels, sucre, basilic 0, 2%, ail.",
+ ingredients_text_ro => "",
+};
+check_ingredients_between_languages($product_ref);
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:ingredients-fr:basilic-0-is-possible-typo-for-fr:basilic"),
+ 'parsing promblem due to OCR resulting in space after comma')
+ or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_bg => "Пастьоризирано МЛЯКО, со…ктериални култури, мая.",
+ ingredients_text_en =>
+ "EN WYDOJONE Lactose free* UHT Milk. The fat content of 3,2%. Source of calcium. *The lactose content < 0,01 g / 100 ml Milk is a valued element of a diet. Contains many necessary ingredients to maintain healthy body like protein and calcium. However, not everyone can consume milk or dairy products because of lactose intolerance naturally occurring sugar in its composition. Lactose intolerance is the inability to digest lactose. The alternative is a lactose free Milk.",
+ ingredients_text_pl => "Mleko bez laktozy UHT. Zawartość tłuszczu 3,2%. Źródło wapnia.",
+};
+check_ingredients_between_languages($product_ref);
+ok(
+ !exists $product_ref->{"taxonomies_enhancer_tags"},
+ 'nothing is detected. PL: single unknown ingredient BG: couple of ingredient, 1 unrecognized EN: whole product text'
+) or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_da =>
+ "_skummetmælk_, sukker, kakaosmør, vand, kakaomasse, kokosolie, glukosesirup, glukose-fruktosesirup, _sødmælkspulver_, vallepulver (_mælk_), _mælkefedt_, emulgatorer (E471, _sojalecithin_, E476), vaniljestang, stabilisatorer (E410, E412, E407), naturlig vaniljearoma (_mælk_), aroma, farvestof (E160a)",
+ ingredients_text_de =>
+ "entrahmte _Milch_, Zucker, Kakaobutter, Wasser, Kakaomasse, Kokosfett, Glukosesirup, Glukose-Fruktose-Sirup , _Vollmilchpulver_, _Molkenerzeugnis_, _Butterreinfett_, Emulgatoren (E471, Lecithine (_Soja_), E476), vermahlene Vanilleschoten, Stabilisatoren (E410, E412, E407), natürliches Vanillearoma (mit _Milch_), Aroma, Farbstoff (E160a)",
+ ingredients_text_en =>
+ "reconstituted skimmed milk , sugar, cocoa butter', water, coconut oil, cocoa mass', glucose syrup, glucose-fructose syrup, whole milk powder, whey solids ( milk ), butter oil ( milk ), emulsifiers (soybean lecithin, e476, e471), exhausted vanilla bean pieces, stabilisers (e407, e410, e412), natural vanilla flavouring', (with milk), flavouring, colour (e160a)",
+ ingredients_text_fr =>
+ "LAIT écrémé réhydraté, sucre, beurre de cacao¹, eau, huile de coco, pâte de cacao¹, sirop de glucose, sirop de glucose-fructose, LAIT en poudre entier, LACTOSE et protéines de LAIT, BEURRE concentré, émulsifiants (lécithine de SOJA, E476, E471), gousses de vanille épuisées broyées, stabilisants (E407, E410, E412), arôme naturel de vanille¹ (dont LAIT), arôme, colorant (E160a). Peut contenir: amande. Sans gluten.",
+ ingredients_text_hu =>
+ "visszaállított sovány _tej_, cukor, kakaóvaj, víz, kakaómassza, kókuszolaj, glükózsirup, glükóz-fruktózszőrp, zsiros _tejpor_, _tejsavókészítmény_, _vajolaj_, emulgeálószerek (E471, _szójalecitin_, E476), vanílla darabkák, stabilizátorok (E410, E412, E407), természetes vanília aroma (_tejszármazékkal_), aromák, szinezék (E160a)",
+ ingredients_text_it =>
+ "_latte_ scremato reidratato, zucchero, burro di cacao, acqua, pasta di cacao, olio di cocco, sciroppo di glucosio, sciroppo di glucosio-fruttosio, _latte_ intero in polvere, _lattosio_ e proteine del _latte_, _burro_ concentrato, emulsionanti (E471, lecitina di _soia_, E476), baccelli di vaniglia, addensanti (E410, E412, E407), aroma naturale di vaniglia (contiene _latte_), aromi, coloranti (E160a)",
+ ingredients_text_nl =>
+ "gerehydrateerde magere _melk_, suiker, cacaoboter, water, cacaomassa, kokosolie, glucosestroop, glucose-fructosestroop, volle _melkpoeder_, _lactose_, _melkeiwitten_, _boterconcentraat_, emulgatoren (E471, _sojalecithine_, E476), uitgepuue gemalen vanillestokjes, stabilisatoren (E410, E412, E407), natuurlijk vanillearoma (met _melk_), aroma, kleurstof (E160a)",
+ ingredients_text_sv =>
+ "_skummjölk_, socker, kakaosmör, vatten, kakaomassa, kokosolja, glukossirap, glukos-fruktossirap, _helmjölkspulver_, vasslepulver (_mjölk_), _mjölkfett_, emulgeringsmedel (E471, _sojalecitin_, E476), vaniljstångbitar, stabiliseringsmedel (E410, E412, E407), naturlig vaniljarom (_mjölk_), arom, färgämne (E160a)",
+};
+check_ingredients_between_languages($product_ref);
+ok(
+ has_tag(
+ $product_ref, "taxonomies_enhancer",
+ "en:ingredients-hu:vanílla-darabkák-is-new-translation-for-en:exhausted-ground-vanilla-pod"
+ ),
+ 'hu new translation'
+) or diag Dumper $product_ref;
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:ingredients-hu:szinezék-is-possible-typo-for-hu:színezék"),
+ 'hu 1/2 typo')
+ or diag Dumper $product_ref;
+ok(
+ has_tag($product_ref, "taxonomies_enhancer", "en:ingredients-hu:glükózsirup-is-possible-typo-for-hu:glükózszirup"),
+ 'hu 2/2 typo'
+) or diag Dumper $product_ref;
+
+$product_ref = {
+ ingredients_text_en => "spice extracts (including _celeriac_)",
+ ingredients_text_pl => "ekstrakty przypraw (w tym _seler_)",
+};
+check_ingredients_between_languages($product_ref);
+ok(
+ has_tag(
+ $product_ref,
+ "taxonomies_enhancer",
+ "en:ingredients-taxonomy-between-including-celeriac-id:en:celeriac-and-w-tym-seler-id:en:celery-should-be-same-id"
+ ),
+ 'en-pl, remove underscores'
+) or diag Dumper $product_ref;
+
+# TESTS RELATED TO EXISTING PRODUCTS
+# example based on 0036595328366
+$product_ref = {
+ ingredients_text_cs =>
+ "69% pšeničná mouka, pitná voda, řepkový olej, stabilizátor: glycerol; pšeničný lepek, regulátor kyselosti: kyselina jablečná; jedlá sůl, emulgátor: mono - a diglyceridy mastných kyselin; dextróza, kypřící látka: uhličitany sodné; konzervanty: propionan vápenatý, sorban draselný; látka zlepšující mouku: L-cystein. Skladujte v suchu a chraňte před teplem.",
+ ingredients_text_hr =>
+ "69% pšenično brašno, voda, repičino ulje, stabilizator. glicerol; pšenični gluten, regulator kiselosti: jabučna kiselina; kuhinjska sol, emulgator: mono - i digliceridi masnih kiselina; dekstroza, tvar za rahljenje: natrijevi karbonati; konzervansi: kalcijev propionat, kalijev sorbat; tvar za tretiranje brašna: L-cistein. Čuvati na suhom mjestu.",
+ ingredients_text_hu =>
+ "69% búzaliszt, ivóvíz, repceolaj, stabilizátor: glicerin; búzaglutén, savanyúságot szabályozó anyag: almasav; étkezési só, emulgeálószer: zsírsavak mono - és digliceridjei; dextróz, térfogatnövelő szer: nátrium-karbonátok; tartósítószerek: kalcium-propionát, kálium-szorbát; lisztkezelő szer: L-Cisztein.",
+ ingredients_text_pl =>
+ "69% mąka pszenna, woda, olej rzepakowy, stabilizator: glicerol; gluten pszenny, regulator kwasowości: kwas jabłkowy; sól, emuglator: mono - i diglicerydy kwasów tłuszczowych; glukoza, substancja spulchniająca: węglany sodu; substancje konserwujące: propionian wapnia, sorbinian potasu ; środek do przetwarzania mąki: L-cysteina.",
+ ingredients_text_ro =>
+ "69% făină de grâu, apă, ulei de rapiță, stabilizator: glicerol; gluten din grâu, corector de aciditate: acid malic; sare, emulsifiant: mono - şi digliceride ale acizilor graşi; dextroză, agent de afanare: carbonați de sodiu ; conservanți: propionat de calciu, sorbat de potasiu; agent de tratare a făinii: L-cisteină.",
+ ingredients_text_sk =>
+ "69% pšeničná múka, pitná voda, repkový olej, stabilizátor: glycerol; pšeničný glutén, regulátor kyslosti: kyselina jablčná; jedlá soľ, emulgátor: mono - a diglyceridy mastných kyselín; dextróza, kypriaca látka: uhličitany sodné; konzervačné látky: propionan vápenatý, sorban draselný; múku upravujúca látka: L-cystein.",
+ ingredients_text_sl =>
+ "69% pšenična moka, voda, olje oljne ogrščice, stabilizator: glicerol; pšenični gluten, sredstvo za uravnavanje kislosti: jabolčna kislina ; nejodirana sol, emulgator: mono - in diglicerid! maščobnih kislin; dekstroza, sredstvo za vzhajanje: natrijevi karbonati; konzervansa: kalcijev propionat , kalijev sorbat; sredstvo za obdelavo moke: L-cistein. Uporabno najmanj do: glej odtis na zadnji strani embalaže.",
+};
+check_ingredients_between_languages($product_ref);
+ok(
+ has_tag(
+ $product_ref, "taxonomies_enhancer",
+ "en:possible-stop-word-after-cs:skladujte-v-suchu-a-chraňte-před-teplem"
+ ),
+ 'cs-hr-hu-pl-ro-sk-sl, cs stopword'
+) or diag Dumper $product_ref;
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:possible-stop-word-after-sl:uporabno-najmanj-do"),
+ 'cs-hr-hu-pl-ro-sk-sl, sl stopword')
+ or diag Dumper $product_ref;
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:possible-stop-word-after-hr:čuvati-na-suhom-mjestu"),
+ 'cs-hr-hu-pl-ro-sk-sl, hr stopword')
+ or diag Dumper $product_ref;
+ok(
+ has_tag(
+ $product_ref, "taxonomies_enhancer",
+ "en:ingredients-sk:pšeničny-gluten-is-new-translation-for-en:wheat-gluten"
+ ),
+ 'cs-hr-hu-pl-ro-sk-sl, sk new translation'
+) or diag Dumper $product_ref;
+ok(
+ has_tag(
+ $product_ref, "taxonomies_enhancer",
+ "en:ingredients-ro:carbonați-de-sodiu-is-possible-typo-for-ro:carbonati-de-sodiu"
+ ),
+ 'cs-hr-hu-pl-ro-sk-sl, ro typo in taxonomy'
+) or diag Dumper $product_ref;
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:ingredients-pl:emuglator-is-possible-typo-for-pl:emulgator"),
+ 'cs-hr-hu-pl-ro-sk-sl, pl typo')
+ or diag Dumper $product_ref;
+ok(has_tag($product_ref, "taxonomies_enhancer", "en:ingredients-cs:konzervanty-is-possible-typo-for-cs:konzervant"),
+ 'cs-hr-hu-pl-ro-sk-sl, cs is missing a synonym or handle plural in product opener')
+ or diag Dumper $product_ref;
+ok(
+ has_tag(
+ $product_ref, "taxonomies_enhancer", "en:ingredients-cs:kypřící-látka-is-possible-typo-for-cs:kypřicí-látka"
+ ),
+ 'cs-hr-hu-pl-ro-sk-sl, cs missing declension'
+) or diag Dumper $product_ref;
+ok(
+ has_tag(
+ $product_ref, "taxonomies_enhancer",
+ "en:ingredients-sk:konzervačne-latky-is-possible-typo-for-sk:konzervačná-látka"
+ ),
+ 'cs-hr-hu-pl-ro-sk-sl, sk missing declension'
+) or diag Dumper $product_ref;
+ok(
+ has_tag(
+ $product_ref,
+ "taxonomies_enhancer",
+ "en:ingredients-ro:mono-şi-digliceride-ale-acizilor-graşi-is-possible-typo-for-ro:mono--și-digliceride-ale-acizilor-grași"
+ ),
+ 'cs-hr-hu-pl-ro-sk-sl, ro typo or synonym'
+) or diag Dumper $product_ref;
+ok(
+ has_tag(
+ $product_ref, "taxonomies_enhancer",
+ "en:ingredients-ro:agent-de-afanare-is-possible-typo-for-ro:agent-de-afânare"
+ ),
+ 'cs-hr-hu-pl-ro-sk-sl, ro typo or synonym again'
+) or diag Dumper $product_ref;
# example based on 20201845
-# should suggests translations
-# problem with english: some app translated in english from other languages. NOT producer translation.
-# for example: App translation (infood) probably based on RO:
-# ingredients_text_en => "water, wine vinegar, mustard seeds, [mustard husks], table salt, [acidifying]: citric acid, [natural flavors of cloves], cinnamon, ginger and tarragon, antioxidant: potassium metabisulphite, spice mixture",
-# versus Producer translation:
-# ingredients_text_en => "water, spirit vinegar, mustard seeds, husks of mustard seeds, salt, acidity regulator: citric acid, natural flavorings, antioxidant: potassium metabisulphite, turmeric",
-# in square brackets are unknown ingredients on the product
-# ingredients_text_es => "Agua, vinagre de alcohol, 24,5% semillas de mostaza, [cáscara de semillas de mostaza], sal, acidulante: [ácido citico]; aromas, antioxidante: metabisulfito potásico; especia.",
-# ingredients_text_hr => "Voda, alkoholni ocat, 24,5% sjemenke gorušice, [7,5% ljuske gorušice], kuhinjska sol, kiselina: limunska kiselina; arome, antioksidans: kalijev metabisulfit; začin.",
-# ingredients_text_ro => "apă, oțet din vin, [semințe de muştar], [coji de muştar], sare de masă, acidifiant: acid citric, [arome naturale de cuişoare], scorțișoară, ghimbir și tarhon, antioxidant: metabisulfit de potasiu, amestec de condimente.",
-# RO has more ingredients
-# ES has a typo ácido citico -> Ácido cítrico
-my $product_ref = {
- ingredients_text_es => "Agua, vinagre de alcohol, 24,5% semillas de mostaza, [cáscara de semillas de mostaza], sal, acidulante: [ácido citico]; aromas, antioxidante: metabisulfito potásico; especia.",
- ingredients_text_hr => "Voda, alkoholni ocat, 24,5% sjemenke gorušice, [7,5% ljuske gorušice], kuhinjska sol, kiselina: limunska kiselina; arome, antioksidans: kalijev metabisulfit; začin.",
- ingredients_text_ro => "apă, oțet din vin, [semințe de muştar], [coji de muştar], sare de masă, acidifiant: acid citric, [arome naturale de cuişoare], scorțișoară, ghimbir și tarhon, antioxidant: metabisulfit de potasiu, amestec de condimente.",
+# problem with English: some apps provide an English text translated from other languages, which is NOT the producer's translation.
+# for example: App translation (infood) probably based on RO:
+# ingredients_text_en => "water, wine vinegar, mustard seeds, [mustard husks], table salt, [acidifying]: citric acid, [natural flavors of cloves], cinnamon, ginger and tarragon, antioxidant: potassium metabisulphite, spice mixture",
+# versus Producer translation:
+# ingredients_text_en => "water, spirit vinegar, mustard seeds, husks of mustard seeds, salt, acidity regulator: citric acid, natural flavorings, antioxidant: potassium metabisulphite, turmeric",
+# the ingredients in square brackets are the unknown ingredients of the product
+# ingredients_text_es => "Agua, vinagre de alcohol, 24,5% semillas de mostaza, [cáscara de semillas de mostaza], sal , acidulante: [ácido citico] ; aromas , antioxidante: metabisulfito potásico ; especia.",
+# ingredients_text_hr => "Voda, alkoholni ocat , 24,5% sjemenke gorušice , [7,5% ljuske gorušice] , kuhinjska sol, kiselina : limunska kiselina; arome , antioksidans: kalijev metabisulfit ; začin.",
+# ingredients_text_ro => "apă , oțet din vin , [semințe de muştar] , [coji de muştar] , sare de masă , acidifiant: acid citric , [arome naturale de cuişoare], scorțișoară, ghimbir și tarhon, antioxidant : metabisulfit de potasiu, amestec de condimente.",
+# RO has more ingredients
+# ES has a typo: "ácido citico" should be "ácido cítrico"
+$product_ref = {
+ ingredients_text_es =>
+ "Agua, vinagre de alcohol, 24,5% semillas de mostaza, cáscara de semillas de mostaza, sal, acidulante: ácido citico; aromas, antioxidante: metabisulfito potásico; especia.",
+ ingredients_text_hr =>
+ "Voda, alkoholni ocat, 24,5% sjemenke gorušice, 7,5% ljuske gorušice, kuhinjska sol, kiselina: limunska kiselina; arome, antioksidans: kalijev metabisulfit; začin.",
+ ingredients_text_ro =>
+ "apă, oțet din vin, semințe de muştar, coji de muştar, sare de masă, acidifiant: acid citric, arome naturale de cuişoare, scorțișoară, ghimbir și tarhon, antioxidant: metabisulfit de potasiu, amestec de condimente.",
};
+check_ingredients_between_languages($product_ref);
+ok(
+ has_tag(
+ $product_ref, "taxonomies_enhancer", "en:ingredients-es:acido-citico-is-possible-typo-for-es:acido-citrico"
+ ),
+ 'es-hr-ro, typo in es'
+) or diag Dumper $product_ref;
+# example based on 8000430133035
+# the lists differ: in this example fr is missing "presure", although there is a picture of the fr ingredients list that includes "presure"
+# en:pasteurised-cow-s-milk and en:pasteurised-milk may be used interchangeably in the current taxonomy
+# my $product_ref = {
+# ingredients_text_cs => "",
+# ingredients_text_da => "Pasteuriseret _komælk_ , salt , microbiel løbe , surhedsregulerende middel: citronsyre.",
+# ingredients_text_de => "Pasteurisierte _Milch_ , Salz , Lab , Säuerungsmittel : Citronensäure.",
+# ingredients_text_en => "Pasteurised _milk_ , salt , vegetarian coagulant , acidity regulator : citric acid.",
+# ingredients_text_es => "_Leche_ , sal , coagulante microbiano y corrector de acidez (ácido cítrico).",
+# ingredients_text_fi => "Pastöroitu _maito_ , suola, juoksute , happamuudensäätöaine: sitruunahappo.",
+# ingredients_text_fr => "_Lait_ pasteurisé , sel , correcteur d'acidité acide citrique.",
+# ingredients_text_it => "",
+# ingredients_text_nl => "Gepasteuriseerde _melk_, zout , stremsel , zuurtegraadregelaar citroenzuur.",
+# ingredients_text_pt => "_Leite_ , sal , coalho , regulador de acidez : ácido cítrico.",
+# ingredients_text_ru => "Нормализованнов молоко , регулятор кислотности лимонная кислота, с использованием молокосвертывающего ферментного препарата микробного происхождения, рассол (вода питьевая, пищевая соль).",
+# ingredients_text_sv => "Pastöriserad _komjölk_ , salt , löpe , surhetsreglerande medel : citronsyra.",
+# };
+$product_ref = {
+ ingredients_text_cs => "",
+ ingredients_text_da => "Pasteuriseret _komælk_, salt, microbiel løbe, surhedsregulerende middel: citronsyre.",
+ ingredients_text_de => "Pasteurisierte _Milch_, Salz, Lab, Säuerungsmittel: Citronensäure.",
+ ingredients_text_en => "Pasteurised _milk_, salt, vegetarian coagulant, acidity regulator: citric acid.",
+ ingredients_text_es => "_Leche_, sal, coagulante microbiano y corrector de acidez (ácido cítrico).",
+ ingredients_text_fi => "Pastöroitu _maito_, suola, juoksute, happamuudensäätöaine: sitruunahappo.",
+ ingredients_text_fr => "_Lait_ pasteurisé, sel, correcteur d'acidité acide citrique.",
+ ingredients_text_it => "",
+ ingredients_text_nl => "Gepasteuriseerde _melk_, zout, stremsel, zuurtegraadregelaar citroenzuur.",
+ ingredients_text_pt => "_Leite_, sal, coalho, regulador de acidez: ácido cítrico.",
+ ingredients_text_ru =>
+ "Нормализованнов молоко, регулятор кислотности лимонная кислота, с использованием молокосвертывающего ферментного препарата микробного происхождения, рассол (вода питьевая, пищевая соль).",
+ ingredients_text_sv => "Pastöriserad _komjölk_, salt, löpe, surhetsreglerande medel: citronsyra.",
+};
+check_ingredients_between_languages($product_ref);
+ok(
+ has_tag(
+ $product_ref, "taxonomies_enhancer",
+ "en:ingredients-da:pasteuriseret-komælk-is-new-translation-for-en:pasteurised-cow-s-milk"
+ ),
+ 'cs-da-de-en-es-fi-fr-it-nl-pt-ru-sv, new word for da based on sv as well as en'
+) or diag Dumper $product_ref;
+ok(
+ has_tag(
+ $product_ref, "taxonomies_enhancer", "en:ingredients-da:microbiel-løbe-is-new-translation-for-en:coagulant"
+ ),
+ 'cs-da-de-en-es-fi-fr-it-nl-pt-ru-sv, new word for da based on es as well as en'
+) or diag Dumper $product_ref;
+ok(
+ has_tag(
+ $product_ref, "taxonomies_enhancer",
+ "en:ingredients-da:pasteuriseret-komælk-is-possible-typo-for-da:pasteuriseret-mælk"
+ ),
+ 'cs-da-de-en-es-fi-fr-it-nl-pt-ru-sv, typo in da based on fi as well as en'
+) or diag Dumper $product_ref;
done_testing();
From a398f7e1ad608336b96f81e56acf66387fdd6dad Mon Sep 17 00:00:00 2001
From: benbenben2 <110821832+benbenben2@users.noreply.github.com>
Date: Sat, 18 Jan 2025 22:33:52 +0100
Subject: [PATCH 3/6] rm problematic word for POD perl
---
lib/ProductOpener/TaxonomiesEnhancer.pm | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/lib/ProductOpener/TaxonomiesEnhancer.pm b/lib/ProductOpener/TaxonomiesEnhancer.pm
index 1979b200088b5..e6b03f7c5abca 100644
--- a/lib/ProductOpener/TaxonomiesEnhancer.pm
+++ b/lib/ProductOpener/TaxonomiesEnhancer.pm
@@ -444,7 +444,7 @@ sub remove_duplicates {
=head2 find_smallest_value_key ( hashmap )
-This function finds the key with the smallest value in a hashmap. If multiple keys have the same smallest value, it returns the lexicographically smallest key.
+This function finds the key with the smallest value in a hashmap. If multiple keys have the same smallest value, it returns the first key in alphabetical order.
=head3 Arguments
@@ -454,7 +454,7 @@ A hash reference where the keys are strings and the values are numeric.
=head3 Return value
-Returns the key with the smallest value. If multiple keys have the same smallest value, returns the lexicographically smallest key.
+Returns the key with the smallest value. If multiple keys have the same smallest value, returns the first key in alphabetical order.
=cut
@@ -484,7 +484,7 @@ sub find_smallest_value_key {
=head2 get_sorted_strings ( string1, string2 )
-This function returns two strings in lexicographically sorted order.
+This function returns two strings in alphabetical order.
=head3 Arguments
From 48b79f02101021a39722c2a7c4bd32eb9d44c742 Mon Sep 17 00:00:00 2001
From: benbenben2 <110821832+benbenben2@users.noreply.github.com>
Date: Sun, 19 Jan 2025 10:26:28 +0100
Subject: [PATCH 4/6] fix some Perl errors
---
lib/ProductOpener/TaxonomiesEnhancer.pm | 12 ++++++++++--
tests/unit/taxonomies_enhancer.t | 10 ++++------
2 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/lib/ProductOpener/TaxonomiesEnhancer.pm b/lib/ProductOpener/TaxonomiesEnhancer.pm
index e6b03f7c5abca..e2dc94e15f924 100644
--- a/lib/ProductOpener/TaxonomiesEnhancer.pm
+++ b/lib/ProductOpener/TaxonomiesEnhancer.pm
@@ -42,6 +42,7 @@ analyze ingredients and other fields to enrich the taxonomies
=cut
package ProductOpener::TaxonomiesEnhancer;
+use Exporter qw< import >;
BEGIN {
use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS);
@@ -51,7 +52,6 @@ BEGIN {
%EXPORT_TAGS = (all => [@EXPORT_OK]);
}
-use Exporter qw< import >;
use List::Util qw(any);
use Log::Log4perl qw(get_logger);
use Text::Levenshtein qw(distance);
@@ -156,6 +156,8 @@ sub parse_ingredients_for_language {
delete $ingredients_hash->{"ingredients_lc"} if exists $ingredients_hash->{"ingredients_lc"};
delete $ingredients_hash->{"ingredients_text"} if exists $ingredients_hash->{"ingredients_text"};
delete $ingredients_hash->{"ingredients_text_" . $lang} if exists $ingredients_hash->{"ingredients_text_" . $lang};
+
+ return;
}
=head2 not_enough_known_ingredients ( ingredients1, ingredients2 )
@@ -251,7 +253,7 @@ sub detect_missing_stop_words_before_list {
) if $log->is_debug();
# Return if first ingredient in ingredients1 is unknown or first ingredient in ingredients2 is known
- if (!$ingredients1->[0]{is_in_taxonomy} or $ingredients2->[0]{is_in_taxonomy}) {
+ if (!$ingredients1->[0]{is_in_taxonomy} || $ingredients2->[0]{is_in_taxonomy}) {
$log->debug(
"check_ingredients_between_languages > detect_missing_stop_words_before_list - first ingredient in ingredients1 ($ingredients1->[0]{id}) is unknown (is_in_taxonomy => $ingredients1->[0]{is_in_taxonomy}) or first ingredient in ingredients2 is known (is_in_taxonomy => $ingredients2->[0]{is_in_taxonomy})"
) if $log->is_debug();
@@ -278,6 +280,8 @@ sub detect_missing_stop_words_before_list {
$previous_ingredients_object = $ingredients2->[$i];
}
}
+
+ return;
}
=head2 get_ingredient_index ( ingredients, ingredient_id )
@@ -408,6 +412,8 @@ sub detect_missing_stop_words_after_list {
$missing_stop_words_after->{$lang2} = $unknown_ingredient_object->{id};
}
}
+
+ return;
}
=head2 remove_duplicates ( @array )
@@ -662,6 +668,8 @@ sub detect_missing_ingredients {
}
}
}
+
+ return;
}
=head2 check_ingredients_between_languages ( product_ref )
diff --git a/tests/unit/taxonomies_enhancer.t b/tests/unit/taxonomies_enhancer.t
index a86f32132c207..e18036ebd8911 100644
--- a/tests/unit/taxonomies_enhancer.t
+++ b/tests/unit/taxonomies_enhancer.t
@@ -1,16 +1,14 @@
#!/usr/bin/perl -w
+use Modern::Perl '2017';
+use utf8;
+
+use Test2::V0;
use Data::Dumper;
$Data::Dumper::Terse = 1; # rm variable name
$Data::Dumper::Indent = 1;
$Data::Dumper::Sortkeys = 1;
-use Test2::V0;
-
-# use Modern::Perl '2017';
-# use utf8;
-# use Log::Any::Adapter 'TAP';
-
use ProductOpener::Tags qw/has_tag/;
use ProductOpener::TaxonomiesEnhancer qw/check_ingredients_between_languages/;
From 6eb77a0aba32d827e4da923b2d96a7c5620aebd4 Mon Sep 17 00:00:00 2001
From: benbenben2 <110821832+benbenben2@users.noreply.github.com>
Date: Sun, 19 Jan 2025 10:54:55 +0100
Subject: [PATCH 5/6] fix some Perl errors
---
lib/ProductOpener/TaxonomiesEnhancer.pm | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/lib/ProductOpener/TaxonomiesEnhancer.pm b/lib/ProductOpener/TaxonomiesEnhancer.pm
index e2dc94e15f924..046135d570785 100644
--- a/lib/ProductOpener/TaxonomiesEnhancer.pm
+++ b/lib/ProductOpener/TaxonomiesEnhancer.pm
@@ -18,8 +18,6 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
-=encoding UTF-8
-
=head1 NAME
ProductOpener::TaxonomiesEnhancer - analyze ingredients and other fields to enrich the taxonomies
@@ -42,6 +40,8 @@ analyze ingredients and other fields to enrich the taxonomies
=cut
package ProductOpener::TaxonomiesEnhancer;
+
+use ProductOpener::PerlStandards;
use Exporter qw< import >;
BEGIN {
@@ -51,14 +51,13 @@ BEGIN {
);
%EXPORT_TAGS = (all => [@EXPORT_OK]);
}
+use vars @EXPORT_OK;
use List::Util qw(any);
use Log::Log4perl qw(get_logger);
use Text::Levenshtein qw(distance);
-use vars @EXPORT_OK;
use ProductOpener::Ingredients qw/parse_ingredients_text_service/;
-use ProductOpener::PerlStandards;
use ProductOpener::Tags qw/add_tag get_taxonomy_tag_synonyms is_a/;
# Configure Log4perl
@@ -797,6 +796,8 @@ sub check_ingredients_between_languages {
);
}
}
+
+ return;
}
1;
From b5008e7c8694f6bbc3abb7e633533430c2c3e316 Mon Sep 17 00:00:00 2001
From: benbenben2 <110821832+benbenben2@users.noreply.github.com>
Date: Sun, 19 Jan 2025 11:27:57 +0100
Subject: [PATCH 6/6] fix some Perl errors
---
lib/ProductOpener/TaxonomiesEnhancer.pm | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/ProductOpener/TaxonomiesEnhancer.pm b/lib/ProductOpener/TaxonomiesEnhancer.pm
index 046135d570785..c73cfb119d8bc 100644
--- a/lib/ProductOpener/TaxonomiesEnhancer.pm
+++ b/lib/ProductOpener/TaxonomiesEnhancer.pm
@@ -797,7 +797,7 @@ sub check_ingredients_between_languages {
}
}
- return;
+ return;
}
1;