From 782434ee6f06cc325421328b24d2925f6f8918d7 Mon Sep 17 00:00:00 2001
From: benbenben2 <110821832+benbenben2@users.noreply.github.com>
Date: Wed, 8 Jan 2025 17:50:24 +0100
Subject: [PATCH] partial devs
---
lib/ProductOpener/FoodProducts.pm | 2 +
lib/ProductOpener/TaxonomiesEnhancer.pm | 105 ++++++++++++++++++++++++
tests/unit/taxonomies_enhancer.t | 45 ++++++++++
3 files changed, 152 insertions(+)
create mode 100644 lib/ProductOpener/TaxonomiesEnhancer.pm
create mode 100644 tests/unit/taxonomies_enhancer.t
diff --git a/lib/ProductOpener/FoodProducts.pm b/lib/ProductOpener/FoodProducts.pm
index ce57fec8f911d..775fa0d6c8f75 100644
--- a/lib/ProductOpener/FoodProducts.pm
+++ b/lib/ProductOpener/FoodProducts.pm
@@ -58,6 +58,7 @@ use ProductOpener::FoodGroups qw/compute_food_groups/;
use ProductOpener::Nutriscore qw/:all/;
use ProductOpener::EnvironmentalScore qw/compute_environmental_score/;
use ProductOpener::ForestFootprint qw/compute_forest_footprint/;
+use ProductOpener::TaxonomiesEnhancer qw/detect_taxonomy_translation_from_text/;
use Log::Any qw($log);
@@ -83,6 +84,7 @@ sub specific_processes_for_food_product ($product_ref) {
extract_ingredients_from_text($product_ref);
extract_additives_from_text($product_ref);
detect_allergens_from_text($product_ref);
+ detect_taxonomy_translation_from_text($product_ref);
# Category analysis
# Food category rules for sweetened/sugared beverages
diff --git a/lib/ProductOpener/TaxonomiesEnhancer.pm b/lib/ProductOpener/TaxonomiesEnhancer.pm
new file mode 100644
index 0000000000000..abcdae7b6a92f
--- /dev/null
+++ b/lib/ProductOpener/TaxonomiesEnhancer.pm
@@ -0,0 +1,105 @@
+# This file is part of Product Opener.
+#
+# Product Opener
+# Copyright (C) 2011-2023 Association Open Food Facts
+# Contact: contact@openfoodfacts.org
+# Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France
+#
+# Product Opener is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+=encoding UTF-8
+
+=head1 NAME
+
+ProductOpener::TaxonomiesEnhancer - analyze ingredients and other fields to enrich the taxonomies
+
+=head1 SYNOPSIS
+
+C<ProductOpener::TaxonomiesEnhancer> analyzes ingredients and other fields
+to enrich the taxonomies.
+
+ use ProductOpener::TaxonomiesEnhancer qw/:all/;
+
+ [..]
+
+ detect_taxonomy_translation_from_text($product_ref);
+
+=head1 DESCRIPTION
+
+[..]
+
+=cut
+
+package ProductOpener::TaxonomiesEnhancer;
+
+use ProductOpener::PerlStandards;
+use Exporter qw< import >;
+
+BEGIN {    # runs at compile time, so the export lists below are filled before the rest of the file is compiled
+ use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS);    # declare the package globals; @ISA is declared but not assigned in this block
+ @EXPORT_OK = qw(
+
+ &detect_taxonomy_translation_from_text
+ ); # symbols to export on request
+ %EXPORT_TAGS = (all => [@EXPORT_OK]);    # the ":all" tag covers every symbol listed in @EXPORT_OK
+}
+
+use vars @EXPORT_OK;
+# use experimental 'smartmatch';
+# use Encode;
+# use Clone qw(clone);
+# use LWP::UserAgent;
+# use Encode;
+# use JSON::MaybeXS;
+use Log::Any qw($log);
+# use List::MoreUtils qw(uniq);
+# use Data::DeepAccess qw(deep_get deep_exists);
+
+# use ProductOpener::Store qw/get_string_id_for_lang unac_string_perl/;
+# use ProductOpener::Config qw/:all/;
+# use ProductOpener::Users qw/:all/;
+# use ProductOpener::Tags qw/:all/;
+# use ProductOpener::Products qw/remove_fields/;
+# use ProductOpener::URL qw/:all/;
+# use ProductOpener::Images qw/extract_text_from_image/;
+# use ProductOpener::Lang qw/$lc %Lang lang/;
+# use ProductOpener::Units qw/normalize_quantity/;
+# use ProductOpener::Food qw/is_fat_oil_nuts_seeds_for_nutrition_score/;
+use ProductOpener::Ingredients qw/parse_ingredients_text_service/;
+
+
+
+=head2 detect_taxonomy_translation_from_text ( product_ref )
+
+This function is meant to extract data for each language from the provided product reference, then
+detect failed extractions (missing stop words) and missing translations. Work in progress: for now it only logs a debug message.
+
+=head3 Arguments
+
+=head4 product_ref
+
+A reference to the product data, which is expected to be a hash reference containing the necessary information.
+
+=head3 Return value
+
+This function does not return any value. The product reference is currently left untouched.
+
+=cut
+
+sub detect_taxonomy_translation_from_text ($product_ref) {
+ $log->debug("detect_taxonomy_translation_from_text - start") if $log->is_debug();
+ return;
+}
+
+1;
diff --git a/tests/unit/taxonomies_enhancer.t b/tests/unit/taxonomies_enhancer.t
new file mode 100644
index 0000000000000..23250ebb41f6f
--- /dev/null
+++ b/tests/unit/taxonomies_enhancer.t
@@ -0,0 +1,45 @@
+#!/usr/bin/perl -w
+
+use Modern::Perl '2017';
+use utf8;
+
+use Test2::V0;
+use Log::Any::Adapter 'TAP';
+
+use ProductOpener::TaxonomiesEnhancer qw/detect_taxonomy_translation_from_text/;
+
+
+# example based on product 0036595328366
+# should detect stopwords (presumably the non-ingredient sentences trailing the cs, hr and sl texts - to be confirmed)
+my $product_ref = {
+ ingredients_text_cs => "69% pšeničná mouka , pitná voda, řepkový olej , stabilizátor: glycerol; pšeničný lepek , regulátor kyselosti : kyselina jablečná; jedlá sůl , emulgátor : mono - a diglyceridy mastných kyselin ; dextróza , kypřící látka : uhličitany sodné ; konzervanty : propionan vápenatý , sorban draselný ; látka zlepšující mouku : L-cystein. Skladujte v suchu a chraňte před teplem.",
+ ingredients_text_hr => "69% pšenično brašno, voda , repičino ulje , stabilizator. glicerol; pšenični gluten, regulator kiselosti : jabučna kiselina ; kuhinjska sol , emulgator : mono - i digliceridi masnih kiselina ; dekstroza, tvar za rahljenje : natrijevi karbonati; konzervansi : kalcijev propionat , kalijev sorbat ; tvar za tretiranje brašna : L-cistein. Čuvati na suhom mjestu.",
+ ingredients_text_hu => "69% búzaliszt , ivóvíz , repceolaj , stabilizátor: glicerin; búzaglutén , savanyúságot szabályozó anyag : almasav ; étkezési só , emulgeálószer: zsírsavak mono - és digliceridjei ; dextróz , térfogatnövelő szer : nátrium-karbonátok ; tartósítószerek : kalcium-propionát , kálium-szorbát ; lisztkezelő szer : L-Cisztein.",
+ ingredients_text_pl => "69% mąka pszenna , woda , olej rzepakowy , stabilizator: glicerol; gluten pszenny , regulator kwasowości : kwas jabłkowy ; sól , emuglator : mono - i diglicerydy kwasów tłuszczowych; glukoza , substancja spulchniająca: węglany sodu ; substancje konserwujące: propionian wapnia , sorbinian potasu ; środek do przetwarzania mąki: L-cysteina.",
+ ingredients_text_ro => "69% făină de grâu , apă , ulei de rapiță , stabilizator: glicerol; gluten din grâu, corector de aciditate : acid malic ; sare , emulsifiant : mono - şi digliceride ale acizilor graşi; dextroză , agent de afanare : carbonați de sodiu ; conservanți : propionat de calciu, sorbat de potasiu; agent de tratare a făinii : L-cisteină.",
+ ingredients_text_sk => "69% pšeničná múka , pitná voda, repkový olej , stabilizátor: glycerol; pšeničný glutén, regulátor kyslosti : kyselina jablčná ; jedlá soľ , emulgátor : mono - a diglyceridy mastných kyselín ; dextróza , kypriaca látka : uhličitany sodné ; konzervačné látky : propionan vápenatý , sorban draselný ; múku upravujúca látka : L-cystein.",
+ ingredients_text_sl => "69% pšenična moka , voda , olje oljne ogrščice, stabilizator: glicerol; pšenični gluten, sredstvo za uravnavanje kislosti: jabolčna kislina ; nejodirana sol, emulgator : mono - in diglicerid! maščobnih kislin ; dekstroza, sredstvo za vzhajanje : natrijevi karbonati; konzervansa : kalcijev propionat , kalijev sorbat ; sredstvo za obdelavo moke : L-cistein. Uporabno najmanj do: glej odtis na zadnji strani embalaže.",
+};
+ok(lives { detect_taxonomy_translation_from_text($product_ref) }, 'stopwords example is processed without dying');    # smoke test only, until detection results can be asserted
+
+# example based on 20201845
+# should suggest translations
+# problem with english: some app translated in english from other languages. NOT producer translation.
+# for example: App translation (infood) probably based on RO:
+# ingredients_text_en => "water, wine vinegar, mustard seeds, [mustard husks], table salt, [acidifying]: citric acid, [natural flavors of cloves], cinnamon, ginger and tarragon, antioxidant: potassium metabisulphite, spice mixture",
+# versus Producer translation:
+# ingredients_text_en => "water, spirit vinegar, mustard seeds, husks of mustard seeds, salt, acidity regulator: citric acid, natural flavorings, antioxidant: potassium metabisulphite, turmeric",
+# in square brackets are unknown ingredients on the product
+# ingredients_text_es => "Agua, vinagre de alcohol, 24,5% semillas de mostaza, [cáscara de semillas de mostaza], sal, acidulante: [ácido citico]; aromas, antioxidante: metabisulfito potásico; especia.",
+# ingredients_text_hr => "Voda, alkoholni ocat, 24,5% sjemenke gorušice, [7,5% ljuske gorušice], kuhinjska sol, kiselina: limunska kiselina; arome, antioksidans: kalijev metabisulfit; začin.",
+# ingredients_text_ro => "apă, oțet din vin, [semințe de muştar], [coji de muştar], sare de masă, acidifiant: acid citric, [arome naturale de cuişoare], scorțișoară, ghimbir și tarhon, antioxidant: metabisulfit de potasiu, amestec de condimente.",
+# RO has more ingredients
+# ES has a typo ácido citico -> Ácido cítrico
+my $product_ref_translations = {
+ ingredients_text_es => "Agua, vinagre de alcohol, 24,5% semillas de mostaza, [cáscara de semillas de mostaza], sal, acidulante: [ácido citico]; aromas, antioxidante: metabisulfito potásico; especia.",
+ ingredients_text_hr => "Voda, alkoholni ocat, 24,5% sjemenke gorušice, [7,5% ljuske gorušice], kuhinjska sol, kiselina: limunska kiselina; arome, antioksidans: kalijev metabisulfit; začin.",
+ ingredients_text_ro => "apă, oțet din vin, [semințe de muştar], [coji de muştar], sare de masă, acidifiant: acid citric, [arome naturale de cuişoare], scorțișoară, ghimbir și tarhon, antioxidant: metabisulfit de potasiu, amestec de condimente.",
+};
+ok(lives { detect_taxonomy_translation_from_text($product_ref_translations) }, 'translations example is processed without dying');    # smoke test only, until suggestions can be asserted
+
+done_testing();