From 782434ee6f06cc325421328b24d2925f6f8918d7 Mon Sep 17 00:00:00 2001
From: benbenben2 <110821832+benbenben2@users.noreply.github.com>
Date: Wed, 8 Jan 2025 17:50:24 +0100
Subject: [PATCH] partial devs

---
 lib/ProductOpener/FoodProducts.pm       |   2 +
 lib/ProductOpener/TaxonomiesEnhancer.pm | 105 ++++++++++++++++++++++++
 tests/unit/taxonomies_enhancer.t        |  45 ++++++++++
 3 files changed, 152 insertions(+)
 create mode 100644 lib/ProductOpener/TaxonomiesEnhancer.pm
 create mode 100644 tests/unit/taxonomies_enhancer.t

diff --git a/lib/ProductOpener/FoodProducts.pm b/lib/ProductOpener/FoodProducts.pm
index ce57fec8f911d..775fa0d6c8f75 100644
--- a/lib/ProductOpener/FoodProducts.pm
+++ b/lib/ProductOpener/FoodProducts.pm
@@ -58,6 +58,7 @@ use ProductOpener::FoodGroups qw/compute_food_groups/;
 use ProductOpener::Nutriscore qw/:all/;
 use ProductOpener::EnvironmentalScore qw/compute_environmental_score/;
 use ProductOpener::ForestFootprint qw/compute_forest_footprint/;
+use ProductOpener::TaxonomiesEnhancer qw/detect_taxonomy_translation_from_text/;
 
 use Log::Any qw($log);
 
@@ -83,6 +84,7 @@ sub specific_processes_for_food_product ($product_ref) {
 	extract_ingredients_from_text($product_ref);
 	extract_additives_from_text($product_ref);
 	detect_allergens_from_text($product_ref);
+	detect_taxonomy_translation_from_text($product_ref);
 
 	# Category analysis
 	# Food category rules for sweetened/sugared beverages
diff --git a/lib/ProductOpener/TaxonomiesEnhancer.pm b/lib/ProductOpener/TaxonomiesEnhancer.pm
new file mode 100644
index 0000000000000..abcdae7b6a92f
--- /dev/null
+++ b/lib/ProductOpener/TaxonomiesEnhancer.pm
@@ -0,0 +1,105 @@
+# This file is part of Product Opener.
+#
+# Product Opener
+# Copyright (C) 2011-2023 Association Open Food Facts
+# Contact: contact@openfoodfacts.org
+# Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France
+#
+# Product Opener is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+=encoding UTF-8
+
+=head1 NAME
+
+ProductOpener::TaxonomiesEnhancer - analyze ingredients and other fields to enrich the taxonomies
+
+=head1 SYNOPSIS
+
+C<ProductOpener::TaxonomiesEnhancer> analyzes ingredients and other fields
+to enrich the taxonomies.
+
+    use ProductOpener::TaxonomiesEnhancer qw/:all/;
+
+    [..]
+
+    detect_taxonomy_translation_from_text($product_ref);
+
+=head1 DESCRIPTION
+
+[..]
+
+=cut
+
+package ProductOpener::TaxonomiesEnhancer;
+
+use ProductOpener::PerlStandards;
+use Exporter qw< import >;
+
+BEGIN {
+	use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS);
+	@EXPORT_OK = qw(
+
+		&detect_taxonomy_translation_from_text
+	);    # symbols to export on request
+	%EXPORT_TAGS = (all => [@EXPORT_OK]);
+}
+
+use vars @EXPORT_OK;    # predeclare every exportable symbol named in @EXPORT_OK
+# use experimental 'smartmatch';
+# use Encode;
+# use Clone qw(clone);
+# use LWP::UserAgent;
+# use Encode;
+# use JSON::MaybeXS;
+use Log::Any qw($log);
+# use List::MoreUtils qw(uniq);
+# use Data::DeepAccess qw(deep_get deep_exists);
+
+# use ProductOpener::Store qw/get_string_id_for_lang unac_string_perl/;
+# use ProductOpener::Config qw/:all/;
+# use ProductOpener::Users qw/:all/;
+# use ProductOpener::Tags qw/:all/;
+# use ProductOpener::Products qw/remove_fields/;
+# use ProductOpener::URL qw/:all/;
+# use ProductOpener::Images qw/extract_text_from_image/;
+# use ProductOpener::Lang qw/$lc %Lang lang/;
+# use ProductOpener::Units qw/normalize_quantity/;
+# use ProductOpener::Food qw/is_fat_oil_nuts_seeds_for_nutrition_score/;
+use ProductOpener::Ingredients qw/parse_ingredients_text_service/;    # NOTE(review): not called by the code in this patch - presumably for upcoming work, confirm
+
+
+
+=head2 detect_taxonomy_translation_from_text ( product_ref )
+
+Intended to extract the data for each language from the provided product reference, then to detect
+failed extractions (missing stop words) and to identify missing translations. Not implemented yet: the function is currently a placeholder.
+
+=head3 Arguments
+
+=head4 product_ref
+
+A reference to the product data, expected to be a hash reference (the accompanying unit test passes ingredients_text_XX keys, XX being a language code).
+
+=head3 Return value
+
+This function does not return any value. It performs the extraction and detection internally.
+
+=cut
+
+sub detect_taxonomy_translation_from_text ($product_ref) {
+	$log->debug("detect_taxonomy_translation_from_text - start") if $log->is_debug();
+	return;    # placeholder: no analysis yet; tracing goes through $log only, never raw STDERR
+}
+
+1;
diff --git a/tests/unit/taxonomies_enhancer.t b/tests/unit/taxonomies_enhancer.t
new file mode 100644
index 0000000000000..23250ebb41f6f
--- /dev/null
+++ b/tests/unit/taxonomies_enhancer.t
@@ -0,0 +1,45 @@
+#!/usr/bin/perl -w
+
+use Modern::Perl '2017';
+use utf8;
+
+use Test2::V0;
+use Log::Any::Adapter 'TAP';
+
+use ProductOpener::TaxonomiesEnhancer qw/detect_taxonomy_translation_from_text/;
+
+
+# example based on 0036595328366
+# should detect stopwords
+my $product_ref = {
+	ingredients_text_cs => "69% pšeničná mouka , pitná voda, řepkový olej , stabilizátor: glycerol; pšeničný lepek , regulátor kyselosti : kyselina jablečná; jedlá sůl , emulgátor : mono - a diglyceridy mastných kyselin ; dextróza , kypřící látka : uhličitany sodné ; konzervanty : propionan vápenatý , sorban draselný ; látka zlepšující mouku : L-cystein. Skladujte v suchu a chraňte před teplem.",
+	ingredients_text_hr => "69% pšenično brašno, voda , repičino ulje , stabilizator. glicerol; pšenični gluten, regulator kiselosti : jabučna kiselina ; kuhinjska sol , emulgator : mono - i digliceridi masnih kiselina ; dekstroza, tvar za rahljenje : natrijevi karbonati; konzervansi : kalcijev propionat , kalijev sorbat ; tvar za tretiranje brašna : L-cistein. Čuvati na suhom mjestu.",
+	ingredients_text_hu => "69% búzaliszt , ivóvíz , repceolaj , stabilizátor: glicerin; búzaglutén , savanyúságot szabályozó anyag : almasav ; étkezési só , emulgeálószer: zsírsavak mono - és digliceridjei ; dextróz , térfogatnövelő szer : nátrium-karbonátok ; tartósítószerek : kalcium-propionát , kálium-szorbát ; lisztkezelő szer : L-Cisztein.",
+	ingredients_text_pl => "69% mąka pszenna , woda , olej rzepakowy , stabilizator: glicerol; gluten pszenny , regulator kwasowości : kwas jabłkowy ; sól , emuglator : mono - i diglicerydy kwasów tłuszczowych; glukoza , substancja spulchniająca: węglany sodu ; substancje konserwujące: propionian wapnia , sorbinian potasu ; środek do przetwarzania mąki: L-cysteina.",
+	ingredients_text_ro => "69% făină de grâu , apă , ulei de rapiță , stabilizator: glicerol; gluten din grâu, corector de aciditate : acid malic ; sare , emulsifiant : mono - şi digliceride ale acizilor graşi; dextroză , agent de afanare : carbonați de sodiu ; conservanți : propionat de calciu, sorbat de potasiu; agent de tratare a făinii : L-cisteină.",
+	ingredients_text_sk => "69% pšeničná múka , pitná voda, repkový olej , stabilizátor: glycerol; pšeničný glutén, regulátor kyslosti : kyselina jablčná ; jedlá soľ , emulgátor : mono - a diglyceridy mastných kyselín ; dextróza , kypriaca látka : uhličitany sodné ; konzervačné látky : propionan vápenatý , sorban draselný ; múku upravujúca látka : L-cystein.",
+	ingredients_text_sl => "69% pšenična moka , voda , olje oljne ogrščice, stabilizator: glicerol; pšenični gluten, sredstvo za uravnavanje kislosti: jabolčna kislina ; nejodirana sol, emulgator : mono - in diglicerid! maščobnih kislin ; dekstroza, sredstvo za vzhajanje : natrijevi karbonati; konzervansa : kalcijev propionat , kalijev sorbat ; sredstvo za obdelavo moke : L-cistein. Uporabno najmanj do: glej odtis na zadnji strani embalaže.",
+};
+ok(lives { detect_taxonomy_translation_from_text($product_ref) }, "stopwords example is processed without dying") or note($@);
+
+# example based on 20201845
+# should suggest translations
+# problem with english: some app translated in english from other languages. NOT producer translation.
+# for example: App translation (infood) probably based on RO:
+# ingredients_text_en => "water, wine vinegar, mustard seeds, [mustard husks], table salt, [acidifying]: citric acid, [natural flavors of cloves], cinnamon, ginger and tarragon, antioxidant: potassium metabisulphite, spice mixture",
+# versus Producer translation:
+# ingredients_text_en => "water, spirit vinegar, mustard seeds, husks of mustard seeds, salt, acidity regulator: citric acid, natural flavorings, antioxidant: potassium metabisulphite, turmeric",
+# in square brackets are unknown ingredients on the product
+# ingredients_text_es => "Agua, vinagre de alcohol, 24,5% semillas de mostaza, [cáscara de semillas de mostaza], sal, acidulante: [ácido citico]; aromas, antioxidante: metabisulfito potásico; especia.",
+# ingredients_text_hr => "Voda, alkoholni ocat, 24,5% sjemenke gorušice, [7,5% ljuske gorušice], kuhinjska sol, kiselina: limunska kiselina; arome, antioksidans: kalijev metabisulfit; začin.",
+# ingredients_text_ro => "apă, oțet din vin, [semințe de muştar], [coji de muştar], sare de masă, acidifiant: acid citric, [arome naturale de cuişoare], scorțișoară, ghimbir și tarhon, antioxidant: metabisulfit de potasiu, amestec de condimente.",
+# RO has more ingredients
+# ES has a typo ácido citico -> Ácido cítrico
+$product_ref = {
+	ingredients_text_es => "Agua, vinagre de alcohol, 24,5% semillas de mostaza, [cáscara de semillas de mostaza], sal, acidulante: [ácido citico]; aromas, antioxidante: metabisulfito potásico; especia.",
+	ingredients_text_hr => "Voda, alkoholni ocat, 24,5% sjemenke gorušice, [7,5% ljuske gorušice], kuhinjska sol, kiselina: limunska kiselina; arome, antioksidans: kalijev metabisulfit; začin.",
+	ingredients_text_ro => "apă, oțet din vin, [semințe de muştar], [coji de muştar], sare de masă, acidifiant: acid citric, [arome naturale de cuişoare], scorțișoară, ghimbir și tarhon, antioxidant: metabisulfit de potasiu, amestec de condimente.",
+};
+ok(lives { detect_taxonomy_translation_from_text($product_ref) }, "translations example is processed without dying") or note($@);
+
+done_testing();