Skip to content

Commit

Permalink
partial devs
Browse files Browse the repository at this point in the history
  • Loading branch information
benbenben2 committed Jan 8, 2025
1 parent 765d796 commit 782434e
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 0 deletions.
2 changes: 2 additions & 0 deletions lib/ProductOpener/FoodProducts.pm
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ use ProductOpener::FoodGroups qw/compute_food_groups/;
use ProductOpener::Nutriscore qw/:all/;
use ProductOpener::EnvironmentalScore qw/compute_environmental_score/;
use ProductOpener::ForestFootprint qw/compute_forest_footprint/;
use ProductOpener::TaxonomiesEnhancer qw/detect_taxonomy_translation_from_text/;

use Log::Any qw($log);

Expand All @@ -83,6 +84,7 @@ sub specific_processes_for_food_product ($product_ref) {
extract_ingredients_from_text($product_ref);
extract_additives_from_text($product_ref);
detect_allergens_from_text($product_ref);
detect_taxonomy_translation_from_text($product_ref);

# Category analysis
# Food category rules for sweetened/sugared beverages
Expand Down
105 changes: 105 additions & 0 deletions lib/ProductOpener/TaxonomiesEnhancer.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# This file is part of Product Opener.
#
# Product Opener
# Copyright (C) 2011-2023 Association Open Food Facts
# Contact: contact@openfoodfacts.org
# Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France
#
# Product Opener is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

=encoding UTF-8

=head1 NAME

ProductOpener::TaxonomiesEnhancer - analyze ingredients and other fields to enrich the taxonomies

=head1 SYNOPSIS

C<ProductOpener::TaxonomiesEnhancer> analyzes ingredients and other fields to enrich the taxonomies.

    use ProductOpener::TaxonomiesEnhancer qw/:all/;

    [..]

    detect_taxonomy_translation_from_text($product_ref);

=head1 DESCRIPTION

[..]

=cut

package ProductOpener::TaxonomiesEnhancer;

use ProductOpener::PerlStandards;
use Exporter qw< import >;

BEGIN {
	# Declare the export-related package globals so they can be used under strict.
	use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS);

	# Symbols exported on request only.
	@EXPORT_OK = ('&detect_taxonomy_translation_from_text');

	# The ":all" tag holds a copy of the on-request export list.
	%EXPORT_TAGS = (all => [@EXPORT_OK]);
}

use vars @EXPORT_OK;
# use experimental 'smartmatch';
# use Encode;
# use Clone qw(clone);
# use LWP::UserAgent;
# use Encode;
# use JSON::MaybeXS;
use Log::Any qw($log);
# use List::MoreUtils qw(uniq);
# use Data::DeepAccess qw(deep_get deep_exists);

# use ProductOpener::Store qw/get_string_id_for_lang unac_string_perl/;
# use ProductOpener::Config qw/:all/;
# use ProductOpener::Users qw/:all/;
# use ProductOpener::Tags qw/:all/;
# use ProductOpener::Products qw/remove_fields/;
# use ProductOpener::URL qw/:all/;
# use ProductOpener::Images qw/extract_text_from_image/;
# use ProductOpener::Lang qw/$lc %Lang lang/;
# use ProductOpener::Units qw/normalize_quantity/;
# use ProductOpener::Food qw/is_fat_oil_nuts_seeds_for_nutrition_score/;
use ProductOpener::Ingredients qw/parse_ingredients_text_service/;



=head2 detect_taxonomy_translation_from_text ( product_ref )

This function is intended to extract data for each language from the provided product reference,
then detect failed extractions (missing stop words) and identify missing translations.

The current implementation is a placeholder: it only emits a debug log entry
and does not read or modify the product.

=head3 Arguments

=head4 product_ref

A reference to the product data, which is expected to be a hash reference containing the necessary information.

=head3 Return value

This function does not return any value.

=cut

sub detect_taxonomy_translation_from_text ($product_ref) {
	# Trace entry through the logger only: an unconditional print to STDERR
	# would add noise for every processed product and bypass the log configuration.
	$log->debug("detect_taxonomy_translation_from_text - start") if $log->is_debug();

	# Bare return so that the function yields nothing, as documented,
	# instead of leaking the value of the last evaluated statement.
	return;
}

1;
45 changes: 45 additions & 0 deletions tests/unit/taxonomies_enhancer.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/perl -w

use Modern::Perl '2017';
use utf8;

use Test2::V0;
use Log::Any::Adapter 'TAP';

use ProductOpener::TaxonomiesEnhancer qw/detect_taxonomy_translation_from_text/;


# example based on 0036595328366
# should detect stopwords
my $product_ref = {
	ingredients_text_cs => "69% pšeničná mouka , pitná voda, řepkový olej , stabilizátor: glycerol; pšeničný lepek , regulátor kyselosti : kyselina jablečná; jedlá sůl , emulgátor : mono - a diglyceridy mastných kyselin ; dextróza , kypřící látka : uhličitany sodné ; konzervanty : propionan vápenatý , sorban draselný ; látka zlepšující mouku : L-cystein. Skladujte v suchu a chraňte před teplem.",
	ingredients_text_hr => "69% pšenično brašno, voda , repičino ulje , stabilizator. glicerol; pšenični gluten, regulator kiselosti : jabučna kiselina ; kuhinjska sol , emulgator : mono - i digliceridi masnih kiselina ; dekstroza, tvar za rahljenje : natrijevi karbonati; konzervansi : kalcijev propionat , kalijev sorbat ; tvar za tretiranje brašna : L-cistein. Čuvati na suhom mjestu.",
	ingredients_text_hu => "69% búzaliszt , ivóvíz , repceolaj , stabilizátor: glicerin; búzaglutén , savanyúságot szabályozó anyag : almasav ; étkezési só , emulgeálószer: zsírsavak mono - és digliceridjei ; dextróz , térfogatnövelő szer : nátrium-karbonátok ; tartósítószerek : kalcium-propionát , kálium-szorbát ; lisztkezelő szer : L-Cisztein.",
	ingredients_text_pl => "69% mąka pszenna , woda , olej rzepakowy , stabilizator: glicerol; gluten pszenny , regulator kwasowości : kwas jabłkowy ; sól , emuglator : mono - i diglicerydy kwasów tłuszczowych; glukoza , substancja spulchniająca: węglany sodu ; substancje konserwujące: propionian wapnia , sorbinian potasu ; środek do przetwarzania mąki: L-cysteina.",
	ingredients_text_ro => "69% făină de grâu , apă , ulei de rapiță , stabilizator: glicerol; gluten din grâu, corector de aciditate : acid malic ; sare , emulsifiant : mono - şi digliceride ale acizilor graşi; dextroză , agent de afanare : carbonați de sodiu ; conservanți : propionat de calciu, sorbat de potasiu; agent de tratare a făinii : L-cisteină.",
	ingredients_text_sk => "69% pšeničná múka , pitná voda, repkový olej , stabilizátor: glycerol; pšeničný glutén, regulátor kyslosti : kyselina jablčná ; jedlá soľ , emulgátor : mono - a diglyceridy mastných kyselín ; dextróza , kypriaca látka : uhličitany sodné ; konzervačné látky : propionan vápenatý , sorban draselný ; múku upravujúca látka : L-cystein.",
	ingredients_text_sl => "69% pšenična moka , voda , olje oljne ogrščice, stabilizator: glicerol; pšenični gluten, sredstvo za uravnavanje kislosti: jabolčna kislina ; nejodirana sol, emulgator : mono - in diglicerid! maščobnih kislin ; dekstroza, sredstvo za vzhajanje : natrijevi karbonati; konzervansa : kalcijev propionat , kalijev sorbat ; sredstvo za obdelavo moke : L-cistein. Uporabno najmanj do: glej odtis na zadnji strani embalaže.",
};

# The function has no observable result yet, so only assert that it runs
# without dying on a multilingual product; this keeps the script from
# finishing with zero assertions.
ok(
	lives { detect_taxonomy_translation_from_text($product_ref) },
	"detect_taxonomy_translation_from_text runs on a product with many languages (0036595328366)"
);

# example based on 20201845
# should suggest translations
# problem with English: some apps translate into English from other languages. This is NOT the producer's translation.
# for example: App translation (infood) probably based on RO:
# ingredients_text_en => "water, wine vinegar, mustard seeds, [mustard husks], table salt, [acidifying]: citric acid, [natural flavors of cloves], cinnamon, ginger and tarragon, antioxidant: potassium metabisulphite, spice mixture",
# versus Producer translation:
# ingredients_text_en => "water, spirit vinegar, mustard seeds, husks of mustard seeds, salt, acidity regulator: citric acid, natural flavorings, antioxidant: potassium metabisulphite, turmeric",
# in square brackets are unknown ingredients on the product
# ingredients_text_es => "Agua, vinagre de alcohol, 24,5% semillas de mostaza, [cáscara de semillas de mostaza], sal, acidulante: [ácido citico]; aromas, antioxidante: metabisulfito potásico; especia.",
# ingredients_text_hr => "Voda, alkoholni ocat, 24,5% sjemenke gorušice, [7,5% ljuske gorušice], kuhinjska sol, kiselina: limunska kiselina; arome, antioksidans: kalijev metabisulfit; začin.",
# ingredients_text_ro => "apă, oțet din vin, [semințe de muştar], [coji de muştar], sare de masă, acidifiant: acid citric, [arome naturale de cuişoare], scorțișoară, ghimbir și tarhon, antioxidant: metabisulfit de potasiu, amestec de condimente.",
# RO has more ingredients
# ES has a typo ácido citico -> Ácido cítrico
# A distinct variable name is used so that this fixture does not redeclare
# (and mask) a lexical declared earlier in the script.
my $product_missing_translations_ref = {
	ingredients_text_es => "Agua, vinagre de alcohol, 24,5% semillas de mostaza, [cáscara de semillas de mostaza], sal, acidulante: [ácido citico]; aromas, antioxidante: metabisulfito potásico; especia.",
	ingredients_text_hr => "Voda, alkoholni ocat, 24,5% sjemenke gorušice, [7,5% ljuske gorušice], kuhinjska sol, kiselina: limunska kiselina; arome, antioksidans: kalijev metabisulfit; začin.",
	ingredients_text_ro => "apă, oțet din vin, [semințe de muştar], [coji de muştar], sare de masă, acidifiant: acid citric, [arome naturale de cuişoare], scorțișoară, ghimbir și tarhon, antioxidant: metabisulfit de potasiu, amestec de condimente.",
};

# Actually exercise the fixture: it was previously built but never passed
# to the function under test.
ok(
	lives { detect_taxonomy_translation_from_text($product_missing_translations_ref) },
	"detect_taxonomy_translation_from_text runs on a product with unknown ingredients (20201845)"
);


done_testing();

0 comments on commit 782434e

Please sign in to comment.