-
-
Notifications
You must be signed in to change notification settings - Fork 400
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
765d796
commit 782434e
Showing
3 changed files
with
152 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
# This file is part of Product Opener. | ||
# | ||
# Product Opener | ||
# Copyright (C) 2011-2023 Association Open Food Facts | ||
# Contact: [email protected] | ||
# Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France | ||
# | ||
# Product Opener is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU Affero General Public License as | ||
# published by the Free Software Foundation, either version 3 of the | ||
# License, or (at your option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU Affero General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU Affero General Public License | ||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
|
||
=encoding UTF-8 | ||
=head1 NAME | ||
ProductOpener::TaxonomiesEnhancer - analyze ingredients and other fields to enrich the taxonomies | ||
=head1 SYNOPSIS | ||
C<ProductOpener::TaxonomiesEnhancer> analyzes ingredients | ||
and other fields to enrich the taxonomies. | ||
use ProductOpener::TaxonomiesEnhancer qw/:all/; | ||
[..] | ||
detect_taxonomy_translation_from_text($product_ref); | ||
=head1 DESCRIPTION | ||
[..] | ||
=cut | ||
|
||
package ProductOpener::TaxonomiesEnhancer; | ||
|
||
use ProductOpener::PerlStandards; | ||
use Exporter qw< import >; | ||
|
||
BEGIN {
	# Package globals used by Exporter; declared with "use vars" so they stay
	# visible to code outside this block.
	use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS);

	# Symbols exported on request only; nothing is exported by default.
	@EXPORT_OK = ('&detect_taxonomy_translation_from_text');

	# The ":all" tag imports every symbol listed in @EXPORT_OK.
	%EXPORT_TAGS = ('all' => [@EXPORT_OK]);
}
|
||
use vars @EXPORT_OK; | ||
# use experimental 'smartmatch'; | ||
# use Encode; | ||
# use Clone qw(clone); | ||
# use LWP::UserAgent; | ||
# use Encode; | ||
# use JSON::MaybeXS; | ||
use Log::Any qw($log); | ||
# use List::MoreUtils qw(uniq); | ||
# use Data::DeepAccess qw(deep_get deep_exists); | ||
|
||
# use ProductOpener::Store qw/get_string_id_for_lang unac_string_perl/; | ||
# use ProductOpener::Config qw/:all/; | ||
# use ProductOpener::Users qw/:all/; | ||
# use ProductOpener::Tags qw/:all/; | ||
# use ProductOpener::Products qw/remove_fields/; | ||
# use ProductOpener::URL qw/:all/; | ||
# use ProductOpener::Images qw/extract_text_from_image/; | ||
# use ProductOpener::Lang qw/$lc %Lang lang/; | ||
# use ProductOpener::Units qw/normalize_quantity/; | ||
# use ProductOpener::Food qw/is_fat_oil_nuts_seeds_for_nutrition_score/; | ||
use ProductOpener::Ingredients qw/parse_ingredients_text_service/; | ||
|
||
|
||
|
||
=head2 detect_taxonomy_translation_from_text ( product_ref ) | ||
This function extracts data for each language from the provided product reference. | ||
It then detects failed extractions (missing stop words) and identifies missing translations. | ||
=head3 Arguments | ||
=head4 product_ref | ||
A reference to the product data, which is expected to be a hash reference containing the necessary information. | ||
=head3 Return value | ||
This function does not return any value. It performs the extraction and detection internally. | ||
=cut | ||
|
||
sub detect_taxonomy_translation_from_text ($product_ref) {
	# Trace entry through Log::Any only: the message is emitted when a
	# debug-level adapter is configured, rather than unconditionally on STDERR.
	$log->debug("detect_taxonomy_translation_from_text - start") if $log->is_debug();

	# Placeholder implementation: $product_ref is not inspected yet.
	# A bare return gives callers nothing (undef / empty list) instead of
	# leaking the value of the last evaluated statement.
	return;
}
|
||
1; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#!/usr/bin/perl -w | ||
|
||
use Modern::Perl '2017'; | ||
use utf8; | ||
|
||
use Test2::V0; | ||
use Log::Any::Adapter 'TAP'; | ||
|
||
use ProductOpener::TaxonomiesEnhancer qw/detect_taxonomy_translation_from_text/; | ||
|
||
|
||
# Example based on product 0036595328366: the same ingredient list in seven
# languages. It should let the enhancer detect stopwords.
my $product_ref = {
	ingredients_text_cs => "69% pšeničná mouka , pitná voda, řepkový olej , stabilizátor: glycerol; pšeničný lepek , regulátor kyselosti : kyselina jablečná; jedlá sůl , emulgátor : mono - a diglyceridy mastných kyselin ; dextróza , kypřící látka : uhličitany sodné ; konzervanty : propionan vápenatý , sorban draselný ; látka zlepšující mouku : L-cystein. Skladujte v suchu a chraňte před teplem.",
	ingredients_text_hr => "69% pšenično brašno, voda , repičino ulje , stabilizator. glicerol; pšenični gluten, regulator kiselosti : jabučna kiselina ; kuhinjska sol , emulgator : mono - i digliceridi masnih kiselina ; dekstroza, tvar za rahljenje : natrijevi karbonati; konzervansi : kalcijev propionat , kalijev sorbat ; tvar za tretiranje brašna : L-cistein. Čuvati na suhom mjestu.",
	ingredients_text_hu => "69% búzaliszt , ivóvíz , repceolaj , stabilizátor: glicerin; búzaglutén , savanyúságot szabályozó anyag : almasav ; étkezési só , emulgeálószer: zsírsavak mono - és digliceridjei ; dextróz , térfogatnövelő szer : nátrium-karbonátok ; tartósítószerek : kalcium-propionát , kálium-szorbát ; lisztkezelő szer : L-Cisztein.",
	ingredients_text_pl => "69% mąka pszenna , woda , olej rzepakowy , stabilizator: glicerol; gluten pszenny , regulator kwasowości : kwas jabłkowy ; sól , emuglator : mono - i diglicerydy kwasów tłuszczowych; glukoza , substancja spulchniająca: węglany sodu ; substancje konserwujące: propionian wapnia , sorbinian potasu ; środek do przetwarzania mąki: L-cysteina.",
	ingredients_text_ro => "69% făină de grâu , apă , ulei de rapiță , stabilizator: glicerol; gluten din grâu, corector de aciditate : acid malic ; sare , emulsifiant : mono - şi digliceride ale acizilor graşi; dextroză , agent de afanare : carbonați de sodiu ; conservanți : propionat de calciu, sorbat de potasiu; agent de tratare a făinii : L-cisteină.",
	ingredients_text_sk => "69% pšeničná múka , pitná voda, repkový olej , stabilizátor: glycerol; pšeničný glutén, regulátor kyslosti : kyselina jablčná ; jedlá soľ , emulgátor : mono - a diglyceridy mastných kyselín ; dextróza , kypriaca látka : uhličitany sodné ; konzervačné látky : propionan vápenatý , sorban draselný ; múku upravujúca látka : L-cystein.",
	ingredients_text_sl => "69% pšenična moka , voda , olje oljne ogrščice, stabilizator: glicerol; pšenični gluten, sredstvo za uravnavanje kislosti: jabolčna kislina ; nejodirana sol, emulgator : mono - in diglicerid! maščobnih kislin ; dekstroza, sredstvo za vzhajanje : natrijevi karbonati; konzervansa : kalcijev propionat , kalijev sorbat ; sredstvo za obdelavo moke : L-cistein. Uporabno najmanj do: glej odtis na zadnji strani embalaže.",
};

# The function returns nothing, so the observable contract for now is that it
# completes without dying; record that as an actual test result.
ok(lives { detect_taxonomy_translation_from_text($product_ref) },
	'detect_taxonomy_translation_from_text lives on multilingual ingredient lists')
	or note($@);
|
||
# example based on 20201845 | ||
# should suggest translations | ||
# problem with english: some app translated in english from other languages. NOT producer translation. | ||
# for example: App translation (infood) probably based on RO: | ||
# ingredients_text_en => "water, wine vinegar, mustard seeds, [mustard husks], table salt, [acidifying]: citric acid, [natural flavors of cloves], cinnamon, ginger and tarragon, antioxidant: potassium metabisulphite, spice mixture", | ||
# versus Producer translation: | ||
# ingredients_text_en => "water, spirit vinegar, mustard seeds, husks of mustard seeds, salt, acidity regulator: citric acid, natural flavorings, antioxidant: potassium metabisulphite, turmeric", | ||
# in square brackets are unknown ingredients on the product | ||
# ingredients_text_es => "Agua, vinagre de alcohol, 24,5% semillas de mostaza, [cáscara de semillas de mostaza], sal, acidulante: [ácido citico]; aromas, antioxidante: metabisulfito potásico; especia.", | ||
# ingredients_text_hr => "Voda, alkoholni ocat, 24,5% sjemenke gorušice, [7,5% ljuske gorušice], kuhinjska sol, kiselina: limunska kiselina; arome, antioksidans: kalijev metabisulfit; začin.", | ||
# ingredients_text_ro => "apă, oțet din vin, [semințe de muştar], [coji de muştar], sare de masă, acidifiant: acid citric, [arome naturale de cuişoare], scorțișoară, ghimbir și tarhon, antioxidant: metabisulfit de potasiu, amestec de condimente.", | ||
# RO has more ingredients | ||
# ES has a typo ácido citico -> Ácido cítrico | ||
# Fixture for the translation-suggestion case. It uses its own variable name so
# it does not redeclare a lexical already defined earlier in this script.
my $translations_product_ref = {
	ingredients_text_es => "Agua, vinagre de alcohol, 24,5% semillas de mostaza, [cáscara de semillas de mostaza], sal, acidulante: [ácido citico]; aromas, antioxidante: metabisulfito potásico; especia.",
	ingredients_text_hr => "Voda, alkoholni ocat, 24,5% sjemenke gorušice, [7,5% ljuske gorušice], kuhinjska sol, kiselina: limunska kiselina; arome, antioksidans: kalijev metabisulfit; začin.",
	ingredients_text_ro => "apă, oțet din vin, [semințe de muştar], [coji de muştar], sare de masă, acidifiant: acid citric, [arome naturale de cuişoare], scorțișoară, ghimbir și tarhon, antioxidant: metabisulfit de potasiu, amestec de condimente.",
};

# Exercise the fixture: the function returns nothing, so the check is that it
# completes without dying.
ok(lives { detect_taxonomy_translation_from_text($translations_product_ref) },
	'detect_taxonomy_translation_from_text lives on partially translated ingredient lists')
	or note($@);

done_testing();