From e7c1d8282ba117f6c7e1abf5b2c78b3ca928b715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gigandet?= Date: Sun, 17 Nov 2019 14:50:02 +0000 Subject: [PATCH] speed up remove_stopwords() , bug #2607 --- cpanfile | 1 + lib/ProductOpener/Tags.pm | 33 +++++++++++++++++---------------- t/tags.t | 6 ++++++ 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/cpanfile b/cpanfile index 4579a3e86ff75..7466a7e9cd287 100644 --- a/cpanfile +++ b/cpanfile @@ -50,6 +50,7 @@ requires 'Text::CSV', '>= 1.99, < 2.0'; requires 'Text::Fuzzy'; requires 'File::Copy::Recursive'; requires 'Spreadsheet::CSV'; +requires 'List::MoreUtils'; # Mojolicious/Minion requires 'Mojolicious::Lite'; diff --git a/lib/ProductOpener/Tags.pm b/lib/ProductOpener/Tags.pm index 0ee42273d9284..fe26e352bc66b 100644 --- a/lib/ProductOpener/Tags.pm +++ b/lib/ProductOpener/Tags.pm @@ -129,6 +129,7 @@ use ProductOpener::Food qw/:all/; use ProductOpener::Lang qw/:all/; use ProductOpener::Text qw/:all/; use Clone qw(clone); +use List::MoreUtils qw(uniq); use URI::Escape::XS; use Log::Any qw($log); @@ -588,7 +589,8 @@ sub load_tags_hierarchy($$) { } } - +# Cache the stopwords regexp +my %stopwords_regexps = (); sub remove_stopwords($$$) { @@ -598,33 +600,32 @@ sub remove_stopwords($$$) { if (defined $stopwords{$tagtype}{$lc}) { + my $uppercased_stopwords_overrides = 0; + if ($lc eq 'fr') { # "Dés de tomates" -> "des-de-tomates" --> "dés" should not be a stopword $tagid =~ s/\bdes-de\b/DES-DE/g; $tagid =~ s/\ben-des\b/EN-DES/g; + $uppercased_stopwords_overrides = 1; } - foreach my $stopword (@{$stopwords{$tagtype}{$lc}}) { - $tagid =~ s/-${stopword}-/-/g; + if (not defined $stopwords_regexps{$tagtype . '.' . $lc}) { + $stopwords_regexps{$tagtype . '.' . $lc} = join('|', uniq(@{$stopwords{$tagtype}{$lc}})); + } - # some stopwords should not be removed at the start or end - # this can cause issues with spellchecking tags like ingredients - # e.g. purée d'abricot -> puree d' -> urée - # ingredients: stopwords:fr:aux,au,de,le,du,la,a,et,avec,base,ou,en,proportion,variable, contient + my $regexp = $stopwords_regexps{$tagtype . '.' . $lc}; - $tagid =~ s/^${stopword}-//g; + $tagid =~ s/(^|-)($regexp)(-($regexp))*(-|$)/-/g; - if (not - (($lc eq 'fr') and (($tagtype eq "ingredients") or ($tagtype eq "additives")) and not ($stopword =~ /^(en|proportion|proportions|variable|variables|et-derives)$/)) # don't remove French stopwords at the end - ) { - $tagid =~ s/-${stopword}$//g; - } - } + $tagid =~ tr/-/-/s; + $tagid =~ s/^-//; + $tagid =~ s/-$//; - $tagid = lc($tagid); + if ($uppercased_stopwords_overrides) { + $tagid = lc($tagid); + } } return $tagid; - } diff --git a/t/tags.t b/t/tags.t index e2bbeff9a2fc1..1f62cb6006b6b 100644 --- a/t/tags.t +++ b/t/tags.t @@ -411,4 +411,10 @@ is_deeply($product_ref->{stores_tags}, ["intermarche"]); compute_field_tags($product_ref, "de", "stores"); is_deeply($product_ref->{stores_tags}, ["intermarche"]); +is(ProductOpener::Tags::remove_stopwords("ingredients", "fr", "correcteurs-d-acidite"), "correcteurs-acidite"); +is(ProductOpener::Tags::remove_stopwords("ingredients", "fr", "yaourt-a-la-fraise"), "yaourt-fraise"); +is(ProductOpener::Tags::remove_stopwords("ingredients", "fr", "du-miel"), "miel"); +is(ProductOpener::Tags::remove_stopwords("ingredients", "fr", "fruits-en-proportion-variable"), "fruits"); +is(ProductOpener::Tags::remove_stopwords("ingredients", "fr", "des-de-tomate"), "des-de-tomate"); + done_testing();