Skip to content

Commit

Permalink
Speed up remove_stopwords(), bug #2607
Browse files Browse the repository at this point in the history
  • Loading branch information
stephanegigandet committed Nov 17, 2019
1 parent 8840369 commit e7c1d82
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 16 deletions.
1 change: 1 addition & 0 deletions cpanfile
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ requires 'Text::CSV', '>= 1.99, < 2.0';
requires 'Text::Fuzzy';
requires 'File::Copy::Recursive';
requires 'Spreadsheet::CSV';
requires 'List::MoreUtils';

# Mojolicious/Minion
requires 'Mojolicious::Lite';
Expand Down
33 changes: 17 additions & 16 deletions lib/ProductOpener/Tags.pm
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ use ProductOpener::Food qw/:all/;
use ProductOpener::Lang qw/:all/;
use ProductOpener::Text qw/:all/;
use Clone qw(clone);
use List::MoreUtils qw(uniq);

use URI::Escape::XS;
use Log::Any qw($log);
Expand Down Expand Up @@ -588,7 +589,8 @@ sub load_tags_hierarchy($$) {
}
}


# Cache of the stopword alternation patterns used by remove_stopwords().
# Keys are "$tagtype.$lc" (tag type and language code joined with a dot);
# each value is the de-duplicated stopword list for that pair joined with '|',
# built the first time remove_stopwords() needs it and reused afterwards.
my %stopwords_regexps = ();

sub remove_stopwords($$$) {

Expand All @@ -598,33 +600,32 @@ sub remove_stopwords($$$) {

if (defined $stopwords{$tagtype}{$lc}) {

my $uppercased_stopwords_overrides = 0;

if ($lc eq 'fr') {
# "Dés de tomates" -> "des-de-tomates" --> "dés" should not be a stopword
$tagid =~ s/\bdes-de\b/DES-DE/g;
$tagid =~ s/\ben-des\b/EN-DES/g;
$uppercased_stopwords_overrides = 1;
}

foreach my $stopword (@{$stopwords{$tagtype}{$lc}}) {
$tagid =~ s/-${stopword}-/-/g;
if (not defined $stopwords_regexps{$tagtype . '.' . $lc}) {
$stopwords_regexps{$tagtype . '.' . $lc} = join('|', uniq(@{$stopwords{$tagtype}{$lc}}));
}

# some stopwords should not be removed at the start or end
# this can cause issues with spellchecking tags like ingredients
# e.g. purée d'abricot -> puree d' -> urée
# ingredients: stopwords:fr:aux,au,de,le,du,la,a,et,avec,base,ou,en,proportion,variable, contient
my $regexp = $stopwords_regexps{$tagtype . '.' . $lc};

$tagid =~ s/^${stopword}-//g;
$tagid =~ s/(^|-)($regexp)(-($regexp))*(-|$)/-/g;

if (not
(($lc eq 'fr') and (($tagtype eq "ingredients") or ($tagtype eq "additives")) and not ($stopword =~ /^(en|proportion|proportions|variable|variables|et-derives)$/)) # don't remove French stopwords at the end
) {
$tagid =~ s/-${stopword}$//g;
}
}
$tagid =~ tr/-/-/s;
$tagid =~ s/^-//;
$tagid =~ s/-$//;

$tagid = lc($tagid);
if ($uppercased_stopwords_overrides) {
$tagid = lc($tagid);
}
}
return $tagid;

}


Expand Down
6 changes: 6 additions & 0 deletions t/tags.t
Original file line number Diff line number Diff line change
Expand Up @@ -411,4 +411,10 @@ is_deeply($product_ref->{stores_tags}, ["intermarche"]);
compute_field_tags($product_ref, "de", "stores");
is_deeply($product_ref->{stores_tags}, ["intermarche"]);

# remove_stopwords(tagtype, language code, tagid) on French ingredient tags.
# A stopword in the middle of the tag is dropped and the surrounding words stay joined by a single dash.
is(ProductOpener::Tags::remove_stopwords("ingredients", "fr", "correcteurs-d-acidite"), "correcteurs-acidite");
# A run of consecutive stopwords ("a-la") is dropped as a whole.
is(ProductOpener::Tags::remove_stopwords("ingredients", "fr", "yaourt-a-la-fraise"), "yaourt-fraise");
# A stopword at the start is dropped, leaving no leading dash.
is(ProductOpener::Tags::remove_stopwords("ingredients", "fr", "du-miel"), "miel");
# A run of stopwords at the end is dropped, leaving no trailing dash.
is(ProductOpener::Tags::remove_stopwords("ingredients", "fr", "fruits-en-proportion-variable"), "fruits");
# "des-de" must be left untouched: it comes from "dés de ...", where "dés" is not a stopword.
is(ProductOpener::Tags::remove_stopwords("ingredients", "fr", "des-de-tomate"), "des-de-tomate");

done_testing();

0 comments on commit e7c1d82

Please sign in to comment.