Skip to content

Commit

Permalink
refactor: ingredients preparsing unit tests (#11063)
Browse files Browse the repository at this point in the history
This PR changes ingredients_parsing.t into ingredients_preparsing.t.
The tests are exactly the same, but instead of listing the expected
results in the .t file, we use JSON files, as we do for other tests.

This is because I will change the output of preparsing for many tests,
and it will make the test diffs of the upcoming PR easier to see.
  • Loading branch information
stephanegigandet authored Nov 28, 2024
1 parent d61c7ba commit 62b51d9
Show file tree
Hide file tree
Showing 226 changed files with 2,565 additions and 675 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "1",
"ingredients_text" : "Sel marin, blé, lécithine de soja",
"lc" : "fr",
"preparsed_ingredients_text" : "Sel marin, blé, lécithine de soja"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "10",
"ingredients_text" : "Huiles végétales de palme et d'olive",
"lc" : "fr",
"preparsed_ingredients_text" : "Huiles végétales de palme, Huiles végétales d'olive"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "100",
"ingredients_text" : "huile de tournesol, cacao maigre en poudre 5.2%",
"lc" : "fr",
"preparsed_ingredients_text" : "huile de tournesol, cacao maigre en poudre 5.2%"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "101",
"ingredients_text" : "regulatory kwasowości: kwas cytrynowy i cytryniany sodu.",
"lc" : "pl",
"preparsed_ingredients_text" : "regulatory kwasowości: kwas cytrynowy i cytryniany sodu."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "102",
"ingredients_text" : "Wasser, Kohlensäure, Farbstoff Zuckerkulör E 150d, Süßungsmittel Aspartam* und Acesulfam-K, Säuerungsmittel Phosphorsäure und Citronensäure, Säureregulator Natriumcitrat, Aroma Koffein, Aroma. enthält eine Phenylalaninquelle",
"lc" : "de",
"preparsed_ingredients_text" : "Wasser, Kohlensäure, Farbstoff : Zuckerkulör e150d, Süßungsmittel : Aspartam* und Acesulfam-K, Säuerungsmittel : Phosphorsäure und Citronensäure, Säureregulator Natriumcitrat, Aroma Koffein, Aroma. enthält eine Phenylalaninquelle"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "103",
"ingredients_text" : "Farbstoffe Betenrot, Paprikaextrakt, Kurkumin",
"lc" : "de",
"preparsed_ingredients_text" : "Farbstoffe : Betenrot, Paprikaextrakt, Kurkumin"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "104",
"ingredients_text" : "Zucker, Glukosesirup, Glukose-Fruktose-Sirup, Stärke, 8,5% Süßholzsaft, brauner Zuckersirup, modifizierte Stärke, Aromen, pflanzliches Öl (Sonnenblume), Überzugsmittel: Bienenwachs, weiß und gelb",
"lc" : "de",
"preparsed_ingredients_text" : "Zucker, Glukosesirup, Glukose-Fruktose-Sirup, Stärke, 8.5% Süßholzsaft, brauner Zuckersirup, modifizierte Stärke, Aromen, pflanzliches Öl (Sonnenblume), Überzugsmittel: Bienenwachs weiß und gelb"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "105",
"ingredients_text" : "Zucker, Glukosesirup, Glukose-Fruktose-Sirup, Stärke, 8,5% Süßholzsaft, brauner Zuckersirup, modifizierte Stärke, Aromen, pflanzliches Öl (Sonnenblume), Überzugsmittel: Bienenwachs (weiß und gelb)",
"lc" : "de",
"preparsed_ingredients_text" : "Zucker, Glukosesirup, Glukose-Fruktose-Sirup, Stärke, 8.5% Süßholzsaft, brauner Zuckersirup, modifizierte Stärke, Aromen, pflanzliches Öl (Sonnenblume), Überzugsmittel: Bienenwachs weiß und gelb"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "106",
"ingredients_text" : "graisse végétale bio (colza)",
"lc" : "fr",
"preparsed_ingredients_text" : "graisse végétale bio de colza"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "107",
"ingredients_text" : "huiles végétales* (huile de tournesol*, huile de colza*). *Ingrédients issus de l'agriculture biologique",
"lc" : "fr",
"preparsed_ingredients_text" : "huiles végétales Bio (huile de tournesol Bio, huile de colza Bio )."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "108",
"ingredients_text" : "huile biologique (tournesol, olive)",
"lc" : "fr",
"preparsed_ingredients_text" : "huile biologique de tournesol, huile biologique d'olive"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"comment" : "xyz: test an unrecognized oil -> do not change",
"id" : "109",
"ingredients_text" : "huile biologique (tournesol, xyz)",
"lc" : "fr",
"preparsed_ingredients_text" : "huile biologique (tournesol, xyz)"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "11",
"ingredients_text" : "Huiles végétales de palme, de colza et de tournesol",
"lc" : "fr",
"preparsed_ingredients_text" : "Huiles végétales de palme, Huiles végétales de colza, Huiles végétales de tournesol"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "110",
"ingredients_text" : "huiles biologiques (tournesol, olive)",
"lc" : "fr",
"preparsed_ingredients_text" : "huiles biologiques de tournesol, huiles biologiques d'olive"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "111",
"ingredients_text" : "huiles (tournesol*, olive). * : bio",
"lc" : "fr",
"preparsed_ingredients_text" : "huiles de tournesol Bio, huiles d'olive."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "112",
"ingredients_text" : "huiles* (tournesol*, olive vierge extra), sel marin. *issus de l'agriculture biologique.",
"lc" : "fr",
"preparsed_ingredients_text" : "huiles Bio de tournesol Bio, huiles Bio d'olive vierge extra, sel marin."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "113",
"ingredients_text" : "riz de Camargue (1), sel. (1): IGP : Indication Géographique Protégée.",
"lc" : "fr",
"preparsed_ingredients_text" : "riz de Camargue IGP, sel."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "114",
"ingredients_text" : "cacao (1), sucre (2), beurre de cacao (1). (1) : Commerce équitable. (2) Issue de l'agriculture biologique.",
"lc" : "fr",
"preparsed_ingredients_text" : "cacao Commerce équitable, sucre Bio, beurre de cacao Commerce équitable."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "115",
"ingredients_text" : "Céréales 63,7% (BLE complet 50,5%*, semoule de maïs*), sucre*, sirop de BLE*, cacao maigre en poudre 3,9%*, cacao en poudre 1,7%*, sel, arôme naturel. *Ingrédients issus de l'agriculture biologique.",
"lc" : "fr",
"preparsed_ingredients_text" : "Céréales 63.7% (BLE complet 50.5% Bio, semoule de maïs Bio ), sucre Bio, sirop de BLE Bio, cacao maigre en poudre 3.9% Bio, cacao en poudre 1.7% Bio, sel, arôme naturel."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "116",
"ingredients_text" : "émulsifiant : mono - et diglycérides d'acides gras.",
"lc" : "fr",
"preparsed_ingredients_text" : "émulsifiant : mono- et diglycérides d'acides gras."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "117",
"ingredients_text" : "Sucre. Fabriqué dans un atelier qui utilise des fruits à coques.",
"lc" : "fr",
"preparsed_ingredients_text" : "Sucre. Traces éventuelles : fruits à coques."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "118",
"ingredients_text" : "Sucre. Fabriqué dans un atelier utilisant des fruits à coques et du sésame.",
"lc" : "fr",
"preparsed_ingredients_text" : "Sucre. Traces éventuelles : fruits à coques, Traces éventuelles : sésame."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "119",
"ingredients_text" : "Sucre. Fabriqué dans un atelier qui manipule du lait, de la moutarde et du céleri.",
"lc" : "fr",
"preparsed_ingredients_text" : "Sucre. Traces éventuelles : lait, Traces éventuelles : moutarde, Traces éventuelles : céleri."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "12",
"ingredients_text" : "Huiles végétales de palme, de colza, de tournesol",
"lc" : "fr",
"preparsed_ingredients_text" : "Huiles végétales de palme, Huiles végétales de colza, Huiles végétales de tournesol"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "120",
"ingredients_text" : "Sucre. Peut contenir des fruits à coques et du sésame.",
"lc" : "fr",
"preparsed_ingredients_text" : "Sucre. Traces éventuelles : fruits à coques, Traces éventuelles : sésame."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "121",
"ingredients_text" : "vegetable oil (coconut & rapeseed)",
"lc" : "en",
"preparsed_ingredients_text" : "coconut vegetable oil, rapeseed vegetable oil"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "122",
"ingredients_text" : "Masse de cacao°, Quinoa° (1,8%). °Produits issus de l'agriculture biologique.",
"lc" : "fr",
"preparsed_ingredients_text" : "Masse de cacao Bio, Quinoa Bio (1.8%)."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "123",
"ingredients_text" : "Emulgator (Sojalecithine, Mono - und Diglyceride von Speisefettsäuren, Sorbitantristearat)",
"lc" : "de",
"preparsed_ingredients_text" : "Emulgator (Sojalecithine, mono- und Diglyceride von Speisefettsäuren, Sorbitantristearat)"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "124",
"ingredients_text" : "Tomates* (20%). *Ingrédients Bio",
"lc" : "fr",
"preparsed_ingredients_text" : "Tomates Bio (20%)."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "125",
"ingredients_text" : "Tomates* (20%). *Ingrédients biologiques",
"lc" : "fr",
"preparsed_ingredients_text" : "Tomates Bio (20%)."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "126",
"ingredients_text" : "Chocolat. Contient du lait et des noisettes. Peut contenir du blé, du soja et des crustacés.",
"lc" : "fr",
"preparsed_ingredients_text" : "Chocolat. Substances ou produits provoquant des allergies ou intolérances : lait, Substances ou produits provoquant des allergies ou intolérances : noisettes. Traces éventuelles : blé, Traces éventuelles : soja, Traces éventuelles : crustacés."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "127",
"ingredients_text" : "Chocolate. Contains milk, hazelnuts and other nuts. May contain celery and mustard.",
"lc" : "en",
"preparsed_ingredients_text" : "Chocolate. Substances or products causing allergies or intolerances : milk, Substances or products causing allergies or intolerances : hazelnuts, Substances or products causing allergies or intolerances : other nuts. Traces : celery, Traces : mustard."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "128",
"ingredients_text" : "phosphates d'ammonium et de calcium, Phosphate d'aluminium et de sodium, diphosphate d'aluminium et de sodium",
"lc" : "fr",
"preparsed_ingredients_text" : "phosphates d'ammonium, phosphates de calcium, phosphate d'aluminium et de sodium, diphosphate d'aluminium et de sodium"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "129",
"ingredients_text" : "Ingrédient(s) : lentilles vertes* - *issu(e)(s) de l'agriculture biologique.",
"lc" : "fr",
"preparsed_ingredients_text" : "Ingrédients : lentilles vertes Bio"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "13",
"ingredients_text" : "Huiles végétales de palme, de colza et d'olive en proportion variable",
"lc" : "fr",
"preparsed_ingredients_text" : "Huiles végétales de palme, Huiles végétales de colza, Huiles végétales d'olive"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "130",
"ingredients_text" : "S. thermophilus, L casei, L.bulgaricus",
"lc" : "en",
"preparsed_ingredients_text" : "streptococcus thermophilus, lactobacillus casei, lactobacillus bulgaricus"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "131",
"ingredients_text" : "jus de citron*. *Ingrédients issus de l'agriculture biologique Peut contenir : œuf, moutarde, graine de sésame, poisson,soja, lait,fruits à coque, céleri.",
"lc" : "fr",
"preparsed_ingredients_text" : "jus de citron Bio. , Traces éventuelles : œuf, Traces éventuelles : moutarde, Traces éventuelles : graine de sésame, Traces éventuelles : poisson, Traces éventuelles : soja, Traces éventuelles : lait, Traces éventuelles : fruits à coque, Traces éventuelles : céleri."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "132",
"ingredients_text" : "Farine, levure. Peut contenir des traces de _soja_, _amandes_, _noisettes_ et _noix de cajou_.",
"lc" : "fr",
"preparsed_ingredients_text" : "Farine, levure. Traces éventuelles : _soja_, Traces éventuelles : _amandes_, Traces éventuelles : _noisettes_, Traces éventuelles : _noix de cajou_."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "133",
"ingredients_text" : "Agua, aceite de girasol*. * Ingredientes ecológicos.",
"lc" : "es",
"preparsed_ingredients_text" : "Agua, aceite de girasol Ecológico."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "134",
"ingredients_text" : "Agua, aceite de girasol*, arroz* (5 %). (*) Ingredientes ecológicos.",
"lc" : "es",
"preparsed_ingredients_text" : "Agua, aceite de girasol Ecológico, arroz Ecológico (5 %)."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "135",
"ingredients_text" : "Tofu* 88% (agua, habas de soja*). *cumple con el reglamento de agricultura ecológica CE 2092/91",
"lc" : "es",
"preparsed_ingredients_text" : "Tofu Ecológico 88% (agua, habas de soja Ecológico )."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "136",
"ingredients_text" : "agua, almendra* (5,5%). *= procedentes de la agricultura ecológica",
"lc" : "es",
"preparsed_ingredients_text" : "agua, almendra Ecológico (5.5%)."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"comment" : "test for bug #3273 that introduced unwanted separators before natural flavor",
"id" : "137",
"ingredients_text" : "non-gmo natural flavor",
"lc" : "en",
"preparsed_ingredients_text" : "non-gmo natural flavor"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "138",
"ingredients_text" : "vit. e, vitamins b2, B3 and K, vit d, vit a & c, vit. B12",
"lc" : "en",
"preparsed_ingredients_text" : "vitamin e, vitamins, vitamin b2, vitamin B3, vitamin K, vitamin d, vitamin a, vitamin c, vitamin B12"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "139",
"ingredients_text" : "vit. pp, vit c, vit. a et b6",
"lc" : "fr",
"preparsed_ingredients_text" : "vitamines, vitamine pp, Vitamine c, Vitamine a, vitamine b6"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "14",
"ingredients_text" : "Huiles végétales de palme, de colza et d'olive",
"lc" : "fr",
"preparsed_ingredients_text" : "Huiles végétales de palme, Huiles végétales de colza, Huiles végétales d'olive"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "140",
"ingredients_text" : "witaminy A i D",
"lc" : "pl",
"preparsed_ingredients_text" : "witaminy, witamina A, witamina D"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "141",
"ingredients_text" : "colorant de surface : caramel ordinaire, agent de traitement de farine (E300), acide citrique",
"lc" : "fr",
"preparsed_ingredients_text" : "colorant de surface : caramel ordinaire, agent de traitement de farine (e300), acide citrique"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "142",
"ingredients_text" : "Agua, edulcorantes (INS420, INS 960, INS N'952, INS N°954, INS°950, INS N 955), conservantes (INS.218, INS #202, INS N 216).",
"lc" : "es",
"preparsed_ingredients_text" : "Agua, edulcorantes (e420, e960, e952, e954, e950, e955), conservantes (e218, e202, e216)."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "143",
"ingredients_text" : "Vitamina E y C",
"lc" : "es",
"preparsed_ingredients_text" : "vitaminas, vitamina E, vitamina C"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "144",
"ingredients_text" : "color E 124",
"lc" : "es",
"preparsed_ingredients_text" : "color : e124"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "145",
"ingredients_text" : "colores E (124, 125)",
"lc" : "es",
"preparsed_ingredients_text" : "colores e124, e125"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "146",
"ingredients_text" : "vitamine A, B, E e K",
"lc" : "it",
"preparsed_ingredients_text" : "vitamine, vitamina A, vitamina B, vitamina E, vitamina K"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "147",
"ingredients_text" : "E 102, E-104 color, E-101(i), E101 (ii), E160a(iv), e172-i, E-160 i",
"lc" : "en",
"preparsed_ingredients_text" : "e102, e104 color, e101i, e101ii, e160aiv, e172i, e160i"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "148",
"ingredients_text" : "E102-E1400",
"lc" : "fr",
"preparsed_ingredients_text" : "e102 - e1400"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"id" : "149",
"ingredients_text" : "E172i-E174ii, E102(i)-E101i",
"lc" : "de",
"preparsed_ingredients_text" : "e172i - e174ii, e102i - e101i"
}
Loading

0 comments on commit 62b51d9

Please sign in to comment.