Skip to content

Commit

Permalink
chore: Add obsolete parameter to handle obsolete product dataset
Browse files: browse the repository at this point in the history
  • Loading branch information
areebahmeddd authored and raphael0202 committed Nov 4, 2024
1 parent 1c26936 commit d844fe4
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 3 deletions.
9 changes: 7 additions & 2 deletions openfoodfacts/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
Flavor.off: {
DatasetType.jsonl: "openfoodfacts-products.jsonl.gz",
DatasetType.csv: "en.openfoodfacts.org.products.csv.gz",
DatasetType.obsolete: "openfoodfacts-products_obsolete.jsonl.gz",
},
Flavor.obf: {
DatasetType.jsonl: "openbeautyfacts-products.jsonl.gz",
Expand All @@ -47,6 +46,7 @@ def get_dataset(
force_download: bool = False,
download_newer: bool = False,
cache_dir: Optional[Path] = None,
obsolete: bool = False,
) -> Path:
"""Download (and cache) Open Food Facts dataset.
Expand All @@ -61,10 +61,13 @@ def get_dataset(
version is available (based on file Etag)
:param cache_dir: the cache directory to use, defaults to
~/.cache/openfoodfacts/taxonomy
:param obsolete: if True, download the obsolete dataset, defaults to False
:return: the path of the dataset
"""
cache_dir = DEFAULT_CACHE_DIR if cache_dir is None else cache_dir
file_name = DATASET_FILE_NAMES[flavor][dataset_type]
if obsolete:
file_name = file_name.replace(".jsonl.gz", "_obsolete.jsonl.gz")
dataset_path = cache_dir / file_name
url = f"{URLBuilder.static(flavor, Environment.org)}/data/{file_name}"
cache_dir.mkdir(parents=True, exist_ok=True)
Expand All @@ -83,6 +86,7 @@ def __init__(
flavor: Flavor = Flavor.off,
dataset_type: DatasetType = DatasetType.jsonl,
dataset_path: Optional[Path] = None,
obsolete: bool = False,
**kwargs,
):
"""A product dataset.
Expand All @@ -102,6 +106,7 @@ def __init__(
to DatasetType.jsonl. This parameter is ignored if dataset_path is
provided.
:param dataset_path: the path of the dataset, defaults to None.
:param obsolete: if True, download the obsolete dataset, defaults to False.
:param kwargs: additional arguments passed to `get_dataset` when
downloading the dataset
"""
Expand All @@ -119,7 +124,7 @@ def __init__(
else:
raise ValueError(f"Unknown dataset type: {full_suffix}")
else:
self.dataset_path = get_dataset(flavor, dataset_type, **kwargs)
self.dataset_path = get_dataset(flavor, dataset_type, obsolete=obsolete, **kwargs)

def __iter__(self):
if self.dataset_type is DatasetType.jsonl:
Expand Down
1 change: 0 additions & 1 deletion openfoodfacts/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -870,7 +870,6 @@ def check_user_agent(self):
class DatasetType(str, enum.Enum):
csv = "csv"
jsonl = "jsonl"
obsolete = "obsolete"


class TaxonomyType(str, enum.Enum):
Expand Down

0 comments on commit d844fe4

Please sign in to comment.