diff --git a/fasttext_amazon_reviews/fasttext_amazon_reviews.ipynb b/fasttext_amazon_reviews/fasttext_amazon_reviews.ipynb
index 6dbe47b..8c64d9a 100644
--- a/fasttext_amazon_reviews/fasttext_amazon_reviews.ipynb
+++ b/fasttext_amazon_reviews/fasttext_amazon_reviews.ipynb
@@ -48,14 +48,15 @@
"outputs": [],
"source": [
"import json\n",
- "from cleanlab.models.fasttext import FastTextClassifier, data_loader\n",
"import cleanlab\n",
"import numpy as np\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.model_selection import ParameterGrid\n",
"import os\n",
- "from datetime import datetime as dt"
+ "from datetime import datetime as dt\n",
+ "\n",
+ "from fasttext_wrapper import FastTextClassifier, data_loader"
]
},
{
diff --git a/fasttext_amazon_reviews/fasttext_wrapper.py b/fasttext_amazon_reviews/fasttext_wrapper.py
new file mode 100644
index 0000000..a396bac
--- /dev/null
+++ b/fasttext_amazon_reviews/fasttext_wrapper.py
@@ -0,0 +1,310 @@
+# Copyright (C) 2017-2023 Cleanlab Inc.
+# This file is part of cleanlab.
+#
+# cleanlab is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# cleanlab is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
+
+"""
+Text classification with fastText models that are compatible with cleanlab.
+This module allows you to easily find label issues in your text datasets.
+
+You must have fastText installed (version 0.9.2 or lower): ``pip install "fasttext==0.9.2"``.
+Version 0.9.3 has a regression bug, and the official fastText package has been archived on GitHub.
+
+Tips:
+
+* Check out our example using this class: the ``fasttext_amazon_reviews`` notebook in this folder.
+* The cleanlab unit tests also provide basic usage examples.
+
+"""
+
+import time
+import os
+import copy
+import numpy as np
+from sklearn.base import BaseEstimator
+from fasttext import train_supervised, load_model
+
+
LABEL = "__label__"
NEWLINE = " __newline__ "


def data_loader(
    fn=None,
    indices=None,
    label=LABEL,
    batch_size=1000,
):
    """Stream a fastText-format file, yielding ``(labels, texts)`` list pairs.

    Parameters
    ----------
    fn : str
        Path to a fastText-format data file. Each example begins on a line
        starting with `label` (e.g. ``__label__pos some text``) and may span
        multiple lines; embedded newlines are re-encoded as ``__newline__``.
    indices : iterable of int, optional
        If provided, only the examples at these 0-based positions are
        yielded. Items are always yielded in file order, regardless of the
        order of `indices`.
    label : str, optional
        Token marking the start of a new example (default ``__label__``).
    batch_size : int, optional
        Maximum number of examples per yielded batch.

    Yields
    ------
    tuple of (list, list)
        ``(labels, texts)``, both lists holding up to `batch_size` items.
    """

    def _split_labels_and_text(batch):
        # Each entry is "<label> <text>": split on the first space only, then
        # transpose into one list of labels and one list of texts.
        pairs = (entry.split(" ", 1) for entry in batch)
        labels, texts = [list(column) for column in zip(*pairs)]
        return labels, texts

    # Keep the requested indices as a stack with the smallest index on top,
    # so they can be popped off while sweeping the file in order.
    if indices is not None:
        stack_indices = sorted(indices, reverse=True)
        stack_idx = stack_indices.pop()

    with open(fn, "r") as f:
        len_label = len(label)
        idx = 0  # index of the example currently accumulated in `prev`
        batch_counter = 0
        # One-line lookahead: `prev` accumulates the current (possibly
        # multi-line) example and is flushed when the next example starts.
        prev = f.readline()
        batch = []
        while True:
            # NOTE: readline() returns "" at EOF (it never raises EOFError),
            # so EOF is handled by the `line == ""` checks below.
            line = f.readline()
            # A line starting with the label token (or EOF) closes the
            # example held in `prev`.
            if line[:len_label] == label or line == "":
                if indices is None or stack_idx == idx:
                    batch.append(prev.strip().replace("\n", NEWLINE))
                    batch_counter += 1

                    if indices is not None:
                        if len(stack_indices):
                            stack_idx = stack_indices.pop()
                        else:  # All requested indices seen: stop early.
                            yield _split_labels_and_text(batch)
                            break
                prev = ""
                idx += 1
                if batch_counter == batch_size:
                    yield _split_labels_and_text(batch)
                    # Reset batch
                    batch_counter = 0
                    batch = []
            prev += line
            if line == "":  # EOF: flush whatever remains, then stop.
                if len(batch) > 0:
                    yield _split_labels_and_text(batch)
                break
+
+
class FastTextClassifier(BaseEstimator):  # Inherits sklearn base classifier
    """Instantiate a fastText classifier that is compatible with :py:class:`CleanLearning <cleanlab.classification.CleanLearning>`.

    Parameters
    ----------
    train_data_fn: str
        File name of the training data in the format compatible with fastText
        (each example starts with a ``__label__`` token).

    test_data_fn: str, optional
        File name of the test data in the format compatible with fastText.

    labels: iterable, optional
        Class labels. If None, labels are discovered by scanning the train
        (and, if provided, test) data files.

    tmp_dir: str, optional
        Directory for temporary files.

    label: str, optional
        Token marking the start of each example (default ``__label__``).

    del_intermediate_data: bool, optional
        If True, masked data files created during ``fit`` are deleted.

    kwargs_train_supervised: dict, optional
        Extra keyword arguments forwarded to ``fasttext.train_supervised``.

    p_at_k: int, optional
        Default ``k`` used by ``score`` for precision@k.

    batch_size: int, optional
        Number of examples per batch when streaming the data files.
    """

    def __init__(
        self,
        train_data_fn,
        test_data_fn=None,
        labels=None,
        tmp_dir="",
        label=LABEL,
        del_intermediate_data=True,
        kwargs_train_supervised=None,
        p_at_k=1,
        batch_size=1000,
    ):
        self.train_data_fn = train_data_fn
        self.test_data_fn = test_data_fn
        self.tmp_dir = tmp_dir
        self.label = label
        self.del_intermediate_data = del_intermediate_data
        # Default is None (not {}) to avoid the mutable-default-argument
        # pitfall of one dict shared across all instances.
        self.kwargs_train_supervised = (
            {} if kwargs_train_supervised is None else kwargs_train_supervised
        )
        self.p_at_k = p_at_k
        self.batch_size = batch_size
        self.clf = None
        self.labels = labels

        if labels is None:
            # Find all class labels across the train and test set (if provided).
            # Loop variable deliberately named `batch_labels` so it does not
            # shadow the `labels` parameter.
            unique_labels = set()
            for batch_labels, _ in data_loader(fn=train_data_fn, batch_size=batch_size):
                unique_labels.update(batch_labels)
            if test_data_fn is not None:
                for batch_labels, _ in data_loader(fn=test_data_fn, batch_size=batch_size):
                    unique_labels.update(batch_labels)
        else:
            # Prepend labels with self.label token (e.g. '__label__').
            unique_labels = [label + str(l) for l in labels]
        # Create maps: label strings <-> integers when label strings are used.
        # Sorting fixes the integer encoding relied on by predict_proba.
        unique_labels = sorted(unique_labels)
        self.label2num = dict(zip(unique_labels, range(len(unique_labels))))
        self.num2label = {num: lab for lab, num in self.label2num.items()}

    def _create_train_data(self, data_indices):
        """Return the filename of the fastText data file masked to `data_indices`.

        Items are written in the order they appear in the file, regardless
        of the order of `data_indices`. If `data_indices` is None, the
        original training file is returned unchanged (nothing is rewritten).
        """
        if data_indices is None:
            # X indexes all training data: no need to rewrite the file.
            self.masked_data_was_created = False
            return self.train_data_fn
        len_label = len(LABEL)
        data_indices = sorted(data_indices, reverse=True)
        masked_fn = "fastTextClf_" + str(int(time.time())) + ".txt"
        # Stream the training file once, copying only the requested examples.
        # Both files are opened once (the original reopened the output in
        # append mode for every matching line).
        with open(self.train_data_fn, "r") as rf, open(masked_fn, "w") as wf:
            idx = 0
            data_idx = data_indices.pop()
            for line in rf:
                # Mask by data_indices.
                if idx == data_idx:
                    wf.write(line.strip().replace("\n", NEWLINE) + "\n")
                    if line[:len_label] == LABEL:
                        if len(data_indices):
                            data_idx = data_indices.pop()
                        else:
                            break
                # Increment the data index only on lines starting with
                # __label__; this supports text data containing '\n'.
                if line[:len_label] == LABEL:
                    idx += 1
        self.masked_data_was_created = True
        return masked_fn

    def _remove_masked_data(self, fn):
        """Delete the intermediate masked data file, if one was created."""
        if self.del_intermediate_data and self.masked_data_was_created:
            os.remove(fn)

    def __deepcopy__(self, memo):
        """Deep-copy this estimator.

        The fastText model object cannot be deep-copied directly, so it is
        round-tripped through a temporary file instead.
        """
        if self.clf is None:
            self_clf_copy = None
        else:
            fn = "tmp_{}.fasttext.model".format(int(time.time()))
            self.clf.save_model(fn)
            self_clf_copy = load_model(fn)
            os.remove(fn)
        # Deep-copy all attributes except self.clf, without mutating
        # self.__dict__ in the process.
        params_copy = copy.deepcopy(
            {k: v for k, v in self.__dict__.items() if k != "clf"}
        )
        params_copy["clf"] = self_clf_copy
        clf_copy = FastTextClassifier(self.train_data_fn)
        clf_copy.__dict__ = params_copy
        return clf_copy

    def fit(self, X=None, y=None, sample_weight=None):
        """Train the fastText classifier.

        Typical usage requires NO parameters, just ``clf.fit()``.

        Parameters
        ----------
        X : iterable, e.g. list, numpy array (default None)
            The list of indices of the training examples to use.
            When in doubt, set as None. None defaults to range(len(data)).
        y : None
            Leave this as None. It's a filler to suit sklearn's reqs.
        sample_weight : None
            Leave this as None. It's a filler to suit sklearn's reqs.

        Returns
        -------
        self
            Returned per the sklearn estimator convention, enabling
            ``clf.fit(...).predict(...)`` chaining.
        """
        train_fn = self._create_train_data(data_indices=X)
        self.clf = train_supervised(train_fn, **self.kwargs_train_supervised)
        self._remove_masked_data(train_fn)
        return self

    def predict_proba(self, X=None, train_data=True, return_labels=False):
        """Produce an (N, K) probability matrix with examples on rows and
        classes on columns, where each row sums to 1 and captures the
        probability of the example belonging to each class.

        Columns follow the sorted-label ordering of ``self.label2num``.
        If `return_labels` is True, also return the integer-encoded labels
        read from the data file.
        """
        fn = self.train_data_fn if train_data else self.test_data_fn
        pred_probs_list = []
        if return_labels:
            labels_list = []
        for labels, text in data_loader(fn=fn, indices=X, batch_size=self.batch_size):
            pred = self.clf.predict(text=text, k=len(self.clf.get_labels()))
            # fastText returns (labels, probs) in confidence order; sort each
            # example's pairs by label string so columns match label2num,
            # yielding the p(label = k | x) matrix of shape (N x K).
            pred_probs = [
                [p for _, p in sorted(list(zip(*l)), key=lambda x: x[0])] for l in list(zip(*pred))
            ]
            pred_probs_list.append(np.array(pred_probs))
            if return_labels:
                labels_list.append(labels)
        pred_probs = np.concatenate(pred_probs_list, axis=0)
        if return_labels:
            gold_labels = [self.label2num[z] for l in labels_list for z in l]
            return (pred_probs, np.array(gold_labels))
        else:
            return pred_probs

    def predict(self, X=None, train_data=True, return_labels=False):
        """Predict integer-encoded labels for the examples indexed by X.

        If `return_labels` is True, also return the integer-encoded labels
        read from the data file.
        """
        fn = self.train_data_fn if train_data else self.test_data_fn
        pred_list = []
        if return_labels:
            labels_list = []
        for labels, text in data_loader(fn=fn, indices=X, batch_size=self.batch_size):
            # Top-1 prediction per example, mapped to its integer encoding.
            pred = [self.label2num[z[0]] for z in self.clf.predict(text)[0]]
            pred_list.append(pred)
            if return_labels:
                labels_list.append(labels)
        pred = np.array([z for l in pred_list for z in l])
        if return_labels:
            gold_labels = [self.label2num[z] for l in labels_list for z in l]
            return (pred, np.array(gold_labels))
        else:
            return pred

    def score(self, X=None, y=None, sample_weight=None, k=None):
        """Compute the average precision @ k (single label) of the
        labels predicted from X and the true labels given by y.

        For a single label, each example scores 1 if its true label is in
        the top-k predictions, else 0. `score` expects a `y` variable; in
        this case, `y` is the noisy labels. If `y` is None, the labels are
        read from the test data file instead.
        """
        # Set the k for precision@k.
        if k is None:
            k = self.p_at_k

        fn = self.test_data_fn
        pred_list = []
        if y is None:
            labels_list = []
        for labels, text in data_loader(fn=fn, indices=X, batch_size=self.batch_size):
            pred = self.clf.predict(text, k=k)[0]
            pred_list.append(pred)
            if y is None:
                labels_list.append(labels)
        pred = np.array([z for l in pred_list for z in l])
        if y is None:
            y = [z for l in labels_list for z in l]
        else:
            # Map integer-encoded y back to label strings for comparison.
            y = [self.num2label[z] for z in y]

        # Fraction of examples whose true label appears in the top-k set.
        apk = np.mean([y[i] in l for i, l in enumerate(pred)])

        return apk
diff --git a/fasttext_amazon_reviews/requirements.txt b/fasttext_amazon_reviews/requirements.txt
index 2b69d51..b29d0f0 100644
--- a/fasttext_amazon_reviews/requirements.txt
+++ b/fasttext_amazon_reviews/requirements.txt
@@ -1,4 +1,4 @@
# This notebook requires cleanlab versions >= 2.3.0: pip install "cleanlab>=2.3.0"
-fasttext
+fasttext==0.9.2
numpy==1.21.6
scikit_learn==1.0.2