Commit ac62235
Merge pull request #1 from pmittaldev/ROUGE-metric-(evidentlyai#1318)
Rouge metric (evidentlyai#1318)
pmittaldev authored Oct 28, 2024
2 parents cd6cbfb + 6555f17 commit ac62235
Showing 43 changed files with 2,016 additions and 11 deletions.
8 changes: 3 additions & 5 deletions .github/workflows/main.yml
@@ -115,9 +115,7 @@ jobs:
       - name: Install minimal dependencies
         run: pip install -r requirements.min.txt
       - name: Install package
-        run: pip install -e .[dev,spark,fsspec]
-      - name: Run pip-audit
-        run: pip-audit --ignore-vuln PYSEC-2024-48 --ignore-vuln GHSA-jw8x-6495-233v --ignore-vuln GHSA-4hq2-rpgc-r8r7
+        run: pip install -e .[dev,spark,fsspec,llm]
       - name: Run Tests
         run: python -m pytest --durations=50
   test:
@@ -155,7 +153,7 @@ jobs:
         uses: ./.github/share-actions/get-bikes-dataset-cached

       - name: Install package
-        run: pip install -e .[dev,spark,fsspec]
+        run: pip install -e .[dev,spark,fsspec,llm]
       - name: Run Tests
         run: python -m pytest --durations=50

@@ -173,7 +171,7 @@ jobs:
         cache: "pip"
         cache-dependency-path: setup.py
       - name: Install dependencies
-        run: pip install -e ".[dev]"
+        run: pip install -e .
       - name: Install wheel
         run: pip install wheel
       - name: Build package
6 changes: 3 additions & 3 deletions .github/workflows/ui.yml
@@ -151,8 +151,8 @@ jobs:
         uses: ./.github/share-actions/ui-node-pnpm-install

       - name: Install Playwright Browsers
-        working-directory: ui
-        run: pnpm dlx playwright@1.43.0 install --with-deps
+        working-directory: ui/service
+        run: pnpm exec playwright install --with-deps chromium

       - name: 🔍 Get bikes dataset cached
         uses: ./.github/share-actions/get-bikes-dataset-cached
@@ -162,7 +162,7 @@

       - name: Wait UI to be ready to test
         working-directory: ui/service
-        run: pnpm wait-on tcp:127.0.0.1:8000 -t 200000
+        run: pnpm wait-on tcp:127.0.0.1:8000 -t 4m

       - name: Run Service Playwright tests
         working-directory: ui/service
3 changes: 3 additions & 0 deletions docs/book/reference/all-metrics.md
@@ -272,6 +272,9 @@ Check for regular expression matches.
 | **DoesNotContain()** <ul><li>Checks if the text does not contain any or all specified items. </li><li> Returns True/False for every input. </li></ul> Example use:<br> `DoesNotContain(items=["as a large language model"])` | **Required:** <br> `items: List[str]` <br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'all'` or `'any'`</li><li>`case_sensitive = True` or `False`</li></ul> |
 | **IncludesWords()** <ul><li> Checks if the text includes **any** (default) or **all** specified words. </li><li> Considers only vocabulary words (from NLTK vocabulary). </li><li> By default, considers inflected and variant forms of the same word. </li><li> Returns True/False for every input. </li></ul> Example use:<br> `IncludesWords(words_list=['booking', 'hotel', 'flight'])` | **Required:** <br> `words_list: List[str]` <br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'any'` or `'all'`</li><li>`lemmatize = True` or `False`</li></ul> |
 | **ExcludesWords()** <ul><li>Checks if the text excludes all specified words.</li><li> Considers only vocabulary words (from NLTK vocabulary). </li><li>By default, considers inflected and variant forms of the same word. </li><li>Returns True/False for every input. </li></ul> Example use:<br> `ExcludesWords(words_list=['buy', 'sell', 'bet'])`| **Required:** <br>`words_list: List[str]` <br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'all'` or `'any'`</li><li>`lemmatize = True` or `False`</li></ul> |
+| **ItemMatch()** <ul><li>Checks whether the text contains **any** (default) or **all** specified items that are specific to each row (represented as tuples). </li><li>Returns True/False for each row. </li></ul> Example use:<br> `ItemMatch(with_column="expected")`| **Required:** <br>`with_column: str`<br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'all'` or `'any'`</li><li>`case_sensitive = True` or `False`</li></ul> |
+| **ItemNoMatch()** <ul><li>Checks whether the text excludes **any** (default) or **all** specified items that are specific to each row (represented as tuples). </li><li>Returns True/False for each row. </li></ul> Example use:<br> `ItemNoMatch(with_column="forbidden")`| **Required:** <br>`with_column: str`<br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'all'` or `'any'`</li><li>`case_sensitive = True` or `False`</li></ul> |
+| **JSONSchemaMatch()** <ul><li>Checks if the text contains a JSON object matching the **expected_schema**. Supports exact (**exact_match=True**) or minimal (**exact_match=False**) matching, with optional strict type validation (**validate_types=True**). </li><li>Returns True/False for each row. </li></ul> Example use:<br> `JSONSchemaMatch(expected_schema={"name": str, "age": int}, exact_match=False, validate_types=True)`| **Required:** <br>`expected_schema: Dict[str, type]`<br><br>**Optional:**<ul><li>`exact_match = True` or `False`</li><li>`validate_types = True` or `False`</li></ul> |

 ## Descriptors: Text stats
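For orientation, here is a minimal usage sketch for the three descriptors documented above. It assumes the existing `TextEvals` preset from `evidently.metric_preset`; the DataFrame contents and column names are hypothetical, and `ItemMatch` relies on its documented default `mode='any'`:

import pandas as pd

from evidently.descriptors import ItemMatch, JSONSchemaMatch
from evidently.metric_preset import TextEvals
from evidently.report import Report

# Hypothetical data: per the table above, with_column points at a column
# holding per-row tuples of candidate items, hence the tuples in "expected".
df = pd.DataFrame(
    {
        "response": ['{"name": "Ann", "age": 32}', "Your flight is booked."],
        "expected": [("name", "age"), ("booked", "confirmed")],
    }
)

report = Report(
    metrics=[
        TextEvals(
            column_name="response",
            descriptors=[
                # True/False per row: does "response" contain any item
                # from the same row of the "expected" column?
                ItemMatch(with_column="expected"),
                # True/False per row: does "response" contain a JSON object
                # with at least the keys "name" (str) and "age" (int)?
                JSONSchemaMatch(
                    expected_schema={"name": str, "age": int},
                    exact_match=False,
                    validate_types=True,
                ),
            ],
        )
    ]
)
report.run(current_data=df, reference_data=None)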
66 changes: 66 additions & 0 deletions examples/data_generators.py
@@ -0,0 +1,66 @@
from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider
from evidently.experimental.dataset_generators.llm.questions import QADatasetFromSeedGenerator
from evidently.experimental.dataset_generators.llm.questions import QADatasetGenerator
from evidently.options.base import Options


def print_generated(generated):
    # Each generated row carries a question plus optional answer and context.
    for _, row in generated.iterrows():
        print("Q", row["questions"])
        if "answers" in row:
            print("A", row["answers"])
        if "context" in row:
            print("C", row["context"])
        print()


def generate_from_file():
    file_path = "../cloud_quickstart_tracing.pdf"
    data = DataCollectionProvider.from_files(file_path, chunk_size=50, chunk_overlap=20, splitter="simple")

    generator = QADatasetGenerator(
        data_collection=data,
        provider="openai",
        model="gpt-4o-mini",
        num_questions=5,
        options=Options.from_any_options(None),
    )
    print_generated(generator.generate())


def main():
    # Generate Q&A pairs from in-memory chunks...
    data = DataCollectionProvider.from_chunks(chunks=["I am a banana", "My spoon is too big"])
    generator = QADatasetGenerator(
        data_collection=data,
        provider="openai",
        model="gpt-4o-mini",
        num_questions=5,
        options=Options.from_any_options(None),
    )
    print_generated(generator.generate())

    # ...and from a single seed question.
    generator = QADatasetFromSeedGenerator(
        seed_question="What is 'kek'?",
        num_questions=5,
        provider="openai",
        model="gpt-4o-mini",
        options=Options.from_any_options(None),
    )
    print_generated(generator.generate())


if __name__ == '__main__':
    main()
    # generate_from_file()
@@ -0,0 +1,117 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Evidently Dataset ROUGE Summary Metric"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from evidently.report import Report\n",
"from evidently.metrics import ROUGESummaryMetric"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"current_data = {\n",
" \"summary\": [\"hello there\", \"general kenobi\"],\n",
"}\n",
"\n",
"current_df = pd.DataFrame(current_data)\n",
"\n",
"reference_data = {\n",
" \"summary\": [\"hello there\", \"no de\"]\n",
"}\n",
"\n",
"current_df = pd.DataFrame(current_data)\n",
"reference_df = pd.DataFrame(reference_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"report = Report(metrics=[\n",
" ROUGESummaryMetric(column_name=\"summary\", rouge_n=2)\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"report.run(current_data=current_df, reference_data=reference_df)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"report.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"report.as_dict()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"report.as_dataframe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.19"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
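For reference: ROUGE-2 counts bigram overlap between the current and reference summaries; recall divides the number of shared bigrams by the bigrams in the reference, precision divides by the candidate's, and the reported F-measure combines the two. Below is a minimal standalone sketch of the notebook's two row comparisons, using the rouge-score package pinned in this PR. Whether ROUGESummaryMetric calls rouge-score directly or goes through evaluate is not visible in this diff.

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge2"], use_stemmer=False)

# Row 1: current and reference summaries are identical,
# so precision = recall = F1 = 1.0.
print(scorer.score("hello there", "hello there"))

# Row 2: "no de" and "general kenobi" share no bigrams,
# so all three scores are 0.0.
print(scorer.score("no de", "general kenobi"))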
@@ -116,7 +116,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.13"
+   "version": "3.8.19"
}
},
"nbformat": 4,
1 change: 1 addition & 0 deletions requirements.dev.txt
@@ -16,6 +16,7 @@ pip-audit
 pyspark
 ruff==0.3.7
 pre-commit==3.5.0
+evaluate==0.4.1

 # service dependencies
 litestar>=2.7.1
2 changes: 2 additions & 0 deletions requirements.min.txt
@@ -31,3 +31,5 @@ openai==1.16.2
 evaluate==0.4.1
 transformers[torch]==4.39.3
 sentence-transformers==2.7.0
+rouge-score==0.1.2
+chromadb==0.4.0
9 changes: 9 additions & 0 deletions setup.cfg
@@ -106,6 +106,15 @@ ignore_missing_imports = True
 [mypy-litellm.*]
 ignore_missing_imports = True

+[mypy-chromadb.*]
+ignore_missing_imports = True
+
+[mypy-llama_index.*]
+ignore_missing_imports = True
+
+[mypy-pypdf.*]
+ignore_missing_imports = True
+
 [tool:pytest]
 testpaths=tests
 python_classes=*Test
4 changes: 4 additions & 0 deletions setup.py
@@ -76,6 +76,7 @@
         "deprecation>=2.1.0",
         "uuid6>=2024.7.10",
         "cryptography>=43.0.1",
+        "evaluate>=0.4.1",
     ],
     extras_require={
         "dev": [
@@ -96,12 +97,15 @@
             "ruff==0.3.7",
             "pre-commit==3.5.0",
             "pytest-asyncio==0.23.7",
+            "evaluate>=0.4.1",
         ],
         "llm": [
             "openai>=1.16.2",
             "evaluate>=0.4.1",
             "transformers[torch]>=4.39.3",
             "sentence-transformers>=2.7.0",
+            "rouge-score>=0.1.2",
+            "chromadb>=0.4.0",
         ],
         "spark": ["pyspark>=3.4.0"],
         "fsspec": [
6 changes: 6 additions & 0 deletions src/evidently/descriptors/__init__.py
@@ -3,6 +3,7 @@
 from .custom_descriptor import CustomPairColumnEval
 from .hf_descriptor import HuggingFaceModel
 from .hf_descriptor import HuggingFaceToxicityModel
+from .json_schema_match_descriptor import JSONSchemaMatch
 from .llm_judges import BiasLLMEval
 from .llm_judges import ContextQualityLLMEval
 from .llm_judges import DeclineLLMEval
@@ -19,6 +20,8 @@
 from .sentiment_descriptor import Sentiment
 from .text_contains_descriptor import Contains
 from .text_contains_descriptor import DoesNotContain
+from .text_contains_descriptor import ItemMatch
+from .text_contains_descriptor import ItemNoMatch
 from .text_length_descriptor import TextLength
 from .text_part_descriptor import BeginsWith
 from .text_part_descriptor import EndsWith
@@ -47,6 +50,8 @@
     "EndsWith",
     "DoesNotContain",
     "IncludesWords",
+    "ItemMatch",
+    "ItemNoMatch",
     "ExcludesWords",
     "TextLength",
     "TriggerWordsPresence",
@@ -55,5 +60,6 @@
     "SentenceCount",
     "Sentiment",
     "RegExp",
+    "JSONSchemaMatch",
     "_registry",
 ]
11 changes: 11 additions & 0 deletions src/evidently/descriptors/_registry.py
@@ -15,6 +15,11 @@
     "evidently.descriptors.hf_descriptor.HuggingFaceToxicityModel",
     "evidently:descriptor:HuggingFaceToxicityModel",
 )
+register_type_alias(
+    FeatureDescriptor,
+    "evidently.descriptors.json_schema_match_descriptor.JSONSchemaMatch",
+    "evidently:descriptor:JSONSchemaMatch",
+)
 register_type_alias(
     FeatureDescriptor, "evidently.descriptors.llm_judges.BiasLLMEval", "evidently:descriptor:BiasLLMEval"
 )
@@ -72,6 +77,12 @@
     "evidently.descriptors.text_contains_descriptor.DoesNotContain",
     "evidently:descriptor:DoesNotContain",
 )
+register_type_alias(
+    FeatureDescriptor, "evidently.descriptors.text_contains_descriptor.ItemMatch", "evidently:descriptor:ItemMatch"
+)
+register_type_alias(
+    FeatureDescriptor, "evidently.descriptors.text_contains_descriptor.ItemNoMatch", "evidently:descriptor:ItemNoMatch"
+)
 register_type_alias(
     FeatureDescriptor, "evidently.descriptors.text_length_descriptor.TextLength", "evidently:descriptor:TextLength"
 )
23 changes: 23 additions & 0 deletions src/evidently/descriptors/json_schema_match_descriptor.py
@@ -0,0 +1,23 @@
from typing import Dict

from evidently.features import json_schema_match_feature
from evidently.features.generated_features import FeatureDescriptor
from evidently.features.generated_features import GeneratedFeature


class JSONSchemaMatch(FeatureDescriptor):
    class Config:
        type_alias = "evidently:descriptor:JSONSchemaMatch"

    expected_schema: Dict[str, type]
    validate_types: bool = False
    exact_match: bool = False

    def feature(self, column_name: str) -> GeneratedFeature:
        return json_schema_match_feature.JSONSchemaMatch(
            column_name=column_name,
            expected_schema=self.expected_schema,
            validate_types=self.validate_types,
            exact_match=self.exact_match,
            display_name=self.display_name,
        )
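The descriptor above only forwards its parameters to `json_schema_match_feature.JSONSchemaMatch`, whose implementation is not part of this excerpt. As a rough, hypothetical sketch of the matching semantics documented in all-metrics.md (minimal vs. exact key matching, optional type validation):

import json
from typing import Dict


def json_schema_match(text: str, expected_schema: Dict[str, type],
                      exact_match: bool = False, validate_types: bool = False) -> bool:
    # Hypothetical reference logic, not the actual Evidently feature code.
    try:
        obj = json.loads(text)
    except (TypeError, ValueError):
        return False  # not parseable JSON
    if not isinstance(obj, dict):
        return False
    if exact_match and set(obj) != set(expected_schema):
        return False  # exact mode: key sets must match exactly
    if any(key not in obj for key in expected_schema):
        return False  # minimal mode: all expected keys must be present
    if validate_types:
        return all(isinstance(obj[key], expected_type)
                   for key, expected_type in expected_schema.items())
    return True


# e.g. json_schema_match('{"name": "Ann", "age": 32}',
#                        {"name": str, "age": int}, validate_types=True)  -> True

Note one assumption: this sketch treats the whole cell value as the JSON payload, whereas the all-metrics.md wording ("contains a JSON object") suggests the real feature may also handle JSON embedded in longer text.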