Unstructured-IO · jsoref · Sep 12, 2024 · Sep 12, 2024 · Sep 11, 2024 · Sep 12, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -72,7 +72,7 @@ Fix syntax for generated HTML tables
 
 ## 0.7.22
 
-* fix: add logic to handle computation of intersections betwen 2 `Rectangle`s when a `Rectangle` has `None` value in its coordinates
+* fix: add logic to handle computation of intersections between 2 `Rectangle`s when a `Rectangle` has `None` value in its coordinates
 
 ## 0.7.21
 
@@ -111,8 +111,8 @@ Fix syntax for generated HTML tables
 
 * refactor: add a class `ElementType` for the element type constants and use the constants to replace element type strings
 * enhancement: support extracting elements with types `Picture` and `Figure`
-* fix: update logger in table initalization where the logger info was not showing
-* chore: supress UserWarning about specified model providers
+* fix: update logger in table initialization where the logger info was not showing
+* chore: suppress UserWarning about specified model providers
 
 ## 0.7.12
 
@@ -215,7 +215,7 @@ we have the mapping from standard language code to paddle language code.
 
 ## 0.6.0
 
-* add a config class to handle parameter configurations for inference tasks; parameters in the config class can be set via environement variables
+* add a config class to handle parameter configurations for inference tasks; parameters in the config class can be set via environment variables
 * update behavior of `pad_image_with_background_color` so that input `pad` is applied to all sides
 
 ## 0.5.31
@@ -256,7 +256,7 @@ we have the mapping from standard language code to paddle language code.
 
 ## 0.5.21
 
-* adds `safe_division` to replae 0 with machine epsilon for `float` to avoid division by 0
+* adds `safe_division` to replace 0 with machine epsilon for `float` to avoid division by 0
 * apply `safe_division` to area overlap calculations in `unstructured_inference/inference/elements.py`
 
 ## 0.5.20
@@ -346,7 +346,7 @@ we have the mapping from standard language code to paddle language code.
 * Added functionality to convert a PDF in small chunks of pages at a time for `pdf2image.convert_from_path`
 * Table processing check for the area of the package to fix division by zero bug
 * Added CUDA and TensorRT execution providers for yolox and detectron2onnx model.
-* Warning for onnx version of detectron2 for empty pages suppresed.
+* Warning for onnx version of detectron2 for empty pages suppressed.
 
 ## 0.5.4
 

diff --git a/README.md b/README.md
@@ -23,7 +23,7 @@ Run `pip install unstructured-inference`.
 
 [Detectron2](https://github.com/facebookresearch/detectron2) is required for using models from the [layoutparser model zoo](#using-models-from-the-layoutparser-model-zoo) 
 but is not automatically installed with this package. 
-For MacOS and Linux, build from source with:
+For macOS and Linux, build from source with:
 ```shell
 pip install 'git+https://github.com/facebookresearch/detectron2.git@57bdb21249d5418c130d54e2ebdc94dda7a4c01a'
 ```
@@ -89,6 +89,6 @@ information on how to report security vulnerabilities.
 
 | Section | Description |
 |-|-|
-| [Unstructured Community Github](https://github.com/Unstructured-IO/community) | Information about Unstructured.io community projects  |
-| [Unstructured Github](https://github.com/Unstructured-IO) | Unstructured.io open source repositories |
+| [Unstructured Community GitHub](https://github.com/Unstructured-IO/community) | Information about Unstructured.io community projects  |
+| [Unstructured GitHub](https://github.com/Unstructured-IO) | Unstructured.io open source repositories |
 | [Company Website](https://unstructured.io) | Unstructured.io product and company info |
diff --git a/test_unstructured_inference/models/test_chippermodel.py b/test_unstructured_inference/models/test_chippermodel.py
@@ -190,19 +190,19 @@ def test_no_repeat_ngram_logits():
     )
 
 
-def test_ngram_repetiton_stopping_criteria():
+def test_ngram_repetition_stopping_criteria():
     input_ids = torch.tensor([[1, 2, 3, 4, 0, 1, 2, 3, 4]])
     logits = torch.tensor([[0.1, -0.3, -0.5, 0, 1.0, -0.9]])
 
-    stoppingCriteria = chipper.NGramRepetitonStoppingCriteria(
+    stoppingCriteria = chipper.NGramRepetitionStoppingCriteria(
         repetition_window=2, skip_tokens={0, 1, 2, 3, 4}
     )
 
     output = stoppingCriteria(input_ids=input_ids, scores=logits)
 
     assert output is False
 
-    stoppingCriteria = chipper.NGramRepetitonStoppingCriteria(
+    stoppingCriteria = chipper.NGramRepetitionStoppingCriteria(
         repetition_window=2, skip_tokens={1, 2, 3, 4}
     )
     output = stoppingCriteria(input_ids=input_ids, scores=logits)
@@ -259,7 +259,7 @@ def test_postprocess_bbox(decoded_str, expected_classes):
 def test_predict_tokens_beam_indices():
     model = get_model("chipper")
     model.stopping_criteria = [
-        chipper.NGramRepetitonStoppingCriteria(
+        chipper.NGramRepetitionStoppingCriteria(
             repetition_window=1,
             skip_tokens={},
         ),

diff --git a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py
@@ -927,7 +927,7 @@ def test_table_prediction_output_format(
         assert expectation in result.values
     elif output_format == "cells":
         # other output like bbox are flakey to test since they depend on OCR and it may change
-        # slightly when OCR pacakge changes or even on different machines
+        # slightly when OCR package changes or even on different machines
         validation_fields = ("column_nums", "row_nums", "column header", "cell text")
         assert expectation in [{key: cell[key] for key in validation_fields} for cell in result]
     else:
@@ -1763,11 +1763,11 @@ def test_padded_results_has_right_dimensions(table_transformer, example_image):
     pad = int(min(example_image.size) / 10)
 
     structure = table_transformer.get_structure(example_image, pad_for_structure_detection=pad)
-    # boxes deteced OUTSIDE of the original image; this shouldn't happen but we want to make sure
+    # boxes detected OUTSIDE of the original image; this shouldn't happen but we want to make sure
     # the code handles it as expected
     structure["pred_boxes"][0][0, :2] = 0.5
     structure["pred_boxes"][0][0, 2:] = 1.0
-    # mock a box we know are safly inside the original image with known positions
+    # mock a box we know is safely inside the original image with known positions
     width, height = example_image.size
     padded_width = width + pad * 2
     padded_height = height + pad * 2

diff --git a/test_unstructured_inference/test_utils.py b/test_unstructured_inference/test_utils.py
@@ -77,7 +77,7 @@ def test_pad_image_with_background_color(mock_pil_image):
 
 
 def test_pad_image_with_invalid_input(mock_pil_image):
-    with pytest.raises(ValueError, match="Can not pad an image with negative space!"):
+    with pytest.raises(ValueError, match="Cannot pad an image with negative space!"):
         pad_image_with_background_color(mock_pil_image, -1)
 
 

diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py
@@ -178,7 +178,7 @@ def separate(region_a: Rectangle, region_b: Rectangle):
     """Reduce leftmost rectangle to don't overlap with the other"""
 
     def reduce(keep: Rectangle, reduce: Rectangle):
-        # Asume intersection
+        # Assume intersection
 
         # Other is down
         if reduce.y2 > keep.y2 and reduce.x1 < keep.x2:

diff --git a/unstructured_inference/logger.py b/unstructured_inference/logger.py
@@ -2,7 +2,7 @@
 
 
 def translate_log_level(level: int) -> int:
-    """Translate Python debugg level to ONNX runtime error level
+    """Translate Python debug level to ONNX runtime error level
     since blank pages error are shown at level 3 that should be the
     exception, and 4 the normal behavior"""
     level_name = logging.getLevelName(level)

diff --git a/unstructured_inference/models/chipper.py b/unstructured_inference/models/chipper.py
@@ -102,7 +102,7 @@ def initialize(
         ]
 
         self.stopping_criteria = [
-            NGramRepetitonStoppingCriteria(
+            NGramRepetitionStoppingCriteria(
                 repetition_window=30,
                 skip_tokens=get_table_token_ids(self.processor),
             ),
@@ -137,7 +137,7 @@ def initialize(
         else:
             if swap_head_hidden_layer_size is not None:
                 logger.warning(
-                    f"swap_head is False but recieved value {swap_head_hidden_layer_size} for "
+                    f"swap_head is False but received value {swap_head_hidden_layer_size} for "
                     "swap_head_hidden_layer_size, which will be ignored.",
                 )
 
@@ -658,7 +658,7 @@ def reduce_bbox_overlap(
         input_bbox: List[float],
     ) -> List[float]:
         """
-        If an element does overlap with other elements, reduce bouding box by selecting the largest
+        If an element does overlap with other elements, reduce bounding box by selecting the largest
         bbox after blurring existing text
         """
         input_bbox = [int(b) for b in input_bbox]
@@ -1027,7 +1027,7 @@ def __call__(
         )
 
 
-class NGramRepetitonStoppingCriteria(StoppingCriteria):
+class NGramRepetitionStoppingCriteria(StoppingCriteria):
     def __init__(self, repetition_window: int, skip_tokens: set = set()):
         self.repetition_window = repetition_window
         self.skip_tokens = skip_tokens

diff --git a/unstructured_inference/models/detectron2onnx.py b/unstructured_inference/models/detectron2onnx.py
@@ -48,7 +48,7 @@
         "model_path": os.path.join(
             HUGGINGFACE_HUB_CACHE,
             "detectron2_quantized",
-            "detectrin2_quantized.onnx",
+            "detectron2_quantized.onnx",
         ),
         "label_map": DEFAULT_LABEL_MAP,
         "confidence_threshold": 0.8,
@@ -131,7 +131,7 @@ def preprocess(self, image: Image.Image) -> Dict[str, np.ndarray]:
         """
         # TODO (benjamin): check other shapes for inference
         img = np.array(image)
-        # TODO (benjamin): We should use models.get_model() but currenly returns Detectron model
+        # TODO (benjamin): We should use models.get_model() but currently returns Detectron model
         session = self.model
         # onnx input expected
         # [3,1035,800]

diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py
@@ -84,7 +84,7 @@ def get_structure(
         x: PILImage.Image,
         pad_for_structure_detection: int = inference_config.TABLE_IMAGE_BACKGROUND_PAD,
     ) -> dict:
-        """get the table structure as a dictionary contaning different types of elements as
+        """get the table structure as a dictionary containing different types of elements as
         key-value pairs; check table-transformer documentation for more information"""
         with torch.no_grad():
             encoding = self.feature_extractor(

diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py
@@ -91,15 +91,15 @@ def image_processing(
         self,
         image: PILImage.Image,
     ) -> List[LayoutElement]:
-        """Method runing YoloX for layout detection, returns a PageLayout
+        """Method running YoloX for layout detection, returns a PageLayout
         parameters
         ----------
         page
             Path for image file with the image to process
         origin_img
             If specified, an Image object for process with YoloX model
         page_number
-            Number asigned to the PageLayout returned
+            Number assigned to the PageLayout returned
         output_directory
             Boolean indicating if result will be stored
         """
@@ -125,7 +125,7 @@ def image_processing(
         boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0
         boxes_xyxy /= ratio
 
-        # Note (Benjamin): Distinct models (quantized and original) requires distincts
+        # Note (Benjamin): Distinct models (quantized and original) require distinct
         # levels of thresholds
         if "quantized" in self.model_path:
             dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.0, score_thr=0.07)

diff --git a/unstructured_inference/utils.py b/unstructured_inference/utils.py
@@ -50,7 +50,7 @@ def __len__(self) -> int:
 
 
 def tag(elements: Iterable[LayoutElement]):
-    """Asign an numeric id to the elements in the list.
+    """Assign a numeric id to the elements in the list.
     Useful for debugging"""
     colors = ["red", "blue", "green", "magenta", "brown"]
     for i, e in enumerate(elements):
@@ -72,7 +72,7 @@ def pad_image_with_background_color(
     width, height = image.size
     if pad < 0:
         raise ValueError(
-            "Can not pad an image with negative space! Please use a positive value for `pad`.",
+            "Cannot pad an image with negative space! Please use a positive value for `pad`.",
         )
     new = Image.new(image.mode, (width + pad * 2, height + pad * 2), background_color)
     new.paste(image, (pad, pad))