diff --git a/CHANGELOG.md b/CHANGELOG.md index 111bbbdf53..d99fd4bfeb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.25-dev9 +## 0.10.25 ### Enhancements @@ -19,10 +19,10 @@ ocr agent tesseract/paddle in environment variable `OCR_AGENT` for OCRing the en * **Fix chunks breaking on regex-metadata matches.** Fixes "over-chunking" when `regex_metadata` was used, where every element that contained a regex-match would start a new chunk. * **Fix regex-metadata match offsets not adjusted within chunk.** Fixes incorrect regex-metadata match start/stop offset in chunks where multiple elements are combined. * **Map source cli command configs when destination set** Due to how the source connector is dynamically called when the destination connector is set via the CLI, the configs were being set incorrectly, causing the source connector to break. The configs were fixed and updated to take into account Fsspec-specific connectors. -* **Fix metrics folder not discoverable** Fixes issue where unstructured/metrics folder is not discoverable on PyPI by adding -an `__init__.py` file under the folder. +* **Fix metrics folder not discoverable** Fixes issue where unstructured/metrics folder is not discoverable on PyPI by adding an `__init__.py` file under the folder. * **Fix a bug when `partition_pdf` gets `model_name=None`** In API usage the `model_name` value is `None` and the `cast` function in `partition_pdf` would return `None` and lead to an attribute error. Now we use `str` function to explicitly convert the content to string so it is guaranteed to have `startswith` and other string functions as attributes * **Fix html partition fail on tables without `tbody` tag** HTML tables may sometimes just contain headers without body (`tbody` tag) +* **Fix out-of-order sequencing of split chunks.** Fixes behavior where "split" chunks were inserted at the beginning of the chunk sequence. 
def test_it_splits_a_large_section_into_multiple_chunks():
    """An over-long text element comes back as several chunks, in document order.

    The body text is 99 characters long and `max_characters` is 50, so the
    assertion expects the title chunk first, then a 50-character chunk, then
    the remaining 49 characters. List equality is order-sensitive, so any
    re-ordering of the split pieces fails the test.
    """
    body_text = (
        "Lorem ipsum dolor sit amet consectetur adipiscing elit."
        " In rhoncus ipsum sed lectus porta volutpat."
    )
    elements: List[Element] = [Title("Introduction"), Text(body_text)]

    chunks = chunk_by_title(elements, combine_text_under_n_chars=50, max_characters=50)

    # -- expected pieces are listed front-to-back, exactly as they occur in the source text --
    expected_chunks = [
        CompositeElement("Introduction"),
        CompositeElement("Lorem ipsum dolor sit amet consectetur adipiscing "),
        CompositeElement("elit. In rhoncus ipsum sed lectus porta volutpat."),
    ]
    assert chunks == expected_chunks
from the end - chunk_text = text[-max_characters:] - text = text[:-max_characters] - - # Prepend the chunk to the beginning of the list - chunked_elements.insert(0, CompositeElement(text=chunk_text, metadata=metadata)) - else: - # If it doesn't exceed, create a single CompositeElement - chunked_elements.append(CompositeElement(text=text, metadata=metadata)) + # -- split chunk into CompositeElements objects maxlen or smaller -- + text_len = len(text) + start = 0 + remaining = text_len + + while remaining > 0: + end = min(start + max_characters, text_len) + chunked_elements.append(CompositeElement(text=text[start:end], metadata=metadata)) + start = end + remaining = text_len - end return chunked_elements