Skip to content

Commit

Permalink
Fix empty df scenario
Browse files (browse the repository at this point in the history)
  • Loading branch information
plutasnyy committed Jan 8, 2025
1 parent 0e44926 commit 2d9054d
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 1 deletion.
11 changes: 11 additions & 0 deletions test_unstructured/partition/pdf_image/test_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,3 +551,14 @@ def test_hocr_to_dataframe():
assert df["width"].iloc[0] == 60
assert df["height"].iloc[0] == 13
assert df["text"].iloc[0] == "word"


def test_hocr_to_dataframe_when_no_prediction_empty_df():
    """Check that an empty hOCR string yields an empty frame with all five columns.

    Calls ``hocr_to_dataframe`` with ``hocr=""`` and asserts the result has
    zero rows and five columns, and that each of ``left``, ``top``, ``width``,
    ``height`` and ``text`` is present by name.
    """
    df = OCRAgentTesseract().hocr_to_dataframe(hocr="")

    assert df.shape == (0, 5)
    assert "left" in df.columns
    assert "top" in df.columns
    assert "width" in df.columns
    # The original asserted "text" twice and never checked "height".
    assert "height" in df.columns
    assert "text" in df.columns
2 changes: 1 addition & 1 deletion unstructured/partition/utils/ocr_models/tesseract_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def hocr_to_dataframe(
"text": text,
}
)
ocr_df = pd.DataFrame(df_entries)
ocr_df = pd.DataFrame(df_entries, columns=["left", "top", "width", "height", "text"])
return ocr_df

@staticmethod
Expand Down

0 comments on commit 2d9054d

Please sign in to comment.