Skip to content

refactor: remove code related to embedded text extraction #1342

refactor: remove code related to embedded text extraction

refactor: remove code related to embedded text extraction #1342

Workflow file for this run

# CI workflow: runs on pushes to the listed branches and on pull requests
# that target main.
name: CI

on:
  push:
    branches: [main, robinson/initial-repo-setup]
  pull_request:
    branches: [main]

env:
  # Quoted so the version is always read as a string: an unquoted 3.10 would
  # be parsed as the float 3.1. The value is interpolated into interpreter
  # names (python3.9) and cache keys further down.
  PYTHON_VERSION: "3.9"
jobs:
  # Builds the .venv virtualenv when no cache entry exists for the current
  # runner OS / Python version / requirements hash. Later jobs restore it
  # with the same key.
  setup:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/cache@v3
        id: virtualenv-cache
        with:
          path: .venv
          key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
          # Only check whether an entry exists; do not download it here.
          lookup-only: true
      - name: Set up Python ${{ env.PYTHON_VERSION }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Install Poppler
        run: |
          sudo apt-get update
          sudo apt-get -y install poppler-utils
      - name: Setup virtual environment (no cache hit)
        if: steps.virtualenv-cache.outputs.cache-hit != 'true'
        run: |
          python${{ env.PYTHON_VERSION }} -m venv .venv
          source .venv/bin/activate
          make install-ci

  # Runs `make check` inside the cached virtualenv.
  lint:
    runs-on: ubuntu-latest
    needs: setup
    steps:
      - uses: actions/checkout@v4
      # The fallback step below invokes python<version> by name, so that
      # interpreter has to be installed in this job as well.
      - name: Set up Python ${{ env.PYTHON_VERSION }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - uses: actions/cache/restore@v3
        id: virtualenv-cache
        with:
          path: .venv
          key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
      # NOTE(robinson) - This is a fallback in case the lint job does not find the cache.
      # We can take this out when we implement the fix in CORE-99
      - name: Setup virtual environment (no cache hit)
        if: steps.virtualenv-cache.outputs.cache-hit != 'true'
        run: |
          python${{ env.PYTHON_VERSION }} -m venv .venv
          source .venv/bin/activate
          make install-ci
      - name: Lint
        run: |
          source .venv/bin/activate
          make check

  # Independent of the other jobs: it declares no `needs`.
  shellcheck:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # NOTE(review): @master is a mutable ref, so the action code can change
      # between runs. Consider pinning to a release tag or commit SHA.
      - name: ShellCheck
        uses: ludeeus/action-shellcheck@master

  # Runs the test suite and the coverage check after setup and lint succeed.
  test:
    runs-on: ubuntu-latest
    needs: [setup, lint]
    steps:
      - uses: actions/checkout@v4
      # The fallback step below invokes python<version> by name, so that
      # interpreter has to be installed in this job as well.
      - name: Set up Python ${{ env.PYTHON_VERSION }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - uses: actions/cache/restore@v3
        id: virtualenv-cache
        with:
          path: .venv
          key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
      # NOTE(robinson) - This is a fallback in case the test job does not find the cache.
      # We can take this out when we implement the fix in CORE-99
      - name: Setup virtual environment (no cache hit)
        if: steps.virtualenv-cache.outputs.cache-hit != 'true'
        run: |
          python${{ env.PYTHON_VERSION }} -m venv .venv
          source .venv/bin/activate
          make install-ci
      # Despite the step name, this also installs tesseract-ocr.
      - name: Install Poppler
        run: |
          sudo apt-get update
          sudo apt-get -y install poppler-utils tesseract-ocr
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_DEFAULT_REGION }}
      - name: Test
        env:
          UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # Copies the CI test model from S3 into the test tree before the
        # tests run; this relies on the AWS credentials configured above.
        run: |
          source .venv/bin/activate
          aws s3 cp s3://utic-dev-models/ci_test_model/test_ci_model.onnx test_unstructured_inference/models/
          CI=true make test
          make check-coverage

  # Integration run against the Unstructured-IO/unstructured repository:
  # that repo is checked out at the workspace root, this repo is checked out
  # into inference/ and installed in editable mode on top of it.
  test_ingest:
    strategy:
      matrix:
        python-version: ["3.9", "3.10"]
    runs-on: ubuntu-latest
    env:
      NLTK_DATA: ${{ github.workspace }}/nltk_data
    needs: lint
    steps:
      - name: Checkout unstructured repo for integration testing
        uses: actions/checkout@v4
        with:
          repository: 'Unstructured-IO/unstructured'
      - name: Checkout this repo
        uses: actions/checkout@v4
        with:
          path: inference
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Test
        env:
          GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
          DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
        run: |
          python${{ matrix.python-version }} -m venv .venv
          source .venv/bin/activate
          # Create the NLTK data directory if it is missing; no-op otherwise.
          mkdir -p "$NLTK_DATA"
          make install-ci
          pip install -e inference/
          sudo apt-get update
          sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
          sudo apt-get install -y tesseract-ocr
          sudo apt-get install -y tesseract-ocr-kor
          sudo apt-get install -y diffstat
          tesseract --version
          make install-all-ingest
          # only run ingest tests that check expected output diffs.
          bash inference/scripts/test-unstructured-ingest-helper.sh

  # Runs the changelog enforcer when files under unstructured_inference/
  # changed; both steps are skipped on main.
  changelog:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - if: github.ref != 'refs/heads/main'
        uses: dorny/paths-filter@v2
        id: changes
        with:
          filters: |
            src:
              - 'unstructured_inference/**'
      - if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main'
        uses: dangoslen/changelog-enforcer@v3