Skip to content

Feat/field to store inner elements #1058

Feat/field to store inner elements

Feat/field to store inner elements #1058

Workflow file for this run

name: CI
on:
push:
branches: [ main, robinson/initial-repo-setup ]
pull_request:
branches: [ main ]
env:
PYTHON_VERSION: 3.8
jobs:
setup:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/cache@v3
id: virtualenv-cache
with:
path: |
.venv
key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v4
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install Poppler
run: |
sudo apt-get update
sudo apt-get -y install poppler-utils
- name: Setup virtual environment (no cache hit)
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
run: |
python${{ env.PYTHON_VERSION }} -m venv .venv
source .venv/bin/activate
make install-ci
lint:
runs-on: ubuntu-latest
needs: setup
steps:
- uses: actions/checkout@v4
- uses: actions/cache/restore@v3
id: virtualenv-cache
with:
path: .venv
key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
# NOTE(robinson) - This is a fallback in case the lint job does not find the cache.
# We can take this out when we implement the fix in CORE-99
- name: Setup virtual environment (no cache hit)
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
run: |
python${{ env.PYTHON_VERSION }} -m venv .venv
source .venv/bin/activate
make install-ci
- name: Lint
run: |
source .venv/bin/activate
make check
shellcheck:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: ShellCheck
uses: ludeeus/action-shellcheck@master
test:
runs-on: ubuntu-latest
needs: [setup, lint]
steps:
- uses: actions/checkout@v4
- uses: actions/cache/restore@v3
id: virtualenv-cache
with:
path: |
.venv
key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
# NOTE(robinson) - This is a fallback in case the lint job does not find the cache.
# We can take this out when we implement the fix in CORE-99
- name: Setup virtual environment (no cache hit)
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
run: |
python${{ env.PYTHON_VERSION }} -m venv .venv
source .venv/bin/activate
make install-ci
- name: Install Poppler
run: |
sudo apt-get update
sudo apt-get -y install poppler-utils tesseract-ocr
- name: Test
env:
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
source .venv/bin/activate
CI=true make test
make check-coverage
test_ingest:
strategy:
matrix:
python-version: ["3.8","3.9","3.10"]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: lint
steps:
- name: Checkout unstructured repo for integration testing
uses: actions/checkout@v4
with:
repository: 'Unstructured-IO/unstructured'
ref: 'benjamin/feat/clean-by-store-pdfminer-inner-elements'
- name: Checkout this repo
uses: actions/checkout@v4
with:
path: inference
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Test
env:
GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
run: |
python${{ matrix.python-version }} -m venv .venv
source .venv/bin/activate
[ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
make install-ci
pip install -e inference/
sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get install -y tesseract-ocr
sudo apt-get install -y tesseract-ocr-kor
sudo apt-get install -y diffstat
tesseract --version
make install-ingest-s3
make install-ingest-azure
make install-ingest-discord
make install-ingest-elasticsearch
make install-ingest-dropbox
make install-ingest-gcs
make install-ingest-google-drive
make install-ingest-github
make install-ingest-gitlab
make install-ingest-slack
make install-ingest-wikipedia
# only run ingest tests that check expected output diffs.
bash inference/scripts/test-unstructured-ingest-helper.sh
changelog:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- if: github.ref != 'refs/heads/main'
uses: dorny/paths-filter@v2
id: changes
with:
filters: |
src:
- 'unstructured_inference/**'
- if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main'
uses: dangoslen/changelog-enforcer@v3