From 007542982f9cf2e77f40102a4f1f314ab5ab3579 Mon Sep 17 00:00:00 2001 From: Alex VanTol Date: Mon, 30 Oct 2023 13:28:06 -0500 Subject: [PATCH] fix(ai): update bucket contents every roll for updates --- gen3/bin/kube-setup-gen3-discovery-ai.sh | 6 +-- kube/services/gen3-discovery-ai/README.md | 45 +++++++++++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) create mode 100644 kube/services/gen3-discovery-ai/README.md diff --git a/gen3/bin/kube-setup-gen3-discovery-ai.sh b/gen3/bin/kube-setup-gen3-discovery-ai.sh index c8338afbc..5769a5f48 100644 --- a/gen3/bin/kube-setup-gen3-discovery-ai.sh +++ b/gen3/bin/kube-setup-gen3-discovery-ai.sh @@ -139,10 +139,8 @@ fi gen3_log_info "Setup complete, syncing configuration to bucket" -if [ -d "$(dirname $(g3k_manifest_path))/gen3-discovery-ai/knowledge/chromadb" ]; then - bucketName="$( (gen3 secrets decode 'gen3-discovery-ai-g3auto' 'storage_config.json' || echo ERROR) | jq -r .bucket)" || exit 1 - aws s3 sync "$(dirname $(g3k_manifest_path))/gen3-discovery-ai/knowledge/chromadb" "s3://$bucketName/chromadb" --delete -fi +bucketName="$( (gen3 secrets decode 'gen3-discovery-ai-g3auto' 'storage_config.json' || echo ERROR) | jq -r .bucket)" || exit 1 +aws s3 sync "$(dirname $(g3k_manifest_path))/gen3-discovery-ai/knowledge/chromadb" "s3://$bucketName/chromadb" --delete gen3 roll gen3-discovery-ai g3kubectl apply -f "${GEN3_HOME}/kube/services/gen3-discovery-ai/gen3-discovery-ai-service.yaml" diff --git a/kube/services/gen3-discovery-ai/README.md b/kube/services/gen3-discovery-ai/README.md new file mode 100644 index 000000000..fb25d8201 --- /dev/null +++ b/kube/services/gen3-discovery-ai/README.md @@ -0,0 +1,45 @@ +# Gen3 Discovery AI + +## Populating Disk for In-Memory Vectordb Chromadb + +In order to set up pre-configured topics, we need to load a bunch of data +into Chromadb (which is an in-memory vectordb with an option to persist to disk). 
+ +To load topics consistently, we set up an S3 bucket to house the persisted +vectordb. + +### Getting data into S3 + +Run the service elsewhere, load the data, and persist it to disk. Then move those +files from disk into the VM. The expectation is that for Chromadb loading, the +files are placed in a `gen3-discovery-ai/knowledge/chromadb` folder relative to +where the `manifest.json` is. For example: +`~/cdis-manifest/avantol.planx-pla.net/gen3-discovery-ai/knowledge/chromadb` + +You can rsync from local if you've generated it locally. + +#### IMPORTANT: Use the same service image to generate the data locally as is used in the environment + +> IMPORTANT NOTE: There are some oddities with persisting to disk across different OSes with different security packages. + +You should run the store knowledge commands that eventually create the persisted +data on disk from within the SAME IMAGE that gets deployed. + +One way to do this is as follows: + +* Use docker to build the image locally and run it with a volume mount +* exec into the running container +* run commands necessary to load the knowledge +* check the location of the volume mount on your host system for the persisted data +* rsync that data to the data commons (or check into cdis-manifest) + +See the Gen3 Discovery AI service repo README for more info. + +``` +rsync -re ssh --progress ~/repos/gen3-discovery-ai/knowledge/ avantol@cdistest_dev.csoc:~/cdis-manifest/avantol.planx-pla.net/gen3-discovery-ai/knowledge/chromadb +``` + +### Getting data from S3 into memory + +We specify a path for Chromadb to use for persisted data and when it sees +data there, it loads it in. \ No newline at end of file