From 007542982f9cf2e77f40102a4f1f314ab5ab3579 Mon Sep 17 00:00:00 2001
From: Alex VanTol <avantol@uchicago.edu>
Date: Mon, 30 Oct 2023 13:28:06 -0500
Subject: [PATCH] fix(ai): update bucket contents every roll for updates

---
 gen3/bin/kube-setup-gen3-discovery-ai.sh  |  6 +--
 kube/services/gen3-discovery-ai/README.md | 45 +++++++++++++++++++++++
 2 files changed, 47 insertions(+), 4 deletions(-)
 create mode 100644 kube/services/gen3-discovery-ai/README.md

diff --git a/gen3/bin/kube-setup-gen3-discovery-ai.sh b/gen3/bin/kube-setup-gen3-discovery-ai.sh
index c8338afbc..5769a5f48 100644
--- a/gen3/bin/kube-setup-gen3-discovery-ai.sh
+++ b/gen3/bin/kube-setup-gen3-discovery-ai.sh
@@ -139,10 +139,8 @@ fi
 
 gen3_log_info "Setup complete, syncing configuration to bucket"
 
-if [ -d "$(dirname $(g3k_manifest_path))/gen3-discovery-ai/knowledge/chromadb" ]; then
-  bucketName="$( (gen3 secrets decode 'gen3-discovery-ai-g3auto' 'storage_config.json' || echo ERROR) | jq -r .bucket)" || exit 1
-  aws s3 sync "$(dirname $(g3k_manifest_path))/gen3-discovery-ai/knowledge/chromadb" "s3://$bucketName/chromadb"  --delete
-fi
+bucketName="$( (gen3 secrets decode 'gen3-discovery-ai-g3auto' 'storage_config.json' || echo ERROR) | jq -r .bucket)" || exit 1
+aws s3 sync "$(dirname $(g3k_manifest_path))/gen3-discovery-ai/knowledge/chromadb" "s3://$bucketName/chromadb" --delete
 
 gen3 roll gen3-discovery-ai
 g3kubectl apply -f "${GEN3_HOME}/kube/services/gen3-discovery-ai/gen3-discovery-ai-service.yaml"
diff --git a/kube/services/gen3-discovery-ai/README.md b/kube/services/gen3-discovery-ai/README.md
new file mode 100644
index 000000000..fb25d8201
--- /dev/null
+++ b/kube/services/gen3-discovery-ai/README.md
@@ -0,0 +1,45 @@
+# Gen3 Discovery AI
+
+## Populating Disk for In-Memory Vectordb Chromadb
+
+In order to set up pre-configured topics, we need to load a bunch of data
+into Chromadb (which is an in-memory vectordb with an option to persist to disk).
+
+To load topics consistently, we set up an S3 bucket to house the persisted
+vectordb.
+
+### Getting data into S3
+
+Run the service elsewhere, load the data, and persist it to disk. Then copy those
+files from disk onto the VM. For Chromadb loading, the files are expected to be
+in a `gen3-discovery-ai/knowledge/chromadb` folder relative to the directory
+that contains the `manifest.json`. For example:
+`~/cdis-manifest/avantol.planx-pla.net/gen3-discovery-ai/knowledge/chromadb`
+
+You can rsync from local if you've generated it locally.
+
+#### IMPORTANT: Use the same service image to generate the data locally as is used in the environment
+
+> IMPORTANT NOTE: There are some oddities with using the persist to disk across different OS's with different security packages.
+
+You should run the store knowledge commands that eventually create the persisted
+disk from within the SAME IMAGE that gets deployed. 
+
+One way to do this is as follows:
+
+* Use docker to build the image locally and run it with a volume mount
+* exec into the running container
+* run commands necessary to load the knowledge
+* check the location of the volume mount on your host system for the persisted data
+* rsync that data to the data commons (or check into cdis-manifest)
+
+See the Gen3 Discovery AI service repo README for more info.
+
+```
+rsync -re ssh --progress ~/repos/gen3-discovery-ai/knowledge/ avantol@cdistest_dev.csoc:~/cdis-manifest/avantol.planx-pla.net/gen3-discovery-ai/knowledge/chromadb
+```
+
+### Getting data from S3 into memory
+
+We specify a path for Chromadb to use for persisted data, and when it finds
+data there, it loads it into memory.
\ No newline at end of file