From d1e0132b3fdc733ba46237a193323cd30847f3d2 Mon Sep 17 00:00:00 2001
From: Radu Gheorghe
Date: Fri, 17 Jan 2025 14:06:59 +0200
Subject: [PATCH] logstash config and README for parts-purchase-demo

---
 examples/part-purchases-demo/README.md        | 22 ++++++++
 .../101/ch2/ecommerce/ext/logstash.conf       |  7 ++-
 examples/training-artifacts/101/ch3/README.md | 32 ++++++++++++
 .../training-artifacts/101/ch3/logstash.conf  | 51 +++++++++++++++++++
 .../part-purchase/validation-overrides.xml    |  4 +-
 5 files changed, 113 insertions(+), 3 deletions(-)
 create mode 100644 examples/training-artifacts/101/ch3/README.md
 create mode 100644 examples/training-artifacts/101/ch3/logstash.conf

diff --git a/examples/part-purchases-demo/README.md b/examples/part-purchases-demo/README.md
index 64cd47b5b..c0db23878 100644
--- a/examples/part-purchases-demo/README.md
+++ b/examples/part-purchases-demo/README.md
@@ -80,3 +80,25 @@ $ python3 ext/parts.py -f ext/purchase.csv | vespa feed -
 $ docker rm -f vespa
 
+
+----
+
+**Feed the data with Logstash from the CSV file**
+
+You can also feed the data with Logstash from the CSV file directly (no need to run `parts.py`). It may be useful if you want to feed your own data.
+
+You'll need to [install Logstash](https://www.elastic.co/downloads/logstash), then:
+
+1. Install the [Logstash Output Plugin for Vespa](https://github.com/vespa-engine/vespa/tree/master/integration/logstash-plugins/logstash-output-vespa) via:
+
+```
+bin/logstash-plugin install logstash-output-vespa_feed
+```
+
+2. Adapt [logstash.conf from training-artifacts/101/ch3](../training-artifacts/101/ch3/logstash.conf) to point to the absolute path of [purchase.csv](ext/purchase.csv).
+
+3. Run Logstash with the modified `logstash.conf`:
+
+```
+bin/logstash -f $PATH_TO_LOGSTASH_CONF/logstash.conf
+```
\ No newline at end of file
diff --git a/examples/training-artifacts/101/ch2/ecommerce/ext/logstash.conf b/examples/training-artifacts/101/ch2/ecommerce/ext/logstash.conf
index 486adad79..1a62f7f8d 100644
--- a/examples/training-artifacts/101/ch2/ecommerce/ext/logstash.conf
+++ b/examples/training-artifacts/101/ch2/ecommerce/ext/logstash.conf
@@ -28,6 +28,11 @@ filter {
     quote_char => '"'
   }
 
+  # strip leading and trailing whitespace from fields
+  mutate {
+    strip => ["ProductName", "ProductBrand", "Gender", "Price", "NumImages", "Description", "PrimaryColor"]
+  }
+
   if "_csvparsefailure" in [tags] {
     # we failed to parse this line, mark it so we don't feed it to Vespa
     mutate {
@@ -52,7 +57,7 @@ output {
       namespace => "product"
       document_type => "product"
       # use this field from the parsed CSV as the document ID
-      id_field => "%{ProductID}"
+      id_field => "ProductID"
     }
   }
 }
\ No newline at end of file
diff --git a/examples/training-artifacts/101/ch3/README.md b/examples/training-artifacts/101/ch3/README.md
new file mode 100644
index 000000000..68ec14e54
--- /dev/null
+++ b/examples/training-artifacts/101/ch3/README.md
@@ -0,0 +1,32 @@
+# Sample sales data application
+
+## Deploy the application to Vespa
+
+```
+cd part-purchase
+vespa deploy
+```
+
+## Feeding data
+The data is already converted to JSONL format in [sales-data.jsonl](sales-data.jsonl). So you can just feed it to Vespa:
+```
+vespa feed ../sales-data.jsonl
+```
+
+## Feeding data with Logstash from the CSV file
+
+You can also feed the data with Logstash from the CSV file. You'll need to [install Logstash](https://www.elastic.co/downloads/logstash), then:
+
+1. Install the [Logstash Output Plugin for Vespa](https://github.com/vespa-engine/vespa/tree/master/integration/logstash-plugins/logstash-output-vespa) via:
+
+```
+bin/logstash-plugin install logstash-output-vespa_feed
+```
+
+2. Change [logstash.conf](logstash.conf) to point to the absolute path of [sales-data.csv](sales-data.csv).
+
+3. Run Logstash with the modified `logstash.conf`:
+
+```
+bin/logstash -f $PATH_TO_LOGSTASH_CONF/logstash.conf
+```
diff --git a/examples/training-artifacts/101/ch3/logstash.conf b/examples/training-artifacts/101/ch3/logstash.conf
new file mode 100644
index 000000000..99cd7e612
--- /dev/null
+++ b/examples/training-artifacts/101/ch3/logstash.conf
@@ -0,0 +1,51 @@
+input {
+  file {
+    # TODO: change this to the absolute path of the CSV file
+    # path => "/PATH/TO/sample-apps/examples/training-artifacts/101/ch3/sales-data.csv"
+    # don't store the state of the file, so that Logstash will read the file from the beginning on each restart
+    sincedb_path => "/dev/null"
+    start_position => "beginning"
+  }
+}
+
+filter {
+  csv {
+    columns => ["date", "price", "tax", "item", "customer"]
+    # skip the first line, which contains the column names
+    skip_header => true
+    separator => ","
+    quote_char => '"'
+  }
+
+  # parse the date field (which looks like: 2024-09-18 08:00:00) into a UNIX timestamp
+  date {
+    match => ["date", "yyyy-MM-dd HH:mm:ss"]
+    # this will go to our default @timestamp field
+  }
+
+  mutate {
+    # add a new field with the date (from @timestamp) in UNIX timestamp format
+    add_field => { "date_unix" => "%{+%s}" }
+    # remove unnecessary fields (including the original "date" field)
+    remove_field => ["message", "@timestamp", "@version", "event", "log", "host", "date"]
+  }
+
+  mutate {
+    # rename the new "date_unix" field to "date"
+    rename => { "date_unix" => "date" }
+    # convert it to an integer
+    convert => { "date" => "integer" }
+  }
+}
+
+output {
+  # stdout { codec => rubydebug }
+
+  vespa_feed {
+    vespa_url => "http://localhost:8080"
+    namespace => "purchase"
+    document_type => "purchase"
+    # no need to specify an ID field, it will try to use the "id" field, which doesn't exist
+    # so a random UUID will be generated
+  }
+}
\ No newline at end of file
diff --git a/examples/training-artifacts/101/ch3/part-purchase/validation-overrides.xml b/examples/training-artifacts/101/ch3/part-purchase/validation-overrides.xml
index 1de0d5319..83ad4781e 100644
--- a/examples/training-artifacts/101/ch3/part-purchase/validation-overrides.xml
+++ b/examples/training-artifacts/101/ch3/part-purchase/validation-overrides.xml
@@ -1,4 +1,4 @@
 <validation-overrides>
-    <allow until="...">indexing-change</allow>
-    <allow until="...">schema-removal</allow>
+    <allow until="...">indexing-change</allow>
+    <allow until="...">schema-removal</allow>
 </validation-overrides>
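
Not part of the patch above, but a quick way to sanity-check the ch3 Logstash feed once it has run: a minimal sketch, assuming Vespa is reachable on localhost:8080 (as in the `vespa_feed` output settings) and that the `part-purchase` application defines a schema named `purchase` matching the configured `document_type`.

```
# visit a few of the documents Logstash fed into the "purchase" namespace/type
curl "http://localhost:8080/document/v1/purchase/purchase/docid?wantedDocumentCount=5"

# or run a YQL query with the Vespa CLI
vespa query 'select * from purchase where true'
```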