Merge pull request #1621 from vespa-engine/logstash-parts-purchase-demo
logstash config and README for parts-purchase-demo
kkraune authored Jan 17, 2025
2 parents 8a00708 + d1e0132 commit ea64b7f
Showing 5 changed files with 113 additions and 3 deletions.
22 changes: 22 additions & 0 deletions examples/part-purchases-demo/README.md
@@ -80,3 +80,25 @@ $ python3 ext/parts.py -f ext/purchase.csv | vespa feed -
<pre data-test="after">
$ docker rm -f vespa
</pre>

----

**Feed the data with Logstash from the CSV file**

You can also feed the data with Logstash directly from the CSV file (no need to run `parts.py`). This can be useful if you want to feed your own data.

You'll need to [install Logstash](https://www.elastic.co/downloads/logstash), then:

1. Install the [Logstash Output Plugin for Vespa](https://github.com/vespa-engine/vespa/tree/master/integration/logstash-plugins/logstash-output-vespa) via:

   ```
   bin/logstash-plugin install logstash-output-vespa_feed
   ```

2. Adapt [logstash.conf from training-artifacts/101/ch3](../training-artifacts/101/ch3/logstash.conf) so that its file input points to the absolute path of [purchase.csv](ext/purchase.csv); a sketch of the adapted input section follows below.

3. Run Logstash with the modified `logstash.conf`:

   ```
   bin/logstash -f $PATH_TO_LOGSTASH_CONF/logstash.conf
   ```
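
For step 2, the main change is the `path` setting of the `file` input. A minimal sketch of what the adapted input section could look like, assuming the repository is checked out under the placeholder path `/PATH/TO/sample-apps` (adjust to your environment):

```
input {
  file {
    # placeholder path: replace with the absolute path of purchase.csv in your checkout
    path => "/PATH/TO/sample-apps/examples/part-purchases-demo/ext/purchase.csv"
    # don't track read position, so the file is re-read from the beginning on each restart
    sincedb_path => "/dev/null"
    start_position => "beginning"
  }
}
```

The filter and output sections of that config should not need changes for this demo.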

@@ -28,6 +28,11 @@ filter {
     quote_char => '"'
   }
 
+  # strip leading and trailing whitespace from fields
+  mutate {
+    strip => ["ProductName", "ProductBrand", "Gender", "Price", "NumImages", "Description", "PrimaryColor"]
+  }
+
   if "_csvparsefailure" in [tags] {
     # we failed to parse this line, mark it so we don't feed it to Vespa
     mutate {
@@ -52,7 +57,7 @@ output {
     namespace => "product"
     document_type => "product"
     # use this field from the parsed CSV as the document ID
-    id_field => "%{ProductID}"
+    id_field => "ProductID"
   }
 }
32 changes: 32 additions & 0 deletions examples/training-artifacts/101/ch3/README.md
@@ -0,0 +1,32 @@
# Sample sales data application

## Deploy the application to Vespa

```
cd part-purchase
vespa deploy
```

## Feeding data
The data is already converted to JSONL format in [sales-data.jsonl](sales-data.jsonl), so you can feed it to Vespa directly:
```
vespa feed ../sales-data.jsonl
```
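
For reference, each line in the JSONL file is a Vespa feed operation. The sketch below shows the general shape of such a line; the field names match the CSV columns used in [logstash.conf](logstash.conf), and the values are made up for illustration:

```
{"put": "id:purchase:purchase::1", "fields": {"date": 1726646400, "price": 34, "tax": 1.24, "item": "Intake valve", "customer": "Smith"}}
```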

## Feeding data with Logstash from the CSV file

You can also feed the data with Logstash from the CSV file. You'll need to [install Logstash](https://www.elastic.co/downloads/logstash), then:

1. Install the [Logstash Output Plugin for Vespa](https://github.com/vespa-engine/vespa/tree/master/integration/logstash-plugins/logstash-output-vespa) via:

   ```
   bin/logstash-plugin install logstash-output-vespa_feed
   ```

2. Edit [logstash.conf](logstash.conf) so that the `file` input points to the absolute path of [sales-data.csv](sales-data.csv).

3. Run Logstash with the modified `logstash.conf` (a quick way to verify the feed follows below):

   ```
   bin/logstash -f $PATH_TO_LOGSTASH_CONF/logstash.conf
   ```
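
After Logstash finishes, one way to check that documents were fed is to run a simple query with the Vespa CLI (assuming the default local target used elsewhere in this README):

```
vespa query 'select * from purchase where true'
```

Alternatively, uncomment the `stdout { codec => rubydebug }` line in [logstash.conf](logstash.conf) to print each parsed event as it is fed.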
51 changes: 51 additions & 0 deletions examples/training-artifacts/101/ch3/logstash.conf
@@ -0,0 +1,51 @@
input {
  file {
    # TODO: change this to the absolute path of the CSV file
    # path => "/PATH/TO/sample-apps/examples/training-artifacts/101/ch3/sales-data.csv"
    # don't store the state of the file, so that Logstash will read the file from the beginning on each restart
    sincedb_path => "/dev/null"
    start_position => "beginning"
  }
}

filter {
  csv {
    columns => ["date", "price", "tax", "item", "customer"]
    # skip the first line, which contains the column names
    skip_header => true
    separator => ","
    quote_char => '"'
  }

  # parse the date field (which looks like: 2024-09-18 08:00:00) into a UNIX timestamp
  date {
    match => ["date", "yyyy-MM-dd HH:mm:ss"]
    # this will go to our default @timestamp field
  }

  mutate {
    # add a new field with the date (from @timestamp) in UNIX timestamp format
    add_field => { "date_unix" => "%{+%s}" }
    # remove unnecessary fields (including the original "date" field)
    remove_field => ["message", "@timestamp", "@version", "event", "log", "host", "date"]
  }

  mutate {
    # rename the new "date_unix" field to "date"
    rename => { "date_unix" => "date" }
    # convert it to an integer
    convert => { "date" => "integer" }
  }
}

output {
  # stdout { codec => rubydebug }

  vespa_feed {
    vespa_url => "http://localhost:8080"
    namespace => "purchase"
    document_type => "purchase"
    # no need to specify an ID field, it will try to use the "id" field, which doesn't exist
    # so a random UUID will be generated
  }
}
@@ -1,4 +1,4 @@
 <validation-overrides>
-    <allow until='2024-12-15'>indexing-change</allow>
-    <allow until='2024-12-15'>schema-removal</allow>
+    <allow until='2025-02-01'>indexing-change</allow>
+    <allow until='2025-02-01'>schema-removal</allow>
 </validation-overrides>
