From d1e0132b3fdc733ba46237a193323cd30847f3d2 Mon Sep 17 00:00:00 2001
From: Radu Gheorghe
Date: Fri, 17 Jan 2025 14:06:59 +0200
Subject: [PATCH] logstash config and README for parts-purchase-demo

---
 examples/part-purchases-demo/README.md        | 22 ++++++++
 .../101/ch2/ecommerce/ext/logstash.conf       |  7 ++-
 examples/training-artifacts/101/ch3/README.md | 32 ++++++++++++
 .../training-artifacts/101/ch3/logstash.conf  | 51 +++++++++++++++++++
 .../part-purchase/validation-overrides.xml    |  4 +-
 5 files changed, 113 insertions(+), 3 deletions(-)
 create mode 100644 examples/training-artifacts/101/ch3/README.md
 create mode 100644 examples/training-artifacts/101/ch3/logstash.conf

diff --git a/examples/part-purchases-demo/README.md b/examples/part-purchases-demo/README.md
index 64cd47b5b..c0db23878 100644
--- a/examples/part-purchases-demo/README.md
+++ b/examples/part-purchases-demo/README.md
@@ -80,3 +80,25 @@ $ python3 ext/parts.py -f ext/purchase.csv | vespa feed -
 $ docker rm -f vespa
 
+
+----
+
+**Feed the data with Logstash from the CSV file**
+
+You can also feed the data with Logstash from the CSV file directly (no need to run `parts.py`). It may be useful if you want to feed your own data.
+
+You'll need to [install Logstash](https://www.elastic.co/downloads/logstash), then:
+
+1. Install the [Logstash Output Plugin for Vespa](https://github.com/vespa-engine/vespa/tree/master/integration/logstash-plugins/logstash-output-vespa) via:
+
+```
+bin/logstash-plugin install logstash-output-vespa_feed
+```
+
+2. Adapt [logstash.conf from training-artifacts/101/ch3](../training-artifacts/101/ch3/logstash.conf) to point to the absolute path of [purchase.csv](ext/purchase.csv).
+
+3. Run Logstash with the modified `logstash.conf`:
+
+```
+bin/logstash -f $PATH_TO_LOGSTASH_CONF/logstash.conf
+```
\ No newline at end of file
diff --git a/examples/training-artifacts/101/ch2/ecommerce/ext/logstash.conf b/examples/training-artifacts/101/ch2/ecommerce/ext/logstash.conf
index 486adad79..1a62f7f8d 100644
--- a/examples/training-artifacts/101/ch2/ecommerce/ext/logstash.conf
+++ b/examples/training-artifacts/101/ch2/ecommerce/ext/logstash.conf
@@ -28,6 +28,11 @@ filter {
     quote_char => '"'
   }
 
+  # strip leading and trailing whitespace from fields
+  mutate {
+    strip => ["ProductName", "ProductBrand", "Gender", "Price", "NumImages", "Description", "PrimaryColor"]
+  }
+
   if "_csvparsefailure" in [tags] {
     # we failed to parse this line, mark it so we don't feed it to Vespa
     mutate {
@@ -52,7 +57,7 @@ output {
       namespace => "product"
       document_type => "product"
       # use this field from the parsed CSV as the document ID
-      id_field => "%{ProductID}"
+      id_field => "ProductID"
     }
   }
 }
\ No newline at end of file
diff --git a/examples/training-artifacts/101/ch3/README.md b/examples/training-artifacts/101/ch3/README.md
new file mode 100644
index 000000000..68ec14e54
--- /dev/null
+++ b/examples/training-artifacts/101/ch3/README.md
@@ -0,0 +1,32 @@
+# Sample sales data application
+
+## Deploy the application to Vespa
+
+```
+cd part-purchase
+vespa deploy
+```
+
+## Feeding data
+The data is already converted to JSONL format in [sales-data.jsonl](sales-data.jsonl). So you can just feed it to Vespa:
+```
+vespa feed ../sales-data.jsonl
+```
+
+## Feeding data with Logstash from the CSV file
+
+You can also feed the data with Logstash from the CSV file. You'll need to [install Logstash](https://www.elastic.co/downloads/logstash), then:
+
+1. Install the [Logstash Output Plugin for Vespa](https://github.com/vespa-engine/vespa/tree/master/integration/logstash-plugins/logstash-output-vespa) via:
+
+```
+bin/logstash-plugin install logstash-output-vespa_feed
+```
+
+2. Change [logstash.conf](logstash.conf) to point to the absolute path of [sales-data.csv](sales-data.csv).
+
+3. Run Logstash with the modified `logstash.conf`:
+
+```
+bin/logstash -f $PATH_TO_LOGSTASH_CONF/logstash.conf
+```
diff --git a/examples/training-artifacts/101/ch3/logstash.conf b/examples/training-artifacts/101/ch3/logstash.conf
new file mode 100644
index 000000000..99cd7e612
--- /dev/null
+++ b/examples/training-artifacts/101/ch3/logstash.conf
@@ -0,0 +1,51 @@
+input {
+  file {
+    # TODO: change this to the absolute path of the CSV file
+    # path => "/PATH/TO/sample-apps/examples/training-artifacts/101/ch3/sales-data.csv"
+    # don't store the state of the file, so that Logstash will read the file from the beginning on each restart
+    sincedb_path => "/dev/null"
+    start_position => "beginning"
+  }
+}
+
+filter {
+  csv {
+    columns => ["date", "price", "tax", "item", "customer"]
+    # skip the first line, which contains the column names
+    skip_header => true
+    separator => ","
+    quote_char => '"'
+  }
+
+  # parse the date field (which looks like: 2024-09-18 08:00:00) into a UNIX timestamp
+  date {
+    match => ["date", "yyyy-MM-dd HH:mm:ss"]
+    # this will go to our default @timestamp field
+  }
+
+  mutate {
+    # add a new field with the date (from @timestamp) in UNIX timestamp format
+    add_field => { "date_unix" => "%{+%s}" }
+    # remove unnecessary fields (including the original "date" field)
+    remove_field => ["message", "@timestamp", "@version", "event", "log", "host", "date"]
+  }
+
+  mutate {
+    # rename the new "date_unix" field to "date"
+    rename => { "date_unix" => "date" }
+    # convert it to an integer
+    convert => { "date" => "integer" }
+  }
+}
+
+output {
+  # stdout { codec => rubydebug }
+
+  vespa_feed {
+    vespa_url => "http://localhost:8080"
+    namespace => "purchase"
+    document_type => "purchase"
+    # no need to specify an ID field, it will try to use the "id" field, which doesn't exist
+    # so a random UUID will be generated
+  }
+}
\ No newline at end of file
diff --git a/examples/training-artifacts/101/ch3/part-purchase/validation-overrides.xml b/examples/training-artifacts/101/ch3/part-purchase/validation-overrides.xml
index 1de0d5319..83ad4781e 100644
--- a/examples/training-artifacts/101/ch3/part-purchase/validation-overrides.xml
+++ b/examples/training-artifacts/101/ch3/part-purchase/validation-overrides.xml
@@ -1,4 +1,4 @@
 <validation-overrides>
-    <allow until="...">indexing-change</allow>
-    <allow until="...">schema-removal</allow>
+    <allow until="...">indexing-change</allow>
+    <allow until="...">schema-removal</allow>
 </validation-overrides>
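
Not part of the patch above, but a quick way to sanity-check the ch3 Logstash feed once it has run: a minimal sketch, assuming Vespa is reachable on localhost:8080 (as in the `vespa_feed` output settings) and that the `part-purchase` application defines a schema named `purchase` matching the configured `document_type`.

```
# visit a few of the documents Logstash fed into the "purchase" namespace/type
curl "http://localhost:8080/document/v1/purchase/purchase/docid?wantedDocumentCount=5"

# or run a YQL query with the Vespa CLI
vespa query 'select * from purchase where true'
```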