diff --git a/docker/airflow/requirements.txt b/docker/airflow/requirements.txt index 5ae4ed5..73175af 100644 --- a/docker/airflow/requirements.txt +++ b/docker/airflow/requirements.txt @@ -1,11 +1,9 @@ -c https://raw.githubusercontent.com/apache/airflow/constraints-2.9.1/constraints-3.11.txt +apache-airflow-providers-amazon==8.20.0 apache-airflow-providers-apache-spark==4.7.2 apache-airflow-providers-slack==8.6.2 -plyvel==1.5.1 - -# duckdb==0.10.2 -# polars==0.20.26 -# pyspark==3.5.1 -# apache-airflow-providers-slack -# deltalake -# delta-spark +delta-spark==3.2.0 +deltalake==0.17.3 +duckdb==0.10.2 +polars==0.20.31 +pyspark==3.5.1 diff --git a/docker/jupyter/Dockerfile b/docker/jupyter/Dockerfile index 59ee52a..f7caa05 100644 --- a/docker/jupyter/Dockerfile +++ b/docker/jupyter/Dockerfile @@ -15,4 +15,5 @@ COPY ./requirements.txt /tmp/requirements.txt RUN export DOCKER_BUILDKIT=1 -RUN --mount=type=cache,target=/opt/conda/pkgs conda install --quiet --yes --file /tmp/requirements.txt +RUN conda install -y -c conda-forge mamba +RUN mamba install -y --file /tmp/requirements.txt diff --git a/docker/jupyter/requirements.txt b/docker/jupyter/requirements.txt index 753126d..2b04aea 100644 --- a/docker/jupyter/requirements.txt +++ b/docker/jupyter/requirements.txt @@ -1,11 +1,11 @@ -delta-spark==3.0.0 -deltalake==0.17.4 +delta-spark +deltalake # duckdb_engine==0.12.0 -grpcio==1.62.1 -grpcio-status==1.62.1 -jupyterlab==4.1.6 +grpcio +grpcio-status +jupyterlab==4.2.2 kafka-python==2.0.2 pandas==2.2.2 -polars==0.20.26 -protobuf==4.25.3 +polars==0.20.31 +protobuf pyspark==3.5.1 diff --git a/notebooks/spark.ipynb b/notebooks/spark.ipynb index f69dc0c..599df0d 100644 --- a/notebooks/spark.ipynb +++ b/notebooks/spark.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "f88878dc", "metadata": {}, "outputs": [], @@ -20,10 +20,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "c15aad9d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "24/06/14 18:21:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" + ] + } + ], "source": [ "# Add postgres jar\n", "spark = (\n", @@ -46,10 +56,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "4640f260-f0a9-40e9-855c-4ffa7a744dff", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- table_catalog: string (nullable = true)\n", + " |-- table_schema: string (nullable = true)\n", + " |-- table_name: string (nullable = true)\n", + " |-- table_type: string (nullable = true)\n", + " |-- self_referencing_column_name: string (nullable = true)\n", + " |-- reference_generation: string (nullable = true)\n", + " |-- user_defined_type_catalog: string (nullable = true)\n", + " |-- user_defined_type_schema: string (nullable = true)\n", + " |-- user_defined_type_name: string (nullable = true)\n", + " |-- is_insertable_into: string (nullable = true)\n", + " |-- is_typed: string (nullable = true)\n", + " |-- commit_action: string (nullable = true)\n", + "\n" + ] + } + ], "source": [ "df = (\n", " spark.read.format(\"jdbc\")\n",