diff --git a/.env.template b/.env.template index e15d7b8..ca28693 100644 --- a/.env.template +++ b/.env.template @@ -32,6 +32,8 @@ AIRFLOW_CONFIG='/opt/airflow/config/airflow.cfg' # Airflow connections AIRFLOW_CONN_SPARK_DEFAULT='spark://spark-master:7077?deploy_mode=client' +AIRFLOW_CONN_MONGO_DEFAULT='mongo://mongo:27017/%3FauthSource%3Dadmin' +AIRFLOW_CONN_HTTP_DEFAULT='' # Airflow slack integration AIRFLOW_CONN_SLACK_API_DEFAULT='slack://:@/?timeout=42' diff --git a/.gitignore b/.gitignore index a33154d..9ba2d12 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ __pycache__ data/ logs/ +notebooks/.bash_history notebooks/.cache notebooks/.conda notebooks/.ipynb_checkpoints diff --git a/README.md b/README.md index ab09674..614d5ac 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ - [Kafka consumer](#kafka-consumer) - [Airflow](#airflow) - [Slack integration](#slack-integration) +- [Mongo](#mongo) @@ -157,3 +158,5 @@ update their values where necessary. You need to create a Slack app and setup `AIRFLOW_CONN_SLACK_API_DEFAULT` env variable with Slack api key. If you don't want to use this integration, remove the `AIRFLOW_CONN_SLACK_API_DEFAULT` variable from your `.env` file. + +## Mongo diff --git a/airflow/config/airflow.cfg b/airflow/config/airflow.cfg index 3dc8644..1b81d76 100644 --- a/airflow/config/airflow.cfg +++ b/airflow/config/airflow.cfg @@ -639,7 +639,7 @@ encrypt_s3_logs = False # # Variable: AIRFLOW__LOGGING__LOGGING_LEVEL # -logging_level = DEBUG +logging_level = INFO # Logging level for celery. 
"""Airflow DAG that copies Art Institute of Chicago artworks into MongoDB.

The DAG has a single task which pages through the public artworks API and
upserts every artwork into the ``artic.art`` collection, keyed on the API's
own ``id`` field so that re-running the DAG refreshes documents in place.
"""

import logging
from datetime import datetime

import pymongo
import requests
from airflow.decorators import dag
from airflow.operators.python import PythonOperator
from airflow.providers.mongo.hooks.mongo import MongoHook
from pymongo import MongoClient, UpdateOne
from pymongo.collection import Collection
from pymongo.database import Database

logger = logging.getLogger(__name__)

# First page of the artworks listing; later pages come from the API's
# ``pagination.next_url`` field.
ARTWORKS_ENDPOINT = "https://api.artic.edu/api/v1/artworks?limit=100"
MONGO_CONN_ID = "mongo_default"
REQUEST_TIMEOUT_SECONDS = 10


@dag(
    dag_id="artic",
    schedule=None,
    start_date=datetime(2024, 6, 21),
    catchup=False,
)
def artic() -> None:
    """Declare the manually triggered ``artic`` DAG.

    The body is intentionally empty: the DAG's only task is attached at
    module level (see the ``PythonOperator`` at the bottom of this file).
    """


def get_art_data_and_write_to_mongo(
    endpoint: str = ARTWORKS_ENDPOINT,
    mongo_conn_id: str = MONGO_CONN_ID,
) -> None:
    """Page through the artworks API and upsert every artwork into MongoDB.

    Args:
        endpoint: URL of the first page to fetch.
        mongo_conn_id: Airflow connection id used to reach MongoDB.

    Raises:
        requests.HTTPError: If any page request returns a 4xx/5xx status.
            Failing loudly makes the task fail (and retry) instead of
            reporting success after a partial load.
    """
    # The hook is a context manager; leaving the block closes the client.
    with MongoHook(mongo_conn_id=mongo_conn_id) as hook:
        client: MongoClient = hook.get_conn()
        db: Database = client.get_database("artic")
        collection: Collection = db.get_collection("art")

        # Unique index on the API id backs the upsert filter below.
        collection.create_index([("id", pymongo.ASCENDING)], unique=True)

        page = 1
        pieces = 0
        # One session for the whole crawl so the HTTP connection is reused.
        with requests.Session() as session:
            while True:
                logger.info("working on page %s", page)
                resp = session.get(endpoint, timeout=REQUEST_TIMEOUT_SECONDS)
                resp.raise_for_status()
                resp_json = resp.json()

                # ``or []`` also covers an explicit ``"data": null``.
                data = resp_json.get("data") or []
                if not data:
                    break

                pieces += len(data)

                # Upsert to mongo
                ops = [
                    UpdateOne({"id": piece["id"]}, {"$set": piece}, upsert=True)
                    for piece in data
                ]
                collection.bulk_write(ops)

                pagination = resp_json.get("pagination") or {}
                endpoint = pagination.get("next_url")
                if not endpoint:
                    break

                page += 1

    logger.info("Finished. Made %s API calls. Upserted %s.", page, pieces)


# NOTE: this rebinds the module-level name ``dag`` from the decorator to the
# DAG instance. It is safe only because the decorator is not used again below.
dag = artic()

get_art_data = PythonOperator(
    task_id="get_art_data", dag=dag, python_callable=get_art_data_and_write_to_mongo
)
- networks: - - backend depends_on: db: condition: service_healthy @@ -52,6 +55,7 @@ services: - .env trino: + <<: *common container_name: trino ports: - "8080:8080" @@ -61,10 +65,9 @@ services: - ./docker/trino/catalog:/etc/trino/catalog environment: - CATALOG_MANAGEMENT=dynamic - networks: - - backend spark-master: + <<: *common container_name: spark-master build: ./docker/spark ports: @@ -80,9 +83,10 @@ services: environment: - SPARK_LOCAL_IP=spark-master - SPARK_WORKLOAD=master - networks: - - backend + profiles: + - spark spark-worker-a: + <<: *common build: ./docker/spark container_name: spark-worker-a deploy: @@ -102,9 +106,10 @@ services: volumes: - ./spark-apps:/opt/spark-apps - ./data/spark-data:/opt/spark-data - networks: - - backend + profiles: + - spark spark-worker-b: + <<: *common build: ./docker/spark container_name: spark-worker-b deploy: @@ -122,10 +127,11 @@ services: volumes: - ./spark-apps:/opt/spark-apps - ./data/spark-data:/opt/spark-data - networks: - - backend + profiles: + - spark jupyterlab: + <<: *common container_name: jupyterlab build: ./docker/jupyter environment: @@ -137,10 +143,9 @@ services: - .env ports: - "8089:8089" - networks: - - backend scylla-1: + <<: *common build: ./docker/scylladb container_name: scylla-1 restart: always @@ -151,10 +156,11 @@ services: - ./docker/scylladb/cassandra-rackdc.properties.dc1:/etc/scylla/cassandra-rackdc.properties ports: - "19042:19042" - networks: - - backend + profiles: + - scylla scylla-2: + <<: *common build: ./docker/scylladb container_name: scylla-2 restart: always @@ -163,10 +169,11 @@ services: - ./docker/scylladb/scylla.yaml:/etc/scylla/scylla.yaml - ./docker/scylladb/cassandra-rackdc.properties.dc1:/etc/scylla/cassandra-rackdc.properties command: --seeds=scylla-1,scylla-2,scylla-3 --smp 1 --memory 750M --overprovisioned 1 - networks: - - backend + profiles: + - scylla scylla-3: + <<: *common build: ./docker/scylladb container_name: scylla-3 restart: always @@ -175,22 +182,24 @@ 
services:
  # MongoDB instance used by the Airflow `artic` DAG and the Jupyter notebooks.
  mongo:
    <<: *common
    container_name: mongo
    build: ./docker/mongo
    ports:
      - "27017:27017"
    volumes:
      - ./data/mongo:/data/db
    healthcheck:
      # MongoDB 6.0+ images no longer ship the legacy `mongo` shell, so the
      # probe must use `mongosh`; with `mongo` it always fails and the
      # container is reported unhealthy.
      test: ["CMD-SHELL", "mongosh --quiet --eval 'db.adminCommand(\"ping\")'"]
      interval: 10s
      timeout: 5s
      retries: 5
    restart: always
openjdk-8-jdk-headless wget +RUN apt-get update && apt-get install -y openjdk-8-jdk-headless wget telnet netcat RUN mkdir -p /opt/spark/jars RUN wget -P /opt/spark/jars https://jdbc.postgresql.org/download/postgresql-42.7.3.jar diff --git a/docker/jupyter/requirements.txt b/docker/jupyter/requirements.txt index 2b04aea..94594e6 100644 --- a/docker/jupyter/requirements.txt +++ b/docker/jupyter/requirements.txt @@ -1,5 +1,6 @@ delta-spark deltalake +duckdb==1.0.0 # duckdb_engine==0.12.0 grpcio grpcio-status @@ -8,4 +9,5 @@ kafka-python==2.0.2 pandas==2.2.2 polars==0.20.31 protobuf +pymongo==4.7.3 pyspark==3.5.1 diff --git a/docker/mongo/Dockerfile b/docker/mongo/Dockerfile new file mode 100644 index 0000000..4bf8cac --- /dev/null +++ b/docker/mongo/Dockerfile @@ -0,0 +1 @@ +FROM mongodb/mongodb-community-server:7.0.9-ubuntu2204 diff --git a/notebooks/kafka_consumer.ipynb b/notebooks/kafka_consumer.ipynb index 2bb06b6..30d631b 100644 --- a/notebooks/kafka_consumer.ipynb +++ b/notebooks/kafka_consumer.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 7, "id": "aeebe5b1-a60d-47f0-be56-e3c83f3a8182", "metadata": {}, "outputs": [], @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 8, "id": "058fbdd6-05e7-4ed4-92b9-2a7dd1fd3d8f", "metadata": {}, "outputs": [], @@ -22,23 +22,46 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "id": "80a9becf-73fe-42a2-a5f9-f36e24c41314", "metadata": {}, "outputs": [], "source": [ - "consumer = KafkaConsumer(topic, bootstrap_servers=\"docker-kafka-1:9092\")" + "consumer = KafkaConsumer(topic, bootstrap_servers=\"kafka:9092\", max_poll_records=10)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "5e7ba6ec-96eb-4db1-ab6b-6435d3a5906a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "b'message 0' ConsumerRecord(topic='test', 
partition=0, offset=51, timestamp=1719085894135, timestamp_type=0, key=None, value=b'message 0', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=9, serialized_header_size=-1)\n", + "b'message 1' ConsumerRecord(topic='test', partition=0, offset=52, timestamp=1719085894136, timestamp_type=0, key=None, value=b'message 1', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=9, serialized_header_size=-1)\n", + "b'message 2' ConsumerRecord(topic='test', partition=0, offset=53, timestamp=1719085894137, timestamp_type=0, key=None, value=b'message 2', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=9, serialized_header_size=-1)\n", + "b'message 3' ConsumerRecord(topic='test', partition=0, offset=54, timestamp=1719085894137, timestamp_type=0, key=None, value=b'message 3', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=9, serialized_header_size=-1)\n", + "b'message 4' ConsumerRecord(topic='test', partition=0, offset=55, timestamp=1719085894137, timestamp_type=0, key=None, value=b'message 4', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=9, serialized_header_size=-1)\n", + "b'message 5' ConsumerRecord(topic='test', partition=0, offset=56, timestamp=1719085894137, timestamp_type=0, key=None, value=b'message 5', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=9, serialized_header_size=-1)\n", + "b'message 6' ConsumerRecord(topic='test', partition=0, offset=57, timestamp=1719085894137, timestamp_type=0, key=None, value=b'message 6', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=9, serialized_header_size=-1)\n", + "b'message 7' ConsumerRecord(topic='test', partition=0, offset=58, timestamp=1719085894137, timestamp_type=0, key=None, value=b'message 7', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=9, serialized_header_size=-1)\n", + "b'message 8' ConsumerRecord(topic='test', 
partition=0, offset=59, timestamp=1719085894137, timestamp_type=0, key=None, value=b'message 8', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=9, serialized_header_size=-1)\n", + "b'message 9' ConsumerRecord(topic='test', partition=0, offset=60, timestamp=1719085894137, timestamp_type=0, key=None, value=b'message 9', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=9, serialized_header_size=-1)\n", + "Consumer interrupted, shutting down...\n" + ] + } + ], "source": [ - "for msg in consumer:\n", - " print(msg.value, msg)" + "try:\n", + " for msg in consumer:\n", + " print(msg.value, msg)\n", + "except KeyboardInterrupt:\n", + " print(\"Consumer interrupted, shutting down...\")\n", + "finally:\n", + " consumer.close()" ] }, { diff --git a/notebooks/kafka_producer.ipynb b/notebooks/kafka_producer.ipynb index 77629b4..a824f84 100644 --- a/notebooks/kafka_producer.ipynb +++ b/notebooks/kafka_producer.ipynb @@ -2,39 +2,42 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 8, "id": "1bbed396-de52-47be-982b-0c6528115491", "metadata": {}, + "outputs": [], "source": [ "from kafka import KafkaProducer" - ], - "outputs": [] + ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 9, "id": "f62fb20d-8fc1-477d-9a99-2158bb3a5a1b", "metadata": {}, + "outputs": [], "source": [ "topic = \"test\"" - ], - "outputs": [] + ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "id": "9d4d0c7b-1f4b-40c8-ac3a-44cbf107b7d7", "metadata": {}, + "outputs": [], "source": [ - "producer = KafkaProducer(bootstrap_servers=\"docker-kafka-1:9092\", acks=1)" - ], - "outputs": [] + "producer = KafkaProducer(\n", + " bootstrap_servers=\"kafka:9092\", acks=1, value_serializer=lambda v: v.encode(\"utf-8\")\n", + ")" + ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "id": "cd65e181-0584-40c8-9259-99a9fadafeb6", "metadata": {}, + "outputs": [], "source": [ 
"def on_send_success(record_metadata):\n", " print(\n", @@ -44,32 +47,48 @@ "\n", "def on_send_error(excp):\n", " print(excp)" - ], - "outputs": [] + ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "id": "76cfb95a-d514-4fb5-86c5-e1e92676ad08", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "record_metadata.topic: test | record_metadata.partition: 0 | record_metadata.offset: 51\n", + "record_metadata.topic: test | record_metadata.partition: 0 | record_metadata.offset: 52\n", + "record_metadata.topic: test | record_metadata.partition: 0 | record_metadata.offset: 53\n", + "record_metadata.topic: test | record_metadata.partition: 0 | record_metadata.offset: 54\n", + "record_metadata.topic: test | record_metadata.partition: 0 | record_metadata.offset: 55\n", + "record_metadata.topic: test | record_metadata.partition: 0 | record_metadata.offset: 56\n", + "record_metadata.topic: test | record_metadata.partition: 0 | record_metadata.offset: 57\n", + "record_metadata.topic: test | record_metadata.partition: 0 | record_metadata.offset: 58\n", + "record_metadata.topic: test | record_metadata.partition: 0 | record_metadata.offset: 59\n", + "record_metadata.topic: test | record_metadata.partition: 0 | record_metadata.offset: 60\n" + ] + } + ], "source": [ "for i in range(10):\n", " message = f\"message {i}\"\n", " future = (\n", - " producer.send(topic, bytes(message, \"utf-8\"))\n", + " producer.send(topic, message)\n", " .add_callback(on_send_success)\n", " .add_errback(on_send_error)\n", " )" - ], - "outputs": [] + ] }, { "cell_type": "code", "execution_count": null, - "id": "899a4079-fead-42ac-9e3e-814889ebec4e", + "id": "8e0d47e9-63fc-4545-95b7-ac936f177e04", "metadata": {}, - "source": [], - "outputs": [] + "outputs": [], + "source": [] } ], "metadata": { diff --git a/notebooks/mongo.ipynb b/notebooks/mongo.ipynb new file mode 100644 index 0000000..a0f8ae6 --- /dev/null +++ 
b/notebooks/mongo.ipynb @@ -0,0 +1,643 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 15, + "id": "f6d1dfe6-20e3-4d33-b61f-33f65a2aa9c3", + "metadata": {}, + "outputs": [], + "source": [ + "from pymongo import MongoClient\n", + "from bson.objectid import ObjectId\n", + "from bson.son import SON" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c3f01060-c852-4b93-b459-dd7f0640298d", + "metadata": {}, + "outputs": [], + "source": [ + "client = MongoClient(\"mongo\", 27017, serverSelectionTimeoutMS=5000)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "dc83f46e-6e9a-4d58-b0e6-6f2708dece0f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['admin', 'artic', 'config', 'local', 'test_db']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.list_database_names()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e81b1f62-acc1-4aad-8ba8-e07ff219aa3f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "d6e6af94-e680-45f4-a58f-bb50860ea8ac", + "metadata": {}, + "source": [ + "# Art collection" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8c67011f-b806-4838-957a-e5a9676e738b", + "metadata": {}, + "outputs": [], + "source": [ + "artic = client.artic\n", + "art = artic.art" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f81fd439-eeec-48e8-945c-343830f525d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "125906" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "art.count_documents({})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa9be08e-0a3d-4798-8612-dc3773539f6f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": 
"7ad69081-21d4-4af6-b0e6-748f6afbec32", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'_id': {'artist_id': 34946, 'artist_title': 'Utagawa Hiroshige'},\n", + " 'count': 1593},\n", + " {'_id': {'artist_id': 110752, 'artist_title': 'Unknown'}, 'count': 1154},\n", + " {'_id': {'artist_id': 51349, 'artist_title': 'Ancient Roman'}, 'count': 1138},\n", + " {'_id': {'artist_id': 37279, 'artist_title': 'James McNeill Whistler'},\n", + " 'count': 1096},\n", + " {'_id': {'artist_id': 37541, 'artist_title': 'Ancient Egyptian'},\n", + " 'count': 1048},\n", + " {'_id': {'artist_id': 117550, 'artist_title': 'Unknown Maker'}, 'count': 982},\n", + " {'_id': {'artist_id': 35139, 'artist_title': 'Jasper Johns'}, 'count': 841},\n", + " {'_id': {'artist_id': 2601, 'artist_title': 'Ancient Greek'}, 'count': 685},\n", + " {'_id': {'artist_id': 40517, 'artist_title': 'Honoré-Victorin Daumier'},\n", + " 'count': 673},\n", + " {'_id': {'artist_id': 35220, 'artist_title': 'Kawase Hasui'}, 'count': 609}]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline = [\n", + " {\"$match\": {\"artist_title\": {\"$ne\": None}}},\n", + " {\n", + " \"$group\": {\n", + " \"_id\": {\"artist_id\": \"$artist_id\", \"artist_title\": \"$artist_title\"},\n", + " \"count\": {\"$sum\": 1},\n", + " }\n", + " },\n", + " {\"$sort\": SON([(\"count\", -1)])},\n", + "]\n", + "\n", + "results = [i for i in art.aggregate(pipeline)]\n", + "results[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "a0c926e0-1b8b-4111-b8ee-ba307cbafead", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "13645" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(results)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "cc0dff4f-2c48-44a9-948a-3e7d503d0198", + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "{'_id': ObjectId('6676c556d2a1e14374739997'),\n", + " 'id': 21977,\n", + " 'alt_artist_ids': [],\n", + " 'alt_classification_ids': ['TM-279', 'TM-27', 'TM-26', 'TM-46'],\n", + " 'alt_image_ids': [],\n", + " 'alt_material_ids': [],\n", + " 'alt_style_ids': [],\n", + " 'alt_subject_ids': [],\n", + " 'alt_technique_ids': [],\n", + " 'alt_titles': None,\n", + " 'api_link': 'https://api.artic.edu/api/v1/artworks/21977',\n", + " 'api_model': 'artworks',\n", + " 'artist_display': 'Korea',\n", + " 'artist_id': None,\n", + " 'artist_ids': [],\n", + " 'artist_title': None,\n", + " 'artist_titles': [],\n", + " 'artwork_type_id': 23,\n", + " 'artwork_type_title': 'Vessel',\n", + " 'boost_rank': None,\n", + " 'catalogue_display': None,\n", + " 'category_ids': ['PC-7'],\n", + " 'category_titles': ['Arts of Asia'],\n", + " 'classification_id': 'TM-11952',\n", + " 'classification_ids': ['TM-11952', 'TM-279', 'TM-27', 'TM-26', 'TM-46'],\n", + " 'classification_title': 'ewer (vessel)',\n", + " 'classification_titles': ['ewer (vessel)',\n", + " 'stoneware',\n", + " 'vessel',\n", + " 'asian art',\n", + " 'ceramics'],\n", + " 'color': {'h': 37,\n", + " 'l': 68,\n", + " 's': 77,\n", + " 'percentage': 0.0002831120812418428,\n", + " 'population': 2},\n", + " 'colorfulness': 8.4987,\n", + " 'copyright_notice': None,\n", + " 'credit_line': 'Bequest of Russell Tyson',\n", + " 'date_display': 'Goryeo dynasty (918–1392), 12th century',\n", + " 'date_end': 1200,\n", + " 'date_qualifier_id': None,\n", + " 'date_qualifier_title': '',\n", + " 'date_start': 1101,\n", + " 'department_id': 'PC-7',\n", + " 'department_title': 'Arts of Asia',\n", + " 'description': None,\n", + " 'dimensions': '18.5 × 19.6 × 15.7 cm (7 1/4 × 7 3/4 × 6 3/16 in.)',\n", + " 'dimensions_detail': [{'depth': 15,\n", + " 'width': 19,\n", + " 'height': 18,\n", + " 'diameter': None,\n", + " 'clarification': None}],\n", + " 'document_ids': [],\n", + " 'edition': None,\n", + " 'exhibition_history': None,\n", + " 
'fiscal_year': None,\n", + " 'fiscal_year_deaccession': None,\n", + " 'gallery_id': None,\n", + " 'gallery_title': None,\n", + " 'has_advanced_imaging': False,\n", + " 'has_educational_resources': False,\n", + " 'has_multimedia_resources': False,\n", + " 'has_not_been_viewed_much': False,\n", + " 'image_id': '968e49fe-a6a4-bf2a-ce6e-948fa31f13ac',\n", + " 'inscriptions': None,\n", + " 'internal_department_id': 8,\n", + " 'is_boosted': False,\n", + " 'is_on_view': False,\n", + " 'is_public_domain': True,\n", + " 'is_zoomable': True,\n", + " 'latitude': None,\n", + " 'latlon': None,\n", + " 'longitude': None,\n", + " 'main_reference_number': '1964.972',\n", + " 'material_id': 'TM-2484',\n", + " 'material_ids': ['TM-2484'],\n", + " 'material_titles': ['stoneware'],\n", + " 'max_zoom_window_size': -1,\n", + " 'medium_display': 'Stoneware with celadon glaze and underglaze incised decoration',\n", + " 'nomisma_id': None,\n", + " 'on_loan_display': None,\n", + " 'place_of_origin': 'Korea',\n", + " 'provenance_text': None,\n", + " 'publication_history': None,\n", + " 'publishing_verification_level': 'Web Basic',\n", + " 'section_ids': [],\n", + " 'section_titles': [],\n", + " 'short_description': None,\n", + " 'site_ids': [],\n", + " 'sound_ids': [],\n", + " 'source_updated_at': '2024-06-22T00:30:09-05:00',\n", + " 'style_id': 'TM-5182',\n", + " 'style_ids': ['TM-5182'],\n", + " 'style_title': 'Korean (culture or style)',\n", + " 'style_titles': ['Korean (culture or style)'],\n", + " 'subject_id': None,\n", + " 'subject_ids': [],\n", + " 'subject_titles': [],\n", + " 'suggest_autocomplete_all': [{'input': ['1964.972'],\n", + " 'contexts': {'groupings': ['accession']}},\n", + " {'input': ['Melon-Shaped Ewer with Stylized Flowers'],\n", + " 'weight': 336,\n", + " 'contexts': {'groupings': ['title']}}],\n", + " 'technique_id': None,\n", + " 'technique_ids': [],\n", + " 'technique_titles': [],\n", + " 'term_titles': ['ewer (vessel)',\n", + " 'stoneware',\n", + " 'Korean 
(culture or style)',\n", + " 'stoneware',\n", + " 'vessel',\n", + " 'asian art',\n", + " 'ceramics'],\n", + " 'text_ids': [],\n", + " 'theme_titles': [],\n", + " 'thumbnail': {'lqip': 'data:image/gif;base64,R0lGODlhBQAFAPQAAKWnorG0qL28tsPGrMbFyNHSztPUz9rez83N1dbV19bW29PT3tbW3tnZ397g1ePn0eDg3tra4tra49vb5ODg5+Pi6Obk6+Ti7Obl7QAAAAAAAAAAAAAAAAAAAAAAAAAAACH5BAAAAAAALAAAAAAFAAUAAAUVYHRZ2NRATqEwxxMgkjEAS5UIBBUCADs=',\n", + " 'width': 2265,\n", + " 'height': 2250,\n", + " 'alt_text': 'A work made of stoneware with celadon glaze and underglaze incised decoration.'},\n", + " 'timestamp': '2024-06-22T14:00:12-05:00',\n", + " 'title': 'Melon-Shaped Ewer with Stylized Flowers',\n", + " 'updated_at': '2024-06-22T00:31:00-05:00',\n", + " 'video_ids': []}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "art.find_one({\"artist_id\": None})" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6935d8a9-ea20-40b1-8437-cd8ddc44f37c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'_id': ObjectId('6676c556d2a1e14374739994'),\n", + " 'id': 13527,\n", + " 'alt_artist_ids': [],\n", + " 'alt_classification_ids': ['TM-17', 'TM-4'],\n", + " 'alt_image_ids': [],\n", + " 'alt_material_ids': [],\n", + " 'alt_style_ids': [],\n", + " 'alt_subject_ids': ['TM-10118'],\n", + " 'alt_technique_ids': [],\n", + " 'alt_titles': None,\n", + " 'api_link': 'https://api.artic.edu/api/v1/artworks/13527',\n", + " 'api_model': 'artworks',\n", + " 'artist_display': 'Mary Cassatt (American, 1844-1926)\\nprinted with Leroy (French, active 1876-1900)',\n", + " 'artist_id': 33890,\n", + " 'artist_ids': [33890],\n", + " 'artist_title': 'Mary Cassatt',\n", + " 'artist_titles': ['Mary Cassatt'],\n", + " 'artwork_type_id': 18,\n", + " 'artwork_type_title': 'Print',\n", + " 'boost_rank': None,\n", + " 'catalogue_display': '

Mathews & Shapiro 5 17/17

Breeskin 143 11/11

',\n", + " 'category_ids': ['PC-13', 'PC-825'],\n", + " 'category_titles': ['Prints and Drawings', 'Women artists'],\n", + " 'classification_id': 'TM-154',\n", + " 'classification_ids': ['TM-154', 'TM-17', 'TM-4'],\n", + " 'classification_title': 'drypoint',\n", + " 'classification_titles': ['drypoint', 'print', 'prints and drawing'],\n", + " 'color': {'h': 50,\n", + " 'l': 61,\n", + " 's': 47,\n", + " 'percentage': 0.0010551670793152329,\n", + " 'population': 1007},\n", + " 'colorfulness': 0,\n", + " 'copyright_notice': None,\n", + " 'credit_line': 'Mr. and Mrs. Martin A. Ryerson Collection',\n", + " 'date_display': '1890–91',\n", + " 'date_end': 1891,\n", + " 'date_qualifier_id': None,\n", + " 'date_qualifier_title': '',\n", + " 'date_start': 1890,\n", + " 'department_id': 'PC-13',\n", + " 'department_title': 'Prints and Drawings',\n", + " 'description': None,\n", + " 'dimensions': 'Image/plate: 32.1 × 24.7 cm (12 11/16 × 9 3/4 in.); Sheet: 43.6 × 30 cm (17 3/16 × 11 13/16 in.)',\n", + " 'dimensions_detail': [{'depth': None,\n", + " 'width': 24,\n", + " 'height': 32,\n", + " 'diameter': None,\n", + " 'clarification': 'Image/plate'},\n", + " {'depth': None,\n", + " 'width': 30,\n", + " 'height': 43,\n", + " 'diameter': None,\n", + " 'clarification': 'Sheet'}],\n", + " 'document_ids': [],\n", + " 'edition': None,\n", + " 'exhibition_history': None,\n", + " 'fiscal_year': 1933,\n", + " 'fiscal_year_deaccession': None,\n", + " 'gallery_id': None,\n", + " 'gallery_title': None,\n", + " 'has_advanced_imaging': False,\n", + " 'has_educational_resources': False,\n", + " 'has_multimedia_resources': False,\n", + " 'has_not_been_viewed_much': False,\n", + " 'image_id': '907a7782-97d6-9cde-a8cb-1b9bea785ea1',\n", + " 'inscriptions': None,\n", + " 'internal_department_id': 3,\n", + " 'is_boosted': False,\n", + " 'is_on_view': False,\n", + " 'is_public_domain': True,\n", + " 'is_zoomable': True,\n", + " 'latitude': None,\n", + " 'latlon': None,\n", + " 'longitude': None,\n", + 
" 'main_reference_number': '1932.1287',\n", + " 'material_id': 'TM-2982',\n", + " 'material_ids': ['TM-2982'],\n", + " 'material_titles': ['paper (fiber product)'],\n", + " 'max_zoom_window_size': -1,\n", + " 'medium_display': 'Color drypoint, aquatint and softground etching from two plates, printed à la poupée, on ivory laid paper',\n", + " 'nomisma_id': None,\n", + " 'on_loan_display': '

On loan to Philadelphia Museum of Art in Philadelphia for Mary Cassatt at Work

',\n", + " 'place_of_origin': 'United States',\n", + " 'provenance_text': None,\n", + " 'publication_history': None,\n", + " 'publishing_verification_level': 'Web Basic',\n", + " 'section_ids': [],\n", + " 'section_titles': [],\n", + " 'short_description': None,\n", + " 'site_ids': [],\n", + " 'sound_ids': [],\n", + " 'source_updated_at': '2024-06-22T00:30:09-05:00',\n", + " 'style_id': None,\n", + " 'style_ids': [],\n", + " 'style_title': None,\n", + " 'style_titles': [],\n", + " 'subject_id': 'TM-12350',\n", + " 'subject_ids': ['TM-12350', 'TM-10118'],\n", + " 'subject_titles': ['mothers', 'children'],\n", + " 'suggest_autocomplete_all': [{'input': ['1932.1287'],\n", + " 'contexts': {'groupings': ['accession']}},\n", + " {'input': ['The Bath'],\n", + " 'weight': 1365,\n", + " 'contexts': {'groupings': ['title']}}],\n", + " 'technique_id': None,\n", + " 'technique_ids': [],\n", + " 'technique_titles': [],\n", + " 'term_titles': ['drypoint',\n", + " 'mothers',\n", + " 'paper (fiber product)',\n", + " 'print',\n", + " 'prints and drawing',\n", + " 'children'],\n", + " 'text_ids': [],\n", + " 'theme_titles': ['Women artists'],\n", + " 'thumbnail': {'lqip': 'data:image/gif;base64,R0lGODlhBAAFAPQAAKCgc3yPl4yanb6yhqysmL+1qcy6h8e8j8e6lcq/oMm/stLBi9DGstPJuNLIutTKudXLutbMvNfPv9rQwQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACH5BAAAAAAALAAAAAAEAAUAAAURYFMoUIQMjQAYSUAsRzQxUAgAOw==',\n", + " 'width': 8537,\n", + " 'height': 11179,\n", + " 'alt_text': 'A work made of color drypoint, aquatint and softground etching from two plates, printed à la poupée, on ivory laid paper.'},\n", + " 'timestamp': '2024-06-22T14:00:12-05:00',\n", + " 'title': 'The Bath',\n", + " 'updated_at': '2024-06-22T00:31:03-05:00',\n", + " 'video_ids': []}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "art.find_one()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f707fdfb-5423-4f0d-abcd-3f300ff575a0", + 
"metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0187b362-84ef-4b16-be55-3fa7aa96f1b7", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "521134e2-4515-4da9-81ae-5ed0069ac956", + "metadata": {}, + "source": [ + "# Test collection" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d4f8a262-c69a-4483-ae4a-095f545341cb", + "metadata": {}, + "outputs": [], + "source": [ + "db = client.test_db" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "79d99b6e-e0a0-4cd0-a5d1-d5af0e25b6c7", + "metadata": {}, + "outputs": [], + "source": [ + "collection = db.test_collection" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "3ffe4b7a-d07f-4df0-866e-fd34385093ce", + "metadata": {}, + "outputs": [], + "source": [ + "post = {\"author\": \"me\"}" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "ea994465-f38a-4f3e-959c-7a821dfc9af7", + "metadata": {}, + "outputs": [], + "source": [ + "post_id = collection.insert_one(post).inserted_id" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "53c41546-9a12-4969-b36c-96d61d797735", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ObjectId('667321e16c65440db2aa4344')" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "post_id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0765b802-1764-4327-b070-62c2cc76592e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "abdbadee-fa1f-4f68-8a23-4d0570aed4bc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "collection.count_documents({})" + ] + }, + { + "cell_type": "code", + "execution_count": 
19, + "id": "8e9ebf70-33a5-4fc3-93d1-7fb3fd8d1609", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'_id': ObjectId('667321e16c65440db2aa4344'), 'author': 'me'}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "collection.find_one()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "85f40427-f63b-466b-8f0f-7bf01e0559c0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'_id': ObjectId('667321e16c65440db2aa4344'), 'author': 'me'}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "collection.find_one({\"_id\": ObjectId(post_id)})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7fe0141-33ed-4ffd-8dd7-12b65679797d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}