diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c35e0ee5..2a2727b7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -70,13 +70,9 @@ jobs:
           echo "sqlalchemy.url : postgresql://postgres:postgres@localhost:5432/desc_data_registry" > $HOME/.config_reg_access

       # Create schemas
-      - name: Create data registry production schema
+      - name: Create data registry schemas
         run: |
-          python scripts/create_registry_schema.py --config $HOME/.config_reg_access --schema production
-
-      - name: Create data registry default schema
-        run: |
-          python scripts/create_registry_schema.py --config $HOME/.config_reg_access
+          python scripts/create_registry_schema.py --config $HOME/.config_reg_access --create_both

       # Run CI tests
       - name: Run CI tests
@@ -152,13 +148,9 @@ jobs:
           echo "sqlalchemy.url : postgresql://postgres:postgres@localhost:5432/desc_data_registry" > $DATAREG_CONFIG

       # Create schemas
-      - name: Create data registry production schema
+      - name: Create data registry schemas
         run: |
-          python scripts/create_registry_schema.py --config $DATAREG_CONFIG --schema production
-
-      - name: Create data registry default schema
-        run: |
-          python scripts/create_registry_schema.py --config $DATAREG_CONFIG
+          python scripts/create_registry_schema.py --config $DATAREG_CONFIG --create_both

       # Run CI tests
       - name: Run CI tests
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b81ced66..19412155 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,14 @@
+## Version 1.0.0 (Release)
+
+- Update default NERSC site to
+  `/global/cfs/cdirs/lsst/utilities/desc-data-registry`
+- Update default schema names (now stored in
+  `src/dataregistry/schema/default_schema_names.yaml`)
+- There is now a `reg_admin` account, which is the only account permitted to
+  create the initial schemas. The schema creation script has been updated to
+  grant the correct `reg_writer` and `reg_reader` privileges.
+- Remove `version_suffix`
+
 ## Version 0.6.4

 - Update `dregs ls` to be a bit cleaner. Also has `dregs ls --extended` option
diff --git a/docs/source/dev_notes_database.rst b/docs/source/dev_notes_database.rst
new file mode 100644
index 00000000..4725b828
--- /dev/null
+++ b/docs/source/dev_notes_database.rst
@@ -0,0 +1,56 @@
+Database structure
+==================
+
+The database schemas
+--------------------
+
+There are two primary database schemas which the majority of users will work with:
+
+- The "default" schema, whose name is set by the hard-coded variable
+  ``DEFAULT_SCHEMA_WORKING`` in the ``src/dataregistry/db_basic.py`` file. It
+  can be imported by ``from dataregistry.db_basic import DEFAULT_SCHEMA_WORKING``
+- The production schema. This is where production datasets go, and general
+  users have read-only access to it. By default this schema is named
+  "production", however during schema creation (see below) you can specify the
+  name of the production schema (though this should only be changed for testing
+  purposes).
+
+Users can specify their own schemas during the initialization of the
+``DataRegistry`` object (by default ``DEFAULT_SCHEMA_WORKING`` is connected to).
+If they wish to connect to the production schema, its name will have to be
+entered manually (see the production schema tutorial). To connect to a custom
+schema the user will also have to enter its name manually, and that schema must
+already have been created for the connection to work.
+
+When using *SQLite* as the backend (useful for testing), the concept of
+schemas does not exist.
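+
+For reference, a minimal sketch of connecting to each schema from Python
+(using only the ``DataRegistry`` arguments described above; the production
+schema name is assumed to be the default "production"):
+
+.. code-block:: python
+
+   from dataregistry import DataRegistry
+   from dataregistry.db_basic import DEFAULT_SCHEMA_WORKING
+
+   # Connect to the default working schema (equivalent to DataRegistry())
+   datareg = DataRegistry(schema=DEFAULT_SCHEMA_WORKING)
+
+   # Connect to the production schema by naming it explicitly
+   datareg_prod = DataRegistry(schema="production")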
+
+First time creation of database schemas
+---------------------------------------
+
+In the top level ``scripts`` directory there is a ``create_registry_schema.py``
+script to do the initial schema creation. Before using the data registry, both
+for *Postgres* and *SQLite* backends, this script must have been run.
+
+First, make sure your ``~/.config_reg_access`` and ``~/.pgpass`` files are
+correctly set up (see "Getting set up" for more information on these
+configuration files). When creating schemas at NERSC, make sure the SPIN
+instance of the *Postgres* database is running.
+
+The script must be run twice, first for the production schema, then for the
+working schema (or run in a single call using the ``--create_both`` argument).
+There are four arguments that can be specified (all optional):
+
+- ``--config`` : Location of the data registry configuration file
+  (``~/.config_reg_access`` by default)
+- ``--schema`` : The name of the schema (default is ``DEFAULT_SCHEMA_WORKING``)
+- ``--production-schema`` : The name of the production schema (default
+  "production")
+- ``--create_both`` : Create both the production schema and working schema in
+  one call (the production schema will be made first, then the working schema)
+
+The typical initialization would be:
+
+.. code-block:: bash
+
+   python3 create_registry_schema.py --create_both
diff --git a/docs/source/dev_notes_spin.rst b/docs/source/dev_notes_spin.rst
new file mode 100644
index 00000000..a98add69
--- /dev/null
+++ b/docs/source/dev_notes_spin.rst
@@ -0,0 +1,4 @@
+SPIN
+====
+
+Details on setting up the SPIN instance...
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 00aed7c2..99a10d90 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -52,6 +52,14 @@ them.
    reference_cli
    reference_schema

+.. toctree::
+   :maxdepth: 2
+   :caption: Developer notes:
+   :hidden:
+
+   dev_notes_spin
+   dev_notes_database
+
 .. toctree::
    :maxdepth: 2
    :caption: Contact:
diff --git a/docs/source/reference_python.rst b/docs/source/reference_python.rst
index 582ce422..72d1886c 100644
--- a/docs/source/reference_python.rst
+++ b/docs/source/reference_python.rst
@@ -23,6 +23,18 @@ It connects the user to the database, and serves as a wrapper to both the

 .. automethod:: dataregistry.registrar.dataset.DatasetTable.register

+.. automethod:: dataregistry.registrar.dataset.DatasetTable.replace
+
+.. automethod:: dataregistry.registrar.dataset.DatasetTable.modify
+
+.. automethod:: dataregistry.registrar.dataset.DatasetTable.delete
+
+.. automethod:: dataregistry.registrar.dataset.DatasetTable.add_keywords
+
+.. automethod:: dataregistry.registrar.dataset.DatasetTable.get_modifiable_columns
+
+.. automethod:: dataregistry.registrar.dataset.DatasetTable.get_keywords
+
 .. automethod:: dataregistry.registrar.execution.ExecutionTable.register

 .. automethod:: dataregistry.registrar.dataset_alias.DatasetAliasTable.register
diff --git a/docs/source/tutorial_cli.rst b/docs/source/tutorial_cli.rst
index f355305a..c9f34ed4 100644
--- a/docs/source/tutorial_cli.rst
+++ b/docs/source/tutorial_cli.rst
@@ -38,8 +38,8 @@ Typing
 will list all the metadata properties that can be associated with a dataset
 during registration.
As when registering datasets using the ``dataregistry`` -package, the ``relative_path`` and ``version`` string properties are mandatory, -which will always be the first two parameters passed to the ``dregs register +package, the dataset ``name`` and ``version`` properties are mandatory, which +will always be the first two parameters passed to the ``dregs register dataset`` command respectively. For example, say I have produced some data from my latest DESC publication that @@ -59,11 +59,9 @@ would run the CLI as follows: --description "Data from my_paper_dataset" This will recursively copy the ``/some/place/at/nersc/my_paper_dataset/`` -directory into the data registry shared space under the relative path -``my_paper_dataset``. As we did not specify a ``--name`` for the dataset, the -``name`` column in the database will automatically be assigned as -``my_paper_dataset`` (and all other properties we did not specify will keep -their default values). +directory into the data registry shared space with the +``name='my_paper_dataset'`` (other non-specified properties will keep their +default values). Updating a dataset ------------------ @@ -76,26 +74,18 @@ initial registration, we need to create a new version of the dataset. .. code-block:: bash dregs register dataset \ - my_paper_dataset_updated \ + my_paper_dataset \ patch \ --old-location /some/place/at/nersc/my_paper_dataset_updated/ \ --owner_type project \ --owner "DESC Generic Working Group" \ --description "Data from my_paper_dataset describing bugfix" \ - --name my_paper_dataset - -Here we associate it with the previous dataset through ``--name -my_paper_dataset``, and tell the data registry to automatically bump the patch -version to ``1.0.1`` by specifying "patch" as the version string (you could -however have entered "1.0.1" here if you prefer). - -.. note:: - Remember, if the dataset is non-overwritable, the relative paths in the data - registry need to be unique, which is why we could not have the relative path - of the second entry match the first. But for datasets only the ``name`` - plus ``version`` has to be unique, which is how we could associate them with - the same ``name`` column. +Here we associate it with the previous dataset through ``name= +my_paper_dataset`` (and making sure we keep the same `owner` and `owner_type`), +and tell the data registry to automatically bump the patch version to ``1.0.1`` +by specifying "patch" as the version string (you could however have entered +"1.0.1" here if you prefer). Querying the data registry -------------------------- diff --git a/docs/source/tutorial_notebooks/datasets_deeper_look.ipynb b/docs/source/tutorial_notebooks/datasets_deeper_look.ipynb index e9344c4f..3190a577 100644 --- a/docs/source/tutorial_notebooks/datasets_deeper_look.ipynb +++ b/docs/source/tutorial_notebooks/datasets_deeper_look.ipynb @@ -39,8 +39,20 @@ }, "outputs": [], "source": [ + "# Come up with a random owner name to avoid clashes\n", + "from random import randint\n", + "OWNER = \"tutorial_\" + str(randint(0,int(1e6)))\n", + "\n", "import dataregistry\n", - "print(\"Working with dataregistry version:\", dataregistry.__version__)" + "print(f\"Working with dataregistry version: {dataregistry.__version__} as random owner {OWNER}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4c2f92bf-9048-421e-b896-292eb00542c8", + "metadata": {}, + "source": [ + "**Note** that running some of the cells below may fail, especially if run multiple times. 
This will likely be from clashes with the unique constraints within the database (hopefully the error output is informative). In these events either; (1) run the cell above to establish a new database connection with a new random user, or (2) manually change the conflicting database column(s) that are clashing during registration." ] }, { @@ -55,13 +67,15 @@ "cell_type": "code", "execution_count": null, "id": "72eabcd0-b05e-4e87-9ed1-6450ac196b05", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from dataregistry import DataRegistry\n", "\n", - "# Establish connection to database (using defaults)\n", - "datareg = DataRegistry()" + "# Establish connection to the tutorial schema\n", + "datareg = DataRegistry(schema=\"tutorial_working\", owner=OWNER)" ] }, { @@ -78,7 +92,9 @@ "cell_type": "code", "execution_count": null, "id": "560b857c-7d94-44ad-9637-0b107cd42259", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "print(datareg.Registrar.dataset.get_keywords())" @@ -98,7 +114,9 @@ "cell_type": "code", "execution_count": null, "id": "44581049-1d15-44f0-b1ed-34cff6cdb45a", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Add new dataset entry with keywords.\n", @@ -132,7 +150,9 @@ "cell_type": "code", "execution_count": null, "id": "09478b87-7d5a-4814-85c7-49f90e0db45d", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# List of keywords to add to dataset\n", @@ -160,22 +180,24 @@ "\n", "The files and directories of registered datasets are stored under a path relative to the root directory (`root_dir`), which, by default, is a shared space at NERSC.\n", "\n", - "By default, the relative_path is constructed from the `name`, `version` and `version_suffix` (if there is one), in the format `relative_path=/_`. However, one can also manually select the relative_path during registration, for example" + "By default, the `relative_path` is constructed from the `name`, `version` and `version_suffix` (if there is one), in the format `relative_path=/_`. 
However, one can also manually select the relative_path during registration, for example" ] }, { "cell_type": "code", "execution_count": null, "id": "5bc0d5b6-f50a-4646-bc1b-7d9e829e91bc", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Add new entry with a manual relative path.\n", "datareg.Registrar.dataset.register(\n", " \"nersc_tutorial:my_desc_dataset_with_relative_path\",\n", " \"1.0.0\",\n", - " relative_path=\"nersc_tutorial/my_desc_dataset\",\n", - " location_type=\"dummy\", # for testing, means we need no data\n", + " relative_path=f\"NERSC_tutorial/{OWNER}/my_desc_dataset\",\n", + " location_type=\"dummy\", # for testing, means we need no actual data to exist\n", ")" ] }, @@ -216,19 +238,21 @@ "cell_type": "code", "execution_count": null, "id": "718d1cd8-4517-4597-9e36-e403e219cef2", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "from dataregistry.dataset_util import get_dataset_status\n", + "from dataregistry.registrar.dataset_util import get_dataset_status\n", "\n", "# The `get_dataset_status` function takes in a dataset `status` and a bit index, and returns if that bit is True or False\n", "dataset_status = 1\n", "\n", "# Is dataset valid?\n", - "print(f\"Dataset is valid: {get_dataset_status(dataset_status, \"valid\"}\")\n", + "print(f\"Dataset is valid: {get_dataset_status(dataset_status, 'valid')}\")\n", "\n", "# Is dataset replaced?\n", - "print(f\"Dataset is replaced: {get_dataset_status(dataset_status, \"replaced\"}\")" + "print(f\"Dataset is replaced: {get_dataset_status(dataset_status, 'replaced')}\")" ] }, { @@ -257,9 +281,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "DREGS-env", "language": "python", - "name": "python3" + "name": "venv" }, "language_info": { "codemirror_mode": { @@ -271,7 +295,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/docs/source/tutorial_notebooks/production_schema.ipynb b/docs/source/tutorial_notebooks/production_schema.ipynb index 245ebd6b..d3eb125e 100644 --- a/docs/source/tutorial_notebooks/production_schema.ipynb +++ b/docs/source/tutorial_notebooks/production_schema.ipynb @@ -25,7 +25,6 @@ "\n", "- Connect to the production schema and register a new dataset (admin only)\n", "- Query the production schema\n", - "- Transfer a dataset from another schema to the production schema (admin only)\n", "\n", "### Before we begin\n", "\n", @@ -43,8 +42,20 @@ }, "outputs": [], "source": [ + "# Come up with a random owner name to avoid clashes\n", + "from random import randint\n", + "OWNER = \"tutorial_\" + str(randint(0,int(1e6)))\n", + "\n", "import dataregistry\n", - "print(\"Working with dataregistry version:\", dataregistry.__version__)" + "print(f\"Working with dataregistry version: {dataregistry.__version__} as random owner {OWNER}\")" + ] + }, + { + "cell_type": "markdown", + "id": "782179b4-4349-4199-b3a3-38d4845188a9", + "metadata": {}, + "source": [ + "**Note** that running some of the cells below may fail, especially if run multiple times. This will likely be from clashes with the unique constraints within the database (hopefully the error output is informative). In these events either; (1) run the cell above to establish a new database connection with a new random user, or (2) manually change the conflicting database column(s) that are clashing during registration." 
] }, { @@ -71,7 +82,7 @@ "from dataregistry import DataRegistry\n", "\n", "# Establish connection to the production schema\n", - "datareg = DataRegistry(schema=\"production\", owner=\"DESC CO group\", owner_type=\"production\")" + "datareg = DataRegistry(schema=\"tutorial_production\", owner=\"production\", owner_type=\"production\")" ] }, { @@ -79,9 +90,7 @@ "id": "6f7423fb-32d0-4a33-8e87-cd75e952512f", "metadata": {}, "source": [ - "Here we have connected to the data registry production schema (`schema=\"production\"`). Notice we have assigned a universal owner (`owner=\"DESC CO group\"`) and owner type (`owner_type=\"production\"`) to save some time when registering the datasets during this instance.\n", - "\n", - "Note for the production schema no value other than `production` will be allowed for `owner_type` (the inverse is also true for any schema other than production)." + "Here we have connected to the data registry tutorial production schema (`schema=\"tutorial_production\"`). We have assigned the universal `owner` and `owner_type` to be \"production\", which is the only values allowed for the production schema." ] }, { @@ -93,17 +102,12 @@ }, "outputs": [], "source": [ - "# Production datasets can't be overwritten, so for the purposes of this tutorial, let's generate a random unique name\n", - "import numpy as np\n", - "tag = np.rrandom.andint(0, 100000)\n", - "\n", "# Add new entry.\n", "dataset_id, execution_id = datareg.Registrar.dataset.register(\n", - " f\"nersc_production_tutorial/my_desc_production_dataset_{tag}\",\n", + " f\"nersc_production_tutorial:my_desc_production_dataset_{OWNER}\",\n", " \"1.0.0\",\n", " description=\"An production output from some DESC code\",\n", - " old_location=\"dummy_production_dataset.txt\",\n", - " is_dummy=True\n", + " location_type=\"dummy\"\n", ")\n", "\n", "print(f\"Created dataset {dataset_id}, associated with execution {execution_id}\")" @@ -120,7 +124,7 @@ "\n", "To recap about production datasets:\n", "- Only administrators have write access to the production schema and shared space\n", - "- All datasets in the production schema have `owner_type=\"production\"`\n", + "- All datasets in the production schema have `owner=\"production\"` and `owner_type=\"production\"`\n", "- Production datasets can never be overwritten, even if `is_overwritable=True`" ] }, @@ -147,8 +151,8 @@ }, "outputs": [], "source": [ - "# Create a filter that queries on the dataset name\n", - "f = datareg.Query.gen_filter('dataset.name', '==', 'my_desc_production_dataset')\n", + "# Create a filter that queries on the owner\n", + "f = datareg.Query.gen_filter('dataset.owner', '==', 'production')\n", "\n", "# Query the database\n", "results = datareg.Query.find_datasets(['dataset.dataset_id', 'dataset.name', 'dataset.owner',\n", @@ -166,31 +170,13 @@ "source": [ "Note that when using the command line interface to query datasets, e.g., `dregs ls`, both the default schema you are connected to and the production schema are both searched." 
] - }, - { - "cell_type": "markdown", - "id": "db6a2ac8-80ad-4038-a722-de9de8fbe433", - "metadata": {}, - "source": [ - "## Transferring datasets to the production schema\n", - "\n", - "TBD" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb87beb4-937c-498c-b1f2-de32cab29b17", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "DREGS-env", "language": "python", - "name": "python3" + "name": "venv" }, "language_info": { "codemirror_mode": { @@ -202,7 +188,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/docs/source/tutorial_notebooks/query_datasets.ipynb b/docs/source/tutorial_notebooks/query_datasets.ipynb index 4c9558dd..cd832aff 100644 --- a/docs/source/tutorial_notebooks/query_datasets.ipynb +++ b/docs/source/tutorial_notebooks/query_datasets.ipynb @@ -9,7 +9,7 @@ " \n", "\n", "\n", - "# Getting started: Part 2 - Simple queries\n", + "# Getting started: Part 3 - Simple queries\n", "\n", "Here we continue our getting started tutorial, introducing queries.\n", "\n", @@ -19,7 +19,7 @@ "\n", "1) Perform a simple query with a single filter\n", "2) Perform a simple query with multiple filters\n", - "3) Query for all datasets with a keyword\n", + "3) Query for all datasets tagged with a given keyword\n", "\n", "### Before we begin\n", "\n", @@ -37,8 +37,20 @@ }, "outputs": [], "source": [ + "# Come up with a random owner name to avoid clashes\n", + "from random import randint\n", + "OWNER = \"tutorial_\" + str(randint(0,int(1e6)))\n", + "\n", "import dataregistry\n", - "print(\"Working with dataregistry version:\", dataregistry.__version__)" + "print(f\"Working with dataregistry version: {dataregistry.__version__} as random owner {OWNER}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2140d287-56de-4e94-a12b-959e13e28a9c", + "metadata": {}, + "source": [ + "**Note** that running some of the cells below may fail, especially if run multiple times. This will likely be from clashes with the unique constraints within the database (hopefully the error output is informative). In these events either; (1) run the cell above to establish a new database connection with a new random user, or (2) manually change the conflicting database column(s) that are clashing during registration." ] }, { @@ -64,8 +76,8 @@ "source": [ "from dataregistry import DataRegistry\n", "\n", - "# Establish connection to database (using defaults)\n", - "datareg = DataRegistry()" + "# Establish connection to the tutorial schema\n", + "datareg = DataRegistry(schema=\"tutorial_working\", owner=OWNER)" ] }, { @@ -77,7 +89,7 @@ "\n", "Queries are constructed from one or more boolean logic \"filters\", which translate to SQL `WHERE` clauses in the code. 
\n", "\n", - "For example, to create a filter that will query for all datasets in registry with the name \"my_desc_dataset\" would be as follows:" + "For example, to create a filter that will query for all datasets in registry with the name \"nersc_tutorial:my_first_desc_dataset\" would be as follows:" ] }, { @@ -90,7 +102,7 @@ "outputs": [], "source": [ "# Create a filter that queries on the dataset name\n", - "f = datareg.Query.gen_filter('dataset.name', '==', 'my_desc_dataset')" + "f = datareg.Query.gen_filter('dataset.name', '==', 'nersc_tutorial:my_first_desc_dataset')" ] }, { @@ -172,7 +184,9 @@ "cell_type": "code", "execution_count": null, "id": "54a52029-2908-4056-bc68-4a87f6c3e6df", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "print(datareg.Query.get_all_columns())" @@ -187,7 +201,7 @@ "\n", "We are not limited to using a single filter during queries.\n", "\n", - "Now let's say we want to return all datasets in the registry with a particular `owner`, that were registered after a certain date. We also want the results in a Pandas dataframe format.\n", + "Now let's say we want to return all datasets in the registry with a particular `owner_type`, that were registered after a certain date. We also want the results in a Pandas dataframe format.\n", "\n", "To do this we construct two filter objects, i.e.:" ] @@ -196,11 +210,13 @@ "cell_type": "code", "execution_count": null, "id": "8eec33d8-2139-473f-ab27-3a04ebd5e7f1", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Create a filter that queries on the owner\n", - "f = datareg.Query.gen_filter('dataset.owner', '==', 'DESC')\n", + "f = datareg.Query.gen_filter('dataset.owner_type', '==', 'group')\n", "\n", "# Create a 2nd filter that queries on the entry date\n", "f2 = datareg.Query.gen_filter('dataset.creation_date', '>', '01-01-2024')" @@ -218,12 +234,14 @@ "cell_type": "code", "execution_count": null, "id": "d21e982a-5b86-4f75-8b54-7923dec11e04", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Query the database\n", "results = datareg.Query.find_datasets(['dataset.dataset_id', 'dataset.name', 'dataset.owner',\n", - " 'dataset.relative_path', 'dataset.creation_date'],\n", + " 'dataset.relative_path', 'dataset.creation_date', 'dataset.owner_type'],\n", " [f,f2],\n", " return_format=\"dataframe\")" ] @@ -240,7 +258,9 @@ "cell_type": "code", "execution_count": null, "id": "908aa870-c0a4-4e59-a11c-97185e4a3db1", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "print(results)" @@ -262,7 +282,9 @@ "cell_type": "code", "execution_count": null, "id": "22310484-e8e5-41c3-8a52-c0f0bc3773ff", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Create a filter on a given keyword\n", @@ -272,15 +294,25 @@ "results = datareg.Query.find_datasets(['dataset.dataset_id', 'dataset.name', 'dataset.owner',\n", " 'dataset.relative_path', 'dataset.creation_date'],\n", " [f],\n", - " return_format=\"dataframe\")" + " return_format=\"dataframe\")\n", + "\n", + "print(results)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdbe8537-6195-4239-bbb8-976daacdfab7", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "DREGS-env", "language": "python", - "name": "python3" + "name": "venv" }, "language_info": { "codemirror_mode": { @@ -292,7 +324,7 @@ "name": "python", "nbconvert_exporter": 
"python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/docs/source/tutorial_notebooks/register_datasets.ipynb b/docs/source/tutorial_notebooks/register_datasets.ipynb index fca85a2c..b0a627bc 100644 --- a/docs/source/tutorial_notebooks/register_datasets.ipynb +++ b/docs/source/tutorial_notebooks/register_datasets.ipynb @@ -25,7 +25,7 @@ "4) Replace a dataset\n", "5) Modify a previously registered dataset with updated metadata\n", "6) Delete a dataset\n", - "7) Registering external datasets\n", + "7) Register an \"external\" dataset\n", "8) Recap\n", "\n", "### Before we begin\n", @@ -44,8 +44,20 @@ }, "outputs": [], "source": [ + "# Come up with a random owner name to avoid clashes\n", + "from random import randint\n", + "OWNER = \"tutorial_\" + str(randint(0,int(1e6)))\n", + "\n", "import dataregistry\n", - "print(\"Working with dataregistry version:\", dataregistry.__version__)" + "print(f\"Working with dataregistry version: {dataregistry.__version__} as random owner {OWNER}\")" + ] + }, + { + "cell_type": "markdown", + "id": "d53aab85-bedf-47f6-a804-34a29e72631f", + "metadata": {}, + "source": [ + "**Note** that running some of the cells below may fail, especially if run multiple times. This will likely be from clashes with the unique constraints within the database (hopefully the error output is informative). In these events either; (1) run the cell above to establish a new database connection with a new random user, or (2) manually change the conflicting database column(s) that are clashing during registration." ] }, { @@ -72,7 +84,7 @@ "from dataregistry import DataRegistry\n", "\n", "# Establish connection to database (using defaults)\n", - "datareg = DataRegistry()" + "#datareg = DataRegistry() # This is commented out as we will make our connection to the tutorial schema below" ] }, { @@ -82,8 +94,8 @@ "source": [ "With no arguments, the `DataRegistry` class will automatically attempt to:\n", "- establish a connection to the registry database using the information in your `~/.config_reg_access` and `~/.pgpass` files\n", - "- connect to the default database schema\n", - "- use the default NERSC \"`site`\" for the `root_dir`\n", + "- connect to the default \"working\" database schema\n", + "- use the default NERSC `site` for the `root_dir`\n", "\n", "The root directory (`root_dir`) is the base path under which all ingested data will be copied. Other than for testing, this should generally be the NERSC `site` address." ] @@ -140,6 +152,26 @@ "# datareg = DataRegistry(owner=\"desc\", owner_type=\"group\")" ] }, + { + "cell_type": "markdown", + "id": "d22523db-ae92-4474-bb96-5ad98a404d61", + "metadata": {}, + "source": [ + "For these tutorials, there is a stand alone working (`tutorial_working`) and production (`tutorial_production`) tutorial schema that we will connect to as to not interupt the default DESC schemas with random entries. If you are practicing using the `dataregistry` outwith these tutorial notebooks, feel free to also use the tutorial schemas for your entries. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f994c54-1dec-4bcc-b773-3092f0ba40aa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "datareg = DataRegistry(schema=\"tutorial_working\", owner=OWNER)" + ] + }, { "cell_type": "markdown", "id": "2d723a37-4101-496c-b385-0a2644aa7ad8", @@ -154,7 +186,9 @@ "cell_type": "code", "execution_count": null, "id": "6797be3b-434f-4245-a276-a32d9294d1ca", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Make some temporary text files that we can practice ingesting into the dataregistry with\n", @@ -192,7 +226,6 @@ " \"nersc_tutorial:my_first_desc_dataset\", # `name`\n", " \"1.0.0\", # `version`\n", " description=\"An output from some DESC code\",\n", - " owner=\"DESC\",\n", " owner_type=\"group\",\n", " is_overwritable=True,\n", " old_location=temp_files[0].name\n", @@ -241,9 +274,9 @@ "\n", "Registering a dataset does two things; it creates an entry in the DESC data registry database with the appropriate metadata, and it (optionally) copies the dataset contents to the `root_dir`. \n", "\n", - "If the data are already at the correct relative path within the `root_dir`, leave `old_location=None` and set the `relative_path` option to point to the location of the data within the `root_dir` (see special cases section for more information on the `relative_path`). However it's likely for most users the data will need to be copied from another location to the `root_dir`. That initial location may be specified using the `old_location` parameter as we have done in the example above. \n", + "If the data are already at the correct relative path within the `root_dir`, leave `old_location=None` and set the `relative_path` option to point to the location of the data within the `root_dir` (see the next tutorial, \"A deeper look\", for more information on the `relative_path`). However it's likely for most users the data will need to be copied from another location to the `root_dir`. That initial location may be specified using the `old_location` parameter as we have done in the example above. \n", "\n", - "In our example we have created a dummy text file as our dataset and ingested it into the data registry, however this can be any file or directory (directories will be recursively copied).\n", + "In our example we have ingested one of the dummy text files into the data registry, however this can be any file or directory (directories will be recursively copied).\n", "\n", "Note that the dataregistry does not support registering datasets through symbolic links (symlinks).\n", "\n", @@ -261,14 +294,16 @@ "\n", "If you have a dataset that has been previously registered within the data registry, and that dataset has updates, it is simple to register the updated version.\n", "\n", - "Register the new dataset using the same process as before, making sure to keep the same dataset `name`, but updating the dataset version. One can update the version in two ways: manually entering a new version string, or having the dataregistry automatically \"bump\" the dataset version by selecing either \"major\", \"minor\" or \"patch\" for the version string. For example, let's register an updated version of our dataset, bumping the minor tag (i.e., bumping 1.0.0 -> 1.1.0)." + "Register the new dataset using the same process as before, making sure to keep the same dataset `name`, but updating the dataset `version` (and/or `version_suffix`). 
One can update the version in two ways: (1) manually entering a new version string, or (2) having the dataregistry automatically \"bump\" the dataset version by selecing either \"major\", \"minor\" or \"patch\" for the version string. For example, let's register an updated version of our dataset, bumping the minor tag (i.e., bumping 1.0.0 -> 1.1.0)." ] }, { "cell_type": "code", "execution_count": null, "id": "2a65d3c0-41c1-4720-85be-10d68cef84f9", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Add new entry for an updated dataset with an updated version.\n", @@ -276,7 +311,6 @@ " \"nersc_tutorial:my_first_desc_dataset\",\n", " \"minor\", # Automatically bumps to \"1.1.0\"\n", " description=\"An output from some DESC code (updated)\",\n", - " owner=\"DESC\",\n", " owner_type=\"group\",\n", " is_overwritable=True,\n", " old_location=temp_files[1].name,\n", @@ -317,7 +351,9 @@ "cell_type": "code", "execution_count": null, "id": "8aa09c52-8283-4f91-bb62-490e65acbb4d", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Add new entry, overwriting the data in the `root_dir`.\n", @@ -325,7 +361,6 @@ " \"nersc_tutorial:my_first_desc_dataset\",\n", " \"1.0.0\", \n", " description=\"An output from some DESC code (further updated)\",\n", - " owner=\"DESC\",\n", " owner_type=\"group\",\n", " is_overwritable=True,\n", " old_location=temp_files[2].name,\n", @@ -343,7 +378,7 @@ "id": "19afb001-d82f-448e-b40d-e4249e534286", "metadata": {}, "source": [ - "Only `valid` datasets with `is_overwritable=True` set or `invalid` datasets can be replaced. Deleted datasets, or archived datasets, cannot be replaced (see next tutorial for information about a datasets `status`)." + "Only `valid` datasets with `is_overwritable=True` set, or `invalid` datasets can be replaced (invalid datasets are those that failed to register, most likely due to a copying error or interuption). Deleted datasets, or archived datasets, cannot be replaced (see next tutorial for information about a datasets `status`)." ] }, { @@ -362,7 +397,9 @@ "cell_type": "code", "execution_count": null, "id": "4252ce7e-7dea-4404-ae61-19b8ca0be2fd", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# What columns in the dataset table are modifiable?\n", @@ -383,7 +420,9 @@ "cell_type": "code", "execution_count": null, "id": "5f20548e-148b-44af-a22a-b1e259d5e994", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# A key-value dict of the columns we want to update, with their new values\n", @@ -399,14 +438,16 @@ "source": [ "## 6) Deleting a dataset in the dataregistry\n", "\n", - "To delete a dataset entry from the dataregistry we call the .delete() function which accepts one argument, the dataset_id of the entry you wish to delete, e.g.," + "To delete a dataset entry from the dataregistry we call the .delete() function which accepts one argument, the `dataset_id` of the entry you wish to delete, e.g.," ] }, { "cell_type": "code", "execution_count": null, "id": "becbc567-e7a0-479d-8a1d-4a315d781bd3", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Delete dataset with entry ID == dataset_id\n", @@ -418,7 +459,7 @@ "id": "594c3b1a-8491-4f6e-b47b-97de00cccd9b", "metadata": {}, "source": [ - "Note that this will remove the dataset data stored under the root_dir, however the entry within the registry database will remain (with an updated status indicated the dataset was deleted)." 
+ "Note that this will remove the files and/or directories associated with the dataset under the `root_dir`, however the entry within the registry database will remain (with an updated `status` bit indicating the dataset was deleted)." ] }, { @@ -426,7 +467,7 @@ "id": "8f6909cf-9826-4533-8b91-bb7a63079b37", "metadata": {}, "source": [ - "## 7) Registering external datasets\n", + "## 7) Registering \"external\" datasets\n", "\n", "Typically when we register datasets we are asking the `dataregistry` to collate provenance data for the dataset and to physically manage the data (either copy the data to the central `root_dir` or verify that it already exists there).\n", "\n", @@ -441,7 +482,9 @@ "cell_type": "code", "execution_count": null, "id": "e959f76f-b5e1-43a6-9932-ee1866cc398d", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Add new external dataset entry.\n", @@ -486,9 +529,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "DREGS-env", "language": "python", - "name": "python3" + "name": "venv" }, "language_info": { "codemirror_mode": { @@ -500,7 +543,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/docs/source/tutorial_python.rst b/docs/source/tutorial_python.rst index 81fadc17..5711451c 100644 --- a/docs/source/tutorial_python.rst +++ b/docs/source/tutorial_python.rst @@ -13,14 +13,16 @@ material by viewing the notebooks on GitHub. 1) Getting started with the data registry ----------------------------------------- -**Getting started: Part 1 - Registering datasets**: `Link to notebook `__ +**Getting started: Part 1 - Registering datasets**: `Link to notebook `__ -**Getting started: Part 2 - Simple queries**: `Link to notebook `__ +**Getting started: Part 2 - A deeper dive into datasets**: `Link to notebook `__ -**Getting started: Part 3 - Pipeline datasets**: `Link to notebook `__ +**Getting started: Part 3 - Simple queries**: `Link to notebook `__ -2) The production schema ------------------------- +**Getting started: Part 4 - Pipeline datasets**: `Link to notebook `__ + +2) Extra material +----------------- **The prduction schema**: `Link to notebook `__ diff --git a/scripts/create_registry_schema.py b/scripts/create_registry_schema.py index 9eebf1e1..1440e357 100644 --- a/scripts/create_registry_schema.py +++ b/scripts/create_registry_schema.py @@ -11,13 +11,20 @@ ) from sqlalchemy import ForeignKey, UniqueConstraint, text from sqlalchemy.orm import DeclarativeBase -from dataregistry.db_basic import DbConnection, SCHEMA_VERSION +from dataregistry.db_basic import DbConnection from dataregistry.db_basic import _insert_provenance, _insert_keyword -from dataregistry.schema import load_schema, load_preset_keywords +from dataregistry.schema import ( + load_schema, + load_preset_keywords, + DEFAULT_SCHEMA_WORKING, + DEFAULT_SCHEMA_PRODUCTION, +) """ A script to create a schema. +At NERSC, this script should be run with the `reg_admin` account. 
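+
+A typical first-time invocation, creating both the production and working
+schemas in a single call (this mirrors what the CI workflow does, assuming the
+standard config file location), would be:
+
+    python scripts/create_registry_schema.py --config ~/.config_reg_access --create_both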
+ The schema contains the following six tables: - "dataset" : Primary table, contains information on the datasets - "dataset_alias" : Table to associate "alias" names to datasets @@ -65,14 +72,15 @@ def _get_column_definitions(schema, table): for column in schema_data[table]["column_definitions"].keys(): # Special case where column has a foreign key if schema_data[table]["column_definitions"][column]["foreign_key"]: - fk_schema = schema - if ( - schema_data[table]["column_definitions"][column]["foreign_key_schema"] - != "self" - ): - fk_schema = schema_data[table]["column_definitions"][column][ - "foreign_key_schema" - ] + fk_schema = schema_data[table]["column_definitions"][column][ + "foreign_key_schema" + ] + if fk_schema == "self": + fk_schema = schema + elif fk_schema == "production": + fk_schema = prod_schema + else: + raise ValueError(f"{fk_schema} is a bad FK schema") return_dict[column] = Column( column, @@ -193,7 +201,11 @@ def _get_ForeignKey_str(schema, table, column): def _FixDependencyColumns(columns, has_production, production): """ - Special case for dependencies table where some column names need to be tweeked. + Special case for dependencies table where some column names need to be + tweeked. + + This is because you can link to the production schema from the working + schema for dependencies. Columns dict is modified in place. @@ -212,13 +224,14 @@ def _FixDependencyColumns(columns, has_production, production): # Update production schema name else: - if production != "production": + if production != DEFAULT_SCHEMA_PRODUCTION: old_col = columns["input_production_id"] fkey = ForeignKey(f"{production}.dataset.dataset_id") new_input_production_id = Column(old_col.name, old_col.type, fkey) del columns["input_production_id"] columns["input_production_id"] = new_input_production_id + def _BuildTable(schema, table_name, has_production, production): """ Builds a generic schema table from the information in the `schema.yaml` file. @@ -252,35 +265,10 @@ def _BuildTable(schema, table_name, has_production, production): Model = type(class_name, (Base,), {**columns, **meta}) return Model -def _Keyword(schema): - """Stores the list of keywords.""" - - class_name = f"{schema}_keyword" - - # Load columns from `schema.yaml` file - columns = _get_column_definitions(schema, "keyword") - - # Table metadata - meta = {"__tablename__": "keyword", "__table_args__": (UniqueConstraint( - "keyword", name="keyword_u_keyword" - ), {"schema": schema},)} - - Model = type(class_name, (Base,), {**columns, **meta}) - return Model - -def _DatasetKeyword(schema): - """Many-Many link between datasets and keywords.""" - - class_name = f"{schema}_dataset_keyword" - # Load columns from `schema.yaml` file - columns = _get_column_definitions(schema, "dataset_keyword") - - # Table metadata - meta = {"__tablename__": "dataset_keyword", "__table_args__": {"schema": schema}} - - Model = type(class_name, (Base,), {**columns, **meta}) - return Model +# ---------------- +# Database version +# ---------------- # The following should be adjusted whenever there is a change to the structure # of the database tables. 
@@ -289,127 +277,152 @@ def _DatasetKeyword(schema): _DB_VERSION_PATCH = 0 _DB_VERSION_COMMENT = "Remove `is_overwritten`, `replace_date` and `replace_uid` columns, the information is in `status`" +# ---------------------------- # Parse command line arguments +# ---------------------------- + parser = argparse.ArgumentParser( - description=""" -Creates dataregistry tables for specified schema and connection information (config)""", + description="Creates dataregistry tables for specified schema and connection information (config)", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( "--schema", help="name of schema to contain tables. Will be created if it doesn't already exist", - default=f"{SCHEMA_VERSION}", + default=f"{DEFAULT_SCHEMA_WORKING}", ) parser.add_argument( "--production-schema", - default="production", + default=f"{DEFAULT_SCHEMA_PRODUCTION}", help="name of schema containing production tables.", ) parser.add_argument("--config", help="Path to the data registry config file") +parser.add_argument( + "--create_both", + help="Create both the production and working schema", + action="store_true", +) +parser.add_argument( + "--no_permission_restrictions", + help="Both `reg_reader` and `reg_writer` get read/write access to all tables. For the tutorial schemas.", + action="store_true", +) args = parser.parse_args() -schema = args.schema -prod_schema = args.production_schema -# Connect to database to find out what the backend is -db_connection = DbConnection(args.config, schema) -if db_connection.dialect == "sqlite": - if schema == prod_schema: - raise ValueError("Production not available for sqlite databases") - # In fact we don't use schemas at all for sqlite - schema = None - prod_schema = None +# ------------------ +# Create the schemas +# ------------------ + +# What schemas are we creating? +if args.create_both: + schema_list = [args.production_schema, args.schema] else: - if schema != prod_schema: - # production schema, tables must already exists and schema - # must be backwards-compatible with prod_schem. That is, major - # versions must match and minor version of prod_schema cannot - # be greater than minor version of schema - stmt = f"select db_version_major, db_version_minor from {prod_schema}.provenance order by provenance_id desc limit 1" - try: - with db_connection.engine.connect() as conn: - result = conn.execute(text(stmt)) - result = pd.DataFrame(result) - except Exception: - raise RuntimeError("production schema does not exist or is ill-formed") - if ( - result["db_version_major"][0] - != _DB_VERSION_MAJOR | int(result["db_version_minor"][0]) - > _DB_VERSION_MINOR - ): - raise RuntimeError("production schema version incompatible") - -if schema: - stmt = f"CREATE SCHEMA IF NOT EXISTS {schema}" - with db_connection.engine.connect() as conn: - conn.execute(text(stmt)) - conn.commit() - - # Grant reg_reader access - try: + schema_list = [args.schema] +prod_schema = args.production_schema + +# Load the preset keywords +keywords = load_preset_keywords() + +# Loop over each schema +for schema in schema_list: + # Connect to database to find out what the backend is + db_connection = DbConnection(args.config, schema) + print(f"Database dialect is '{db_connection.dialect}'") + + if db_connection.dialect == "sqlite": + print(f"Creating sqlite database...") + schema = None + elif schema == prod_schema: + print(f"Creating production schema {prod_schema}...") + else: + print( + f"Creating schema '{schema}', linking to production schema '{prod_schema}'..." 
+ ) + + # Make sure the linked production schema exists / is allowed + if db_connection.dialect == "sqlite": + if schema == prod_schema: + raise ValueError("Production not available for sqlite databases") + else: + if schema != prod_schema: + # production schema, tables must already exists and schema + # must be backwards-compatible with prod_schem. That is, major + # versions must match and minor version of prod_schema cannot + # be greater than minor version of schema + stmt = f"select db_version_major, db_version_minor from {prod_schema}.provenance order by provenance_id desc limit 1" + try: + with db_connection.engine.connect() as conn: + result = conn.execute(text(stmt)) + result = pd.DataFrame(result) + except Exception: + raise RuntimeError("production schema does not exist or is ill-formed") + if ( + result["db_version_major"][0] + != _DB_VERSION_MAJOR | int(result["db_version_minor"][0]) + > _DB_VERSION_MINOR + ): + raise RuntimeError("production schema version incompatible") + + # Create the schema + if db_connection.dialect != "sqlite": + stmt = f"CREATE SCHEMA IF NOT EXISTS {schema}" with db_connection.engine.connect() as conn: - # Grant reg_reader access. - acct = "reg_reader" - usage_prv = f"GRANT USAGE ON SCHEMA {schema} to {acct}" - select_prv = f"GRANT SELECT ON ALL TABLES IN SCHEMA {schema} to {acct}" - conn.execute(text(usage_prv)) - conn.execute(text(select_prv)) - - if schema == prod_schema: # also grant privileges to reg_writer - acct = "reg_writer" - usage_priv = f"GRANT USAGE ON SCHEMA {schema} to {acct}" - select_priv = f"GRANT SELECT ON ALL TABLES IN SCHEMA {schema} to {acct}" - conn.execute(text(usage_priv)) - conn.execute(text(select_priv)) + conn.execute(text(stmt)) conn.commit() - except Exception as e: - print(f"Could not grant access to {acct} on schema {schema}") - -# Create the tables -for table_name in schema_data.keys(): - _BuildTable(schema, table_name, db_connection.dialect != "sqlite", prod_schema) - -# Generate the database -if schema: - if schema != prod_schema: - Base.metadata.reflect(db_connection.engine, prod_schema) -Base.metadata.create_all(db_connection.engine) - -# Grant access to other accounts. Can only grant access to objects -# after they've been created -try: - with db_connection.engine.connect() as conn: - # Grant reg_reader access. 
- acct = "reg_reader" - usage_prv = f"GRANT USAGE ON SCHEMA {schema} to {acct}" - select_prv = f"GRANT SELECT ON ALL TABLES IN SCHEMA {schema} to {acct}" - conn.execute(text(usage_prv)) - conn.execute(text(select_prv)) - - if schema == prod_schema: # also grant privileges to reg_writer - acct = "reg_writer" - usage_priv = f"GRANT USAGE ON SCHEMA {schema} to {acct}" - select_priv = f"GRANT SELECT ON ALL TABLES IN SCHEMA {schema} to {acct}" - conn.execute(text(usage_priv)) - conn.execute(text(select_priv)) - conn.commit() -except Exception: - print(f"Could not grant access to {acct} on schema {schema}") - - -# Add initial provenance information -db = DbConnection(args.config, schema) -prov_id = _insert_provenance( - db, - _DB_VERSION_MAJOR, - _DB_VERSION_MINOR, - _DB_VERSION_PATCH, - "CREATE", - comment=_DB_VERSION_COMMENT, - associated_production=prod_schema, -) -# Populate the preset system keywords for datasets -keywords = load_preset_keywords() -for att in keywords["dataset"]: - _insert_keyword(db, att, True) + # Create the tables + for table_name in schema_data.keys(): + _BuildTable(schema, table_name, db_connection.dialect != "sqlite", prod_schema) + if db_connection.dialect != "sqlite": + print(f"Built table {table_name} in {schema}") + else: + print(f"Built table {table_name}") + + # Generate the database + if db_connection.dialect != "sqlite": + if schema != prod_schema: + Base.metadata.reflect(db_connection.engine, prod_schema) + Base.metadata.create_all(db_connection.engine) + + # Grant access to `reg_writer` and `reg_reader` accounts + if db_connection.dialect != "sqlite": + for acct in ["reg_reader", "reg_writer"]: + try: + with db_connection.engine.connect() as conn: + usage_prv = f"GRANT USAGE ON SCHEMA {schema} to {acct}" + if (acct == "reg_reader" or schema == prod_schema) and ( + not args.no_permission_restrictions + ): + privs = "SELECT" + else: + privs = f"SELECT, INSERT, UPDATE" + select_prv = ( + f"GRANT {privs} ON ALL TABLES IN SCHEMA {schema} to {acct}" + ) + conn.execute(text(usage_prv)) + conn.execute(text(select_prv)) + + # Need select access to sequences to create entries + if ( + acct == "reg_writer" and schema != prod_schema + ) or args.no_permission_restrictions: + privs = f"GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA {schema} TO {acct};" + conn.execute(text(privs)) + conn.commit() + except Exception: + print(f"Could not grant access to {acct} on schema {schema}") + + # Add initial provenance information + prov_id = _insert_provenance( + db_connection, + _DB_VERSION_MAJOR, + _DB_VERSION_MINOR, + _DB_VERSION_PATCH, + "CREATE", + comment=_DB_VERSION_COMMENT, + associated_production=prod_schema, + ) + + # Populate the preset system keywords for datasets + for att in keywords["dataset"]: + _insert_keyword(db_connection, att, True) diff --git a/scripts/create_schema_dirs.sh b/scripts/create_schema_dirs.sh new file mode 100644 index 00000000..bd5fc6e0 --- /dev/null +++ b/scripts/create_schema_dirs.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Script to create initial schema directories within the root_dir +# Must be run under the `descdr` group account + +# Check if the correct number of arguments is provided +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi + +# Assign inputs to variables +BASE_DIR="$1" +FOLDER_NAME="$2" +TARGET_DIR="${BASE_DIR}/${FOLDER_NAME}" + +# Check if the base directory exists +if [ ! -d "$BASE_DIR" ]; then + echo "Error: Base directory '$BASE_DIR' does not exist." 
+ exit 1 +fi + +# Create the main folder in the base directory +mkdir -p "$TARGET_DIR" + +# Check if the main folder was created successfully +if [ $? -ne 0 ]; then + echo "Error: Could not create directory '$TARGET_DIR'." + exit 1 +fi + +# Create subdirectories: user, group, project, production +mkdir -p "$TARGET_DIR/user" "$TARGET_DIR/group" "$TARGET_DIR/project" "$TARGET_DIR/production" + +# Check if subdirectories were created successfully +if [ $? -ne 0 ]; then + echo "Error: Could not create subdirectories." + exit 1 +fi + +# Set the owning group to have read and execute (r-x) permissions on the main folder +chmod g=rx "$TARGET_DIR" + +# Set read and execute ACL for the lsst group on the main folder +#setfacl -m g:lsst:rx "$TARGET_DIR" + +# Check if the ACL was set successfully +if [ $? -eq 0 ]; then + echo "Folder '$TARGET_DIR' with subdirectories created and ACL set for 'lsst' group." +else + echo "Error: Failed to set ACL for the 'lsst' group." + exit 1 +fi diff --git a/src/dataregistry/_version.py b/src/dataregistry/_version.py index 364e7bae..5becc17c 100644 --- a/src/dataregistry/_version.py +++ b/src/dataregistry/_version.py @@ -1 +1 @@ -__version__ = "0.6.4" +__version__ = "1.0.0" diff --git a/src/dataregistry/db_basic.py b/src/dataregistry/db_basic.py index 57ede031..97dfe95b 100644 --- a/src/dataregistry/db_basic.py +++ b/src/dataregistry/db_basic.py @@ -7,18 +7,16 @@ from datetime import datetime from dataregistry import __version__ from dataregistry.exceptions import DataRegistryException +from dataregistry.schema import DEFAULT_SCHEMA_WORKING """ Low-level utility routines and classes for accessing the registry """ -SCHEMA_VERSION = "registry_beta" - __all__ = [ "DbConnection", "add_table_row", "TableMetadata", - "SCHEMA_VERSION", ] @@ -133,7 +131,7 @@ def __init__(self, config_file=None, schema=None, verbose=False): self._schema = None else: if schema is None: - self._schema = SCHEMA_VERSION + self._schema = DEFAULT_SCHEMA_WORKING else: self._schema = schema @@ -179,9 +177,12 @@ def __init__(self, db_connection, get_db_version=True): prov_name = ".".join([self._schema, "provenance"]) if prov_name not in self._metadata.tables: - raise DataRegistryException("Incompatible database: no Provenance table") + raise DataRegistryException( + f"Incompatible database: no Provenance table {prov_name}, " + f"listed tables are {self._metadata.tables}" + ) - if prov_name in self._metadata.tables and get_db_version: + if get_db_version: prov_table = self._metadata.tables[prov_name] stmt = select(column("associated_production")).select_from(prov_table) stmt = stmt.order_by(prov_table.c.provenance_id.desc()) @@ -205,6 +206,14 @@ def __init__(self, db_connection, get_db_version=True): self._db_major = None self._db_minor = None self._db_patch = None + self._prod_schema = None + + @property + def is_production_schema(self): + if self._prod_schema == self._schema: + return True + else: + return False @property def db_version_major(self): @@ -264,19 +273,10 @@ def _insert_provenance( from git import InvalidGitRepositoryError version_fields = __version__.split(".") - patch = version_fields[2] - suffix = None - if "-" in patch: - subfields = patch.split("-") - patch = subfields[0] - suffix = "-".join(subfields[1:]) - values = dict() values["code_version_major"] = version_fields[0] values["code_version_minor"] = version_fields[1] - values["code_version_patch"] = patch - if suffix: - values["code_version_suffix"] = suffix + values["code_version_patch"] = version_fields[2] 
values["db_version_major"] = db_version_major values["db_version_minor"] = db_version_minor values["db_version_patch"] = db_version_patch diff --git a/src/dataregistry/registrar/base_table_class.py b/src/dataregistry/registrar/base_table_class.py index 01a2e01e..f47111a5 100644 --- a/src/dataregistry/registrar/base_table_class.py +++ b/src/dataregistry/registrar/base_table_class.py @@ -56,9 +56,10 @@ def __init__(self, db_connection, root_dir, owner, owner_type): # Database engine and dialect. self._engine = db_connection.engine self._schema = db_connection.schema + self._dialect = db_connection._dialect # Link to Table Metadata. - self._metadata_getter = TableMetadata(db_connection) + self._table_metadata = TableMetadata(db_connection) # Store user id self._uid = os.getenv("USER") @@ -77,7 +78,7 @@ def __init__(self, db_connection, root_dir, owner, owner_type): self.schema_yaml = load_schema() def _get_table_metadata(self, tbl): - return self._metadata_getter.get(tbl) + return self._table_metadata.get(tbl) def delete(self, entry_id): """ diff --git a/src/dataregistry/registrar/dataset.py b/src/dataregistry/registrar/dataset.py index 64ae4b24..85034977 100644 --- a/src/dataregistry/registrar/dataset.py +++ b/src/dataregistry/registrar/dataset.py @@ -105,9 +105,9 @@ def _validate_register_inputs( if kwargs_dict["owner_type"] == "production": if kwargs_dict["is_overwritable"]: raise ValueError("Cannot overwrite production entries") - if kwargs_dict["version_suffix"] is not None: - raise ValueError("Production entries can't have version suffix") - if self._schema != "production" and not kwargs_dict["test_production"]: + if (not self._table_metadata.is_production_schema) and ( + not kwargs_dict["test_production"] + ): raise ValueError( "Only the production schema can handle owner_type='production'" ) @@ -116,10 +116,11 @@ def _validate_register_inputs( if kwargs_dict["owner"] != "production": raise ValueError("`owner` for production datasets must be 'production'") else: - if self._schema == "production" or kwargs_dict["test_production"]: - raise ValueError( - "Only owner_type='production' can go in the production schema" - ) + if self._dialect != "sqlite" and not kwargs_dict["test_production"]: + if self._table_metadata.is_production_schema: + raise ValueError( + "Only owner_type='production' can go in the production schema" + ) # Validate the keywords (make sure they are registered) if len(kwargs_dict["keywords"]) > 0: @@ -213,10 +214,6 @@ def _register_row(self, name, version, kwargs_dict): kwargs_dict[ "execution_name" ] = f"for_dataset_{name}-{kwargs_dict['version_string']}" - if kwargs_dict["version_suffix"]: - kwargs_dict[ - "execution_name" - ] = f"{kwargs_dict['execution_name']}-{kwargs_dict['version_suffix']}" if kwargs_dict["execution_description"] is None: kwargs_dict[ "execution_description" @@ -311,7 +308,6 @@ def register( self, name, version, - version_suffix=None, creation_date=None, description=None, execution_id=None, @@ -354,7 +350,6 @@ def register( ---------- name** : str version** : str - version_suffix** : str, optional creation_date** : datetime, optional description** : str, optional execution_id** : int, optional @@ -414,13 +409,12 @@ def register( # If `relative_path` not passed, automatically generate it if kwargs_dict["relative_path"] is None: kwargs_dict["relative_path"] = _relpath_from_name( - name, kwargs_dict["version_string"], kwargs_dict["version_suffix"] + name, kwargs_dict["version_string"] ) # Make sure the relative_path in the `root_dir` is avaliable if 
kwargs_dict["location_type"] in ["dataregistry", "dummy"]: previous_datasets = self._find_previous( - None, None, None, kwargs_dict["owner"], @@ -458,7 +452,6 @@ def register( previous_datasets = self._find_previous( name, kwargs_dict["version_string"], - kwargs_dict["version_suffix"], kwargs_dict["owner"], kwargs_dict["owner_type"], ) @@ -466,7 +459,7 @@ def register( if len(previous_datasets) > 0: raise ValueError( "There is already a dataset with combination name," - "version_string, version_suffix, owner, owner_type" + "version_string, owner, owner_type" ) # Register the new row in the dataset table @@ -479,7 +472,6 @@ def replace( self, name, version, - version_suffix=None, creation_date=None, description=None, execution_id=None, @@ -509,7 +501,7 @@ def replace( Replace a dataset in the registry. This is so a user can keep the same - name/version/version_suffix/ower/owner_type combination as a previous + name/version/ower/owner_type combination as a previous dataset. Note the original dataset must have `is_overwritable=True` to allow the replace to work. @@ -546,15 +538,11 @@ def replace( previous_datasets = self._find_previous( name, kwargs_dict["version_string"], - kwargs_dict["version_suffix"], kwargs_dict["owner"], kwargs_dict["owner_type"], ) - full_name = ( - f"name: {name} v: {kwargs_dict['version_string']} " - f"v-suff: {kwargs_dict['version_suffix']}" - ) + full_name = f"name: {name} v: {kwargs_dict['version_string']}" if len(previous_datasets) == 0: raise ValueError(f"Dataset {full_name} does not exist") @@ -693,12 +681,6 @@ def _handle_data(self, relative_path, old_location, owner, owner_type, verbose): # Copy data into data registry if old_location: - # Stop if we don't have write permission to the root_dir - if not self.root_dir_write_access: - raise Exception( - f"Cannot copy data, no write access to {self._root_dir}" - ) - if verbose: tic = time.time() print( @@ -715,14 +697,13 @@ def _find_previous( self, name, version_string, - version_suffix, owner, owner_type, relative_path=None, ): """ Find all dataset entries with the same `name`, `version`, - `version_suffix`, `owner` and `owner_type`. + `owner` and `owner_type`. If `relative_path` is not None, instead search for all dataset entries with the same `owner`, `owner_type` and `relative_path` combination. @@ -732,7 +713,7 @@ def _find_previous( Parameters ---------- - name/version/version_suffix/owner/owner_type : str + name/version/owner/owner_type : str Returns ------- @@ -765,7 +746,6 @@ def _find_previous( stmt = stmt.where( dataset_table.c.name == name, dataset_table.c.version_string == version_string, - dataset_table.c.version_suffix == version_suffix, dataset_table.c.owner == owner, dataset_table.c.owner_type == owner_type, ) diff --git a/src/dataregistry/registrar/registrar_util.py b/src/dataregistry/registrar/registrar_util.py index fe138583..1478b3fd 100644 --- a/src/dataregistry/registrar/registrar_util.py +++ b/src/dataregistry/registrar/registrar_util.py @@ -19,7 +19,7 @@ _nonneg_int_re = "0|[1-9][0-9]*" -def _parse_version_string(version, with_suffix=False): +def _parse_version_string(version): """ Parase a version string into its components. 
@@ -27,33 +27,23 @@ def _parse_version_string(version, with_suffix=False): ---------- version : str Version string - with_suffix : bool - False means version string *must not* include suffix - True means it *may* have a suffix Returns ------- d : dict - Dict with keys "major", "minor", "patch" and optionally "suffix" + Dict with keys "major", "minor", "patch" """ cmp = version.split(VERSION_SEPARATOR) - if not with_suffix: - if len(cmp) != 3: - raise ValueError("Version string must have 3 components") - else: - if len(cmp) < 3 or len(cmp) > 4: - raise ValueError("Version string must have 3 or 4 components") - for c in cmp[0:3]: + if len(cmp) != 3: + raise ValueError("Version string must have 3 components") + for c in cmp: if not re.fullmatch(_nonneg_int_re, c): raise ValueError(f"Version component {c} is not non-negative int") d = {"major": cmp[0]} d["minor"] = cmp[1] d["patch"] = cmp[2] - if len(cmp) > 3: - d["suffix"] = cmp[3] - return d @@ -152,7 +142,7 @@ def _bump_version(name, v_string, dataset_table, engine): # Find the previous dataset based on the name and version stmt = select( - dataset_table.c["version_major", "version_minor", "version_patch", "version_suffix"] + dataset_table.c["version_major", "version_minor", "version_patch"] ).where(dataset_table.c.name == name) stmt = ( stmt.order_by(dataset_table.c.version_major.desc()) @@ -168,14 +158,6 @@ def _bump_version(name, v_string, dataset_table, engine): old_minor = 0 old_patch = 0 else: - # We don't bump datasets with a version suffix - if r.version_suffix is not None: - raise ValueError( - "Cannot bump dataset automatically as it " - f"has a version suffix ({r.version_suffix}). " - "Select the version/suffix manually instead." - ) - old_major = int(r.version_major) old_minor = int(r.version_minor) old_patch = int(r.version_patch) @@ -345,7 +327,7 @@ def _compute_checksum(file_path): raise Exception(e) -def _relpath_from_name(name, version, version_suffix): +def _relpath_from_name(name, version): """ Construct a relative path from the name and version of a dataset. We use this when the `relative_path` is not explicitly defined. 
@@ -356,17 +338,11 @@ def _relpath_from_name(name, version, version_suffix): Dataset name version : str Dataset version - version_suffix : str - Dataset version suffix + Returns ------- relative_path : str Automatically generated `relative_path` """ - if version_suffix is not None: - relative_path = f"{name}_{version}_{version_suffix}" - else: - relative_path = f"{name}_{version}" - - return relative_path + return f"{name}_{version}" diff --git a/src/dataregistry/schema/__init__.py b/src/dataregistry/schema/__init__.py index 9669f7f6..ae6fcb33 100644 --- a/src/dataregistry/schema/__init__.py +++ b/src/dataregistry/schema/__init__.py @@ -1 +1 @@ -from .load_schema import load_schema, load_preset_keywords +from .load_schema import load_schema, load_preset_keywords, DEFAULT_SCHEMA_WORKING, DEFAULT_SCHEMA_PRODUCTION diff --git a/src/dataregistry/schema/default_schema_names.yaml b/src/dataregistry/schema/default_schema_names.yaml new file mode 100644 index 00000000..27ca7312 --- /dev/null +++ b/src/dataregistry/schema/default_schema_names.yaml @@ -0,0 +1,2 @@ +working: lsst_desc_working +production: lsst_desc_production diff --git a/src/dataregistry/schema/load_schema.py b/src/dataregistry/schema/load_schema.py index ddd438e0..0ebf9c87 100644 --- a/src/dataregistry/schema/load_schema.py +++ b/src/dataregistry/schema/load_schema.py @@ -29,7 +29,7 @@ def _populate_defaults(mydict): for att in atts.keys(): if att not in mydict[table]["column_definitions"][row].keys(): if att not in atts.keys(): - raise ValueError(f"The {att} attribute has no default value") + raise ValueError(f"The {att} attribute has no default value") mydict[table]["column_definitions"][row][att] = atts[att] @@ -48,6 +48,7 @@ def load_schema(): return yaml_data + def load_preset_keywords(): """Load the system preset keywords from the yaml file""" @@ -59,3 +60,27 @@ def load_preset_keywords(): yaml_data = yaml.safe_load(file) return yaml_data + + +def get_default_schema_working(): + yaml_file_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "default_schema_names.yaml" + ) + with open(yaml_file_path, "r") as file: + yaml_data = yaml.safe_load(file) + + return yaml_data["working"] + + +def get_default_schema_production(): + yaml_file_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "default_schema_names.yaml" + ) + with open(yaml_file_path, "r") as file: + yaml_data = yaml.safe_load(file) + + return yaml_data["production"] + + +DEFAULT_SCHEMA_WORKING = get_default_schema_working() +DEFAULT_SCHEMA_PRODUCTION = get_default_schema_production() diff --git a/src/dataregistry/schema/schema.yaml b/src/dataregistry/schema/schema.yaml index b151f74d..c0f44f30 100644 --- a/src/dataregistry/schema/schema.yaml +++ b/src/dataregistry/schema/schema.yaml @@ -167,9 +167,6 @@ tables: type: "Integer" description: "Patch version of code when this schema was created" nullable: False - code_version_suffix: - type: "String" - description: "Version suffix of code when this schema was created" creator_uid: type: "StringShort" description: "UID of person who registered the entry" @@ -330,7 +327,7 @@ tables: unique_constraints: dataset_unique: - unique_list: ["name", "version_string", "version_suffix", "owner", "owner_type", "replace_iteration"] + unique_list: ["name", "version_string", "owner", "owner_type", "replace_iteration"] column_definitions: dataset_id: @@ -339,11 +336,11 @@ tables: description: "Unique identifier for this dataset" name: type: "String" - description: "Any convenient, evocative name for the human. 
Note the combination of name, version and version_suffix must be unique." + description: "Any convenient, evocative name for the human. Note the combination of name and version must be unique." nullable: False relative_path: type: "String" - description: "Relative path storing the data, relative to ``. If None, generated from the `name`, `version_string` and `version_suffix`" + description: "Relative path storing the data, relative to ``. If None, generated from the `name` and `version_string`" nullable: False cli_optional: True version_major: @@ -358,10 +355,6 @@ tables: type: "Integer" description: "Patch version in semantic string (i.e., x.x.X)" nullable: False - version_suffix: - type: "String" - description: "Optional version suffix to place at the end of the version string. Cannot be used for production datasets." - cli_optional: True version_string: type: "String" description: "Version string" diff --git a/src/dataregistry/site_config/site_rootdir.yaml b/src/dataregistry/site_config/site_rootdir.yaml index 22bd05d9..dd90d50d 100644 --- a/src/dataregistry/site_config/site_rootdir.yaml +++ b/src/dataregistry/site_config/site_rootdir.yaml @@ -1 +1 @@ -nersc: /global/cfs/cdirs/desc-co/registry-beta +nersc: /global/cfs/cdirs/lsst/utilities/data-registry diff --git a/src/dataregistry_cli/cli.py b/src/dataregistry_cli/cli.py index 9425d93a..be82b15c 100644 --- a/src/dataregistry_cli/cli.py +++ b/src/dataregistry_cli/cli.py @@ -1,7 +1,7 @@ import os import sys import argparse -from dataregistry.db_basic import SCHEMA_VERSION +from dataregistry.schema import DEFAULT_SCHEMA_WORKING, DEFAULT_SCHEMA_PRODUCTION from .register import register_dataset from .delete import delete_dataset from .query import dregs_ls @@ -30,8 +30,13 @@ def _add_generic_arguments(parser_obj): ) parser_obj.add_argument( "--schema", - default=f"{SCHEMA_VERSION}", - help="Which schema to connect to", + default=f"{DEFAULT_SCHEMA_WORKING}", + help="Which working schema to connect to", + ) + parser_obj.add_argument( + "--prod_schema", + default=f"{DEFAULT_SCHEMA_PRODUCTION}", + help="Which production schema to connect to", ) @@ -218,7 +223,7 @@ def get_parser(): "name", help=( "Any convenient, evocative name for the human. Note the " - "combination of name, version and version_suffix must be unique." + "combination of name and version must be unique." 
), type=str, ) diff --git a/src/dataregistry_cli/query.py b/src/dataregistry_cli/query.py index a8c3dbdc..bafd755c 100644 --- a/src/dataregistry_cli/query.py +++ b/src/dataregistry_cli/query.py @@ -49,10 +49,10 @@ def dregs_ls(args): ) # Establish connection to the production schema - if datareg.db_connection.schema != "production": + if datareg.db_connection.schema != args.prod_schema: datareg_prod = DataRegistry( config_file=args.config_file, - schema="production", + schema=args.prod_schema, root_dir=args.root_dir, site=args.site, ) diff --git a/src/dataregistry_cli/register.py b/src/dataregistry_cli/register.py index cc659deb..dbbc7b35 100644 --- a/src/dataregistry_cli/register.py +++ b/src/dataregistry_cli/register.py @@ -40,7 +40,6 @@ def register_dataset(args): new_id = datareg.Registrar.dataset.register( args.name, args.version, - version_suffix=args.version_suffix, creation_date=args.creation_date, access_api=args.access_api, execution_id=args.execution_id, diff --git a/src/dataregistry_cli/show.py b/src/dataregistry_cli/show.py index 99f42bb8..5bb8e157 100644 --- a/src/dataregistry_cli/show.py +++ b/src/dataregistry_cli/show.py @@ -37,10 +37,10 @@ def dregs_show(show_what, args): ) # Establish connection to the production schema - if datareg.db_connection.schema != "production": + if datareg.db_connection.schema != args.prod_schema: datareg_prod = DataRegistry( config_file=args.config_file, - schema="production", + schema=args.prod_schema, root_dir=args.root_dir, site=args.site, ) diff --git a/tests/end_to_end_tests/database_test_utils.py b/tests/end_to_end_tests/database_test_utils.py index 03e89a7e..12ca31ad 100644 --- a/tests/end_to_end_tests/database_test_utils.py +++ b/tests/end_to_end_tests/database_test_utils.py @@ -1,7 +1,7 @@ import os import pytest -from dataregistry.db_basic import SCHEMA_VERSION +from dataregistry.schema import DEFAULT_SCHEMA_WORKING __all__ = [ "dummy_file", @@ -77,7 +77,7 @@ def dummy_file(tmp_path): tmp_root_dir = tmp_path / "root_dir" # Make some dummy data already on location - for THIS_SCHEMA in [SCHEMA_VERSION + "/", ""]: + for THIS_SCHEMA in [DEFAULT_SCHEMA_WORKING + "/", ""]: for f in ["dummy_dir", "dummy_dir_2"]: p = tmp_root_dir / f"{THIS_SCHEMA}user/{os.getenv('USER')}/{f}" p.mkdir(parents=True) @@ -175,7 +175,6 @@ def _insert_dataset_entry( owner=None, description=None, execution_id=None, - version_suffix=None, old_location=None, is_overwritable=False, which_datareg=None, @@ -210,7 +209,6 @@ def _insert_dataset_entry( dataset_id, execution_id = datareg.Registrar.dataset.register( name, version, - version_suffix=version_suffix, creation_date=None, description=description, old_location=old_location, @@ -247,7 +245,6 @@ def _replace_dataset_entry( owner=None, description=None, execution_id=None, - version_suffix=None, old_location=None, is_overwritable=False, which_datareg=None, @@ -282,7 +279,6 @@ def _replace_dataset_entry( dataset_id, execution_id = datareg.Registrar.dataset.replace( name, version, - version_suffix=version_suffix, creation_date=None, description=description, old_location=old_location, diff --git a/tests/end_to_end_tests/test_cli.py b/tests/end_to_end_tests/test_cli.py index f2eba090..e5d862b0 100644 --- a/tests/end_to_end_tests/test_cli.py +++ b/tests/end_to_end_tests/test_cli.py @@ -3,7 +3,7 @@ import dataregistry_cli.cli as cli import pytest from dataregistry import DataRegistry -from dataregistry.db_basic import SCHEMA_VERSION +from dataregistry.schema import DEFAULT_SCHEMA_WORKING, DEFAULT_SCHEMA_PRODUCTION from 
database_test_utils import dummy_file from dataregistry.registrar.dataset_util import get_dataset_status, set_dataset_status @@ -17,16 +17,16 @@ def test_simple_query(dummy_file): # Register a dataset cmd = "register dataset my_cli_dataset 0.0.1 --location_type dummy" - cmd += f" --schema {SCHEMA_VERSION} --root_dir {str(tmp_root_dir)}" + cmd += f" --schema {DEFAULT_SCHEMA_WORKING} --root_dir {str(tmp_root_dir)}" cli.main(shlex.split(cmd)) # Update the registered dataset cmd = "register dataset my_cli_dataset patch --location_type dummy" - cmd += f" --schema {SCHEMA_VERSION} --root_dir {str(tmp_root_dir)}" + cmd += f" --schema {DEFAULT_SCHEMA_WORKING} --root_dir {str(tmp_root_dir)}" cli.main(shlex.split(cmd)) # Check - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) f = datareg.Query.gen_filter("dataset.name", "==", "my_cli_dataset") results = datareg.Query.find_datasets( ["dataset.name", "dataset.version_string"], [f] @@ -44,14 +44,14 @@ def test_dataset_entry_with_execution(dummy_file): cmd = "register dataset my_cli_dataset3 1.2.3 --location_type dummy" cmd += " --description 'This is my dataset description'" cmd += " --access_api 'Awesome API' --owner DESC --owner_type group" - cmd += " --version_suffix test --creation_date '2020-01-01'" + cmd += " --creation_date '2020-01-01'" cmd += " --input_datasets 1 2 --execution_name 'I have given the execution a name'" cmd += " --is_overwritable" - cmd += f" --schema {SCHEMA_VERSION} --root_dir {str(tmp_root_dir)}" + cmd += f" --schema {DEFAULT_SCHEMA_WORKING} --root_dir {str(tmp_root_dir)}" cli.main(shlex.split(cmd)) # Check - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) f = datareg.Query.gen_filter("dataset.name", "==", "my_cli_dataset3") results = datareg.Query.find_datasets( [ @@ -74,13 +74,13 @@ def test_production_entry(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema="production") + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_PRODUCTION) if datareg.Query._dialect != "sqlite": # Register a dataset cmd = "register dataset my_production_cli_dataset 0.1.2 --location_type dummy" cmd += " --owner_type production --owner production" - cmd += f" --schema production --root_dir {str(tmp_root_dir)}" + cmd += f" --schema {DEFAULT_SCHEMA_PRODUCTION} --root_dir {str(tmp_root_dir)}" cli.main(shlex.split(cmd)) # Check @@ -100,11 +100,11 @@ def test_delete_dataset(dummy_file): # Register a dataset cmd = "register dataset my_cli_dataset_to_delete 0.0.1 --location_type dummy" - cmd += f" --schema {SCHEMA_VERSION} --root_dir {str(tmp_root_dir)}" + cmd += f" --schema {DEFAULT_SCHEMA_WORKING} --root_dir {str(tmp_root_dir)}" cli.main(shlex.split(cmd)) # Find the dataset id - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) f = datareg.Query.gen_filter("dataset.name", "==", "my_cli_dataset_to_delete") results = datareg.Query.find_datasets(["dataset.dataset_id"], [f]) assert len(results["dataset.dataset_id"]) == 1, "Bad result from query dcli4" @@ -112,11 +112,11 @@ def test_delete_dataset(dummy_file): # Delete the dataset cmd = f"delete dataset {d_id}" - cmd += f" --schema {SCHEMA_VERSION} --root_dir 
{str(tmp_root_dir)}" + cmd += f" --schema {DEFAULT_SCHEMA_WORKING} --root_dir {str(tmp_root_dir)}" cli.main(shlex.split(cmd)) # Check - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) f = datareg.Query.gen_filter("dataset.name", "==", "my_cli_dataset_to_delete") results = datareg.Query.find_datasets( [ @@ -143,11 +143,11 @@ def test_dataset_entry_with_keywords(dummy_file): # Register a dataset with many options cmd = "register dataset my_cli_dataset_keywords 1.0.0 --location_type dummy" cmd += " --is_overwritable --keywords simulation observation" - cmd += f" --schema {SCHEMA_VERSION} --root_dir {str(tmp_root_dir)}" + cmd += f" --schema {DEFAULT_SCHEMA_WORKING} --root_dir {str(tmp_root_dir)}" cli.main(shlex.split(cmd)) # Check - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) f = datareg.Query.gen_filter("dataset.name", "==", "my_cli_dataset_keywords") results = datareg.Query.find_datasets( [ @@ -162,6 +162,7 @@ def test_dataset_entry_with_keywords(dummy_file): assert getattr(r, "dataset.name") == "my_cli_dataset_keywords" assert getattr(r, "keyword.keyword") in ["observation", "simulation"] + def test_modify_dataset(dummy_file): """Make a simple entry, then modify it""" @@ -170,11 +171,11 @@ def test_modify_dataset(dummy_file): # Register a dataset cmd = "register dataset my_cli_dataset_to_modify 0.0.1 --location_type dummy" - cmd += f" --schema {SCHEMA_VERSION} --root_dir {str(tmp_root_dir)}" + cmd += f" --schema {DEFAULT_SCHEMA_WORKING} --root_dir {str(tmp_root_dir)}" cli.main(shlex.split(cmd)) # Find the dataset id - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) f = datareg.Query.gen_filter("dataset.name", "==", "my_cli_dataset_to_modify") results = datareg.Query.find_datasets(["dataset.dataset_id"], [f]) assert len(results["dataset.dataset_id"]) == 1, "Bad result from query dcli5" @@ -182,11 +183,11 @@ def test_modify_dataset(dummy_file): # Modify dataset cmd = f"modify dataset {d_id} description 'Updated CLI desc'" - cmd += f" --schema {SCHEMA_VERSION} --root_dir {str(tmp_root_dir)}" + cmd += f" --schema {DEFAULT_SCHEMA_WORKING} --root_dir {str(tmp_root_dir)}" cli.main(shlex.split(cmd)) # Check - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) f = datareg.Query.gen_filter("dataset.name", "==", "my_cli_dataset_to_modify") results = datareg.Query.find_datasets( [ diff --git a/tests/end_to_end_tests/test_database_functions.py b/tests/end_to_end_tests/test_database_functions.py index 52e27e93..4de7ad33 100644 --- a/tests/end_to_end_tests/test_database_functions.py +++ b/tests/end_to_end_tests/test_database_functions.py @@ -2,7 +2,7 @@ import pytest from dataregistry import DataRegistry -from dataregistry.db_basic import SCHEMA_VERSION +from dataregistry.schema import DEFAULT_SCHEMA_WORKING from database_test_utils import * @@ -15,7 +15,7 @@ def test_get_dataset_absolute_path(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) dset_name = 
"DESC:datasets:get_dataset_absolute_path_test" dset_ownertype = "group" @@ -40,7 +40,11 @@ def test_get_dataset_absolute_path(dummy_file): ) else: assert v == os.path.join( - str(tmp_root_dir), SCHEMA_VERSION, dset_ownertype, dset_owner, dset_relpath + str(tmp_root_dir), + DEFAULT_SCHEMA_WORKING, + dset_ownertype, + dset_owner, + dset_relpath, ) @@ -54,7 +58,7 @@ def test_find_entry(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Make a dataset d_id = _insert_dataset_entry(datareg, "test_find_entry:dataset", "0.0.1") @@ -90,7 +94,7 @@ def test_get_modifiable_columns(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) mod_list = datareg.Registrar.dataset.get_modifiable_columns() assert "description" in mod_list @@ -104,7 +108,7 @@ def test_get_keywords(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) keywords = datareg.Registrar.dataset.get_keywords() diff --git a/tests/end_to_end_tests/test_delete_dataset.py b/tests/end_to_end_tests/test_delete_dataset.py index 575fd4d7..a06767e4 100644 --- a/tests/end_to_end_tests/test_delete_dataset.py +++ b/tests/end_to_end_tests/test_delete_dataset.py @@ -2,7 +2,7 @@ import pytest from dataregistry import DataRegistry -from dataregistry.db_basic import SCHEMA_VERSION +from dataregistry.schema import DEFAULT_SCHEMA_WORKING from dataregistry.registrar.dataset_util import get_dataset_status from dataregistry.registrar.registrar_util import _form_dataset_path @@ -14,7 +14,7 @@ def test_delete_dataset_bad_entry(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Make sure we raise an exception trying to delete a dataset that doesn't exist with pytest.raises(ValueError, match="not found in"): @@ -38,7 +38,7 @@ def test_delete_dataset_entry(dummy_file, is_dummy, dataset_name): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Where is the real data? 
if is_dummy: @@ -91,7 +91,7 @@ def test_delete_dataset_entry(dummy_file, is_dummy, dataset_name): getattr(r, "dataset.owner_type"), getattr(r, "dataset.owner"), getattr(r, "dataset.relative_path"), - schema=SCHEMA_VERSION, + schema=DEFAULT_SCHEMA_WORKING, root_dir=str(tmp_root_dir), ) if dataset_name == "real_dataset_to_delete": diff --git a/tests/end_to_end_tests/test_keywords.py b/tests/end_to_end_tests/test_keywords.py index c0258b89..5675141c 100644 --- a/tests/end_to_end_tests/test_keywords.py +++ b/tests/end_to_end_tests/test_keywords.py @@ -6,7 +6,7 @@ import sqlalchemy import yaml from dataregistry import DataRegistry -from dataregistry.db_basic import SCHEMA_VERSION +from dataregistry.schema import DEFAULT_SCHEMA_WORKING from dataregistry.registrar.dataset_util import get_dataset_status, set_dataset_status from dataregistry.registrar.registrar_util import _form_dataset_path @@ -23,7 +23,7 @@ def test_register_dataset_with_bad_keywords(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Test case where keywords are not strings with pytest.raises(ValueError, match="not a valid keyword string"): @@ -53,7 +53,7 @@ def test_register_dataset_with_keywords(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Register two datasets with keywords d_id = _insert_dataset_entry( @@ -96,7 +96,7 @@ def test_modify_dataset_with_keywords(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Register a dataset with keywords d_id = _insert_dataset_entry( diff --git a/tests/end_to_end_tests/test_modify.py b/tests/end_to_end_tests/test_modify.py index 6fdc6a64..ec1a7f1c 100644 --- a/tests/end_to_end_tests/test_modify.py +++ b/tests/end_to_end_tests/test_modify.py @@ -1,6 +1,6 @@ import pytest from dataregistry import DataRegistry -from dataregistry.db_basic import SCHEMA_VERSION +from dataregistry.schema import DEFAULT_SCHEMA_WORKING from database_test_utils import * @@ -19,7 +19,7 @@ def test_modify_dataset(dummy_file, dataset_name, column, new_value): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add entry d_id = _insert_dataset_entry( @@ -54,7 +54,7 @@ def test_modify_execution(dummy_file, execution_name, column, new_value): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add entry e_id = _insert_execution_entry( @@ -80,7 +80,7 @@ def test_modify_not_allowed(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add entry d_id = _insert_dataset_entry( diff 
--git a/tests/end_to_end_tests/test_production_schema.py b/tests/end_to_end_tests/test_production_schema.py index 8a06e165..85a5f3b1 100644 --- a/tests/end_to_end_tests/test_production_schema.py +++ b/tests/end_to_end_tests/test_production_schema.py @@ -4,13 +4,14 @@ import pytest import yaml from dataregistry import DataRegistry -from dataregistry.db_basic import SCHEMA_VERSION, DbConnection +from dataregistry.schema import DEFAULT_SCHEMA_WORKING, DEFAULT_SCHEMA_PRODUCTION +from dataregistry.db_basic import DbConnection from database_test_utils import * # This is just to see what backend we are using # Remember no production schema when using sqlite backend -db_connection = DbConnection(None, schema=SCHEMA_VERSION) +db_connection = DbConnection(None, schema=DEFAULT_SCHEMA_WORKING) @pytest.mark.skipif( @@ -21,8 +22,10 @@ def test_register_with_production_dependencies(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - datareg_prod = DataRegistry(root_dir=str(tmp_root_dir), schema="production") + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) + datareg_prod = DataRegistry( + root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_PRODUCTION + ) # Make a dataset in each schema d_id_prod = _insert_dataset_entry( @@ -77,7 +80,7 @@ def test_production_schema_register(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema="production") + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_PRODUCTION) d_id = _insert_dataset_entry( datareg, @@ -112,7 +115,7 @@ def test_production_schema_bad_register(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema="production") + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_PRODUCTION) # Try to register dataset without production owner type with pytest.raises(ValueError, match="can go in the production schema"): @@ -132,16 +135,3 @@ def test_production_schema_bad_register(dummy_file): owner_type="production", is_overwritable=True, ) - - # Try to have a version suffix - with pytest.raises( - ValueError, match="Production entries can't have version suffix" - ): - d_id = _insert_dataset_entry( - datareg, - "DESC:datasets:bad_production_dataset_3", - "0.0.1", - owner="production", - owner_type="production", - version_suffix="prod", - ) diff --git a/tests/end_to_end_tests/test_query.py b/tests/end_to_end_tests/test_query.py index 827513e2..92268e6e 100644 --- a/tests/end_to_end_tests/test_query.py +++ b/tests/end_to_end_tests/test_query.py @@ -4,7 +4,7 @@ from sqlalchemy import inspect from dataregistry import DataRegistry -from dataregistry.db_basic import SCHEMA_VERSION +from dataregistry.schema import DEFAULT_SCHEMA_WORKING from database_test_utils import * # Establish connection to database (default schema) @@ -43,7 +43,7 @@ def test_query_all(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add entry d_id = _insert_dataset_entry( @@ -68,7 +68,7 @@ def test_query_between_columns(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = 
DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add entry _NAME = "DESC:datasets:test_query_between_columns" diff --git a/tests/end_to_end_tests/test_register_dataset_alias.py b/tests/end_to_end_tests/test_register_dataset_alias.py index 4b9ef6a0..3d413f50 100644 --- a/tests/end_to_end_tests/test_register_dataset_alias.py +++ b/tests/end_to_end_tests/test_register_dataset_alias.py @@ -1,5 +1,5 @@ from dataregistry import DataRegistry -from dataregistry.db_basic import SCHEMA_VERSION +from dataregistry.schema import DEFAULT_SCHEMA_WORKING from database_test_utils import * @@ -9,7 +9,7 @@ def test_register_dataset_alias(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add two dataset d_id = _insert_dataset_entry( diff --git a/tests/end_to_end_tests/test_register_dataset_dummy.py b/tests/end_to_end_tests/test_register_dataset_dummy.py index 7ccd45a3..6fa8ccc3 100644 --- a/tests/end_to_end_tests/test_register_dataset_dummy.py +++ b/tests/end_to_end_tests/test_register_dataset_dummy.py @@ -6,7 +6,7 @@ import sqlalchemy import yaml from dataregistry import DataRegistry -from dataregistry.db_basic import SCHEMA_VERSION +from dataregistry.schema import DEFAULT_SCHEMA_WORKING from dataregistry.registrar.dataset_util import get_dataset_status, set_dataset_status from dataregistry.registrar.registrar_util import _form_dataset_path @@ -23,7 +23,7 @@ def test_register_dataset_defaults(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add entry d_id = _insert_dataset_entry( @@ -50,7 +50,6 @@ def test_register_dataset_defaults(dummy_file): assert results["owner_type"][0] == "user" assert results["description"][0] == None assert results["relative_path"][0] == f"{_NAME}_0.0.1" - assert results["version_suffix"][0] == None assert results["data_org"][0] == "dummy" assert results["execution_id"][0] >= 0 assert results["dataset_id"][0] >= 0 @@ -88,13 +87,12 @@ def test_register_dataset_manual(dummy_file): _OWNER = "test_owner" _OWNER_TYPE = "group" _REL_PATH = "manual/rel/path" - _V_SUFFIX = "test" _ACCESS_API = "test_api" _IS_OVERWRITABLE = True # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add entry d_id = _insert_dataset_entry( @@ -105,7 +103,6 @@ def test_register_dataset_manual(dummy_file): owner=_OWNER, owner_type=_OWNER_TYPE, relative_path=_REL_PATH, - version_suffix=_V_SUFFIX, access_api=_ACCESS_API, is_overwritable=_IS_OVERWRITABLE, ) @@ -128,7 +125,6 @@ def test_register_dataset_manual(dummy_file): assert results["owner_type"][0] == _OWNER_TYPE assert results["description"][0] == _DESCRIPTION assert results["relative_path"][0] == _REL_PATH - assert results["version_suffix"][0] == _V_SUFFIX assert results["data_org"][0] == "dummy" assert results["execution_id"][0] >= 0 assert results["dataset_id"][0] >= 0 @@ -155,32 +151,6 @@ def test_register_dataset_manual(dummy_file): assert results["replace_iteration"][0] == 0 -def 
test_bump_vsuffix(dummy_file): - """Should not be able to bump datasets with a version suffix""" - - _NAME = "DESC:datasets:test_bump_vsuffix" - - # Establish connection to database - tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) - - # Add entry - d_id = _insert_dataset_entry( - datareg, - _NAME, - "0.0.1", - version_suffix="custom_suffix", - ) - - # Try to bump dataset with version suffix (should fail) - with pytest.raises(ValueError, match="Cannot bump"): - d_id = _insert_dataset_entry( - datareg, - _NAME, - "major", - ) - - @pytest.mark.parametrize( "v_type,ans", [ @@ -199,7 +169,7 @@ def test_dataset_bumping(dummy_file, v_type, ans): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add entry d_id = _insert_dataset_entry( @@ -233,7 +203,7 @@ def test_dataset_owner_types(dummy_file, owner_type): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add entry d_id = _insert_dataset_entry( @@ -266,7 +236,7 @@ def test_register_dataset_with_global_owner_set(dummy_file): tmp_src_dir, tmp_root_dir = dummy_file datareg = DataRegistry( root_dir=str(tmp_root_dir), - schema=SCHEMA_VERSION, + schema=DEFAULT_SCHEMA_WORKING, owner="DESC group", owner_type="group", ) @@ -307,7 +277,7 @@ def test_register_dataset_with_modified_default_execution(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) d_id_1 = _insert_dataset_entry( datareg, @@ -380,7 +350,7 @@ def test_dataset_query_return_format(dummy_file, return_format_str, expected_typ # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) _NAME = f"DESC:datasets:query_return_test_{return_format_str}" @@ -406,7 +376,7 @@ def test_query_all(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Register a dataset d_id_1 = _insert_dataset_entry( @@ -429,7 +399,7 @@ def test_dataset_bad_name_string(dummy_file, name): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Register a dataset with pytest.raises(ValueError, match="Cannot have character"): diff --git a/tests/end_to_end_tests/test_register_dataset_external.py b/tests/end_to_end_tests/test_register_dataset_external.py index 9264c538..b516258b 100644 --- a/tests/end_to_end_tests/test_register_dataset_external.py +++ b/tests/end_to_end_tests/test_register_dataset_external.py @@ -6,7 +6,7 @@ import sqlalchemy import yaml from dataregistry import DataRegistry -from dataregistry.db_basic import SCHEMA_VERSION +from dataregistry.schema 
import DEFAULT_SCHEMA_WORKING from dataregistry.registrar.dataset_util import get_dataset_status, set_dataset_status from dataregistry.registrar.registrar_util import _form_dataset_path @@ -18,7 +18,7 @@ def test_bad_register_dataset_external(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add entry with pytest.raises(ValueError, match="require either a url or contact_email"): @@ -43,7 +43,7 @@ def test_register_dataset_external(dummy_file, contact_email, url, rel_path): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add entry d_id = _insert_dataset_entry( diff --git a/tests/end_to_end_tests/test_register_dataset_real_data.py b/tests/end_to_end_tests/test_register_dataset_real_data.py index b33087a4..015b54ca 100644 --- a/tests/end_to_end_tests/test_register_dataset_real_data.py +++ b/tests/end_to_end_tests/test_register_dataset_real_data.py @@ -4,7 +4,7 @@ import pytest import yaml from dataregistry import DataRegistry -from dataregistry.db_basic import SCHEMA_VERSION +from dataregistry.schema import DEFAULT_SCHEMA_WORKING from dataregistry.registrar.dataset_util import get_dataset_status, set_dataset_status from dataregistry.registrar.registrar_util import _form_dataset_path from dataregistry.exceptions import DataRegistryRootDirBadState @@ -17,7 +17,7 @@ def test_copy_data(dummy_file, data_org): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # File/directory we are copying in if data_org == "file": @@ -63,7 +63,7 @@ def test_on_location_data(dummy_file, data_org, data_path): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) d_id = _insert_dataset_entry( datareg, @@ -100,7 +100,7 @@ def test_registering_symlinks(dummy_file, link): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) data_path = str(tmp_src_dir / link) @@ -123,7 +123,7 @@ def test_registering_bad_relative_path(dummy_file, link): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) data_path = str(tmp_src_dir / link) @@ -160,7 +160,7 @@ def test_registering_deleted_relative_path(dummy_file, link): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) data_path = str(tmp_src_dir / link) @@ -234,7 +234,7 @@ def test_registering_data_already_there(dummy_file, link, dest): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - 
datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) data_path = str(tmp_src_dir / link) diff --git a/tests/end_to_end_tests/test_register_execution.py b/tests/end_to_end_tests/test_register_execution.py index b7e085a2..beae33c1 100644 --- a/tests/end_to_end_tests/test_register_execution.py +++ b/tests/end_to_end_tests/test_register_execution.py @@ -4,7 +4,7 @@ import pytest import yaml from dataregistry import DataRegistry -from dataregistry.db_basic import SCHEMA_VERSION +from dataregistry.schema import DEFAULT_SCHEMA_WORKING from database_test_utils import * @@ -28,7 +28,7 @@ def test_register_execution_with_config_file(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add entry _make_dummy_config(tmp_src_dir) diff --git a/tests/end_to_end_tests/test_register_pipeline.py b/tests/end_to_end_tests/test_register_pipeline.py index 36f6ad92..622ff569 100644 --- a/tests/end_to_end_tests/test_register_pipeline.py +++ b/tests/end_to_end_tests/test_register_pipeline.py @@ -4,7 +4,7 @@ import pytest import yaml from dataregistry import DataRegistry -from dataregistry.db_basic import SCHEMA_VERSION +from dataregistry.schema import DEFAULT_SCHEMA_WORKING from database_test_utils import * @@ -57,7 +57,7 @@ def test_pipeline_entry(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Execution for stage 1 ex_id_1 = _insert_execution_entry( diff --git a/tests/end_to_end_tests/test_replace_dataset.py b/tests/end_to_end_tests/test_replace_dataset.py index cf257af7..f34c2ea1 100644 --- a/tests/end_to_end_tests/test_replace_dataset.py +++ b/tests/end_to_end_tests/test_replace_dataset.py @@ -1,5 +1,5 @@ from dataregistry import DataRegistry -from dataregistry.db_basic import SCHEMA_VERSION +from dataregistry.schema import DEFAULT_SCHEMA_WORKING from database_test_utils import * import pytest @@ -10,7 +10,7 @@ def test_register_dataset_twice(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add two dataset d_id = _insert_dataset_entry( @@ -24,7 +24,7 @@ def test_register_dataset_twice(dummy_file): datareg, "DESC:dataset:test_register_dataset_twice", "0.0.1", - relative_path="test_register_dataset_twice/test" + relative_path="test_register_dataset_twice/test", ) @@ -123,7 +123,7 @@ def test_replace_dataset(dummy_file, _REL_PATH, name_tag): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add a dataset d_id = _insert_dataset_entry( @@ -172,7 +172,7 @@ def test_replacing_deleted_dataset(dummy_file): # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) _NAME = 
"DESC:dataset:test_replacing_deleted_dataset" @@ -195,12 +195,13 @@ def test_replacing_deleted_dataset(dummy_file): "0.0.1", ) + def test_replacing_non_overwritable_dataset(dummy_file): """Should not be able to replace a non-overwritable dataset""" # Establish connection to database tmp_src_dir, tmp_root_dir = dummy_file - datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=SCHEMA_VERSION) + datareg = DataRegistry(root_dir=str(tmp_root_dir), schema=DEFAULT_SCHEMA_WORKING) # Add dataset d_id = _insert_dataset_entry( diff --git a/tests/unit_tests/test_registrar_util.py b/tests/unit_tests/test_registrar_util.py index f5e7461a..67acd392 100644 --- a/tests/unit_tests/test_registrar_util.py +++ b/tests/unit_tests/test_registrar_util.py @@ -15,10 +15,8 @@ "v_str,v_len,ans_maj,ans_min,ans_ptc,ans_suf,w_suf,bad", [ ("1.2.3", 3, "1", "2", "3", None, False, False), - ("4.5.6.mysuffix", 4, "4", "5", "6", "mysuffix", True, False), ("7.8.9", 3, "7", "8", "9", None, True, False), ("1.2.3.4.5", 5, "1", "2", "3", None, False, True), - ("1.2.3.mysuffix", 4, "1", "2", "3", "mysuffix", False, True), ("-1.2.3", 3, "-1", "2", "3", None, False, True), ], ) @@ -30,18 +28,16 @@ def test_parse_version_string( # Test bad cases, should raise ValueError if bad: with pytest.raises(ValueError): - tmp = _parse_version_string(v_str, with_suffix=w_suf) + tmp = _parse_version_string(v_str) # Test good cases else: - tmp = _parse_version_string(v_str, with_suffix=w_suf) + tmp = _parse_version_string(v_str) assert type(tmp) == dict assert len(tmp.keys()) == v_len assert tmp["major"] == ans_maj assert tmp["minor"] == ans_min assert tmp["patch"] == ans_ptc - if v_len == 4: - assert tmp["suffix"] == ans_suf @pytest.mark.parametrize( @@ -161,20 +157,19 @@ def test_read_file(tmpdir, nchars, max_config_length, ans): _read_configuration_file("i_dont_exist.txt", 10) @pytest.mark.parametrize( - "name,version_string,version_suffix,ans", + "name,version_string,ans", [ - ("mydataset", "1.1.1", None, "mydataset_1.1.1"), - ("mydataset", "1.1.1", "v1", "mydataset_1.1.1_v1"), + ("mydataset", "1.1.1", "mydataset_1.1.1"), ], ) -def test_relpath_from_name(name, version_string, version_suffix, ans): +def test_relpath_from_name(name, version_string, ans): """ Test dataset path construction Datasets should come back with the format: /// """ - tmp = _relpath_from_name(name, version_string, version_suffix) + tmp = _relpath_from_name(name, version_string) assert tmp == ans @pytest.mark.parametrize( diff --git a/tests/unit_tests/test_root_dir.py b/tests/unit_tests/test_root_dir.py index 7878b33a..08b99034 100644 --- a/tests/unit_tests/test_root_dir.py +++ b/tests/unit_tests/test_root_dir.py @@ -5,7 +5,7 @@ from dataregistry import DataRegistry _TEST_ROOT_DIR = "DataRegistry_data" -_NERSC_SITE = "/global/cfs/cdirs/desc-co/registry-beta" +_NERSC_SITE = "/global/cfs/cdirs/lsst/utilities/data-registry" _ENV_SITE = "nersc"