Merge pull request #150 from LSSTDESC/release_1.0.0
Release 1.0.0
stuartmcalpine authored Sep 24, 2024
2 parents ff27fcd + 691c5b2 commit bba71d6
Showing 45 changed files with 692 additions and 528 deletions.
16 changes: 4 additions & 12 deletions .github/workflows/ci.yml
@@ -70,13 +70,9 @@ jobs:
           echo "sqlalchemy.url : postgresql://postgres:postgres@localhost:5432/desc_data_registry" > $HOME/.config_reg_access
       # Create schemas
-      - name: Create data registry production schema
+      - name: Create data registry schemas
         run: |
-          python scripts/create_registry_schema.py --config $HOME/.config_reg_access --schema production
-      - name: Create data registry default schema
-        run: |
-          python scripts/create_registry_schema.py --config $HOME/.config_reg_access
+          python scripts/create_registry_schema.py --config $HOME/.config_reg_access --create_both
       # Run CI tests
       - name: Run CI tests
@@ -152,13 +148,9 @@ jobs:
           echo "sqlalchemy.url : postgresql://postgres:postgres@localhost:5432/desc_data_registry" > $DATAREG_CONFIG
       # Create schemas
-      - name: Create data registry production schema
+      - name: Create data registry schemas
         run: |
-          python scripts/create_registry_schema.py --config $DATAREG_CONFIG --schema production
-      - name: Create data registry default schema
-        run: |
-          python scripts/create_registry_schema.py --config $DATAREG_CONFIG
+          python scripts/create_registry_schema.py --config $DATAREG_CONFIG --create_both
       # Run CI tests
       - name: Run CI tests
11 changes: 11 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,14 @@
## Version 1.0.0 (Release)

- Update default NERSC site to
`/global/cfs/cdirs/lsst/utilities/desc-data-registry`
- Update default schema names (now stored in
  `src/dataregistry/schema/default_schema_names.yaml`)
- There is now a `reg_admin` account, which is the only account able to create
  the initial schemas. The schema creation script has been updated to grant the
  correct `reg_writer` and `reg_reader` privileges.
- Remove `version_suffix`

## Version 0.6.4

- Update `dregs ls` to be a bit cleaner. Also has `dregs ls --extended` option
56 changes: 56 additions & 0 deletions docs/source/dev_notes_database.rst
@@ -0,0 +1,56 @@
Database structure
==================

The database schemas
--------------------

There are two primary database schemas which the majority of users will work with:

- The "default" schema, whose name is set by the hard-coded variable
  ``DEFAULT_SCHEMA_WORKING`` in the ``src/dataregistry/db_basic.py`` file. It
  can be imported via ``from dataregistry.db_basic import DEFAULT_SCHEMA_WORKING``.
- The production schema. This is where production datasets go; general users
  have read-only access. By default this schema is named "production", however
  during schema creation (see below) you can specify a different name (though
  this should only be changed for testing purposes).

Users can specify a schema during the initialization of the ``DataRegistry``
object (by default ``DEFAULT_SCHEMA_WORKING`` is connected to). To connect to
the production schema, its name must be entered manually (see the production
schema tutorial). Likewise, a custom schema must be named manually, and it must
already have been created for the connection to work.

When using *SQLite* as the backend (useful for testing), the concepts of
schemas do not exist.
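To see what "no schemas" means in practice, here is a minimal standalone sketch
using Python's built-in ``sqlite3`` (the table name and columns here are
hypothetical for illustration, not the registry's actual layout): a table that
*Postgres* would address through a schema (e.g. ``working.dataset``) lives in a
single flat namespace in *SQLite*.

```python
import sqlite3

# SQLite has no schema namespaces: all tables share one flat namespace,
# so schema-qualified names like "working.dataset" do not apply.
con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE dataset (dataset_id INTEGER PRIMARY KEY, name TEXT)")
con.execute("INSERT INTO dataset (name) VALUES ('my_first_dataset')")
rows = con.execute("SELECT name FROM dataset").fetchall()
print(rows)  # [('my_first_dataset',)]
```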

First time creation of database schemas
---------------------------------------

In the top level ``scripts`` directory there is a ``create_registry_schema.py``
script that performs the initial schema creation. This script must be run
before the data registry can be used, for both *Postgres* and *SQLite*
backends.

First, make sure your ``~/.config_reg_access`` and ``~/.pgpass`` are correctly
setup (see "Getting set up" for more information on these configuration files).
When creating schemas at NERSC, make sure the SPIN instance of the *Postgres*
database is running.

The script must be run twice, first for the production schema, then for the
general schema (or run in a single call using the ``--create_both``
argument). There are four arguments that can be specified (all optional):

- ``--config`` : Location of the data registry configuration file
(``~/.config_reg_access`` by default)
- ``--schema`` : The name of the schema (default is ``DEFAULT_SCHEMA_WORKING``)
- ``--production-schema``: The name of the production schema (default
"production")
- ``--create_both`` : Create both the production schema and working schema in
one call (the production schema will be made first, then the working schema)

The typical initialization would be:

.. code-block:: bash

   python3 create_registry_schema.py --create_both
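As a further command sketch (not executed here; the flags are as documented
above, and the schema names are placeholders chosen for testing), creating both
schemas in one call with non-default names might look like:

```shell
# Create both schemas in one call, giving them non-default (testing) names
python3 create_registry_schema.py \
    --config ~/.config_reg_access \
    --schema tutorial_working \
    --production-schema tutorial_production \
    --create_both
```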
4 changes: 4 additions & 0 deletions docs/source/dev_notes_spin.rst
@@ -0,0 +1,4 @@
SPIN
====

Details on setting up the SPIN instance...
8 changes: 8 additions & 0 deletions docs/source/index.rst
@@ -52,6 +52,14 @@ them.
reference_cli
reference_schema

.. toctree::
:maxdepth: 2
:caption: Developer notes:
:hidden:

dev_notes_spin
dev_notes_database

.. toctree::
:maxdepth: 2
:caption: Contact:
12 changes: 12 additions & 0 deletions docs/source/reference_python.rst
@@ -23,6 +23,18 @@ It connects the user to the database, and serves as a wrapper to both the

.. automethod:: dataregistry.registrar.dataset.DatasetTable.register

.. automethod:: dataregistry.registrar.dataset.DatasetTable.replace

.. automethod:: dataregistry.registrar.dataset.DatasetTable.modify

.. automethod:: dataregistry.registrar.dataset.DatasetTable.delete

.. automethod:: dataregistry.registrar.dataset.DatasetTable.add_keywords

.. automethod:: dataregistry.registrar.dataset.DatasetTable.get_modifiable_columns

.. automethod:: dataregistry.registrar.dataset.DatasetTable.get_keywords

.. automethod:: dataregistry.registrar.execution.ExecutionTable.register

.. automethod:: dataregistry.registrar.dataset_alias.DatasetAliasTable.register
32 changes: 11 additions & 21 deletions docs/source/tutorial_cli.rst
@@ -38,8 +38,8 @@ Typing
 will list all the metadata properties that can be associated with a dataset
 during registration. As when registering datasets using the ``dataregistry``
-package, the ``relative_path`` and ``version`` string properties are mandatory,
-which will always be the first two parameters passed to the ``dregs register
+package, the dataset ``name`` and ``version`` properties are mandatory, which
+will always be the first two parameters passed to the ``dregs register
 dataset`` command respectively.

For example, say I have produced some data from my latest DESC publication that
@@ -59,11 +59,9 @@ would run the CLI as follows:
        --description "Data from my_paper_dataset"
 
 This will recursively copy the ``/some/place/at/nersc/my_paper_dataset/``
-directory into the data registry shared space under the relative path
-``my_paper_dataset``. As we did not specify a ``--name`` for the dataset, the
-``name`` column in the database will automatically be assigned as
-``my_paper_dataset`` (and all other properties we did not specify will keep
-their default values).
+directory into the data registry shared space with
+``name='my_paper_dataset'`` (other non-specified properties will keep their
+default values).

Updating a dataset
------------------
@@ -76,26 +74,18 @@ initial registration, we need to create a new version of the dataset.
 .. code-block:: bash
 
    dregs register dataset \
-       my_paper_dataset_updated \
+       my_paper_dataset \
        patch \
        --old-location /some/place/at/nersc/my_paper_dataset_updated/ \
        --owner_type project \
        --owner "DESC Generic Working Group" \
-       --description "Data from my_paper_dataset describing bugfix" \
-       --name my_paper_dataset
+       --description "Data from my_paper_dataset describing bugfix"
 
-Here we associate it with the previous dataset through ``--name
-my_paper_dataset``, and tell the data registry to automatically bump the patch
-version to ``1.0.1`` by specifying "patch" as the version string (you could
-however have entered "1.0.1" here if you prefer).
-
-.. note::
-   Remember, if the dataset is non-overwritable, the relative paths in the data
-   registry need to be unique, which is why we could not have the relative path
-   of the second entry match the first. But for datasets only the ``name``
-   plus ``version`` has to be unique, which is how we could associate them with
-   the same ``name`` column.
+Here we associate it with the previous dataset through
+``name=my_paper_dataset`` (making sure we keep the same ``owner`` and
+``owner_type``), and tell the data registry to automatically bump the patch
+version to ``1.0.1`` by specifying "patch" as the version string (you could
+however have entered "1.0.1" here if you prefer).
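The "patch"/"minor"/"major" keywords follow ordinary semantic-versioning
bump rules. As an illustrative sketch of that rule (a hypothetical
reimplementation for clarity, not the registry's actual code):

```python
def bump_version(current: str, bump: str) -> str:
    """Bump a MAJOR.MINOR.PATCH version string by the named component."""
    major, minor, patch = (int(p) for p in current.split("."))
    if bump == "major":
        return f"{major + 1}.0.0"
    if bump == "minor":
        return f"{major}.{minor + 1}.0"
    if bump == "patch":
        return f"{major}.{minor}.{patch + 1}"
    return bump  # an explicit version string like "1.0.1" is used as-is

print(bump_version("1.0.0", "patch"))  # 1.0.1
```

Passing an explicit string such as "1.0.1" instead of a bump keyword simply
sets that version directly.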

Querying the data registry
--------------------------
60 changes: 42 additions & 18 deletions docs/source/tutorial_notebooks/datasets_deeper_look.ipynb
@@ -39,8 +39,20 @@
},
"outputs": [],
"source": [
+    "# Come up with a random owner name to avoid clashes\n",
+    "from random import randint\n",
+    "OWNER = \"tutorial_\" + str(randint(0,int(1e6)))\n",
+    "\n",
     "import dataregistry\n",
-    "print(\"Working with dataregistry version:\", dataregistry.__version__)"
+    "print(f\"Working with dataregistry version: {dataregistry.__version__} as random owner {OWNER}\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "4c2f92bf-9048-421e-b896-292eb00542c8",
+   "metadata": {},
+   "source": [
+    "**Note**: running some of the cells below may fail, especially if run multiple times. This will most likely be from clashes with the unique constraints within the database (hopefully the error output is informative). If this happens, either (1) rerun the cell above to establish a new connection with a new random owner, or (2) manually change the conflicting database column(s) during registration."
+   ]
+  },
{
@@ -55,13 +67,15 @@
"cell_type": "code",
"execution_count": null,
"id": "72eabcd0-b05e-4e87-9ed1-6450ac196b05",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
"outputs": [],
"source": [
"from dataregistry import DataRegistry\n",
"\n",
-    "# Establish connection to database (using defaults)\n",
-    "datareg = DataRegistry()"
+    "# Establish connection to the tutorial schema\n",
+    "datareg = DataRegistry(schema=\"tutorial_working\", owner=OWNER)"
]
},
{
@@ -78,7 +92,9 @@
"cell_type": "code",
"execution_count": null,
"id": "560b857c-7d94-44ad-9637-0b107cd42259",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
"outputs": [],
"source": [
"print(datareg.Registrar.dataset.get_keywords())"
@@ -98,7 +114,9 @@
"cell_type": "code",
"execution_count": null,
"id": "44581049-1d15-44f0-b1ed-34cff6cdb45a",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
"outputs": [],
"source": [
"# Add new dataset entry with keywords.\n",
@@ -132,7 +150,9 @@
"cell_type": "code",
"execution_count": null,
"id": "09478b87-7d5a-4814-85c7-49f90e0db45d",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
"outputs": [],
"source": [
"# List of keywords to add to dataset\n",
@@ -160,22 +180,24 @@
"\n",
"The files and directories of registered datasets are stored under a path relative to the root directory (`root_dir`), which, by default, is a shared space at NERSC.\n",
"\n",
-    "By default, the relative_path is constructed from the `name`, `version` and `version_suffix` (if there is one), in the format `relative_path=<name>/<version>_<version_suffix>`. However, one can also manually select the relative_path during registration, for example"
+    "By default, the `relative_path` is constructed from the `name`, `version` and `version_suffix` (if there is one), in the format `relative_path=<name>/<version>_<version_suffix>`. However, one can also manually select the `relative_path` during registration, for example"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5bc0d5b6-f50a-4646-bc1b-7d9e829e91bc",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
"outputs": [],
"source": [
"# Add new entry with a manual relative path.\n",
"datareg.Registrar.dataset.register(\n",
" \"nersc_tutorial:my_desc_dataset_with_relative_path\",\n",
" \"1.0.0\",\n",
-    "    relative_path=\"nersc_tutorial/my_desc_dataset\",\n",
-    "    location_type=\"dummy\", # for testing, means we need no data\n",
+    "    relative_path=f\"NERSC_tutorial/{OWNER}/my_desc_dataset\",\n",
+    "    location_type=\"dummy\", # for testing, means we need no actual data to exist\n",
")"
]
},
@@ -216,19 +238,21 @@
"cell_type": "code",
"execution_count": null,
"id": "718d1cd8-4517-4597-9e36-e403e219cef2",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
"outputs": [],
"source": [
-    "from dataregistry.dataset_util import get_dataset_status\n",
+    "from dataregistry.registrar.dataset_util import get_dataset_status\n",
"\n",
"# The `get_dataset_status` function takes in a dataset `status` and a bit index, and returns if that bit is True or False\n",
"dataset_status = 1\n",
"\n",
"# Is dataset valid?\n",
-    "print(f\"Dataset is valid: {get_dataset_status(dataset_status, \"valid\"}\")\n",
+    "print(f\"Dataset is valid: {get_dataset_status(dataset_status, 'valid')}\")\n",
     "\n",
     "# Is dataset replaced?\n",
-    "print(f\"Dataset is replaced: {get_dataset_status(dataset_status, \"replaced\"}\")"
+    "print(f\"Dataset is replaced: {get_dataset_status(dataset_status, 'replaced')}\")"
]
},
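The packed-integer dataset status above can be illustrated with a small
standalone sketch. The bit positions below are assumptions for demonstration
only; consult ``dataregistry.registrar.dataset_util`` for the authoritative
layout.

```python
# Hypothetical bit layout: bit 0 = valid, bit 1 = deleted, bit 2 = replaced
STATUS_BITS = {"valid": 0, "deleted": 1, "replaced": 2}

def check_status_bit(status: int, flag: str) -> bool:
    """Return True if the named flag's bit is set in the packed status."""
    return (status >> STATUS_BITS[flag]) & 1 == 1

status = 1  # binary 0b001 -> only the "valid" bit is set
print(check_status_bit(status, "valid"))     # True
print(check_status_bit(status, "replaced"))  # False
```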
{
@@ -257,9 +281,9 @@
],
"metadata": {
"kernelspec": {
-    "display_name": "Python 3 (ipykernel)",
+    "display_name": "DREGS-env",
     "language": "python",
-    "name": "python3"
+    "name": "venv"
},
"language_info": {
"codemirror_mode": {
@@ -271,7 +295,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
-    "version": "3.10.13"
+    "version": "3.9.18"
}
},
"nbformat": 4,