From bec67a076e93c91c2de5d47bfd3943e345d63f81 Mon Sep 17 00:00:00 2001 From: George Thomas Date: Mon, 10 Jun 2024 14:26:45 -0700 Subject: [PATCH 1/8] (HP-1483): add healdata-utils to pyproject.toml and re-lock --- vlmd-submission-tools/poetry.lock | 302 +++++++++++++++++- vlmd-submission-tools/pyproject.toml | 2 + .../template_submission_bad_format.csv | 7 - .../tests/test_mapping_utils.py | 68 ---- .../vlmd_submission_tools/common/fields.json | 209 ------------ .../common/mapping_utils.py | 163 ---------- .../vlmd_submission_tools/common/schemas.py | 29 -- 7 files changed, 302 insertions(+), 478 deletions(-) delete mode 100644 vlmd-submission-tools/tests/templates/template_submission_bad_format.csv delete mode 100644 vlmd-submission-tools/tests/test_mapping_utils.py delete mode 100644 vlmd-submission-tools/vlmd_submission_tools/common/fields.json delete mode 100644 vlmd-submission-tools/vlmd_submission_tools/common/mapping_utils.py delete mode 100644 vlmd-submission-tools/vlmd_submission_tools/common/schemas.py diff --git a/vlmd-submission-tools/poetry.lock b/vlmd-submission-tools/poetry.lock index adfb0dc2..9b63f028 100644 --- a/vlmd-submission-tools/poetry.lock +++ b/vlmd-submission-tools/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "attrs" @@ -276,6 +276,17 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "et-xmlfile" +version = "1.1.0" +description = "An implementation of lxml.xmlfile for the standard library" +optional = false +python-versions = ">=3.6" +files = [ + {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, + {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, +] + [[package]] name = "exceptiongroup" version = "1.1.3" @@ -374,6 +385,29 @@ pyopenssl = ["cryptography (>=38.0.3)", "pyopenssl (>=20.0.0)"] reauth = ["pyu2f (>=0.1.5)"] requests = ["requests (>=2.20.0,<3.0.0dev)"] +[[package]] +name = "healdata-utils" +version = "0.5.1" +description = "Data packaging tools for the HEAL data ecosystem" +optional = false +python-versions = "*" +files = [ + {file = "healdata_utils-0.5.1-py3-none-any.whl", hash = "sha256:76c1dc5c7855e990c3891a18c3a2c8365f1d79221d1d1c46a69380ca09dd91af"}, + {file = "healdata_utils-0.5.1.tar.gz", hash = "sha256:681b815a1a1b53ad107027352a7abd9d14fa805141d8942f1bfef3d6957c0a57"}, +] + +[package.dependencies] +charset-normalizer = ">=2.1" +click = ">=8.1.3" +jsonschema = ">=4.17.3" +openpyxl = "*" +pandas = ">=1.4" +petl = "1.7.12" +pyreadstat = ">=1.2.0" +python-slugify = "*" +PyYaml = ">=6.0" +visions = ">=0.7.5" + [[package]] name = "humanize" version = "4.6.0" @@ -606,6 +640,80 @@ files = [ {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, ] +[[package]] +name = "multimethod" +version = "1.11.2" +description = "Multiple argument dispatching." 
+optional = false +python-versions = ">=3.9" +files = [ + {file = "multimethod-1.11.2-py3-none-any.whl", hash = "sha256:cb338f09395c0ee87d36c7691cdd794d13d8864358082cf1205f812edd5ce05a"}, + {file = "multimethod-1.11.2.tar.gz", hash = "sha256:7f2a4863967142e6db68632fef9cd79053c09670ba0c5f113301e245140bba5c"}, +] + +[[package]] +name = "networkx" +version = "3.2.1" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.9" +files = [ + {file = "networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2"}, + {file = "networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6"}, +] + +[package.extras] +default = ["matplotlib (>=3.5)", "numpy (>=1.22)", "pandas (>=1.4)", "scipy (>=1.9,!=1.11.0,!=1.11.1)"] +developer = ["changelist (==0.4)", "mypy (>=1.1)", "pre-commit (>=3.2)", "rtoml"] +doc = ["nb2plots (>=0.7)", "nbconvert (<7.9)", "numpydoc (>=1.6)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.14)", "sphinx (>=7)", "sphinx-gallery (>=0.14)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.11)", "sympy (>=1.10)"] +test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] + +[[package]] +name = "numpy" +version = "1.26.4" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, + {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, + {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4"}, + {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f"}, + {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a"}, + {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2"}, + {file = "numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07"}, + {file = "numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5"}, + {file = "numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71"}, + {file = "numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef"}, + {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e"}, + {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5"}, + {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a"}, + {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a"}, + 
{file = "numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20"}, + {file = "numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2"}, + {file = "numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218"}, + {file = "numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b"}, + {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b"}, + {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed"}, + {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a"}, + {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"}, + {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"}, + {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"}, + {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"}, + {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"}, + {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"}, + {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"}, + {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"}, + {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"}, + {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"}, + {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"}, + {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, +] + [[package]] name = "oauthlib" version = "3.2.2" @@ -622,6 +730,20 @@ rsa = ["cryptography (>=3.0.0)"] signals = ["blinker (>=1.4.0)"] signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] +[[package]] +name = "openpyxl" +version = "3.1.3" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +optional = false 
+python-versions = ">=3.6" +files = [ + {file = "openpyxl-3.1.3-py2.py3-none-any.whl", hash = "sha256:25071b558db709de9e8782c3d3e058af3b23ffb2fc6f40c8f0c45a154eced2c3"}, + {file = "openpyxl-3.1.3.tar.gz", hash = "sha256:8dd482e5350125b2388070bb2477927be2e8ebc27df61178709bc8c8751da2f9"}, +] + +[package.dependencies] +et-xmlfile = "*" + [[package]] name = "packaging" version = "23.1" @@ -633,6 +755,79 @@ files = [ {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] +[[package]] +name = "pandas" +version = "2.2.2" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"}, + {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99"}, + {file = "pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee"}, + {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db"}, + {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1"}, + {file = "pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = 
"sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"}, + {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd"}, + {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863"}, + {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921"}, + {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a"}, + {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92fd6b027924a7e178ac202cfbe25e53368db90d56872d20ffae94b96c7acc57"}, + {file = "pandas-2.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:640cef9aa381b60e296db324337a554aeeb883ead99dc8f6c18e81a93942f5f4"}, + {file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.7" + +[package.extras] +all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] +aws = ["s3fs (>=2022.11.0)"] +clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] +compression = ["zstandard (>=0.19.0)"] +computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] +feather = ["pyarrow (>=10.0.1)"] +fss = ["fsspec (>=2022.11.0)"] +gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] +hdf5 = ["tables (>=3.8.0)"] +html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] +mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] +parquet = ["pyarrow (>=10.0.1)"] +performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr 
(>=2.8.4)"] +plot = ["matplotlib (>=3.6.3)"] +postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +pyarrow = ["pyarrow (>=10.0.1)"] +spss = ["pyreadstat (>=1.2.0)"] +sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.9.2)"] + [[package]] name = "parameterized" version = "0.9.0" @@ -779,6 +974,44 @@ files = [ [package.extras] plugins = ["importlib-metadata"] +[[package]] +name = "pyreadstat" +version = "1.2.7" +description = "Reads and Writes SAS, SPSS and Stata files into/from pandas data frames." +optional = false +python-versions = "*" +files = [ + {file = "pyreadstat-1.2.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8b53853d05527a44a5dca33df309b3a7d5c2ca4a513ee9056ffc1b0bf6cbf917"}, + {file = "pyreadstat-1.2.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cb79cfcad497a90ae116dde05ad45bdeab26c85915493b8e29a474c449ab55cf"}, + {file = "pyreadstat-1.2.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34693684338acd6d0dcf02cc2dc6bc6fe70a4a7bc2d52fea4a67d6f7bfd8f648"}, + {file = "pyreadstat-1.2.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:200096fa11562723f2c53e13e5e76b9ab72ae395329de1ada32ccb743b9c1752"}, + {file = "pyreadstat-1.2.7-cp310-cp310-win_amd64.whl", hash = "sha256:5c6852a28b6ee9b5eae4c7a6a29aaeb2072a3b9f0a2c8084b96d6e84eff95990"}, + {file = "pyreadstat-1.2.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:603d7117dfdb6ca7231f9e15fa8496cf4187dc6358768547e66bbfff0b4ceda6"}, + {file = "pyreadstat-1.2.7-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:177ca4500b4f1c471297b8041437487fcc83d9fb39f8a45493bdf37a62a3f965"}, + {file = "pyreadstat-1.2.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61fcf55f2db2306c984e488c4fbd22786e01a06a4c0e297b52b23c8e0a59eefb"}, + {file = "pyreadstat-1.2.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd2100cbe74ecbc13d84556aeb8d4ffc2e2a448f2fa0056cb00d66cea79d55aa"}, + {file = "pyreadstat-1.2.7-cp311-cp311-win_amd64.whl", hash = "sha256:c801cbb4ec6aa07346347fcafeda1e08af50508f7b1a1850be5dc369e2d12e6f"}, + {file = "pyreadstat-1.2.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3b2867a76ced9286942f8f75d50e8dcd2fd3601c12d1d1ed5c62d55f99747920"}, + {file = "pyreadstat-1.2.7-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:6303c8acfd54607bd5d6334149a0051d4f5ce0458089e22e7ad876ad0c12d354"}, + {file = "pyreadstat-1.2.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6314a0fc91a0e56f29c91da5ac9502429006800487793f82eabc797205d4e224"}, + {file = "pyreadstat-1.2.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7656c368711e356babece8fba6a8780b800c7e1379cfda6f22d97081b60a7fb9"}, + {file = "pyreadstat-1.2.7-cp312-cp312-win_amd64.whl", hash = "sha256:d9a8ec8bde0e82f5e45a5906499deb3615d874e80bd36756c42c8a43bb5a59c9"}, + {file = "pyreadstat-1.2.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4a4f0dfb117183986c5ccf74e45e7a356cdefe839483d60ebb7a19ae7da820ac"}, + {file = "pyreadstat-1.2.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0e21c94514d02ac46d8e9f9008f99b0760478df90ca59f95be8705ba1a48a514"}, + {file = "pyreadstat-1.2.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7e1d2c516e5233fb0f632a082748ca694fc8fff7ab88622b6c6585a35c6701a0"}, 
+ {file = "pyreadstat-1.2.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b1823ee4aacefe92f6a419b6b5f30a6632a0c1a6785dc352a05fb259697304a"}, + {file = "pyreadstat-1.2.7-cp38-cp38-win_amd64.whl", hash = "sha256:41ff6a79eea517f83958f0d4d2ba0dd17996d11028f3dbdaac2fb119e5381076"}, + {file = "pyreadstat-1.2.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:82152e0df092cd0fdb0d43c466210f3cf75d8e67854ecd6bd93a9ac05e218312"}, + {file = "pyreadstat-1.2.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0fe67f8f8c92f1861a0e044bc275644de552231e9a28960d54c6c5ca4cb285cf"}, + {file = "pyreadstat-1.2.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8985b9f76dbcc9fa687b0b64090ec821a5c5af2d1f57410635e2d5bdfc0b20b"}, + {file = "pyreadstat-1.2.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:259797c81d96e0c179f224565b75902d2bac02dd731a0a865ab04731ffdfc682"}, + {file = "pyreadstat-1.2.7-cp39-cp39-win_amd64.whl", hash = "sha256:91015a4b9e853544f8737c818a88881a36de4b8dbcf80387773aacbb6f0d678b"}, + {file = "pyreadstat-1.2.7.tar.gz", hash = "sha256:bc2142be4773a9e7ff844068d0b48c413f9f46ba9511408bcd5dbec9b20aab6d"}, +] + +[package.dependencies] +pandas = ">=1.2.0" + [[package]] name = "pyrsistent" version = "0.19.3" @@ -886,6 +1119,17 @@ text-unidecode = ">=1.3" [package.extras] unidecode = ["Unidecode (>=1.1.1)"] +[[package]] +name = "pytz" +version = "2024.1" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, + {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, +] + [[package]] name = "pyyaml" version = "6.0" @@ -956,6 +1200,23 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "requests-mock" +version = "1.12.1" +description = "Mock out responses from the requests package" +optional = false +python-versions = ">=3.5" +files = [ + {file = "requests-mock-1.12.1.tar.gz", hash = "sha256:e9e12e333b525156e82a3c852f22016b9158220d2f47454de9cae8a77d371401"}, + {file = "requests_mock-1.12.1-py2.py3-none-any.whl", hash = "sha256:b1e37054004cdd5e56c84454cc7df12b25f90f382159087f4b6915aaeef39563"}, +] + +[package.dependencies] +requests = ">=2.22,<3" + +[package.extras] +fixture = ["fixtures"] + [[package]] name = "requests-oauthlib" version = "1.3.1" @@ -1167,6 +1428,17 @@ files = [ {file = "typing_extensions-4.6.3.tar.gz", hash = "sha256:d91d5919357fe7f681a9f2b5b4cb2a5f1ef0a1e9f59c4d8ff0d3491e05c0ffd5"}, ] +[[package]] +name = "tzdata" +version = "2024.1" +description = "Provider of IANA time zone data" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"}, + {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"}, +] + [[package]] name = "urllib3" version = "1.26.9" @@ -1199,6 +1471,32 @@ decorator = ">=3.4.0" [package.extras] test = ["flake8 (>=2.4.0)", "isort (>=4.2.2)", "pytest (>=2.2.3)"] +[[package]] +name = "visions" +version = "0.7.6" +description = "Visions" +optional = false +python-versions = ">=3.8" +files = [ + {file = "visions-0.7.6-py3-none-any.whl", hash = 
"sha256:72b7f8dbc374e9d6055e938c8c67b0b8da52f3bcb8320f25d86b1a57457e7aa6"}, + {file = "visions-0.7.6.tar.gz", hash = "sha256:00f494a7f78917db2292e11ea832c6e026b64783e688b11da24f4c271ef1631d"}, +] + +[package.dependencies] +attrs = ">=19.3.0" +multimethod = ">=1.4" +networkx = ">=2.4" +numpy = ">=1.23.2" +pandas = ">=2.0.0" + +[package.extras] +all = ["Pillow", "attrs (>=19.3.0)", "imagehash", "matplotlib", "multimethod (>=1.4)", "networkx (>=2.4)", "numpy (>=1.23.2)", "pandas (>=2.0.0)", "pydot", "pygraphviz", "shapely"] +dev = ["IPython", "Sphinx-copybutton", "black (>=20.8b1)", "isort (>=5.0.9)", "mypy (>=0.770)", "nbsphinx", "recommonmark (>=0.6.0)", "setuptools (>=46.1.3)", "sphinx-autodoc-typehints (>=1.10.3)", "sphinx-rtd-theme (>=0.4.3)", "wheel (>=0.34.2)"] +plotting = ["matplotlib", "pydot", "pygraphviz"] +test = ["Pillow", "big-o (>=0.10.1)", "black (>=19.10b0)", "check-manifest (>=0.41)", "imagehash", "isort (>=5.0.9)", "matplotlib", "mypy (>=0.800)", "numba", "pandas", "pre-commit", "pyarrow (>=1.0.1)", "pydot", "pyspark", "pytest (>=5.2.0)", "pytest-spark (>=0.6.0)", "shapely", "twine (>=3.1.1)"] +type-geometry = ["shapely"] +type-image-path = ["Pillow", "imagehash"] + [[package]] name = "websocket-client" version = "1.5.3" @@ -1218,4 +1516,4 @@ test = ["websockets"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "329f221b6b67c43ae8ff4cedd7ecca5f22acc58a01506300f244a69dadac3776" +content-hash = "f873068e454e91ce3abec04042338c0f0a2704f8a56157d2b3f26e52bf275ce0" diff --git a/vlmd-submission-tools/pyproject.toml b/vlmd-submission-tools/pyproject.toml index fb28cc18..e8dbd442 100644 --- a/vlmd-submission-tools/pyproject.toml +++ b/vlmd-submission-tools/pyproject.toml @@ -28,6 +28,8 @@ frictionless = "^5.12.1" parameterized = "^0.9.0" pytest = "^7.4.2" pytest-cov = "^4.1.0" +healdata-utils = "^0.5.1" +requests-mock = "^1.12.1" [tool.poetry.dev-dependencies] diff --git a/vlmd-submission-tools/tests/templates/template_submission_bad_format.csv b/vlmd-submission-tools/tests/templates/template_submission_bad_format.csv deleted file mode 100644 index 02eb5f8d..00000000 --- a/vlmd-submission-tools/tests/templates/template_submission_bad_format.csv +++ /dev/null @@ -1,7 +0,0 @@ -name,title,description,type,format,constraints.maxLength,constraints.pattern,constraints.minimum,constraints.maximum,ordered,missingValues,trueValues,falseValues,repo_link,cde_id,ontology_id,encoding,constraints.enum -participant_id,Participant Id,Unique identifier for participant,string,9999,,[A-Z][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9],,,,,,,,,,, -race,Race,Self-reported race,integer,,,,,,,99,,,,NLM=Fakc6Jy2x|NLM=m1_atF7L7U,,1=White|2=Black or African American|3=American Indian or Alaska Native|4=Native| 5=Hawaiian or Other Pacific Islander|6=Asian|7=Some other race|8=Multiracial|99=Not reported,1|2|3|4|5|6|7|8 -age,Age,What is your age? (age at enrollment),integer,years,,,0,90,,,,,,,,, -hispanic,"Hispanic, Latino, or Spanish Origin","Are you of Hispanic, Latino, or Spanish origin?",boolean,,,,,,,Not reported,No,Yes,,,,, -sex_at_birth,Sex at Birth,The self-reported sex of the participant/subject at birth,string,,,,,,,Prefer not to answer|Unknown,,,,NLM=ezelurehr2,,,Male|Female|Intersex|None of these describe me|Prefer not to answer|Unknown -SU4,Heroin Days Used,During the past 30 days how many days did you use heroin (alone or mixed with other drugs)? 
] [Write 0 days if no use],integer,days,,,,,,,,,,,is=CHEBI=27808|is=RXNORM=3304,, diff --git a/vlmd-submission-tools/tests/test_mapping_utils.py b/vlmd-submission-tools/tests/test_mapping_utils.py deleted file mode 100644 index 27bd802a..00000000 --- a/vlmd-submission-tools/tests/test_mapping_utils.py +++ /dev/null @@ -1,68 +0,0 @@ -from vlmd_submission_tools.common import mapping_utils - -class TestMappingUtils: - - - def test_split_str_array(self): - string="foo|bar" - expected=["foo", "bar"] - assert mapping_utils.split_str_array(string) == expected - sep="#" - string="foo#bar" - expected=["foo", "bar"] - assert mapping_utils.split_str_array(string,sep) == expected - - - def test_map_keys_vals(self): - keys=['key1', 'key2', 'key3'] - vals=['val1', 'val2', 'val3'] - expected={'key1': 'val1', 'key2': 'val2', 'key3': 'val3'} - assert mapping_utils.map_keys_vals(keys,vals) == expected - - - def test_split_and_map(self): - string="1=foo|2=bar|3=flim" - prop = { - 'items': { - 'properties': { - 'key1': 'val1', - 'key2': 'val2', - 'key3': 'val3' - } - } - } - expected=[ - {'key1': '1', 'key2': 'foo'}, - {'key1': '2', 'key2': 'bar'}, - {'key1': '3', 'key2': 'flim'} - ] - assert mapping_utils.split_and_map(string,prop) == expected - - - def test_loads_dict(self): - string="1=foo|2=bar|3=flim" - expected={'1': 'foo', '2': 'bar', '3': 'flim'} - assert mapping_utils.loads_dict(string)== expected - string2="1_foo#2_bar#3_flim" - result = mapping_utils.loads_dict(string2,item_sep='#',key_val_sep='_') - assert result == expected - - - def test_to_bool(self): - test_vals=['True', 'true', '1', 'Yes', 'Y', 'Required'] - for val in test_vals: - assert mapping_utils.to_bool(val) - test_vals=['False', 'false', '0', 'No', 'N', 'Not Required'] - for val in test_vals: - assert not mapping_utils.to_bool(val) - test_vals=['Foo', 'Bar'] - expected_empty = "" - for val in test_vals: - assert mapping_utils.to_bool(val) == expected_empty - - - def test_join_dict_vals(self): - sep="|" - dict={'key1': 'val1', 'key2': 'val2', 'key3': 'val3'} - expected="val1|val2|val3" - assert mapping_utils.join_dictvals(dict,sep) == expected diff --git a/vlmd-submission-tools/vlmd_submission_tools/common/fields.json b/vlmd-submission-tools/vlmd_submission_tools/common/fields.json deleted file mode 100644 index 9dbcfe9d..00000000 --- a/vlmd-submission-tools/vlmd_submission_tools/common/fields.json +++ /dev/null @@ -1,209 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-04/schema#", - "$id": "vlmd-fields", - "title": "HEAL Variable Level Metadata Fields", - "description": "Variable level metadata individual fields integrated into the variable level metadata object within the HEAL platform metadata service.\n", - "type": "object", - "required": [ - "name", - "description" - ], - "properties": { - "name": { - "type": "string", - "title": "Variable Name", - "description": "The name of a variable (i.e., field) as it appears in the data.\n" - }, - "title": { - "type": "string", - "title": "Variable Label (ie Title)", - "description": "The human-readable title of the variable." 
- }, - "description": { - "type": "string", - "title": "Variable Description", - "description": "An extended description of the variable.", - "examples": [ - "Definition", - "Question text (if a survey)" - ] - }, - "type": { - "type": "string", - "title": "Variable Type", - "description": "A classification allowing the user (analyst, researcher or computer) to know how to use the variable\n" - }, - "format": { - "type": "string", - "title": "Variable Format", - "description": "Indicates the format of the type specified in the `type` property. This may describe the type of unit (such as for time fields like year or month) or the format of a date field (such as %y%m%d).\n" - }, - "constraints": { - "type": "object", - "properties": { - "maxLength": { - "type": "integer", - "title": "Maximum Length", - "description": "Indicates the maximum length of an iterable (e.g., array, string, or object). For example, if 'Hello World' is the longest value of a categorical variable, this would be a maxLength of 11.\n" - }, - "enum": { - "type": "array", - "title": "Variable Possible Values", - "description": "Constrains possible values to a set of values.\n" - }, - "pattern": { - "type": "string", - "title": "Regular Expression Pattern", - "description": "A regular expression pattern the data MUST conform to.\n" - }, - "maximum": { - "type": "integer", - "title": "Maximum Value", - "description": "Specifies the maximum value of a field (e.g., maximum -- or most recent -- date, maximum integer etc). Note, this is different then maxLength property.\n" - }, - "minimum": { - "type": "integer", - "title": "Minimum Value", - "description": "Specifies the minimum value of a field (e.g., miniimum -- or oldest -- date, minimum integer etc).\n" - } - } - }, - "encoding": { - "type": "object", - "title": "Variable Value Encodings (i.e., mappings; value labels)", - "description": "Encodings (and mappings) allow categorical values to be stored as numerical values. IMPORTANT: the ==key should be the value represented IN the data== and the ==value should be the to-be-mapped label==. Many analytic software programs use numerical encodings and some algorithms only support numerical values. Additionally, this field provides a way to store categoricals that are stored as \"short\" labels (such as abbreviations)\n", - "examples": [ - "{0:'No',1:'Yes'}", - "{'HW':'Hello world','GBW':'Good bye world'}" - ] - }, - "ordered": { - "type": "boolean", - "title": "An ordered variable", - "description": "Indicates whether a categorical variable is ordered. This variable is relevant for variables that have an ordered relationship but not necessarily a numerical relationship (e.g., Strongly disagree < Disagree < Neutral < Agree).\n" - }, - "missingValues": { - "type": "array", - "title": "Missing Values", - "description": "A list of missing values specific to a variable." 
- }, - "trueValues": { - "type": "array", - "title": "Boolean True Value Labels", - "description": "For boolean (true) variable (as defined in type field), this field allows a physical string representation to be cast as true (increasing readability of the field)\n", - "items": { - "type": "string" - }, - "examples": [ - "Yes", - "1", - "True", - "true", - "Correct" - ] - }, - "falseValues": { - "type": "array", - "title": "Boolean False Value Labels", - "description": "For boolean (false) variable (as defined in type field), this field allows a physical string representation to be cast as false (increasing readability of the field)\n", - "items": { - "type": "string" - }, - "examples": [ - "No", - "0", - "False", - "false", - "Incorrect" - ] - }, - "repo_link": { - "type": "string", - "title": "Variable Repository Link", - "description": "A link to the variable as it exists on the home repository, if applicable\n" - }, - "cde_id": { - "type": "array", - "title": "Common Data Element Id", - "description": "The source and id for the NIH Common Data Elements program.", - "items": { - "type": "object", - "properties": { - "source": { - "type": "string" - }, - "id": { - "type": "string" - } - } - } - }, - "ontology_id": { - "type": "array", - "title": "Ontology ID", - "description": "Ontological information for the given variable as indicated by the source, id, and relation to the specified classification. One or more ontology classifications can be specified. \n", - "items": { - "type": "object", - "properties": { - "relation": { - "type": "string" - }, - "source": { - "type": "string" - }, - "id": { - "type": "string" - } - - } - } - }, - "univar_stats": { - "type": "object", - "properties": { - "median": { - "type": "number" - }, - "mean": { - "type": "number" - }, - "std": { - "type": "number" - }, - "min": { - "type": "number" - }, - "max": { - "type": "number" - }, - "mode": { - "type": "number" - }, - "count": { - "type": "integer", - "minimum": 0 - }, - "twenty_five_percentile": { - "type": "number" - }, - "seventy_five_percentile": { - "type": "number" - }, - "cat_marginals": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "count": { - "type": "integer" - } - } - } - } - } - } - } -} diff --git a/vlmd-submission-tools/vlmd_submission_tools/common/mapping_utils.py b/vlmd-submission-tools/vlmd_submission_tools/common/mapping_utils.py deleted file mode 100644 index 3b97e658..00000000 --- a/vlmd-submission-tools/vlmd_submission_tools/common/mapping_utils.py +++ /dev/null @@ -1,163 +0,0 @@ -''' -contains mappings (both lambda functions or column mappings) -''' -from vlmd_submission_tools.common import schemas - -# split array columns -def split_str_array(string,sep='|'): - if string: - return [s.strip() for s in string.split(sep)] - else: - return None - -# if object within array, assign to properties -def map_keys_vals(keys,vals): - ''' zips two lists of the same size as - a dictionary - ''' - return dict(zip(keys,vals)) - - -def split_and_map(string,prop): - ''' - splits nested stringified delimited lists - (delimiters being | for outer and = for inner) - and zips/maps each of the inner lists to a set - of values (right now keys of a dictionary) - TODO: rename function split_and_map_to_keys - TODO: generalize to more than keys - - ''' - if string: - keys = prop['items']['properties'].keys() - return [ - map_keys_vals(keys,split_str_array(x,sep='=')) - for x in split_str_array(string,sep='|') - ] - else: - return None - - -def 
loads_dict(string,item_sep='|',key_val_sep='='): - if string: - return dict([split_str_array(s,key_val_sep) - for s in split_str_array(string,item_sep)]) - - -def convert_rec_to_json(field): - ''' - converts a flattened dictionary to a nested dictionary - based on JSON path dot notation indicating nesting - ''' - # print(f"Working on field {field}") - field_json = {} - for prop_path,prop in field.items(): - if prop: - # initiate the prop to be added with the entire - # field - prop_json = field_json - # get the inner most dictionary item of the jsonpath - nested_names = prop_path.split('.') - for i,prop_name in enumerate(nested_names): - is_last_nested = i+1==len(nested_names) - if prop_json.get(prop_name) and not is_last_nested: - prop_json = prop_json[prop_name] - # if no object currently - elif not is_last_nested: - prop_json[prop_name] = {} - prop_json = prop_json[prop_name] - #assign property to inner most item - else: - prop_json[prop_name] = prop - - return field_json - - -def mapval(v,mapping): - v = str(v) - if v in mapping: - return mapping[v] - else: - return v - - -def to_bool(v): - if v.lower() in true_values: - return True - elif v.lower() in false_values: - return False - else: - return "" - - -typemap = { - #from bacpac - 'text':'string', - 'float':'number', - #from hemo - 'NUM':'number', - 'CHAR':'string' -} - - -formatmap = { - 'ISO8601':'' # NOTE: this is the default date format for frictionless so not necessary to specify -} - - -props = schemas.heal['data_dictionary']['properties'] - #mappings for array of dicts, arrays, and dicts - - -true_values = ["true","1","yes","required","y"] -false_values = ["false","0","no","not required","n"] - - -fieldmap = { - 'constraints.enum': lambda v: split_str_array(v), - # 'constraints.maximum':int, - # 'constraints.minimum':int, #TODO:need to add to schema - # 'constraints.maxLength':int, - 'cde_id': lambda v: split_and_map(v, props['cde_id']), - 'ontology_id': lambda v: split_and_map(v, props['ontology_id']), - 'encoding':lambda v: loads_dict(v), - 'format': lambda v: mapval(v,formatmap), - 'type':lambda v: mapval(v,typemap), - #'univar_stats.cat_marginals':lambda v: split_and_map(v, prop['univar_stats']['cat_marginals']), - 'missingValues':lambda v: split_str_array(v), - 'trueValues': lambda v: split_str_array(v), - 'falseValues':lambda v: split_str_array(v), - # 'constraints.required': lambda v: to_bool(v), - # TODO: add stats -} - - -# join mappings for json to csv - -def join_iter(iterable,sep_list="|"): - return sep_list.join([str(p) for p in iterable]) - - -def join_dictvals(dictionary:dict,sep:str): - return sep.join(dictionary.values()) - - -def join_dictitems(dictionary:dict,sep_keyval='=',sep_items='|'): - dict_list = [key+sep_keyval+val for key,val in dictionary.items()] - return sep_items.join(dict_list) - - -joinmap = { - 'constraints.enum': join_iter, - 'cde_id': join_dictvals, - 'ontology_id': join_dictvals, - 'encodings': join_dictitems, - 'missingValues':join_iter, - 'trueValues': join_iter, - 'falseValues':join_iter, - # TODO: add stats -} - - -def join_prop(propname,prop): - return joinmap[propname](prop) if propname in joinmap else prop diff --git a/vlmd-submission-tools/vlmd_submission_tools/common/schemas.py b/vlmd-submission-tools/vlmd_submission_tools/common/schemas.py deleted file mode 100644 index 50c1e393..00000000 --- a/vlmd-submission-tools/vlmd_submission_tools/common/schemas.py +++ /dev/null @@ -1,29 +0,0 @@ -from pathlib import Path -from frictionless import Schema -import json -import jsonschema - 
-# TODO: use data_dictionary.json -# TODO: output informative error messages in validation -# NOTE: would it be good to also have a frictionless CSV template with regexs?...may be easier to spot text errors? - -# can change to request.get(github) -with open('vlmd_submission_tools/common/fields.json') as f: - data = json.load(f) - -heal = { - 'data_dictionary': data -} - -schema = { - 'type':'object', - 'required':[ - 'title', - 'data_dictionary' - ], - 'properties':{ - 'title':{'type':'string'}, - 'description':{'type':'string'}, - 'data_dictionary':{'type':'array','items':heal['data_dictionary']} - } -} From df3181219145bba98b6aca268c2a394f591db700 Mon Sep 17 00:00:00 2001 From: George Thomas Date: Mon, 10 Jun 2024 14:28:19 -0700 Subject: [PATCH 2/8] (hp-1483): add new test templates --- .../templates/template_submission_invalid.csv | 8 ++++++++ .../templates/template_submission_invalid.json | 16 ++++++++++++++++ .../templates/template_submission_invalid.tsv | 7 +++++++ .../templates/template_submission_small.csv | 3 +++ .../templates/template_submission_small.json | 18 ++++++++++++++++++ .../templates/template_submission_small.tsv | 3 +++ 6 files changed, 55 insertions(+) create mode 100644 vlmd-submission-tools/tests/templates/template_submission_invalid.csv create mode 100644 vlmd-submission-tools/tests/templates/template_submission_invalid.json create mode 100644 vlmd-submission-tools/tests/templates/template_submission_invalid.tsv create mode 100644 vlmd-submission-tools/tests/templates/template_submission_small.csv create mode 100644 vlmd-submission-tools/tests/templates/template_submission_small.json create mode 100644 vlmd-submission-tools/tests/templates/template_submission_small.tsv diff --git a/vlmd-submission-tools/tests/templates/template_submission_invalid.csv b/vlmd-submission-tools/tests/templates/template_submission_invalid.csv new file mode 100644 index 00000000..436dc924 --- /dev/null +++ b/vlmd-submission-tools/tests/templates/template_submission_invalid.csv @@ -0,0 +1,8 @@ +name,description,type +participant_id,Unique identifier for participant,character +race,Self-reported race,integer +,,integer +hispanic,"Are you of Hispanic, Latino, or Spanish origin?",boolean +sex_at_birth,The self-reported sex of the participant/subject at birth, +SU4,During the past 30 days how many days did you use heroin (alone or mixed with other drugs)? 
] [Write 0 days if no use],integer +pulse_rate,Heart rate measured at systemic artery,number diff --git a/vlmd-submission-tools/tests/templates/template_submission_invalid.json b/vlmd-submission-tools/tests/templates/template_submission_invalid.json new file mode 100644 index 00000000..198cd029 --- /dev/null +++ b/vlmd-submission-tools/tests/templates/template_submission_invalid.json @@ -0,0 +1,16 @@ +{ + "title": "Example VLMD", + "description": "This is an example", + "fields": [ + { + "name": "participant_id", + "description": "Unique identifier for participant", + "type": "character" + }, + { + "name": "race", + "description": "Self-reported race", + "type": "integer" + } + ] +} diff --git a/vlmd-submission-tools/tests/templates/template_submission_invalid.tsv b/vlmd-submission-tools/tests/templates/template_submission_invalid.tsv new file mode 100644 index 00000000..1129cf95 --- /dev/null +++ b/vlmd-submission-tools/tests/templates/template_submission_invalid.tsv @@ -0,0 +1,7 @@ +name title description type +participant_id Participant Id Unique identifier for participant character +race Race Self-reported race integer +age Age What is your age? (age at enrollment) integer +hispanic "Hispanic Latino or Spanish Origin" "Are you of Hispanic Latino or Spanish origin?" boolean +sex_at_birth Sex at Birth The self-reported sex of the participant/subject at birth string +SU4 Heroin Days Used During the past 30 days how many days did you use heroin (alone or mixed with other drugs)? ] [Write 0 days if no use] integer diff --git a/vlmd-submission-tools/tests/templates/template_submission_small.csv b/vlmd-submission-tools/tests/templates/template_submission_small.csv new file mode 100644 index 00000000..370e1a41 --- /dev/null +++ b/vlmd-submission-tools/tests/templates/template_submission_small.csv @@ -0,0 +1,3 @@ +module,name,title,description,type,format, +,participant_id,Participant Id,Unique identifier for participant,string,, +,race,Race,Self-reported race,integer,, diff --git a/vlmd-submission-tools/tests/templates/template_submission_small.json b/vlmd-submission-tools/tests/templates/template_submission_small.json new file mode 100644 index 00000000..4a7d0430 --- /dev/null +++ b/vlmd-submission-tools/tests/templates/template_submission_small.json @@ -0,0 +1,18 @@ +{ + "title": "Example VLMD", + "description": "This is an example", + "fields": [ + { + "name": "participant_id", + "title": "Participant Id", + "description": "Unique identifier for participant", + "type": "string" + }, + { + "name": "race", + "title": "Race", + "description": "Self-reported race", + "type": "integer" + } + ] +} diff --git a/vlmd-submission-tools/tests/templates/template_submission_small.tsv b/vlmd-submission-tools/tests/templates/template_submission_small.tsv new file mode 100644 index 00000000..b47a98eb --- /dev/null +++ b/vlmd-submission-tools/tests/templates/template_submission_small.tsv @@ -0,0 +1,3 @@ +name title description type +participant_id Participant Id Unique identifier for participant string +race Race Self-reported race integer From a29e937d7bf9f259d6796d80c04df154f7be54af Mon Sep 17 00:00:00 2001 From: George Thomas Date: Mon, 10 Jun 2024 14:29:30 -0700 Subject: [PATCH 3/8] (HP-1483): update submission tools --- .../vlmd_submission_tools/common/utils.py | 4 +- .../read_and_validate_dictionary.py | 188 ++++++++++-------- .../subcommands/upload_dictionary_to_mds.py | 22 +- 3 files changed, 118 insertions(+), 96 deletions(-) diff --git a/vlmd-submission-tools/vlmd_submission_tools/common/utils.py 
b/vlmd-submission-tools/vlmd_submission_tools/common/utils.py index 794aecf8..62fb1280 100644 --- a/vlmd-submission-tools/vlmd_submission_tools/common/utils.py +++ b/vlmd-submission-tools/vlmd_submission_tools/common/utils.py @@ -59,9 +59,9 @@ def check_mds_study_id(study_id, hostname=config.HOST_NAME): if response.json().get("_guid_type") != "discovery_metadata": raise ValueError("Study ID is not dicovery metadata") - existing_data_dictionaries = response.json().get("data_dictionaries", {}) + existing_vlmd = response.json().get("variable_level_metadata", {}) - return existing_data_dictionaries + return existing_vlmd def get_client_token(hostname: str, client_id: str, client_secret: str): diff --git a/vlmd-submission-tools/vlmd_submission_tools/subcommands/read_and_validate_dictionary.py b/vlmd-submission-tools/vlmd_submission_tools/subcommands/read_and_validate_dictionary.py index 38bcb763..2e76b124 100644 --- a/vlmd-submission-tools/vlmd_submission_tools/subcommands/read_and_validate_dictionary.py +++ b/vlmd-submission-tools/vlmd_submission_tools/subcommands/read_and_validate_dictionary.py @@ -7,20 +7,14 @@ from argparse import ArgumentParser, Namespace import json -import jsonschema -import os import traceback from urllib.parse import unquote -from frictionless import Resource, FrictionlessException -import petl as etl import requests +from healdata_utils import validate_vlmd_csv, validate_vlmd_json +from healdata_utils.conversion import convert_to_vlmd from vlmd_submission_tools.common.logger import Logger -from vlmd_submission_tools.common import config -from vlmd_submission_tools.common import mapping_utils -from vlmd_submission_tools.common import utils -from vlmd_submission_tools.common import schemas from vlmd_submission_tools.subcommands import Subcommand @@ -76,9 +70,8 @@ def __get_description__(cls) -> str: """ return ( "Takes a presigned url and fetches the data dictionary. " - "Converts any csv/tsv to json and saves to local file system. " - "Validates the dictionary against the provided schema. " - "Writes JSON output with json_local_path and is_valid_dictionary." + "Validates the dictionary against the healdata-utils schema. " + "Writes JSON output with json_local_path and validation report. 
" ) @classmethod @@ -95,89 +88,107 @@ def main(cls, options: Namespace) -> None: file_type = cls._get_file_type_from_filename(options.file_name) json_local_path = options.json_local_path + local_path = None + is_valid_dictionary = None + errors_list = None - # pull in schema - schema = schemas.heal['data_dictionary'] - data_dictionary_props = schema['properties'] - data_dictionary = {"title": "dictionary title"} - mappings = mapping_utils.fieldmap + # download from url and save local copy + try: + local_path = cls._download_from_url(file_type, dictionary_url, json_local_path) + if local_path: + logger.info(f"Data dictionary saved in {local_path}") + + except Exception as e: + logger.error(f"Could not read dictionary from url {dictionary_url}") + logger.error(e) + logger.error(f"Exception type = {type(e)}") + return + + # get validation report with healdata-utils.validate_vlmd + logger.info(f"Getting validation report for {local_path}") + try: + if file_type == 'json': + result = validate_vlmd_json(local_path) + elif file_type == 'csv' or file_type == 'tsv': + result = validate_vlmd_csv(local_path) + validation_report = result.get('report') + + is_valid_dictionary = validation_report.get('valid') + errors_list = validation_report.get('errors') + except Exception as e: + logger.error(f"Error in validation: {e}") + + logger.info(f"Valid dictionary = {is_valid_dictionary}") + logger.info(f"Errors from validation report = {errors_list}") - logger.info(f"Fetching dictionary from s3 url.") + # convert csv to json for uploading to MDS. if file_type == 'csv' or file_type == 'tsv': - try: - source = Resource(dictionary_url) - source = source.to_petl() - - logger.info(f"Converting {file_type} file to json") - logger.info(f"Column names in petl: {source.fieldnames()}") - fields_to_add = [ - (field,'') - for field in mappings.keys() - if not field in source.fieldnames() - ] - template_tbl = ( - source - .addfields(fields_to_add) # add fields from mappings not in the csv template to allow convert fxns to work - .convert(mappings) - .convertnumbers() - .cut(source.fieldnames()) # want to include only fields in csv - ) - except FrictionlessException: - is_valid_dictionary = False - traceback.print_exc() - raise FrictionlessException(f"Frictionless could not read dictionary from url {dictionary_url}") - except: - is_valid_dictionary = False - traceback.print_exc() - raise Exception(f"Could not read dictionary from url {dictionary_url}") - - try: - data_dictionary['data_dictionary'] = [mapping_utils.convert_rec_to_json(rec) for rec in etl.dicts(template_tbl)] - except: - is_valid_dictionary = False - traceback.print_exc() - raise Exception(f"Could not convert {file_type} to json") - else: - # JSON format is read directly without conversion - try: - response = requests.get(dictionary_url) - data_dictionary_json = response.text - data_dictionary = json.loads(data_dictionary_json) - except: - is_valid_dictionary = False - traceback.print_exc() - raise Exception(f"Could not read dictionary from url {dictionary_url}") - - logger.info("Reading schema into schema_array") - schema_array = { - "$schema": "http://json-schema.org/draft-04/schema#", - "$id": "vlmd", - "title":"Variable Level Metadata (Data Dictionaries)", - "description": "This schema defines the variable level metadata for one data dictionary for a given study.Note a given study can have multiple data dictionaries", - "type": "array", - "items": schema + logger.info(f"Converting {file_type} to JSON") + props = { + "description": f"Json dictionary 
converted from {file_type}", + "title": "HEAL compliant variable level metadata dictionary" + } + + vlmd_dict = convert_to_vlmd( + input_filepath = local_path, + data_dictionary_props = props, + inputtype = "csv-data-dict", + ) + converted_json = vlmd_dict.get('jsontemplate') + + # logger.info(f"Converted JSON is valid dictionary = convertis_valid_dictionary}") + # logger.info(f"Errors from validation report = {errors_list}") + logger.info(f"Errors = {vlmd_dict.get('errors')}") + + with open(json_local_path, 'w', encoding='utf-8') as o: + json.dump(converted_json, o, ensure_ascii=False, indent=4) + logger.info(f"Converted JSON data dictionary saved in {json_local_path}") + + + report_json = { + "json_local_path": json_local_path, + "is_valid_dictionary": is_valid_dictionary, + "errors": errors_list } + # save the validation report artifact + with open(options.output, 'w', encoding='utf-8') as o: + json.dump(report_json, o, ensure_ascii=False, indent=4) + logger.info(f"Validation report saved in {options.output}") + - logger.info("Validating dictionary.") - is_valid_dictionary = True + @classmethod + def _download_from_url(cls, file_type: str, url: str, json_local_path: str) -> str: + """ + Sends a request to the url and saves data in the local_path + + Args: + file_type (str): 'csv', 'tsv', 'json' + url (str): the url for the data dictionary + json_local_path (str): the path to the local copy, eg, '/tmp/vlmd/dict.json' + + Returns: + path of saved contents, None if error in downloading. + """ + local_path = None try: - jsonschema.validate(data_dictionary['data_dictionary'],schema=schema_array) - except: - is_valid_dictionary = False - traceback.print_exc() - raise Exception("Not a valid dictionary") - logger.info(f"Valid={is_valid_dictionary}") - - # save the data dictionary - with open(json_local_path, 'w', encoding='utf-8') as o: - json.dump(data_dictionary, o, ensure_ascii=False, indent=4) - logger.info(f"JSON data dictionary saved in {json_local_path}") - - # save the json_local_path and is_valid_dictionary output parameters - record_json = {"json_local_path": json_local_path, "is_valid_dictionary": is_valid_dictionary} - with open(options.output, 'w', encoding='utf-8') as o: - json.dump(record_json, o, ensure_ascii=False, indent=4) - logger.info(f"JSON response saved in {options.output}") + response = requests.get(url) + data_dictionary = response.text + if file_type == 'json': + data_dictionary = response.text + data_dictionary = json.loads(data_dictionary) + with open(json_local_path, 'w', encoding='utf-8') as f: + json.dump(data_dictionary, f, ensure_ascii=False, indent=4) + return json_local_path + elif file_type == 'csv' or file_type == 'tsv': + data_dictionary = response.content + csv_local_path = json_local_path.replace('json', f"{file_type}") + with open(csv_local_path, 'wb') as f: + f.write(data_dictionary) + return csv_local_path + except Exception as exc: + raise(exc) + + return local_path @classmethod @@ -190,4 +201,5 @@ def _get_file_type_from_filename(cls, file_name: str): file_type = 'tsv' else: raise Exception("Could not get file type suffix from filename") + return file_type diff --git a/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py b/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py index a368c273..afe17453 100644 --- a/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py +++ b/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py @@ 
-89,9 +89,15 @@ def main(cls, options: Namespace) -> None: except: raise Exception("Could not read local json dictionary.") + # verify that the submitted study-id exists in mds db + # TODO: decide if we want to handle exception here logger.info(f"Checking for study ID {options.study_id} in MDS") - existing_data_dictionaries = utils.check_mds_study_id(options.study_id, config.HOST_NAME) + vlmd_for_study = utils.check_mds_study_id(options.study_id, config.HOST_NAME) + logger.info(f"Existing vlmd = {vlmd_for_study}") + # if empty then fill in required key: 'data_dictionaries' + if vlmd_for_study.get('data_dictionaries') == None: + vlmd_for_study['data_dictionaries'] = {} # test the client token - maybe put this in a try statement. # get token for mds api call @@ -107,7 +113,8 @@ def main(cls, options: Namespace) -> None: try: guid = str(uuid.uuid4()) data = { "_guid_type": "data_dictionary", - "data_dictionary": data_dictionary['data_dictionary']} + "title": options.dictionary_name, + "data_dictionary": data_dictionary} url = f"https://{config.HOST_NAME}/mds/metadata/{guid}" headers = {"Authorization": "bearer " + token, "content-type": "application/json"} response = requests.post(url, headers=headers, json=data) @@ -122,14 +129,17 @@ def main(cls, options: Namespace) -> None: if response.status_code != 200 and response.status_code != 201: logger.error("Error in uploading dictionary to MDS") - # add this name and guid to the study ID metadata + # add this name and guid to the study ID variable level metadata logger.info(f"Adding dictionary_name '{options.dictionary_name}' to study ID = {options.study_id}") try: - existing_data_dictionaries[options.dictionary_name] = f"{guid}" - data = {"data_dictionaries": existing_data_dictionaries} + vlmd_for_study['data_dictionaries'][options.dictionary_name] = f"{guid}" + json_data = { + "variable_level_metadata": vlmd_for_study + } + # data = {"data_dictionaries": existing_data_dictionaries} url = f"https://{config.HOST_NAME}/mds/metadata/{options.study_id}?merge=True" - response = requests.put(url, headers=headers, json=data) + response = requests.put(url, headers=headers, json=json_data) response.raise_for_status() logger.info("Success") except: From 1a0f702565fcad0f7c58700ba8653e14cafbc9a5 Mon Sep 17 00:00:00 2001 From: George Thomas Date: Mon, 10 Jun 2024 14:48:05 -0700 Subject: [PATCH 4/8] (HP-1483): update tests --- .secrets.baseline | 20 +- .../tests/test_common_utils.py | 8 +- ...subcommand_read_and_validate_dictionary.py | 439 ++++++++++++++---- ...est_subcommand_upload_dictionary_to_mds.py | 11 +- 4 files changed, 368 insertions(+), 110 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index 72559e6f..38d485e5 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -1,9 +1,9 @@ { "exclude": { - "files": "^.secrets.baseline$", + "files": null, "lines": null }, - "generated_at": "2023-09-28T19:27:35Z", + "generated_at": "2024-06-10T21:46:21Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -442,14 +442,6 @@ "type": "Base64 High Entropy String" } ], - "azlinux-jupyter-scipy/start.sh": [ - { - "hashed_secret": "f98370d81077aed0aa3500bfc8de3f3e1dac52f6", - "is_verified": false, - "line_number": 133, - "type": "Secret Keyword" - } - ], "jupyter-geo/start.sh": [ { "hashed_secret": "f98370d81077aed0aa3500bfc8de3f3e1dac52f6", @@ -468,9 +460,9 @@ ], "vlmd-submission-tools/poetry.lock": [ { - "hashed_secret": "5b240644452ed40dfe194673b7db6b641971c720", + "hashed_secret": "e1df343623dcc5d44e3a7da0e09ee4b0c980b52f", "is_verified": false, - 
"line_number": 1221, + "line_number": 1519, "type": "Hex High Entropy String" } ], @@ -478,7 +470,7 @@ { "hashed_secret": "8318df9ecda039deac9868adf1944a29a95c7114", "is_verified": false, - "line_number": 100, + "line_number": 102, "type": "Secret Keyword" } ], @@ -494,7 +486,7 @@ { "hashed_secret": "8318df9ecda039deac9868adf1944a29a95c7114", "is_verified": false, - "line_number": 129, + "line_number": 134, "type": "Secret Keyword" } ] diff --git a/vlmd-submission-tools/tests/test_common_utils.py b/vlmd-submission-tools/tests/test_common_utils.py index 80adc927..bbb5ff3f 100644 --- a/vlmd-submission-tools/tests/test_common_utils.py +++ b/vlmd-submission-tools/tests/test_common_utils.py @@ -43,14 +43,16 @@ def test_check_mds_study_id(self, mocked_post): hostname = "mycommons.planx-pla.net" study_id = "my_study_id" expected_data_dictionaries = { - "my first dictionary": "guid1", - "my second dictionary": "guid2" + "data_dictionaries" : { + "my first dictionary": "guid1", + "my second dictionary": "guid2" + } } mock_mds_response = MagicMock(requests.Response) mock_mds_response.status_code = 200 mock_mds_response.json.return_value = { "_guid_type": "discovery_metadata", - "data_dictionaries": expected_data_dictionaries + "variable_level_metadata": expected_data_dictionaries } mocked_post.return_value = mock_mds_response diff --git a/vlmd-submission-tools/tests/test_subcommand_read_and_validate_dictionary.py b/vlmd-submission-tools/tests/test_subcommand_read_and_validate_dictionary.py index 4cdb24bf..2c39f4c2 100644 --- a/vlmd-submission-tools/tests/test_subcommand_read_and_validate_dictionary.py +++ b/vlmd-submission-tools/tests/test_subcommand_read_and_validate_dictionary.py @@ -1,20 +1,22 @@ """Tests for the ``vlmd_submission_tools.subcommands.ReadAndValidateDictionary`` subcommand""" import os +import re from typing import NamedTuple +from unittest import mock from unittest.mock import MagicMock, patch -from frictionless import FrictionlessException import json from parameterized import parameterized from pathlib import Path import pytest import requests - - +import requests_mock from vlmd_submission_tools.subcommands import ReadAndValidateDictionary from utils import cleanup_files +DIR = Path(__file__).resolve().parent + class MockArgs(NamedTuple): file_name: str json_local_path: str @@ -22,15 +24,60 @@ class MockArgs(NamedTuple): output: str -class TestReadAndValidateDictionarySubcommand: +@pytest.fixture(scope="session") +def download_dir(tmpdir_factory): + path = tmpdir_factory.mktemp("vlmd_download_dir") + return path + + +@pytest.fixture +def template_submission_json(): + with open(Path(DIR, "templates/template_submission_small.json")) as f: + return json.load(f) + + +@pytest.fixture +def template_submission_invalid_json(): + with open(Path(DIR, "templates/template_submission_invalid.json")) as f: + return json.load(f) + + +@pytest.fixture +def template_submission_csv(): + with open(Path(DIR, "templates/template_submission_small.csv")) as f: + # return re.escape(f.read()) + # return bytes(f.read(), 'utf-8') + return f.read() + + +@pytest.fixture +def template_submission_invalid_csv(): + with open(Path(DIR, "templates/template_submission_invalid.csv")) as f: + return f.read() - def get_mock_args(self, file_name, dictionary_url): +@pytest.fixture +def template_submission_tsv(): + with open(Path(DIR, "templates/template_submission_small.tsv")) as f: + # return re.escape(f.read()) + # return bytes(f.read(), 'utf-8') + return f.read() + + +@pytest.fixture +def 
template_submission_invalid_tsv(): + with open(Path(DIR, "templates/template_submission_invalid.tsv")) as f: + return f.read() + + +class TestReadAndValidateDictionarySubcommand: + + def get_mock_args(self, file_name, json_local_path, dictionary_url, output): return MockArgs( file_name=file_name, - json_local_path="test_dictionary.json", + json_local_path=json_local_path, dictionary_url=dictionary_url, - output="validate.json", + output=output, ) @@ -49,114 +96,326 @@ def test_get_file_type_from_filename_exception(self): ReadAndValidateDictionary._get_file_type_from_filename(bad_filename) - @parameterized.expand(["csv", "tsv"]) - def test_read_and_validate_dictionary_csv(self, suffix): - # read valid csv/tsv dictionaries directly from file - args = self.get_mock_args(f"template_submission.{suffix}",f"tests/templates/template_submission.{suffix}") - expected_json = { + def test_download_from_url( + self, + template_submission_json, + template_submission_csv, + template_submission_tsv, + download_dir + ): + + url = "https://some.url" + json_local_path = f"{download_dir}/test_dict.json" + with requests_mock.Mocker() as m: + + file_type = "json" + # good data from url + m.get(url, json=template_submission_json) + result = ReadAndValidateDictionary._download_from_url( + file_type, url, json_local_path + ) + assert result == json_local_path + assert os.path.exists(json_local_path) + # TODO: read file and assert that is equal to mock_data + with open(json_local_path, 'r') as f: + downloaded_json = json.load(f) + assert downloaded_json == template_submission_json + + file_type = "csv" + expected_local_path = json_local_path.replace('json', f"{file_type}") + m.get( + url, + content = bytes(template_submission_csv, 'utf-8') + ) + + result = ReadAndValidateDictionary._download_from_url( + file_type, url, json_local_path + ) + assert result == expected_local_path + assert os.path.exists(expected_local_path) + with open(expected_local_path, 'r') as f: + downloaded_csv = f.read() + assert downloaded_csv == template_submission_csv + + file_type = "tsv" + expected_local_path = json_local_path.replace('json', f"{file_type}") + m.get( + url, + content = bytes(template_submission_tsv, 'utf-8') + ) + + result = ReadAndValidateDictionary._download_from_url( + file_type, url, json_local_path + ) + assert result == expected_local_path + assert os.path.exists(expected_local_path) + with open(expected_local_path, 'r') as f: + downloaded_tsv = f.read() + assert downloaded_tsv == template_submission_tsv + + + def test_download_from_url_failures(self, download_dir): + + file_type = "json" + url = "https://some.url" + json_local_path = f"{download_dir}/test_dict.json" + csv_local_path = json_local_path.replace('json', 'csv') + if os.path.exists(json_local_path): + Path(json_local_path).unlink() + + with requests_mock.Mocker() as m: + # bad url - request throws exception + output_path = None + m.get(url, exc=requests.HTTPError('Mocked HTTP Error')) + expected_error=f"Mocked HTTP Error" + with pytest.raises(Exception, match=expected_error): + output_path = ReadAndValidateDictionary._download_from_url( + file_type, url, json_local_path + ) + assert output_path == None + assert os.path.exists(json_local_path) == False + + # have a good url but a bad json_local_path + output_path = None + bad_local_path = "/does/not/exist.json" + csv_local_path = bad_local_path.replace('json', 'csv') + mock_data = {"title": "test json data"} + m.get(url, json=mock_data) + expected_error = re.escape(f"[Errno 2] No such file or directory: 
'{bad_local_path}'")
+            with pytest.raises(Exception, match=expected_error):
+                output_path = ReadAndValidateDictionary._download_from_url(
+                    file_type, url, bad_local_path
+                )
+            # no output file, no converted json, no original csv
+            assert output_path == None
+            assert os.path.exists(json_local_path) == False
+            assert os.path.exists(csv_local_path) == False
+
+
+    def test_read_and_validate_dictionary_json(self, template_submission_json, download_dir):
+        # read valid json dictionary
+        json_file_name = "template_submission_small.json"
+        path_to_input_dict = f"tests/templates/{json_file_name}"
+        args = self.get_mock_args(
+            file_name=json_file_name,
+            json_local_path=f"{download_dir}/test_dictionary.json",
+            dictionary_url="https://some.url",
+            output=f"{download_dir}/validate_artifact.json",
+        )
+        expected_validation_report = {
             "json_local_path": args.json_local_path,
-            "is_valid_dictionary": True
+            "is_valid_dictionary": True,
+            "errors": []
         }
+
         try:
-            ReadAndValidateDictionary.main(options=args)
-
-            # The converted json dictionary
-            assert Path(args.json_local_path).resolve().is_file()
-            with open(args.json_local_path, 'r') as fh:
-                converted_json = json.load(fh)
-            assert "title" in converted_json
-            assert "data_dictionary" in converted_json
-
-            # The output json for subcommand
-            with open(args.output, 'r') as fh:
-                result_json = json.load(fh)
-            assert json.dumps(result_json) == json.dumps(expected_json)
+            with requests_mock.Mocker() as m:
+                m.get(
+                    args.dictionary_url,
+                    text=json.dumps(template_submission_json)
+                )
+
+                ReadAndValidateDictionary.main(options=args)
+
+                # downloaded json dict is saved in 'json_local_path'.
+                assert Path(args.json_local_path).resolve().is_file()
+                with open(args.json_local_path, 'r') as fh:
+                    downloaded_json = json.load(fh)
+                assert "fields" in downloaded_json
+                assert downloaded_json == template_submission_json
+
+                # The output validation report json for subcommand
+                with open(args.output, 'r') as fh:
+                    validation_report = json.load(fh)
+                assert validation_report == expected_validation_report
 
         finally:
             cleanup_files([args.json_local_path, args.output])
 
 
-    @parameterized.expand(["csv"])
-    def test_read_and_validate_dictionary_csv_invalid_dictionary(self, suffix):
-        # read valid csv/tsv dictionaries directly from file
-        args = self.get_mock_args(f"template_submission_bad_format.{suffix}",f"tests/templates/template_submission_bad_format.{suffix}")
+    def test_read_and_validate_dictionary_bad_url(self, download_dir):
+        args = self.get_mock_args(
+            file_name="some_template.json",
+            json_local_path=f"{download_dir}/test_dictionary.json",
+            dictionary_url="https://some.url",
+            output=f"{download_dir}/validate_artifact.json",
+        )
+        with requests_mock.Mocker() as m:
+            m.get(
+                args.dictionary_url,
+                text="404 file not found",
+                status_code = 404
+            )
 
-        # Exception from bad input file
-        expected_error="Not a valid dictionary"
-        with pytest.raises(Exception, match=expected_error):
             ReadAndValidateDictionary.main(args)
-        assert os.path.exists(args.output) == False
+            assert os.path.exists(args.json_local_path) == False
+            assert os.path.exists(args.output) == False
 
 
-    @parameterized.expand(["csv", "tsv"])
-    def test_read_and_validate_dictionary_csv_does_not_exist(self, suffix):
-        # read valid csv/tsv dictionaries directly from file
-        args = self.get_mock_args(f"dict_does_not_exist.{suffix}",f"tests/templates/dict_does_not_exist.{suffix}")
+    def test_read_and_validate_dictionary_bad_local_path(
+        self, template_submission_json, download_dir
+    ):
+        args = self.get_mock_args(
+            
file_name="some_template.json", + json_local_path="/does/not/exist", + dictionary_url="https://some.url", + output=f"{download_dir}/validate_artifact.json", + ) + with requests_mock.Mocker() as m: + m.get( + args.dictionary_url, + text=json.dumps(template_submission_json) + ) - try: - # Exception from bad input file - expected_error=f"Frictionless could not read dictionary from url {args.dictionary_url}" - with pytest.raises(FrictionlessException, match=expected_error): - ReadAndValidateDictionary.main(args) + ReadAndValidateDictionary.main(args) assert os.path.exists(args.json_local_path) == False assert os.path.exists(args.output) == False - finally: - cleanup_files([args.json_local_path, args.output]) - - # JSON test will need mock of requests.get so that is reads the file from disk. - @patch('requests.get') - def test_read_and_validate_dictionary_json(self, mocked_request): - # read valid json dictionary - json_file_name = "template_submission_minimal.json" - path_to_input_dict = f"tests/templates/{json_file_name}" - args = self.get_mock_args(json_file_name,path_to_input_dict) - expected_json = { + def test_read_and_validate_dictionary_json_invalid_dictionary( + self, template_submission_invalid_json, download_dir + ): + args = self.get_mock_args( + file_name=f"template_submission_invalid.json", + json_local_path=f"{download_dir}/test_dict.json", + dictionary_url="https://some.url", + output=f"{download_dir}/validate_artifact.json", + ) + expected_validation_report = { "json_local_path": args.json_local_path, - "is_valid_dictionary": True + "is_valid_dictionary": False, + "errors": [{ + 'json_path': '$.fields[0].type', + 'message': "'character' is not one of ['number', 'integer', 'string', 'any', 'boolean', 'date', 'datetime', 'time', 'year', 'yearmonth', 'duration', 'geopoint']" + }] } - # mock pre-signed url response by reading from local test file - with open(path_to_input_dict, 'r') as fh: - input_dict_json = json.load(fh) - mocked_request.return_value.text = json.dumps(input_dict_json) try: - ReadAndValidateDictionary.main(options=args) - - # when input dict is json then converted json is the same. 
- assert Path(args.json_local_path).resolve().is_file() - with open(args.json_local_path, 'r') as fh: - converted_json = json.load(fh) - assert "title" in converted_json - assert "data_dictionary" in converted_json - assert json.dumps(converted_json) == json.dumps(input_dict_json) - - # The output json for subcommand - with open(args.output, 'r') as fh: - result_json = json.load(fh) - assert json.dumps(result_json) == json.dumps(expected_json) + with requests_mock.Mocker() as m: + m.get( + args.dictionary_url, + text=json.dumps(template_submission_invalid_json) + ) + ReadAndValidateDictionary.main(options=args) + # we should have our downloaded json file + assert Path(args.json_local_path).resolve().is_file() + # the validation report should show errors + assert Path(args.output).resolve().is_file() + with open(args.output, 'r') as fh: + validation_report = json.load(fh) + assert validation_report == expected_validation_report finally: cleanup_files([args.json_local_path, args.output]) - @patch('requests.get') - def test_read_and_validate_dictionary_json_does_not_exist(self, mocked_request): - # read valid csv/tsv dictionaries directly from file - args = self.get_mock_args(f"dict_does_not_exist.json",f"https://tests/templates/dict_does_not_exist.json") + @pytest.mark.parametrize( + "suffix", + ["csv", "tsv"] + ) + def test_read_and_validate_dictionary_csv( + self, + template_submission_csv, + template_submission_tsv, + template_submission_json, + download_dir, + suffix + ): + + args = self.get_mock_args( + file_name=f"template_submission.{suffix}", + json_local_path=f"{download_dir}/test_dictionary.json", + dictionary_url="https://some.url", + output=f"{download_dir}/validate_artifact.json", + ) + expected_validation_report = { + "json_local_path": args.json_local_path, + "is_valid_dictionary": True, + "errors": [] + } + expected_converted_json = template_submission_json + try: + with requests_mock.Mocker() as m: + + if suffix == 'csv': + m.get( + args.dictionary_url, + content = bytes(template_submission_csv, 'utf-8') + ) + elif suffix == 'tsv': + m.get( + args.dictionary_url, + content = bytes(template_submission_tsv, 'utf-8') + ) + + ReadAndValidateDictionary.main(options=args) + + # we should have a file for unconverted data + expected_local_path = args.json_local_path.replace('json', f'{suffix}') + assert Path(expected_local_path).resolve().is_file() + # we should have a converted json dictionary + assert Path(args.json_local_path).resolve().is_file() + with open(args.json_local_path, 'r') as f: + converted_json = json.load(f) + assert converted_json.get('fields') == expected_converted_json.get('fields') + # output validation report json artifact + with open(args.output, 'r') as f: + validation_report = json.load(f) + assert validation_report == expected_validation_report - # pre-signed url returns 404 - mocked_response = MagicMock(requests.Response) - mocked_response.status_code = 404 - mocked_response.json.return_value = { - "error": "no record found", + finally: + cleanup_files([expected_local_path, args.json_local_path, args.output]) + + + @pytest.mark.parametrize( + "suffix", + ["csv", "tsv"] + ) + def test_read_and_validate_dictionary_csv_invalid_dictionary( + self, + template_submission_invalid_csv, + template_submission_invalid_tsv, + download_dir, + suffix + ): + args = self.get_mock_args( + file_name=f"template_submission_invalid.{suffix}", + json_local_path=f"{download_dir}/test_dict.json", + dictionary_url="https://some.url", + 
output=f"{download_dir}/validate_artifact.json", + ) + expected_validation_report = { + "json_local_path": args.json_local_path, + "is_valid_dictionary": False, + "errors": [{ + 'json_path': '$[0].type', + 'message': "'character' is not valid under any of the given schemas" + }] } - mocked_request.return_value = mocked_response - mocked_request.return_value.text = "404 file not found" - # Exception from bad input file - expected_error=f"Could not read dictionary from url {args.dictionary_url}" - with pytest.raises(Exception, match=expected_error): - ReadAndValidateDictionary.main(args) - assert os.path.exists(args.json_local_path) == False - assert os.path.exists(args.output) == False + try: + with requests_mock.Mocker() as m: + if suffix == 'csv': + m.get( + args.dictionary_url, + content=bytes(template_submission_invalid_csv, 'utf-8') + ) + elif suffix == 'tsv': + m.get( + args.dictionary_url, + content=bytes(template_submission_invalid_tsv, 'utf-8') + ) + + ReadAndValidateDictionary.main(options=args) + + # csv file should have been downloaded + csv_local_path = args.json_local_path.replace('json',f'{suffix}') + assert Path(csv_local_path).resolve().is_file() + assert Path(args.json_local_path).resolve().is_file() + # validation report should show errors + assert Path(args.output).resolve().is_file() + with open(args.output, 'r') as fh: + validation_report = json.load(fh) + assert validation_report == expected_validation_report + + finally: + cleanup_files([csv_local_path, args.json_local_path, args.output]) diff --git a/vlmd-submission-tools/tests/test_subcommand_upload_dictionary_to_mds.py b/vlmd-submission-tools/tests/test_subcommand_upload_dictionary_to_mds.py index 52d3528b..b767afb9 100644 --- a/vlmd-submission-tools/tests/test_subcommand_upload_dictionary_to_mds.py +++ b/vlmd-submission-tools/tests/test_subcommand_upload_dictionary_to_mds.py @@ -39,8 +39,10 @@ def test_upload_dictionary_to_mds(self, mocked_mds_put, mocked_mds_post, mocked_ args = self.get_mock_args() existing_data_dictionaries = { - "CVS baseline": "guid-1", - "JSON followup": "guid-2" + "data_dictionaries": { + "CVS baseline": "guid-1", + "JSON followup": "guid-2" + } } mocked_check_mds.return_value = existing_data_dictionaries @@ -54,6 +56,7 @@ def test_upload_dictionary_to_mds(self, mocked_mds_put, mocked_mds_post, mocked_ mocked_post_response.status_code = 200 mocked_post_response.json.return_value = { "_guid_type": "data_dictionary", + "title": args.dictionary_name, "data_dictionary": json_dictionary } mocked_mds_post.return_value = mocked_post_response @@ -66,7 +69,9 @@ def test_upload_dictionary_to_mds(self, mocked_mds_put, mocked_mds_post, mocked_ new_metadata = { "_guid_type": "discovery_metadata", "gen3_discovery": "discovery_metadata", - "data_dictionaries": new_data_dictionaries + "variable_level_metadata": { + "data_dictionaries": new_data_dictionaries + } } mocked_put_response = MagicMock(requests.Response) mocked_put_response.status_code = 200 From 46357b74b1d32f45d4306467c5f1eb72147fb682 Mon Sep 17 00:00:00 2001 From: George Thomas Date: Fri, 14 Jun 2024 15:07:20 -0700 Subject: [PATCH 5/8] (HP-1483): update secrets.baseline --- .secrets.baseline | 4 ++-- .../test_subcommand_upload_dictionary_to_mds.py | 12 +++++++++++- .../subcommands/upload_dictionary_to_mds.py | 15 +++++++++++++++ 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index 38d485e5..0f6a468a 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -3,7 +3,7 @@ "files": null, 
"lines": null }, - "generated_at": "2024-06-10T21:46:21Z", + "generated_at": "2024-06-14T21:59:54Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -486,7 +486,7 @@ { "hashed_secret": "8318df9ecda039deac9868adf1944a29a95c7114", "is_verified": false, - "line_number": 134, + "line_number": 136, "type": "Secret Keyword" } ] diff --git a/vlmd-submission-tools/tests/test_subcommand_upload_dictionary_to_mds.py b/vlmd-submission-tools/tests/test_subcommand_upload_dictionary_to_mds.py index b767afb9..d995aec9 100644 --- a/vlmd-submission-tools/tests/test_subcommand_upload_dictionary_to_mds.py +++ b/vlmd-submission-tools/tests/test_subcommand_upload_dictionary_to_mds.py @@ -15,16 +15,18 @@ class MockArgs(NamedTuple): json_local_path: str dictionary_name: str + is_valid_dictionary: str study_id: str output: str class TestGetDictionaryUrlSubcommand: - def get_mock_args(self): + def get_mock_args(self,is_valid_dictionary='True'): return MockArgs( json_local_path="tests/templates/template_submission_minimal.json", dictionary_name="Minimal_json_dict", + is_valid_dictionary=is_valid_dictionary, study_id="my_study_id", output="upload_output.json", ) @@ -151,3 +153,11 @@ def test_upload_dictionary_to_mds_failed_update(self, mocked_mds_put, mocked_mds with pytest.raises(Exception, match=expected_error): UploadDictionaryToMds.main(options=args) assert os.path.exists(args.output) == False + + + def test_upload_dictionary_to_mds_invalid_dict(self): + + args = self.get_mock_args(is_valid_dictionary='False') + + UploadDictionaryToMds.main(options=args) + assert os.path.exists(args.output) == False diff --git a/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py b/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py index afe17453..11b2562c 100644 --- a/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py +++ b/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py @@ -41,6 +41,17 @@ def __add_arguments__(cls, parser: ArgumentParser) -> None: ), ) + parser.add_argument( + "-v", + "--is_valid_dictionary", + required=False, + type=str, + default="True", + help=( + "Skip the upload if not True" + ), + ) + parser.add_argument( "-s", "--study_id", @@ -81,6 +92,10 @@ def main(cls, options: Namespace) -> None: logger = Logger.get_logger(cls.__tool_name__()) logger.info(cls.__get_description__()) + if options.is_valid_dictionary.lower() != 'true': + logger.info("Skipping MDS upload. 
Dictionary is not valid.") + return + # Read json dictionary from local path logger.info("Reading dictionary from local file system.") try: From c1825e43216903b8272c6f2069edc44d02ef4a54 Mon Sep 17 00:00:00 2001 From: George Thomas Date: Fri, 14 Jun 2024 17:36:01 -0700 Subject: [PATCH 6/8] (HP-1483): write output artifact when skipping upload --- .../tests/test_subcommand_upload_dictionary_to_mds.py | 10 +++++++++- .../subcommands/upload_dictionary_to_mds.py | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/vlmd-submission-tools/tests/test_subcommand_upload_dictionary_to_mds.py b/vlmd-submission-tools/tests/test_subcommand_upload_dictionary_to_mds.py index d995aec9..dbdbfbc3 100644 --- a/vlmd-submission-tools/tests/test_subcommand_upload_dictionary_to_mds.py +++ b/vlmd-submission-tools/tests/test_subcommand_upload_dictionary_to_mds.py @@ -158,6 +158,14 @@ def test_upload_dictionary_to_mds_failed_update(self, mocked_mds_put, mocked_mds def test_upload_dictionary_to_mds_invalid_dict(self): args = self.get_mock_args(is_valid_dictionary='False') + expected_output = { + "upload_status": None, + "dictionary_name": args.dictionary_name, + "mds_guid": None + } UploadDictionaryToMds.main(options=args) - assert os.path.exists(args.output) == False + assert os.path.exists(args.output) + with open(args.output, 'r') as fh: + result_json = json.load(fh) + assert result_json == expected_output diff --git a/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py b/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py index 11b2562c..d826c48e 100644 --- a/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py +++ b/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py @@ -94,6 +94,15 @@ def main(cls, options: Namespace) -> None: if options.is_valid_dictionary.lower() != 'true': logger.info("Skipping MDS upload. 
Dictionary is not valid.") + # save the upload_status, dictionary_name and MDS guid output parameters + record_json = { + "upload_status": None, + "dictionary_name": options.dictionary_name, + "mds_guid": None + } + with open(options.output, 'w', encoding='utf-8') as o: + json.dump(record_json, o, ensure_ascii=False, indent=4) + logger.info(f"JSON response saved in {options.output}") return # Read json dictionary from local path @@ -104,7 +113,6 @@ def main(cls, options: Namespace) -> None: except: raise Exception("Could not read local json dictionary.") - # verify that the submitted study-id exists in mds db # TODO: decide if we want to handle exception here logger.info(f"Checking for study ID {options.study_id} in MDS") From 144a9169fb596ad64802da281fafe941fbe03572 Mon Sep 17 00:00:00 2001 From: George Thomas Date: Fri, 14 Jun 2024 17:38:35 -0700 Subject: [PATCH 7/8] (HP-1483): remove comment line --- .../subcommands/upload_dictionary_to_mds.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py b/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py index d826c48e..e389415d 100644 --- a/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py +++ b/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py @@ -114,7 +114,6 @@ def main(cls, options: Namespace) -> None: raise Exception("Could not read local json dictionary.") # verify that the submitted study-id exists in mds db - # TODO: decide if we want to handle exception here logger.info(f"Checking for study ID {options.study_id} in MDS") vlmd_for_study = utils.check_mds_study_id(options.study_id, config.HOST_NAME) logger.info(f"Existing vlmd = {vlmd_for_study}") From dcc2faf5234698ee9cf5b73c3ca691f27345e341 Mon Sep 17 00:00:00 2001 From: George Thomas Date: Mon, 17 Jun 2024 15:03:39 -0700 Subject: [PATCH 8/8] (HP-1483): update secrets.baseline --- .secrets.baseline | 4 +- ...est_subcommand_upload_dictionary_to_mds.py | 57 ++++++++++++++----- .../subcommands/upload_dictionary_to_mds.py | 1 - 3 files changed, 44 insertions(+), 18 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index 0f6a468a..244d80ac 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -3,7 +3,7 @@ "files": null, "lines": null }, - "generated_at": "2024-06-14T21:59:54Z", + "generated_at": "2024-06-17T22:02:51Z", "plugins_used": [ { "name": "AWSKeyDetector" @@ -486,7 +486,7 @@ { "hashed_secret": "8318df9ecda039deac9868adf1944a29a95c7114", "is_verified": false, - "line_number": 136, + "line_number": 163, "type": "Secret Keyword" } ] diff --git a/vlmd-submission-tools/tests/test_subcommand_upload_dictionary_to_mds.py b/vlmd-submission-tools/tests/test_subcommand_upload_dictionary_to_mds.py index dbdbfbc3..e9d8c12c 100644 --- a/vlmd-submission-tools/tests/test_subcommand_upload_dictionary_to_mds.py +++ b/vlmd-submission-tools/tests/test_subcommand_upload_dictionary_to_mds.py @@ -9,6 +9,7 @@ import uuid from utils import cleanup_files +from vlmd_submission_tools.common import config from vlmd_submission_tools.subcommands import UploadDictionaryToMds @@ -31,22 +32,31 @@ def get_mock_args(self,is_valid_dictionary='True'): output="upload_output.json", ) - @patch('vlmd_submission_tools.common.utils.check_mds_study_id') @patch('vlmd_submission_tools.common.utils.get_client_secret') @patch('vlmd_submission_tools.common.utils.get_client_token') @patch('requests.post') @patch('requests.put') - def 
test_upload_dictionary_to_mds(self, mocked_mds_put, mocked_mds_post, mocked_client_token, mocked_client_secret, mocked_check_mds): + def test_upload_dictionary_to_mds( + self, + mocked_mds_put, + mocked_mds_post, + mocked_client_token, + mocked_client_secret, + mocked_check_mds, + ): args = self.get_mock_args() - existing_data_dictionaries = { + + mock_new_guid = "bbf91e87-837d-4f36-88b8-96e83bd77e9a" + vlmd_for_study = { "data_dictionaries": { "CVS baseline": "guid-1", "JSON followup": "guid-2" - } + }, + "common_data_elements" : {"foo": "bar"} } - mocked_check_mds.return_value = existing_data_dictionaries + mocked_check_mds.return_value = vlmd_for_study mocked_client_secret.return_value = {"client_id": "client_id", "client_secret": "client_secret"} mocked_client_token.return_value = "my_client_token" @@ -56,24 +66,20 @@ def test_upload_dictionary_to_mds(self, mocked_mds_put, mocked_mds_post, mocked_ json_dictionary = json.load(fh) mocked_post_response = MagicMock(requests.Response) mocked_post_response.status_code = 200 - mocked_post_response.json.return_value = { + post_request_data = { "_guid_type": "data_dictionary", "title": args.dictionary_name, "data_dictionary": json_dictionary } + mocked_post_response.json.return_value = post_request_data mocked_mds_post.return_value = mocked_post_response - new_data_dictionaries = { - "CVS baseline": "guid-1", - "JSON followup": "guid-2", - args.dictionary_name: "guid-3" - } + # use a mock guid for a placeholder for the mock response + vlmd_for_study['data_dictionaries'][args.dictionary_name] = mock_new_guid new_metadata = { "_guid_type": "discovery_metadata", "gen3_discovery": "discovery_metadata", - "variable_level_metadata": { - "data_dictionaries": new_data_dictionaries - } + "variable_level_metadata": vlmd_for_study, } mocked_put_response = MagicMock(requests.Response) mocked_put_response.status_code = 200 @@ -82,15 +88,36 @@ def test_upload_dictionary_to_mds(self, mocked_mds_put, mocked_mds_post, mocked_ try: UploadDictionaryToMds.main(options=args) + + # output artifact should have the dictionary name and guid with open(args.output, 'r') as fh: result_json = json.load(fh) assert result_json.get("upload_status") == "ok" assert result_json.get("dictionary_name") == args.dictionary_name + new_guid = result_json.get("mds_guid") try: - uuid.UUID(result_json.get("mds_guid")) + uuid.UUID(new_guid) assert True except ValueError: assert False + + # check the post request with the data dictionary + mocked_mds_post.assert_called_with( + f'https://{config.HOST_NAME}/mds/metadata/{new_guid}', + headers={'Authorization': 'bearer my_client_token', 'content-type': 'application/json'}, + json=post_request_data + ) + + # get the actual guid generated by update + vlmd_for_study['data_dictionaries'][args.dictionary_name] = new_guid + + # check that request to update the study VLMD includes the updated VLMD + mocked_mds_put.assert_called_with( + f'https://{config.HOST_NAME}/mds/metadata/my_study_id?merge=True', + headers={'Authorization': 'bearer my_client_token', 'content-type': 'application/json'}, + json={'variable_level_metadata': vlmd_for_study} + ) + finally: cleanup_files([args.output]) diff --git a/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py b/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py index e389415d..f9e120a9 100644 --- a/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py +++ 
b/vlmd-submission-tools/vlmd_submission_tools/subcommands/upload_dictionary_to_mds.py @@ -159,7 +159,6 @@ def main(cls, options: Namespace) -> None: json_data = { "variable_level_metadata": vlmd_for_study } - # data = {"data_dictionaries": existing_data_dictionaries} url = f"https://{config.HOST_NAME}/mds/metadata/{options.study_id}?merge=True" response = requests.put(url, headers=headers, json=json_data) response.raise_for_status()