From 47a23a888ed614178e1ada94d6b344fe0510d394 Mon Sep 17 00:00:00 2001
From: Yajing Tang
Date: Fri, 22 Feb 2019 17:09:09 -0600
Subject: [PATCH] feat(datasets): add datasets endpoint (#111)

---
 .travis.yml                                |  2 +-
 openapis/swagger.yaml                      | 24 +++++++++
 peregrine/api.py                           |  3 ++
 peregrine/blueprints/datasets.py           | 62 ++++++++++++++++++++++
 peregrine/resources/submission/__init__.py | 18 +++++++
 run_tests.sh                               |  2 +-
 tests/conftest.py                          |  8 +++
 tests/graphql/test_datasets.py             | 47 ++++++++++++++++
 8 files changed, 164 insertions(+), 2 deletions(-)
 create mode 100644 peregrine/blueprints/datasets.py
 create mode 100644 tests/graphql/test_datasets.py

diff --git a/.travis.yml b/.travis.yml
index c3a693bf..b8e879ca 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -35,7 +35,7 @@ before_script:
 
 # command to run tests
 script:
-  - py.test -vv --cov=peregrine --cov-report xml tests/system_test.py tests/graphql/test_graphql.py
+  - py.test -vv --cov=peregrine --cov-report xml tests/system_test.py tests/graphql/test_graphql.py tests/graphql/test_datasets.py
 
 after_script:
   - python-codacy-coverage -r coverage.xml
diff --git a/openapis/swagger.yaml b/openapis/swagger.yaml
index d0191b48..efce733c 100644
--- a/openapis/swagger.yaml
+++ b/openapis/swagger.yaml
@@ -19,6 +19,30 @@ tags:
   - name: system
     description: System endpoints
 paths:
+  /datasets:
+    get:
+      tags:
+        - datasets
+      summary: Get counts for nodes for each project
+      parameters:
+        - in: query
+          name: nodes
+          description: comma-delimited nodes to get counts for
+          schema:
+            type: string
+      responses:
+        '200':
+          description: node counts for each project
+          content:
+            application/json:
+              schema:
+                type: object
+              example: {project_A: {case: 0, aliquot: 1}, project_B: {case: 2, aliquot: 3}}
+        '401':
+          description: unauthorized request
+        '400':
+          description: invalid request
   /graphql:
     post:
       tags:
diff --git a/peregrine/api.py b/peregrine/api.py
index e3ef9fe2..4e03aa60 100644
--- a/peregrine/api.py
+++ b/peregrine/api.py
@@ -15,6 +15,7 @@
 
 import peregrine
 from peregrine import dictionary
+from peregrine.blueprints import datasets
 from .errors import APIError, setup_default_handlers, UnhealthyCheck
 from .resources import submission
 from .version_data import VERSION, COMMIT, DICTVERSION, DICTCOMMIT
@@ -32,12 +33,14 @@ def app_register_blueprints(app):
     app.url_map.strict_slashes = False
 
     app.register_blueprint(peregrine.blueprints.blueprint, url_prefix=v0+'/submission')
+    app.register_blueprint(datasets.blueprint, url_prefix=v0+'/datasets')
 
 
 def app_register_duplicate_blueprints(app):
     # TODO: (jsm) deprecate this v0 version under root endpoint.  This
     # root endpoint duplicates /v0 to allow gradual client migration
     app.register_blueprint(peregrine.blueprints.blueprint, url_prefix='/submission')
+    app.register_blueprint(datasets.blueprint, url_prefix='/datasets')
 
 
 def async_pool_init(app):
diff --git a/peregrine/blueprints/datasets.py b/peregrine/blueprints/datasets.py
new file mode 100644
index 00000000..f755dfe4
--- /dev/null
+++ b/peregrine/blueprints/datasets.py
@@ -0,0 +1,62 @@
+import flask
+import os
+import re
+
+from peregrine.resources.submission import (
+    graphql,
+    set_read_access_projects_for_public_endpoint,
+    set_read_access_projects,
+)
+
+from cdiserrors import UserError, AuthZError
+
+blueprint = flask.Blueprint("datasets", "datasets")
+
+
+@blueprint.route("/", methods=["GET"])
+def get_datasets():
+    """
+    Get dataset-level summary counts. If the deployment sets the
+    PUBLIC_DATASETS environment variable to "true", this endpoint
+    is open to anonymous users.
+    """
+    nodes = flask.request.args.get("nodes", "")
+    nodes = nodes.split(",")
+    if not nodes:
+        raise UserError("Need to provide target nodes in query param")
+    if os.environ.get("PUBLIC_DATASETS", False) == "true":
+        set_read_access_projects_for_public_endpoint()
+    else:
+        set_read_access_projects()
+    projects = flask.g.read_access_projects
+    if not projects:
+        raise AuthZError("You are not authorized to access any projects")
+    # construct a single query that gets counts for all projects;
+    # because graphql can't group results by project, each count is
+    # labeled with its project index and later parsed with a regex
+    # to add structure to the response
+    query = "{"
+    for i, project_id in enumerate(projects):
+        query += (
+            " ".join(
+                map(
+                    lambda x: """i{i}_{node}: _{node}_count(project_id: "{p}")""".format(
+                        i=i, node=x, p=project_id
+                    ),
+                    nodes,
+                )
+            )
+            + " "
+        )
+    query += "}"
+    data, errors = graphql.execute_query(query, variables={})
+    if errors:
+        return flask.jsonify({"data": data, "errors": errors}), 400
+    result = {project_id: {} for project_id in projects}
+
+    for name, value in data.iteritems():
+        match = re.search("^i(\d)_(.*)", name)
+        index = int(match.group(1))
+        node = match.group(2)
+        result[projects[index]][node] = value
+    return flask.jsonify(result)
diff --git a/peregrine/resources/submission/__init__.py b/peregrine/resources/submission/__init__.py
index 99f7d12b..b120bdc5 100644
--- a/peregrine/resources/submission/__init__.py
+++ b/peregrine/resources/submission/__init__.py
@@ -43,6 +43,24 @@ def get_open_project_ids():
         for program in project['programs']
     ]
 
+def set_read_access_projects_for_public_endpoint():
+    """
+    Set the global user project list to include all projects, for endpoints
+    that don't need authorization.
+    """
+
+    with flask.current_app.db.session_scope():
+        projects = (
+            flask.current_app.db
+            .nodes(models.Project)
+            .all()
+        )
+        flask.g.read_access_projects = [
+            program['name'] + '-' + project['code']
+            for project in projects
+            for program in project['programs']
+        ]
+
 
 def set_read_access_projects():
     """
diff --git a/run_tests.sh b/run_tests.sh
index 1d384f0c..5b8efaa6 100755
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -11,4 +11,4 @@ userdatamodel-init --db test_userapi
 python bin/setup_test_database.py
 mkdir -p tests/resources/keys; cd tests/resources/keys; openssl genrsa -out test_private_key.pem 2048; openssl rsa -in test_private_key.pem -pubout -out test_public_key.pem; cd -
 
-py.test -vv --cov=peregrine --cov-report xml tests/system_test.py tests/graphql/test_graphql.py
+py.test -vv --cov=peregrine --cov-report xml tests/system_test.py tests/graphql/test_graphql.py tests/graphql/test_datasets.py
diff --git a/tests/conftest.py b/tests/conftest.py
index dc447b86..8d807416 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -238,3 +238,11 @@ def es_teardown():
         es.indices.refresh(index=INDEX)
 
     json_data.close()
+
+
+@pytest.fixture
+def public_dataset_api(request):
+    os.environ["PUBLIC_DATASETS"] = "true"
+    def tearDown():
+        os.environ["PUBLIC_DATASETS"] = "false"
+    request.addfinalizer(tearDown)
diff --git a/tests/graphql/test_datasets.py b/tests/graphql/test_datasets.py
new file mode 100644
index 00000000..f8788725
--- /dev/null
+++ b/tests/graphql/test_datasets.py
@@ -0,0 +1,47 @@
+from test_graphql import post_example_entities_together
+from datamodelutils import models
+import os
+
+
+def test_authorized_call_with_protected_config(
+    client, submitter, pg_driver_clean, cgci_blgsp
+):
+    post_example_entities_together(client, pg_driver_clean, submitter)
+    # there should be 5 cases; move all but the last 3 to another project
+    with pg_driver_clean.session_scope() as s:
+        cases = pg_driver_clean.nodes(models.Case).all()
+        case_count = len(cases)
+        for case in cases[0:-3]:
+            case.project_id = "OTHER-OTHER"
+            s.merge(case)
+    r = client.get("/datasets?nodes=case,aliquot", headers=submitter)
+    assert r.json.keys() == ["CGCI-BLGSP"]
+    assert r.json["CGCI-BLGSP"]["case"] == case_count - 2
+
+
+def test_anonymous_call_with_protected_config(client, pg_driver_clean, cgci_blgsp):
+    r = client.get("/datasets?nodes=case,aliquot")
+    assert r.status_code == 401
+
+
+def test_anonymous_call_with_public_config(
+    client, submitter, pg_driver_clean, cgci_blgsp, public_dataset_api
+):
+    post_example_entities_together(client, pg_driver_clean, submitter)
+    with pg_driver_clean.session_scope() as s:
+        project = models.Project("other", code="OTHER")
+        program = pg_driver_clean.nodes(models.Program).props(name="CGCI").first()
+        project.programs = [program]
+        s.add(project)
+        aliquot_count = pg_driver_clean.nodes(models.Aliquot).count()
+        cases = pg_driver_clean.nodes(models.Case).all()
+        case_count = len(cases)
+        for case in cases[0:-3]:
+            case.project_id = "CGCI-OTHER"
+            s.merge(case)
+
+    r = client.get("/datasets?nodes=case,aliquot")
+    assert r.json["CGCI-BLGSP"]["case"] == case_count - 2
+    assert r.json["CGCI-BLGSP"]["aliquot"] == aliquot_count
+    assert r.json["CGCI-OTHER"]["aliquot"] == 0
+    assert r.json["CGCI-OTHER"]["case"] == 2
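
For reference, a minimal sketch of how a client might call the new endpoint once this patch is deployed. The base URL, access token, and node names below are illustrative assumptions, not part of the patch; the response shape follows the swagger example above.

    import requests

    # Assumed deployment URL and token -- replace with real values.
    BASE_URL = "https://example-commons.org/api/v0"
    TOKEN = "<access token>"

    # Ask for per-project counts of the `case` and `aliquot` nodes.
    resp = requests.get(
        BASE_URL + "/datasets",
        params={"nodes": "case,aliquot"},
        headers={"Authorization": "Bearer " + TOKEN},
    )
    resp.raise_for_status()

    # Expected shape, per the swagger example:
    # {"project_A": {"case": 0, "aliquot": 1}, "project_B": {"case": 2, "aliquot": 3}}
    for project_id, counts in resp.json().items():
        print(project_id, counts)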