From 7d92352380736342029a6cbbd402529476d3c9e0 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 8 Aug 2024 17:54:39 -0700 Subject: [PATCH 01/12] Merge visitors and mappers modules into miners #515 Signed-off-by: Jono Yang --- minecode/api.py | 4 +- minecode/collectors/maven.py | 2 +- minecode/management/commands/check_uri.py | 3 +- .../commands/get_maven_release_dates.py | 4 +- minecode/management/commands/import_queue.py | 12 +- minecode/management/commands/maven_crawler.py | 2 +- .../management/commands/priority_queue.py | 4 +- minecode/management/commands/run_map.py | 6 +- minecode/management/commands/run_visit.py | 6 +- minecode/management/commands/seed.py | 6 +- minecode/mappers/__init__.py | 40 -- minecode/mappers/apache.py | 268 ----------- minecode/mappers/bitbucket.py | 139 ------ minecode/mappers/debian.py | 441 ------------------ minecode/mappers/dockerhub.py | 56 --- minecode/mappers/freebsd.py | 54 --- minecode/mappers/github.py | 142 ------ minecode/mappers/gitlab.py | 57 --- minecode/mappers/golang.py | 49 -- minecode/mappers/googlecode.py | 127 ----- minecode/mappers/gstreamer.py | 51 -- minecode/mappers/haxe.py | 35 -- minecode/mappers/maven.py | 135 ------ minecode/mappers/npm.py | 61 --- minecode/mappers/nuget.py | 182 -------- minecode/mappers/openssl.py | 69 --- minecode/mappers/openwrt.py | 90 ---- minecode/mappers/repomd.py | 31 -- minecode/{visitors => miners}/__init__.py | 26 +- minecode/{visitors => miners}/apache.py | 257 +++++++++- minecode/{visitors => miners}/bitbucket.py | 123 ++++- minecode/{mappers => miners}/bower.py | 68 ++- minecode/{visitors => miners}/conan.py | 0 minecode/{mappers => miners}/cpan.py | 187 +++++++- minecode/{mappers => miners}/cran.py | 35 +- minecode/{visitors => miners}/debian.py | 420 ++++++++++++++++- minecode/{visitors => miners}/dockerhub.py | 60 ++- minecode/{mappers => miners}/eclipse.py | 153 +++++- minecode/{mappers => miners}/fdroid.py | 85 +++- minecode/{visitors => miners}/fedora.py | 0 minecode/{visitors => miners}/freebsd.py | 52 ++- minecode/{mappers => miners}/freedesktop.py | 42 +- minecode/{visitors => miners}/generic.py | 0 minecode/{visitors => miners}/github.py | 170 ++++++- minecode/{visitors => miners}/gitlab.py | 63 ++- minecode/{visitors => miners}/gnu.py | 2 +- minecode/{visitors => miners}/golang.py | 51 +- minecode/{visitors => miners}/googlecode.py | 132 +++++- minecode/{visitors => miners}/gstreamer.py | 55 ++- minecode/{visitors => miners}/haxe.py | 39 +- minecode/{visitors => miners}/java_stream.py | 0 .../{visitors => miners}/java_stream.py.ABOUT | 0 .../java_stream.py.LICENSE | 0 minecode/{visitors => miners}/maven.py | 117 ++++- minecode/{visitors => miners}/npm.py | 39 +- minecode/{visitors => miners}/nuget.py | 185 +++++++- minecode/{visitors => miners}/openssl.py | 73 ++- minecode/{visitors => miners}/openwrt.py | 88 +++- minecode/{mappers => miners}/packagist.py | 51 +- minecode/{mappers => miners}/pypi.py | 115 ++++- minecode/{visitors => miners}/repodata.py | 0 .../{visitors => miners}/repodata_rpms.py | 2 +- .../repomd_parser.py => miners/repomd.py} | 35 +- minecode/{mappers => miners}/rubygems.py | 126 ++++- minecode/{mappers => miners}/sourceforge.py | 81 +++- minecode/{visitors => miners}/ubuntu.py | 0 minecode/models.py | 5 +- minecode/tests/test_apache.py | 16 +- minecode/tests/test_bitbucket.py | 10 +- minecode/tests/test_bower.py | 8 +- minecode/tests/test_conan.py | 4 +- minecode/tests/test_cpan.py | 16 +- minecode/tests/test_cran.py | 12 +- minecode/tests/test_debian.py | 108 ++--- 
minecode/tests/test_dockerhub.py | 6 +- minecode/tests/test_eclipse.py | 10 +- minecode/tests/test_fdroid.py | 9 +- minecode/tests/test_freebsd.py | 6 +- minecode/tests/test_freedesktop.py | 8 +- minecode/tests/test_generic.py | 2 +- minecode/tests/test_github.py | 10 +- minecode/tests/test_gitlab.py | 6 +- minecode/tests/test_gnu.py | 2 +- minecode/tests/test_golang.py | 8 +- minecode/tests/test_googlecode.py | 12 +- minecode/tests/test_gstreamer.py | 7 +- minecode/tests/test_haxe.py | 6 +- minecode/tests/test_maven.py | 171 +++---- minecode/tests/test_models.py | 4 +- minecode/tests/test_npm.py | 24 +- minecode/tests/test_nuget.py | 8 +- minecode/tests/test_openssl.py | 4 +- minecode/tests/test_openwrt.py | 8 +- minecode/tests/test_packagist.py | 6 +- minecode/tests/test_pypi.py | 35 +- minecode/tests/test_repodata.py | 2 +- minecode/tests/test_repodata_rpms.py | 2 +- minecode/tests/test_repomd_parser.py | 10 +- minecode/tests/test_rubygems.py | 26 +- minecode/tests/test_run_visit.py | 2 +- minecode/tests/test_seed.py | 2 +- minecode/tests/test_sourceforge.py | 12 +- minecode/visitors/bower.py | 74 --- minecode/visitors/cpan.py | 191 -------- minecode/visitors/cran.py | 44 -- minecode/visitors/eclipse.py | 158 ------- minecode/visitors/fdroid.py | 94 ---- minecode/visitors/freedesktop.py | 48 -- minecode/visitors/packagist.py | 57 --- minecode/visitors/pypi.py | 131 ------ minecode/visitors/rubygems.py | 145 ------ minecode/visitors/sourceforge.py | 90 ---- packagedb/api.py | 4 +- .../management/commands/fix_purl_values.py | 4 +- purl2vcs/src/purl2vcs/find_source_repo.py | 2 +- purldb_project/settings.py | 2 +- 116 files changed, 3057 insertions(+), 3582 deletions(-) delete mode 100644 minecode/mappers/__init__.py delete mode 100644 minecode/mappers/apache.py delete mode 100644 minecode/mappers/bitbucket.py delete mode 100644 minecode/mappers/debian.py delete mode 100644 minecode/mappers/dockerhub.py delete mode 100644 minecode/mappers/freebsd.py delete mode 100644 minecode/mappers/github.py delete mode 100644 minecode/mappers/gitlab.py delete mode 100644 minecode/mappers/golang.py delete mode 100644 minecode/mappers/googlecode.py delete mode 100644 minecode/mappers/gstreamer.py delete mode 100644 minecode/mappers/haxe.py delete mode 100644 minecode/mappers/maven.py delete mode 100644 minecode/mappers/npm.py delete mode 100644 minecode/mappers/nuget.py delete mode 100644 minecode/mappers/openssl.py delete mode 100644 minecode/mappers/openwrt.py delete mode 100644 minecode/mappers/repomd.py rename minecode/{visitors => miners}/__init__.py (92%) rename minecode/{visitors => miners}/apache.py (52%) rename minecode/{visitors => miners}/bitbucket.py (65%) rename minecode/{mappers => miners}/bower.py (63%) rename minecode/{visitors => miners}/conan.py (100%) rename minecode/{mappers => miners}/cpan.py (61%) rename minecode/{mappers => miners}/cran.py (83%) rename minecode/{visitors => miners}/debian.py (63%) rename minecode/{visitors => miners}/dockerhub.py (72%) rename minecode/{mappers => miners}/eclipse.py (53%) rename minecode/{mappers => miners}/fdroid.py (74%) rename minecode/{visitors => miners}/fedora.py (100%) rename minecode/{visitors => miners}/freebsd.py (59%) rename minecode/{mappers => miners}/freedesktop.py (60%) rename minecode/{visitors => miners}/generic.py (100%) rename minecode/{visitors => miners}/github.py (56%) rename minecode/{visitors => miners}/gitlab.py (58%) rename minecode/{visitors => miners}/gnu.py (95%) rename minecode/{visitors => miners}/golang.py (78%) rename 
minecode/{visitors => miners}/googlecode.py (59%) rename minecode/{visitors => miners}/gstreamer.py (55%) rename minecode/{visitors => miners}/haxe.py (69%) rename minecode/{visitors => miners}/java_stream.py (100%) rename minecode/{visitors => miners}/java_stream.py.ABOUT (100%) rename minecode/{visitors => miners}/java_stream.py.LICENSE (100%) rename minecode/{visitors => miners}/maven.py (93%) rename minecode/{visitors => miners}/npm.py (82%) rename minecode/{visitors => miners}/nuget.py (50%) rename minecode/{visitors => miners}/openssl.py (61%) rename minecode/{visitors => miners}/openwrt.py (53%) rename minecode/{mappers => miners}/packagist.py (70%) rename minecode/{mappers => miners}/pypi.py (55%) rename minecode/{visitors => miners}/repodata.py (100%) rename minecode/{visitors => miners}/repodata_rpms.py (97%) rename minecode/{visitors/repomd_parser.py => miners/repomd.py} (76%) rename minecode/{mappers => miners}/rubygems.py (68%) rename minecode/{mappers => miners}/sourceforge.py (55%) rename minecode/{visitors => miners}/ubuntu.py (100%) delete mode 100644 minecode/visitors/bower.py delete mode 100644 minecode/visitors/cpan.py delete mode 100644 minecode/visitors/cran.py delete mode 100644 minecode/visitors/eclipse.py delete mode 100644 minecode/visitors/fdroid.py delete mode 100644 minecode/visitors/freedesktop.py delete mode 100644 minecode/visitors/packagist.py delete mode 100644 minecode/visitors/pypi.py delete mode 100644 minecode/visitors/rubygems.py delete mode 100644 minecode/visitors/sourceforge.py diff --git a/minecode/api.py b/minecode/api.py index 13146acb..487c4491 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -24,8 +24,8 @@ from rest_framework.response import Response # UnusedImport here! -# But importing the mappers and visitors module triggers routes registration -from minecode import visitors # NOQA +# But importing the miners module triggers routes registration +from minecode import miners # NOQA from minecode import priority_router from minecode.models import PriorityResourceURI, ResourceURI, ScannableURI from minecode.permissions import IsScanQueueWorkerAPIUser diff --git a/minecode/collectors/maven.py b/minecode/collectors/maven.py index aab631fb..22fcc02c 100644 --- a/minecode/collectors/maven.py +++ b/minecode/collectors/maven.py @@ -1,5 +1,5 @@ from dateutil.parser import parse as dateutil_parse -from minecode.visitors.maven import get_artifacts, is_worthy_artifact, build_url_and_filename +from minecode.miners.maven import get_artifacts, is_worthy_artifact, build_url_and_filename from packagedcode.maven import get_urls from minecode.utils import fetch_http, get_temp_file from packagedcode.models import PackageData diff --git a/minecode/management/commands/check_uri.py b/minecode/management/commands/check_uri.py index f3a67355..5308dbff 100644 --- a/minecode/management/commands/check_uri.py +++ b/minecode/management/commands/check_uri.py @@ -16,8 +16,7 @@ # NOTE: mappers and visitors are Unused Import here: But importing the mappers # module triggers routes registration -from minecode import mappers # NOQA -from minecode import visitors # NOQA +from minecode import miners # NOQA from minecode import map_router from minecode import visit_router from minecode.models import ResourceURI diff --git a/minecode/management/commands/get_maven_release_dates.py b/minecode/management/commands/get_maven_release_dates.py index 5dc1e928..ded0916e 100644 --- a/minecode/management/commands/get_maven_release_dates.py +++ 
b/minecode/management/commands/get_maven_release_dates.py @@ -15,8 +15,8 @@ import requests from minecode.management.commands import VerboseCommand -from minecode.visitors.maven import collect_links_from_text -from minecode.visitors.maven import filter_for_artifacts +from minecode.miners.maven import collect_links_from_text +from minecode.miners.maven import filter_for_artifacts from packagedb.models import Package diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py index 52c0c355..6dfa3a85 100644 --- a/minecode/management/commands/import_queue.py +++ b/minecode/management/commands/import_queue.py @@ -22,14 +22,14 @@ from minecode.management.commands import get_error_message from minecode.management.commands import VerboseCommand from minecode.models import ImportableURI -from minecode.visitors.maven import get_artifact_links -from minecode.visitors.maven import get_classifier_from_artifact_url -from minecode.visitors.maven import collect_links_from_text -from minecode.visitors.maven import filter_only_directories -from minecode.visitors.maven import get_artifact_sha1 +from minecode.miners.maven import get_artifact_links +from minecode.miners.maven import get_classifier_from_artifact_url +from minecode.miners.maven import collect_links_from_text +from minecode.miners.maven import filter_only_directories +from minecode.miners.maven import get_artifact_sha1 from minecode.model_utils import merge_or_create_package from packagedcode.models import PackageData -from minecode.visitors.maven import determine_namespace_name_version_from_url +from minecode.miners.maven import determine_namespace_name_version_from_url logger = logging.getLogger(__name__) diff --git a/minecode/management/commands/maven_crawler.py b/minecode/management/commands/maven_crawler.py index 647a94ae..9a90815b 100644 --- a/minecode/management/commands/maven_crawler.py +++ b/minecode/management/commands/maven_crawler.py @@ -10,7 +10,7 @@ import logging import sys -from minecode.visitors.maven import crawl_maven_repo_from_root +from minecode.miners.maven import crawl_maven_repo_from_root from minecode.management.commands import VerboseCommand diff --git a/minecode/management/commands/priority_queue.py b/minecode/management/commands/priority_queue.py index 94fc4edd..ca3c5bff 100644 --- a/minecode/management/commands/priority_queue.py +++ b/minecode/management/commands/priority_queue.py @@ -16,8 +16,8 @@ from django.utils import timezone # UnusedImport here! -# But importing the mappers and visitors module triggers routes registration -from minecode import visitors # NOQA +# But importing the miners module triggers routes registration +from minecode import miners # NOQA from minecode import priority_router from minecode.management.commands import get_error_message from minecode.management.commands import VerboseCommand diff --git a/minecode/management/commands/run_map.py b/minecode/management/commands/run_map.py index 0ae56574..fbf80b9c 100644 --- a/minecode/management/commands/run_map.py +++ b/minecode/management/commands/run_map.py @@ -17,9 +17,9 @@ from django.utils import timezone # UnusedImport here! 
-# But importing the mappers and visitors module triggers routes registration -from minecode import mappers # NOQA -from minecode import visitors # NOQA +# But importing the miners module triggers routes registration + +from minecode import miners # NOQA from minecode import map_router from minecode.models import ResourceURI diff --git a/minecode/management/commands/run_visit.py b/minecode/management/commands/run_visit.py index 0bae2cca..4295400d 100644 --- a/minecode/management/commands/run_visit.py +++ b/minecode/management/commands/run_visit.py @@ -23,9 +23,9 @@ import reppy.cache # UnusedImport here! -# But importing the mappers and visitors module triggers routes registration -from minecode import mappers # NOQA -from minecode import visitors # NOQA +# But importing the miners module triggers routes registration + +from minecode import miners # NOQA from minecode import visit_router from minecode.management.commands import get_error_message diff --git a/minecode/management/commands/seed.py b/minecode/management/commands/seed.py index 9ee8fd9f..ef3c3121 100644 --- a/minecode/management/commands/seed.py +++ b/minecode/management/commands/seed.py @@ -15,9 +15,9 @@ from django.db import transaction # UnusedImport here! -# But importing the mappers and visitors module triggers routes registration -from minecode import mappers # NOQA -from minecode import visitors # NOQA +# But importing the miners module triggers routes registration + +from minecode import miners # NOQA from minecode import seed from minecode.models import ResourceURI diff --git a/minecode/mappers/__init__.py b/minecode/mappers/__init__.py deleted file mode 100644 index f8ccc9fe..00000000 --- a/minecode/mappers/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - - -import pkgutil - - -class Mapper(object): - """ - Abstract base class for mappers. Subclasses must implement the - get_packages() method and use a routing decorator for the URIs they can - handle. - """ - - def __call__(self, uri, resource_uri): - # Note: we let exceptions bubble up and they will be caught and - # processed by the worker loop - return self.get_packages(uri, resource_uri) - - def get_packages(self, uri, resource_uri): - """ - This method must yield ScannedPackage objects (or return a list) built - from a resource_uri ResourceURI object. - """ - raise NotImplementedError - - -""" -Minimal way to recursively import all submodules dynamically. If this module is -imported, all submodules will be imported: this triggers the actual registration -of mappers. This should stay as the last import in this init module. -""" -for _, name, _ in pkgutil.walk_packages(__path__, prefix=__name__ + '.'): - __import__(name) diff --git a/minecode/mappers/apache.py b/minecode/mappers/apache.py deleted file mode 100644 index 25b68aec..00000000 --- a/minecode/mappers/apache.py +++ /dev/null @@ -1,268 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. 
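Review note on the route-registration side effect behind the "NOQA" imports above: the diffstat shows minecode/visitors/__init__.py renamed to minecode/miners/__init__.py at 92% similarity, so the merged package presumably keeps the pattern of the deleted minecode/mappers/__init__.py shown here, where importing the package imports every submodule so that each @map_router.route and @visit_router.route decorator runs. A minimal sketch of that pattern as it would sit in miners/__init__.py (illustrative, not the merged file itself):

import pkgutil


class Mapper(object):
    """Abstract base class for mappers; subclasses implement get_packages()."""

    def __call__(self, uri, resource_uri):
        # Exceptions bubble up and are handled by the worker loop.
        return self.get_packages(uri, resource_uri)

    def get_packages(self, uri, resource_uri):
        raise NotImplementedError


# Importing every submodule is the side effect that registers all routes;
# this is why a bare `from minecode import miners  # NOQA` is enough in
# api.py, seed.py, run_map.py and run_visit.py.
for _, name, _ in pkgutil.walk_packages(__path__, prefix=__name__ + '.'):
    __import__(name)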
-# See https://aboutcode.org for more information about nexB OSS projects. -# - -from collections import OrderedDict -import json -import logging - -from packageurl import PackageURL - -from commoncode import fileutils -import packagedcode.models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import parse_date -from minecode.visitors.apache import CHECKSUM_EXTS - - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - -# TODO: Declared license should be an Apache license - -# common licenses found in JSON -APACHE_LICENSE_URL = { - 'http://usefulinc.com/doap/licenses/asl20', - 'https://usefulinc.com/doap/licenses/asl20', - 'http://spdx.org/licenses/Apache-2.0', - 'https://spdx.org/licenses/Apache-2.0', - 'http://www.apache.org/licenses/LICENSE-2.0', - 'https://www.apache.org/licenses/LICENSE-2.0', - 'http://www.apache.org/licenses/LICENSE-2.0.txt', - 'https://www.apache.org/licenses/LICENSE-2.0.txt', - 'http://www.apache.org/licenses/', - 'http://forrest.apache.org/license.html', - 'https://svn.apache.org/repos/asf/tomee/tomee/trunk/LICENSE', -} - - -# FIXME: this is NOT specific to a download URL but to a project: disabled for now -# @map_router.route('https://projects.apache.org/json/foundation/projects.json') -class ApacheProjectJsonMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Packages built from resource_uri record for a single - package version. - """ - metadata = json.loads(resource_uri.data, object_pairs_hook=OrderedDict) - return build_packages_from_projects(metadata, uri=uri) - - -def build_packages_from_projects(metadata, uri=None): - """ - Yield Packages built from an Apache `metadata` mapping - which is a dictionary keyed by project name and values are project_metadata. - Yield as many Package as there are download URLs.
- """ - for project_name, project_meta in metadata.items(): - short_desc = project_meta.get('shortdesc') - long_desc = project_meta.get('description') - descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) - common_data = dict( - datasource_id="apache_json", - type='apache', - name=project_name, - description=description, - homepage_url=project_meta.get('homepage'), - bug_tracking_url=project_meta.get('bug-database'), - primary_language=project_meta.get('programming-language'), - ) - - # FIXME: setting the download-page as the download_url is not right - if project_meta.get('download-page'): - download_url = project_meta.get('download-page') - common_data['download_url'] = download_url - for repo in project_meta.get('repository', []): - common_data['code_view_url'] = repo - # Package code_view_url only support one URL, so break when - # finding a code_view_url - break - - maintainers = project_meta.get('maintainer', []) - for maintainer in maintainers: - mailbox = maintainer.get('mbox', '').replace('mailto:', '') - name = maintainer.get('name') - party = scan_models.Party( - type=scan_models.party_person, name=name, role='maintainer', email=mailbox) - parties = common_data.get('parties') - if not parties: - common_data['parties'] = [] - common_data['parties'].append(party.to_dict()) - - # license is just a URL in the json file, for example: - # http://usefulinc.com/doap/licenses/asl20 - license_url = project_meta.get('license') - common_data['extracted_license_statement'] = license_url - - if license_url in APACHE_LICENSE_URL: - common_data['declared_license_expression'] = 'apache-2.0' - common_data['declared_license_expression_spdx'] = 'Apache-2.0' - common_data['license_detections'] = [] - - keywords = [] - category = project_meta.get('category', '') - for kw in category.split(','): - kw = kw.strip() - if kw: - keywords.append(kw) - common_data['keywords'] = keywords - - common_data['primary_language'] = project_meta.get( - 'programming-language') - - # FIXME: these cannot be related to actual packages with a download URL - releases = project_meta.get('release') - if releases: - for release in releases: - rdata = dict(common_data) - rdata['version'] = release.get('revision') - if release.get('created') and len(release.get('created')) == 10: - rdata['release_date'] = parse_date(release.get('created')) - else: - logger.warn('Unexpected date format for release date: {}'.format( - release.get('created'))) - package = scan_models.Package.from_package_data( - package_data=rdata, - datafile_path=uri, - ) - yield package - else: - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - yield package - - -# FIXME: this is NOT specific to a download URL but to a project: disabled for now -# FIXME: this is casting too wide a net! -# @map_router.route('http?://[\w\-\.]+.incubator.apache.org/"') -class ApachePodlingsMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Packages built from resource_uri record for a single - package version. - """ - metadata = json.loads(resource_uri.data, object_pairs_hook=OrderedDict) - return build_packages_from_podlings(metadata, resource_uri.package_url) - - -def build_packages_from_podlings(metadata, purl): - """ - Yield Package built from Apache podlings metadata - which is a dictionary keyed by project name and values are project_metadata. - Yield as many Package as there are download URLs. 
- """ - name = metadata.get('name') - if name: - common_data = dict( - type='apache-podling', - name=name, - description=metadata.get('description'), - homepage_url=metadata.get('homepage'), - ) - package = scan_models.Package(**common_data) - package.set_purl(purl) - yield package - - -@map_router.route('http?s://(archive\.)?apache\.org/dist/.*') -class ApacheDownloadMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Packages build from a bare download URI or download checksum URI. - """ - if uri.endswith(CHECKSUM_EXTS): - # 1. create a regular package from the URL stripped from its checksum extension - archive_uri, _, checksum_type = uri.rpartition('.') - - pack = build_package_from_download( - archive_uri, resource_uri.package_url) - # 2. collect the checksum inside the file - # and attach it to the package - checksum_value = resource_uri.data.strip() - if checksum_value: - checksum_field_name = 'download_{checksum_type}'.format( - **locals()) - setattr(pack, checksum_field_name, checksum_value) - yield pack - else: - # a plain download URI - yield build_package_from_download(uri, resource_uri.package_url) - - -def build_package_from_download(uri, purl=None): - """ - Return a Package built from an Apache dist download archive URL. - - The uri could be: - http://archive.apache.org/dist/groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip - https://apache.org/dist/chemistry/opencmis/1.1.0/chemistry-opencmis-dist-1.1.0-server-webapps.zip - """ - name, version = get_name_version(uri) - if purl: - purl = PackageURL.from_string(purl) - if not name: - name = purl.name - # FIXME: use purl data?? - package = scan_models.Package( - type='apache', - namespace=purl.namespace, - name=name, - version=version, - download_url=uri, - ) - package.set_purl(purl) - return package - - -# FIXME: there should be only one such method and this one is rather weak -def get_name_version(uri): - """ - Return name and version extracted from a path. - """ - # base_url will end being 'https://archive.apache.org/dist' or 'https://apache.org/dist' - # path is the uri without base url, for example: - # /groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip - _, _, path = uri.partition('apache.org/dist/') - base_name = fileutils.file_base_name(path) - version = None - package_name = '' - name_segments = base_name.split('-') - for segment in name_segments: - try: - # To test if each split segment with . is integer. - # For example in '1.2.3' all chars are integer or period. - # If so, this segment is a version segment. - if version: - # The segment after integer segment should belong to version too. - # For example: turbine-4.0-M1, after detecting 4.0, - # M1 should be including in version too, so the final version is 4.0-M1 - version = '-'.join([version, segment]) - continue - - is_all_int = all(n.isdigit() for n in segment.split('.')) - if is_all_int: - version = segment - except ValueError: - # Connect the package_name with - because we split it with - eariler, util - # when we meet version, package_name should be good. - if not package_name: - package_name = segment - else: - package_name = ('-').join([package_name, segment]) - continue - return package_name, version diff --git a/minecode/mappers/bitbucket.py b/minecode/mappers/bitbucket.py deleted file mode 100644 index 5764cb58..00000000 --- a/minecode/mappers/bitbucket.py +++ /dev/null @@ -1,139 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. 
-# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -from collections import OrderedDict -import json -import logging - -from packagedcode import models as scan_models -from packageurl import PackageURL - -from minecode import map_router -from minecode.mappers import Mapper - - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - - -@map_router.route( - 'https://api.bitbucket\.org/2\.0/repositories/.*/downloads/', -) -class BitbucketDownloadMapper(Mapper): - """ - Build packages from download URLs if present. - """ - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single package version. - """ - downloads_data = json.loads( - resource_uri.data, object_pairs_hook=OrderedDict) - for download_data in downloads_data.get('values', []): - for package in build_bitbucket_download_packages( - download_data, resource_uri.package_url): - yield package - - -def build_bitbucket_download_packages(download_data, purl): - """ - Yield scanned Packages for each download - https://api.bitbucket.org/2.0/repositories/pypa/setuptools/downloads/ - """ - purl = PackageURL.from_string(purl) - namespace = purl.namespace - name = purl.name - - # FIXME: add these ? - filename = download_data.get('name') - download_counts = download_data.get('downloads', 0) - - download_url = download_data.get('links', {}).get('self', {}).get('href') - size = download_data.get('size') - - package = scan_models.Package( - type='bitbucket', - name=name, - namespace=namespace, - download_url=download_url, - size=size, - ) - package.set_purl(purl) - yield package - - -# @map_router.route('https://api.bitbucket.org/2.0/repositories/[^\/]*/[^\/]*') -class BitbucketIndexMapper(Mapper): - """ - Build a Package for a repo. - """ - - def get_packages(self, uri, resource_uri): - repo = json.loads(resource_uri.data, object_pairs_hook=OrderedDict) - if not repo: - return - yield build_bitbucket_repo_package(repo, resource_uri.package_url) - - -# FIXME: disabled as this is for a package template -# @map_router.route('https://api.bitbucket.org/2.0/repositories/[^\/]*/[^\/]*') -class BitbucketRepoMapper(Mapper): - """ - Build a Package for a repo. - """ - - def get_packages(self, uri, resource_uri): - repo = json.loads(resource_uri.data, object_pairs_hook=OrderedDict) - if not repo: - return - yield build_bitbucket_repo_package(repo, resource_uri.package_url) - - -def build_bitbucket_repo_package(repo_data, purl): - """ - Return a Package "template" from repository data. - Notes: this is not version-specific and has no download URL.
- """ - purl = PackageURL.from_string(purl) - scm_protocol = repo_data.get('scm') - if not scm_protocol: - scm_protocol = 'git' - bb_url = '{protocol}+https://bitbucket.org/{namespace}/{name}'.format( - protocol=scm_protocol, **purl.to_dict()) - - owner = repo_data.get('owner') - owner_party = scan_models.Party( - type=scan_models.party_person, - name=owner.get('username'), - role='owner', - url=owner.get('links', {}).get('html', {}).get('href', {}) - ) - - if repo_data.get('has_issues'): - bug_tracking_url = bb_url + '/issues' - else: - bug_tracking_url = None - - package = scan_models.Package( - type=purl.type, - namespace=purl.namespace, - name=purl.name, - homepage_url=repo_data.get('website') or bb_url, - code_view_url=bb_url + '/src', - bug_tracking_url=bug_tracking_url, - description=repo_data.get('description'), - vcs_url=bb_url, - primary_language=repo_data.get('language'), - parties=[owner_party], - ) - package.set_purl(purl) - return package diff --git a/minecode/mappers/debian.py b/minecode/mappers/debian.py deleted file mode 100644 index a2f6e01a..00000000 --- a/minecode/mappers/debian.py +++ /dev/null @@ -1,441 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - - -from collections import defaultdict -import json -import logging - -import attr -from debian_inspector import debcon -from packagedcode import models as scan_models -from packageurl import PackageURL - -from minecode import ls -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import form_vcs_url -from minecode import debutils - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - - -# FIXME: We are not returning download URLs. Returned information is incorrect - - -def get_dependencies(data): - """ - Return a list of DependentPackage extracted from a Debian `data` mapping. - """ - scopes = { - 'Build-Depends': dict(is_runtime=False, is_optional=True), - 'Depends': dict(is_runtime=True, is_optional=False), - 'Pre-Depends': dict(is_runtime=True, is_optional=False), - # 'Provides': dict(is_runtime=True, is_optional=False), - # 'Recommends': dict(is_runtime=True, is_optional=True), - # 'Suggests': dict(is_runtime=True, is_optional=True), - } - dep_pkgs = [] - for scope, flags in scopes.items(): - depends = data.get(scope) - if not depends: - continue - - dependencies = None # debutils.comma_separated(depends) - if not dependencies: - continue - # break each dep in package names and version constraints - # FIXME:!!! - for name in dependencies: - purl = PackageURL(type='deb', namespace='debian', name=name) - dep = scan_models.DependentPackage( - purl=purl.to_string(), score=scope, **flags) - dep_pkgs.append(dep) - - return dep_pkgs - - -def get_vcs_repo(description): - """ - Return a tuple of (vcs_tool, vcs_repo) or (None, None) if no vcs_repo is found. 
- """ - repos = [] - for vcs_tool, vcs_repo in description.items(): - vcs_tool = vcs_tool.lower() - if not vcs_tool.startswith('vcs-') or vcs_tool.startswith('vcs-browser'): - continue - _, _, vcs_tool = vcs_tool.partition('-') - repos.append((vcs_tool, vcs_repo)) - - if len(repos) > 1: - raise TypeError( - 'Debian description with more than one Vcs repos: %(repos)r' % locals()) - - if repos: - vcs_tool, vcs_repo = repos[0] - else: - vcs_tool = None - vcs_repo = None - - return vcs_tool, vcs_repo - - -@map_router.route('http://ftp.debian.org/debian/pool/.*\.dsc') -class DebianDescriptionMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield packages parsed from a dsc Debian control file mapping. - """ - return parse_description( - metadata=json.loads(resource_uri.data), - purl=resource_uri.package_url, - base_download_url=None) - - -def get_files(text): - """ - Yield tuples of (checksum, size, filename) collected from a files field - `text`. - """ - if text: - for line in text.splitlines(False): - # we have htree space-separated items, so we perform two partitions - line = ' '.join(line.split()) - checksum, _, rest = line.partition(' ') - size, _, filename = rest.partition(' ') - yield checksum, size, filename - - -def parse_description(metadata, purl=None, base_download_url=None): - """ - Yield Scanned Package parse from description `metadata` mapping - for a single package version. - Yield as many Package as there are download URLs. - Optionally use the `purl` Package URL string if provided. - """ - # FIXME: this may not be correct: Source and Binary are package names - common_data = dict( - name=metadata['Source'], - version=metadata['Version'], - homepage_url=metadata.get('Homepage'), - code_view_url=metadata.get('Vcs-Browser'), - parties=[] - ) - - if metadata.get('Label'): - common_data['keywords'] = [metadata.get('Label')] - - vcs_tool, vcs_repo = get_vcs_repo(metadata) - if vcs_tool and vcs_repo: - vcs_repo = form_vcs_url(vcs_tool, vcs_repo) - common_data['vcs_url'] = vcs_repo - - dependencies = get_dependencies(metadata) - if dependencies: - common_data['dependencies'] = dependencies - - # TODO: add "original maintainer" seen in Ubuntu - maintainer = metadata.get('Maintainer') - if maintainer: - name, email = debutils.parse_email(maintainer) - if name: - party = scan_models.Party( - name=name, role='maintainer', email=email) - common_data['parties'].append(party) - - @attr.s() - class File(object): - name = attr.ib(default=None) - size = attr.ib(default=None) - md5 = attr.ib(default=None) - sha1 = attr.ib(default=None) - sha256 = attr.ib(default=None) - - def collect_files(existing_files, field_value, checksum_name): - for checksum, size, name in get_files(field_value): - fl = existing_files[name] - if not fl.name: - fl.name = name - fl.size = size - setattr(fl, checksum_name, checksum) - - # TODO: what do we do with files? 
- # FIXME: we should store them in the package record - files = defaultdict(File) - collect_files(existing_files=files, field_value=metadata.get( - 'Files'), checksum_name='md5') - collect_files(existing_files=files, field_value=metadata.get( - 'Checksums-Sha1'), checksum_name='sha1') - collect_files(existing_files=files, field_value=metadata.get( - 'Checksums-Sha256'), checksum_name='sha256') - - # FIXME: craft a download_url - download_url = None - if base_download_url: - download_url = None - common_data['download_url'] = download_url - - package = scan_models.DebianPackage(**common_data) - package.set_purl(purl) - yield package - - -@map_router.route('http://ftp.debian.org/debian/dists/.*Sources.gz') -class DebianSourceFileMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield ScannedPackages built from resource_uri record for a single - package version. - Yield as many Package as there are download URLs. - """ - metadata = resource_uri.data - return parse_packages(metadata, resource_uri.package_url) - - -def build_source_file_packages(metadata, purl=None): - """ - Yield packages from the passing source file metadata. - metadata: json metadata content - purl: String value of the package url of the ResourceURI object - """ - for source in debcon.get_paragraphs_data(metadata): - package_name = source.get('Package') - - parties = [] - maintainer_names = debutils.comma_separated( - source.get('Maintainer', '')) - if maintainer_names: - for maintainer in maintainer_names: - name, email = debutils.parse_email(maintainer) - if name: - party = scan_models.Party( - name=name, role='maintainer', email=email) - parties.append(party) - contributor_names = debutils.comma_separated( - source.get('Uploaders', '')) - if contributor_names: - for contributor in contributor_names: - name, email = debutils.parse_email(contributor) - if name: - party = scan_models.Party( - name=name, role='contributor', email=email) - parties.append(party) - - dependencies = get_dependencies(source, ['Build-Depends']) - - keywords = set() - keywords.update(debutils.comma_separated(source.get('Binary', ''))) - if source.get('Section'): - keywords.add(source.get('Section')) - - files = source.get('Files') - for f in files: - name = f.get('name') - package = dict( - name=package_name, - version=source.get('Version'), - dependencies=dependencies, - parties=parties, - code_view_url=source.get('Vcs-Browser'), - homepage_url=source.get('Homepage'), - keywords=list(keywords), - ) - - download_url = 'http://ftp.debian.org/debian/{path}/{name}'.format( - path=source.get('Directory'), - name=name) - - package['download_url'] = download_url - - vcs_tool, vcs_repo = get_vcs_repo(source) - if vcs_tool and vcs_repo: - vcs_repo = form_vcs_url(vcs_tool, vcs_repo) - package['vcs_url'] = vcs_repo - - package['md5'] = f.get('md5sum') - # TODO: Why would we have more than a single SHA1 or SHA256 - sha1s = source.get('Checksums-Sha1', []) - for sha1 in sha1s: - sha1value = sha1.get('sha1') - name = sha1.get('name') - if name and sha1value: - package['sha1'] = sha1value - sha256s = source.get('Checksums-Sha256', []) - for sha256 in sha256s: - sha256value = sha256.get('sha256') - name = sha256.get('name') - if name and sha256value: - package['sha256'] = sha256value - package = scan_models.DebianPackage(**package) - package.set_purl(purl) - yield package - - -@map_router.route('http://ftp.debian.org/debian/dists/.*Packages.gz') -class DebianPackageFileMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - 
Yield Packages from a Debian Packages index. - """ - metadata = resource_uri.data - return parse_packages(metadata, resource_uri.package_url) - - -def get_programming_language(tags): - """ - Return the programming language extracted from list of `tags` strings. - """ - for tag in tags: - key, _, value = tag.partition('::') - if key == 'implemented-in': - return value - - -def parse_packages(metadata, purl=None): - """ - Yield packages from Debian package text data. - metadata: Debian data (e.g. a Packages files) - purl: String value of the package url of the ResourceURI object - """ - for pack in debcon.get_paragraphs_data(metadata): - data = dict( - name=pack['Package'], - version=pack['Version'], - homepage_url=pack.get('Homepage'), - code_view_url=pack.get('Vcs-Browser'), - description=pack.get('Description'), - bug_tracking_url=pack.get('Bugs'), - parties=[], - md5=pack.get('MD5sum'), - sha1=pack.get('SHA1'), - sha256=pack.get('SHA256'), - ) - - filename = pack.get('Filename') - if filename: - data['download_url'] = 'http://ftp.debian.org/debian/{}'.format( - filename) - - maintainers = pack.get('Maintainer') - if maintainers: - name, email = debutils.parse_email(maintainers) - if name: - party = scan_models.Party( - name=name, role='maintainer', email=email) - data['parties'].append(party) - - dependencies = get_dependencies(pack) - if dependencies: - data['dependencies'] = dependencies - - keywords = debutils.comma_separated(pack.get('Tag', '')) - - section = pack.get('Section') - if section: - keywords.append(section) - data['keywords'] = keywords - - data['primary_language'] = get_programming_language(keywords) - - package = scan_models.DebianPackage(**data) - if purl: - package.set_purl(purl) - yield package - - -################################################################################# -# FIXME: this cannot work since we do not fetch these yet AND what are the zip jar and gz in this??? -################################################################################# - - -@map_router.route('http://ftp.debian.org/debian/dists/.*\.zip', - 'http://ftp.debian.org/debian/dists/.*\.jar', - 'http://ftp.debian.org/debian/dists/.*\.gz') -class DebianArchiveFileMapper(Mapper): - - def get_packages(self, uri, resource_uri): - return build_packages_from_dist_archive(resource_uri.data, resource_uri.uri) - - -def build_packages_from_dist_archive(metadata, uri): - """ - Yield Package built from a Debian project URI and the associated ls content, - which is the result of running the ls -LR command at the Debian root folder. - Yield as many Package as there are download URLs. - """ - debian_dist_length = len('http://ftp.debian.org/debian/dists') - # The parent folder URI relative to the uri file itself. - folder_uri = uri[debian_dist_length: uri.rindex('/')] - debian_dist_length = len('http://ftp.debian.org/debian/dists') - # project name by truncating the uri - name = uri[debian_dist_length:uri.index('/', debian_dist_length)] - folder_length = debian_dist_length + len(name) + 1 - # version by analysing the uri - version = uri[folder_length:uri.index('/', folder_length)] - common_data = dict( - datasource_id="debian_archive_file", - name=name, - version=version, - ) - - # FIXME: this is NOT RIGHT - def get_resourceuri_by_uri(uri): - """ - Return the ResourceURI found by searching with the given uri string value.
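parse_packages() above derives primary_language from debtags; get_programming_language() just looks for the implemented-in facet. For instance, with an illustrative Tag field:

tags = ['implemented-in::python', 'role::program', 'devel::lang:python']
get_programming_language(tags)
# 'python'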
- """ - from minecode.models import ResourceURI - uris = ResourceURI.objects.filter(uri=uri) - if uris: - return uris[0] - - url_template = 'http://ftp.debian.org/debian/dists{name}' - download_urls = [] - for entry in ls.parse_directory_listing(metadata): - if entry.type != ls.FILE: - continue - path = entry.path - - if path.startswith(folder_uri): - path = path.lstrip('/') - url = url_template.format(name=path) - # FIXME: this is NOT RIGHT - if path.endswith('.md5') and url.replace('.md5', '') == uri: - if get_resourceuri_by_uri(url) and get_resourceuri_by_uri(url).md5: - common_data['md5'] = get_resourceuri_by_uri(url).md5 - # FIXME: this is NOT RIGHT - if path.endswith('.sha') and url.replace('.sha', '') == uri: - if get_resourceuri_by_uri(url) and get_resourceuri_by_uri(url).sha1: - common_data['sha1'] = get_resourceuri_by_uri(url).sha1 - - if path.endswith(('.jar', 'zip', 'gz')) and url != uri: - download_urls.append(url) - - if download_urls: - for download_url in download_urls: - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - package['download_url'] = download_url - yield package - else: - # yield package without a download_url value - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - # FIXME: this is NOT RIGHT: purl is not defined - package.set_purl(package.purl) - yield package diff --git a/minecode/mappers/dockerhub.py b/minecode/mappers/dockerhub.py deleted file mode 100644 index 92b8697f..00000000 --- a/minecode/mappers/dockerhub.py +++ /dev/null @@ -1,56 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json - -from packagedcode import models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper - - -@map_router.route('https://registry.hub.docker.com/v2/repositories/library/[\w\-\.]+/') -class DockerHubLiraryJsonMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single - package version. - Yield as many Package as there are download URLs. - """ - metadata = resource_uri.data - build_packages_from_jsonfile( - metadata, resource_uri.uri, resource_uri.package_url) - - -def build_packages_from_jsonfile(metadata, uri=None, purl=None): - """ - Yield Package built from Docker Hub json content. - metadata: json metadata content - uri: String value of uri of the ResourceURI object. 
- purl: String value of the package url of the ResourceURI object - """ - content = json.loads(metadata) - dockhub_library_htmlpage_template = 'https://hub.docker.com/_/{project}' - name = content.get('name') - if name: - short_desc = content.get('description') - long_desc = content.get('full_description') - descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) - common_data = dict( - type='docker', - name=name, - description=description, - homepage_url=dockhub_library_htmlpage_template.format( - project=name), - ) - package = scan_models.Package(**common_data) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/freebsd.py b/minecode/mappers/freebsd.py deleted file mode 100644 index fb0a760b..00000000 --- a/minecode/mappers/freebsd.py +++ /dev/null @@ -1,54 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - - -from io import StringIO -import os -import saneyaml - -from packagedcode.freebsd import CompactManifestHandler - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import get_temp_dir - - -@map_router.route('https://pkg.freebsd.org/.*packagesite.txz') -class FreeBSDIndexMapper(Mapper): - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single - package version. - Yield as many Package as there are download URLs. - """ - return build_packages(resource_uri.data, resource_uri.package_url) - - -def build_packages(metadata, purl=None): - """ - Yield the package by parsing the passing json content. - metadata: json metadata content - purl: String value of the package url of the ResourceURI object - """ - buf = StringIO(metadata) - # The passing metadata is not a well-formatted yaml or json, but each line is a yaml, so read by line and parse with FreeBSDPackage parser. - for each_line in buf: - if each_line and each_line.strip() in ('', '{', '}'): - continue - content = saneyaml.load(each_line) - if content and content.get('name'): - temp_dir = get_temp_dir('freebsd_index') - location = os.path.join(temp_dir, '+COMPACT_MANIFEST') - with open(location, 'w') as manifest: - manifest.write(each_line) - with open(location, encoding='utf-8') as loc: - yaml_data = saneyaml.load(loc) - package = CompactManifestHandler._parse(yaml_data=yaml_data) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/github.py b/minecode/mappers/github.py deleted file mode 100644 index 6ef72f00..00000000 --- a/minecode/mappers/github.py +++ /dev/null @@ -1,142 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
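Review note: DockerHubLiraryJsonMapper.get_packages() above calls build_packages_from_jsonfile() but discards the generator it returns, so the mapper yields nothing; the GitLabMapper later in this patch has the same shape. Since these files move into miners/ essentially unchanged, the delegation presumably still needs a return, along these lines:

    def get_packages(self, uri, resource_uri):
        # Return the generator so the worker loop can iterate the packages.
        return build_packages_from_jsonfile(
            resource_uri.data, resource_uri.uri, resource_uri.package_url)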
-# - -from collections import OrderedDict -import json -import logging - -import attr - -import packagedcode.models as scan_models -from packageurl import PackageURL - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import form_vcs_url -from minecode.utils import parse_date - - -logger = logging.getLogger(__name__) - - -@map_router.route('https://api\.github\.com/repos/([^/]+)/([^/]+)') -class GithubMetaFileMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single - package version. - Yield as many Package as there are download URLs. - """ - visited_data = resource_uri.data - if not visited_data: - return - return build_github_packages(visited_data, resource_uri.uri, resource_uri.package_url) - - -def build_github_packages(visited_data, uri, purl=None): - """ - Yield Package built from Github API visited_data as a JSON string. - visited_data: JSON string of the GitHub API repository data - uri: String value of the uri from ResourceURI object - purl: String value of the package url of the ResourceURI object - """ - visited_data = json.loads(visited_data, object_pairs_hook=OrderedDict) - - full_name = visited_data['full_name'] - namespace, name = split_org_repo(full_name) - # FIXME: when could this ever happen?? - assert name == visited_data['name'], 'build_github_packages: Inconsistent name and org for URI: ' + uri - - description = visited_data['description'] - - vcs_url = visited_data.get('git_url') - if vcs_url: - vcs_url = form_vcs_url('git', vcs_url) - package = scan_models.Package( - type='github', - namespace=namespace, - name=name, - description=description, - primary_language=visited_data.get('language'), - homepage_url=visited_data.get('html_url'), - vcs_url=vcs_url, - # this size does not make sense - size=visited_data.get('size'), - ) - - if visited_data.get('owner'): - package.parties = [ - scan_models.Party( - # FIXME: we can add the org or user URL and we can know if this - # is an org or a person too. - type=scan_models.party_person, - name=visited_data.get('owner'), - role='owner') - ] - - package.set_purl(purl) - - downloads = visited_data.get('downloads') or [] - for download in downloads: - html_url = download.get('html_url') - if html_url: - # make a copy - package = attr.evolve(package) - package.download_url = html_url - package.size = download.get('size') - package.release_date = parse_date(download.get('created_at')) - yield package - - tags = visited_data.get('tags') or [] - for tag in tags: - package = attr.evolve(package) - package.version = tag.get('name') - package_url = PackageURL(type='github', name=package.name, - namespace=namespace, version=tag.get('name')).to_string() - package.sha1 = tag.get('sha1') - if tag.get('tarball_url'): - package.download_url = tag.get('tarball_url') - package.set_purl(package_url) - yield package - if tag.get('zipball_url'): - package.download_url = tag.get('zipball_url') - package.set_purl(package_url) - yield package - - branches_download_urls = visited_data.get('branches_download_urls') or [] - for branches_download_url in branches_download_urls: - package = attr.evolve(package) - package.download_url = branches_download_url - yield package - - -def split_org_repo(url_like): - """ - Given a URL-like string to a GitHub repo or a repo name as in org/name, - split and return the org and name.
- - For example: - >>> split_org_repo('foo/bar') - ('foo', 'bar') - >>> split_org_repo('https://api.github.com/repos/foo/bar/') - ('foo', 'bar') - >>> split_org_repo('github.com/foo/bar/') - ('foo', 'bar') - >>> split_org_repo('git://github.com/foo/bar.git') - ('foo', 'bar') - """ - segments = [s.strip() for s in url_like.split('/') if s.strip()] - if not len(segments) >= 2: - raise ValueError('Not a GitHub-like URL: {}'.format(url_like)) - org = segments[-2] - name = segments[-1] - if name.endswith('.git'): - name, _, _ = name .rpartition('.git') - return org, name diff --git a/minecode/mappers/gitlab.py b/minecode/mappers/gitlab.py deleted file mode 100644 index 8a6d7c6c..00000000 --- a/minecode/mappers/gitlab.py +++ /dev/null @@ -1,57 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json - -import packagedcode.models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import form_vcs_url -from minecode.utils import parse_date - - -@map_router.route('https://gitlab.com/.*') -class GitLabMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single - package version. - Yield as many Package as there are download URLs. - """ - metadata = resource_uri.data - build_packages_from_json(metadata, resource_uri.package_url) - - -def build_packages_from_json(metadata, purl=None): - """ - Yield Package built from gitlab json content - metadata: Json metadata content - purl: String value of the package url of the ResourceURI object - """ - content = json.loads(metadata) - - name = content.get('name') - if name: - common_data = dict( - type='gitlab', - name=name, - homepage_url=content.get('web_url'), - description=content.get('description'), - ) - repo_url = content.get('http_url_to_repo') - if repo_url: - repo_url = form_vcs_url('git', repo_url) - common_data['vcs_url'] = repo_url - common_data['code_view_url'] = repo_url - common_data['release_date'] = parse_date(content.get('created_at')) - package = scan_models.Package(**common_data) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/golang.py b/minecode/mappers/golang.py deleted file mode 100644 index cd750da1..00000000 --- a/minecode/mappers/golang.py +++ /dev/null @@ -1,49 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
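A sketch of the GitLab project JSON that build_packages_from_json() reads; the key names are the ones accessed above, the values are made up:

import json

from minecode.miners.gitlab import build_packages_from_json

metadata = json.dumps({
    'name': 'inkscape',
    'web_url': 'https://gitlab.com/inkscape/inkscape',
    'description': 'Vector graphics editor',
    'http_url_to_repo': 'https://gitlab.com/inkscape/inkscape.git',
    'created_at': '2017-05-09T10:20:30.000Z',
})
for package in build_packages_from_json(metadata, 'pkg:gitlab/inkscape/inkscape'):
    print(package.vcs_url)
    # git+https://gitlab.com/inkscape/inkscape.git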
-# - -import json - -from packagedcode import models as scan_models -from packageurl import PackageURL - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import form_vcs_url - - -@map_router.route('pkg:golang/.*') -class GolangApiDocMapper(Mapper): - - def get_packages(self, uri, resource_uri): - package = json.loads(resource_uri.data) - yield build_golang_package(package, resource_uri.package_url) - - -def build_golang_package(package_data, purl): - """ - Return a single Golang package - """ - package_url = PackageURL.from_string(purl) - vcs_url = package_url.qualifiers.get('vcs_repository') - homepage_url = '/'.join(['https:/', - package_url.namespace, package_url.name]) - vcs_tool = 'git' if 'github.com' in package_url.namespace else None - if vcs_tool: - vcs_url = form_vcs_url(vcs_tool, vcs_url) - # TODO: collect stats and counter from package_data too - package = scan_models.Package( - name=package_url.name, - namespace=package_url.namespace, - type=package_url.type, - primary_language='Go', - description=package_data.get('synopsis'), - homepage_url=homepage_url, - vcs_url=vcs_url, - ) - return package diff --git a/minecode/mappers/googlecode.py b/minecode/mappers/googlecode.py deleted file mode 100644 index b06bc538..00000000 --- a/minecode/mappers/googlecode.py +++ /dev/null @@ -1,127 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json - -from django.core.validators import URLValidator -from django.core.exceptions import ValidationError - -from packagedcode import models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper - - -@map_router.route('https://storage.googleapis.com/google-code-archive/v2/code.google.com/.*/project.json') -class GoogleNewAPIV2ProjectJsonMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Packages built from resource_uri record for a single - package version. - """ - # FIXME: JSON deserialization should be handled eventually by the - # framework - metadata = json.loads(resource_uri.data) - return build_packages_from_projectsjson_v2(metadata, resource_uri.package_url, uri) - - -def build_packages_from_projectsjson_v2(metadata, purl=None, uri=None): - """ - Yield Package built from Googlecode API json `metadata` mapping - which is a dictionary keyed by project name and values are metadata. - Yield as many Package as there are download URLs.
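The homepage_url join in build_golang_package() leans on the single trailing slash in 'https:/' to end up with a well-formed URL once the slash-containing namespace is appended; for a typical golang purl it works out as:

from packageurl import PackageURL

purl = PackageURL.from_string('pkg:golang/github.com/gorilla/mux')
'/'.join(['https:/', purl.namespace, purl.name])
# 'https://github.com/gorilla/mux'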
- metadata: json metadata content from API call - purl: String value of the package url of the ResourceURI object - """ - short_desc = metadata.get('summary') - long_desc = metadata.get('description') - descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) - common_data = dict( - datasource_id='googlecode_api_json', - type='googlecode', - name=metadata.get('name'), - description=description - ) - - license_name = metadata.get('license') - if license_name: - common_data['extracted_license_statement'] = license_name - common_data['license_detections'] = [] - - keywords = [] - labels = metadata.get('labels') - for label in labels: - if label: - keywords.append(label.strip()) - common_data['keywords'] = keywords - - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - package.set_purl(purl) - yield package - - -@map_router.route('https://www.googleapis.com/storage/v1/b/google-code-archive/o/v2.*project.json\?alt=media') -class GoogleNewAPIV1ProjectJsonMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Packages built from resource_uri record for a single - package version. - """ - # FIXME: JSON deserialization should be handled eventually by the - # framework - metadata = json.loads(resource_uri.data) - return build_packages_from_projectsjson_v1(metadata, resource_uri.package_url, uri) - - -def build_packages_from_projectsjson_v1(metadata, purl=None, uri=None): - """Yield Package from the project.json passed by the google code v1 API - metadata: json metadata content from API call - purl: String value of the package url of the ResourceURI object - """ - if metadata.get('name'): - common_data = dict( - datasource_id="googlecode_json", - type='googlecode', - name=metadata.get('name'), - description=metadata.get('description') - ) - - license_name = metadata.get('license') - if license_name: - common_data['extracted_license_statement'] = license_name - common_data['license_detections'] = [] - - keywords = [] - labels = metadata.get('labels') - for label in labels: - if label: - keywords.append(label.strip()) - common_data['keywords'] = keywords - - common_data['vcs_url'] = metadata.get('ancestorRepo') - common_data['namespace'] = metadata.get('domain') - - # createTime doesn't make sense since the timestamp value is incorrect - # and parsing it will give a wrong year out of range. - - # created_time = metadata.get('creationTime') - # if created_time: - # common_data['release_date'] = date.fromtimestamp(created_time) - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/gstreamer.py b/minecode/mappers/gstreamer.py deleted file mode 100644 index 8d953302..00000000 --- a/minecode/mappers/gstreamer.py +++ /dev/null @@ -1,51 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - - -from commoncode import fileutils -from packagedcode import models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper - - -@map_router.route('https://gstreamer.freedesktop.org/src/([\w\-\.]+/)*[\w\-\.]+[.tar\.bz2\\.gz|\.tar\.xz]') -class GstreamerURLMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single - package version. - """ - return build_package_from_url(resource_uri.uri, resource_uri.package_url) - - -def build_package_from_url(uri, purl=None): - """ - Return Package built from uri and package_url. - uri: String value of uri of the ResourceURI object. - purl: String value of the package url of the ResourceURI object - """ - file_name = fileutils.file_name(uri) - file_name_without_prefix = file_name - prefixes = ('.tar.bz2', '.tar.gz', '.tar.xz') - for prefix in prefixes: - file_name_without_prefix = file_name_without_prefix.replace(prefix, '') - if '-' in file_name_without_prefix: - project_name, _, version = file_name.rpartition('-') - common_data = dict( - type='gstreamer', - name=project_name, - version=version, - download_url=uri, - homepage_url='https://gstreamer.freedesktop.org' - ) - package = scan_models.Package(**common_data) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/haxe.py b/minecode/mappers/haxe.py deleted file mode 100644 index a8b6e594..00000000 --- a/minecode/mappers/haxe.py +++ /dev/null @@ -1,35 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json - -from packagedcode.haxe import HaxelibJsonHandler - -from minecode import map_router -from minecode.mappers import Mapper - - -@map_router.route('https://lib.haxe.org/p/[\w\-\.]+/[\w\-\.]+/raw-files/[\w\-\.]+/package.json') -class HaxePackageJsonMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from package json file. - """ - # FIXME: JSON deserialization should be handled eventually by the framework - metadata = json.loads(resource_uri.data) - return build_packages_with_json(metadata, resource_uri.package_url) - - -def build_packages_with_json(metadata, purl=None): - # yield package by getting package from the build_package parser in scancode - package = HaxelibJsonHandler._parse(json_data=metadata) - if package: - package.set_purl(purl) - yield package diff --git a/minecode/mappers/maven.py b/minecode/mappers/maven.py deleted file mode 100644 index a3ec9855..00000000 --- a/minecode/mappers/maven.py +++ /dev/null @@ -1,135 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - -import json -import logging -import packageurl -from packageurl import PackageURL - -from commoncode.text import as_unicode -from packagedcode.models import PackageData -from packagedcode.maven import _parse - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import parse_date -from minecode.visitors.maven import Artifact - - -TRACE = False - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -if TRACE: - import sys - logging.basicConfig(stream=sys.stdout) - logger.setLevel(logging.DEBUG) - - -@map_router.route('maven-index://.*') -class MavenIndexArtifactMapper(Mapper): - """ - Process the minimal artifacts collected for a Maven Jar or POM in an - index visit. - """ - - def get_packages(self, uri, resource_uri): - yield get_mini_package(resource_uri.data, uri, resource_uri.package_url) - - -def get_mini_package(data, uri, purl): - """ - Return a MavenPomPackage built from the minimal artifact data available in a - nexus index, given a `data` JSON string, a `uri` string and a `purl` - PacxkageURL string. Return None if the package cannot be built. - """ - if not data: - return - - artdata = json.loads(data) - - # FIXME: this should a slot in Artifact - download_url = artdata.pop('download_url') - # FIXME: what if this is an ArtifactExtended?? - artifact = Artifact(**artdata) - - if purl: - if isinstance(purl, str): - purl = PackageURL.from_string(purl) - assert isinstance(purl, PackageURL) - - qualifiers = None - if purl and purl.qualifiers: - qualifiers = packageurl.normalize_qualifiers( - purl.qualifiers, encode=False) - if qualifiers: - assert isinstance(qualifiers, dict) - logger.debug('get_mini_package: qualifiers: {}'.format(qualifiers)) - - package = PackageData( - type='maven', - namespace=artifact.group_id, - name=artifact.artifact_id, - version=artifact.version, - qualifiers=qualifiers, - description=artifact.description, - download_url=download_url, - release_date=parse_date(artifact.last_modified), - size=artifact.size, - sha1=artifact.sha1 or None, - ) - logger.debug('get_mini_package: package.qualifiers: {}'.format( - package.qualifiers)) - logger.debug( - 'get_mini_package for uri: {}, package: {}'.format(uri, package)) - return package - - -# FIXME this should be valid for any POM -@map_router.route('https?://repo1.maven.org/maven2/.*\.pom') -class MavenPomMapper(Mapper): - """ - Map a proper full POM visited as XML. - """ - - def get_packages(self, uri, resource_uri): - - logger.debug('MavenPomMapper.get_packages: uri: {}, resource_uri: {}, purl:' - .format(uri, resource_uri.uri, resource_uri.package_url)) - package = get_package(resource_uri.data, resource_uri.package_url) - if package: - logger.debug('MavenPomMapper.get_packages: uri: {}, package: {}' - .format(uri, package)) - yield package - - -def get_package(text, package_url=None, - baseurl='https://repo1.maven.org/maven2'): - """ - Return a ScannedPackage built from a POM XML string `text`. - """ - text = as_unicode(text) - package = _parse( - datasource_id='maven_pom', - package_type='maven', - primary_language='Java', - text=text - ) - if package: - # FIXME: this should be part of the parse call - if package_url: - purl = PackageURL.from_string(package_url) - package.set_purl(purl) - # Build proper download_url given a POM: this must be the URL for - # the Jar which is the key to the PackageDB record - # FIXME the download is hardcoded to Maven Central? 
- # package.download_url = package.repository_download_url(baseurl=baseurl) - return package diff --git a/minecode/mappers/npm.py b/minecode/mappers/npm.py deleted file mode 100644 index 0e13d5d5..00000000 --- a/minecode/mappers/npm.py +++ /dev/null @@ -1,61 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - - -import json -import logging - - -from packagedcode.npm import NpmPackageJsonHandler - -from minecode import map_router -from minecode.mappers import Mapper - - -TRACE = False - -logger = logging.getLogger(__name__) - -if TRACE: - import sys - logging.basicConfig(stream=sys.stdout) - logger.setLevel(logging.DEBUG) - - -# FIXME: This route may not work when we have scoped Packages or URLs to a specific version -# or yarn URLs -@map_router.route('https://registry.npmjs.org/[^\/]+') -class NpmPackageMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield NpmPackage built from a resource_uri record that contains many - npm versions for a given npm name. - """ - if not resource_uri.data: - return - visited_data = json.loads(resource_uri.data) - return build_packages(visited_data) - - -# FIXME: Consider using PURL here -def build_packages(data): - """ - Yield NpmPackage built from data corresponding to a single package name - and many npm versions. - """ - versions = data.get('versions', {}) - - logger.debug('build_packages: versions: ' + repr(type(versions))) - for version, data in versions.items(): - logger.debug('build_packages: version: ' + repr(version)) - logger.debug('build_packages: data: ' + repr(data)) - package = NpmPackageJsonHandler._parse(json_data=data) - if package: - yield package diff --git a/minecode/mappers/nuget.py b/minecode/mappers/nuget.py deleted file mode 100644 index 7b4b2c0d..00000000 --- a/minecode/mappers/nuget.py +++ /dev/null @@ -1,182 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json - -from bs4 import BeautifulSoup - -from packagedcode import models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper - - -@map_router.route('https://api.nuget.org/v3/catalog.+\.json') -class NugetPackageMapper(Mapper): - """ - Return NugetPackage object by parsing the ResourceURI stored in db referenced by the - nuget API URIs. 
- """ - - def get_packages(self, uri, resource_uri): - if not resource_uri.data: - return - pkg_data = json.loads(resource_uri.data) - return build_packages_with_json(pkg_data, resource_uri.package_url) - - -def build_packages_with_json(metadata, purl=None): - """ - Yield package from the json metadata passed - metadata: json metadata content from API call - purl: String value of the package url of the ResourceURI object - """ - licenseUrl = metadata.get('licenseUrl') - copyr = metadata.get('copyright') - - authors = [] - names = metadata.get('authors') - if names: - for name in names.split(','): - authors.append(scan_models.Party(name=name.strip(), role='author')) - - keywords = metadata.get('tags', []) - - # TODO: the content has the SHA512, our model may extend to SHA512 - - if name: - short_desc = metadata.get('summary') - long_desc = metadata.get('description') - if long_desc == short_desc: - long_desc = None - descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) - package_mapping = dict( - type='nuget', - name=metadata['id'], - version=metadata['version'], - homepage_url=metadata.get('projectUrl'), - description=description, - extracted_license_statement=licenseUrl, - license_detections=[], - copyright=copyr, - parties=authors, - keywords=keywords, - ) - package = scan_models.PackageData.from_data( - package_data=package_mapping) - package.set_purl(purl) - yield package - - -@map_router.route('https://api.nuget.org/packages/.*\.nupkg') -class NugetNUPKGDownloadMapper(Mapper): - """ - Return NugetPackage object by parsing the download URL. - For example: https://api.nuget.org/packages/entityframework.4.3.1.nupkg - """ - - def get_packages(self, uri, resource_uri): - if not resource_uri.data: - return - pkg_data = json.loads(resource_uri.data) - return build_packages_with_nupkg_download_url(pkg_data, resource_uri.package_url, resource_uri.uri) - - -def build_packages_with_nupkg_download_url(metadata, purl, uri): - if purl: - package = scan_models.PackageData( - type='nuget', - name=purl.name, - download_url=uri - ) - package.set_purl(purl) - yield package - - -@map_router.route('https://www.nuget.org/packages/[\w\-\.]+', - 'https://www.nuget.org/packages/[\w\-\.]+/[\w\-\.]+') -class NugetHTMLPackageMapper(Mapper): - """ - Return NugetPackage object by parsing the package HTML content. - For example: https://www.nuget.org/packages/log4net - """ - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri data. - """ - metadata = resource_uri.data - build_packages_from_html( - metadata, resource_uri.uri, resource_uri.package_url) - - -def build_packages_from_html(metadata, uri, purl=None): - """ - Yield Package built from Nuget a `metadata` content - metadata: json metadata content - uri: the uri of the ResourceURI object - purl: String value of the package url of the ResourceURI object - """ - download_url_format = 'https://www.nuget.org/api/v2/package/{name}/{version}' - soup = BeautifulSoup(metadata, 'lxml') - h1 = soup.find('h1') - if h1 and h1.contents: - license_value = None - name = str(h1.contents[0]).strip() - for a in soup.find_all('a'): - if a.get('data-track') and a.get('data-track') == 'outbound-license-url': - license_value = a.string - if license_value: - license_value = str(license_value).strip() - - copyright_value = None - h2s = soup.find_all('h2') - for h2 in h2s: - # Copyright will be after the copyright h2 node - # The exmaple is like this: - #
-            # <h2>Copyright</h2>
-            # <p>Copyright 2004-2017 The Apache Software Foundation</p>
- if h2.string and h2.string == 'Copyright': - next_element = h2.find_next_sibling('p') - if next_element: - copyright_value = next_element.string - - description = None - for m in soup.find_all('meta'): - if m.get('property') and m.get('property') == 'og:description' and m.get('content'): - description = m.get('content') - - for tbody in soup.find_all('tbody'): - if tbody.get('class') and tbody.get('class')[0] == 'no-border': - for a in tbody.find_all('a'): - version = a.string - if not version or not version.strip(): - continue - version = version.strip() - download_url = download_url_format.format( - name=name, version=version) - package_mapping = dict( - datasource_id="nuget_metadata_json", - name=name, - type='nuget', - version=version, - homepage_url=uri, - description=description, - download_url=download_url, - extracted_license_statement=license_value, - license_detections=[], - copyright=copyright_value - ) - package = scan_models.Package.from_package_data( - package_data=package_mapping, - datafile_path=uri, - ) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/openssl.py b/minecode/mappers/openssl.py deleted file mode 100644 index c082cd52..00000000 --- a/minecode/mappers/openssl.py +++ /dev/null @@ -1,69 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -from datetime import datetime -import logging - -from commoncode import fileutils -from packagedcode import models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import parse_date - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - - -@map_router.route('https://ftp.openssl.org/.*') -class OpenSSLMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield ScannedPackage built from resource_uri record for a single package - version. Yield as many Package from the uri - """ - return build_packages(resource_uri, resource_uri.package_url) - - -def build_packages(resource_uri, purl=None): - """ - Yield Package from resource_uri metadata - resource_uri: ResourceURI object - purl: String value of the package url of the ResourceURI object - """ - uri = resource_uri.uri - file_name = fileutils.file_name(uri) - version = file_name.replace('.tar.gz', '').replace('openssl-', '').replace('.tar.gz', '').replace( - '.asc', '').replace('.md5', '').replace('.sha1', '').replace('.sha256', '') - common_data = dict( - datasource_id="openssl_metadeta", - type='generic', - name=file_name, - description='The OpenSSL Project is a collaborative effort to develop a robust, commercial-grade, fully featured, and Open Source toolkit implementing the Transport Layer Security (TLS) protocols (including SSLv3) as well as a full-strength general purpose cryptographic library.', - version=version, - size=resource_uri.size, - release_date=parse_date(resource_uri.last_modified_date), - extracted_license_statement='OpenSSL License', - license_detections=[], - homepage_url='https://www.openssl.org/', - download_url=uri, - copyright='Copyright (c) 1998-2018 The OpenSSL Project\nCopyright (c) 1995-1998 Eric A. Young, Tim J. 
Hudson\nAll rights reserved.', - vcs_url='git+https://github.com/openssl/openssl.git', - code_view_url='https://github.com/openssl/openssl', - bug_tracking_url='https://github.com/openssl/openssl/issues', - ) - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/openwrt.py b/minecode/mappers/openwrt.py deleted file mode 100644 index 7c1b9ec1..00000000 --- a/minecode/mappers/openwrt.py +++ /dev/null @@ -1,90 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json -import logging - -from packagedcode import models as scan_models - -from minecode import debutils -from minecode import map_router -from minecode.mappers import Mapper -from minecode.mappers.debian import get_dependencies - - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - - -""" -OpenWRT IPK package data files are using the Deb822 format. -""" - - -@map_router.route('https://downloads.openwrt.org/.*\.ipk') -class OpenwrtIpkMetadataMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield ScannedPackage built from resource_uri record for a single package - version. Yield as many Package as there are download URLs. - """ - metadata = json.loads(resource_uri.data) - return build_packages(metadata, resource_uri.package_url, uri) - - -def build_packages(metadata, purl=None, uri=None): - """ - Yield ScannedPackage built from the passing metadata. - metadata: metadata mapping - purl: String value of the package url of the ResourceURI object - """ - common_data = dict( - type='openwrt', - datasource_id='openwrt_metadata', - name=metadata.get('Package'), - version=metadata.get('Version'), - description=metadata.get('Description'), - size=metadata.get('Installed-Size'), - ) - - dependencies = get_dependencies(metadata, ['Depends']) - if dependencies: - common_data['dependencies'] = dependencies - - maintainers = metadata.get('Maintainer') - if maintainers: - name, email = debutils.parse_email(maintainers) - if name: - parties = common_data.get('parties') - if not parties: - common_data['parties'] = [] - party = scan_models.Party( - name=name, role='maintainer', email=email) - common_data['parties'].append(party) - - lic = metadata.get('License') - if lic: - common_data['declared_license'] = lic - - common_data['keywords'] = [] - section = metadata.get('Section') - if section: - common_data['keywords'].append(section) - architecture = metadata.get('Architecture') - if architecture: - common_data['keywords'].append(architecture) - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/repomd.py b/minecode/mappers/repomd.py deleted file mode 100644 index 10b8bf66..00000000 --- a/minecode/mappers/repomd.py +++ /dev/null @@ -1,31 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. 
-# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json - -from packagedcode.models import PackageData - -from minecode import map_router - - -@map_router.route('.+/repomd.xml') -def map_repomd_data(uris, resource_uri): - """ - Returns a list of RpmPackage objects collected from visitors. - """ - if not resource_uri.data: - return - packages = [] - for pkg_data in json.loads(resource_uri.data): - # 'name' is required for every package - # FIXME: how could we obtain a package without a name??? - # FIXME: This cannot work unless we use **pkg_data - if pkg_data.get('name'): - packages.append(PackageData(pkg_data)) - return packages diff --git a/minecode/visitors/__init__.py b/minecode/miners/__init__.py similarity index 92% rename from minecode/visitors/__init__.py rename to minecode/miners/__init__.py index 0120c82d..f353e8c5 100644 --- a/minecode/visitors/__init__.py +++ b/minecode/miners/__init__.py @@ -7,13 +7,9 @@ # See https://aboutcode.org for more information about nexB OSS projects. # - from functools import total_ordering -import gzip import json -import os import pkgutil -import tempfile from minecode.utils import fetch_http from minecode.utils import get_temp_file @@ -245,11 +241,29 @@ def loads(self, content): return json.loads(content) +class Mapper(object): + """ + Abstract base class for mappers. Subclasses must implement the + get_packages() method and use a routing decorator for the URIs they can + handle. + """ + def __call__(self, uri, resource_uri): + # Note: we let exceptions bubble up and they will be caught and + # processed by the worker loop + return self.get_packages(uri, resource_uri) + + def get_packages(self, uri, resource_uri): + """ + This method must yield ScannedPackage objects (or return a list) built + from a resource_uri ResourceURI object. + """ + raise NotImplementedError + + """ Minimal way to recursively import all submodules dynamically. If this module is imported, all submodules will be imported: this triggers the actual registration -of visitors. -This should stay as the last import in this init module. +of miners. This should stay as the last import in this init module. """ for _, name, _ in pkgutil.walk_packages(__path__, prefix=__name__ + '.'): __import__(name) diff --git a/minecode/visitors/apache.py b/minecode/miners/apache.py similarity index 52% rename from minecode/visitors/apache.py rename to minecode/miners/apache.py index 382f5058..e12619ec 100644 --- a/minecode/visitors/apache.py +++ b/minecode/miners/apache.py @@ -2,21 +2,30 @@ # Copyright (c) 2016 by nexB, Inc. http://www.nexb.com/ - All rights reserved. 
# -from __future__ import absolute_import -from __future__ import unicode_literals - from itertools import chain +import json +import logging +from commoncode import fileutils from packageurl import PackageURL +import packagedcode.models as scan_models from minecode import ls from minecode import seed +from minecode import map_router from minecode import visit_router +from minecode.miners import Mapper +from minecode.miners import HttpVisitor +from minecode.miners import HttpJsonVisitor +from minecode.miners import NonPersistentHttpVisitor +from minecode.miners import URI +from minecode.utils import parse_date + -from minecode.visitors import HttpVisitor -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import NonPersistentHttpVisitor -from minecode.visitors import URI +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +logger.addHandler(handler) +logger.setLevel(logging.INFO) """ @@ -335,3 +344,237 @@ def get_uris(self, content): data=project_meta, source_uri=self.uri, visited=True) + + +# common licenses found in JSON +APACHE_LICENSE_URL = { + 'http://usefulinc.com/doap/licenses/asl20', + 'https://usefulinc.com/doap/licenses/asl20', + 'http://spdx.org/licenses/Apache-2.0', + 'https://spdx.org/licenses/Apache-2.0', + 'http://www.apache.org/licenses/LICENSE-2.0', + 'https://www.apache.org/licenses/LICENSE-2.0', + 'http://www.apache.org/licenses/LICENSE-2.0.txt', + 'https://www.apache.org/licenses/LICENSE-2.0.txt', + 'http://www.apache.org/licenses/', + 'http://forrest.apache.org/license.html', + 'https://svn.apache.org/repos/asf/tomee/tomee/trunk/LICENSE', +} + + +# FIXME: this is NOT specific to a download URL but to a project: disabled for now +# @map_router.route('https://projects.apache.org/json/foundation/projects.json') +class ApacheProjectJsonMapper(Mapper): + + def get_packages(self, uri, resource_uri): + """ + Yield Packages built from resource_uri record for a single + package version. + """ + metadata = json.loads(resource_uri.data) + return build_packages_from_projects(metadata, uri=uri) + + +def build_packages_from_projects(metadata, uri=None): + """ + Yield Package built from Apache a `metadata` mapping + which is a dictionary keyed by project name and values are project_metadata. + Yield as many Package as there are download URLs. 
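+
+    A minimal, hypothetical record, trimmed to the fields read below (the
+    values are illustrative, not real project data):
+
+        {"apr": {"shortdesc": "Apache Portable Runtime",
+                 "description": "A portability library",
+                 "homepage": "http://apr.apache.org/",
+                 "license": "http://usefulinc.com/doap/licenses/asl20",
+                 "category": "libraries",
+                 "programming-language": "C"}}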
+ """ + for project_name, project_meta in metadata.items(): + short_desc = project_meta.get('shortdesc') + long_desc = project_meta.get('description') + descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] + description = '\n'.join(descriptions) + common_data = dict( + datasource_id="apache_json", + type='apache', + name=project_name, + description=description, + homepage_url=project_meta.get('homepage'), + bug_tracking_url=project_meta.get('bug-database'), + primary_language=project_meta.get('programming-language'), + ) + + # FIXME: setting the download-page as the download_url is not right + if project_meta.get('download-page'): + download_url = project_meta.get('download-page') + common_data['download_url'] = download_url + for repo in project_meta.get('repository', []): + common_data['code_view_url'] = repo + # Package code_view_url only support one URL, so break when + # finding a code_view_url + break + + maintainers = project_meta.get('maintainer', []) + for maintainer in maintainers: + mailbox = maintainer.get('mbox', '').replace('mailto:', '') + name = maintainer.get('name') + party = scan_models.Party(type=scan_models.party_person, name=name, role='maintainer', email=mailbox) + parties = common_data.get('parties') + if not parties: + common_data['parties'] = [] + common_data['parties'].append(party.to_dict()) + + # license is just a URL in the json file, for example: + # http://usefulinc.com/doap/licenses/asl20 + license_url = project_meta.get('license') + common_data['extracted_license_statement'] = license_url + + if license_url in APACHE_LICENSE_URL: + common_data['declared_license_expression'] = 'apache-2.0' + common_data['declared_license_expression_spdx'] = 'Apache-2.0' + common_data['license_detections'] = [] + + keywords = [] + category = project_meta.get('category', '') + for kw in category.split(','): + kw = kw.strip() + if kw: + keywords.append(kw) + common_data['keywords'] = keywords + + common_data['primary_language'] = project_meta.get('programming-language') + + # FIXME: these cannot be related to actual packages with a download URL + releases = project_meta.get('release') + if releases: + for release in releases: + rdata = dict(common_data) + rdata['version'] = release.get('revision') + if release.get('created') and len(release.get('created')) == 10: + rdata['release_date'] = parse_date(release.get('created')) + else: + logger.warn('Unexpected date format for release date: {}'.format(release.get('created'))) + package = scan_models.Package.from_package_data( + package_data=rdata, + datafile_path=uri, + ) + yield package + else: + package = scan_models.Package.from_package_data( + package_data=common_data, + datafile_path=uri, + ) + yield package + + +# FIXME: this is NOT specific to a download URL but to a project: disabled for now +# FIXME: this is casting too wide a net! +# @map_router.route('http?://[\w\-\.]+.incubator.apache.org/"') +class ApachePodlingsMapper(Mapper): + + def get_packages(self, uri, resource_uri): + """ + Yield Packages built from resource_uri record for a single + package version. + """ + metadata = json.loads(resource_uri.data) + return build_packages_from_podlings(metadata, resource_uri.package_url) + + +def build_packages_from_podlings(metadata, purl): + """ + Yield Package built from Apache podlings metadata + which is a dictionary keyed by project name and values are project_metadata. + Yield as many Package as there are download URLs. 
+ """ + name = metadata.get('name') + if name: + common_data = dict( + type='apache-podling', + name=name, + description=metadata.get('description'), + homepage_url=metadata.get('homepage'), + ) + package = scan_models.Package(**common_data) + package.set_purl(purl) + yield package + + +@map_router.route('http?s://(archive\.)?apache\.org/dist/.*') +class ApacheDownloadMapper(Mapper): + + def get_packages(self, uri, resource_uri): + """ + Yield Packages build from a bare download URI or download checksum URI. + """ + if uri.endswith(CHECKSUM_EXTS): + # 1. create a regular package from the URL stripped from its checksum extension + archive_uri, _, checksum_type = uri.rpartition('.') + + pack = build_package_from_download(archive_uri, resource_uri.package_url) + # 2. collect the checksum inside the file + # and attach it to the package + checksum_value = resource_uri.data.strip() + if checksum_value: + checksum_field_name = 'download_{checksum_type}'.format(**locals()) + setattr(pack, checksum_field_name, checksum_value) + yield pack + else: + # a plain download URI + yield build_package_from_download(uri, resource_uri.package_url) + + +def build_package_from_download(uri, purl=None): + """ + Return a Package built from an Apache dist download archive URL. + + The uri could be: + http://archive.apache.org/dist/groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip + https://apache.org/dist/chemistry/opencmis/1.1.0/chemistry-opencmis-dist-1.1.0-server-webapps.zip + """ + name, version = get_name_version(uri) + if purl: + purl = PackageURL.from_string(purl) + if not name: + name = purl.name + # FIXME: use purl data?? + package = scan_models.Package( + type='apache', + namespace=purl.namespace, + name=name, + version=version, + download_url=uri, + ) + package.set_purl(purl) + return package + + +# FIXME: there should be only one such method and this one is rather weak +def get_name_version(uri): + """ + Return name and version extracted from a path. + """ + # base_url will end being 'https://archive.apache.org/dist' or 'https://apache.org/dist' + # path is the uri without base url, for example: + # /groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip + _, _, path = uri.partition('apache.org/dist/') + base_name = fileutils.file_base_name(path) + version = None + package_name = '' + name_segments = base_name.split('-') + for segment in name_segments: + try: + # To test if each split segment with . is integer. + # For example in '1.2.3' all chars are integer or period. + # If so, this segment is a version segment. + if version: + # The segment after integer segment should belong to version too. + # For example: turbine-4.0-M1, after detecting 4.0, + # M1 should be including in version too, so the final version is 4.0-M1 + version = '-'.join([version, segment]) + continue + + is_all_int = all(n.isdigit() for n in segment.split('.')) + if is_all_int: + version = segment + except ValueError: + # Connect the package_name with - because we split it with - eariler, util + # when we meet version, package_name should be good. + if not package_name: + package_name = segment + else: + package_name = ('-').join([package_name, segment]) + continue + return package_name, version diff --git a/minecode/visitors/bitbucket.py b/minecode/miners/bitbucket.py similarity index 65% rename from minecode/visitors/bitbucket.py rename to minecode/miners/bitbucket.py index 4595c23d..8e7ae845 100644 --- a/minecode/visitors/bitbucket.py +++ b/minecode/miners/bitbucket.py @@ -3,18 +3,18 @@ # Copyright (c) 2018 by nexB, Inc. 
 #
-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import unicode_literals
-
+import json
 import logging
 
+from packagedcode import models as scan_models
 from packageurl import PackageURL
 
 from minecode import seed
+from minecode import map_router
 from minecode import visit_router
-from minecode.visitors import HttpJsonVisitor
-from minecode.visitors import URI
+from minecode.miners import Mapper
+from minecode.miners import HttpJsonVisitor
+from minecode.miners import URI
 
 
 logger = logging.getLogger(__name__)
@@ -197,3 +197,114 @@ def get_repo_uris(repo_data, source_uri):
             # paginated, we want them all
             url += '?pagelen=100'
         yield URI(uri=url, package_url=package_url, source_uri=source_uri)
+
+
+@map_router.route(
+    'https://api.bitbucket\.org/2\.0/repositories/.*/downloads/',
+)
+class BitbucketDownloadMapper(Mapper):
+    """
+    Build packages from download urls if present.
+    """
+
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Package built from resource_uri record for a single package version.
+        """
+        downloads_data = json.loads(resource_uri.data)
+        for download_data in downloads_data.get('values', []):
+            for package in build_bitbucket_download_packages(
+                    download_data, resource_uri.package_url):
+                yield package
+
+
+def build_bitbucket_download_packages(download_data, purl):
+    """
+    Yield scanned Packages for each download of
+    https://api.bitbucket.org/2.0/repositories/pypa/setuptools/downloads/
+    """
+    purl = PackageURL.from_string(purl)
+    namespace = purl.namespace
+    name = purl.name
+
+    # FIXME: add these ?
+    filename = download_data.get('name')
+    download_counts = download_data.get('downloads', 0)
+
+    download_url = download_data.get('links', {}).get('self', {}).get('href')
+    size = download_data.get('size')
+
+    package = scan_models.Package(
+        type='bitbucket',
+        name=name,
+        namespace=namespace,
+        download_url=download_url,
+        size=size,
+    )
+    package.set_purl(purl)
+    yield package
+
+
+# @map_router.route('https://api.bitbucket.org/2.0/repositories/[^\/]*/[^\/]*')
+class BitbucketIndexMapper(Mapper):
+    """
+    Build a Package for a repo.
+    """
+    def get_packages(self, uri, resource_uri):
+        repo = json.loads(resource_uri.data)
+        if not repo:
+            return
+        yield build_bitbucket_repo_package(repo, resource_uri.package_url)
+
+
+# FIXME: disabled as this is for a package template
+# @map_router.route('https://api.bitbucket.org/2.0/repositories/[^\/]*/[^\/]*')
+class BitbucketRepoMapper(Mapper):
+    """
+    Build a Package for a repo.
+    """
+    def get_packages(self, uri, resource_uri):
+        repo = json.loads(resource_uri.data)
+        if not repo:
+            return
+        yield build_bitbucket_repo_package(repo, resource_uri.package_url)
+
+
+def build_bitbucket_repo_package(repo_data, purl):
+    """
+    Return a Package "template" from repository data.
+    Note: this is not version-specific and has no download URL.
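+
+    For example, given the purl "pkg:bitbucket/pypa/setuptools" and an "scm"
+    value of "git" (both illustrative), the repository URL computed below
+    would be "git+https://bitbucket.org/pypa/setuptools".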
+ """ + purl = PackageURL.from_string(purl) + scm_protocol = repo_data.get('scm') + if not scm_protocol: + scm_protocol = 'git' + bb_url = '{protocol}+https://bitbucket.org/{namespace}/{name}'.format(protocol=scm_protocol, **purl.to_dict()) + + owner = repo_data.get('owner') + owner_party = scan_models.Party( + type=scan_models.party_person, + name=owner.get('username'), + role='owner', + url=owner.get('links', {}).get('html', {}).get('href', {}) + ) + + if repo_data.get('has_issues'): + bug_tracking_url = bb_url + '/issues' + else: + bug_tracking_url = None + + package = scan_models.Package( + type=purl.type, + namespace=purl.namespace, + name=purl.name, + homepage_url=repo_data.get('website') or bb_url, + code_view_url=bb_url + '/src', + bug_tracking_url=bug_tracking_url, + description=repo_data.get('description'), + vcs_url=bb_url, + primary_language=repo_data.get('language'), + parties=[owner_party], + ) + package.set_purl(purl) + return package diff --git a/minecode/mappers/bower.py b/minecode/miners/bower.py similarity index 63% rename from minecode/mappers/bower.py rename to minecode/miners/bower.py index c27f6fc3..811f6aee 100644 --- a/minecode/mappers/bower.py +++ b/minecode/miners/bower.py @@ -11,9 +11,75 @@ from packagedcode import models as scan_models from packagedcode.models import DependentPackage +from packageurl import PackageURL +from minecode import seed from minecode import map_router -from minecode.mappers import Mapper +from minecode import visit_router +from minecode.miners import Mapper +from minecode.miners import HttpJsonVisitor +from minecode.miners import URI + + +class BowerSeed(seed.Seeder): + + def get_seeds(self): + yield 'https://registry.bower.io/packages' + + +@visit_router.route('https://registry.bower.io/packages') +class BowerTopJsonVisitor(HttpJsonVisitor): + """ + Collect URIs for all packages from the json returned. + """ + + def get_uris(self, content): + """ + The json content is a list with name and url, like the following format: + ... + { + "name": "bello", + "url": "https://github.com/QiaoBuTang/bello.git" + }, + { + "name": "bello-gfw", + "url": "https://gitcafe.com/GilbertSun/bello.git" + }, + ... + The url could be in the following formats like github, loglg, gitcafe, bitbuckets etc. + # FIXME: We should cover all urls beyond the above four categories. 
+ """ + github_base_url = 'https://raw.githubusercontent.com/{owner}/{name}/master/bower.json' + lolg_base_url = 'https://lolg.it/{owner}/{name}/raw/master/bower.json' + gitcafe_base_url = 'https://coding.net/u/{owner}/p/{name}/git/raw/master/bower.json' + bitbucket_base_url = 'https://bitbucket.org/{owner}/{name}/raw/master/bower.json' + base_url_map = { + 'https://github.com/': github_base_url, + 'https://lolg.it/': lolg_base_url, + 'https://gitcafe.com/': gitcafe_base_url, + 'https://bitbucket.org/': bitbucket_base_url + } + for entry in content: + name = entry.get('name') + url = entry.get('url') + if name in url: + owner = None + package_url = PackageURL(type='bower', name=name).to_string() + for host_name, base_url in base_url_map.iteritems(): + if url.startswith(host_name): + owner = url[len(host_name): url.index(name) - 1] + yield URI(uri=base_url.format(owner=owner, name=name), package_url=package_url, source_uri=self.uri) + + +@visit_router.route('https://raw.githubusercontent.com/.*/master/bower.json', + 'https://lolg.it/.*/master/bower.json', + 'https://coding.net/.*/master/bower.json', + 'https://bitbucket.org/*/master/bower.json') +class BowerJsonVisitor(HttpJsonVisitor): + """ + Collect content of the json itself by the visitor. + """ + pass @map_router.route('https://raw.githubusercontent.com/.*/master/bower.json', diff --git a/minecode/visitors/conan.py b/minecode/miners/conan.py similarity index 100% rename from minecode/visitors/conan.py rename to minecode/miners/conan.py diff --git a/minecode/mappers/cpan.py b/minecode/miners/cpan.py similarity index 61% rename from minecode/mappers/cpan.py rename to minecode/miners/cpan.py index 4ea7fce4..b6421562 100644 --- a/minecode/mappers/cpan.py +++ b/minecode/miners/cpan.py @@ -7,18 +7,197 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from collections import OrderedDict import json +from bs4 import BeautifulSoup +from packageurl import PackageURL import packagedcode.models as scan_models import saneyaml -from packageurl import PackageURL +from minecode import seed from minecode import map_router -from minecode.mappers import Mapper +from minecode import visit_router +from minecode.miners import Mapper +from minecode.miners import HttpJsonVisitor +from minecode.miners import HttpVisitor +from minecode.miners import URI from minecode.utils import parse_date +class CpanSeed(seed.Seeder): + + def get_seeds(self): + yield 'http://www.cpan.org/modules/01modules.index.html' + author_search_template = 'https://fastapi.metacpan.org/author/_search?q=email:{char}*&size=5000' + for char in 'abcdefghijklmnopqrstuvwxyz'.split(): + yield author_search_template.format(char) + +# The idea of CPAN API visitor is based on +# https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md +# +# From the doc: You can certainly scroll if you are fetching less than 5,000 +# items. You might want to do this if you are expecting a large data set, but +# will still need to run many requests to get all of the required data. +# +# To get all results for sure it's over 5000, we should use search twice based +# on author and release. +# +# First get all authors by searching email from a-z, then get all releases based +# on each author. It will make the returned result a small set. + +# For example: + +# First try to reach the author search, the following search URL will get all +# authors whose email starts with 'a', this will loop from 'a' to 'z. 
+ +# https://fastapi.metacpan.org/author/_search?q=email:a*&size=5000 + +# If we get the Author ID in above returned json, we can pass to release search +# URL as follows, it will get all releases from the passing author. + +# https://fastapi.metacpan.org/release/_search?q=author:ABERNDT&size=5000 + + +@visit_router.route('https://fastapi.metacpan.org/author/_search\?q=email:[a-z]\*&size=5000') +class MetaCpanAuthorURLVisitors(HttpJsonVisitor): + """ + Run search on author's email, and parse the returned json content and form + the MetaCpanRleaseURLVisitors' URL by adding AUTHOR condition. For example: + https://fastapi.metacpan.org/author/_search?q=email:a*&size=5000 a* stands + for all email which starts with 'a', and it's the same with 'A' as email is + case insensitive. The visitor will cover all cases from a to z, and yield + the search URLs by passing each author in the release searching URL + """ + + def get_uris(self, content): + release_visitor_template = 'https://fastapi.metacpan.org/release/_search?q=author:{id}&size=5000' + hits = content.get('hits', {}) + inner_hits = hits.get('hits', []) + for hit in inner_hits: + _id = hit.get('_id') + if not _id: + continue + yield URI(uri=release_visitor_template.format(id=_id), source_uri=self.uri) + + +@visit_router.route('https://fastapi.metacpan.org/release/_search\?q=author:\w+&size=5000') +class MetaCpanRleaseURLVisitors(HttpJsonVisitor): + """ + Run the release results by searching the passing AUTHOR ID. The visitor will + yield the json whose author ID is the passing author info. The + implementation if the class is empty, it just returns for mapper use of the + json content. + """ + pass + + +@visit_router.route('http://www.cpan.org/modules/01modules.index.html') +class CpanModulesVisitors(HttpVisitor): + """ + Return URIs by parsing the HTML page of cpan modules page. + """ + def get_uris(self, content): + """ + Return the uris of authors pages, the returning URIs will be an input of + CpanProjectHTMLVisitors + """ + page = BeautifulSoup(content, 'lxml') + url_template = 'http://www.cpan.org/{path}' + for a in page.find_all(name='a'): + if 'href' not in a.attrs: + continue + + url = a['href'] + if not url: + continue + + if url.startswith('../authors'): + if url.endswith(('.zip', '.tar.gz')): + # Skip tar.gz since it will be captured by the CpanProjectHTMLVisitors + continue + else: + url = url_template.format(path=url[3:]) + yield URI(uri=url, source_uri=self.uri) + + +@visit_router.route('http://www.cpan.org/authors/.*/') +class CpanProjectHTMLVisitors(HttpVisitor): + """ + Visit the HTML page of cpan project page and return the Packages info, HTML + data and error. + """ + def get_uris(self, content): + """ + Return the uris by looking for the tar.gz in the html, and then forming + the uri for meta and readme files + """ + page = BeautifulSoup(content, 'lxml') + if self.uri.endswith('/'): + url_template = self.uri + '{path}' + else: + url_template = self.uri + '/{path}' + for a in page.find_all(name='a'): + if 'href' not in a.attrs: + continue + + url = a['href'] + if not url: + continue + + if url.startswith(('/', '?')): + continue # Avoid the directory and other non-file links + else: + name = url + name = name.replace('tar.gz', ''). 
replace('.readme', '').replace('.meta', '') + partions = name.rpartition('-') + name = partions[0] + version = partions[-1] + package_url = None + if name and version: + package_url = PackageURL(type='cpan', name=name, version=version).to_string() + url = url_template.format(path=url) + yield URI(uri=url, package_url=package_url, source_uri=self.uri) + + +@visit_router.route('http://www.cpan.org/.*.meta') +class CpanMetaVisitors(HttpVisitor): + """ + Visit the meta file and return the meta data of the Package The goal + of this visitor is to get the content instead of returning any valid + uris. + """ + pass + + +@visit_router.route('http://www.cpan.org/.*.readme') +class CpanReadmeVisitors(HttpVisitor): + """ + Visit the readme file and translate to json and dump it and return for mapper use. + """ + + def dumps(self, content): + """ + Return the json by parsing the readme content + """ + # Handle bytes properly in python3 + if type(content) == bytes: + content = content.decode('utf-8') + + lines = content.splitlines() + readme_dict = dict() + body = [] + head = None + for line in lines: + if len(line) > 1 and line.isupper() and line[0] != ' ': + if head: + readme_dict[head] = '\n'.join(body).lstrip('\n').rstrip('\n') + head = line + body = [] + else: + body.append(line.strip()) + return json.dumps(readme_dict) + + @map_router.route('https://fastapi.metacpan.org/release/_search\?q=author:\w+&size=5000') class MetaCpanReleaseSearchMapper(Mapper): @@ -158,7 +337,7 @@ def build_packages_from_metafile(metadata, uri=None, purl=None): # FIXME: it does not make sense to use a single functin tod eal with the two # formats IMHO if is_json(metadata): - content = json.loads(metadata, object_pairs_hook=OrderedDict) + content = json.loads(metadata) else: content = saneyaml.load(metadata) diff --git a/minecode/mappers/cran.py b/minecode/miners/cran.py similarity index 83% rename from minecode/mappers/cran.py rename to minecode/miners/cran.py index d63b98e9..1ae46db8 100644 --- a/minecode/mappers/cran.py +++ b/minecode/miners/cran.py @@ -8,10 +8,15 @@ # from bs4 import BeautifulSoup +from packageurl import PackageURL import packagedcode.models as scan_models from minecode import map_router -from minecode.mappers import Mapper +from minecode import seed +from minecode import visit_router +from minecode.miners import Mapper +from minecode.miners import HttpVisitor +from minecode.miners import URI from minecode.utils import parse_date @@ -19,6 +24,34 @@ CRAN_WEB_URL = CRAN_URL + 'web/' +class CranSeed(seed.Seeder): + + def get_seeds(self): + yield 'https://cloud.r-project.org/web/packages/available_packages_by_date.html' + + +@visit_router.route('https://cloud.r-project.org/web/packages/available_packages_by_date.html') +class CranPackagesVisitors(HttpVisitor): + """ + Return URIs by parsing the HTML content of the page + """ + def get_uris(self, content): + base_url = 'https://cloud.r-project.org/web/packages/{package}/index.html' + a_blocks = BeautifulSoup(content, 'lxml').find_all('a') + for a in a_blocks: + package = a.text + package_url = PackageURL(type='cran', name=package).to_string() + yield URI(uri=base_url.format(package=package), package_url=package_url, source_uri=self.uri) + + +@visit_router.route('https://cloud.r-project.org/web/packages/[\w\-\.]/index.html') +class CranSinglePackageVisitor(HttpVisitor): + """ + Return only the HTML content of the page, and will be parsed in mapper + """ + pass + + @map_router.route('https://cloud.r-project.org/web/packages/[\w\-\.]/index.html') class 
CranMetaFileMapper(Mapper): diff --git a/minecode/visitors/debian.py b/minecode/miners/debian.py similarity index 63% rename from minecode/visitors/debian.py rename to minecode/miners/debian.py index 1e1ff956..84fb6fb7 100644 --- a/minecode/visitors/debian.py +++ b/minecode/miners/debian.py @@ -7,7 +7,7 @@ # See https://aboutcode.org for more information about nexB OSS projects. # - +from collections import defaultdict import attr import gzip import json @@ -19,19 +19,23 @@ from debian_inspector import debcon from debian_inspector import copyright as debcopy from debian_inspector.version import Version as DebVersion -from packagedcode.models import PackageData +from packagedcode import models as scan_models from packagedcode.debian import DebianDscFileHandler from packagedcode.debian_copyright import StandaloneDebianCopyrightFileHandler from packageurl import PackageURL +from minecode import debutils from minecode import ls from minecode import seed +from minecode import map_router from minecode import visit_router from minecode import priority_router -from minecode.visitors import HttpVisitor -from minecode.visitors import NonPersistentHttpVisitor -from minecode.visitors import URI +from minecode.miners import HttpVisitor +from minecode.miners import Mapper +from minecode.miners import NonPersistentHttpVisitor +from minecode.miners import URI from minecode.utils import fetch_and_write_file_from_url +from minecode.utils import form_vcs_url from minecode.utils import get_package_sha1 from packagedb.models import make_relationship from packagedb.models import PackageContentType @@ -388,7 +392,7 @@ def map_debian_package(debian_package, package_content, pipelines, priority=0): logger.error(msg) return db_package, error - purl_package = PackageData( + purl_package = scan_models.PackageData( type=purl.type, namespace=purl.namespace, name=purl.name, @@ -754,3 +758,407 @@ def set_debian_directories(self): self.archive_directory_url = package_directory self.metadata_directory_url = metadata_directory + + +# FIXME: We are not returning download URLs. Returned information is incorrect + + +def get_dependencies(data): + """ + Return a list of DependentPackage extracted from a Debian `data` mapping. + """ + scopes = { + 'Build-Depends': dict(is_runtime=False, is_optional=True), + 'Depends': dict(is_runtime=True, is_optional=False), + 'Pre-Depends': dict(is_runtime=True, is_optional=False), + # 'Provides': dict(is_runtime=True, is_optional=False), + # 'Recommends': dict(is_runtime=True, is_optional=True), + # 'Suggests': dict(is_runtime=True, is_optional=True), + } + dep_pkgs = [] + for scope, flags in scopes.items(): + depends = data.get(scope) + if not depends: + continue + + dependencies = None # debutils.comma_separated(depends) + if not dependencies: + continue + # break each dep in package names and version constraints + # FIXME:!!! + for name in dependencies: + purl = PackageURL(type='deb', namespace='debian', name=name) + dep = scan_models.DependentPackage(purl=purl.to_string(), score=scope, **flags) + dep_pkgs.append(dep) + + return dep_pkgs + + +def get_vcs_repo(description): + """ + Return a tuple of (vcs_tool, vcs_repo) or (None, None) if no vcs_repo is found. 
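+
+    For example, a description mapping containing {'Vcs-Git':
+    'https://salsa.debian.org/examples/foo.git'} (a hypothetical repository
+    URL) would return ('git', 'https://salsa.debian.org/examples/foo.git').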
+ """ + repos = [] + for vcs_tool, vcs_repo in description.items(): + vcs_tool = vcs_tool.lower() + if not vcs_tool.startswith('vcs-') or vcs_tool.startswith('vcs-browser'): + continue + _, _, vcs_tool = vcs_tool.partition('-') + repos.append((vcs_tool, vcs_repo)) + + if len(repos) > 1: + raise TypeError('Debian description with more than one Vcs repos: %(repos)r' % locals()) + + if repos: + vcs_tool, vcs_repo = repos[0] + else: + vcs_tool = None + vcs_repo = None + + return vcs_tool, vcs_repo + + +@map_router.route('http://ftp.debian.org/debian/pool/.*\.dsc') +class DebianDescriptionMapper(Mapper): + + def get_packages(self, uri, resource_uri): + """ + Yield packages parsed from a dsc Debian control file mapping. + """ + return parse_description( + metadata=json.loads(resource_uri.data), + purl=resource_uri.package_url, + base_download_url=None) + + +def get_files(text): + """ + Yield tuples of (checksum, size, filename) collected from a files field + `text`. + """ + if text: + for line in text.splitlines(False): + # we have htree space-separated items, so we perform two partitions + line = ' '.join(line.split()) + checksum, _, rest = line.partition(' ') + size, _, filename = rest.partition(' ') + yield checksum, size, filename + + +def parse_description(metadata, purl=None, base_download_url=None): + """ + Yield Scanned Package parse from description `metadata` mapping + for a single package version. + Yield as many Package as there are download URLs. + Optionally use the `purl` Package URL string if provided. + """ + # FIXME: this may not be correct: Source and Binary are package names + common_data = dict( + name=metadata['Source'], + version=metadata['Version'], + homepage_url=metadata.get('Homepage'), + code_view_url=metadata.get('Vcs-Browser'), + parties=[] + ) + + if metadata.get('Label'): + common_data['keywords'] = [metadata.get('Label')] + + vcs_tool, vcs_repo = get_vcs_repo(metadata) + if vcs_tool and vcs_repo: + vcs_repo = form_vcs_url(vcs_tool, vcs_repo) + common_data['vcs_url'] = vcs_repo + + dependencies = get_dependencies(metadata) + if dependencies: + common_data['dependencies'] = dependencies + + # TODO: add "original maintainer" seen in Ubuntu + maintainer = metadata.get('Maintainer') + if maintainer: + name, email = debutils.parse_email(maintainer) + if name: + party = scan_models.Party( + name=name, role='maintainer', email=email) + common_data['parties'].append(party) + + @attr.s() + class File(object): + name = attr.ib(default=None) + size = attr.ib(default=None) + md5 = attr.ib(default=None) + sha1 = attr.ib(default=None) + sha256 = attr.ib(default=None) + + def collect_files(existing_files, field_value, checksum_name): + for checksum, size, name in get_files(field_value): + fl = existing_files[name] + if not fl.name: + fl.name = name + fl.size = size + setattr(fl, checksum_name, checksum) + + # TODO: what do we do with files? 
+
+
+def parse_description(metadata, purl=None, base_download_url=None):
+    """
+    Yield ScannedPackage objects parsed from a description `metadata` mapping
+    for a single package version.
+    Yield as many Package as there are download URLs.
+    Optionally use the `purl` Package URL string if provided.
+    """
+    # FIXME: this may not be correct: Source and Binary are package names
+    common_data = dict(
+        name=metadata['Source'],
+        version=metadata['Version'],
+        homepage_url=metadata.get('Homepage'),
+        code_view_url=metadata.get('Vcs-Browser'),
+        parties=[]
+    )
+
+    if metadata.get('Label'):
+        common_data['keywords'] = [metadata.get('Label')]
+
+    vcs_tool, vcs_repo = get_vcs_repo(metadata)
+    if vcs_tool and vcs_repo:
+        vcs_repo = form_vcs_url(vcs_tool, vcs_repo)
+    common_data['vcs_url'] = vcs_repo
+
+    dependencies = get_dependencies(metadata)
+    if dependencies:
+        common_data['dependencies'] = dependencies
+
+    # TODO: add "original maintainer" seen in Ubuntu
+    maintainer = metadata.get('Maintainer')
+    if maintainer:
+        name, email = debutils.parse_email(maintainer)
+        if name:
+            party = scan_models.Party(
+                name=name, role='maintainer', email=email)
+            common_data['parties'].append(party)
+
+    @attr.s()
+    class File(object):
+        name = attr.ib(default=None)
+        size = attr.ib(default=None)
+        md5 = attr.ib(default=None)
+        sha1 = attr.ib(default=None)
+        sha256 = attr.ib(default=None)
+
+    def collect_files(existing_files, field_value, checksum_name):
+        for checksum, size, name in get_files(field_value):
+            fl = existing_files[name]
+            if not fl.name:
+                fl.name = name
+                fl.size = size
+            setattr(fl, checksum_name, checksum)
+
+    # TODO: what do we do with files?
+    # FIXME: we should store them in the package record
+    files = defaultdict(File)
+    collect_files(existing_files=files, field_value=metadata.get('Files'), checksum_name='md5')
+    collect_files(existing_files=files, field_value=metadata.get('Checksums-Sha1'), checksum_name='sha1')
+    collect_files(existing_files=files, field_value=metadata.get('Checksums-Sha256'), checksum_name='sha256')
+
+    # FIXME: craft a download_url
+    download_url = None
+    if base_download_url:
+        download_url = None
+    common_data['download_url'] = download_url
+
+    package = scan_models.DebianPackage(**common_data)
+    package.set_purl(purl)
+    yield package
+
+
+@map_router.route('http://ftp.debian.org/debian/dists/.*Sources.gz')
+class DebianSourceFileMapper(Mapper):
+
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield ScannedPackages built from resource_uri record for a single
+        package version.
+        Yield as many Package as there are download URLs.
+        """
+        metadata = resource_uri.data
+        return parse_packages(metadata, resource_uri.package_url)
+
+
+def build_source_file_packages(metadata, purl=None):
+    """
+    Yield packages from the passed Sources file metadata.
+    metadata: Sources file text content
+    purl: String value of the package url of the ResourceURI object
+    """
+    for source in debcon.get_paragraphs_data(metadata):
+        package_name = source.get('Package')
+
+        parties = []
+        maintainer_names = debutils.comma_separated(source.get('Maintainer', ''))
+        if maintainer_names:
+            for maintainer in maintainer_names:
+                name, email = debutils.parse_email(maintainer)
+                if name:
+                    party = scan_models.Party(
+                        name=name, role='maintainer', email=email)
+                    parties.append(party)
+        contributor_names = debutils.comma_separated(source.get('Uploaders', ''))
+        if contributor_names:
+            for contributor in contributor_names:
+                name, email = debutils.parse_email(contributor)
+                if name:
+                    party = scan_models.Party(
+                        name=name, role='contributor', email=email)
+                    parties.append(party)
+
+        dependencies = get_dependencies(source)
+
+        keywords = set()
+        keywords.update(debutils.comma_separated(source.get('Binary', '')))
+        if source.get('Section'):
+            keywords.add(source.get('Section'))
+
+        files = source.get('Files')
+        for f in files:
+            name = f.get('name')
+            package = dict(
+                name=package_name,
+                version=source.get('Version'),
+                dependencies=dependencies,
+                parties=parties,
+                code_view_url=source.get('Vcs-Browser'),
+                homepage_url=source.get('Homepage'),
+                keywords=list(keywords),
+            )
+
+            download_url = 'http://ftp.debian.org/debian/{path}/{name}'.format(
+                path=source.get('Directory'),
+                name=name)
+
+            package['download_url'] = download_url
+
+            vcs_tool, vcs_repo = get_vcs_repo(source)
+            if vcs_tool and vcs_repo:
+                vcs_repo = form_vcs_url(vcs_tool, vcs_repo)
+                package['vcs_url'] = vcs_repo
+
+            package['md5'] = f.get('md5sum')
+            # TODO: Why would we have more than a single SHA1 or SHA256
+            sha1s = source.get('Checksums-Sha1', [])
+            for sha1 in sha1s:
+                sha1value = sha1.get('sha1')
+                name = sha1.get('name')
+                if name and sha1value:
+                    package['sha1'] = sha1value
+            sha256s = source.get('Checksums-Sha256', [])
+            for sha256 in sha256s:
+                sha256value = sha256.get('sha256')
+                name = sha256.get('name')
+                if name and sha256value:
+                    package['sha256'] = sha256value
+            package = scan_models.DebianPackage(**package)
+            package.set_purl(purl)
+            yield package
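Both builders above lean on debian_inspector's `debcon.get_paragraphs_data()` to split a Sources or Packages file into field mappings. A small self-contained illustration (the sample data is fabricated):

```python
from debian_inspector import debcon

sources_text = '''Package: dpkg
Binary: dpkg, libdpkg-dev
Version: 1.17.25
Maintainer: Dpkg Developers <debian-dpkg@lists.debian.org>
Directory: pool/main/d/dpkg

Package: zlib
Version: 1.2.8
'''

# each blank-line-separated paragraph becomes one mapping
for paragraph in debcon.get_paragraphs_data(sources_text):
    print(paragraph.get('Package'), paragraph.get('Version'))
# -> dpkg 1.17.25
# -> zlib 1.2.8
```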
+
+
+@map_router.route('http://ftp.debian.org/debian/dists/.*Packages.gz')
+class DebianPackageFileMapper(Mapper):
+
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Packages from a Debian Packages index.
+        """
+        metadata = resource_uri.data
+        return parse_packages(metadata, resource_uri.package_url)
+
+
+def get_programming_language(tags):
+    """
+    Return the programming language extracted from a list of `tags` strings.
+    """
+    for tag in tags:
+        key, _, value = tag.partition('::')
+        if key == 'implemented-in':
+            return value
+
+
+def parse_packages(metadata, purl=None):
+    """
+    Yield packages from Debian package text data.
+    metadata: Debian data (e.g. a Packages file)
+    purl: String value of the package url of the ResourceURI object
+    """
+    for pack in debcon.get_paragraphs_data(metadata):
+        data = dict(
+            name=pack['Package'],
+            version=pack['Version'],
+            homepage_url=pack.get('Homepage'),
+            code_view_url=pack.get('Vcs-Browser'),
+            description=pack.get('Description'),
+            bug_tracking_url=pack.get('Bugs'),
+            parties=[],
+            md5=pack.get('MD5sum'),
+            sha1=pack.get('SHA1'),
+            sha256=pack.get('SHA256'),
+        )
+
+        filename = pack.get('Filename')
+        if filename:
+            data['download_url'] = 'http://ftp.debian.org/debian/{}'.format(filename)
+
+        maintainers = pack.get('Maintainer')
+        if maintainers:
+            name, email = debutils.parse_email(maintainers)
+            if name:
+                party = scan_models.Party(
+                    name=name, role='maintainer', email=email)
+                data['parties'].append(party)
+
+        dependencies = get_dependencies(pack)
+        if dependencies:
+            data['dependencies'] = dependencies
+
+        keywords = debutils.comma_separated(pack.get('Tag', ''))
+
+        section = pack.get('Section')
+        if section:
+            keywords.append(section)
+        data['keywords'] = keywords
+
+        data['primary_language'] = get_programming_language(keywords)
+
+        package = scan_models.DebianPackage(**data)
+        if purl:
+            package.set_purl(purl)
+        yield package
+
+
+#################################################################################
+# FIXME: this cannot work since we do not fetch these yet AND what are the zip jar and gz in this???
+#################################################################################
+
+
+@map_router.route('http://ftp.debian.org/debian/dists/.*\.zip',
+                  'http://ftp.debian.org/debian/dists/.*\.jar',
+                  'http://ftp.debian.org/debian/dists/.*\.gz')
+class DebianArchiveFileMapper(Mapper):
+
+    def get_packages(self, uri, resource_uri):
+        return build_packages_from_dist_archive(resource_uri.data, resource_uri.uri)
+
+
+def build_packages_from_dist_archive(metadata, uri):
+    """
+    Yield Packages built from a Debian project URI and the associated ls
+    content, which results from running an ls -LR command at the Debian root
+    folder.
+    Yield as many Package as there are download URLs.
+    """
+    debian_dist_length = len('http://ftp.debian.org/debian/dists')
+    # The parent folder URI of the uri file itself.
+    folder_uri = uri[debian_dist_length: uri.rindex('/')]
+    # project name obtained by truncating the uri
+    name = uri[debian_dist_length:uri.index('/', debian_dist_length)]
+    folder_length = debian_dist_length + len(name) + 1
+    # version obtained by analysing the uri
+    version = uri[folder_length:uri.index('/', folder_length)]
+    common_data = dict(
+        datasource_id="debian_archive_file",
+        name=name,
+        version=version,
+    )
+
+    # FIXME: this is NOT RIGHT
+    def get_resourceuri_by_uri(uri):
+        """
+        Return the ResourceURI matching the given uri string value.
+ """ + from minecode.models import ResourceURI + uris = ResourceURI.objects.filter(uri=uri) + if uris: + return uris[0] + + url_template = 'http://ftp.debian.org/debian/dists{name}' + download_urls = [] + for entry in ls.parse_directory_listing(metadata): + if entry.type != ls.FILE: + continue + path = entry.path + + if path.startswith(folder_uri): + path = path.lstrip('/') + url = url_template.format(name=path) + # FIXME: this is NOT RIGHT + if path.endswith('.md5') and url.replace('.md5', '') == uri: + if get_resourceuri_by_uri(url) and get_resourceuri_by_uri(url).md5: + common_data['md5'] = get_resourceuri_by_uri(url).md5 + # FIXME: this is NOT RIGHT + if path.endswith('.sha') and url.replace('.sha', '') == uri: + if get_resourceuri_by_uri(url) and get_resourceuri_by_uri(url).sha1: + common_data['sha1'] = get_resourceuri_by_uri(url).sha1 + + if path.endswith(('.jar', 'zip', 'gz')) and url != uri: + download_urls.append(url) + + if download_urls: + for download_url in download_urls: + package = scan_models.Package.from_package_data( + package_data=common_data, + datafile_path=uri, + ) + package['download_url'] = download_url + yield package + else: + # yield package without a download_url value + package = scan_models.Package.from_package_data( + package_data=common_data, + datafile_path=uri, + ) + # FIXME: this is NOT RIGHT: purl is not defined + package.set_purl(package.purl) + yield package diff --git a/minecode/visitors/dockerhub.py b/minecode/miners/dockerhub.py similarity index 72% rename from minecode/visitors/dockerhub.py rename to minecode/miners/dockerhub.py index 92601a5c..ff8685a5 100644 --- a/minecode/visitors/dockerhub.py +++ b/minecode/miners/dockerhub.py @@ -1,21 +1,26 @@ # -# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # -from __future__ import absolute_import -from __future__ import unicode_literals - import json import string -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup +from packagedcode import models as scan_models from packageurl import PackageURL from minecode import seed +from minecode import map_router from minecode import visit_router -from minecode.visitors import HttpVisitor -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import URI +from minecode.miners import HttpVisitor +from minecode.miners import HttpJsonVisitor +from minecode.miners import Mapper +from minecode.miners import URI def get_search_conditions(): @@ -134,3 +139,42 @@ def get_uris(self, content): if name: package_url = PackageURL(type='docker', name=name).to_string() yield URI(uri=base_url.format(name=name), package_url=package_url, source_uri=self.uri) + + +@map_router.route('https://registry.hub.docker.com/v2/repositories/library/[\w\-\.]+/') +class DockerHubLiraryJsonMapper(Mapper): + + def get_packages(self, uri, resource_uri): + """ + Yield Package built from resource_uri record for a single + package version. + Yield as many Package as there are download URLs. 
+ """ + metadata = resource_uri.data + build_packages_from_jsonfile(metadata, resource_uri.uri, resource_uri.package_url) + + +def build_packages_from_jsonfile(metadata, uri=None, purl=None): + """ + Yield Package built from Docker Hub json content. + metadata: json metadata content + uri: String value of uri of the ResourceURI object. + purl: String value of the package url of the ResourceURI object + """ + content = json.loads(metadata) + dockhub_library_htmlpage_template = 'https://hub.docker.com/_/{project}' + name = content.get('name') + if name: + short_desc = content.get('description') + long_desc = content.get('full_description') + descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] + description = '\n'.join(descriptions) + common_data = dict( + type='docker', + name=name, + description=description, + homepage_url=dockhub_library_htmlpage_template.format(project=name), + ) + package = scan_models.Package(**common_data) + package.set_purl(purl) + yield package diff --git a/minecode/mappers/eclipse.py b/minecode/miners/eclipse.py similarity index 53% rename from minecode/mappers/eclipse.py rename to minecode/miners/eclipse.py index 9edd9615..6166dc14 100644 --- a/minecode/mappers/eclipse.py +++ b/minecode/miners/eclipse.py @@ -10,10 +10,159 @@ import json from bs4 import BeautifulSoup - +from commoncode import fileutils from packagedcode import models as scan_models +from packageurl import PackageURL + +from minecode import seed from minecode import map_router -from minecode.mappers import Mapper +from minecode import visit_router +from minecode.miners import Mapper +from minecode.miners import HttpJsonVisitor +from minecode.miners import HttpVisitor +from minecode.miners import URI + + +class EclipseSeed(seed.Seeder): + + def get_seeds(self): + yield 'http://projects.eclipse.org/json/projects/all' + + +@visit_router.route('https://projects.eclipse.org/list-of-projects') +class EclipseProjectVisitors(HttpVisitor): + """ + Visit the HTML page of eclipse projects page and return the Packages info, json data and error. + """ + + def get_uris(self, content): + page = BeautifulSoup(content, 'lxml') + for a in page.find_all(name='a'): + if 'href' not in a.attrs: + continue + href = a['href'] + if href and href.startswith('https://projects.eclipse.org/projects/'): + # if the herf content starts with Eclipse single project suffix, generate a URI with the href content + project_name = href.replace('https://projects.eclipse.org/projects/', '') + package_url = PackageURL(type='eclipse', name=project_name).to_string() + yield URI(uri=href, package_url=package_url, source_uri=self.uri) + + +@visit_router.route('https://projects.eclipse.org/projects/.*') +class EclipseSingleProjectVisitor(HttpVisitor): + """ + Visit the HTML page of single eclipse project. + This is to get the HTML page as metadata, as it's single project and the URI is already collected by + EclipseProjectVisitors https://projects.eclipse.org/list-of-projects, so it won't return any new URI + and the goal is to return HTML page. + + For example:https://projects.eclipse.org/projects/modeling.m2t.accele + """ + pass + + +@visit_router.route('http://git.eclipse.org/c') +class EclipseGitVisitor(HttpVisitor): + """ + Visitor Eclipse Git HTML page and return URIs in the Git HTML page. 
+ """ + + def get_uris(self, content): + page = BeautifulSoup(content, 'lxml') + for td in page.find_all(name='td'): + if 'class' not in td.attrs: + continue + if td.attrs.get('class') != ['sublevel-repo']: + continue + + for a in td.findChildren(name='a'): + href = a['href'] + name = a.contents[0] + package_url = PackageURL(type='eclipse', name=name).to_string() + yield URI(uri=href, package_url=package_url, source_uri=self.uri) + + +@visit_router.route('http://www.eclipse.org/downloads/packages/all') +class EclipsePackagesVisitor(HttpVisitor): + """ + Visit the Eclipse packages HTML page and return URIs parsed from HTML page. + """ + + def fetch(self, uri, timeout=40): + """ + Fetch and return the content found at a remote uri with an extra timeout + """ + return HttpVisitor.fetch(self, uri, timeout=timeout) + + def get_uris(self, content): + page = BeautifulSoup(content, 'lxml') + for td in page.find_all(name='span'): + if 'class' not in td.attrs: + continue + if td.attrs.get('class') != ['field-content']: + continue + + a = td.find(name='a') + href = a['href'] + name = a.contents[0] + # Skip some of the nodes if it's a HTML tag but not a string + if name and isinstance(name, str): + package_url = PackageURL(type='eclipse', name=name).to_string() + yield URI(uri=href, package_url=package_url, source_uri=self.uri) + + +@visit_router.route('http://www.eclipse.org/downloads/packages/release/.*') +class EclipseReleaseVisitor(HttpVisitor): + """ + Visit the Eclipse release HTML page and return expected Package URIs. + """ + + def get_uris(self, content): + page = BeautifulSoup(content, 'lxml') + suffix_list = ['-win32.zip', '-win64.exe', '-win32-x86_64.zip', '-linux-gtk-x86_64.tar.gz', + '-linux-gtk-x86_64.tar.gz', '-macosx-cocoa-x86_64.tar.gz', '-linux-gtk.tar.gz', '-x86_64.tar.gz'] + for div in page.find_all(name='div'): + for a in div.find_all(name='a'): + url = a.get('href') + if url and 'download.php?file=' in url: + file_name = fileutils.file_name(url) + name = file_name + for suffix in suffix_list: + name = name.replace(suffix, '') + package_url = PackageURL(type='eclipse', name=name).to_string() + yield URI(uri=url, file_name=file_name, package_url=package_url, source_uri=self.uri) + + +@visit_router.route('http://projects.eclipse.org/json/projects/all') +class EclipseProjectsJsonVisitor(HttpJsonVisitor): + """ + Visit the Ecipse json API and return expected project specified URIs. + """ + + def fetch(self, uri, timeout=40): + """ + Fetch and return the content found at a remote uri with an extra timeout + """ + return HttpJsonVisitor.fetch(self, uri, timeout=timeout) + + def get_uris(self, content): + url_template = 'http://projects.eclipse.org/json/project/{name}' + projects = content.get('projects', {}) + for project in projects: + # TODO: are we sure there is not more data available in this JSON? + package_url = PackageURL(type='eclipse', name=project).to_string() + yield URI(uri=url_template.format(name=project), package_url=package_url, source_uri=self.uri) + + +@visit_router.route('http://projects.eclipse.org/json/project/.*') +class EclipseSingleProjectJsonVisitor(HttpJsonVisitor): + """ + Visit json of a single Eclipse project. This is to return the json + itself without any URIs, as the URI itself is returned by + EclipseProjectsJsonVisitor. + """ + pass + # FIXME: we should create packages from releases!!!! 
diff --git a/minecode/mappers/fdroid.py b/minecode/miners/fdroid.py
similarity index 74%
rename from minecode/mappers/fdroid.py
rename to minecode/miners/fdroid.py
index e50e59ca..d0b4085d 100644
--- a/minecode/mappers/fdroid.py
+++ b/minecode/miners/fdroid.py
@@ -11,12 +11,17 @@
 import logging
 
 from packagedcode.models import PackageData
+from packagedcode.models import Party
+from packagedcode.models import party_person
+from packageurl import PackageURL
 
+from minecode import seed
 from minecode import map_router
-from minecode.mappers import Mapper
-from packageurl import PackageURL
-from packagedcode.models import party_person
-from packagedcode.models import Party
+from minecode import visit_router
+from minecode.miners import Mapper
+from minecode.miners import URI
+from minecode.miners import NonPersistentHttpVisitor
+
 
 TRACE = False
 
@@ -28,6 +33,78 @@
     logger.setLevel(logging.DEBUG)
 
 
+"""
+Visitors for F-Droid package repositories.
+
+NOTE: the license of F-Droid package data needs to be clarified.
+See https://gitlab.com/fdroid/fdroiddata/-/issues/2826 for details
+
+F-Droid packages come with a main JSON index and possible increment/diffs.
+- https://f-droid.org/repo/index-v2.json
+
+- this is a legacy XML index: https://f-droid.org/repo/index.xml
+
+- This top level file lists the index and diffs: https://f-droid.org/repo/entry.json
+
+- This is a diff example: https://f-droid.org/repo/diff/1666980277000.json
+
+- Each apk is available from a URL using this form:
+
+    https://f-droid.org/repo/app.seeneva.reader_3.apk
+    https://f-droid.org/repo/{application_id}_{version_code}.apk
+
+The {application_id}_{version_code}.apk "file name" for each tarball and
+apk file name is listed in the index.
+"""
+
+
+class FdroidSeed(seed.Seeder):
+
+    def get_seeds(self):
+        yield 'https://f-droid.org/repo/index-v2.json'
+
+
+def build_purl(package_id, version_code, filename):
+    """
+    Return a PackageURL for an F-Droid package.
+    """
+    return PackageURL(
+        type='fdroid',
+        name=package_id,
+        version=version_code,
+        qualifiers=dict(filename=filename)
+    )
+
+
+@visit_router.route('https://f-droid.org/repo/index-v2.json')
+class FdroidIndexVisitor(NonPersistentHttpVisitor):
+    """
+    Collect package metadata URIs from the F-Droid index for each package.
+    We treat each apk and corresponding source tarball as a different package.
+    """
+
+    def get_uris(self, content):
+        """
+        Yield a URI for each F-Droid package.
+        """
+        json_location = content
+        with open(json_location) as c:
+            content = json.loads(c.read())
+
+        packages = content['packages']
+
+        for package_id, package_data in packages.items():
+            purl = PackageURL(type='fdroid', name=package_id).to_string()
+            yield URI(
+                uri=purl,
+                package_url=purl,
+                source_uri=self.uri,
+                data=json.dumps(package_data, separators=(',', ':'), ensure_ascii=False),
+                # note: visited is True since there is nothing more to visit
+                visited=True
+            )
+
+
 @map_router.route('pkg:fdroid/.+')
 class FdroidPackageMapper(Mapper):
diff --git a/minecode/visitors/fedora.py b/minecode/miners/fedora.py
similarity index 100%
rename from minecode/visitors/fedora.py
rename to minecode/miners/fedora.py
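A quick check of what `build_purl` above produces; packageurl-python renders the qualifiers in canonical purl form:

```python
from packageurl import PackageURL

purl = PackageURL(
    type='fdroid',
    name='app.seeneva.reader',
    version='3',
    qualifiers=dict(filename='app.seeneva.reader_3.apk'),
)
print(purl.to_string())
# -> pkg:fdroid/app.seeneva.reader@3?filename=app.seeneva.reader_3.apk
```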
diff --git a/minecode/visitors/freebsd.py b/minecode/miners/freebsd.py
similarity index 59%
rename from minecode/visitors/freebsd.py
rename to minecode/miners/freebsd.py
index 241db9ad..aa75f2fa 100644
--- a/minecode/visitors/freebsd.py
+++ b/minecode/miners/freebsd.py
@@ -7,18 +7,24 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
 
-
+from io import StringIO
 import logging
 import os
 
 from bs4 import BeautifulSoup
+from packagedcode.freebsd import CompactManifestHandler
+import saneyaml
 
 from minecode import seed
+from minecode import map_router
 from minecode import visit_router
 from minecode.utils import extract_file
-from minecode.visitors import HttpVisitor
-from minecode.visitors import NonPersistentHttpVisitor
-from minecode.visitors import URI
+from minecode.miners import Mapper
+from minecode.miners import HttpVisitor
+from minecode.miners import NonPersistentHttpVisitor
+from minecode.miners import URI
+from minecode.utils import get_temp_dir
+
 
 logger = logging.getLogger(__name__)
 handler = logging.StreamHandler()
@@ -85,5 +91,39 @@ def dumps(self, content):
         with open(manifest_file) as file_handler:
             return file_handler.read()
     else:
-        logger.warn(
-            'The packagesite.yaml is not existing in index file:' + content)
+        logger.warning('The packagesite.yaml does not exist in the index file: ' + content)
+
+
+@map_router.route('https://pkg.freebsd.org/.*packagesite.txz')
+class FreeBSDIndexMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Package built from resource_uri record for a single
+        package version.
+        Yield as many Package as there are download URLs.
+        """
+        return build_packages(resource_uri.data, resource_uri.package_url)
+
+
+def build_packages(metadata, purl=None):
+    """
+    Yield packages parsed from the passed json content.
+    metadata: json metadata content
+    purl: String value of the package url of the ResourceURI object
+    """
+    buf = StringIO(metadata)
+    # The metadata is not one well-formed YAML or JSON document, but each
+    # line is valid YAML, so read line by line and parse each line with the
+    # FreeBSD CompactManifestHandler.
+    for each_line in buf:
+        if each_line and each_line.strip() in ('', '{', '}'):
+            continue
+        content = saneyaml.load(each_line)
+        if content and content.get('name'):
+            temp_dir = get_temp_dir('freebsd_index')
+            location = os.path.join(temp_dir, '+COMPACT_MANIFEST')
+            with open(location, 'w') as manifest:
+                manifest.write(each_line)
+            with open(location, encoding='utf-8') as loc:
+                yaml_data = saneyaml.load(loc)
+            package = CompactManifestHandler._parse(yaml_data=yaml_data)
+            package.set_purl(purl)
+            yield package
diff --git a/minecode/mappers/freedesktop.py b/minecode/miners/freedesktop.py
similarity index 60%
rename from minecode/mappers/freedesktop.py
rename to minecode/miners/freedesktop.py
index 297b48c1..8dc10e4e 100644
--- a/minecode/mappers/freedesktop.py
+++ b/minecode/miners/freedesktop.py
@@ -8,15 +8,51 @@
 #
 
 from bs4 import BeautifulSoup
-from packageurl import PackageURL
-
 from packagedcode import models as scan_models
+from packageurl import PackageURL
 
+from minecode import seed
 from minecode import map_router
-from minecode.mappers import Mapper
+from minecode import visit_router
+from minecode.miners import Mapper
+from minecode.miners import HttpVisitor
+from minecode.miners import URI
 from minecode.utils import form_vcs_url
 
 
+class FreedesktopSeed(seed.Seeder):
+
+    def get_seeds(self):
+        yield 'https://www.freedesktop.org/wiki/Software'
+
+
+@visit_router.route('https://www.freedesktop.org/wiki/Software')
+class FreedesktopHTMLVisitor(HttpVisitor):
+    """
+    Visit the Freedesktop Software HTML page and return URIs parsed from the HTML page.
+ """ + def get_uris(self, content): + url_template = 'https://www.freedesktop.org/wiki/Software/{name}' + page = BeautifulSoup(content, 'lxml') + for div in page.find_all(name='div'): + for a in div.find_all(name='a'): + if 'href' not in a.attrs: + continue + href = a['href'] + if href and href.startswith('./'): + project_name = href.replace('./', '').strip('/') + package_url = PackageURL(type='freedesktop', name=project_name).to_string() + yield URI(uri=url_template.format(name=project_name), package_url=package_url, source_uri=self.uri) + + +@visit_router.route('https://www.freedesktop.org/wiki/Software/.*') +class FreedesktopProjectHTMLVisitor(HttpVisitor): + """ + Visit the Freedesktop Project HTML page. + """ + pass + + @map_router.route('https://www.freedesktop.org/wiki/Software/.*') class FreedesktopHTMLProjectMapper(Mapper): def get_packages(self, uri, resource_uri): diff --git a/minecode/visitors/generic.py b/minecode/miners/generic.py similarity index 100% rename from minecode/visitors/generic.py rename to minecode/miners/generic.py diff --git a/minecode/visitors/github.py b/minecode/miners/github.py similarity index 56% rename from minecode/visitors/github.py rename to minecode/miners/github.py index 665db404..3d3eb055 100644 --- a/minecode/visitors/github.py +++ b/minecode/miners/github.py @@ -1,29 +1,31 @@ -# -*- coding: utf-8 -*- # -# Copyright (c) nexB, Inc. http://www.nexb.com/ - All rights reserved. +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
# -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals - -from collections import OrderedDict from datetime import date from datetime import datetime import json import logging from github.MainClass import Github -from github.Repository import Repository -from github.Download import Download from packageurl import PackageURL +import attr +import packagedcode.models as scan_models +from minecode import map_router from minecode import priority_router from minecode import visit_router, seed -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import URI -from minecode.visitors.generic import map_fetchcode_supported_package - +from minecode.miners import HttpJsonVisitor +from minecode.miners import Mapper +from minecode.miners import URI +from minecode.miners.generic import map_fetchcode_supported_package +from minecode.utils import form_vcs_url +from minecode.utils import parse_date logger = logging.getLogger(__name__) @@ -47,6 +49,7 @@ class GithubReposVisitor(HttpJsonVisitor): Refer to: https://developer.github.com/v3/repos/#list-all-public-repositories https://api.github.com/repositories """ + def get_uris(self, content): repo_request_base = 'https://api.github.com/repositories?since=' has_content = False @@ -61,14 +64,17 @@ def get_uris(self, content): if url: package_url = None if name: - package_url = PackageURL(type='github', name=name).to_string() + package_url = PackageURL( + type='github', name=name).to_string() # Yield URI for GithubSingleRepoVisitor use yield URI(uri=url, package_url=package_url, source_uri=self.uri) if not has_content: - logger.info('The content of the response is empty, the processing might be finished for URI: {}'.format(self.uri)) + logger.info( + 'The content of the response is empty, the processing might be finished for URI: {}'.format(self.uri)) else: uri = self.uri - current_id = uri.replace('https://api.github.com/repositories?since=', '') + current_id = uri.replace( + 'https://api.github.com/repositories?since=', '') current_id = int(current_id) # 100 is fixed since each page has 100 entries. Plus 100 means to go from next page. 
             new_id = current_id + 100
@@ -93,7 +99,7 @@ def fetch(self, uri, timeout=None):
         g = Github()
         repo = g.get_repo(full_name)
 
-        common_data = OrderedDict(
+        common_data = dict(
             name=repo.name,
             description=repo.description,
             blobs_url=repo.blobs_url,
@@ -127,7 +133,7 @@ def fetch(self, uri, timeout=None):
         downloads = []
         if repo.get_downloads():
             for download in list(repo.get_downloads()):
-                downloads.append(OrderedDict(
+                downloads.append(dict(
                     name=download.name,
                     url=download.url,
                     size=download.size,
@@ -141,14 +147,15 @@ def fetch(self, uri, timeout=None):
                     bucket=download.bucket,
                     acl=download.acl,
                     accesskeyid=download.accesskeyid,
-                    expirationdate=json_serial_date_obj(download.expirationdate),
+                    expirationdate=json_serial_date_obj(
+                        download.expirationdate),
                 ))
             common_data['downloads'] = downloads
 
         tags = []
         if repo.get_tags():
             for tag in list(repo.get_tags()):
-                tag_info = OrderedDict(
+                tag_info = dict(
                     name=tag.name,
                     tarball_url=tag.tarball_url,
                     zipball_url=tag.zipball_url,
@@ -166,7 +173,8 @@ def fetch(self, uri, timeout=None):
         download_url_bases = '{html_url}/archive/{branch_name}.zip'
         if repo.get_branches():
             for branch in list(repo.get_branches()):
-                branches_download_urls.append(download_url_bases.format(html_url=common_data.get('html_url'), branch_name=branch.name))
+                branches_download_urls.append(download_url_bases.format(
+                    html_url=common_data.get('html_url'), branch_name=branch.name))
 
         common_data['branches_download_urls'] = branches_download_urls
 
         common_data['labels'] = []
@@ -206,7 +214,125 @@ def process_request_dir_listed(purl_str, **kwargs):
         error = f"error occurred when parsing {purl_str}: {e}"
         return error
 
-    error_msg = map_fetchcode_supported_package(package_url, pipelines, priority)
+    error_msg = map_fetchcode_supported_package(
+        package_url, pipelines, priority)
 
     if error_msg:
         return error_msg
+
+
+@map_router.route('https://api\.github\.com/repos/([^/]+)/([^/]+)')
+class GithubMetaFileMapper(Mapper):
+
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Package built from resource_uri record for a single
+        package version.
+        Yield as many Package as there are download URLs.
+        """
+        visited_data = resource_uri.data
+        if not visited_data:
+            return
+        return build_github_packages(visited_data, resource_uri.uri, resource_uri.package_url)
+
+
+def build_github_packages(visited_data, uri, purl=None):
+    """
+    Yield Package built from Github API visited_data as a JSON string.
+    visited_data: JSON metadata content
+    uri: String value of the uri from ResourceURI object
+    purl: String value of the package url of the ResourceURI object
+    """
+    visited_data = json.loads(visited_data)
+
+    full_name = visited_data['full_name']
+    namespace, name = split_org_repo(full_name)
+    # FIXME: when could this ever happen??
+    assert name == visited_data['name'], 'build_github_packages: Inconsistent name and org for URI: ' + uri
+
+    description = visited_data['description']
+
+    vcs_url = visited_data.get('git_url')
+    if vcs_url:
+        vcs_url = form_vcs_url('git', vcs_url)
+    package = scan_models.Package(
+        type='github',
+        namespace=namespace,
+        name=name,
+        description=description,
+        primary_language=visited_data.get('language'),
+        homepage_url=visited_data.get('html_url'),
+        vcs_url=vcs_url,
+        # this size does not make sense
+        size=visited_data.get('size'),
+    )
+
+    if visited_data.get('owner'):
+        package.parties = [
+            scan_models.Party(
+                # FIXME: we can add the org or user URL and we can know if this
+                # is an org or a person too.
+                type=scan_models.party_person,
+                name=visited_data.get('owner'),
+                role='owner')
+        ]
+
+    package.set_purl(purl)
+
+    downloads = visited_data.get('downloads') or []
+    for download in downloads:
+        html_url = download.get('html_url')
+        if html_url:
+            # make a copy
+            package = attr.evolve(package)
+            package.download_url = html_url
+            package.size = download.get('size')
+            package.release_date = parse_date(download.get('created_at'))
+            yield package
+
+    tags = visited_data.get('tags') or []
+    for tag in tags:
+        package = attr.evolve(package)
+        package.version = tag.get('name')
+        package_url = PackageURL(type='github', name=package.name,
+                                 namespace=namespace, version=tag.get('name')).to_string()
+        package.sha1 = tag.get('sha1')
+        if tag.get('tarball_url'):
+            package.download_url = tag.get('tarball_url')
+            package.set_purl(package_url)
+            yield package
+        if tag.get('zipball_url'):
+            package.download_url = tag.get('zipball_url')
+            package.set_purl(package_url)
+            yield package
+
+    branches_download_urls = visited_data.get('branches_download_urls') or []
+    for branches_download_url in branches_download_urls:
+        package = attr.evolve(package)
+        package.download_url = branches_download_url
+        yield package
+
+
+def split_org_repo(url_like):
+    """
+    Given a URL-like string to a GitHub repo or a repo name as in org/name,
+    split and return the org and name.
+
+    For example:
+    >>> split_org_repo('foo/bar')
+    ('foo', 'bar')
+    >>> split_org_repo('https://api.github.com/repos/foo/bar/')
+    ('foo', 'bar')
+    >>> split_org_repo('github.com/foo/bar/')
+    ('foo', 'bar')
+    >>> split_org_repo('git://github.com/foo/bar.git')
+    ('foo', 'bar')
+    """
+    segments = [s.strip() for s in url_like.split('/') if s.strip()]
+    if len(segments) < 2:
+        raise ValueError('Not a GitHub-like URL: {}'.format(url_like))
+    org = segments[-2]
+    name = segments[-1]
+    if name.endswith('.git'):
+        name, _, _ = name.rpartition('.git')
+    return org, name
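The `attr.evolve(package)` calls in `build_github_packages` are what let one base record fan out into per-download, per-tag and per-branch packages without mutating the original. A self-contained illustration of the idiom (the class here is a stand-in, not the scancode Package model):

```python
import attr

@attr.s
class Pkg:
    name = attr.ib()
    version = attr.ib(default=None)
    download_url = attr.ib(default=None)

base = Pkg(name='example')
tags = [('v1.0', 'https://example.org/v1.0.zip'),
        ('v1.1', 'https://example.org/v1.1.zip')]

# evolve() returns a modified copy, leaving the base record intact
releases = [attr.evolve(base, version=tag, download_url=url) for tag, url in tags]

assert base.version is None
assert releases[0].version == 'v1.0'
assert releases[1].download_url == 'https://example.org/v1.1.zip'
```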
diff --git a/minecode/visitors/gitlab.py b/minecode/miners/gitlab.py
similarity index 58%
rename from minecode/visitors/gitlab.py
rename to minecode/miners/gitlab.py
index 82e7eb9b..478f4436 100644
--- a/minecode/visitors/gitlab.py
+++ b/minecode/miners/gitlab.py
@@ -1,19 +1,26 @@
 #
-# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved.
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
 #
 
-from __future__ import absolute_import
-from __future__ import unicode_literals
+import json
 
 from packageurl import PackageURL
-
+import packagedcode.models as scan_models
 
 from minecode import seed
 from minecode.utils import get_http_response
 from minecode import visit_router
-
-from minecode.visitors import HttpJsonVisitor
-from minecode.visitors import HttpVisitor
-from minecode.visitors import URI
+from minecode import map_router
+from minecode.miners import Mapper
+from minecode.miners import HttpJsonVisitor
+from minecode.miners import HttpVisitor
+from minecode.miners import URI
+from minecode.utils import form_vcs_url
+from minecode.utils import parse_date
 
 
 class GitlabSeed(seed.Seeder):
@@ -84,3 +91,43 @@ def get_uris(self, content):
             project_name = url.rpartition('/')[-1]
             package_url = PackageURL(type='gitlab', name=project_name).to_string()
             yield URI(uri=url, package_url=package_url, data=element, source_uri=self.uri, visited=False)
+
+
+@map_router.route('https://gitlab.com/.*')
+class GitLabMapper(Mapper):
+
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Package built from resource_uri record for a single
+        package version.
+        Yield as many Package as there are download URLs.
+        """
+        metadata = resource_uri.data
+        return build_packages_from_json(metadata, resource_uri.package_url)
+
+
+def build_packages_from_json(metadata, purl=None):
+    """
+    Yield Package built from gitlab json content
+    metadata: Json metadata content
+    purl: String value of the package url of the ResourceURI object
+    """
+    content = json.loads(metadata)
+
+    name = content.get('name')
+    if name:
+        common_data = dict(
+            type='gitlab',
+            name=name,
+            homepage_url=content.get('web_url'),
+            description=content.get('description'),
+        )
+        repo_url = content.get('http_url_to_repo')
+        if repo_url:
+            repo_url = form_vcs_url('git', repo_url)
+            common_data['vcs_url'] = repo_url
+            common_data['code_view_url'] = repo_url
+        common_data['release_date'] = parse_date(content.get('created_at'))
+        package = scan_models.Package(**common_data)
+        package.set_purl(purl)
+        yield package
diff --git a/minecode/visitors/gnu.py b/minecode/miners/gnu.py
similarity index 95%
rename from minecode/visitors/gnu.py
rename to minecode/miners/gnu.py
index 8dbe5860..288bbe54 100644
--- a/minecode/visitors/gnu.py
+++ b/minecode/miners/gnu.py
@@ -13,7 +13,7 @@
 from packageurl import PackageURL
 
 from minecode import priority_router
-from minecode.visitors.generic import map_fetchcode_supported_package
+from minecode.miners.generic import map_fetchcode_supported_package
 
 logger = logging.getLogger(__name__)
 handler = logging.StreamHandler()
diff --git a/minecode/visitors/golang.py b/minecode/miners/golang.py
similarity index 78%
rename from minecode/visitors/golang.py
rename to minecode/miners/golang.py
index 12f5a6b5..d8c174a1 100644
--- a/minecode/visitors/golang.py
+++ b/minecode/miners/golang.py
@@ -1,20 +1,24 @@
 #
-# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved.
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
# -from __future__ import absolute_import -from __future__ import unicode_literals - import json +from packagedcode import models as scan_models from packageurl import PackageURL from minecode import seed +from minecode import map_router from minecode import visit_router - -from minecode.visitors import NonPersistentHttpVisitor -from minecode.visitors import URI - +from minecode.miners import Mapper +from minecode.miners import NonPersistentHttpVisitor +from minecode.miners import URI +from minecode.utils import form_vcs_url class GoLangSeed(seed.Seeder): @@ -219,3 +223,34 @@ def parse_package_path(path): ) return package_url, path + + +@map_router.route('pkg:golang/.*') +class GolangApiDocMapper(Mapper): + + def get_packages(self, uri, resource_uri): + package = json.loads(resource_uri.data) + yield build_golang_package(package, resource_uri.package_url) + + +def build_golang_package(package_data, purl): + """ + Return a single Golang package + """ + package_url = PackageURL.from_string(purl) + vcs_url = package_url.qualifiers.get('vcs_repository') + homepage_url = '/'.join(['https:/', package_url.namespace, package_url.name]) + vcs_tool = 'git' if 'github.com' in package_url.namespace else None + if vcs_tool: + vcs_url = form_vcs_url(vcs_tool, vcs_url) + # TODO: collect stats and counter from package_data too + package = scan_models.Package( + name=package_url.name, + namespace=package_url.namespace, + type=package_url.type, + primary_language='Go', + description=package_data.get('synopsis'), + homepage_url=homepage_url, + vcs_url=vcs_url, + ) + return package diff --git a/minecode/visitors/googlecode.py b/minecode/miners/googlecode.py similarity index 59% rename from minecode/visitors/googlecode.py rename to minecode/miners/googlecode.py index b924e5fc..82cccb2c 100644 --- a/minecode/visitors/googlecode.py +++ b/minecode/miners/googlecode.py @@ -1,10 +1,13 @@ -# -*- coding: utf-8 -*- # -# Copyright (c) 2014 by nexB, Inc. http://www.nexb.com/ - All rights reserved. +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
 #
 
-from __future__ import absolute_import
-from __future__ import unicode_literals
+import json
 
 from datetime import datetime
 import os
@@ -16,10 +19,14 @@
 from minecode import seed
 from minecode import visit_router
 from minecode.utils import extract_file
-from minecode.visitors import HttpJsonVisitor
-from minecode.visitors import NonPersistentHttpVisitor
-from minecode.visitors import HttpVisitor
-from minecode.visitors import URI
+from minecode.miners import HttpJsonVisitor
+from minecode.miners import NonPersistentHttpVisitor
+from minecode.miners import HttpVisitor
+from minecode.miners import URI
+from packagedcode import models as scan_models
+
+from minecode import map_router
+from minecode.miners import Mapper
 
 
 class GooglecodeSeed(seed.Seeder):
@@ -150,3 +157,112 @@ def get_uris(self, content):
             if release_date:
                 last_modified_date = datetime.fromtimestamp(release_date)
             yield URI(uri=download_url, package_url=package_url, file_name=file_name, source_uri=self.uri, date=last_modified_date, size=download.get('fileSize'), sha1=download.get('sha1Checksum'))
+
+
+@map_router.route('https://storage.googleapis.com/google-code-archive/v2/code.google.com/.*/project.json')
+class GoogleNewAPIV2ProjectJsonMapper(Mapper):
+
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Packages built from resource_uri record for a single
+        package version.
+        """
+        # FIXME: JSON deserialization should be handled eventually by the
+        # framework
+        metadata = json.loads(resource_uri.data)
+        return build_packages_from_projectsjson_v2(metadata, resource_uri.package_url, uri)
+
+
+def build_packages_from_projectsjson_v2(metadata, purl=None, uri=None):
+    """
+    Yield Package built from a Googlecode API json `metadata` mapping,
+    a dictionary keyed by project name whose values are project metadata.
+    Yield as many Package as there are download URLs.
+    metadata: json metadata content from API call
+    purl: String value of the package url of the ResourceURI object
+    """
+    short_desc = metadata.get('summary')
+    long_desc = metadata.get('description')
+    descriptions = [d for d in (short_desc, long_desc) if d and d.strip()]
+    description = '\n'.join(descriptions)
+    common_data = dict(
+        datasource_id='googlecode_api_json',
+        type='googlecode',
+        name=metadata.get('name'),
+        description=description
+    )
+
+    license_name = metadata.get('license')
+    if license_name:
+        common_data['extracted_license_statement'] = license_name
+        common_data['license_detections'] = []
+
+    keywords = []
+    labels = metadata.get('labels') or []
+    for label in labels:
+        if label:
+            keywords.append(label.strip())
+    common_data['keywords'] = keywords
+
+    package = scan_models.Package.from_package_data(
+        package_data=common_data,
+        datafile_path=uri,
+    )
+    package.set_purl(purl)
+    yield package
+
+
+@map_router.route('https://www.googleapis.com/storage/v1/b/google-code-archive/o/v2.*project.json\?alt=media')
+class GoogleNewAPIV1ProjectJsonMapper(Mapper):
+
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Packages built from resource_uri record for a single
+        package version.
+ """ + # FIXME: JSON deserialization should be handled eventually by the + # framework + metadata = json.loads(resource_uri.data) + return build_packages_from_projectsjson_v1(metadata, resource_uri.package_url, uri) + + +def build_packages_from_projectsjson_v1(metadata, purl=None, uri=None): + """Yield Package from the project.json passed by the google code v1 API + metadata: json metadata content from API call + purl: String value of the package url of the ResourceURI object + """ + if metadata.get('name'): + common_data = dict( + datasource_id="googlecode_json", + type='googlecode', + name=metadata.get('name'), + description=metadata.get('description') + ) + + license_name = metadata.get('license') + if license_name: + common_data['extracted_license_statement'] = license_name + common_data['license_detections'] = [] + + keywords = [] + labels = metadata.get('labels') + for label in labels: + if label: + keywords.append(label.strip()) + common_data['keywords'] = keywords + + common_data['vcs_url'] = metadata.get('ancestorRepo') + common_data['namespace'] = metadata.get('domain') + + # createTime doesn't make sense since the timestamp value is incorrect + # and parsing it will give a wrong year out of range. + + # created_time = metadata.get('creationTime') + # if created_time: + # common_data['release_date'] = date.fromtimestamp(created_time) + package = scan_models.Package.from_package_data( + package_data=common_data, + datafile_path=uri, + ) + package.set_purl(purl) + yield package diff --git a/minecode/visitors/gstreamer.py b/minecode/miners/gstreamer.py similarity index 55% rename from minecode/visitors/gstreamer.py rename to minecode/miners/gstreamer.py index 1f8637a9..f291a339 100644 --- a/minecode/visitors/gstreamer.py +++ b/minecode/miners/gstreamer.py @@ -1,10 +1,12 @@ # -# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # -from __future__ import absolute_import -from __future__ import unicode_literals - from bs4 import BeautifulSoup from commoncode.fileutils import file_base_name @@ -12,8 +14,13 @@ from minecode import seed from minecode import visit_router -from minecode.visitors import HttpVisitor -from minecode.visitors import URI +from minecode.miners import HttpVisitor +from minecode.miners import URI +from commoncode import fileutils +from packagedcode import models as scan_models + +from minecode import map_router +from minecode.miners import Mapper class GstreamerSeed(seed.Seeder): @@ -60,3 +67,39 @@ def get_uris(self, content): version = None package_url = PackageURL(type='gstreamer', name=project_name, version=version).to_string() yield URI(uri=url_template.format(sub_path=href), package_url=package_url, file_name=file_name, source_uri=self.uri) + + +@map_router.route('https://gstreamer.freedesktop.org/src/([\w\-\.]+/)*[\w\-\.]+[.tar\.bz2\\.gz|\.tar\.xz]') +class GstreamerURLMapper(Mapper): + + def get_packages(self, uri, resource_uri): + """ + Yield Package built from resource_uri record for a single + package version. 
+ """ + return build_package_from_url(resource_uri.uri, resource_uri.package_url) + + +def build_package_from_url(uri, purl=None): + """ + Return Package built from uri and package_url. + uri: String value of uri of the ResourceURI object. + purl: String value of the package url of the ResourceURI object + """ + file_name = fileutils.file_name(uri) + file_name_without_prefix = file_name + prefixes = ('.tar.bz2', '.tar.gz', '.tar.xz') + for prefix in prefixes: + file_name_without_prefix = file_name_without_prefix.replace(prefix, '') + if '-' in file_name_without_prefix: + project_name, _, version = file_name.rpartition('-') + common_data = dict( + type='gstreamer', + name=project_name, + version=version, + download_url=uri, + homepage_url='https://gstreamer.freedesktop.org' + ) + package = scan_models.Package(**common_data) + package.set_purl(purl) + yield package diff --git a/minecode/visitors/haxe.py b/minecode/miners/haxe.py similarity index 69% rename from minecode/visitors/haxe.py rename to minecode/miners/haxe.py index 9395ada1..83a33fdf 100644 --- a/minecode/visitors/haxe.py +++ b/minecode/miners/haxe.py @@ -1,19 +1,26 @@ # -# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # -from __future__ import absolute_import -from __future__ import unicode_literals +import json from bs4 import BeautifulSoup +from packagedcode.haxe import HaxelibJsonHandler from packageurl import PackageURL from minecode import seed from minecode import visit_router -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import HttpVisitor -from minecode.visitors import URI +from minecode import map_router +from minecode.miners import Mapper +from minecode.miners import HttpJsonVisitor +from minecode.miners import HttpVisitor +from minecode.miners import URI class HaxeSeed(seed.Seeder): @@ -81,3 +88,23 @@ class HaxePackageJsonVisitor(HttpJsonVisitor): Empty Visitor to get the package json content only. """ pass + + +@map_router.route('https://lib.haxe.org/p/[\w\-\.]+/[\w\-\.]+/raw-files/[\w\-\.]+/package.json') +class HaxePackageJsonMapper(Mapper): + + def get_packages(self, uri, resource_uri): + """ + Yield Package built from package json file. 
+ """ + # FIXME: JSON deserialization should be handled eventually by the framework + metadata = json.loads(resource_uri.data) + return build_packages_with_json(metadata, resource_uri.package_url) + + +def build_packages_with_json(metadata, purl=None): + # yield package by getting package from the build_package parser in scancode + package = HaxelibJsonHandler._parse(json_data=metadata) + if package: + package.set_purl(purl) + yield package diff --git a/minecode/visitors/java_stream.py b/minecode/miners/java_stream.py similarity index 100% rename from minecode/visitors/java_stream.py rename to minecode/miners/java_stream.py diff --git a/minecode/visitors/java_stream.py.ABOUT b/minecode/miners/java_stream.py.ABOUT similarity index 100% rename from minecode/visitors/java_stream.py.ABOUT rename to minecode/miners/java_stream.py.ABOUT diff --git a/minecode/visitors/java_stream.py.LICENSE b/minecode/miners/java_stream.py.LICENSE similarity index 100% rename from minecode/visitors/java_stream.py.LICENSE rename to minecode/miners/java_stream.py.LICENSE diff --git a/minecode/visitors/maven.py b/minecode/miners/maven.py similarity index 93% rename from minecode/visitors/maven.py rename to minecode/miners/maven.py index d08ffb21..07e325b1 100644 --- a/minecode/visitors/maven.py +++ b/minecode/miners/maven.py @@ -17,12 +17,14 @@ import logging import os import re - +import packageurl +from commoncode.text import as_unicode +from packagedcode.models import PackageData from bs4 import BeautifulSoup from dateutil import tz import arrow import requests - +from packageurl import PackageURL from jawa.util.utf import decode_modified_utf8 import javaproperties @@ -36,15 +38,18 @@ from minecode import priority_router from minecode import seed from minecode import visit_router -from minecode.visitors import java_stream -from minecode.visitors import HttpVisitor -from minecode.visitors import NonPersistentHttpVisitor -from minecode.visitors import URI +from minecode.miners import java_stream +from minecode.miners import HttpVisitor +from minecode.miners import NonPersistentHttpVisitor +from minecode.miners import URI from minecode.utils import validate_sha1 from packagedb.models import make_relationship from packagedb.models import PackageContentType from packagedb.models import PackageRelation from packagedb.models import make_relationship +from minecode import map_router +from minecode.utils import parse_date +from minecode.miners import Mapper """ This module handles the Maven repositories such as central and other @@ -62,8 +67,11 @@ TRACE_DEEP = False if TRACE: + import sys + logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.DEBUG) + MAVEN_BASE_URL = 'https://repo1.maven.org/maven2' @@ -1646,3 +1654,100 @@ def _entries_stats(location): print() print('All field name sets:', field_sets.most_common()) print() + + +@map_router.route('maven-index://.*') +class MavenIndexArtifactMapper(Mapper): + """ + Process the minimal artifacts collected for a Maven Jar or POM in an + index visit. + """ + + def get_packages(self, uri, resource_uri): + yield get_mini_package(resource_uri.data, uri, resource_uri.package_url) + + +def get_mini_package(data, uri, purl): + """ + Return a MavenPomPackage built from the minimal artifact data available in a + nexus index, given a `data` JSON string, a `uri` string and a `purl` + PacxkageURL string. Return None if the package cannot be built. 
+ """ + if not data: + return + + artdata = json.loads(data) + + # FIXME: this should a slot in Artifact + download_url = artdata.pop('download_url') + # FIXME: what if this is an ArtifactExtended?? + artifact = Artifact(**artdata) + + if purl: + if isinstance(purl, str): + purl = PackageURL.from_string(purl) + assert isinstance(purl, PackageURL) + + qualifiers = None + if purl and purl.qualifiers: + qualifiers = packageurl.normalize_qualifiers(purl.qualifiers, encode=False) + if qualifiers: + assert isinstance(qualifiers, dict) + logger.debug('get_mini_package: qualifiers: {}'.format(qualifiers)) + + package = PackageData( + type='maven', + namespace=artifact.group_id, + name=artifact.artifact_id, + version=artifact.version, + qualifiers=qualifiers, + description=artifact.description, + download_url=download_url, + release_date=parse_date(artifact.last_modified), + size=artifact.size, + sha1=artifact.sha1 or None, + ) + logger.debug('get_mini_package: package.qualifiers: {}'.format(package.qualifiers)) + logger.debug('get_mini_package for uri: {}, package: {}'.format(uri, package)) + return package + + +# FIXME this should be valid for any POM +@map_router.route('https?://repo1.maven.org/maven2/.*\.pom') +class MavenPomMapper(Mapper): + """ + Map a proper full POM visited as XML. + """ + def get_packages(self, uri, resource_uri): + + logger.debug('MavenPomMapper.get_packages: uri: {}, resource_uri: {}, purl:' + .format(uri, resource_uri.uri, resource_uri.package_url)) + package = get_package(resource_uri.data, resource_uri.package_url) + if package: + logger.debug('MavenPomMapper.get_packages: uri: {}, package: {}' + .format(uri, package)) + yield package + + +def get_package(text, package_url=None, + baseurl='https://repo1.maven.org/maven2'): + """ + Return a ScannedPackage built from a POM XML string `text`. + """ + text = as_unicode(text) + package = _parse( + datasource_id='maven_pom', + package_type='maven', + primary_language='Java', + text=text + ) + if package: + # FIXME: this should be part of the parse call + if package_url: + purl = PackageURL.from_string(package_url) + package.set_purl(purl) + # Build proper download_url given a POM: this must be the URL for + # the Jar which is the key to the PackageDB record + # FIXME the download is hardcoded to Maven Central? + # package.download_url = package.repository_download_url(baseurl=baseurl) + return package diff --git a/minecode/visitors/npm.py b/minecode/miners/npm.py similarity index 82% rename from minecode/visitors/npm.py rename to minecode/miners/npm.py index 118a10aa..b27b78de 100644 --- a/minecode/visitors/npm.py +++ b/minecode/miners/npm.py @@ -17,10 +17,12 @@ import requests from minecode import seed +from minecode import map_router from minecode import priority_router from minecode import visit_router -from minecode.visitors import NonPersistentHttpVisitor -from minecode.visitors import URI +from minecode.miners import NonPersistentHttpVisitor +from minecode.miners import URI +from minecode.miners import Mapper from packagedb.models import PackageContentType @@ -192,3 +194,36 @@ def process_request(purl_str, **kwargs): if error_msg: return error_msg + + +# FIXME: This route may not work when we have scoped Packages or URLs to a specific version +# or yarn URLs +@map_router.route('https://registry.npmjs.org/[^\/]+') +class NpmPackageMapper(Mapper): + + def get_packages(self, uri, resource_uri): + """ + Yield NpmPackage built from a resource_uri record that contains many + npm versions for a given npm name. 
+ """ + if not resource_uri.data: + return + visited_data = json.loads(resource_uri.data) + return build_packages(visited_data) + + +# FIXME: Consider using PURL here +def build_packages(data): + """ + Yield NpmPackage built from data corresponding to a single package name + and many npm versions. + """ + versions = data.get('versions', {}) + + logger.debug('build_packages: versions: ' + repr(type(versions))) + for version, data in versions.items(): + logger.debug('build_packages: version: ' + repr(version)) + logger.debug('build_packages: data: ' + repr(data)) + package = NpmPackageJsonHandler._parse(json_data=data) + if package: + yield package diff --git a/minecode/visitors/nuget.py b/minecode/miners/nuget.py similarity index 50% rename from minecode/visitors/nuget.py rename to minecode/miners/nuget.py index ce64ea93..0dd3396c 100644 --- a/minecode/visitors/nuget.py +++ b/minecode/miners/nuget.py @@ -1,22 +1,26 @@ # -# Copyright (c) 2016 by nexB, Inc. http://www.nexb.com/ - All rights reserved. +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals - +import json from bs4 import BeautifulSoup from commoncode import fileutils from packageurl import PackageURL - +from packagedcode import models as scan_models from minecode import seed from minecode import visit_router -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import HttpVisitor -from minecode.visitors import URI +from minecode.miners import HttpJsonVisitor +from minecode.miners import HttpVisitor +from minecode.miners import URI +from minecode import map_router +from minecode.miners import Mapper class NugetSeed(seed.Seeder): @@ -159,3 +163,166 @@ class NugetHTMLPackageVisitor(HttpVisitor): or https://www.nuget.org/packages/log4net/2.0.7 """ pass + + +@map_router.route('https://api.nuget.org/v3/catalog.+\.json') +class NugetPackageMapper(Mapper): + """ + Return NugetPackage object by parsing the ResourceURI stored in db referenced by the + nuget API URIs. 
+    """
+
+    def get_packages(self, uri, resource_uri):
+        if not resource_uri.data:
+            return
+        pkg_data = json.loads(resource_uri.data)
+        return build_packages_with_json(pkg_data, resource_uri.package_url)
+
+
+def build_packages_with_json(metadata, purl=None):
+    """
+    Yield Packages built from the JSON `metadata` passed.
+    metadata: JSON metadata content from an API call
+    purl: String value of the package url of the ResourceURI object
+    """
+    licenseUrl = metadata.get('licenseUrl')
+    copyr = metadata.get('copyright')
+
+    authors = []
+    names = metadata.get('authors')
+    if names:
+        for name in names.split(','):
+            authors.append(scan_models.Party(name=name.strip(), role='author'))
+
+    keywords = metadata.get('tags', [])
+
+    # TODO: the content has the SHA512, our model may extend to SHA512
+
+    # use the package id as the name; do not rely on the `name` variable
+    # leaked from the authors loop above
+    name = metadata.get('id')
+    if name:
+        short_desc = metadata.get('summary')
+        long_desc = metadata.get('description')
+        if long_desc == short_desc:
+            long_desc = None
+        descriptions = [d for d in (short_desc, long_desc) if d and d.strip()]
+        description = '\n'.join(descriptions)
+        package_mapping = dict(
+            type='nuget',
+            name=name,
+            version=metadata['version'],
+            homepage_url=metadata.get('projectUrl'),
+            description=description,
+            extracted_license_statement=licenseUrl,
+            license_detections=[],
+            copyright=copyr,
+            parties=authors,
+            keywords=keywords,
+        )
+        package = scan_models.PackageData.from_data(package_data=package_mapping)
+        package.set_purl(purl)
+        yield package
+
+
+@map_router.route('https://api.nuget.org/packages/.*\.nupkg')
+class NugetNUPKGDownloadMapper(Mapper):
+    """
+    Return NugetPackage object by parsing the download URL.
+    For example: https://api.nuget.org/packages/entityframework.4.3.1.nupkg
+    """
+
+    def get_packages(self, uri, resource_uri):
+        if not resource_uri.data:
+            return
+        pkg_data = json.loads(resource_uri.data)
+        return build_packages_with_nupkg_download_url(pkg_data, resource_uri.package_url, resource_uri.uri)
+
+
+def build_packages_with_nupkg_download_url(metadata, purl, uri):
+    if purl:
+        package = scan_models.PackageData(
+            type='nuget',
+            name=purl.name,
+            download_url=uri
+        )
+        package.set_purl(purl)
+        yield package
+
+
+@map_router.route('https://www.nuget.org/packages/[\w\-\.]+',
+                  'https://www.nuget.org/packages/[\w\-\.]+/[\w\-\.]+')
+class NugetHTMLPackageMapper(Mapper):
+    """
+    Return NugetPackage object by parsing the package HTML content.
+    For example: https://www.nuget.org/packages/log4net
+    """
+
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Packages built from resource_uri data.
+        """
+        metadata = resource_uri.data
+        # the generator must be returned, not discarded
+        return build_packages_from_html(
+            metadata, resource_uri.uri, resource_uri.package_url)
+
+
+def build_packages_from_html(metadata, uri, purl=None):
+    """
+    Yield Packages built from a Nuget `metadata` HTML content.
+    metadata: HTML content of a nuget.org package page
+    uri: the uri of the ResourceURI object
+    purl: String value of the package url of the ResourceURI object
+    """
+    download_url_format = 'https://www.nuget.org/api/v2/package/{name}/{version}'
+    soup = BeautifulSoup(metadata, 'lxml')
+    h1 = soup.find('h1')
+    if h1 and h1.contents:
+        license_value = None
+        name = str(h1.contents[0]).strip()
+        for a in soup.find_all('a'):
+            if a.get('data-track') and a.get('data-track') == 'outbound-license-url':
+                license_value = a.string
+        if license_value:
+            license_value = str(license_value).strip()
+
+        copyright_value = None
+        h2s = soup.find_all('h2')
+        for h2 in h2s:
+            # Copyright will be after the copyright h2 node
+            # The example is like this:
+            # <h2>Copyright</h2>
+            # <p>Copyright 2004-2017 The Apache Software Foundation</p>
+ if h2.string and h2.string == 'Copyright': + next_element = h2.find_next_sibling('p') + if next_element: + copyright_value = next_element.string + + description = None + for m in soup.find_all('meta'): + if m.get('property') and m.get('property') == 'og:description' and m.get('content'): + description = m.get('content') + + for tbody in soup.find_all('tbody'): + if tbody.get('class') and tbody.get('class')[0] == 'no-border': + for a in tbody.find_all('a'): + version = a.string + if not version or not version.strip(): + continue + version = version.strip() + download_url = download_url_format.format(name=name, version=version) + package_mapping = dict( + datasource_id="nuget_metadata_json", + name=name, + type='nuget', + version=version, + homepage_url=uri, + description=description, + download_url=download_url, + extracted_license_statement=license_value, + license_detections=[], + copyright=copyright_value + ) + package = scan_models.Package.from_package_data( + package_data=package_mapping, + datafile_path=uri, + ) + package.set_purl(purl) + yield package diff --git a/minecode/visitors/openssl.py b/minecode/miners/openssl.py similarity index 61% rename from minecode/visitors/openssl.py rename to minecode/miners/openssl.py index 0a7524fc..d9df4a12 100644 --- a/minecode/visitors/openssl.py +++ b/minecode/miners/openssl.py @@ -1,23 +1,36 @@ # -# Copyright (c) 2016 by nexB, Inc. http://www.nexb.com/ - All rights reserved. +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # -from __future__ import absolute_import -from __future__ import unicode_literals - from bs4 import BeautifulSoup from datetime import datetime +import logging from commoncode import fileutils from packageurl import PackageURL +from packagedcode import models as scan_models +from minecode import map_router +from minecode.miners import Mapper +from minecode.utils import parse_date from minecode import priority_router from minecode import seed from minecode import visit_router from minecode.utils import is_int -from minecode.visitors import HttpVisitor -from minecode.visitors import URI -from minecode.visitors.generic import map_fetchcode_supported_package +from minecode.miners import HttpVisitor +from minecode.miners import URI +from minecode.miners.generic import map_fetchcode_supported_package + + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +logger.addHandler(handler) +logger.setLevel(logging.INFO) class OpenSSLSeed(seed.Seeder): @@ -120,3 +133,49 @@ def process_request_dir_listed(purl_str, **kwargs): if error_msg: return error_msg + + +@map_router.route('https://ftp.openssl.org/.*') +class OpenSSLMapper(Mapper): + + def get_packages(self, uri, resource_uri): + """ + Yield ScannedPackage built from resource_uri record for a single package + version. 
Yield one Package for the source archive at this URI.
+        """
+        return build_packages(resource_uri, resource_uri.package_url)
+
+
+def build_packages(resource_uri, purl=None):
+    """
+    Yield Packages built from the resource_uri metadata.
+    resource_uri: ResourceURI object
+    purl: String value of the package url of the ResourceURI object
+    """
+    uri = resource_uri.uri
+    file_name = fileutils.file_name(uri)
+    version = file_name.replace('.tar.gz', '').replace('openssl-', '').replace(
+        '.asc', '').replace('.md5', '').replace('.sha1', '').replace('.sha256', '')
+    common_data = dict(
+        datasource_id="openssl_metadata",
+        type='generic',
+        name=file_name,
+        description='The OpenSSL Project is a collaborative effort to develop a robust, commercial-grade, fully featured, and Open Source toolkit implementing the Transport Layer Security (TLS) protocols (including SSLv3) as well as a full-strength general purpose cryptographic library.',
+        version=version,
+        size=resource_uri.size,
+        release_date=parse_date(resource_uri.last_modified_date),
+        extracted_license_statement='OpenSSL License',
+        license_detections=[],
+        homepage_url='https://www.openssl.org/',
+        download_url=uri,
+        copyright='Copyright (c) 1998-2018 The OpenSSL Project\nCopyright (c) 1995-1998 Eric A. Young, Tim J. Hudson\nAll rights reserved.',
+        vcs_url='git+https://github.com/openssl/openssl.git',
+        code_view_url='https://github.com/openssl/openssl',
+        bug_tracking_url='https://github.com/openssl/openssl/issues',
+    )
+    package = scan_models.Package.from_package_data(
+        package_data=common_data,
+        datafile_path=uri,
+    )
+    package.set_purl(purl)
+    yield package
diff --git a/minecode/visitors/openwrt.py b/minecode/miners/openwrt.py
similarity index 53%
rename from minecode/visitors/openwrt.py
rename to minecode/miners/openwrt.py
index 11d49aeb..0d584e3e 100644
--- a/minecode/visitors/openwrt.py
+++ b/minecode/miners/openwrt.py
@@ -1,24 +1,38 @@
 #
-# Copyright (c) 2017 by nexB, Inc. http://www.nexb.com/ - All rights reserved.
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
 #
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
 import gzip
 import json
 import os
+import logging
 
 from bs4 import BeautifulSoup
 from debian_inspector import debcon
+from packagedcode import models as scan_models
 from packageurl import PackageURL
 
+from minecode import debutils
 from minecode import seed
+from minecode import map_router
 from minecode import visit_router
 from minecode.utils import extract_file
-from minecode.visitors import HttpVisitor
-from minecode.visitors import NonPersistentHttpVisitor
-from minecode.visitors import URI
+from minecode.miners import Mapper
+from minecode.miners import HttpVisitor
+from minecode.miners import NonPersistentHttpVisitor
+from minecode.miners import URI
+from minecode.miners.debian import get_dependencies
+
+
+logger = logging.getLogger(__name__)
+handler = logging.StreamHandler()
+logger.addHandler(handler)
+logger.setLevel(logging.INFO)
 
 
 class OpenWrtSeed(seed.Seeder):
@@ -92,3 +106,63 @@ def dumps(self, content):
         control_location = os.path.join(control_extracted_folder, 'control')
         parsed = debcon.Debian822.from_file(control_location)
         return json.dumps(parsed)
+
+
+@map_router.route('https://downloads.openwrt.org/.*\.ipk')
+class OpenwrtIpkMetadataMapper(Mapper):
+
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield ScannedPackages built from the resource_uri record for a single
+        package version. Yield as many Packages as there are download URLs.
+        """
+        metadata = json.loads(resource_uri.data)
+        return build_packages(metadata, resource_uri.package_url, uri)
+
+
+def build_packages(metadata, purl=None, uri=None):
+    """
+    Yield ScannedPackages built from the passed metadata.
+    metadata: metadata mapping
+    purl: String value of the package url of the ResourceURI object
+    """
+    common_data = dict(
+        type='openwrt',
+        datasource_id='openwrt_metadata',
+        name=metadata.get('Package'),
+        version=metadata.get('Version'),
+        description=metadata.get('Description'),
+        size=metadata.get('Installed-Size'),
+    )
+
+    dependencies = get_dependencies(metadata, ['Depends'])
+    if dependencies:
+        common_data['dependencies'] = dependencies
+
+    maintainers = metadata.get('Maintainer')
+    if maintainers:
+        name, email = debutils.parse_email(maintainers)
+        if name:
+            parties = common_data.get('parties')
+            if not parties:
+                common_data['parties'] = []
+            party = scan_models.Party(name=name, role='maintainer', email=email)
+            common_data['parties'].append(party)
+
+    lic = metadata.get('License')
+    if lic:
+        # keep the license field name consistent with the other miners in this patch
+        common_data['extracted_license_statement'] = lic
+
+    common_data['keywords'] = []
+    section = metadata.get('Section')
+    if section:
+        common_data['keywords'].append(section)
+    architecture = metadata.get('Architecture')
+    if architecture:
+        common_data['keywords'].append(architecture)
+    package = scan_models.Package.from_package_data(
+        package_data=common_data,
+        datafile_path=uri,
+    )
+    package.set_purl(purl)
+    yield package
diff --git a/minecode/mappers/packagist.py b/minecode/miners/packagist.py
similarity index 70%
rename from minecode/mappers/packagist.py
rename to minecode/miners/packagist.py
index dc05dd30..964f2f11 100644
--- a/minecode/mappers/packagist.py
+++ b/minecode/miners/packagist.py
@@ -11,12 +11,61 @@
 
 from packagedcode import models as scan_models
 from packagedcode.models import DependentPackage
+from packageurl import PackageURL
 
+from minecode import seed
 from minecode import map_router
-from minecode.mappers import Mapper
+from minecode import visit_router
+from minecode.miners import Mapper
+from minecode.miners import HttpJsonVisitor
+from minecode.miners import URI
 from minecode.utils import form_vcs_url
 
 
+"""
+Collect packagist packages
+
+The packagist repo API is at: https://packagist.org/apidoc
+"""
+
+
+class PackagistSeed(seed.Seeder):
+
+    def get_seeds(self):
+        yield 'https://packagist.org/packages/list.json'
+
+
+@visit_router.route('https://packagist.org/packages/list.json')
+class PackagistListVisitor(HttpJsonVisitor):
+    """
+    Collect the list.json resource and yield URIs for searching with package url.
+
+    The yielded URI format is like: https://packagist.org/p/[vendor]/[package].json
+    """
+
+    def get_uris(self, content):
+        search_url_template = 'https://packagist.org/p/{vendor}/{package}.json'
+        packages_entries = content.get('packageNames', [])
+        for package in packages_entries:
+            # FIXME: what does it mean to have no / in the URL?
+            if '/' not in package:
+                continue
+            vp = package.split('/')
+            vendor = vp[0]
+            package = vp[1]
+            package_url = PackageURL(type='composer', name=package).to_string()
+            yield URI(uri=search_url_template.format(vendor=vendor, package=package), package_url=package_url, source_uri=self.uri)
+
+
+@visit_router.route('https://packagist.org/p/.*json')
+class PackageVisitor(HttpJsonVisitor):
+    """
+    Collect JSON for a package.
+    """
+    # FIXME: what about having a download URL to fetch the real package???
+    pass
+
+
 @map_router.route('https://packagist.org/p/.*json')
 class PackagistPackageMapper(Mapper):
diff --git a/minecode/mappers/pypi.py b/minecode/miners/pypi.py
similarity index 55%
rename from minecode/mappers/pypi.py
rename to minecode/miners/pypi.py
index 69110414..56f6244c 100644
--- a/minecode/mappers/pypi.py
+++ b/minecode/miners/pypi.py
@@ -7,16 +7,127 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
 
-
+import codecs
 import json
+import xmlrpc.client
 
 from packagedcode import models as scan_models
+from packageurl import PackageURL
 
+from minecode import seed
 from minecode import map_router
-from minecode.mappers import Mapper
+from minecode import visit_router
+from minecode.utils import get_temp_file
+from minecode.miners import Mapper
+from minecode.miners import HttpJsonVisitor
+from minecode.miners import URI
+from minecode.miners import Visitor
 from minecode.utils import parse_date
 
 
+"""
+Visitors for Pypi and Pypi-like Python package repositories.
+
+We have this hierarchy in Pypi:
+    index (xmlrpc) -> packages (json) -> package releases (json) -> download urls
+
+Pypi serves a main index via XMLRPC that contains a list of package names.
+For each package, a JSON contains details including the list of all releases.
+For each release, a JSON contains details for the released version and all the
+downloads available for this release. We create Packages at this level as well
+as one download URI for each effective download.
+
+Some information about every release and download is replicated in every JSON
+payload and is ignored for simplicity (which is not super efficient).
+"""
+
+
+class PypiSeed(seed.Seeder):
+
+    def get_seeds(self):
+        yield 'https://pypi.python.org/pypi/'
+
+
+@visit_router.route('https://pypi.python.org/pypi/')
+class PypiIndexVisitor(Visitor):
+    """
+    Collect package metadata URIs from the top level pypi index for each package.
+    """
+    def fetch(self, uri, timeout=None):
+        """
+        Specialized fetching using XML RPCs.
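+
+        A minimal standalone sketch of the same call (illustrative only):
+
+            import xmlrpc.client
+            proxy = xmlrpc.client.ServerProxy('https://pypi.python.org/pypi/')
+            names = proxy.list_packages()  # all package names, one string each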
+        """
+        packages = xmlrpc.client.ServerProxy(uri).list_packages()
+        content = list(packages)
+
+        temp_file = get_temp_file('PypiIndexVisitor')
+        with codecs.open(temp_file, mode='wb', encoding='utf-8') as expect:
+            json.dump(content, expect, indent=2, separators=(',', ':'))
+        return temp_file
+
+    def dumps(self, content):
+        """
+        The content is a huge JSON and should not be dumped.
+        """
+        return None
+
+    def get_uris(self, content):
+        with codecs.open(content, mode='rb', encoding='utf-8') as contentfile:
+            packages_list = json.load(contentfile)
+
+        url_template = 'https://pypi.python.org/pypi/{name}/json'
+        for name in packages_list:
+            package_url = PackageURL(type='pypi', name=name).to_string()
+            yield URI(uri=url_template.format(name=name), package_url=package_url, source_uri=self.uri)
+
+
+@visit_router.route('https://pypi.python.org/pypi/[^/]+/json')
+class PypiPackageVisitor(HttpJsonVisitor):
+    """
+    Collect package metadata URIs for all releases of a single Pypi package.
+    The URL contains only the package name, for example: https://pypi.org/pypi/vmock/json
+    Parsing the content yields one JSON URI per version/release, such as: https://pypi.org/pypi/vmock/0.1/json
+    """
+    def get_uris(self, content):
+
+        url_template = 'https://pypi.python.org/pypi/{name}/{release}/json'
+        info = content.get('info', {})
+        name = info.get('name')
+        if name:
+            for release in content['releases']:
+                package_url = PackageURL(type='pypi', name=name, version=release).to_string()
+                yield URI(uri=url_template.format(name=name, release=release), package_url=package_url, source_uri=self.uri)
+
+
+@visit_router.route('https://pypi.python.org/pypi/[^/]+/[^/]+/json')
+class PypiPackageReleaseVisitor(HttpJsonVisitor):
+    """
+    Collect package download URIs for all package archives of one Pypi package
+    release, for example: https://pypi.org/pypi/vmock/0.1/json
+    """
+    def get_uris(self, content):
+        # TODO: this is likely best ignored entirely???
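+        # (Annotation: the release JSON looks roughly like
+        #   {"info": {"name": "vmock", "version": "0.1", "download_url": "UNKNOWN"},
+        #    "urls": [{"url": "...", "filename": "...", "size": 123,
+        #              "upload_time": "...", "md5_digest": "..."}]}
+        # The two blocks below handle the off-PyPI download_url and the
+        # on-PyPI "urls" entries respectively.)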
+        # A download_url may be provided for an off-Pypi-download
+        info = content.get('info', {})
+        name = info.get('name')
+        # the release version applies to all downloads of this release
+        version = info.get('version')
+        download_url = info.get('download_url')
+        if download_url and download_url != 'UNKNOWN':
+            package_url = PackageURL(type='pypi', name=name, version=version).to_string()
+            yield URI(uri=download_url, package_url=package_url, source_uri=self.uri)
+
+        # Common on-Pypi-download URLs are in the urls block
+        for download in content.get('urls', []):
+            url = download.get('url')
+            if not url:
+                continue
+            package_url = PackageURL(type='pypi', name=name, version=version).to_string()
+            yield URI(url, package_url=package_url, file_name=download.get('filename'),
+                      size=download.get('size'), date=download.get('upload_time'),
+                      md5=download.get('md5_digest'), source_uri=self.uri)
+
+
 @map_router.route('https://pypi.python.org/pypi/[^/]+/[^/]+/json')
 class PypiPackageMapper(Mapper):
diff --git a/minecode/visitors/repodata.py b/minecode/miners/repodata.py
similarity index 100%
rename from minecode/visitors/repodata.py
rename to minecode/miners/repodata.py
diff --git a/minecode/visitors/repodata_rpms.py b/minecode/miners/repodata_rpms.py
similarity index 97%
rename from minecode/visitors/repodata_rpms.py
rename to minecode/miners/repodata_rpms.py
index 882285a3..f4c4e883 100644
--- a/minecode/visitors/repodata_rpms.py
+++ b/minecode/miners/repodata_rpms.py
@@ -9,7 +9,7 @@
 from minecode import seed
 from minecode import rsync
 from minecode import visit_router
-from minecode.visitors import URI
+from minecode.miners import URI
 
 """
 Collect YUM repositories index (aka. repodata) from CentOS, Fedora, openSUSE and
diff --git a/minecode/visitors/repomd_parser.py b/minecode/miners/repomd.py
similarity index 76%
rename from minecode/visitors/repomd_parser.py
rename to minecode/miners/repomd.py
index 78f8fe8a..1a092c9c 100644
--- a/minecode/visitors/repomd_parser.py
+++ b/minecode/miners/repomd.py
@@ -1,26 +1,30 @@
 #
-# Copyright (c) 2016 nexB Inc. and others. All rights reserved.
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
 #
-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import unicode_literals
-
 import json
 import logging
 import os
 
+from minecode import rsync
 from commoncode import fileutils
 from packagedcode.models import PackageData
 from packagedcode.rpm import EVR
 
+from minecode import seed
+from minecode import map_router
 from minecode import visit_router
 from minecode.utils import extract_file
 from minecode.utils import fetch_http
 from minecode.utils import get_temp_file
-from minecode.visitors import URI
-from minecode.visitors import repodata
+from minecode.miners import URI
+from minecode.miners import repodata
 
 
 logger = logging.getLogger(__name__)
@@ -108,3 +112,20 @@ def collect_rpm_packages_from_repomd(uri):
         if rpm.download_url:
             uris.append(URI(uri=rpm.download_url))
     return uris, json.dumps([r.to_dict() for r in rpms]), None
+
+
+@map_router.route('.+/repomd.xml')
+def map_repomd_data(uris, resource_uri):
+    """
+    Return a list of PackageData objects built from the RPM metadata
+    collected by the visitors.
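+
+    Each mapping in the stored JSON is expected to use PackageData field
+    names, for example (illustrative):
+
+        {"type": "rpm", "name": "bash", "version": "4.4.19"}
+
+    and is expanded below with **pkg_data.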
+    """
+    if not resource_uri.data:
+        return
+    packages = []
+    for pkg_data in json.loads(resource_uri.data):
+        # 'name' is required for every package
+        # FIXME: how could we obtain a package without a name???
+        if pkg_data.get('name'):
+            # expand the mapping into PackageData fields
+            packages.append(PackageData(**pkg_data))
+    return packages
diff --git a/minecode/mappers/rubygems.py b/minecode/miners/rubygems.py
similarity index 68%
rename from minecode/mappers/rubygems.py
rename to minecode/miners/rubygems.py
index 19f0e0c1..8662c5b5 100644
--- a/minecode/mappers/rubygems.py
+++ b/minecode/miners/rubygems.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 #
 # Copyright (c) nexB Inc. and others. All rights reserved.
 # purldb is a trademark of nexB Inc.
@@ -8,24 +9,145 @@
 #
 
+import gzip
 import json
 import logging
+import os
 
+from rubymarshal import reader
+from rubymarshal.classes import UsrMarshal
 from packagedcode import models as scan_models
 from packagedcode.models import DependentPackage
 from packagedcode.models import PackageData
+from packageurl import PackageURL
+import saneyaml
 
+from minecode import seed
 from minecode import map_router
-from minecode import saneyaml
-from minecode.mappers import Mapper
+from minecode import visit_router
+from minecode.utils import extract_file
+from minecode.miners import Mapper
+from minecode.miners import HttpJsonVisitor
+from minecode.miners import NonPersistentHttpVisitor
+from minecode.miners import URI
 from minecode.utils import parse_date
 
+
 logger = logging.getLogger(__name__)
 handler = logging.StreamHandler()
 logger.addHandler(handler)
 logger.setLevel(logging.INFO)
 
+# FIXME: we are missing several API calls:
+# http://guides.rubygems.org/rubygems-org-api/
+
+class RubyGemsSeed(seed.Seeder):
+
+    def get_seeds(self):
+        # We keep only specs.4.8.gz and exclude latest_spec.4.8.gz,
+        # since specs.4.8.gz covers all uris in latest spec.
+        yield 'http://rubygems.org/specs.4.8.gz'
+
+
+class GemVersion(UsrMarshal):
+
+    def version(self):
+        return self.values['version']
+
+
+@visit_router.route('https?://rubygems\.org/specs\.4\.8\.gz')
+class RubyGemsIndexVisitor(NonPersistentHttpVisitor):
+    """
+    Collect REST API URIs from the RubyGems index file.
+    """
+
+    def get_uris(self, content):
+        with gzip.open(content, 'rb') as idx:
+            index = idx.read()
+
+        # TODO: use a purl!!!
+        for name, version, platform in reader.loads(index):
+            json_url = 'https://rubygems.org/api/v1/versions/{name}.json'.format(
+                **locals())
+
+            package_url = PackageURL(type='gem', name=name).to_string()
+            yield URI(uri=json_url, package_url=package_url, source_uri=self.uri)
+
+            # note: this list only ever has a single value
+            version = version.values[0]
+            if isinstance(version, bytes):
+                version = version.decode('utf-8')
+
+            download_url = 'https://rubygems.org/downloads/{name}-{version}'
+
+            if isinstance(platform, bytes):
+                platform = platform.decode('utf-8')
+            if platform != 'ruby':
+                download_url += '-{platform}'
+
+            download_url += '.gem'
+            download_url = download_url.format(**locals())
+            package_url = PackageURL(type='gem', name=name, version=version).to_string()
+            yield URI(uri=download_url, package_url=package_url, source_uri=self.uri)
+
+
+@visit_router.route('https?://rubygems\.org/api/v1/versions/[\w\-\.]+.json')
+class RubyGemsApiManyVersionsVisitor(HttpJsonVisitor):
+    """
+    Collect the json content of each version.
+    Yield the uri of each gem based on name, platform and version.
+    The data of the uri is the JSON subset for a single version.
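+
+    For example (illustrative), the name "rails" with version "7.0.0"
+    yields the download URI:
+
+        https://rubygems.org/downloads/rails-7.0.0.gem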
+    """
+
+    def get_uris(self, content):
+        """
+        Yield one URI for each gem download URL, carrying the version details
+        as data.
+        """
+        # FIXME: return actual data too!!!
+        for version_details in content:
+            # get the gem name by parsing it from the uri
+            name = self.uri[
+                self.uri.index('/versions/') + len('/versions/'):-len('.json')]
+            version = version_details.get('number')
+            gem_name = '%(name)s-%(version)s' % locals()
+            package_url = PackageURL(type='gem', name=name, version=version).to_string()
+            download_url = 'https://rubygems.org/downloads/%(gem_name)s.gem' % locals()
+            yield URI(uri=download_url, source_uri=self.uri, package_url=package_url,
+                      data=json.dumps(version_details))
+
+# TODO: add API dependencies
+# https://rubygems.org/api/v1/dependencies.json?gems=file_validators
+# Also use the V2 API at http://guides.rubygems.org/rubygems-org-api-v2/
+# GET - /api/v2/rubygems/[GEM NAME]/versions/[VERSION NUMBER].(json|yaml)
+
+
+@visit_router.route('https?://rubygems.org/downloads/[\w\-\.]+.gem')
+class RubyGemsPackageArchiveMetadataVisitor(NonPersistentHttpVisitor):
+    """
+    Fetch a Rubygems gem archive, extract it and return its metadata file content.
+    """
+
+    def dumps(self, content):
+        return get_gem_metadata(content)
+
+
+def get_gem_metadata(location):
+    """
+    Return the metadata file content as a string extracted from the gem archive
+    at `location`.
+    """
+    # Extract the compressed file first.
+    extracted_location = extract_file(location)
+    metadata_gz = os.path.join(extracted_location, 'metadata.gz')
+    # Extract the embedded metadata gz file
+    extract_parent_location = extract_file(metadata_gz)
+    # Get the first file in the extracted folder which is the meta file location
+    meta_extracted_file = os.path.join(extract_parent_location, os.listdir(extract_parent_location)[0])
+    with open(meta_extracted_file) as meta_file:
+        return meta_file.read()
+
+
 @map_router.route('https*://rubygems\.org/api/v1/versions/[\w\-\.]+.json')
 class RubyGemsApiVersionsJsonMapper(Mapper):
     """
diff --git a/minecode/mappers/sourceforge.py b/minecode/miners/sourceforge.py
similarity index 55%
rename from minecode/mappers/sourceforge.py
rename to minecode/miners/sourceforge.py
index 887e6748..3f7deac5 100644
--- a/minecode/mappers/sourceforge.py
+++ b/minecode/miners/sourceforge.py
@@ -7,12 +7,89 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
 
-import json
+import logging
+import re
+
+from bs4 import BeautifulSoup
 
 from packagedcode import models as scan_models
+from packageurl import PackageURL
 
+from minecode import seed
 from minecode import map_router
-from minecode.mappers import Mapper
+from minecode import visit_router
+from minecode.miners import Mapper
+from minecode.miners import HttpJsonVisitor
+from minecode.miners import HttpVisitor
+from minecode.miners import NonPersistentHttpVisitor
+from minecode.miners import URI
+
+
+logger = logging.getLogger(__name__)
+handler = logging.StreamHandler()
+logger.addHandler(handler)
+logger.setLevel(logging.INFO)
+
+
+class SourceforgeSeed(seed.Seeder):
+
+    def get_seeds(self):
+        yield 'https://sourceforge.net/sitemap.xml'
+
+
+@visit_router.route('https?://sourceforge.net/sitemap.xml')
+class SourceforgeSitemapIndexVisitor(NonPersistentHttpVisitor):
+    """
+    Collect sub-sitemaps from the main sitemap. Return one URI for each sub-
Return on URI for each sub- + sitemap, for example: https://sourceforge.net/sitemap-167.xml + + Note that the class implements from NonPersistentHttpVisitor instead of HttpVisitor, + as the XML file itself will be over 100M big, so NonPersistentHttpVisitor will be more + reasonable. + """ + + def get_uris(self, content): + """ + Collect all the sitemaps URIs from master sitemap. + """ + locs = BeautifulSoup(open(content), 'lxml').find_all('loc') + # Content passing from NonPersistentHttpVisitor is a temp file path + # instead of file content, so opening to get a file handler is + # necessary. + for loc in locs: + yield URI(uri=loc.text, source_uri=self.uri) + + +@visit_router.route('https?://sourceforge.net/sitemap-\d+.xml') +class SourceforgeSitemapPageVisitor(HttpVisitor): + + def get_uris(self, content): + """ + Collect all the projects URIs from a sub-sitemaps. + """ + sitemap_locs = BeautifulSoup(content, 'lxml').find_all('loc') + regex = re.compile( + r"^https?://sourceforge.net/projects/[a-z0-9.-]+/?$") + for loc in sitemap_locs: + if loc.text and re.match(regex, loc.text): + project_json_baseurl = 'https://sourceforge.net/api/project/name/{}/json' + project_name = loc.text.partition( + 'https://sourceforge.net/projects/')[-1].strip('/') + project_json_url = project_json_baseurl.format(project_name) + package_url = PackageURL(type='sourceforge', name=project_name).to_string() + # The priority in the xml has different view with the priority in visitor, so skip it. + yield URI(uri=project_json_url, package_url=package_url, source_uri=self.uri) + + +@visit_router.route('https?://sourceforge.net/api/project/name/[a-z0-9.-]+/json', + 'https?://sourceforge.net/rest/p/[a-z0-9.-]+' + ) +class SourceforgeProjectJsonVisitor(HttpJsonVisitor): + """ + Collect Sourceforge project data through the JSON API. + The implementation is empty since it will inherit the implementation from HttpJsonVisitor and it returns json data for mapper. + """ + pass @map_router.route('https?://sourceforge.net/api/project/name/[a-z0-9.-]+/json', diff --git a/minecode/visitors/ubuntu.py b/minecode/miners/ubuntu.py similarity index 100% rename from minecode/visitors/ubuntu.py rename to minecode/miners/ubuntu.py diff --git a/minecode/models.py b/minecode/models.py index 1e64d99c..3163cbfa 100644 --- a/minecode/models.py +++ b/minecode/models.py @@ -22,9 +22,8 @@ from minecode import visit_router # UnusedImport here! 
-# But importing the mappers and visitors module triggers routes registration -from minecode import mappers # NOQA -from minecode import visitors # NOQA +# But importing the miners module triggers routes registration +from minecode import miners # NOQA from packagedb.models import Package diff --git a/minecode/tests/test_apache.py b/minecode/tests/test_apache.py index 306c08bb..63758a46 100644 --- a/minecode/tests/test_apache.py +++ b/minecode/tests/test_apache.py @@ -16,10 +16,10 @@ from mock import Mock from mock import patch -from minecode import mappers +from minecode import miners from minecode.utils_test import mocked_requests_get from minecode.utils_test import JsonBasedTesting -from minecode.visitors import apache +from minecode.miners import apache from minecode.tests import FIXTURES_REGEN @@ -133,7 +133,7 @@ class ApacheMapperTest(JsonBasedTesting, DjangoTestCase): test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') def test_build_package_from_download(self): - package = mappers.apache.build_package_from_download( + package = miners.apache.build_package_from_download( 'http://archive.apache.org/dist/groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip', 'pkg:apache/groovy@2.4.6') expected_loc = self.get_test_loc('apache/map-groovy_expected.json') @@ -141,7 +141,7 @@ def test_build_package_from_download(self): package.to_dict(), expected_loc, regen=FIXTURES_REGEN) def test_build_package_from_download2(self): - package = mappers.apache.build_package_from_download( + package = miners.apache.build_package_from_download( 'http://archive.apache.org/dist/turbine/maven/turbine-webapp-2.3.3-1.0.0-source-release.zip', 'pkg:apache/turbine-webapp@2.3.3-1.0.0-source-release') expected_loc = self.get_test_loc( @@ -153,9 +153,8 @@ def test_build_package_from_download2(self): def test_build_packages_from_projects_json(self): with open(self.get_test_loc('apache/projects.json')) as projectsjson_meta: - metadata = json.load( - projectsjson_meta, object_pairs_hook=OrderedDict) - packages = mappers.apache.build_packages_from_projects(metadata) + metadata = json.load(projectsjson_meta, object_pairs_hook=OrderedDict) + packages = miners.apache.build_packages_from_projects(metadata) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc('apache/projects_expected.json') @@ -165,8 +164,7 @@ def test_build_packages_from_projects_json(self): def test_build_packages_from_one_podling_json(self): with open(self.get_test_loc('apache/podling_amaterasu.json')) as podlings_meta: metadata = json.load(podlings_meta, object_pairs_hook=OrderedDict) - packages = mappers.apache.build_packages_from_podlings( - metadata, purl='pkg:apache-podlings/amaterasu') + packages = miners.apache.build_packages_from_podlings(metadata, purl='pkg:apache-podlings/amaterasu') packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( diff --git a/minecode/tests/test_bitbucket.py b/minecode/tests/test_bitbucket.py index 1284c4ac..c4e4b12a 100644 --- a/minecode/tests/test_bitbucket.py +++ b/minecode/tests/test_bitbucket.py @@ -18,12 +18,12 @@ from minecode.utils_test import mocked_requests_get from minecode.utils_test import JsonBasedTesting -from minecode.mappers.bitbucket import build_bitbucket_download_packages -from minecode.mappers.bitbucket import build_bitbucket_repo_package +from minecode.miners.bitbucket import build_bitbucket_download_packages +from minecode.miners.bitbucket import build_bitbucket_repo_package -from minecode.visitors.bitbucket import 
BitbucketDetailsVisitorPaginated -from minecode.visitors.bitbucket import BitbucketIndexVisitor -from minecode.visitors.bitbucket import BitbucketSingleRepoVisitor +from minecode.miners.bitbucket import BitbucketDetailsVisitorPaginated +from minecode.miners.bitbucket import BitbucketIndexVisitor +from minecode.miners.bitbucket import BitbucketSingleRepoVisitor from minecode.tests import FIXTURES_REGEN diff --git a/minecode/tests/test_bower.py b/minecode/tests/test_bower.py index 69aa33c5..9c852bdc 100644 --- a/minecode/tests/test_bower.py +++ b/minecode/tests/test_bower.py @@ -17,9 +17,9 @@ from minecode.utils_test import mocked_requests_get from minecode.utils_test import JsonBasedTesting -from minecode.visitors import bower +from minecode.miners import bower from minecode.tests import FIXTURES_REGEN -from minecode import mappers +from minecode import miners class BowerVistorTest(JsonBasedTesting): @@ -51,7 +51,7 @@ class BowerMapperTest(JsonBasedTesting): def test_build_packages_metafile_from_bowerjson1(self): with open(self.get_test_loc('bower/28msec_bower.json')) as bower_metadata: metadata = bower_metadata.read() - result = mappers.bower.build_packages_from_jsonfile( + result = miners.bower.build_packages_from_jsonfile( metadata, 'https://raw.githubusercontent.com/28msec/28.io-angularjs/master/bower.json', 'pkg:bower/1140-grid') result = [p.to_dict() for p in result] expected_loc = self.get_test_loc('bower/expected_28msec_bower.json') @@ -60,7 +60,7 @@ def test_build_packages_metafile_from_bowerjson1(self): def test_build_packages_metafile_from_bowerjson2(self): with open(self.get_test_loc('bower/example1_bower.json')) as bower_metadata: metadata = bower_metadata.read() - result = mappers.bower.build_packages_from_jsonfile( + result = miners.bower.build_packages_from_jsonfile( metadata, 'https://coding.net/u/QiaoButang/p/jquery.easing-qbt/git/raw/master/bower.json', 'pkg:bower/1140-grid') result = [p.to_dict() for p in result] expected_loc = self.get_test_loc( diff --git a/minecode/tests/test_conan.py b/minecode/tests/test_conan.py index 61faa8e7..db6469eb 100644 --- a/minecode/tests/test_conan.py +++ b/minecode/tests/test_conan.py @@ -17,7 +17,7 @@ import packagedb from minecode.utils_test import JsonBasedTesting -from minecode.visitors import conan +from minecode.miners import conan class ConanPriorityQueueTests(JsonBasedTesting, TestCase): @@ -91,7 +91,7 @@ def test_get_download_info(self): self.assertEqual(result_download_url, expected_zlib_download_url) self.assertEqual(result_sha256, expected_zlib_sha256) - @patch("minecode.visitors.conan.get_conan_recipe") + @patch("minecode.miners.conan.get_conan_recipe") def test_map_conan_package(self, mock_get_conan_recipe): mock_get_conan_recipe.return_value = ( self.zlib_conanfile_contents, diff --git a/minecode/tests/test_cpan.py b/minecode/tests/test_cpan.py index 4ebeb281..0c9e22af 100644 --- a/minecode/tests/test_cpan.py +++ b/minecode/tests/test_cpan.py @@ -17,8 +17,8 @@ from minecode.utils_test import mocked_requests_get from minecode.utils_test import JsonBasedTesting -from minecode import mappers -from minecode.visitors import cpan +from minecode import miners +from minecode.miners import cpan from minecode.tests import FIXTURES_REGEN @@ -82,7 +82,7 @@ class CpanMapperTest(JsonBasedTesting): def test_build_from_release_search_json(self): with open(self.get_test_loc('cpan/release_search.json')) as cpan_metadata: metadata = cpan_metadata.read() - packages = mappers.cpan.build_packages_from_release_json( + packages = 
miners.cpan.build_packages_from_release_json( metadata, 'https://fastapi.metacpan.org/release/_search?q=author:ABERNDT&size=5000') packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc('cpan/expected_release_search.json') @@ -92,7 +92,7 @@ def test_build_from_release_search_json(self): def test_build_from_release_search_json2(self): with open(self.get_test_loc('cpan/MIYAGAWA_author_release_search.json')) as cpan_metadata: metadata = cpan_metadata.read() - packages = mappers.cpan.build_packages_from_release_json( + packages = miners.cpan.build_packages_from_release_json( metadata, 'https://fastapi.metacpan.org/release/_search?q=author:MIYAGAWA&size=5000') packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( @@ -103,7 +103,7 @@ def test_build_from_release_search_json2(self): def test_build_packages_metafile_from_yaml(self): with open(self.get_test_loc('cpan/variable-2009110702.meta')) as cpan_metadata: metadata = cpan_metadata.read() - packages = mappers.cpan.build_packages_from_metafile( + packages = miners.cpan.build_packages_from_metafile( metadata, 'http://www.cpan.org/authors/id/A/AB/ABIGAIL/variable-2009110702.metadata', 'pkg:cpan/variable@2009110702') packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc('cpan/expected_yaml_cpanmapper.json') @@ -113,7 +113,7 @@ def test_build_packages_metafile_from_yaml(self): def test_build_packages_metafile_from_json(self): with open(self.get_test_loc('cpan/Regexp-Common-2016010701.meta')) as cpan_metadata: metadata = cpan_metadata.read() - packages = mappers.cpan.build_packages_from_metafile( + packages = miners.cpan.build_packages_from_metafile( metadata, 'http://www.cpan.org/authors/id/A/AB/ABIGAIL/Regexp-Common-2016010701.metadata', 'pkg:cpan/Regexp-Common@2016010701') packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc('cpan/expected_json_cpanmapper.json') @@ -127,7 +127,7 @@ def test_build_packages_readme_from_json(self): with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = cpan.CpanReadmeVisitors(uri) - packages = mappers.cpan.build_packages_from_readmefile( + packages = miners.cpan.build_packages_from_readmefile( data, 'http://www.cpan.org/authors/id/A/AM/AMIRITE/Mojolicious-Plugin-Nour-Config-0.09.readme', 'pkg:cpan/Mojolicious-Plugin-Nour-Config@0.09') packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( @@ -142,7 +142,7 @@ def test_build_packages_readme_from_json2(self): with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = cpan.CpanReadmeVisitors(uri) - packages = mappers.cpan.build_packages_from_readmefile( + packages = miners.cpan.build_packages_from_readmefile( data, 'http://www.cpan.org/authors/id/A/AB/ABIGAIL/Algorithm-Graphs-TransitiveClosure-2009110901.readme') packages = [p.to_dict() for p in packages] diff --git a/minecode/tests/test_cran.py b/minecode/tests/test_cran.py index 1f5e3ef8..a0a88bde 100644 --- a/minecode/tests/test_cran.py +++ b/minecode/tests/test_cran.py @@ -16,10 +16,10 @@ from minecode.utils_test import mocked_requests_get from minecode.utils_test import JsonBasedTesting -from minecode import mappers -from minecode.mappers.cran import get_download_url +from minecode import miners +from minecode.miners.cran import get_download_url from minecode.models import ResourceURI -from minecode.visitors import cran +from minecode.miners import cran from minecode.tests import 
FIXTURES_REGEN @@ -44,8 +44,7 @@ def test_build_packages_from_directory_listing(self): uri='https://cloud.r-project.org/web/packages/ANN2/index.html') with open(self.get_test_loc('cran/CRAN_Package_ANN2.html')) as html_metadata: metadata = html_metadata.read() - packages = mappers.cran.build_packages_from_html( - metadata, 'https://cloud.r-project.org/web/packages/ANN2/index.html', 'pkg:cran/ANN2') + packages = miners.cran.build_packages_from_html(metadata, 'https://cloud.r-project.org/web/packages/ANN2/index.html', 'pkg:cran/ANN2') packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc('cran/mapper_ANN2_expected.json') self.check_expected_results( @@ -56,8 +55,7 @@ def test_build_packages_from_directory_listing2(self): uri='https://cloud.r-project.org/web/packages/abe/index.html') with open(self.get_test_loc('cran/CRAN_Package_abe.html')) as html_metadata: metadata = html_metadata.read() - packages = mappers.cran.build_packages_from_html( - metadata, 'https://cloud.r-project.org/web/packages/abe/index.htm', 'pkg:cran/abe') + packages = miners.cran.build_packages_from_html(metadata, 'https://cloud.r-project.org/web/packages/abe/index.htm', 'pkg:cran/abe') packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc('cran/mapper_abe_expected.json') self.check_expected_results( diff --git a/minecode/tests/test_debian.py b/minecode/tests/test_debian.py index 996af85c..6b8a8415 100644 --- a/minecode/tests/test_debian.py +++ b/minecode/tests/test_debian.py @@ -21,8 +21,7 @@ from minecode.utils_test import JsonBasedTesting from minecode import debutils -from minecode.mappers import debian as debian_mapper -from minecode.visitors import debian as debian_visitor +from minecode.miners import debian from minecode.tests import FIXTURES_REGEN @@ -150,15 +149,14 @@ class DebianReleaseTest(BaseDebianTest): def test_parse_release(self): release_file = self.get_test_loc('debian/release/Release') - result = list(debian_visitor.parse_release(release_file)) + result = list(debian.parse_release(release_file)) expected_loc = self.get_test_loc('debian/release/Release_expected') self.check_expected_deb822(result, expected_loc) def test_parse_release_with_md5(self): release_file = self.get_test_loc('debian/release/Release_with_md5') - result = list(debian_visitor.parse_release(release_file)) - expected_loc = self.get_test_loc( - 'debian/release/Release_with_md5_expected') + result = list(debian.parse_release(release_file)) + expected_loc = self.get_test_loc('debian/release/Release_with_md5_expected') self.check_expected_deb822(result, expected_loc) @expectedFailure @@ -167,7 +165,7 @@ def test_visit_debian_release(self): test_loc = self.get_test_loc('debian/release/visited_Release') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = debian_visitor.DebianReleaseVisitor(uri) + _, data, _ = debian.DebianReleaseVisitor(uri) result = json.loads(data) release_file = self.get_test_loc( @@ -181,35 +179,29 @@ class DebianCopyrightTest(BaseDebianTest): @expectedFailure def test_parse_copyright_only_basic(self): copyright_file = self.get_test_loc('debian/copyright/basic_copyright') - copyrights = [ - info for info in debian_visitor.parse_copyright_only(copyright_file)] - self.assertTrue( - 'Copyright 1998 John Doe ' in copyrights) - self.assertTrue( - 'Copyright 1998 Jane Doe ' in copyrights) + copyrights = [info for info in debian.parse_copyright_only(copyright_file)] + self.assertTrue('Copyright 1998 John Doe ' in 
copyrights) + self.assertTrue('Copyright 1998 Jane Doe ' in copyrights) @expectedFailure def test_parse_copyright_only_with_incorrect_file(self): copyright_file = self.get_test_loc( 'debian/copyright/invalid_copyright') with self.assertRaises(Exception) as context: - [info for info in debian_visitor.parse_copyright_only( - copyright_file)] + [info for info in debian.parse_copyright_only(copyright_file)] self.assertTrue('no paragraphs in input' in context.exception) @expectedFailure def test_parse_copyright_only_with_incorrect_path(self): copyright_file = 'path_invalid' with self.assertRaises(Exception) as context: - [info for info in debian_visitor.parse_copyright_only( - copyright_file)] + [info for info in debian.parse_copyright_only(copyright_file)] self.assertTrue('No such file or directory' in context.exception) @expectedFailure def test_parse_copyright_allinfo_basic(self): copyright_file = self.get_test_loc('debian/copyright/basic_copyright') - copyright_data = [ - info for info in debian_visitor.parse_copyright_allinfo(copyright_file)] + copyright_data = [info for info in debian.parse_copyright_allinfo(copyright_file)] expected = [ {'files': (u'*',), 'license': u'GPL-2+', @@ -227,22 +219,20 @@ def test_parse_copyright_allinfo_with_invalid_file(self): copyright_file = self.get_test_loc( 'debian/copyright/invalid_copyright') with self.assertRaises(Exception) as context: - [info for info in debian_visitor.parse_copyright_allinfo( - copyright_file)] + [info for info in debian.parse_copyright_allinfo(copyright_file)] self.assertTrue('no paragraphs in input' in context.exception) @expectedFailure def test_parse_copyright_allinfo_with_incorrect_path(self): copyright_file = 'path_invalid' with self.assertRaises(Exception) as context: - [info for info in debian_visitor.parse_copyright_allinfo( - copyright_file)] + [info for info in debian.parse_copyright_allinfo(copyright_file)] self.assertTrue('No such file or directory' in context.exception) @expectedFailure def test_parse_license_basic(self): copyright_file = self.get_test_loc('debian/copyright/basic_copyright') - licenses, licensetexts = debian_visitor.parse_license(copyright_file) + licenses, licensetexts = debian.parse_license(copyright_file) expected = { 'GPL-2+': [ "This program is free software; you can redistribute it\n" @@ -270,14 +260,14 @@ def test_parse_license_with_invalid_file(self): copyright_file = self.get_test_loc( 'debian/copyright/invalid_copyright') with self.assertRaises(Exception) as context: - debian_visitor.parse_license(copyright_file) + debian.parse_license(copyright_file) self.assertTrue('no paragraphs in input' in context.exception) @expectedFailure def test_parse_license_with_incorrect_path(self): copyright_file = 'path_invalid' with self.assertRaises(Exception) as context: - debian_visitor.parse_license(copyright_file) + debian.parse_license(copyright_file) self.assertTrue('No such file or directory' in context.exception) @@ -285,21 +275,15 @@ class DebianSourcesTest(BaseDebianTest): def test_collect_source_packages(self): index_file = self.get_test_loc('debian/sources/debian_Sources') - source_info = [ - info for info in debian_visitor.collect_source_packages(index_file)] - expected_loc = self.get_test_loc( - 'debian/sources/debian_Sources_visit_expected') - self.check_objects_expected( - source_info, expected_loc, regen=FIXTURES_REGEN) + source_info = [info for info in debian.collect_source_packages(index_file)] + expected_loc = self.get_test_loc('debian/sources/debian_Sources_visit_expected') + 
self.check_objects_expected(source_info, expected_loc, regen=FIXTURES_REGEN) def test_collect_source_packages_ubuntu(self): index_file = self.get_test_loc('debian/sources/ubuntu_Sources') - source_info = [ - info for info in debian_visitor.collect_source_packages(index_file)] - expected_loc = self.get_test_loc( - 'debian/sources/ubuntu_Sources_visit_expected') - self.check_objects_expected( - source_info, expected_loc, regen=FIXTURES_REGEN) + source_info = [info for info in debian.collect_source_packages(index_file)] + expected_loc = self.get_test_loc('debian/sources/ubuntu_Sources_visit_expected') + self.check_objects_expected(source_info, expected_loc, regen=FIXTURES_REGEN) @expectedFailure def test_DebianSourcesVisitor(self): @@ -307,9 +291,8 @@ def test_DebianSourcesVisitor(self): test_loc = self.get_test_loc('debian/sources/Sources.gz') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = debian_visitor.DebianSourcesVisitor(uri) - expected_loc = self.get_test_loc( - 'debian/sources/Sources.gz-expected.json') + uris, _, _ = debian.DebianSourcesVisitor(uri) + expected_loc = self.get_test_loc('debian/sources/Sources.gz-expected.json') self.check_expected_uris(list(uris), expected_loc) @expectedFailure @@ -318,13 +301,13 @@ def test_DebianSourcesVisitor_with_invalid_file(self): test_loc = self.get_test_loc('debian/invalid_files/ls-lR.gz') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _ = debian_visitor.DebianSourcesVisitor(uri) + uris, _data, _ = debian.DebianSourcesVisitor(uri) self.assertEqual(0, len(list(uris))) @expectedFailure def test_build_source_file_packages(self): with open(self.get_test_loc('debian/sources/debian_Sources')) as packs: - packages = debian_mapper.build_source_file_packages(packs.read()) + packages = debian.build_source_file_packages(packs.read()) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( 'debian/sources/debian_Sources_mapped-expected-packages.json') @@ -335,17 +318,14 @@ class DebianPackagesTest(BaseDebianTest): def test_parse_packages_index(self): index_file = self.get_test_loc('debian/packages/debian_Packages') - package_info = [ - info for info in debian_visitor.parse_packages_index(index_file)] - expected_loc = self.get_test_loc( - 'debian/packages/debian_Packages-visit-expected.json') - self.check_objects_expected( - package_info, expected_loc, regen=FIXTURES_REGEN) + package_info = [info for info in debian.parse_packages_index(index_file)] + expected_loc = self.get_test_loc('debian/packages/debian_Packages-visit-expected.json') + self.check_objects_expected(package_info, expected_loc, regen=FIXTURES_REGEN) @expectedFailure def test_parse_packages_from_debian_Packages(self): with open(self.get_test_loc('debian/packages/debian_Packages')) as packs: - packages = debian_mapper.parse_packages(packs.read()) + packages = debian.parse_packages(packs.read()) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( 'debian/packages/debian_Packages-expected.json') @@ -355,7 +335,7 @@ def test_parse_packages_from_debian_Packages(self): @expectedFailure def test_parse_packages_from_ubuntu_Packages(self): with open(self.get_test_loc('debian/packages/ubuntu_Packages')) as packs: - packages = debian_mapper.parse_packages(packs.read()) + packages = debian.parse_packages(packs.read()) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( 
         'debian/packages/ubuntu_Packages-expected.json')
@@ -365,7 +345,7 @@ def test_parse_packages_from_ubuntu_Packages(self):
     @expectedFailure
     def test_parse_packages_from_installed_status(self):
         with open(self.get_test_loc('debian/status/simple_status')) as packs:
-            packages = debian_mapper.parse_packages(packs.read())
+            packages = debian.parse_packages(packs.read())
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'debian/packages/ubuntu_Packages-expected.json')
@@ -380,11 +360,9 @@ def test_DebianDirectoryIndexVisitor_from_debian(self):
         test_loc = self.get_test_loc('debian/lslr/ls-lR_debian')
         temp_gz_location = self.get_tmp_gz_file(test_loc)
         with patch('requests.get') as mock_http_get:
-            mock_http_get.return_value = mocked_requests_get(
-                uri, temp_gz_location)
-            uris, _, _ = debian_visitor.DebianDirectoryIndexVisitor(uri)
-        expected_loc = self.get_test_loc(
-            'debian/lslr/ls-lR_debian.gz-expected.json')
+            mock_http_get.return_value = mocked_requests_get(uri, temp_gz_location)
+            uris, _, _ = debian.DebianDirectoryIndexVisitor(uri)
+        expected_loc = self.get_test_loc('debian/lslr/ls-lR_debian.gz-expected.json')
         self.check_expected_uris(list(uris), expected_loc)

     def test_DebianDirectoryIndexVisitor_from_ubuntu(self):
@@ -392,9 +370,8 @@
         test_loc = self.get_test_loc('debian/lslr/ls-lR_ubuntu')
         temp_gz_location = self.get_tmp_gz_file(test_loc)
         with patch('requests.get') as mock_http_get:
-            mock_http_get.return_value = mocked_requests_get(
-                uri, temp_gz_location)
-            uris, _, _ = debian_visitor.DebianDirectoryIndexVisitor(uri)
+            mock_http_get.return_value = mocked_requests_get(uri, temp_gz_location)
+            uris, _, _ = debian.DebianDirectoryIndexVisitor(uri)
         expected_loc = self.get_test_loc(
             'debian/lslr/ls-lR_ubuntu.gz-expected.json')
         self.check_expected_uris(list(uris), expected_loc)
@@ -408,7 +385,7 @@ def test_DebianDescriptionVisitor(self):
         test_loc = self.get_test_loc('debian/dsc/7kaa_2.14.3-1.dsc')
         with patch('requests.get') as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            _, data, _ = debian_visitor.DebianDescriptionVisitor(uri)
+            _, data, _ = debian.DebianDescriptionVisitor(uri)
         result = json.loads(data)
         dsc_file = self.get_test_loc('debian/dsc/description_expected.json')
         self.check_expected_deb822(result, dsc_file)
@@ -417,7 +394,7 @@ def test_parse_description(self):
         with open(self.get_test_loc('debian/dsc/description.json')) as debian_description_meta:
             metadata = json.load(debian_description_meta)
-            packages = debian_mapper.parse_description(metadata)
+            packages = debian.parse_description(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'debian/dsc/description-expected.json')
@@ -434,7 +411,7 @@ def test_get_dependencies(self):
             'build3': 'buildnot',
         }
         keys = ['build1', 'build2']
-        result = debian_mapper.get_dependencies(test, keys)
+        result = debian.get_dependencies(test, keys)
         self.assertEqual(2, len(result))
         self.assertEqual('build', result[0].purl)
         self.assertEqual(None, result[0].requirement)
@@ -442,7 +419,6 @@
         self.assertEqual(None, result[1].requirement)

     def test_get_programming_language(self):
-        tags = ['role::program', 'implemented-in::perl',
-                'use::converting', 'works-with::pim']
-        result = debian_mapper.get_programming_language(tags)
+        tags = ['role::program', 'implemented-in::perl', 'use::converting', 'works-with::pim']
+        result = debian.get_programming_language(tags)
         self.assertEqual('perl', result)
diff --git a/minecode/tests/test_dockerhub.py b/minecode/tests/test_dockerhub.py
index 66e48ec0..ef90145f 100644
--- a/minecode/tests/test_dockerhub.py
+++ b/minecode/tests/test_dockerhub.py
@@ -18,9 +18,9 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode.visitors import dockerhub
+from minecode.miners import dockerhub
 from minecode.tests import FIXTURES_REGEN
-from minecode import mappers
+from minecode import miners


 class DockerHubTest(JsonBasedTesting):
@@ -80,7 +80,7 @@ class DockerHubMapperTest(DockerHubTest):
     def test_build_packages_fromjson(self):
         with open(self.get_test_loc('dockerhub/elixir.json')) as dockerhub_metadata:
             metadata = dockerhub_metadata.read()
-        packages = mappers.dockerhub.build_packages_from_jsonfile(
+        packages = miners.dockerhub.build_packages_from_jsonfile(
             metadata, 'https://registry.hub.docker.com/v2/repositories/library')
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
diff --git a/minecode/tests/test_eclipse.py b/minecode/tests/test_eclipse.py
index db744de9..35f7a8b4 100644
--- a/minecode/tests/test_eclipse.py
+++ b/minecode/tests/test_eclipse.py
@@ -18,9 +18,9 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode import mappers
-from minecode.visitors import URI
-from minecode.visitors import eclipse
+from minecode import miners
+from minecode.miners import URI
+from minecode.miners import eclipse
 from minecode.tests import FIXTURES_REGEN


@@ -106,7 +106,7 @@ class TestEclipseMap(JsonBasedTesting):
     def test_build_packages(self):
         with open(self.get_test_loc('eclipse/birt.json')) as eclipse_metadata:
             metadata = json.load(eclipse_metadata)
-        packages = mappers.eclipse.build_packages_with_json(metadata)
+        packages = miners.eclipse.build_packages_with_json(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('eclipse/eclipse_birt_expected.json')
         self.check_expected_results(
@@ -115,7 +115,7 @@ def test_build_packages(self):
     def test_build_eclipse_html_packages(self):
         with open(self.get_test_loc('eclipse/Acceleo_projects.eclipse.org.html')) as eclipse_metadata:
             metadata = eclipse_metadata.read()
-        packages = mappers.eclipse.build_packages(metadata)
+        packages = miners.eclipse.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'eclipse/Acceleo_projects_expected.json')
diff --git a/minecode/tests/test_fdroid.py b/minecode/tests/test_fdroid.py
index 021b5f53..4611d3bc 100644
--- a/minecode/tests/test_fdroid.py
+++ b/minecode/tests/test_fdroid.py
@@ -15,9 +15,8 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode.mappers import fdroid as fdroid_mapper
-from minecode.visitors import fdroid as fdroid_visitor
-from minecode.visitors import URI
+from minecode.miners import fdroid
+from minecode.miners import URI
 from minecode.tests import FIXTURES_REGEN


@@ -29,7 +28,7 @@ def test_FdroidPackageRepoVisitor(self):
         test_loc = self.get_test_loc('fdroid/index-v2.json')
         with patch('requests.get') as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            uris, data, _errors = fdroid_visitor.FdroidIndexVisitor(uri)
+            uris, data, _errors = fdroid.FdroidIndexVisitor(uri)

         # this is a non-persistent visitor, lets make sure we dont return any data
         assert not data
@@ -49,7 +48,7 @@ def test_build_packages(self):
         packages = []
         for purl, data in purl_data:
-            pkgs = list(fdroid_mapper.build_packages(purl, data))
+            pkgs = list(fdroid.build_packages(purl, data))
             packages.extend(pkgs)

         packages = [p.to_dict() for p in packages]
diff --git a/minecode/tests/test_freebsd.py b/minecode/tests/test_freebsd.py
index 9f3aaa00..d28b5539 100644
--- a/minecode/tests/test_freebsd.py
+++ b/minecode/tests/test_freebsd.py
@@ -17,8 +17,8 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode import mappers
-from minecode.visitors import freebsd
+from minecode import miners
+from minecode.miners import freebsd
 from minecode.tests import FIXTURES_REGEN


@@ -60,7 +60,7 @@ class FreedesktopMapperTest(JsonBasedTesting):
     def test_map_index_file(self):
         with open(self.get_test_loc('freebsd/mapper_input1')) as freebsd_metadata:
             metadata = freebsd_metadata.read()
-        packages = mappers.freebsd.build_packages(metadata)
+        packages = miners.freebsd.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'freebsd/indexfile_expected_mapper.json')
diff --git a/minecode/tests/test_freedesktop.py b/minecode/tests/test_freedesktop.py
index 278c3da3..c9a90878 100644
--- a/minecode/tests/test_freedesktop.py
+++ b/minecode/tests/test_freedesktop.py
@@ -15,8 +15,8 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode import mappers
-from minecode.visitors import freedesktop
+from minecode import miners
+from minecode.miners import freedesktop
 from minecode.tests import FIXTURES_REGEN


@@ -43,7 +43,7 @@ class FreedesktopMapperTest(FreedesktopTest):
     def test_map_software_html_page_hal(self):
         with open(self.get_test_loc('freedesktop/hal.html')) as freedesktop_metadata:
             metadata = freedesktop_metadata.read()
-        packages = mappers.freedesktop.build_packages(
+        packages = miners.freedesktop.build_packages(
             metadata,
             'https://www.freedesktop.org/wiki/Software/hal',
             purl='pkg:freedesktop/hal')
@@ -56,7 +56,7 @@ def test_map_software_html_page_hal(self):
     def test_map_software_html_page_libinput(self):
         with open(self.get_test_loc('freedesktop/libinput.html')) as freedesktop_metadata:
             metadata = freedesktop_metadata.read()
-        packages = mappers.freedesktop.build_packages(
+        packages = miners.freedesktop.build_packages(
             metadata,
             'https://www.freedesktop.org/wiki/Software/libinput/',
             purl='pkg:freedesktop/libinput')
diff --git a/minecode/tests/test_generic.py b/minecode/tests/test_generic.py
index 69300c86..cf573d5f 100644
--- a/minecode/tests/test_generic.py
+++ b/minecode/tests/test_generic.py
@@ -13,7 +13,7 @@
 from minecode.route import NoRouteAvailable
 from minecode.utils_test import JsonBasedTesting
-from minecode.visitors import generic
+from minecode.miners import generic

 from packagedb.models import Package

diff --git a/minecode/tests/test_github.py b/minecode/tests/test_github.py
index b673a408..31ac7938 100644
--- a/minecode/tests/test_github.py
+++ b/minecode/tests/test_github.py
@@ -21,8 +21,8 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode import mappers
-from minecode.visitors import github
+from minecode import miners
+from minecode.miners import github
 from minecode.tests import FIXTURES_REGEN


@@ -147,8 +147,7 @@ class GithubMapperTest(JsonBasedTesting):
     def test_github_repo_mapper1(self):
         with open(self.get_test_loc('github/calendar_builder.json')) as json_metadata:
             metadata = json_metadata.read()
-        packages = mappers.github.build_github_packages(
-            metadata, 'https://api.github.com/repos/collectiveidea/calendar_builder')
+        packages = miners.github.build_github_packages(metadata, 'https://api.github.com/repos/collectiveidea/calendar_builder')
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'github/mapper_calendar_builder_expected.json')
@@ -158,8 +157,7 @@ def test_github_repo_mapper2(self):
         with open(self.get_test_loc('github/mojombo_grit_from_visitor_4mapper_input.json')) as json_metadata:
             metadata = json_metadata.read()
-        packages = mappers.github.build_github_packages(
-            metadata, 'https://api.github.com/repos/mojombo/grit')
+        packages = miners.github.build_github_packages(metadata, 'https://api.github.com/repos/mojombo/grit')
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'github/mojombo_grit_result_mapper_expected.json')
diff --git a/minecode/tests/test_gitlab.py b/minecode/tests/test_gitlab.py
index 5450f0a6..6f01ad1f 100644
--- a/minecode/tests/test_gitlab.py
+++ b/minecode/tests/test_gitlab.py
@@ -16,9 +16,9 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode.visitors import gitlab
+from minecode.miners import gitlab
 from minecode.tests import FIXTURES_REGEN
-from minecode import mappers
+from minecode import miners


 class GitlabTest(JsonBasedTesting):
@@ -52,7 +52,7 @@ class GitlabMapperTest(JsonBasedTesting):
     def test_map_software_html_page_hal(self):
         with open(self.get_test_loc('gitlab/microservice-express-mongo.json')) as gitlab_json:
             metadata = gitlab_json.read()
-        packages = mappers.gitlab.build_packages_from_json(metadata)
+        packages = miners.gitlab.build_packages_from_json(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'gitlab/microservice-express-mongo_expected.json')
diff --git a/minecode/tests/test_gnu.py b/minecode/tests/test_gnu.py
index db9d7249..39fc3220 100644
--- a/minecode/tests/test_gnu.py
+++ b/minecode/tests/test_gnu.py
@@ -14,7 +14,7 @@
 from mock import patch

 from minecode.utils_test import JsonBasedTesting
-from minecode.visitors import gnu
+from minecode.miners import gnu

 from packagedb.models import Package

diff --git a/minecode/tests/test_golang.py b/minecode/tests/test_golang.py
index d63be206..0a490f45 100644
--- a/minecode/tests/test_golang.py
+++ b/minecode/tests/test_golang.py
@@ -18,10 +18,10 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode.visitors.golang import GodocIndexVisitor
-from minecode.visitors.golang import GodocSearchVisitor
-from minecode.visitors.golang import parse_package_path
-from minecode.mappers.golang import build_golang_package
+from minecode.miners.golang import GodocIndexVisitor
+from minecode.miners.golang import GodocSearchVisitor
+from minecode.miners.golang import parse_package_path
+from minecode.miners.golang import build_golang_package
 from minecode.tests import FIXTURES_REGEN


diff --git a/minecode/tests/test_googlecode.py b/minecode/tests/test_googlecode.py
index ae0f974b..4f6b0743 100644
--- a/minecode/tests/test_googlecode.py
+++ b/minecode/tests/test_googlecode.py
@@ -17,9 +17,9 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode import mappers
-from minecode.visitors import URI
-from minecode.visitors import googlecode
+from minecode import miners
+from minecode.miners import URI
+from minecode.miners import googlecode
 from minecode.tests import FIXTURES_REGEN


@@ -95,8 +95,7 @@ class GoogleNewAPIMappersTest(JsonBasedTesting):
     def test_build_packages_from_v2_projects_json(self):
         with open(self.get_test_loc('googlecode/v2_api/project.json')) as projectsjson_meta:
             metadata = json.load(projectsjson_meta)
-        packages = mappers.googlecode.build_packages_from_projectsjson_v2(
-            metadata)
+        packages = miners.googlecode.build_packages_from_projectsjson_v2(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'googlecode/v2_api/package_expected_project.json')
@@ -106,8 +105,7 @@ def test_build_packages_from_v2_projects_json(self):
     def test_build_packages_from_v1_projects_json(self):
         with open(self.get_test_loc('googlecode/v2_apache-extras.org_124799961-qian_project.json')) as projectsjson_meta:
             metadata = json.load(projectsjson_meta)
-        packages = mappers.googlecode.build_packages_from_projectsjson_v1(
-            metadata)
+        packages = miners.googlecode.build_packages_from_projectsjson_v1(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'googlecode/mapper_expected_v2_apache-extras.org_124799961-qian_project.json')
diff --git a/minecode/tests/test_gstreamer.py b/minecode/tests/test_gstreamer.py
index 4c1fcd48..1e64f783 100644
--- a/minecode/tests/test_gstreamer.py
+++ b/minecode/tests/test_gstreamer.py
@@ -16,9 +16,9 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode.visitors import gstreamer
+from minecode.miners import gstreamer
 from minecode.tests import FIXTURES_REGEN
-from minecode import mappers
+from minecode import miners


 class GstreamerVistorTest(JsonBasedTesting):
@@ -55,8 +55,7 @@ def test_build_package_mapper_regex(self):
         self.assertTrue(result)

     def test_build_package_from_url(self):
-        packages = mappers.gstreamer.build_package_from_url(
-            'https://gstreamer.freedesktop.org/src/gst-openmax/pre/gst-openmax-0.10.0.2.tar.bz2')
+        packages = miners.gstreamer.build_package_from_url('https://gstreamer.freedesktop.org/src/gst-openmax/pre/gst-openmax-0.10.0.2.tar.bz2')
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'gstreamer/mapper_build_from_url-expected')
diff --git a/minecode/tests/test_haxe.py b/minecode/tests/test_haxe.py
index 0c27d4c0..2158fc73 100644
--- a/minecode/tests/test_haxe.py
+++ b/minecode/tests/test_haxe.py
@@ -16,8 +16,8 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode import mappers
-from minecode.visitors import haxe
+from minecode import miners
+from minecode.miners import haxe
 from minecode.tests import FIXTURES_REGEN


@@ -60,7 +60,7 @@ class HaxeMappersTest(JsonBasedTesting):
     def test_build_project_package_json(self):
         with open(self.get_test_loc('haxe/project_package.json')) as projectsjson_meta:
             metadata = json.load(projectsjson_meta)
-        packages = mappers.haxe.build_packages_with_json(metadata)
+        packages = miners.haxe.build_packages_with_json(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('haxe/project_package.json-expected')
         self.check_expected_results(
diff --git a/minecode/tests/test_maven.py b/minecode/tests/test_maven.py
index 8518b4bd..ce9b03d2 100644
--- a/minecode/tests/test_maven.py
+++ b/minecode/tests/test_maven.py
@@ -19,12 +19,12 @@
 from minecode.management.commands.run_map import map_uri
 from minecode.management.commands.run_visit import visit_uri
-from minecode.mappers import maven as maven_mapper
+from minecode.miners import maven
 from minecode.models import ResourceURI
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting
 from minecode.utils_test import model_to_dict
-from minecode.visitors import maven as maven_visitor
+from minecode.miners import maven as maven_visitor
 from minecode.tests import FIXTURES_REGEN

 import packagedb
@@ -580,11 +580,9 @@ def test_visit_and_build_package_from_pom_axis(self):
         with patch('requests.get') as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _ = maven_visitor.MavenPOMVisitor(uri)
-            package = maven_mapper.get_package(data).to_dict()
-            expected_loc = self.get_test_loc(
-                'maven/mapper/axis-1.4.pom.package.json')
-            self.check_expected_results(
-                package, expected_loc, regen=FIXTURES_REGEN)
+            package = maven.get_package(data).to_dict()
+            expected_loc = self.get_test_loc('maven/mapper/axis-1.4.pom.package.json')
+            self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_visit_and_build_package_from_pom_commons_pool(self):
         uri = 'https://repo1.maven.org/maven2/commons-pool/commons-pool/1.5.7/commons-pool-1.5.7.pom'
@@ -592,11 +590,9 @@
         with patch('requests.get') as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _ = maven_visitor.MavenPOMVisitor(uri)
-            package = maven_mapper.get_package(data).to_dict()
-            expected_loc = self.get_test_loc(
-                'maven/mapper/commons-pool-1.5.7.pom.package.json')
-            self.check_expected_results(
-                package, expected_loc, regen=FIXTURES_REGEN)
+            package = maven.get_package(data).to_dict()
+            expected_loc = self.get_test_loc('maven/mapper/commons-pool-1.5.7.pom.package.json')
+            self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_visit_and_build_package_from_pom_struts(self):
         uri = 'https://repo1.maven.org/maven2/struts-menu/struts-menu/2.4.2/struts-menu-2.4.2.pom'
@@ -604,11 +600,9 @@
         with patch('requests.get') as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _ = maven_visitor.MavenPOMVisitor(uri)
-            package = maven_mapper.get_package(data).to_dict()
-            expected_loc = self.get_test_loc(
-                'maven/mapper/struts-menu-2.4.2.pom.package.json')
-            self.check_expected_results(
-                package, expected_loc, regen=FIXTURES_REGEN)
+            package = maven.get_package(data).to_dict()
+            expected_loc = self.get_test_loc('maven/mapper/struts-menu-2.4.2.pom.package.json')
+            self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_visit_and_build_package_from_pom_mysql(self):
         uri = 'https://repo1.maven.org/maven2/mysql/mysql-connector-java/5.1.27/mysql-connector-java-5.1.27.pom'
@@ -617,11 +611,9 @@
         with patch('requests.get') as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _ = maven_visitor.MavenPOMVisitor(uri)
-            package = maven_mapper.get_package(data).to_dict()
-            expected_loc = self.get_test_loc(
-                'maven/mapper/mysql-connector-java-5.1.27.pom.package.json')
-            self.check_expected_results(
-                package, expected_loc, regen=FIXTURES_REGEN)
+            package = maven.get_package(data).to_dict()
+            expected_loc = self.get_test_loc('maven/mapper/mysql-connector-java-5.1.27.pom.package.json')
+            self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_visit_and_build_package_from_pom_xbean(self):
         uri = 'https://repo1.maven.org/maven2/xbean/xbean-jmx/2.0/xbean-jmx-2.0.pom'
@@ -629,11 +621,9 @@
         with patch('requests.get') as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _ = maven_visitor.MavenPOMVisitor(uri)
-            package = maven_mapper.get_package(data).to_dict()
-            expected_loc = self.get_test_loc(
-                'maven/mapper/xbean-jmx-2.0.pom.package.json')
-            self.check_expected_results(
-                package, expected_loc, regen=FIXTURES_REGEN)
+            package = maven.get_package(data).to_dict()
+            expected_loc = self.get_test_loc('maven/mapper/xbean-jmx-2.0.pom.package.json')
+            self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_visit_and_build_package_from_pom_maven_all(self):
         uri = 'https://repo1.maven.org/maven2/date/yetao/maven/maven-all/1.0-RELEASE/maven-all-1.0-RELEASE.pom'
@@ -641,11 +631,9 @@
         with patch('requests.get') as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _ = maven_visitor.MavenPOMVisitor(uri)
-            package = maven_mapper.get_package(data).to_dict()
-            expected_loc = self.get_test_loc(
-                'maven/mapper/maven-all-1.0-RELEASE.pom.package.json')
-            self.check_expected_results(
-                package, expected_loc, regen=FIXTURES_REGEN)
+            package = maven.get_package(data).to_dict()
+            expected_loc = self.get_test_loc('maven/mapper/maven-all-1.0-RELEASE.pom.package.json')
+            self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_visit_and_build_package_from_pom_with_unicode(self):
         uri = 'https://repo1.maven.org/maven2/edu/psu/swe/commons/commons-jaxrs/1.21/commons-jaxrs-1.21.pom'
@@ -653,11 +641,9 @@
         with patch('requests.get') as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _ = maven_visitor.MavenPOMVisitor(uri)
-            package = maven_mapper.get_package(data).to_dict()
-            expected_loc = self.get_test_loc(
-                'maven/mapper/commons-jaxrs-1.21.pom.package.json')
-            self.check_expected_results(
-                package, expected_loc, regen=FIXTURES_REGEN)
+            package = maven.get_package(data).to_dict()
+            expected_loc = self.get_test_loc('maven/mapper/commons-jaxrs-1.21.pom.package.json')
+            self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)


 class MavenMapperGetPackageTest(JsonBasedTesting, DjangoTestCase):
@@ -666,127 +652,101 @@ class MavenMapperGetPackageTest(JsonBasedTesting, DjangoTestCase):
     def test_get_package_from_pom_1(self):
         test_loc = self.get_test_loc('maven/parsing/parse/jds-3.0.1.pom')
         data = open(test_loc).read()
-        package = maven_mapper.get_package(data).to_dict()
-        expected_loc = self.get_test_loc(
-            'maven/parsing/parse/jds-3.0.1.pom.package.json')
-        self.check_expected_results(
-            package, expected_loc, regen=FIXTURES_REGEN)
+        package = maven.get_package(data).to_dict()
+        expected_loc = self.get_test_loc('maven/parsing/parse/jds-3.0.1.pom.package.json')
+        self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_2(self):
         test_loc = self.get_test_loc(
             'maven/parsing/parse/springmvc-rest-docs-maven-plugin-1.0-RC1.pom')
         data = open(test_loc).read()
-        package = maven_mapper.get_package(data).to_dict()
-        expected_loc = self.get_test_loc(
-            'maven/parsing/parse/springmvc-rest-docs-maven-plugin-1.0-RC1.pom.package.json')
-        self.check_expected_results(
-            package, expected_loc, regen=FIXTURES_REGEN)
+        package = maven.get_package(data).to_dict()
+        expected_loc = self.get_test_loc('maven/parsing/parse/springmvc-rest-docs-maven-plugin-1.0-RC1.pom.package.json')
+        self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_3(self):
         test_loc = self.get_test_loc('maven/parsing/parse/jds-2.17.0718b.pom')
         data = open(test_loc).read()
-        package = maven_mapper.get_package(data).to_dict()
-        expected_loc = self.get_test_loc(
-            'maven/parsing/parse/jds-2.17.0718b.pom.package.json')
-        self.check_expected_results(
-            package, expected_loc, regen=FIXTURES_REGEN)
+        package = maven.get_package(data).to_dict()
+        expected_loc = self.get_test_loc('maven/parsing/parse/jds-2.17.0718b.pom.package.json')
+        self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_4(self):
         test_loc = self.get_test_loc(
             'maven/parsing/parse/maven-javanet-plugin-1.7.pom')
         data = open(test_loc).read()
-        package = maven_mapper.get_package(data).to_dict()
-        expected_loc = self.get_test_loc(
-            'maven/parsing/parse/maven-javanet-plugin-1.7.pom.package.json')
-        self.check_expected_results(
-            package, expected_loc, regen=FIXTURES_REGEN)
+        package = maven.get_package(data).to_dict()
+        expected_loc = self.get_test_loc('maven/parsing/parse/maven-javanet-plugin-1.7.pom.package.json')
+        self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_5(self):
         test_loc = self.get_test_loc('maven/parsing/loop/coreplugin-1.0.0.pom')
         data = open(test_loc).read()
-        package = maven_mapper.get_package(data).to_dict()
-        expected_loc = self.get_test_loc(
-            'maven/parsing/loop/coreplugin-1.0.0.pom.package.json')
-        self.check_expected_results(
-            package, expected_loc, regen=FIXTURES_REGEN)
+        package = maven.get_package(data).to_dict()
+        expected_loc = self.get_test_loc('maven/parsing/loop/coreplugin-1.0.0.pom.package.json')
+        self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_6(self):
         test_loc = self.get_test_loc(
             'maven/parsing/loop/argus-webservices-2.7.0.pom')
         data = open(test_loc).read()
-        package = maven_mapper.get_package(data).to_dict()
-        expected_loc = self.get_test_loc(
-            'maven/parsing/loop/argus-webservices-2.7.0.pom.package.json')
-        self.check_expected_results(
-            package, expected_loc, regen=FIXTURES_REGEN)
+        package = maven.get_package(data).to_dict()
+        expected_loc = self.get_test_loc('maven/parsing/loop/argus-webservices-2.7.0.pom.package.json')
+        self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_7(self):
         test_loc = self.get_test_loc('maven/parsing/loop/pkg-2.0.13.1005.pom')
         data = open(test_loc).read()
-        package = maven_mapper.get_package(data).to_dict()
-        expected_loc = self.get_test_loc(
-            'maven/parsing/loop/pkg-2.0.13.1005.pom.package.json')
-        self.check_expected_results(
-            package, expected_loc, regen=FIXTURES_REGEN)
+        package = maven.get_package(data).to_dict()
+        expected_loc = self.get_test_loc('maven/parsing/loop/pkg-2.0.13.1005.pom.package.json')
+        self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_8(self):
         test_loc = self.get_test_loc(
             'maven/parsing/loop/ojcms-beans-0.1-beta.pom')
         data = open(test_loc).read()
-        package = maven_mapper.get_package(data).to_dict()
-        expected_loc = self.get_test_loc(
-            'maven/parsing/loop/ojcms-beans-0.1-beta.pom.package.json')
-        self.check_expected_results(
-            package, expected_loc, regen=FIXTURES_REGEN)
+        package = maven.get_package(data).to_dict()
+        expected_loc = self.get_test_loc('maven/parsing/loop/ojcms-beans-0.1-beta.pom.package.json')
+        self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_9(self):
         test_loc = self.get_test_loc(
             'maven/parsing/loop/jacuzzi-annotations-0.2.1.pom')
         data = open(test_loc).read()
-        package = maven_mapper.get_package(data).to_dict()
-        expected_loc = self.get_test_loc(
-            'maven/parsing/loop/jacuzzi-annotations-0.2.1.pom.package.json')
-        self.check_expected_results(
-            package, expected_loc, regen=FIXTURES_REGEN)
+        package = maven.get_package(data).to_dict()
+        expected_loc = self.get_test_loc('maven/parsing/loop/jacuzzi-annotations-0.2.1.pom.package.json')
+        self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_10(self):
         test_loc = self.get_test_loc(
             'maven/parsing/loop/argus-webservices-2.8.0.pom')
         data = open(test_loc).read()
-        package = maven_mapper.get_package(data).to_dict()
-        expected_loc = self.get_test_loc(
-            'maven/parsing/loop/argus-webservices-2.8.0.pom.package.json')
-        self.check_expected_results(
-            package, expected_loc, regen=FIXTURES_REGEN)
+        package = maven.get_package(data).to_dict()
+        expected_loc = self.get_test_loc('maven/parsing/loop/argus-webservices-2.8.0.pom.package.json')
+        self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_11(self):
         test_loc = self.get_test_loc(
             'maven/parsing/loop/jacuzzi-database-0.2.1.pom')
         data = open(test_loc).read()
-        package = maven_mapper.get_package(data).to_dict()
-        expected_loc = self.get_test_loc(
-            'maven/parsing/loop/jacuzzi-database-0.2.1.pom.package.json')
-        self.check_expected_results(
-            package, expected_loc, regen=FIXTURES_REGEN)
+        package = maven.get_package(data).to_dict()
+        expected_loc = self.get_test_loc('maven/parsing/loop/jacuzzi-database-0.2.1.pom.package.json')
+        self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_12(self):
         test_loc = self.get_test_loc(
             'maven/parsing/empty/common-object-1.0.2.pom')
         data = open(test_loc).read()
-        package = maven_mapper.get_package(data).to_dict()
-        expected_loc = self.get_test_loc(
-            'maven/parsing/empty/common-object-1.0.2.pom.package.json')
-        self.check_expected_results(
-            package, expected_loc, regen=FIXTURES_REGEN)
+        package = maven.get_package(data).to_dict()
+        expected_loc = self.get_test_loc('maven/parsing/empty/common-object-1.0.2.pom.package.json')
+        self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_13(self):
         test_loc = self.get_test_loc('maven/parsing/empty/osgl-http-1.1.2.pom')
         data = open(test_loc).read()
-        package = maven_mapper.get_package(data).to_dict()
-        expected_loc = self.get_test_loc(
-            'maven/parsing/empty/osgl-http-1.1.2.pom.package.json')
-        self.check_expected_results(
-            package, expected_loc, regen=FIXTURES_REGEN)
+        package = maven.get_package(data).to_dict()
+        expected_loc = self.get_test_loc('maven/parsing/empty/osgl-http-1.1.2.pom.package.json')
+        self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_regex_maven_pom_mapper_1(self):
         regex = re.compile(r'^https?://repo1.maven.org/maven2/.*\.pom$')
@@ -818,8 +778,7 @@ def test_MavenNexusIndexVisitor_then_get_mini_package_from_index_data(self):
         for i, u in enumerate(uris):
             # only get a few records
             if i % 500 == 0:
-                minip = maven_mapper.get_mini_package(
-                    u.data, u.uri, u.package_url)
+                minip = maven.get_mini_package(u.data, u.uri, u.package_url)
                 results.append(minip and minip.to_dict() or minip)
         expected_loc = self.get_test_loc(
             'maven/index/increment2/expected_mini_package.json')
@@ -1009,11 +968,11 @@ def test_merge_ancestors(self, regen=FIXTURES_REGEN):
         self.check_expected_results(
             package.to_dict(), expected_after_loc, regen=regen)

-    @mock.patch("minecode.visitors.maven.get_pom_text")
+    @mock.patch("minecode.miners.maven.get_pom_text")
     def test_get_merged_ancestor_package_from_maven_package(self, get_pom_text_mock, regen=FIXTURES_REGEN):
         get_pom_text_mock.return_value = ""
         ancestor_pom_texts = []
-        with patch("minecode.visitors.maven.get_ancestry") as mock_get_ancestry:
+        with patch("minecode.miners.maven.get_ancestry") as mock_get_ancestry:
             for loc in [
                 self.get_test_loc('maven/pom/apache-18.pom'),
                 self.get_test_loc('maven/pom/pulsar-2.5.1.pom'),
diff --git a/minecode/tests/test_models.py b/minecode/tests/test_models.py
index 7b0864cd..d7a5955d 100644
--- a/minecode/tests/test_models.py
+++ b/minecode/tests/test_models.py
@@ -13,8 +13,7 @@
 from django.test import TestCase
 from django.utils import timezone

-from minecode import visitors
-from minecode import mappers
+from minecode import miners

 from minecode.models import ResourceURI
 from packagedb.models import Package
diff --git a/minecode/tests/test_npm.py b/minecode/tests/test_npm.py
index 5271308a..06313841 100644
--- a/minecode/tests/test_npm.py
+++ b/minecode/tests/test_npm.py
@@ -20,12 +20,12 @@
 from packageurl import PackageURL

 import packagedb
-from minecode import mappers
+from minecode import miners
 from minecode import route
 from minecode.models import ResourceURI
 from minecode.utils_test import JsonBasedTesting
 from minecode.utils_test import mocked_requests_get
-from minecode.visitors import npm
+from minecode.miners import npm
 from minecode.tests import FIXTURES_REGEN


@@ -69,7 +69,7 @@ class TestNPMMapper(JsonBasedTesting):
     def test_build_packages(self):
         with open(self.get_test_loc('npm/0flux.json')) as npm_metadata:
             metadata = json.load(npm_metadata)
-        packages = mappers.npm.build_packages(metadata)
+        packages = miners.npm.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('npm/0flux_npm_expected.json')
         self.check_expected_results(
@@ -78,7 +78,7 @@ def test_build_package2(self):
         with open(self.get_test_loc('npm/2112.json')) as npm_metadata:
             metadata = json.load(npm_metadata)
-        packages = mappers.npm.build_packages(metadata)
+        packages = miners.npm.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('npm/npm_2112_expected.json')
         self.check_expected_results(
@@ -87,7 +87,7 @@ def test_build_package3(self):
         with open(self.get_test_loc('npm/microdata.json')) as npm_metadata:
             metadata = json.load(npm_metadata)
-        packages = mappers.npm.build_packages(metadata)
+        packages = miners.npm.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('npm/microdata-node_expected.json')
         self.check_expected_results(
@@ -103,7 +103,7 @@ def test_build_package_with_visitor_data(self):
         assert len(uris_list) == 1001
         # Randomly pick a record from 0-1000
         metadata = uris_list[29].data
-        packages = mappers.npm.build_packages(json.loads(metadata))
+        packages = miners.npm.build_packages(json.loads(metadata))
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('npm/29_record_expected.json')
         self.check_expected_results(
@@ -111,7 +111,7 @@
         # Randomly pick a record from 0-1000
         metadata = uris_list[554].data
-        packages = mappers.npm.build_packages(json.loads(metadata))
+        packages = miners.npm.build_packages(json.loads(metadata))
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('npm/554_record_expected.json')
         self.check_expected_results(
@@ -128,7 +128,7 @@ def test_build_package_with_ticket_439(self):
         # Pickup the first one, since it's the one which is the problem package "angular2-autosize"
         # The zero element in json is the url for next visitor use, and data is empty and the url is
         metadata = uris_list[1].data
-        packages = mappers.npm.build_packages(json.loads(metadata))
+        packages = miners.npm.build_packages(json.loads(metadata))
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('npm/expected_ticket_439.json')
         self.check_expected_results(
@@ -145,7 +145,7 @@ def test_build_package_verify_ticket_440(self):
         # Pickup the index one instead of zero, since it's the one which is the problem package "npm-research", https://registry.npmjs.org/npm-research,
         # The zero element in json is the url for next visitor use only
         metadata = uris_list[1].data
-        packages = mappers.npm.build_packages(json.loads(metadata))
+        packages = miners.npm.build_packages(json.loads(metadata))
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('npm/expected_ticket_440.json')
         self.check_expected_results(
@@ -154,13 +154,13 @@ def test_npm_mapper(self):
         test_uri = 'https://registry.npmjs.org/angular-compare-validator'
         router = route.Router()
-        router.append(test_uri, mappers.npm.NpmPackageMapper)
+        router.append(test_uri, miners.npm.NpmPackageMapper)
         test_loc = self.get_test_loc('npm/mapper/index.json')
         with open(test_loc, 'rb') as test_file:
             test_data = test_file.read().decode('utf-8')
         test_res_uri = ResourceURI(uri=test_uri, data=test_data)
-        packages = mappers.npm.NpmPackageMapper(test_uri, test_res_uri)
+        packages = miners.npm.NpmPackageMapper(test_uri, test_res_uri)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('npm/mapper/index.expected.json')
         self.check_expected_results(
@@ -169,7 +169,7 @@ def test_build_package_for_jsonp_filter(self):
         with open(self.get_test_loc('npm/jsonp-filter.json')) as npm_metadata:
             metadata = json.load(npm_metadata)
-        packages = mappers.npm.build_packages(metadata)
+        packages = miners.npm.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('npm/jsonp-filter-expected.json')
         self.check_expected_results(
diff --git a/minecode/tests/test_nuget.py b/minecode/tests/test_nuget.py
index 875a086f..92e5ccb9 100644
--- a/minecode/tests/test_nuget.py
+++ b/minecode/tests/test_nuget.py
@@ -17,8 +17,8 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode import mappers
-from minecode.visitors import nuget
+from minecode import miners
+from minecode.miners import nuget
 from minecode.tests import FIXTURES_REGEN


@@ -80,7 +80,7 @@ class TestNugetMap(JsonBasedTesting):
     def test_build_packages(self):
         with open(self.get_test_loc('nuget/entityframework2.json')) as nuget_metadata:
             metadata = json.load(nuget_metadata)
-        packages = mappers.nuget.build_packages_with_json(metadata)
+        packages = miners.nuget.build_packages_with_json(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('nuget/nuget_mapper_expected.json')
         self.check_expected_results(
@@ -104,7 +104,7 @@ def test_build_packages_from_html(self):
         with patch('requests.get') as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _errors = nuget.NugetHTMLPackageVisitor(uri)
-        packages = mappers.nuget.build_packages_from_html(data, uri,)
+        packages = miners.nuget.build_packages_from_html(data, uri,)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'nuget/nuget_mapper_log4net_expected.json')
diff --git a/minecode/tests/test_openssl.py b/minecode/tests/test_openssl.py
index 7230557e..7835aa07 100644
--- a/minecode/tests/test_openssl.py
+++ b/minecode/tests/test_openssl.py
@@ -18,9 +18,9 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode.mappers.openssl import build_packages
+from minecode.miners.openssl import build_packages
 from minecode.models import ResourceURI
-from minecode.visitors import openssl
+from minecode.miners import openssl
 from minecode.tests import FIXTURES_REGEN

diff --git a/minecode/tests/test_openwrt.py b/minecode/tests/test_openwrt.py
index 06d957cc..d10a7113 100644
--- a/minecode/tests/test_openwrt.py
+++ b/minecode/tests/test_openwrt.py
@@ -16,8 +16,8 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode import mappers
-from minecode.visitors import openwrt
+from minecode import miners
+from minecode.miners import openwrt
 from minecode.tests import FIXTURES_REGEN


@@ -89,7 +89,7 @@ class OpenWRTMapperTest(JsonBasedTesting):
     def test_build_packages_1(self):
         with open(self.get_test_loc('openwrt/6to4_12-2_all_ipk_expected')) as openwrt_ipk_meta:
             metadata = json.load(openwrt_ipk_meta)
-        packages = mappers.openwrt.build_packages(metadata)
+        packages = miners.openwrt.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'openwrt/6to4_12-2_all_ipk_expected_mapper.json')
@@ -99,7 +99,7 @@ def test_build_packages_2(self):
         with open(self.get_test_loc('openwrt/wpa-cli_0.5.7-1_mipsel.ipk_expected')) as openwrt_ipk_meta:
             metadata = json.load(openwrt_ipk_meta)
-        packages = mappers.openwrt.build_packages(metadata)
+        packages = miners.openwrt.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'openwrt/wpa-cli_0.5.7-1_mipsel.ipk_expected_mapper.json')
diff --git a/minecode/tests/test_packagist.py b/minecode/tests/test_packagist.py
index 4303e171..beb71b49 100644
--- a/minecode/tests/test_packagist.py
+++ b/minecode/tests/test_packagist.py
@@ -16,8 +16,8 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode import mappers
-from minecode.visitors import packagist
+from minecode import miners
+from minecode.miners import packagist
 from minecode.tests import FIXTURES_REGEN


@@ -40,7 +40,7 @@ class TestPackagistMap(JsonBasedTesting):
     def test_build_packages(self):
         with open(self.get_test_loc('packagist/00f100_cakephp-opauth.json')) as packagist_package:
             metadata = json.load(packagist_package)
-        packages = mappers.packagist.build_packages_with_json(metadata)
+        packages = miners.packagist.build_packages_with_json(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'packagist/packaglist_00f100_cakephp-opauth_expected.json')
diff --git a/minecode/tests/test_pypi.py b/minecode/tests/test_pypi.py
index 0ed802b4..b1564093 100644
--- a/minecode/tests/test_pypi.py
+++ b/minecode/tests/test_pypi.py
@@ -22,9 +22,8 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode import mappers
-from minecode import visitors
-from minecode.visitors import URI
+from minecode import miners
+from minecode.miners import URI
 from minecode.models import ResourceURI
 from minecode.route import Router
 from minecode.tests import FIXTURES_REGEN
@@ -65,7 +65,7 @@ def test_PypiIndexVisitor(self, mock_serverproxyclass):
         instance = mock_serverproxyclass.return_value
         instance.list_packages.return_value = iter(package_list)
         uri = 'https://pypi.python.org/pypi/'
-        uris, _data, _error = visitors.pypi.PypiIndexVisitor(uri)
+        uris, _data, _error = miners.pypi.PypiIndexVisitor(uri)
         self.assertIsNone(_data)

         expected_loc = self.get_test_loc('pypi/pypiindexvisitor-expected.json')
@@ -76,7 +76,7 @@ def test_PypiPackageVisitor(self):
         test_loc = self.get_test_loc('pypi/cage.json')
         with patch('requests.get') as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            uris, _data, _error = visitors.pypi.PypiPackageVisitor(uri)
+            uris, _data, _error = miners.pypi.PypiPackageVisitor(uri)

         expected_loc = self.get_test_loc('pypi/expected_uris-cage.json')
         self.check_expected_uris(uris, expected_loc)
@@ -86,7 +86,7 @@ def test_PypiPackageVisitor_2(self):
         test_loc = self.get_test_loc('pypi/boolean.py.json')
         with patch('requests.get') as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            uris, _data, _errors = visitors.pypi.PypiPackageVisitor(uri)
+            uris, _data, _errors = miners.pypi.PypiPackageVisitor(uri)

         expected_loc = self.get_test_loc('pypi/expected_uris-boolean.py.json')
         self.check_expected_uris(uris, expected_loc)
@@ -96,7 +96,7 @@ def test_PypiPackageReleaseVisitor_cage12(self):
         test_loc = self.get_test_loc('pypi/cage_1.1.2.json')
         with patch('requests.get') as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            uris, data, _error = visitors.pypi.PypiPackageReleaseVisitor(uri)
+            uris, data, _error = miners.pypi.PypiPackageReleaseVisitor(uri)

         expected_loc = self.get_test_loc('pypi/expected_uris-cage_1.1.2.json')
         self.check_expected_uris(uris, expected_loc)
@@ -109,7 +109,7 @@ def test_PypiPackageReleaseVisitor_cage13(self):
         test_loc = self.get_test_loc('pypi/cage_1.1.3.json')
         with patch('requests.get') as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            uris, data, _error = visitors.pypi.PypiPackageReleaseVisitor(uri)
+            uris, data, _error = miners.pypi.PypiPackageReleaseVisitor(uri)

         expected_loc = self.get_test_loc('pypi/expected_uris-cage_1.1.3.json')
         self.check_expected_uris(uris, expected_loc)
@@ -122,7 +122,7 @@ def test_PypiPackageReleaseVisitor_boolean(self):
         test_loc = self.get_test_loc('pypi/boolean.py-2.0.dev3.json')
         with patch('requests.get') as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            uris, data, _error = visitors.pypi.PypiPackageReleaseVisitor(uri)
+            uris, data, _error = miners.pypi.PypiPackageReleaseVisitor(uri)

         expected_loc = self.get_test_loc(
             'pypi/expected_uris-boolean.py-2.0.dev3.json')
@@ -148,7 +148,7 @@ class TestPypiMap(JsonBasedTesting, DjangoTestCase):
     def test_build_packages_lxml(self):
         with open(self.get_test_loc('pypi/lxml-3.2.0.json')) as pypi_meta:
             metadata = json.load(pypi_meta)
-        packages = mappers.pypi.build_packages(metadata)
+        packages = miners.pypi.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('pypi/expected-lxml-3.2.0.json')
         self.check_expected_results(
@@ -157,7 +157,7 @@ def test_build_packages_boolean(self):
         with open(self.get_test_loc('pypi/boolean.py-2.0.dev3.json')) as pypi_meta:
             metadata = json.load(pypi_meta)
-        packages = mappers.pypi.build_packages(metadata)
+        packages = miners.pypi.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'pypi/expected-boolean.py-2.0.dev3.json')
@@ -167,7 +167,7 @@ def test_build_packages_cage13(self):
         with open(self.get_test_loc('pypi/cage_1.1.3.json')) as pypi_meta:
             metadata = json.load(pypi_meta)
-        packages = mappers.pypi.build_packages(metadata)
+        packages = miners.pypi.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('pypi/expected-CAGE-1.1.3.json')
         self.check_expected_results(
@@ -176,7 +176,7 @@ def test_build_packages_cage12(self):
         with open(self.get_test_loc('pypi/cage_1.1.2.json')) as pypi_meta:
             metadata = json.load(pypi_meta)
-        packages = mappers.pypi.build_packages(metadata)
+        packages = miners.pypi.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('pypi/expected-CAGE-1.1.2.json')
         self.check_expected_results(
@@ -186,7 +186,7 @@ def test_PypiPackageMapper_cage(self):
         data = open(self.get_test_loc('pypi/cage_1.1.2.json')).read()
         uri = 'https://pypi.python.org/pypi/CAGE/1.1.2/json'
         resuri = MockResourceURI(uri, data)
-        packages = mappers.pypi.PypiPackageMapper(uri, resuri)
+        packages = miners.pypi.PypiPackageMapper(uri, resuri)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('pypi/expected-CAGE-1.1.2.json')
         self.check_expected_results(
@@ -196,7 +196,7 @@ def test_PypiPackageMapper_lxml(self):
         data = open(self.get_test_loc('pypi/lxml-3.2.0.json')).read()
         uri = 'https://pypi.python.org/pypi/lxml/3.2.0/json'
         resuri = MockResourceURI(uri, data)
-        packages = mappers.pypi.PypiPackageMapper(uri, resuri)
+        packages = miners.pypi.PypiPackageMapper(uri, resuri)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('pypi/expected-lxml-3.2.0.json')
         self.check_expected_results(
@@ -209,7 +209,7 @@ def test_pypi_map(self):
         resuri.save()

         # sanity check
-        packages = mappers.pypi.PypiPackageMapper(resuri.uri, resuri)
+        packages = miners.pypi.PypiPackageMapper(resuri.uri, resuri)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc('pypi/map/expected-3to2-1.1.1.json')
         self.check_expected_results(
@@ -217,8 +217,7 @@
         # build a mock router
         router = Router()
-        router.append('https://pypi.python.org/pypi/3to2/1.1.1/json',
-                      mappers.pypi.PypiPackageMapper)
+        router.append('https://pypi.python.org/pypi/3to2/1.1.1/json', miners.pypi.PypiPackageMapper)

         # sanity check
         expected_mapped_package_uri = 'https://pypi.python.org/packages/8f/ab/58a363eca982c40e9ee5a7ca439e8ffc5243dde2ae660ba1ffdd4868026b/3to2-1.1.1.zip'
diff --git a/minecode/tests/test_repodata.py b/minecode/tests/test_repodata.py
index e91aa009..ab79eeed 100644
--- a/minecode/tests/test_repodata.py
+++ b/minecode/tests/test_repodata.py
@@ -11,7 +11,7 @@
 from commoncode.testcase import FileBasedTesting

-from minecode.visitors import repodata
+from minecode.miners import repodata


 class TestRepoData(FileBasedTesting):
diff --git a/minecode/tests/test_repodata_rpms.py b/minecode/tests/test_repodata_rpms.py
index 8ea5f8d8..623f538d 100644
--- a/minecode/tests/test_repodata_rpms.py
+++ b/minecode/tests/test_repodata_rpms.py
@@ -16,7 +16,7 @@
 import os

 from minecode.utils_test import MiningTestCase
-from minecode.visitors import repodata_rpms
+from minecode.miners import repodata_rpms


 class RepodataRPMVisitorsTest(MiningTestCase):
diff --git a/minecode/tests/test_repomd_parser.py b/minecode/tests/test_repomd_parser.py
index 0e42d574..8bffa285 100644
--- a/minecode/tests/test_repomd_parser.py
+++ b/minecode/tests/test_repomd_parser.py
@@ -17,11 +17,11 @@
 from minecode.utils_test import mocked_requests_get_for_uris
 from minecode.utils_test import JsonBasedTesting

-from minecode.visitors import URI
-from minecode.visitors.repodata import combine_list_of_dicts
-from minecode.visitors.repodata import combine_dicts_using_pkgid
-from minecode.visitors.repomd_parser import generate_rpm_objects
-from minecode.visitors.repomd_parser import collect_rpm_packages_from_repomd
+from minecode.miners import URI
+from minecode.miners.repodata import combine_list_of_dicts
+from minecode.miners.repodata import combine_dicts_using_pkgid
+from minecode.miners.repomd import generate_rpm_objects
+from minecode.miners.repomd import collect_rpm_packages_from_repomd
 from minecode.tests import FIXTURES_REGEN

 # TODO: add redhat repo test!
diff --git a/minecode/tests/test_rubygems.py b/minecode/tests/test_rubygems.py
index 2126d096..d5165a6f 100644
--- a/minecode/tests/test_rubygems.py
+++ b/minecode/tests/test_rubygems.py
@@ -23,19 +23,19 @@
 from minecode.utils_test import JsonBasedTesting
 from minecode.utils_test import model_to_dict

-from minecode import mappers
+from minecode import miners
 from minecode import route
 from minecode.models import ResourceURI
 from minecode import visit_router

-from minecode.mappers.rubygems import build_rubygem_packages_from_api_data
-from minecode.mappers.rubygems import build_rubygem_packages_from_metadata
-from minecode.mappers.rubygems import RubyGemsApiVersionsJsonMapper
-from minecode.mappers.rubygems import RubyGemsPackageArchiveMetadataMapper
-
-from minecode.visitors.rubygems import get_gem_metadata
-from minecode.visitors.rubygems import RubyGemsApiManyVersionsVisitor
-from minecode.visitors.rubygems import RubyGemsIndexVisitor
-from minecode.visitors.rubygems import RubyGemsPackageArchiveMetadataVisitor
+from minecode.miners.rubygems import build_rubygem_packages_from_api_data
+from minecode.miners.rubygems import build_rubygem_packages_from_metadata
+from minecode.miners.rubygems import RubyGemsApiVersionsJsonMapper
+from minecode.miners.rubygems import RubyGemsPackageArchiveMetadataMapper
+
+from minecode.miners.rubygems import get_gem_metadata
+from minecode.miners.rubygems import RubyGemsApiManyVersionsVisitor
+from minecode.miners.rubygems import RubyGemsIndexVisitor
+from minecode.miners.rubygems import RubyGemsPackageArchiveMetadataVisitor
 from minecode.tests import FIXTURES_REGEN
@@ -138,8 +138,7 @@ def test_build_rubygem_packages_from_api_data_2(self):
     def test_build_rubygem_packages_from_api_data_3(self):
         with open(self.get_test_loc('rubygems/apiv1/a1630ty_a1630ty.api.json')) as api:
             apidata = json.load(api)
-        packages = mappers.rubygems.build_rubygem_packages_from_api_data(
-            apidata, 'a1630ty_a1630ty')
+        packages = miners.rubygems.build_rubygem_packages_from_api_data(apidata, 'a1630ty_a1630ty')
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'rubygems/apiv1/a1630ty_a1630ty.api.package.json')
@@ -149,8 +148,7 @@ def test_build_rubygem_packages_from_api_data_3(self):
     def test_build_rubygem_packages_from_api_data_with_deps(self):
         with open(self.get_test_loc('rubygems/apiv1/action_tracker.api.json')) as api:
             apidata = json.load(api)
-        packages = mappers.rubygems.build_rubygem_packages_from_api_data(
-            apidata, 'action_tracker')
+        packages = miners.rubygems.build_rubygem_packages_from_api_data(apidata, 'action_tracker')
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'rubygems/apiv1/action_tracker.api.package.json')
diff --git a/minecode/tests/test_run_visit.py b/minecode/tests/test_run_visit.py
index 08f0e480..d3364692 100644
--- a/minecode/tests/test_run_visit.py
+++ b/minecode/tests/test_run_visit.py
@@ -19,7 +19,7 @@
 from minecode.management.commands.run_visit import visit_uri
 from minecode.models import ResourceURI
 from minecode.route import Router
-from minecode.visitors import URI
+from minecode.miners import URI


 class RunVisitWithCounterTest(MiningTestCase):
diff --git a/minecode/tests/test_seed.py b/minecode/tests/test_seed.py
index 017cb477..61634ce7 100644
--- a/minecode/tests/test_seed.py
+++ b/minecode/tests/test_seed.py
@@ -248,6 +248,6 @@ def test_get_active_seeders(self):
     def test_get_configured_seeders(self):
         seeders = seed.get_configured_seeders()
         expected = [
-            'minecode.visitors.maven.MavenSeed',
+            'minecode.miners.maven.MavenSeed',
         ]
         assert sorted(expected) == sorted(seeders)
diff --git a/minecode/tests/test_sourceforge.py b/minecode/tests/test_sourceforge.py
index 0397b96f..4e08ef47 100644
--- a/minecode/tests/test_sourceforge.py
+++ b/minecode/tests/test_sourceforge.py
@@ -15,8 +15,8 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting

-from minecode import mappers
-from minecode.visitors import sourceforge
+from minecode import miners
+from minecode.miners import sourceforge
 from minecode.tests import FIXTURES_REGEN


@@ -78,7 +78,7 @@ class SourceforgeMappersTest(JsonBasedTesting):
     def test_build_packages(self):
         with open(self.get_test_loc('sourceforge/odanur.json')) as sourceforge_metadata:
             metadata = json.load(sourceforge_metadata)
-        packages = mappers.sourceforge.build_packages_from_metafile(metadata)
+        packages = miners.sourceforge.build_packages_from_metafile(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'sourceforge/mapper_odanur_expected.json')
@@ -88,7 +88,7 @@ def test_build_packages2(self):
         with open(self.get_test_loc('sourceforge/openstunts.json')) as sourceforge_metadata:
             metadata = json.load(sourceforge_metadata)
-        packages = mappers.sourceforge.build_packages_from_metafile(metadata)
+        packages = miners.sourceforge.build_packages_from_metafile(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'sourceforge/mapper_openstunts_expected.json')
@@ -98,7 +98,7 @@ def test_build_packages3(self):
         with open(self.get_test_loc('sourceforge/monoql.json')) as sourceforge_metadata:
             metadata = json.load(sourceforge_metadata)
-        packages = mappers.sourceforge.build_packages_from_metafile(metadata)
+        packages = miners.sourceforge.build_packages_from_metafile(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'sourceforge/mapper_omonoql_expected.json')
@@ -108,7 +108,7 @@ def test_build_packages4(self):
         with open(self.get_test_loc('sourceforge/niftyphp.json')) as sourceforge_metadata:
             metadata = json.load(sourceforge_metadata)
-        packages = mappers.sourceforge.build_packages_from_metafile(metadata)
+        packages = miners.sourceforge.build_packages_from_metafile(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
             'sourceforge/mapper_niftyphp_expected.json')
diff --git a/minecode/visitors/bower.py b/minecode/visitors/bower.py
deleted file mode 100644
index e9f605a9..00000000
--- a/minecode/visitors/bower.py
+++ /dev/null
@@ -1,74 +0,0 @@
-#
-# Copyright (c) 2017 by nexB, Inc. http://www.nexb.com/ - All rights reserved.
-#
-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
-from packageurl import PackageURL
-
-from minecode import seed
-from minecode import visit_router
-from minecode.visitors import HttpJsonVisitor
-from minecode.visitors import URI
-
-
-class BowerSeed(seed.Seeder):
-
-    def get_seeds(self):
-        yield 'https://registry.bower.io/packages'
-
-
-@visit_router.route('https://registry.bower.io/packages')
-class BowerTopJsonVisitor(HttpJsonVisitor):
-    """
-    Collect URIs for all packages from the json returned.
-    """
-
-    def get_uris(self, content):
-        """
-        The json content is a list with name and url, like the following format:
-        ...
-        {
-            "name": "bello",
-            "url": "https://github.com/QiaoBuTang/bello.git"
-        },
-        {
-            "name": "bello-gfw",
-            "url": "https://gitcafe.com/GilbertSun/bello.git"
-        },
-        ...
-        The url could be in the following formats like github, loglg, gitcafe, bitbuckets etc.
-        # FIXME: We should cover all urls beyond the above four categories.
-        """
-        github_base_url = 'https://raw.githubusercontent.com/{owner}/{name}/master/bower.json'
-        lolg_base_url = 'https://lolg.it/{owner}/{name}/raw/master/bower.json'
-        gitcafe_base_url = 'https://coding.net/u/{owner}/p/{name}/git/raw/master/bower.json'
-        bitbucket_base_url = 'https://bitbucket.org/{owner}/{name}/raw/master/bower.json'
-        base_url_map = {
-            'https://github.com/': github_base_url,
-            'https://lolg.it/': lolg_base_url,
-            'https://gitcafe.com/': gitcafe_base_url,
-            'https://bitbucket.org/': bitbucket_base_url
-        }
-        for entry in content:
-            name = entry.get('name')
-            url = entry.get('url')
-            if name in url:
-                owner = None
-                package_url = PackageURL(type='bower', name=name).to_string()
-                for host_name, base_url in base_url_map.iteritems():
-                    if url.startswith(host_name):
-                        owner = url[len(host_name): url.index(name) - 1]
-                yield URI(uri=base_url.format(owner=owner, name=name), package_url=package_url, source_uri=self.uri)
-
-
-@visit_router.route('https://raw.githubusercontent.com/.*/master/bower.json',
-                    'https://lolg.it/.*/master/bower.json',
-                    'https://coding.net/.*/master/bower.json',
-                    'https://bitbucket.org/*/master/bower.json')
-class BowerJsonVisitor(HttpJsonVisitor):
-    """
-    Collect content of the json itself by the visitor.
-    """
-    pass
diff --git a/minecode/visitors/cpan.py b/minecode/visitors/cpan.py
deleted file mode 100644
index 107771d2..00000000
--- a/minecode/visitors/cpan.py
+++ /dev/null
@@ -1,191 +0,0 @@
-#
-# Copyright (c) by nexB, Inc. http://www.nexb.com/ - All rights reserved.
-#
-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
-import json
-
-from bs4 import BeautifulSoup
-from packageurl import PackageURL
-
-from minecode import seed
-from minecode import visit_router
-from minecode.visitors import HttpJsonVisitor
-from minecode.visitors import HttpVisitor
-from minecode.visitors import URI
-
-
-class CpanSeed(seed.Seeder):
-
-    def get_seeds(self):
-        yield 'http://www.cpan.org/modules/01modules.index.html'
-        author_search_template = 'https://fastapi.metacpan.org/author/_search?q=email:{char}*&size=5000'
-        for char in 'abcdefghijklmnopqrstuvwxyz'.split():
-            yield author_search_template.format(char)
-
-# The idea of CPAN API visitor is based on
-# https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md
-#
-# From the doc: You can certainly scroll if you are fetching less than 5,000
-# items. You might want to do this if you are expecting a large data set, but
-# will still need to run many requests to get all of the required data.
-#
-# To get all results for sure it's over 5000, we should use search twice based
-# on author and release.
-#
-# First get all authors by searching email from a-z, then get all releases based
-# on each author. It will make the returned result a small set.
-
-# For example:
-
-# First try to reach the author search, the following search URL will get all
-# authors whose email starts with 'a', this will loop from 'a' to 'z.
-
-# https://fastapi.metacpan.org/author/_search?q=email:a*&size=5000
-
-# If we get the Author ID in above returned json, we can pass to release search
-# URL as follows, it will get all releases from the passing author.
-
-# https://fastapi.metacpan.org/release/_search?q=author:ABERNDT&size=5000
-
-
-@visit_router.route('https://fastapi.metacpan.org/author/_search\?q=email:[a-z]\*&size=5000')
-class MetaCpanAuthorURLVisitors(HttpJsonVisitor):
-    """
-    Run search on author's email, and parse the returned json content and form
-    the MetaCpanRleaseURLVisitors' URL by adding AUTHOR condition. For example:
-    https://fastapi.metacpan.org/author/_search?q=email:a*&size=5000 a* stands
-    for all email which starts with 'a', and it's the same with 'A' as email is
-    case insensitive. The visitor will cover all cases from a to z, and yield
-    the search URLs by passing each author in the release searching URL
-    """
-
-    def get_uris(self, content):
-        release_visitor_template = 'https://fastapi.metacpan.org/release/_search?q=author:{id}&size=5000'
-        hits = content.get('hits', {})
-        inner_hits = hits.get('hits', [])
-        for hit in inner_hits:
-            _id = hit.get('_id')
-            if not _id:
-                continue
-            yield URI(uri=release_visitor_template.format(id=_id), source_uri=self.uri)
-
-
-@visit_router.route('https://fastapi.metacpan.org/release/_search\?q=author:\w+&size=5000')
-class MetaCpanRleaseURLVisitors(HttpJsonVisitor):
-    """
-    Run the release results by searching the passing AUTHOR ID. The visitor will
-    yield the json whose author ID is the passing author info. The
-    implementation if the class is empty, it just returns for mapper use of the
-    json content.
-    """
-    pass
-
-
-@visit_router.route('http://www.cpan.org/modules/01modules.index.html')
-class CpanModulesVisitors(HttpVisitor):
-    """
-    Return URIs by parsing the HTML page of cpan modules page.
- """ - def get_uris(self, content): - """ - Return the uris of authors pages, the returning URIs will be an input of - CpanProjectHTMLVisitors - """ - page = BeautifulSoup(content, 'lxml') - url_template = 'http://www.cpan.org/{path}' - for a in page.find_all(name='a'): - if 'href' not in a.attrs: - continue - - url = a['href'] - if not url: - continue - - if url.startswith('../authors'): - if url.endswith(('.zip', '.tar.gz')): - # Skip tar.gz since it will be captured by the CpanProjectHTMLVisitors - continue - else: - url = url_template.format(path=url[3:]) - yield URI(uri=url, source_uri=self.uri) - - -@visit_router.route('http://www.cpan.org/authors/.*/') -class CpanProjectHTMLVisitors(HttpVisitor): - """ - Visit the HTML page of cpan project page and return the Packages info, HTML - data and error. - """ - def get_uris(self, content): - """ - Return the uris by looking for the tar.gz in the html, and then forming - the uri for meta and readme files - """ - page = BeautifulSoup(content, 'lxml') - if self.uri.endswith('/'): - url_template = self.uri + '{path}' - else: - url_template = self.uri + '/{path}' - for a in page.find_all(name='a'): - if 'href' not in a.attrs: - continue - - url = a['href'] - if not url: - continue - - if url.startswith(('/', '?')): - continue # Avoid the directory and other non-file links - else: - name = url - name = name.replace('tar.gz', ''). replace('.readme', '').replace('.meta', '') - partions = name.rpartition('-') - name = partions[0] - version = partions[-1] - package_url = None - if name and version: - package_url = PackageURL(type='cpan', name=name, version=version).to_string() - url = url_template.format(path=url) - yield URI(uri=url, package_url=package_url, source_uri=self.uri) - - -@visit_router.route('http://www.cpan.org/.*.meta') -class CpanMetaVisitors(HttpVisitor): - """ - Visit the meta file and return the meta data of the Package The goal - of this visitor is to get the content instead of returning any valid - uris. - """ - pass - - -@visit_router.route('http://www.cpan.org/.*.readme') -class CpanReadmeVisitors(HttpVisitor): - """ - Visit the readme file and translate to json and dump it and return for mapper use. - """ - - def dumps(self, content): - """ - Return the json by parsing the readme content - """ - # Handle bytes properly in python3 - if type(content) == bytes: - content = content.decode('utf-8') - - lines = content.splitlines() - readme_dict = dict() - body = [] - head = None - for line in lines: - if len(line) > 1 and line.isupper() and line[0] != ' ': - if head: - readme_dict[head] = '\n'.join(body).lstrip('\n').rstrip('\n') - head = line - body = [] - else: - body.append(line.strip()) - return json.dumps(readme_dict) diff --git a/minecode/visitors/cran.py b/minecode/visitors/cran.py deleted file mode 100644 index db91c100..00000000 --- a/minecode/visitors/cran.py +++ /dev/null @@ -1,44 +0,0 @@ -# -# Copyright (c) 2017 by nexB, Inc. http://www.nexb.com/ - All rights reserved. 
-#
-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
-
-from bs4 import BeautifulSoup
-
-from packageurl import PackageURL
-
-from minecode import seed
-from minecode import visit_router
-from minecode.visitors import HttpVisitor
-from minecode.visitors import URI
-
-
-class CranSeed(seed.Seeder):
-
-    def get_seeds(self):
-        yield 'https://cloud.r-project.org/web/packages/available_packages_by_date.html'
-
-
-@visit_router.route('https://cloud.r-project.org/web/packages/available_packages_by_date.html')
-class CranPackagesVisitors(HttpVisitor):
-    """
-    Return URIs by parsing the HTML content of the page.
-    """
-    def get_uris(self, content):
-        base_url = 'https://cloud.r-project.org/web/packages/{package}/index.html'
-        a_blocks = BeautifulSoup(content, 'lxml').find_all('a')
-        for a in a_blocks:
-            package = a.text
-            package_url = PackageURL(type='cran', name=package).to_string()
-            yield URI(uri=base_url.format(package=package), package_url=package_url, source_uri=self.uri)
-
-
-@visit_router.route('https://cloud.r-project.org/web/packages/[\w\-\.]+/index.html')
-class CranSinglePackageVisitor(HttpVisitor):
-    """
-    Return only the HTML content of the page; it is parsed in the mapper.
-    """
-    pass
diff --git a/minecode/visitors/eclipse.py b/minecode/visitors/eclipse.py
deleted file mode 100644
index 13fdb8ab..00000000
--- a/minecode/visitors/eclipse.py
+++ /dev/null
@@ -1,158 +0,0 @@
-#
-# Copyright (c) 2016 by nexB, Inc. http://www.nexb.com/ - All rights reserved.
-#
-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
-from bs4 import BeautifulSoup
-
-from commoncode import fileutils
-from packageurl import PackageURL
-
-from minecode import seed
-from minecode import visit_router
-from minecode.visitors import HttpJsonVisitor
-from minecode.visitors import HttpVisitor
-from minecode.visitors import URI
-
-
-class EclipseSeed(seed.Seeder):
-
-    def get_seeds(self):
-        yield 'http://projects.eclipse.org/json/projects/all'
-
-
-@visit_router.route('https://projects.eclipse.org/list-of-projects')
-class EclipseProjectVisitors(HttpVisitor):
-    """
-    Visit the HTML page listing Eclipse projects and return the Packages info,
-    json data and error.
-    """
-
-    def get_uris(self, content):
-        page = BeautifulSoup(content, 'lxml')
-        for a in page.find_all(name='a'):
-            if 'href' not in a.attrs:
-                continue
-            href = a['href']
-            if href and href.startswith('https://projects.eclipse.org/projects/'):
-                # if the href starts with the Eclipse single-project prefix,
-                # generate a URI from the href content
-                project_name = href.replace('https://projects.eclipse.org/projects/', '')
-                package_url = PackageURL(type='eclipse', name=project_name).to_string()
-                yield URI(uri=href, package_url=package_url, source_uri=self.uri)
-
-
-@visit_router.route('https://projects.eclipse.org/projects/.*')
-class EclipseSingleProjectVisitor(HttpVisitor):
-    """
-    Visit the HTML page of a single eclipse project.
-    This only collects the HTML page as metadata: the URI of each single
-    project is already collected by EclipseProjectVisitors from
-    https://projects.eclipse.org/list-of-projects, so no new URI is returned;
-    the goal is to return the HTML page itself.
-
-    For example: https://projects.eclipse.org/projects/modeling.m2t.accele
-    """
-    pass
-
-
-@visit_router.route('http://git.eclipse.org/c')
-class EclipseGitVisitor(HttpVisitor):
-    """
-    Visit the Eclipse Git HTML page and return the URIs found in it.
-    """
-
-    def get_uris(self, content):
-        page = BeautifulSoup(content, 'lxml')
-        for td in page.find_all(name='td'):
-            if 'class' not in td.attrs:
-                continue
-            if td.attrs.get('class') != ['sublevel-repo']:
-                continue
-
-            for a in td.findChildren(name='a'):
-                href = a['href']
-                name = a.contents[0]
-                package_url = PackageURL(type='eclipse', name=name).to_string()
-                yield URI(uri=href, package_url=package_url, source_uri=self.uri)
-
-
-@visit_router.route('http://www.eclipse.org/downloads/packages/all')
-class EclipsePackagesVisitor(HttpVisitor):
-    """
-    Visit the Eclipse packages HTML page and return URIs parsed from the HTML page.
-    """
-
-    def fetch(self, uri, timeout=40):
-        """
-        Fetch and return the content found at a remote uri with a larger timeout.
-        """
-        return HttpVisitor.fetch(self, uri, timeout=timeout)
-
-    def get_uris(self, content):
-        page = BeautifulSoup(content, 'lxml')
-        for td in page.find_all(name='span'):
-            if 'class' not in td.attrs:
-                continue
-            if td.attrs.get('class') != ['field-content']:
-                continue
-
-            a = td.find(name='a')
-            href = a['href']
-            name = a.contents[0]
-            # Skip nodes whose content is an HTML tag rather than a plain string
-            if name and isinstance(name, str):
-                package_url = PackageURL(type='eclipse', name=name).to_string()
-                yield URI(uri=href, package_url=package_url, source_uri=self.uri)
-
-
-@visit_router.route('http://www.eclipse.org/downloads/packages/release/.*')
-class EclipseReleaseVisitor(HttpVisitor):
-    """
-    Visit the Eclipse release HTML page and return expected Package URIs.
-    """
-
-    def get_uris(self, content):
-        page = BeautifulSoup(content, 'lxml')
-        suffix_list = ['-win32.zip', '-win64.exe', '-win32-x86_64.zip',
-                       '-linux-gtk-x86_64.tar.gz', '-macosx-cocoa-x86_64.tar.gz',
-                       '-linux-gtk.tar.gz', '-x86_64.tar.gz']
-        for div in page.find_all(name='div'):
-            for a in div.find_all(name='a'):
-                url = a.get('href')
-                if url and 'download.php?file=' in url:
-                    file_name = fileutils.file_name(url)
-                    name = file_name
-                    for suffix in suffix_list:
-                        name = name.replace(suffix, '')
-                    package_url = PackageURL(type='eclipse', name=name).to_string()
-                    yield URI(uri=url, file_name=file_name, package_url=package_url, source_uri=self.uri)
-
-
-@visit_router.route('http://projects.eclipse.org/json/projects/all')
-class EclipseProjectsJsonVisitor(HttpJsonVisitor):
-    """
-    Visit the Eclipse json API and return the expected project-specific URIs.
-    """
-
-    def fetch(self, uri, timeout=40):
-        """
-        Fetch and return the content found at a remote uri with a larger timeout.
-        """
-        return HttpJsonVisitor.fetch(self, uri, timeout=timeout)
-
-    def get_uris(self, content):
-        url_template = 'http://projects.eclipse.org/json/project/{name}'
-        projects = content.get('projects', {})
-        for project in projects:
-            # TODO: are we sure there is not more data available in this JSON?
-            package_url = PackageURL(type='eclipse', name=project).to_string()
-            yield URI(uri=url_template.format(name=project), package_url=package_url, source_uri=self.uri)
-
-
-@visit_router.route('http://projects.eclipse.org/json/project/.*')
-class EclipseSingleProjectJsonVisitor(HttpJsonVisitor):
-    """
-    Visit the json of a single Eclipse project. This returns the json itself
-    without any URIs, as the URI itself is returned by
-    EclipseProjectsJsonVisitor.
-    """
-    pass
diff --git a/minecode/visitors/fdroid.py b/minecode/visitors/fdroid.py
deleted file mode 100644
index 81384d80..00000000
--- a/minecode/visitors/fdroid.py
+++ /dev/null
@@ -1,94 +0,0 @@
-#
-# Copyright (c) nexB Inc.
and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import codecs -import json -import xmlrpc - -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.utils import get_temp_file -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import URI -from minecode.visitors import Visitor -from minecode.visitors import NonPersistentHttpVisitor - -""" -Visitors for F-Droid package repositories. - -NOTE: the license of F-Droid package data needs to be clarified. -See https://gitlab.com/fdroid/fdroiddata/-/issues/2826 for details - -F-Droid packages come with a main JSON index and possible increment/diffs. -- https://f-droid.org/repo/index-v2.json - -- this is a legacy XMl index https://f-droid.org/repo/index.xml - -- This top level file lists index and diffs https://f-droid.org/repo/entry.json - -- This is a diff example: https://f-droid.org/repo/diff/1666980277000.json - -- Each apk is available from a URL using this form: - - https://f-droid.org/repo/app.seeneva.reader_3.apk - https://f-droid.org/repo/{application_id}_{version_code}.apk - -The {application_id}_{version_code}.apk "file name" for each tarball and -apk file name is listed in the index. -""" - - -class FdroidSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://f-droid.org/repo/index-v2.json' - - -def build_purl(package_id, version_code, filename): - """ - Return a PackageURL for an F-Droid package. - """ - return PackageURL( - type='fdroid', - name=package_id, - version=version_code, - qualifiers=dict(filename=filename) - ) - - -@visit_router.route('https://f-droid.org/repo/index-v2.json') -class FdroidIndexVisitor(NonPersistentHttpVisitor): - """ - Collect package metadata URIs from the F-Droid index for each package. - We treat each apk and corresponding source tarball as a different package. - """ - - def get_uris(self, content): - """ - Yield a URI for each F-Droid package. - """ - json_location = content - with open(json_location) as c: - content = json.loads(c.read()) - - packages = content['packages'] - - for package_id, package_data in packages.items(): - purl = PackageURL(type='fdroid', name=package_id).to_string() - yield URI( - uri=purl, - package_url=purl, - source_uri=self.uri, - data=json.dumps(package_data, separators=( - ',', ':'), ensure_ascii=False), - # note: visited is True since there nothing more to visit - visited=True - ) diff --git a/minecode/visitors/freedesktop.py b/minecode/visitors/freedesktop.py deleted file mode 100644 index 52987855..00000000 --- a/minecode/visitors/freedesktop.py +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright (c) 2016 by nexB, Inc. http://www.nexb.com/ - All rights reserved. 
-# - -from __future__ import absolute_import -from __future__ import unicode_literals - -from bs4 import BeautifulSoup - -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.visitors import HttpVisitor -from minecode.visitors import URI - - -class FreedesktopSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://www.freedesktop.org/wiki/Software' - - -@visit_router.route('https://www.freedesktop.org/wiki/Software') -class FreedesktopHTMLVisitor(HttpVisitor): - """ - Visit the Freedesktop Software HTML page and return URIs parsed from HTML page. - """ - def get_uris(self, content): - url_template = 'https://www.freedesktop.org/wiki/Software/{name}' - page = BeautifulSoup(content, 'lxml') - for div in page.find_all(name='div'): - for a in div.find_all(name='a'): - if 'href' not in a.attrs: - continue - href = a['href'] - if href and href.startswith('./'): - project_name = href.replace('./', '').strip('/') - package_url = PackageURL(type='freedesktop', name=project_name).to_string() - yield URI(uri=url_template.format(name=project_name), package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://www.freedesktop.org/wiki/Software/.*') -class FreedesktopProjectHTMLVisitor(HttpVisitor): - """ - Visit the Freedesktop Project HTML page. - """ - pass diff --git a/minecode/visitors/packagist.py b/minecode/visitors/packagist.py deleted file mode 100644 index fb6adac8..00000000 --- a/minecode/visitors/packagist.py +++ /dev/null @@ -1,57 +0,0 @@ -# -# Copyright (c) 2017 by nexB, Inc. http://www.nexb.com/ - All rights reserved. -# - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals - -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import URI - -""" -Collect packagist packages - -The packagist repo API is at: https://packagist.org/apidoc -""" - - -class PackagistSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://packagist.org/packages/list.json' - - -@visit_router.route('https://packagist.org/packages/list.json') -class PackagistListVisitor(HttpJsonVisitor): - """ - Collect list json resource and yield URIs for searching with package url. - - The yield uri format is like: https://packagist.org/p/[vendor]/[package].json - """ - - def get_uris(self, content): - search_url_template = 'https://packagist.org/p/{vendor}/{package}.json' - packages_entries = content.get('packageNames', {}) - for package in packages_entries: - # FIXME: what does it mean to have no / in the URL? - if '/' not in package: - continue - vp = package.split('/') - vendor = vp[0] - package = vp[1] - package_url = PackageURL(type='composer', name=package).to_string() - yield URI(uri=search_url_template.format(vendor=vendor, package=package), package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://packagist.org/p/.*json') -class PackageVisitor(HttpJsonVisitor): - """ - Collect JSON for a package. - """ - # FIXME: what about having a download URL to fetch the real package??? - pass diff --git a/minecode/visitors/pypi.py b/minecode/visitors/pypi.py deleted file mode 100644 index ba9425a6..00000000 --- a/minecode/visitors/pypi.py +++ /dev/null @@ -1,131 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. 
-# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - - -import codecs -import json -import xmlrpc - -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.utils import get_temp_file -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import URI -from minecode.visitors import Visitor - - -""" -Visitors for Pypi and Pypi-like Python package repositories. - -We have this hierarchy in Pypi: - index (xmlrpc) -> packages (json) -> package releases (json) -> download urls - -Pypi serves a main index via XMLRPC that contains a list of package names. -For each package, a JSON contains details including the list of all releases. -For each release, a JSON contains details for the released version and all the -downloads available for this release. We create Packages at this level as well -as one download URI for each effective download. - -Some information about every release and download is replicated in every JSON -payload and is ignored for simplicity (which is not super efficient). -""" - - -class PypiSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://pypi.python.org/pypi/' - - -@visit_router.route('https://pypi.python.org/pypi/') -class PypiIndexVisitor(Visitor): - """ - Collect package metadata URIs from the top level pypi index for each package. - """ - - def fetch(self, uri, timeout=None): - """ - Specialized fetching using XML RPCs. - """ - packages = xmlrpc.client.ServerProxy(uri).list_packages() - content = list(packages) - - temp_file = get_temp_file('PypiIndexVisitor') - with codecs.open(temp_file, mode='wb', encoding='utf-8') as expect: - json.dump(content, expect, indent=2, separators=(',', ':')) - return temp_file - - def dumps(self, content): - """ - The content is huge json and should not be dumped. - """ - return None - - def get_uris(self, content): - with codecs.open(content, mode='rb', encoding='utf-8') as contentfile: - packages_list = json.load(contentfile) - - url_template = 'https://pypi.python.org/pypi/{name}/json' - for name in packages_list: - package_url = PackageURL(type='pypi', name=name).to_string() - yield URI(uri=url_template.format(name=name), package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://pypi.python.org/pypi/[^/]+/json') -class PypiPackageVisitor(HttpJsonVisitor): - """ - Collect package metadata URIs for all release of a single Pypi package. - The url will contain only the package name, for example: https://pypi.org/pypi/vmock/json - By parsing the content, the goal is to form the json with version/release: https://pypi.org/pypi/vmock/0.1/json - """ - - def get_uris(self, content): - - url_template = 'https://pypi.python.org/pypi/{name}/{release}/json' - info = content.get('info', {}) - name = info.get('name') - if name: - for release in content['releases']: - package_url = PackageURL( - type='pypi', name=name, version=release).to_string() - yield URI(uri=url_template.format(name=name, release=release), package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://pypi.python.org/pypi/[^/]+/[^/]+/json') -class PypiPackageReleaseVisitor(HttpJsonVisitor): - """ - Collect package download URIs for all packages archives of one Pypi package - release. 
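The hierarchy described above walks from the package JSON to one JSON document per release. A minimal sketch of that walk with plain requests, assuming the pypi.org JSON API layout:

    import requests

    def iter_release_json_urls(name):
        # The package-level JSON lists every release under 'releases'.
        data = requests.get(f'https://pypi.org/pypi/{name}/json').json()
        for release in data.get('releases', {}):
            # One more JSON document per release carries its download URLs.
            yield f'https://pypi.org/pypi/{name}/{release}/json'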
The example is: https://pypi.org/pypi/vmock/0.1/json - """ - - def get_uris(self, content): - # TODO: this is likely best ignored entirely??? - # A download_url may be provided for an off-Pypi-download - info = content.get('info', {}) - name = info.get('name') - version = None - download_url = info.get('download_url') - if download_url and download_url != 'UNKNOWN': - version = info.get('version') - package_url = PackageURL( - type='pypi', name=name, version=version).to_string() - yield URI(uri=download_url, package_url=package_url, source_uri=self.uri) - - # Common on-Pypi-download URLs are in the urls block - for download in content.get('urls', {}): - url = download.get('url') - if not url: - continue - package_url = PackageURL( - type='pypi', name=name, version=version).to_string() - yield URI(url, package_url=package_url, file_name=download.get('filename'), - size=download.get('size'), date=download.get('upload_time'), - md5=download.get('md5_digest'), source_uri=self.uri) diff --git a/minecode/visitors/rubygems.py b/minecode/visitors/rubygems.py deleted file mode 100644 index 2bac0c10..00000000 --- a/minecode/visitors/rubygems.py +++ /dev/null @@ -1,145 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - - -import gzip -import json -import logging -import os - -from rubymarshal import reader -from rubymarshal.classes import UsrMarshal -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.utils import extract_file -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import NonPersistentHttpVisitor -from minecode.visitors import URI - - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - - -# FIXME: we are missing several API calls: -# http://guides.rubygems.org/rubygems-org-api/ - -class RubyGemsSeed(seed.Seeder): - - def get_seeds(self): - # We keep only specs.4.8.gz and exclude latest_spec.4.8.gz, - # since specs.4.8.gz covers all uris in latest spec. - yield 'http://rubygems.org/specs.4.8.gz' - - -class GemVersion(UsrMarshal): - - def version(self): - return self.values['version'] - - -@visit_router.route('https?://rubygems\.org/specs\.4\.8\.gz') -class RubyGemsIndexVisitor(NonPersistentHttpVisitor): - """ - Collect REST APIs URIs from RubyGems index file. - """ - - def get_uris(self, content): - with gzip.open(content, 'rb') as idx: - index = idx.read() - - # TODO: use a purl!!! 
- for name, version, platform in reader.loads(index): - json_url = 'https://rubygems.org/api/v1/versions/{name}.json'.format( - **locals()) - - package_url = PackageURL(type='gem', name=name).to_string() - yield URI(uri=json_url, package_url=package_url, source_uri=self.uri) - - # note: this list only has ever a single value - version = version.values[0] - if isinstance(version, bytes): - version = version.decode('utf-8') - - download_url = 'https://rubygems.org/downloads/{name}-{version}' - - if isinstance(platform, bytes): - platform = platform.decode('utf-8') - if platform != 'ruby': - download_url += '-{platform}' - - download_url += '.gem' - download_url = download_url.format(**locals()) - package_url = PackageURL( - type='gem', name=name, version=version).to_string() - yield URI(uri=download_url, package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https?://rubygems\.org/api/v1/versions/[\w\-\.]+.json') -class RubyGemsApiManyVersionsVisitor(HttpJsonVisitor): - """ - Collect the json content of each version. - Yield the uri of each gem based on name, platform and version. - The data of the uri is the JSON subset for a single version. - """ - - def get_uris(self, content): - """ - Yield URI of the gems url and data. - """ - # FIXME: return actual data too!!! - for version_details in content: - # get the gems name by parsing from the uri - name = self.uri[ - self.uri.index('/versions/') + len('/versions/'):-len('.json')] - version = version_details.get('number') - gem_name = '%(name)s-%(version)s' % locals() - package_url = PackageURL( - type='gem', name=name, version=version).to_string() - download_url = 'https://rubygems.org/downloads/%(gem_name)s.gem' % locals( - ) - yield URI(uri=download_url, source_uri=self.uri, package_url=package_url, - data=json.dumps(version_details)) - -# TODO: add API dependencies -# https://rubygems.org/api/v1/dependencies.json?gems=file_validators -# Also use Use the V2 API at http://guides.rubygems.org/rubygems-org-api-v2/ -# GET - /api/v2/rubygems/[GEM NAME]/versions/[VERSION NUMBER].(json|yaml) - - -@visit_router.route('https?://rubygems.org/downloads/[\w\-\.]+.gem') -class RubyGemsPackageArchiveMetadataVisitor(NonPersistentHttpVisitor): - """ - Fetch a Rubygems gem archive, extract it and return its metadata file content. - """ - - def dumps(self, content): - return get_gem_metadata(content) - - -def get_gem_metadata(location): - """ - Return the metadata file content as a string extracted from the gem archive - at `location`. - """ - # Extract the compressed file first. - extracted_location = extract_file(location) - metadata_gz = os.path.join(extracted_location, 'metadata.gz') - # Extract the embedded metadata gz file - extract_parent_location = extract_file(metadata_gz) - # Get the first file in the etracted folder which is the meta file location - meta_extracted_file = os.path.join( - extract_parent_location, os.listdir(extract_parent_location)[0]) - with open(meta_extracted_file) as meta_file: - return meta_file.read() diff --git a/minecode/visitors/sourceforge.py b/minecode/visitors/sourceforge.py deleted file mode 100644 index 7b2d7a7a..00000000 --- a/minecode/visitors/sourceforge.py +++ /dev/null @@ -1,90 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. 
-# See https://aboutcode.org for more information about nexB OSS projects. -# - -import logging -import re - -from bs4 import BeautifulSoup - -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import HttpVisitor -from minecode.visitors import NonPersistentHttpVisitor -from minecode.visitors import URI - - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - - -class SourceforgeSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://sourceforge.net/sitemap.xml' - - -@visit_router.route('https?://sourceforge.net/sitemap.xml') -class SourceforgeSitemapIndexVisitor(NonPersistentHttpVisitor): - """ - Collect sub-sitemaps from the main sitemap. Return on URI for each sub- - sitemap, for example: https://sourceforge.net/sitemap-167.xml - - Note that the class implements from NonPersistentHttpVisitor instead of HttpVisitor, - as the XML file itself will be over 100M big, so NonPersistentHttpVisitor will be more - reasonable. - """ - - def get_uris(self, content): - """ - Collect all the sitemaps URIs from master sitemap. - """ - locs = BeautifulSoup(open(content), 'lxml').find_all('loc') - # Content passing from NonPersistentHttpVisitor is a temp file path - # instead of file content, so opening to get a file handler is - # necessary. - for loc in locs: - yield URI(uri=loc.text, source_uri=self.uri) - - -@visit_router.route('https?://sourceforge.net/sitemap-\d+.xml') -class SourceforgeSitemapPageVisitor(HttpVisitor): - - def get_uris(self, content): - """ - Collect all the projects URIs from a sub-sitemaps. - """ - sitemap_locs = BeautifulSoup(content, 'lxml').find_all('loc') - regex = re.compile( - r"^https?://sourceforge.net/projects/[a-z0-9.-]+/?$") - for loc in sitemap_locs: - if loc.text and re.match(regex, loc.text): - project_json_baseurl = 'https://sourceforge.net/api/project/name/{}/json' - project_name = loc.text.partition( - 'https://sourceforge.net/projects/')[-1].strip('/') - project_json_url = project_json_baseurl.format(project_name) - package_url = PackageURL( - type='sourceforge', name=project_name).to_string() - # The priority in the xml has different view with the priority in visitor, so skip it. - yield URI(uri=project_json_url, package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https?://sourceforge.net/api/project/name/[a-z0-9.-]+/json', - 'https?://sourceforge.net/rest/p/[a-z0-9.-]+' - ) -class SourceforgeProjectJsonVisitor(HttpJsonVisitor): - """ - Collect Sourceforge project data through the JSON API. - The implementation is empty since it will inherit the implementation from HttpJsonVisitor and it returns json data for mapper. - """ - pass diff --git a/packagedb/api.py b/packagedb/api.py index 104d95a7..63247f0a 100644 --- a/packagedb/api.py +++ b/packagedb/api.py @@ -40,9 +40,9 @@ from univers.versions import InvalidVersion # UnusedImport here! 
-# But importing the mappers and visitors module triggers routes registration +# But importing the miners module triggers routes registration from minecode import priority_router -from minecode import visitors # NOQA +from minecode import miners # NOQA from minecode.models import PriorityResourceURI from minecode.models import ScannableURI from minecode.route import NoRouteAvailable diff --git a/packagedb/management/commands/fix_purl_values.py b/packagedb/management/commands/fix_purl_values.py index 61c6587b..02d3196f 100644 --- a/packagedb/management/commands/fix_purl_values.py +++ b/packagedb/management/commands/fix_purl_values.py @@ -21,8 +21,8 @@ from minecode.management.commands import VerboseCommand from minecode.utils import MemorySavingQuerysetIterator -from minecode.visitors.maven import collect_links_from_text -from minecode.visitors.maven import filter_for_artifacts +from minecode.miners.maven import collect_links_from_text +from minecode.miners.maven import filter_for_artifacts from packagedb.models import Package DEFAULT_TIMEOUT = 30 diff --git a/purl2vcs/src/purl2vcs/find_source_repo.py b/purl2vcs/src/purl2vcs/find_source_repo.py index a3bd708c..fd60d3d9 100644 --- a/purl2vcs/src/purl2vcs/find_source_repo.py +++ b/purl2vcs/src/purl2vcs/find_source_repo.py @@ -19,7 +19,7 @@ from scancode.api import get_urls as get_urls_from_location from minecode.model_utils import add_package_to_scan_queue -from minecode.visitors.maven import get_merged_ancestor_package_from_maven_package +from minecode.miners.maven import get_merged_ancestor_package_from_maven_package from packagedb.models import Package, PackageContentType, PackageSet logger = logging.getLogger(__name__) diff --git a/purldb_project/settings.py b/purldb_project/settings.py index ff8cf620..28c89180 100644 --- a/purldb_project/settings.py +++ b/purldb_project/settings.py @@ -302,7 +302,7 @@ # Active seeders: each active seeder class need to be added explicitly here ACTIVE_SEEDERS = [ - 'minecode.visitors.maven.MavenSeed', + 'minecode.miners.maven.MavenSeed', ] SPECTACULAR_SETTINGS = { From 7c64de17f03dcc3a3b2edb61af012cf28e777301 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 9 Aug 2024 11:23:55 -0700 Subject: [PATCH 02/12] Move priority_router related code to minecode/collectors #515 Signed-off-by: Jono Yang --- minecode/api.py | 4 +- minecode/{miners => collectors}/conan.py | 0 minecode/collectors/debian.py | 513 ++++++++++++ minecode/{miners => collectors}/generic.py | 0 minecode/collectors/github.py | 42 + minecode/{miners => collectors}/gnu.py | 2 +- minecode/collectors/maven.py | 730 +++++++++++++++++- minecode/collectors/npm.py | 111 +++ minecode/collectors/openssl.py | 42 + .../commands/get_maven_release_dates.py | 4 +- minecode/management/commands/import_queue.py | 12 +- minecode/management/commands/maven_crawler.py | 2 +- .../management/commands/priority_queue.py | 5 +- minecode/miners/debian.py | 505 +----------- minecode/miners/github.py | 32 - minecode/miners/maven.py | 707 ----------------- minecode/miners/npm.py | 91 --- minecode/miners/openssl.py | 34 +- minecode/miners/openwrt.py | 2 +- minecode/tests/{ => collectors}/test_conan.py | 6 +- .../tests/{ => collectors}/test_generic.py | 2 +- minecode/tests/{ => collectors}/test_gnu.py | 4 +- minecode/tests/collectors/test_maven.py | 516 +++++++++++++ minecode/tests/collectors/test_npm.py | 57 ++ minecode/tests/test_maven.py | 724 ++--------------- minecode/tests/test_npm.py | 46 -- packagedb/api.py | 5 +- .../management/commands/fix_purl_values.py | 4 +- 
 packagedb/tests/test_api.py | 1 -
 purl2vcs/src/purl2vcs/find_source_repo.py | 2 +-
 30 files changed, 2110 insertions(+), 2095 deletions(-)
 rename minecode/{miners => collectors}/conan.py (100%)
 create mode 100644 minecode/collectors/debian.py
 rename minecode/{miners => collectors}/generic.py (100%)
 create mode 100644 minecode/collectors/github.py
 rename minecode/{miners => collectors}/gnu.py (95%)
 create mode 100644 minecode/collectors/npm.py
 create mode 100644 minecode/collectors/openssl.py
 rename minecode/tests/{ => collectors}/test_conan.py (95%)
 rename minecode/tests/{ => collectors}/test_generic.py (98%)
 rename minecode/tests/{ => collectors}/test_gnu.py (93%)
 create mode 100644 minecode/tests/collectors/test_maven.py
 create mode 100644 minecode/tests/collectors/test_npm.py

diff --git a/minecode/api.py b/minecode/api.py
index 487c4491..8fce1795 100644
--- a/minecode/api.py
+++ b/minecode/api.py
@@ -24,8 +24,8 @@ from rest_framework.response import Response
 # UnusedImport here!
-# But importing the miners module triggers routes registration
-from minecode import miners # NOQA
+# But importing the collectors module triggers routes registration
+from minecode import collectors # NOQA
 from minecode import priority_router
 from minecode.models import PriorityResourceURI, ResourceURI, ScannableURI
 from minecode.permissions import IsScanQueueWorkerAPIUser
diff --git a/minecode/miners/conan.py b/minecode/collectors/conan.py
similarity index 100%
rename from minecode/miners/conan.py
rename to minecode/collectors/conan.py
diff --git a/minecode/collectors/debian.py b/minecode/collectors/debian.py
new file mode 100644
index 00000000..1457e45f
--- /dev/null
+++ b/minecode/collectors/debian.py
@@ -0,0 +1,513 @@
+from packagedcode.debian import DebianDscFileHandler
+from packagedcode.debian_copyright import StandaloneDebianCopyrightFileHandler
+from debian_inspector.version import Version as DebVersion
+import requests
+from minecode import priority_router
+from minecode.utils import fetch_and_write_file_from_url
+from minecode.utils import get_package_sha1
+from packagedb.models import make_relationship
+from packagedb.models import PackageContentType
+from packagedb.models import PackageRelation
+from packageurl import PackageURL
+import logging
+from packagedcode import models as scan_models
+import attr
+
+
+logger = logging.getLogger(__name__)
+handler = logging.StreamHandler()
+logger.addHandler(handler)
+logger.setLevel(logging.INFO)
+
+
+DEBIAN_BASE_URL = "https://deb.debian.org/debian/pool/main/"
+DEBIAN_METADATA_URL = "https://metadata.ftp-master.debian.org/changelogs/main/"
+
+UBUNTU_BASE_URL = "http://archive.ubuntu.com/ubuntu/pool/main/"
+UBUNTU_METADATA_URL = "http://changelogs.ubuntu.com/changelogs/pool/main/"
+
+
+@priority_router.route('pkg:deb/.*')
+def process_request(purl_str, **kwargs):
+    """
+    Process `priority_resource_uri` containing a debian Package URL (PURL) as a
+    URI.
+
+    This involves obtaining Package information for the PURL from debian and
+    using it to create a new PackageDB entry. The binary package is then added to the
+    scan queue afterwards. We also get the Package information for the
+    accompanying source package and add it to the PackageDB and scan queue, if
+    available.
+
+    Return an error string for errors that occur, or empty string if there is no error.
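    For example, a versioned Debian purl routes here and may carry an optional
    source purl. A minimal sketch (the purl values are hypothetical examples):

        from packageurl import PackageURL

        # Hypothetical purl, for illustration only.
        purl = PackageURL.from_string('pkg:deb/debian/attr@2.4.48-4?arch=amd64')
        assert purl.version  # only versioned purls are mapped further

        # kwargs mirror the contract described above
        error = process_request(purl.to_string(), source_purl='pkg:deb/debian/attr@2.4.48-4')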
+    """
+    from minecode.model_utils import DEFAULT_PIPELINES
+
+    source_purl = kwargs.get("source_purl", None)
+    addon_pipelines = kwargs.get('addon_pipelines', [])
+    pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines)
+    priority = kwargs.get('priority', 0)
+
+    try:
+        package_url = PackageURL.from_string(purl_str)
+        source_package_url = None
+        if source_purl:
+            source_package_url = PackageURL.from_string(source_purl)
+
+    except ValueError as e:
+        error = f'error occurred when parsing purl: {purl_str} source_purl: {source_purl} : {e}'
+        return error
+
+    has_version = bool(package_url.version)
+    if has_version:
+        error = map_debian_metadata_binary_and_source(
+            package_url=package_url,
+            source_package_url=source_package_url,
+            pipelines=pipelines,
+            priority=priority,
+        )
+
+    return error
+
+
+def map_debian_package(debian_package, package_content, pipelines, priority=0):
+    """
+    Add a debian `package_url` to the PackageDB.
+
+    Return an error string if errors have occurred in the process.
+    """
+    from minecode.model_utils import add_package_to_scan_queue
+    from minecode.model_utils import merge_or_create_package
+
+    db_package = None
+    error = ''
+
+    purl = debian_package.package_url
+    if package_content == PackageContentType.BINARY:
+        download_url = debian_package.binary_archive_url
+    elif package_content == PackageContentType.SOURCE_ARCHIVE:
+        download_url = debian_package.source_archive_url
+
+    response = requests.get(download_url)
+    if not response.ok:
+        msg = f'Package metadata does not exist on debian: {download_url}'
+        error += msg + '\n'
+        logger.error(msg)
+        return db_package, error
+
+    purl_package = scan_models.PackageData(
+        type=purl.type,
+        namespace=purl.namespace,
+        name=purl.name,
+        version=purl.version,
+        qualifiers=purl.qualifiers,
+    )
+
+    package, error_metadata = get_debian_package_metadata(debian_package)
+    if not package:
+        error += error_metadata
+        return db_package, error
+
+    package_copyright, error_copyright = get_debian_package_copyright(debian_package)
+    package.update_purl_fields(package_data=purl_package, replace=True)
+    if package_copyright:
+        update_license_copyright_fields(
+            package_from=package_copyright,
+            package_to=package,
+            replace=True,
+        )
+    else:
+        error += error_copyright
+
+    # This will be used to download and scan the package
+    package.download_url = download_url
+
+    # Set package_content value
+    package.extra_data['package_content'] = package_content
+
+    # If sha1 exists for an archive, we know we can create the package
+    # Use purl info as base and create packages for binary and source package
+    sha1 = get_package_sha1(package=package, field="download_url")
+    if sha1:
+        package.sha1 = sha1
+        db_package, _, _, _ = merge_or_create_package(package, visit_level=50)
+    else:
+        msg = f'Failed to retrieve package archive: {purl.to_string()} from url: {download_url}'
+        error += msg + '\n'
+        logger.error(msg)
+
+    # Submit package for scanning
+    if db_package:
+        add_package_to_scan_queue(db_package, pipelines, priority)
+
+    return db_package, error
+
+
+def get_debian_package_metadata(debian_package):
+    """
+    Given a DebianPackage object with package url and source package url
+    information, get the .dsc package metadata url, fetch the .dsc file,
+    parse and return the PackageData object containing the package metadata
+    for that Debian package.
+
+    If there are errors, return None and a string containing the error
+    information.
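    The fetch-then-parse step this implements can be sketched standalone. A
    minimal sketch, assuming a reachable .dsc URL (the URL and file name here
    are hypothetical):

        import requests
        from packagedcode.debian import DebianDscFileHandler

        # Hypothetical .dsc URL, for illustration only.
        url = 'https://deb.debian.org/debian/pool/main/a/attr/attr_2.4.48-4.dsc'
        with open('attr.dsc', 'wb') as f:
            f.write(requests.get(url).content)

        # parse() yields PackageData objects; the last one holds the metadata.
        package = list(DebianDscFileHandler.parse(location='attr.dsc')).pop()
        print(package.purl)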
+ """ + error = '' + + metadata_url = debian_package.package_metadata_url + temp_metadata_file = fetch_and_write_file_from_url(url=metadata_url) + if not temp_metadata_file: + msg = f'Package metadata does not exist on debian: {metadata_url}' + error += msg + '\n' + logger.error(msg) + return None, error + + packages = DebianDscFileHandler.parse(location=temp_metadata_file) + package = list(packages).pop() + + package.qualifiers = debian_package.package_url.qualifiers + + return package, error + + +def get_debian_package_copyright(debian_package): + """ + Given a DebianPackage object with package url and source package url + information, get the debian copyright file url, fetch and run license + detection, and return the PackageData object containing the package + metadata for that Debian package. + + If there are errors, return None and a string containing the error + information. + """ + error = '' + + metadata_url = debian_package.package_copyright_url + temp_metadata_file = fetch_and_write_file_from_url(url=metadata_url) + if not temp_metadata_file: + msg = f'Package metadata does not exist on debian: {metadata_url}' + error += msg + '\n' + logger.error(msg) + return None, error + + packages = StandaloneDebianCopyrightFileHandler.parse(location=temp_metadata_file) + package = list(packages).pop() + + package.qualifiers = debian_package.package_url.qualifiers + + return package, error + + +def update_license_copyright_fields(package_from, package_to, replace=True): + fields_to_update = [ + 'copyright', + 'holder', + 'declared_license_expression', + 'declared_license_expression_spdx', + 'license_detections', + 'other_license_expression', + 'other_license_expression_spdx', + 'other_license_detections', + 'extracted_license_statement' + ] + + for field in fields_to_update: + value = getattr(package_from, field) + if value and replace: + setattr(package_to, field, value) + + +def map_debian_metadata_binary_and_source(package_url, source_package_url, pipelines, priority=0): + """ + Get metadata for the binary and source release of the Debian package + `package_url` and save it to the PackageDB. + + Return an error string for errors that occur, or empty string if there is no error. 
+ """ + error = '' + + if "repository_url" in package_url.qualifiers: + base_url = package_url.qualifiers["repository_url"] + elif package_url.namespace == 'ubuntu': + base_url = UBUNTU_BASE_URL + else: + base_url = DEBIAN_BASE_URL + + if "api_data_url" in package_url.qualifiers: + metadata_base_url = package_url.qualifiers["api_data_url"] + elif package_url.namespace == 'ubuntu': + metadata_base_url = UBUNTU_METADATA_URL + else: + metadata_base_url = DEBIAN_METADATA_URL + + package_urls = dict( + package_url=package_url, + source_package_url=source_package_url, + archive_base_url=base_url, + metadata_base_url=metadata_base_url, + ) + debian_package, emsg = DebianPackage.from_purls(package_urls) + if emsg: + return emsg + + binary_package, emsg = map_debian_package( + debian_package, + PackageContentType.BINARY, + pipelines, + priority, + ) + if emsg: + error += emsg + + package_url.qualifiers['classifier'] = 'sources' + source_package, emsg = map_debian_package( + debian_package, + PackageContentType.SOURCE_ARCHIVE, + pipelines, + priority, + ) + if emsg: + error += emsg + + if binary_package and source_package: + make_relationship( + from_package=binary_package, + to_package=source_package, + relationship=PackageRelation.Relationship.SOURCE_PACKAGE, + ) + + return error + + +@attr.s +class DebianPackage: + """ + Contains the package url and source package url for a debian package + necessary to get source, binary, metadata and copyright urls for it. + """ + + archive_base_url = attr.ib(type=str) + metadata_base_url = attr.ib(type=str) + package_url = attr.ib(type=str) + source_package_url = attr.ib(type=str) + metadata_directory_url = attr.ib(type=str, default=None) + archive_directory_url = attr.ib(type=str, default=None) + + @classmethod + def from_purls(cls, package_urls): + """ + Set the directory URLs for metadata and package archives. + """ + debian_package = cls(**package_urls) + error = debian_package.set_debian_directories() + return debian_package, error + + @property + def package_archive_version(self): + """ + Get the useful part of the debian package version used in + source, binary, metadata and copyright URLs optionally. + """ + debvers = DebVersion.from_string(self.package_url.version) + if debvers.revision != "0": + purl_version = f"{debvers.upstream}-{debvers.revision}" + else: + purl_version = debvers.upstream + return purl_version + + @property + def binary_archive_url(self): + """ + Get the .deb debian binary archive url for this debian package. + """ + purl_version = self.package_archive_version + arch = self.package_url.qualifiers.get("arch") + if arch: + archive_name =f"{self.package_url.name}_{purl_version}_{arch}.deb" + else: + archive_name =f"{self.package_url.name}_{purl_version}.deb" + binary_package_url = self.archive_directory_url + f"{archive_name}" + return binary_package_url + + @property + def source_archive_url(self): + """ + Get the debian source tarball archive url for this debian package. 
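    The source and binary archive names reuse the trimmed version computed by
    package_archive_version above. A quick sketch of that trimming (the package
    name and versions are hypothetical):

        from debian_inspector.version import Version as DebVersion

        def archive_version(version):
            # Keep upstream-revision unless the revision is the default "0".
            v = DebVersion.from_string(version)
            return f'{v.upstream}-{v.revision}' if v.revision != '0' else v.upstream

        assert archive_version('2.4.48-4') == '2.4.48-4'
        assert archive_version('2.4.48') == '2.4.48'  # native package, no revision
        # a binary archive is then named e.g.: attr_2.4.48-4_amd64.deb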
+ """ + debian_source_archive_formats = [ + ".tar.xz", ".tar.gz", ".orig.tar.xz", ".orig.tar.gz", ".orig.tar.bz2" + ] + + source_version = self.package_archive_version + if not self.source_package_url: + source_package_name = self.package_url.name + else: + source_package_name = self.source_package_url.name + if self.source_package_url.version: + source_version = self.source_package_url.version + + for archive_format in debian_source_archive_formats: + if ".orig" in archive_format: + base_version_source = source_version.split('-')[0] + archive_name = f"{source_package_name}_{base_version_source}" + archive_format + else: + archive_name = f"{source_package_name}_{source_version}" + archive_format + source_package_url = self.archive_directory_url + archive_name + response = requests.get(source_package_url) + if response.ok: + break + + return source_package_url + + @property + def package_metadata_url(self): + """ + Get the .dsc metadata file url for this debian package. + """ + metadata_version = self.package_archive_version + if not self.source_package_url: + metadata_package_name = self.package_url.name + else: + metadata_package_name = self.source_package_url.name + if self.source_package_url.version: + metadata_version = self.source_package_url.version + + base_version_metadata = metadata_version.split('+')[0] + metadata_dsc_package_url = self.archive_directory_url + f"{metadata_package_name}_{base_version_metadata}.dsc" + response = requests.get(metadata_dsc_package_url) + if not response.ok: + metadata_dsc_package_url = self.archive_directory_url + f"{metadata_package_name}_{metadata_version}.dsc" + + return metadata_dsc_package_url + + @property + def package_copyright_url(self): + """ + Get the debian copyright file url containing license and copyright + declarations for this debian package. 
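    The two layouts this property has to cover differ only in where the file
    name ends. A sketch of both URL shapes, derived from the constants above
    (the package and version are hypothetical):

        # Debian: a name_version_copyright file beside the other metadata files.
        debian_url = (
            'https://metadata.ftp-master.debian.org/changelogs/main/'
            'a/attr/attr_2.4.48-4_copyright'
        )
        # Ubuntu: a name_version directory holding a bare `copyright` file.
        ubuntu_url = (
            'http://changelogs.ubuntu.com/changelogs/pool/main/'
            'a/attr/attr_2.4.48-4/copyright'
        )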
+ """ + # Copyright files for ubuntu are named just `copyright` and placed under a name-version folder + # instead of having the name-version in the copyright file itself + copyright_file_string = "_copyright" + if self.package_url.namespace == "ubuntu": + copyright_file_string = "/copyright" + + metadata_version = self.package_archive_version + if not self.source_package_url: + metadata_package_name = self.package_url.name + else: + metadata_package_name = self.source_package_url.name + if self.source_package_url.version: + metadata_version = self.source_package_url.version + + copyright_package_url = self.metadata_directory_url + f"{metadata_package_name}_{metadata_version}{copyright_file_string}" + response = requests.get(copyright_package_url) + if not response.ok: + base_version_metadata = metadata_version.split('+')[0] + copyright_package_url = self.metadata_directory_url + f"{metadata_package_name}_{base_version_metadata}{copyright_file_string}" + + return copyright_package_url + + def set_debian_directories(self): + """ + Compute and set base urls for metadata and archives, to get + source/binary + """ + error = '' + + archive_base_url = self.archive_base_url + metadata_base_url = self.metadata_base_url + + index_folder = None + if self.package_url.name.startswith('lib'): + name_wout_lib = self.package_url.name.replace("lib", "") + index_folder = 'lib' + name_wout_lib[0] + else: + index_folder = self.package_url.name[0] + + msg = "No directory exists for package at: " + + package_directory = f"{archive_base_url}{index_folder}/{self.package_url.name}/" + metadata_directory = f"{metadata_base_url}{index_folder}/{self.package_url.name}/" + + response = requests.get(package_directory) + if not response.ok: + if not self.source_package_url: + error = msg + str(package_directory) + return error + + if self.source_package_url.name.startswith('lib'): + name_wout_lib = self.source_package_url.name.replace("lib", "") + index_folder = 'lib' + name_wout_lib[0] + else: + index_folder = self.source_package_url.name[0] + + package_directory = f"{archive_base_url}{index_folder}/{self.source_package_url.name}/" + metadata_directory = f"{metadata_base_url}{index_folder}/{self.source_package_url.name}/" + + response = requests.get(package_directory) + if not response.ok: + error = msg + str(package_directory) + return error + + self.archive_directory_url = package_directory + self.metadata_directory_url = metadata_directory + + +# FIXME: We are not returning download URLs. Returned information is incorrect + + +def get_dependencies(data): + """ + Return a list of DependentPackage extracted from a Debian `data` mapping. + """ + scopes = { + 'Build-Depends': dict(is_runtime=False, is_optional=True), + 'Depends': dict(is_runtime=True, is_optional=False), + 'Pre-Depends': dict(is_runtime=True, is_optional=False), + # 'Provides': dict(is_runtime=True, is_optional=False), + # 'Recommends': dict(is_runtime=True, is_optional=True), + # 'Suggests': dict(is_runtime=True, is_optional=True), + } + dep_pkgs = [] + for scope, flags in scopes.items(): + depends = data.get(scope) + if not depends: + continue + + dependencies = None # debutils.comma_separated(depends) + if not dependencies: + continue + # break each dep in package names and version constraints + # FIXME:!!! 
+    for name in dependencies:
+        purl = PackageURL(type='deb', namespace='debian', name=name)
+        dep = scan_models.DependentPackage(purl=purl.to_string(), scope=scope, **flags)
+        dep_pkgs.append(dep)
+
+    return dep_pkgs
+
+
+def get_vcs_repo(description):
+    """
+    Return a tuple of (vcs_tool, vcs_repo) or (None, None) if no vcs_repo is found.
+    """
+    repos = []
+    for vcs_tool, vcs_repo in description.items():
+        vcs_tool = vcs_tool.lower()
+        if not vcs_tool.startswith('vcs-') or vcs_tool.startswith('vcs-browser'):
+            continue
+        _, _, vcs_tool = vcs_tool.partition('-')
+        repos.append((vcs_tool, vcs_repo))
+
+    if len(repos) > 1:
+        raise TypeError('Debian description with more than one Vcs repo: %(repos)r' % locals())
+
+    if repos:
+        vcs_tool, vcs_repo = repos[0]
+    else:
+        vcs_tool = None
+        vcs_repo = None
+
+    return vcs_tool, vcs_repo
diff --git a/minecode/miners/generic.py b/minecode/collectors/generic.py
similarity index 100%
rename from minecode/miners/generic.py
rename to minecode/collectors/generic.py
diff --git a/minecode/collectors/github.py b/minecode/collectors/github.py
new file mode 100644
index 00000000..b43f98da
--- /dev/null
+++ b/minecode/collectors/github.py
@@ -0,0 +1,42 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+from packageurl import PackageURL
+from minecode import priority_router
+from minecode.collectors.generic import map_fetchcode_supported_package
+
+
+# Indexing GitHub PURLs requires a GitHub API token.
+# Please add your GitHub API key to the `.env` file, for example: `GH_TOKEN=your-github-api-key`.
+@priority_router.route('pkg:github/.*')
+def process_request_dir_listed(purl_str, **kwargs):
+    """
+    Process `priority_resource_uri` containing a GitHub Package URL (PURL).
+
+    This involves obtaining Package information for the PURL using
+    https://github.com/aboutcode-org/fetchcode and using it to create a new
+    PackageDB entry. The package is then added to the scan queue afterwards.
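    Assuming a GH_TOKEN is configured as noted above, a direct call exercises
    this route. A minimal sketch (the purl is a hypothetical example):

        # Any pkg:github/... purl string matches the route above.
        error = process_request_dir_listed('pkg:github/aboutcode-org/fetchcode')
        if error:
            print(error)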
+ """ + from minecode.model_utils import DEFAULT_PIPELINES + + addon_pipelines = kwargs.get('addon_pipelines', []) + pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) + priority = kwargs.get('priority', 0) + + try: + package_url = PackageURL.from_string(purl_str) + except ValueError as e: + error = f"error occurred when parsing {purl_str}: {e}" + return error + + error_msg = map_fetchcode_supported_package( + package_url, pipelines, priority) + + if error_msg: + return error_msg diff --git a/minecode/miners/gnu.py b/minecode/collectors/gnu.py similarity index 95% rename from minecode/miners/gnu.py rename to minecode/collectors/gnu.py index 288bbe54..ccd1a3e9 100644 --- a/minecode/miners/gnu.py +++ b/minecode/collectors/gnu.py @@ -13,7 +13,7 @@ from packageurl import PackageURL from minecode import priority_router -from minecode.miners.generic import map_fetchcode_supported_package +from minecode.collectors.generic import map_fetchcode_supported_package logger = logging.getLogger(__name__) handler = logging.StreamHandler() diff --git a/minecode/collectors/maven.py b/minecode/collectors/maven.py index 22fcc02c..b3e7e825 100644 --- a/minecode/collectors/maven.py +++ b/minecode/collectors/maven.py @@ -1,10 +1,47 @@ -from dateutil.parser import parse as dateutil_parse +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +from typing import Dict +from urllib.parse import urlparse from minecode.miners.maven import get_artifacts, is_worthy_artifact, build_url_and_filename from packagedcode.maven import get_urls from minecode.utils import fetch_http, get_temp_file from packagedcode.models import PackageData +import hashlib +import re +import requests +from packagedcode.maven import get_urls +from packagedcode.maven import get_maven_pom +from minecode import priority_router +from minecode.utils import validate_sha1 +from packagedb.models import make_relationship +from packagedb.models import PackageContentType +from packagedb.models import PackageRelation +from packagedb.models import make_relationship +from packageurl import PackageURL +from packagedcode.maven import _parse +import logging + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +TRACE = False +TRACE_DEEP = False + +if TRACE: + import sys + logging.basicConfig(stream=sys.stdout) + logger.setLevel(logging.DEBUG) +MAVEN_BASE_URL = 'https://repo1.maven.org/maven2' MAVEN_INDEX_URL = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz' @@ -93,3 +130,694 @@ def get_packages(self, content=None): repository_download_url=repository_download_url, api_data_url=api_data_url, ) + + +def get_pom_text(namespace, name, version, qualifiers={}, base_url=MAVEN_BASE_URL): + """ + Return the contents of the POM file of the package described by the purl + field arguments in a string. + """ + # Create URLs using purl fields + if qualifiers and not isinstance(qualifiers, Dict): + return + urls = get_urls( + namespace=namespace, + name=name, + version=version, + qualifiers=qualifiers, + base_url=base_url, + ) + if not urls: + return + # Get and parse POM info + pom_url = urls['api_data_url'] + # TODO: manage different types of errors (404, etc.) 
+    response = requests.get(pom_url)
+    if not response:
+        return
+    return response.text
+
+
+def fetch_parent(pom_text, base_url=MAVEN_BASE_URL):
+    """
+    Return the parent pom text of `pom_text`, or None if `pom_text` has no parent.
+    """
+    if not pom_text:
+        return
+    pom = get_maven_pom(text=pom_text)
+    if (
+        pom.parent
+        and pom.parent.group_id
+        and pom.parent.artifact_id
+        and pom.parent.version.version
+    ):
+        parent_namespace = pom.parent.group_id
+        parent_name = pom.parent.artifact_id
+        parent_version = str(pom.parent.version.version)
+        parent_pom_text = get_pom_text(
+            namespace=parent_namespace,
+            name=parent_name,
+            version=parent_version,
+            qualifiers={},
+            base_url=base_url,
+        )
+        return parent_pom_text
+
+
+def get_ancestry(pom_text, base_url=MAVEN_BASE_URL):
+    """
+    Return the pom texts of the ancestors of the pom represented by `pom_text`,
+    ordered from oldest ancestor to newest. The result is empty if there is no
+    parent pom.
+    """
+    ancestors = []
+    has_parent = True
+    while has_parent:
+        parent_pom_text = fetch_parent(pom_text=pom_text, base_url=base_url)
+        if not parent_pom_text:
+            has_parent = False
+        else:
+            ancestors.append(parent_pom_text)
+            pom_text = parent_pom_text
+    return reversed(ancestors)
+
+
+def get_merged_ancestor_package_from_maven_package(package, base_url=MAVEN_BASE_URL):
+    """
+    Merge package details of a package with its ancestor pom
+    and return the merged package.
+    """
+    if not package:
+        return
+    pom_text = get_pom_text(
+        name=package.name,
+        namespace=package.namespace,
+        version=package.version,
+        qualifiers=package.qualifiers,
+        base_url=base_url,
+    )
+    merged_package = merge_ancestors(
+        ancestor_pom_texts=get_ancestry(pom_text),
+        package=package,
+    )
+    return merged_package
+
+
+def merge_parent(package, parent_package):
+    """
+    Merge `parent_package` data into `package` and return `package`.
+    """
+    mergeable_fields = (
+        'declared_license_expression',
+        'homepage_url',
+        'parties',
+    )
+    for field in mergeable_fields:
+        # If `field` is empty on the package we're looking at, populate
+        # those fields with values from the parent package.
+        if not getattr(package, field):
+            value = getattr(parent_package, field)
+            setattr(package, field, value)
+
+            msg = f'Field `{field}` has been updated using values obtained from the parent POM {parent_package.purl}'
+            history = package.extra_data.get('history')
+            if history:
+                package.extra_data['history'].append(msg)
+            else:
+                package.extra_data['history'] = [msg]
+
+    return package
+
+
+def merge_ancestors(ancestor_pom_texts, package):
+    """
+    Merge metadata from `ancestor_pom_texts` into `package`.
+
+    The order of POM content in `ancestor_pom_texts` is expected to be in the
+    order of oldest ancestor to newest.
+    """
+    for ancestor_pom_text in ancestor_pom_texts:
+        ancestor_package = _parse(
+            datasource_id='maven_pom',
+            package_type='maven',
+            primary_language='Java',
+            text=ancestor_pom_text,
+        )
+        package = merge_parent(package, ancestor_package)
+    return package
+
+
+def map_maven_package(package_url, package_content, pipelines, priority=0, reindex_metadata=False):
+    """
+    Add a maven `package_url` to the PackageDB.
+
+    Return an error string if errors have occurred in the process.
+
+    If ``reindex_metadata`` is True, only reindex metadata and DO NOT rescan the full package.
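    The ancestry helpers above compose into a simple pipeline: fetch a POM,
    then walk and merge its parents. A minimal sketch, assuming the coordinates
    exist on Maven Central (they are hypothetical here):

        # Hypothetical coordinates, for illustration only.
        pom_text = get_pom_text(
            namespace='org.apache.commons', name='commons-lang3', version='3.12.0'
        )
        # Oldest ancestor comes first, as merge_ancestors expects.
        for ancestor_text in get_ancestry(pom_text):
            print(ancestor_text[:60])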
+ """ + from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package + + db_package = None + error = '' + + if 'repository_url' in package_url.qualifiers: + base_url = package_url.qualifiers['repository_url'] + else: + base_url = MAVEN_BASE_URL + + pom_text = get_pom_text( + namespace=package_url.namespace, + name=package_url.name, + version=package_url.version, + qualifiers=package_url.qualifiers, + base_url=base_url, + ) + if not pom_text: + msg = f'Package does not exist on maven: {package_url}' + error += msg + '\n' + logger.error(msg) + return db_package, error + + package = _parse( + 'maven_pom', + 'maven', + 'Java', + text=pom_text, + base_url=base_url, + ) + ancestor_pom_texts = get_ancestry(pom_text=pom_text, base_url=base_url) + package = merge_ancestors(ancestor_pom_texts=ancestor_pom_texts, package=package) + + urls = get_urls( + namespace=package_url.namespace, + name=package_url.name, + version=package_url.version, + qualifiers=package_url.qualifiers, + base_url=base_url, + ) + # In the case of looking up a maven package with qualifiers of + # `classifiers=sources`, the purl of the package created from the pom does + # not have the qualifiers, so we need to set them. Additionally, the download + # url is not properly generated since it would be missing the sources bit + # from the filename. + package.qualifiers = package_url.qualifiers + package.download_url = urls['repository_download_url'] + package.repository_download_url = urls['repository_download_url'] + + # Set package_content value + package.extra_data['package_content'] = package_content + + # If sha1 exists for a jar, we know we can create the package + # Use pom info as base and create packages for binary and source package + + # Check to see if binary is available + sha1 = get_package_sha1(package) + if sha1: + package.sha1 = sha1 + override = reindex_metadata + db_package, _, _, _ = merge_or_create_package(package, visit_level=50, override=override) + else: + msg = f'Failed to retrieve JAR: {package_url}' + error += msg + '\n' + logger.error(msg) + + if not reindex_metadata: + # Submit package for scanning + if db_package: + add_package_to_scan_queue( + package=db_package, + pipelines=pipelines, + priority=priority + ) + + return db_package, error + + +def map_maven_binary_and_source(package_url, pipelines, priority=0, reindex_metadata=False): + """ + Get metadata for the binary and source release of the Maven package + `package_url` and save it to the PackageDB. + + Return an error string for errors that occur, or empty string if there is no error. 
+    """
+    error = ''
+    package, emsg = map_maven_package(
+        package_url=package_url,
+        package_content=PackageContentType.BINARY,
+        pipelines=pipelines,
+        priority=priority,
+        reindex_metadata=reindex_metadata,
+    )
+    if emsg:
+        error += emsg
+
+    source_package_url = package_url
+    source_package_url.qualifiers['classifier'] = 'sources'
+    source_package, emsg = map_maven_package(
+        package_url=source_package_url,
+        package_content=PackageContentType.SOURCE_ARCHIVE,
+        pipelines=pipelines,
+        priority=priority,
+        reindex_metadata=reindex_metadata,
+    )
+    if emsg:
+        error += emsg
+
+    if not reindex_metadata and package and source_package:
+        make_relationship(
+            from_package=source_package,
+            to_package=package,
+            relationship=PackageRelation.Relationship.SOURCE_PACKAGE,
+        )
+
+    return error
+
+
+def map_maven_packages(package_url, pipelines):
+    """
+    Given a valid `package_url` with no version, get metadata for the binary and
+    source release for each version of the Maven package `package_url` and save
+    it to the PackageDB.
+
+    Return an error string for errors that occur, or empty string if there is no error.
+    """
+    error = ''
+    namespace = package_url.namespace
+    name = package_url.name
+    # Find all versions of this package
+    query_params = f'g:{namespace}+AND+a:{name}'
+    url = f'https://search.maven.org/solrsearch/select?q={query_params}&core=gav'
+    response = requests.get(url)
+    if response:
+        package_listings = response.json().get('response', {}).get('docs', [])
+        for listing in package_listings:
+            purl = PackageURL(
+                type='maven',
+                namespace=listing.get('g'),
+                name=listing.get('a'),
+                version=listing.get('v'),
+            )
+            emsg = map_maven_binary_and_source(purl, pipelines)
+            if emsg:
+                error += emsg
+    return error
+
+
+def get_package_sha1(package):
+    """
+    Return the sha1 value for `package` by checking if the sha1 file exists for
+    `package` on maven and returning the contents if it does.
+    If the sha1 is invalid, we download the package's JAR and calculate the sha1
+    from that.
+    """
+    download_url = package.repository_download_url
+    sha1_download_url = f'{download_url}.sha1'
+    response = requests.get(sha1_download_url)
+    if response.ok:
+        sha1_contents = response.text.strip().split()
+        sha1 = sha1_contents[0]
+        sha1 = validate_sha1(sha1)
+        if not sha1:
+            # Download JAR and calculate sha1 if we cannot get it from the repo
+            response = requests.get(download_url)
+            if response:
+                sha1_hash = hashlib.new('sha1', response.content)
+                sha1 = sha1_hash.hexdigest()
+        return sha1
+
+
+@priority_router.route('pkg:maven/.*')
+def process_request(purl_str, **kwargs):
+    """
+    Process a `priority_resource_uri` containing a maven Package URL (PURL) as
+    a URI.
+
+    This involves obtaining Package information for the PURL from maven and
+    using it to create a new PackageDB entry. The package is then added to the
+    scan queue afterwards. We also get the Package information for the
+    accompanying source package and add it to the PackageDB and scan queue, if
+    available.
+
+    Return an error string for errors that occur, or empty string if there is no error.
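+
+    A usage sketch, as the priority queue would invoke it (hypothetical PURL;
+    `addon_pipelines`, `priority` and `reindex_metadata` are optional kwargs):
+
+        error = process_request('pkg:maven/org.apache.commons/commons-lang3@3.12.0')  # hypothetical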
+    """
+    from minecode.model_utils import DEFAULT_PIPELINES
+
+    addon_pipelines = kwargs.get('addon_pipelines', [])
+    pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines)
+    priority = kwargs.get('priority', 0)
+
+    try:
+        package_url = PackageURL.from_string(purl_str)
+    except ValueError as e:
+        error = f'error occurred when parsing {purl_str}: {e}'
+        return error
+
+    has_version = bool(package_url.version)
+    if has_version:
+        reindex_metadata = kwargs.get("reindex_metadata", False)
+        error = map_maven_binary_and_source(
+            package_url,
+            pipelines,
+            reindex_metadata=reindex_metadata,
+            priority=priority,
+        )
+    else:
+        error = map_maven_packages(package_url, pipelines)
+
+    return error
+
+
+collect_links = re.compile(r'href="([^"]+)"').findall
+# Capture both the link target and its timestamp from an index-page row.
+collect_links_and_artifact_timestamps = re.compile(
+    r'<a href="([^"]+)".*</a>\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)'
+).findall
+
+
+def check_if_file_name_is_linked_on_page(file_name, links, **kwargs):
+    """
+    Return True if `file_name` is in `links`.
+    """
+    return any(l.endswith(file_name) for l in links)
+
+
+def check_if_page_has_pom_files(links, **kwargs):
+    """
+    Return True if any entry in `links` ends with .pom.
+    """
+    return any(l.endswith('.pom') for l in links)
+
+
+def check_if_page_has_directories(links, **kwargs):
+    """
+    Return True if any entry, excluding "../", ends with /.
+    """
+    return any(l.endswith('/') for l in links if l != '../')
+
+
+def check_if_package_version_page(links, **kwargs):
+    """
+    Return True if `links` contains pom files and has no directories.
+    """
+    return check_if_page_has_pom_files(
+        links=links
+    ) and not check_if_page_has_directories(links=links)
+
+
+def check_if_package_page(links, **kwargs):
+    """
+    Return True if `links` has a "maven-metadata.xml" entry and no pom files,
+    as a package page lists available versions rather than artifacts.
+    """
+    return check_if_file_name_is_linked_on_page(
+        file_name='maven-metadata.xml', links=links
+    ) and not check_if_page_has_pom_files(links=links)
+
+
+def check_if_maven_root(links, **kwargs):
+    """
+    Return True if "archetype-catalog.xml" is in `links`, as the root of a Maven
+    repo contains "archetype-catalog.xml".
+    """
+    return check_if_file_name_is_linked_on_page(
+        file_name='archetype-catalog.xml', links=links
+    )
+
+
+def check_on_page(url, checker):
+    """
+    Return True if the links collected from the page at `url` satisfy the
+    `checker` function, False otherwise.
+    """
+    response = requests.get(url)
+    if response:
+        links = collect_links(response.text)
+        return checker(links=links)
+    return False
+
+
+def is_maven_root(url):
+    """
+    Return True if `url` is the root of a Maven repo, False otherwise.
+    """
+    return check_on_page(url, check_if_maven_root)
+
+
+def is_package_page(url):
+    """
+    Return True if `url` is a package page on a Maven repo, False otherwise.
+    """
+    return check_on_page(url, check_if_package_page)
+
+
+def is_package_version_page(url):
+    """
+    Return True if `url` is a package version page on a Maven repo, False otherwise.
+    """
+    return check_on_page(url, check_if_package_version_page)
+
+
+def url_parts(url):
+    """
+    Return the (scheme, netloc, path_segments) parts of `url`.
+    """
+    parsed_url = urlparse(url)
+    scheme = parsed_url.scheme
+    netloc = parsed_url.netloc
+    path_segments = [p for p in parsed_url.path.split('/') if p]
+    return scheme, netloc, path_segments
+
+
+def create_url(scheme, netloc, path_segments):
+    """
+    Return a URL assembled from `scheme`, `netloc`, and `path_segments`.
+    """
+    url_template = f'{scheme}://{netloc}'
+    path = '/'.join(path_segments)
+    return f'{url_template}/{path}'
+
+
+def get_maven_root(url):
+    """
+    Given a `url` to a namespace, package, or artifact in a Maven repo,
+    return the URL to the root of that repo. If a Maven root cannot be
+    determined, return None.
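+
+    Root detection probes each URL prefix with `is_maven_root`, which checks
+    whether the page links to "archetype-catalog.xml" (network access assumed).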
+
+    >>> get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/')
+    'https://repo1.maven.org/maven2'
+    """
+    scheme, netloc, path_segments = url_parts(url)
+    for i in range(len(path_segments)):
+        segments = path_segments[: i + 1]
+        url_segment = create_url(scheme, netloc, segments)
+        if is_maven_root(url_segment):
+            return url_segment
+    return None
+
+
+def determine_namespace_name_version_from_url(url, root_url=None):
+    """
+    Return a 3-tuple containing strings of a Package namespace, name, and
+    version, determined from `url`, where `url` points to namespace, package,
+    specific package version, or artifact on a Maven repo.
+
+    Raise an Exception if a Maven root cannot be determined from `url`.
+
+    >>> determine_namespace_name_version_from_url('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/')
+    ('net.shibboleth', 'parent', '7.11.0')
+    """
+    if not root_url:
+        root_url = get_maven_root(url)
+        if not root_url:
+            raise Exception(f'Error: not a Maven repository: {url}')
+
+    _, remaining_path_segments = url.split(root_url)
+    remaining_path_segments = remaining_path_segments.split('/')
+    remaining_path_segments = [p for p in remaining_path_segments if p]
+
+    namespace_segments = []
+    package_name = ''
+    package_version = ''
+    for i in range(len(remaining_path_segments)):
+        segment = remaining_path_segments[i]
+        segments = remaining_path_segments[: i + 1]
+        path = '/'.join(segments)
+        url_segment = f'{root_url}/{path}'
+        if is_package_page(url_segment):
+            package_name = segment
+        elif is_package_version_page(url_segment):
+            package_version = segment
+        else:
+            namespace_segments.append(segment)
+    namespace = '.'.join(namespace_segments)
+    return namespace, package_name, package_version
+
+
+def add_to_import_queue(url, root_url):
+    """
+    Create an ImportableURI for the Maven repo package page at `url`.
+    """
+    from minecode.models import ImportableURI
+
+    data = None
+    response = requests.get(url)
+    if response:
+        data = response.text
+    namespace, name, _ = determine_namespace_name_version_from_url(url, root_url)
+    purl = PackageURL(
+        type='maven',
+        namespace=namespace,
+        name=name,
+    )
+    importable_uri = ImportableURI.objects.insert(url, data, purl)
+    if importable_uri:
+        logger.info(f'Inserted {url} into ImportableURI queue')
+
+
+def filter_only_directories(timestamps_by_links):
+    """
+    Given a mapping of `timestamps_by_links`, where the links are directory
+    names (which end with `/`), return a new mapping that contains only the
+    directory entries, excluding "../".
+    """
+    timestamps_by_links_filtered = {}
+    for link, timestamp in timestamps_by_links.items():
+        if link != '../' and link.endswith('/'):
+            timestamps_by_links_filtered[link] = timestamp
+    return timestamps_by_links_filtered
+
+
+valid_artifact_extensions = [
+    'ejb3',
+    'ear',
+    'aar',
+    'apk',
+    'gem',
+    'jar',
+    'nar',
+    # 'pom',
+    'so',
+    'swc',
+    'tar',
+    'tar.gz',
+    'war',
+    'xar',
+    'zip',
+]
+
+
+def filter_for_artifacts(timestamps_by_links):
+    """
+    Given a mapping of `timestamps_by_links`, where the links are the filenames
+    of Maven artifacts, return a mapping of filenames whose extension is in
+    `valid_artifact_extensions` and their timestamps.
+    """
+    timestamps_by_links_filtered = {}
+    for link, timestamp in timestamps_by_links.items():
+        for ext in valid_artifact_extensions:
+            if link.endswith(ext):
+                timestamps_by_links_filtered[link] = timestamp
+    return timestamps_by_links_filtered
+
+
+def collect_links_from_text(text, filter):
+    """
+    Return a mapping of link locations and their timestamps, given HTML `text`
+    content, that is filtered using `filter`.
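+
+    A doctest-style sketch on a single Apache-style index row, assuming the
+    two-group link-and-timestamp regex above:
+
+        >>> text = '<a href="foo/">foo/</a>    2024-01-01 12:00    -'
+        >>> collect_links_from_text(text, filter=filter_only_directories)
+        {'foo/': '2024-01-01 12:00'}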
+    """
+    links_and_timestamps = collect_links_and_artifact_timestamps(text)
+    timestamps_by_links = {}
+    for link, timestamp in links_and_timestamps:
+        if timestamp == '-':
+            timestamp = ''
+        timestamps_by_links[link] = timestamp
+
+    timestamps_by_links = filter(timestamps_by_links=timestamps_by_links)
+    return timestamps_by_links
+
+
+def create_absolute_urls_for_links(text, url, filter):
+    """
+    Given the `text` contents from `url`, return a mapping of absolute URLs to
+    links from `url` and their timestamps, that is then filtered by `filter`.
+    """
+    timestamps_by_absolute_links = {}
+    url = url.rstrip('/')
+    timestamps_by_links = collect_links_from_text(text, filter)
+    for link, timestamp in timestamps_by_links.items():
+        if not link.startswith(url):
+            link = f'{url}/{link}'
+        timestamps_by_absolute_links[link] = timestamp
+    return timestamps_by_absolute_links
+
+
+def get_directory_links(url):
+    """
+    Return a mapping of absolute directory URLs to timestamps for the
+    hyperlinks in the page at `url`.
+    """
+    timestamps_by_directory_links = {}
+    response = requests.get(url)
+    if response:
+        timestamps_by_directory_links = create_absolute_urls_for_links(
+            response.text, url=url, filter=filter_only_directories
+        )
+    return timestamps_by_directory_links
+
+
+def get_artifact_links(url):
+    """
+    Return a mapping of absolute artifact URLs to timestamps for the
+    hyperlinks in the page at `url`.
+    """
+    timestamps_by_artifact_links = {}
+    response = requests.get(url)
+    if response:
+        timestamps_by_artifact_links = create_absolute_urls_for_links(
+            response.text, url=url, filter=filter_for_artifacts
+        )
+    return timestamps_by_artifact_links
+
+
+def crawl_to_package(url, root_url):
+    """
+    Given a maven repo `url`, recurse through its directory listings; when a
+    package page is found, add it to the import queue.
+    """
+    if is_package_page(url):
+        add_to_import_queue(url, root_url)
+        return
+
+    for link in get_directory_links(url):
+        crawl_to_package(link, root_url)
+
+
+def crawl_maven_repo_from_root(root_url):
+    """
+    Given the `root_url` of a maven repo, traverse the repo depth-first and add
+    packages to the import queue.
+    """
+    crawl_to_package(root_url, root_url)
+
+
+def get_artifact_sha1(artifact_url):
+    """
+    Return the SHA1 value of the Maven artifact located at `artifact_url`.
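+
+    For example (hypothetical artifact URL, network access assumed):
+
+        sha1 = get_artifact_sha1(
+            'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/'
+            '0.2.0/livereload-jvm-0.2.0.jar')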
+    """
+    sha1 = None
+    artifact_sha1_url = f'{artifact_url}.sha1'
+    response = requests.get(artifact_sha1_url)
+    if response:
+        sha1_contents = response.text.strip().split()
+        sha1 = sha1_contents[0]
+        sha1 = validate_sha1(sha1)
+    return sha1
+
+
+def get_classifier_from_artifact_url(
+    artifact_url, package_version_page_url, package_name, package_version
+):
+    """
+    Return the classifier from a Maven artifact URL `artifact_url`, or None
+    if a classifier cannot be determined from `artifact_url`.
+    """
+    classifier = None
+    # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0
+    package_version_page_url = package_version_page_url.rstrip('/')
+    # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0
+    leading_url_portion = f'{package_version_page_url}/{package_name}-{package_version}'
+    # artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar'
+    # ['', '-onejar.jar']
+    _, remaining_url_portion = artifact_url.split(leading_url_portion)
+    # ['-onejar', 'jar']
+    remaining_url_portions = remaining_url_portion.split('.')
+    if remaining_url_portions and remaining_url_portions[0]:
+        # '-onejar'
+        classifier = remaining_url_portions[0]
+        if classifier.startswith('-'):
+            # 'onejar'
+            classifier = classifier[1:]
+    return classifier
diff --git a/minecode/collectors/npm.py b/minecode/collectors/npm.py
new file mode 100644
index 00000000..10ab3575
--- /dev/null
+++ b/minecode/collectors/npm.py
@@ -0,0 +1,111 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import logging
+
+from packagedb.models import PackageContentType
+from minecode import priority_router
+import requests
+from packageurl import PackageURL
+from packagedcode.npm import npm_api_url
+from packagedcode.npm import NpmPackageJsonHandler
+
+
+"""
+Collect NPM packages from npm registries.
+"""
+
+logger = logging.getLogger(__name__)
+handler = logging.StreamHandler()
+logger.addHandler(handler)
+logger.setLevel(logging.INFO)
+
+
+def get_package_json(namespace, name, version):
+    """
+    Return the JSON contents of the package.json file of the package described
+    by the purl field arguments.
+    """
+    # Create URLs using purl fields
+    url = npm_api_url(
+        namespace=namespace,
+        name=name,
+        version=version,
+    )
+
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.HTTPError as err:
+        logger.error(f"HTTP error occurred: {err}")
+
+
+def map_npm_package(package_url, pipelines, priority=0):
+    """
+    Add an npm `package_url` to the PackageDB.
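+
+    A minimal usage sketch (hypothetical PURL; assumes network access and a
+    configured PackageDB):
+
+        purl = PackageURL.from_string('pkg:npm/lodash@4.17.21')  # hypothetical
+        error = map_npm_package(purl, pipelines=(), priority=0)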
+
+    Return an error string if any errors are encountered during the process.
+    """
+    from minecode.model_utils import add_package_to_scan_queue
+    from minecode.model_utils import merge_or_create_package
+
+    package_json = get_package_json(
+        namespace=package_url.namespace,
+        name=package_url.name,
+        version=package_url.version,
+    )
+
+    if not package_json:
+        error = f'Package does not exist on npmjs: {package_url}'
+        logger.error(error)
+        return error
+
+    package = NpmPackageJsonHandler._parse(
+        json_data=package_json
+    )
+    package.extra_data['package_content'] = PackageContentType.SOURCE_ARCHIVE
+
+    db_package, _, _, error = merge_or_create_package(package, visit_level=0)
+
+    # Submit package for scanning
+    if db_package:
+        add_package_to_scan_queue(
+            package=db_package,
+            pipelines=pipelines,
+            priority=priority
+        )
+
+    return error
+
+
+@priority_router.route('pkg:npm/.*')
+def process_request(purl_str, **kwargs):
+    """
+    Process a `priority_resource_uri` containing an npm Package URL (PURL) as
+    a URI.
+
+    This involves obtaining Package information for the PURL from npm and
+    using it to create a new PackageDB entry. The package is then added to the
+    scan queue afterwards.
+    """
+    from minecode.model_utils import DEFAULT_PIPELINES
+
+    addon_pipelines = kwargs.get('addon_pipelines', [])
+    pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines)
+    priority = kwargs.get('priority', 0)
+
+    package_url = PackageURL.from_string(purl_str)
+    if not package_url.version:
+        return
+
+    error_msg = map_npm_package(package_url, pipelines, priority)
+
+    if error_msg:
+        return error_msg
diff --git a/minecode/collectors/openssl.py b/minecode/collectors/openssl.py
new file mode 100644
index 00000000..aee9a54e
--- /dev/null
+++ b/minecode/collectors/openssl.py
@@ -0,0 +1,42 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+from packageurl import PackageURL
+from minecode import priority_router
+
+from minecode.collectors.generic import map_fetchcode_supported_package
+
+
+# Indexing OpenSSL PURLs requires a GitHub API token.
+# Please add your GitHub API key to the `.env` file, for example: `GH_TOKEN=your-github-api`.
+@priority_router.route('pkg:openssl/openssl@.*')
+def process_request_dir_listed(purl_str, **kwargs):
+    """
+    Process a `priority_resource_uri` containing an OpenSSL Package URL (PURL)
+    supported by fetchcode.
+
+    This involves obtaining Package information for the PURL using
+    https://github.com/nexB/fetchcode and using it to create a new
+    PackageDB entry. The package is then added to the scan queue afterwards.
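+
+    Usage sketch (hypothetical PURL; requires the GH_TOKEN noted above):
+
+        error = process_request_dir_listed('pkg:openssl/openssl@3.1.0')  # hypothetical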
+ """ + from minecode.model_utils import DEFAULT_PIPELINES + + addon_pipelines = kwargs.get('addon_pipelines', []) + pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) + priority = kwargs.get('priority', 0) + + try: + package_url = PackageURL.from_string(purl_str) + except ValueError as e: + error = f"error occurred when parsing {purl_str}: {e}" + return error + + error_msg = map_fetchcode_supported_package(package_url, pipelines, priority) + + if error_msg: + return error_msg diff --git a/minecode/management/commands/get_maven_release_dates.py b/minecode/management/commands/get_maven_release_dates.py index ded0916e..257f7e77 100644 --- a/minecode/management/commands/get_maven_release_dates.py +++ b/minecode/management/commands/get_maven_release_dates.py @@ -15,8 +15,8 @@ import requests from minecode.management.commands import VerboseCommand -from minecode.miners.maven import collect_links_from_text -from minecode.miners.maven import filter_for_artifacts +from minecode.collectors.maven import collect_links_from_text +from minecode.collectors.maven import filter_for_artifacts from packagedb.models import Package diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py index 6dfa3a85..74f2ed67 100644 --- a/minecode/management/commands/import_queue.py +++ b/minecode/management/commands/import_queue.py @@ -22,14 +22,14 @@ from minecode.management.commands import get_error_message from minecode.management.commands import VerboseCommand from minecode.models import ImportableURI -from minecode.miners.maven import get_artifact_links -from minecode.miners.maven import get_classifier_from_artifact_url -from minecode.miners.maven import collect_links_from_text -from minecode.miners.maven import filter_only_directories -from minecode.miners.maven import get_artifact_sha1 +from minecode.collectors.maven import get_artifact_links +from minecode.collectors.maven import get_classifier_from_artifact_url +from minecode.collectors.maven import collect_links_from_text +from minecode.collectors.maven import filter_only_directories +from minecode.collectors.maven import get_artifact_sha1 from minecode.model_utils import merge_or_create_package from packagedcode.models import PackageData -from minecode.miners.maven import determine_namespace_name_version_from_url +from minecode.collectors.maven import determine_namespace_name_version_from_url logger = logging.getLogger(__name__) diff --git a/minecode/management/commands/maven_crawler.py b/minecode/management/commands/maven_crawler.py index 9a90815b..f4be733c 100644 --- a/minecode/management/commands/maven_crawler.py +++ b/minecode/management/commands/maven_crawler.py @@ -10,7 +10,7 @@ import logging import sys -from minecode.miners.maven import crawl_maven_repo_from_root +from minecode.collectors.maven import crawl_maven_repo_from_root from minecode.management.commands import VerboseCommand diff --git a/minecode/management/commands/priority_queue.py b/minecode/management/commands/priority_queue.py index ca3c5bff..6c445702 100644 --- a/minecode/management/commands/priority_queue.py +++ b/minecode/management/commands/priority_queue.py @@ -16,13 +16,12 @@ from django.utils import timezone # UnusedImport here! 
-# But importing the miners module triggers routes registration -from minecode import miners # NOQA +# But importing the collectors module triggers routes registration +from minecode import collectors # NOQA from minecode import priority_router from minecode.management.commands import get_error_message from minecode.management.commands import VerboseCommand from minecode.models import PriorityResourceURI -from minecode.models import ScannableURI from minecode.route import NoRouteAvailable diff --git a/minecode/miners/debian.py b/minecode/miners/debian.py index 84fb6fb7..ae2975c9 100644 --- a/minecode/miners/debian.py +++ b/minecode/miners/debian.py @@ -12,16 +12,12 @@ import gzip import json import logging -import requests from commoncode import fileutils import debian_inspector from debian_inspector import debcon from debian_inspector import copyright as debcopy -from debian_inspector.version import Version as DebVersion from packagedcode import models as scan_models -from packagedcode.debian import DebianDscFileHandler -from packagedcode.debian_copyright import StandaloneDebianCopyrightFileHandler from packageurl import PackageURL from minecode import debutils @@ -29,17 +25,12 @@ from minecode import seed from minecode import map_router from minecode import visit_router -from minecode import priority_router from minecode.miners import HttpVisitor from minecode.miners import Mapper from minecode.miners import NonPersistentHttpVisitor from minecode.miners import URI -from minecode.utils import fetch_and_write_file_from_url from minecode.utils import form_vcs_url -from minecode.utils import get_package_sha1 -from packagedb.models import make_relationship -from packagedb.models import PackageContentType -from packagedb.models import PackageRelation + logger = logging.getLogger(__name__) handler = logging.StreamHandler() @@ -324,500 +315,6 @@ class DebianReleaseVisitor(HttpVisitor): pass -@priority_router.route('pkg:deb/.*') -def process_request(purl_str, **kwargs): - """ - Process `priority_resource_uri` containing a maven Package URL (PURL) as a - URI. - - This involves obtaining Package information for the PURL from debian and - using it to create a new PackageDB entry. The binary package is then added to the - scan queue afterwards. We also get the Package information for the - accompanying source package and add it to the PackageDB and scan queue, if - available. - - Return an error string for errors that occur, or empty string if there is no error. - """ - from minecode.model_utils import DEFAULT_PIPELINES - - source_purl = kwargs.get("source_purl", None) - addon_pipelines = kwargs.get('addon_pipelines', []) - pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) - - try: - package_url = PackageURL.from_string(purl_str) - source_package_url = None - if source_purl: - source_package_url = PackageURL.from_string(source_purl) - - except ValueError as e: - error = f'error occured when parsing purl: {purl_str} source_purl: {source_purl} : {e}' - return error - - has_version = bool(package_url.version) - if has_version: - error = map_debian_metadata_binary_and_source( - package_url=package_url, - source_package_url=source_package_url, - pipelines=pipelines, - priority=priority, - ) - - return error - - -def map_debian_package(debian_package, package_content, pipelines, priority=0): - """ - Add a debian `package_url` to the PackageDB. - - Return an error string if errors have occured in the process. 
- """ - from minecode.model_utils import add_package_to_scan_queue - from minecode.model_utils import merge_or_create_package - - db_package = None - error = '' - - purl = debian_package.package_url - if package_content == PackageContentType.BINARY: - download_url = debian_package.binary_archive_url - elif package_content == PackageContentType.SOURCE_ARCHIVE: - download_url = debian_package.source_archive_url - - response = requests.get(download_url) - if not response.ok: - msg = f'Package metadata does not exist on debian: {download_url}' - error += msg + '\n' - logger.error(msg) - return db_package, error - - purl_package = scan_models.PackageData( - type=purl.type, - namespace=purl.namespace, - name=purl.name, - version=purl.version, - qualifiers=purl.qualifiers, - ) - - package, error_metadata = get_debian_package_metadata(debian_package) - if not package: - error += error_metadata - return db_package, error - - package_copyright, error_copyright = get_debian_package_copyright( - debian_package) - package.update_purl_fields(package_data=purl_package, replace=True) - if package_copyright: - update_license_copyright_fields( - package_from=package_copyright, - package_to=package, - replace=True, - ) - else: - error += error_metadata - - # This will be used to download and scan the package - package.download_url = download_url - - # Set package_content value - package.extra_data['package_content'] = package_content - - # If sha1 exists for an archive, we know we can create the package - # Use purl info as base and create packages for binary and source package - sha1 = get_package_sha1(package=package, field="download_url") - if sha1: - package.sha1 = sha1 - db_package, _, _, _ = merge_or_create_package(package, visit_level=50) - else: - msg = f'Failed to retrieve package archive: {purl.to_string()} from url: {download_url}' - error += msg + '\n' - logger.error(msg) - - # Submit package for scanning - if db_package: - add_package_to_scan_queue(db_package, pipelines, priority) - - return db_package, error - - -def get_debian_package_metadata(debian_package): - """ - Given a DebianPackage object with package url and source package url - information, get the .dsc package metadata url, fetch the .dsc file, - parse and return the PackageData object containing the package metadata - for that Debian package. - - If there are errors, return None and a string containing the error - information. - """ - error = '' - - metadata_url = debian_package.package_metadata_url - temp_metadata_file = fetch_and_write_file_from_url(url=metadata_url) - if not temp_metadata_file: - msg = f'Package metadata does not exist on debian: {metadata_url}' - error += msg + '\n' - logger.error(msg) - return None, error - - packages = DebianDscFileHandler.parse(location=temp_metadata_file) - package = list(packages).pop() - - package.qualifiers = debian_package.package_url.qualifiers - - return package, error - - -def get_debian_package_copyright(debian_package): - """ - Given a DebianPackage object with package url and source package url - information, get the debian copyright file url, fetch and run license - detection, and return the PackageData object containing the package - metadata for that Debian package. - - If there are errors, return None and a string containing the error - information. 
- """ - error = '' - - metadata_url = debian_package.package_copyright_url - temp_metadata_file = fetch_and_write_file_from_url(url=metadata_url) - if not temp_metadata_file: - msg = f'Package metadata does not exist on debian: {metadata_url}' - error += msg + '\n' - logger.error(msg) - return None, error - - packages = StandaloneDebianCopyrightFileHandler.parse( - location=temp_metadata_file) - package = list(packages).pop() - - package.qualifiers = debian_package.package_url.qualifiers - - return package, error - - -def update_license_copyright_fields(package_from, package_to, replace=True): - fields_to_update = [ - 'copyright', - 'holder', - 'declared_license_expression', - 'declared_license_expression_spdx', - 'license_detections', - 'other_license_expression', - 'other_license_expression_spdx', - 'other_license_detections', - 'extracted_license_statement' - ] - - for field in fields_to_update: - value = getattr(package_from, field) - if value and replace: - setattr(package_to, field, value) - - -def map_debian_metadata_binary_and_source(package_url, source_package_url, pipelines, priority=0): - """ - Get metadata for the binary and source release of the Debian package - `package_url` and save it to the PackageDB. - - Return an error string for errors that occur, or empty string if there is no error. - """ - error = '' - - if "repository_url" in package_url.qualifiers: - base_url = package_url.qualifiers["repository_url"] - elif package_url.namespace == 'ubuntu': - base_url = UBUNTU_BASE_URL - else: - base_url = DEBIAN_BASE_URL - - if "api_data_url" in package_url.qualifiers: - metadata_base_url = package_url.qualifiers["api_data_url"] - elif package_url.namespace == 'ubuntu': - metadata_base_url = UBUNTU_METADATA_URL - else: - metadata_base_url = DEBIAN_METADATA_URL - - package_urls = dict( - package_url=package_url, - source_package_url=source_package_url, - archive_base_url=base_url, - metadata_base_url=metadata_base_url, - ) - debian_package, emsg = DebianPackage.from_purls(package_urls) - if emsg: - return emsg - - binary_package, emsg = map_debian_package( - debian_package, - PackageContentType.BINARY, - pipelines, - priority, - ) - if emsg: - error += emsg - - package_url.qualifiers['classifier'] = 'sources' - source_package, emsg = map_debian_package( - debian_package, - PackageContentType.SOURCE_ARCHIVE, - pipelines, - priority, - ) - if emsg: - error += emsg - - if binary_package and source_package: - make_relationship( - from_package=binary_package, - to_package=source_package, - relationship=PackageRelation.Relationship.SOURCE_PACKAGE, - ) - - return error - - -@attr.s -class DebianPackage: - """ - Contains the package url and source package url for a debian package - necessary to get source, binary, metadata and copyright urls for it. - """ - - archive_base_url = attr.ib(type=str) - metadata_base_url = attr.ib(type=str) - package_url = attr.ib(type=str) - source_package_url = attr.ib(type=str) - metadata_directory_url = attr.ib(type=str, default=None) - archive_directory_url = attr.ib(type=str, default=None) - - @classmethod - def from_purls(cls, package_urls): - """ - Set the directory URLs for metadata and package archives. - """ - debian_package = cls(**package_urls) - error = debian_package.set_debian_directories() - return debian_package, error - - @property - def package_archive_version(self): - """ - Get the useful part of the debian package version used in - source, binary, metadata and copyright URLs optionally. 
- """ - debvers = DebVersion.from_string(self.package_url.version) - if debvers.revision != "0": - purl_version = f"{debvers.upstream}-{debvers.revision}" - else: - purl_version = debvers.upstream - return purl_version - - @property - def binary_archive_url(self): - """ - Get the .deb debian binary archive url for this debian package. - """ - purl_version = self.package_archive_version - arch = self.package_url.qualifiers.get("arch") - if arch: - archive_name = f"{self.package_url.name}_{purl_version}_{arch}.deb" - else: - archive_name = f"{self.package_url.name}_{purl_version}.deb" - binary_package_url = self.archive_directory_url + f"{archive_name}" - return binary_package_url - - @property - def source_archive_url(self): - """ - Get the debian source tarball archive url for this debian package. - """ - debian_source_archive_formats = [ - ".tar.xz", ".tar.gz", ".orig.tar.xz", ".orig.tar.gz", ".orig.tar.bz2" - ] - - source_version = self.package_archive_version - if not self.source_package_url: - source_package_name = self.package_url.name - else: - source_package_name = self.source_package_url.name - if self.source_package_url.version: - source_version = self.source_package_url.version - - for archive_format in debian_source_archive_formats: - if ".orig" in archive_format: - base_version_source = source_version.split('-')[0] - archive_name = f"{source_package_name}_{base_version_source}" + \ - archive_format - else: - archive_name = f"{source_package_name}_{source_version}" + \ - archive_format - source_package_url = self.archive_directory_url + archive_name - response = requests.get(source_package_url) - if response.ok: - break - - return source_package_url - - @property - def package_metadata_url(self): - """ - Get the .dsc metadata file url for this debian package. - """ - metadata_version = self.package_archive_version - if not self.source_package_url: - metadata_package_name = self.package_url.name - else: - metadata_package_name = self.source_package_url.name - if self.source_package_url.version: - metadata_version = self.source_package_url.version - - base_version_metadata = metadata_version.split('+')[0] - metadata_dsc_package_url = self.archive_directory_url + \ - f"{metadata_package_name}_{base_version_metadata}.dsc" - response = requests.get(metadata_dsc_package_url) - if not response.ok: - metadata_dsc_package_url = self.archive_directory_url + \ - f"{metadata_package_name}_{metadata_version}.dsc" - - return metadata_dsc_package_url - - @property - def package_copyright_url(self): - """ - Get the debian copyright file url containing license and copyright - declarations for this debian package. 
- """ - # Copyright files for ubuntu are named just `copyright` and placed under a name-version folder - # instead of having the name-version in the copyright file itself - copyright_file_string = "_copyright" - if self.package_url.namespace == "ubuntu": - copyright_file_string = "/copyright" - - metadata_version = self.package_archive_version - if not self.source_package_url: - metadata_package_name = self.package_url.name - else: - metadata_package_name = self.source_package_url.name - if self.source_package_url.version: - metadata_version = self.source_package_url.version - - copyright_package_url = self.metadata_directory_url + \ - f"{metadata_package_name}_{metadata_version}{copyright_file_string}" - response = requests.get(copyright_package_url) - if not response.ok: - base_version_metadata = metadata_version.split('+')[0] - copyright_package_url = self.metadata_directory_url + \ - f"{metadata_package_name}_{base_version_metadata}{copyright_file_string}" - - return copyright_package_url - - def set_debian_directories(self): - """ - Compute and set base urls for metadata and archives, to get - source/binary - """ - error = '' - - archive_base_url = self.archive_base_url - metadata_base_url = self.metadata_base_url - - index_folder = None - if self.package_url.name.startswith('lib'): - name_wout_lib = self.package_url.name.replace("lib", "") - index_folder = 'lib' + name_wout_lib[0] - else: - index_folder = self.package_url.name[0] - - msg = "No directory exists for package at: " - - package_directory = f"{archive_base_url}{index_folder}/{self.package_url.name}/" - metadata_directory = f"{metadata_base_url}{index_folder}/{self.package_url.name}/" - - response = requests.get(package_directory) - if not response.ok: - if not self.source_package_url: - error = msg + str(package_directory) - return error - - if self.source_package_url.name.startswith('lib'): - name_wout_lib = self.source_package_url.name.replace("lib", "") - index_folder = 'lib' + name_wout_lib[0] - else: - index_folder = self.source_package_url.name[0] - - package_directory = f"{archive_base_url}{index_folder}/{self.source_package_url.name}/" - metadata_directory = f"{metadata_base_url}{index_folder}/{self.source_package_url.name}/" - - response = requests.get(package_directory) - if not response.ok: - error = msg + str(package_directory) - return error - - self.archive_directory_url = package_directory - self.metadata_directory_url = metadata_directory - - -# FIXME: We are not returning download URLs. Returned information is incorrect - - -def get_dependencies(data): - """ - Return a list of DependentPackage extracted from a Debian `data` mapping. - """ - scopes = { - 'Build-Depends': dict(is_runtime=False, is_optional=True), - 'Depends': dict(is_runtime=True, is_optional=False), - 'Pre-Depends': dict(is_runtime=True, is_optional=False), - # 'Provides': dict(is_runtime=True, is_optional=False), - # 'Recommends': dict(is_runtime=True, is_optional=True), - # 'Suggests': dict(is_runtime=True, is_optional=True), - } - dep_pkgs = [] - for scope, flags in scopes.items(): - depends = data.get(scope) - if not depends: - continue - - dependencies = None # debutils.comma_separated(depends) - if not dependencies: - continue - # break each dep in package names and version constraints - # FIXME:!!! 
- for name in dependencies: - purl = PackageURL(type='deb', namespace='debian', name=name) - dep = scan_models.DependentPackage(purl=purl.to_string(), score=scope, **flags) - dep_pkgs.append(dep) - - return dep_pkgs - - -def get_vcs_repo(description): - """ - Return a tuple of (vcs_tool, vcs_repo) or (None, None) if no vcs_repo is found. - """ - repos = [] - for vcs_tool, vcs_repo in description.items(): - vcs_tool = vcs_tool.lower() - if not vcs_tool.startswith('vcs-') or vcs_tool.startswith('vcs-browser'): - continue - _, _, vcs_tool = vcs_tool.partition('-') - repos.append((vcs_tool, vcs_repo)) - - if len(repos) > 1: - raise TypeError('Debian description with more than one Vcs repos: %(repos)r' % locals()) - - if repos: - vcs_tool, vcs_repo = repos[0] - else: - vcs_tool = None - vcs_repo = None - - return vcs_tool, vcs_repo - - @map_router.route('http://ftp.debian.org/debian/pool/.*\.dsc') class DebianDescriptionMapper(Mapper): diff --git a/minecode/miners/github.py b/minecode/miners/github.py index 3d3eb055..2052b1dc 100644 --- a/minecode/miners/github.py +++ b/minecode/miners/github.py @@ -18,12 +18,10 @@ import packagedcode.models as scan_models from minecode import map_router -from minecode import priority_router from minecode import visit_router, seed from minecode.miners import HttpJsonVisitor from minecode.miners import Mapper from minecode.miners import URI -from minecode.miners.generic import map_fetchcode_supported_package from minecode.utils import form_vcs_url from minecode.utils import parse_date @@ -191,36 +189,6 @@ def json_serial_date_obj(obj): return obj.isoformat() -# Indexing GitHub PURLs requires a GitHub API token. -# Please add your GitHub API key to the `.env` file, for example: `GH_TOKEN=your-github-api`. -@priority_router.route('pkg:github/.*') -def process_request_dir_listed(purl_str, **kwargs): - """ - Process `priority_resource_uri` containing a GitHub Package URL (PURL). - - This involves obtaining Package information for the PURL using - https://github.com/aboutcode-org/fetchcode and using it to create a new - PackageDB entry. The package is then added to the scan queue afterwards. 
- """ - from minecode.model_utils import DEFAULT_PIPELINES - - addon_pipelines = kwargs.get('addon_pipelines', []) - pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) - - try: - package_url = PackageURL.from_string(purl_str) - except ValueError as e: - error = f"error occurred when parsing {purl_str}: {e}" - return error - - error_msg = map_fetchcode_supported_package( - package_url, pipelines, priority) - - if error_msg: - return error_msg - - @map_router.route('https://api\.github\.com/repos/([^/]+)/([^/]+)') class GithubMetaFileMapper(Mapper): diff --git a/minecode/miners/maven.py b/minecode/miners/maven.py index 07e325b1..66e38abb 100644 --- a/minecode/miners/maven.py +++ b/minecode/miners/maven.py @@ -8,22 +8,17 @@ # from collections import namedtuple -from typing import Dict -from urllib.parse import urlparse import gzip -import hashlib import io import json import logging import os -import re import packageurl from commoncode.text import as_unicode from packagedcode.models import PackageData from bs4 import BeautifulSoup from dateutil import tz import arrow -import requests from packageurl import PackageURL from jawa.util.utf import decode_modified_utf8 import javaproperties @@ -31,22 +26,14 @@ from packageurl import PackageURL from packagedcode.maven import build_filename from packagedcode.maven import build_url -from packagedcode.maven import get_urls -from packagedcode.maven import get_maven_pom from packagedcode.maven import _parse -from minecode import priority_router from minecode import seed from minecode import visit_router from minecode.miners import java_stream from minecode.miners import HttpVisitor from minecode.miners import NonPersistentHttpVisitor from minecode.miners import URI -from minecode.utils import validate_sha1 -from packagedb.models import make_relationship -from packagedb.models import PackageContentType -from packagedb.models import PackageRelation -from packagedb.models import make_relationship from minecode import map_router from minecode.utils import parse_date from minecode.miners import Mapper @@ -118,700 +105,6 @@ def get_seeds(self): # also has a npm mirrors: https://maven-eu.nuxeo.org/nexus/#view-repositories;npmjs~browsestorage -def get_pom_text(namespace, name, version, qualifiers={}, base_url=MAVEN_BASE_URL): - """ - Return the contents of the POM file of the package described by the purl - field arguments in a string. - """ - # Create URLs using purl fields - if qualifiers and not isinstance(qualifiers, Dict): - return - urls = get_urls( - namespace=namespace, - name=name, - version=version, - qualifiers=qualifiers, - base_url=base_url, - ) - if not urls: - return - # Get and parse POM info - pom_url = urls['api_data_url'] - # TODO: manage different types of errors (404, etc.) - response = requests.get(pom_url) - if not response: - return - return response.text - - -def fetch_parent(pom_text, base_url=MAVEN_BASE_URL): - """ - Return the parent pom text of `pom_text`, or None if `pom_text` has no parent. 
- """ - if not pom_text: - return - pom = get_maven_pom(text=pom_text) - if ( - pom.parent - and pom.parent.group_id - and pom.parent.artifact_id - and pom.parent.version.version - ): - parent_namespace = pom.parent.group_id - parent_name = pom.parent.artifact_id - parent_version = str(pom.parent.version.version) - parent_pom_text = get_pom_text( - namespace=parent_namespace, - name=parent_name, - version=parent_version, - qualifiers={}, - base_url=base_url, - ) - return parent_pom_text - - -def get_ancestry(pom_text, base_url=MAVEN_BASE_URL): - """ - Return a list of pom text of the ancestors of `pom`. The list is ordered - from oldest ancestor to newest. The list is empty is there is no parent pom. - """ - ancestors = [] - has_parent = True - while has_parent: - parent_pom_text = fetch_parent(pom_text=pom_text, base_url=base_url) - if not parent_pom_text: - has_parent = False - else: - ancestors.append(parent_pom_text) - pom_text = parent_pom_text - return reversed(ancestors) - - -def get_merged_ancestor_package_from_maven_package(package, base_url=MAVEN_BASE_URL): - """ - Merge package details of a package with its ancestor pom - and return the merged package. - """ - if not package: - return - pom_text = get_pom_text( - name=package.name, - namespace=package.namespace, - version=package.version, - qualifiers=package.qualifiers, - base_url=base_url, - ) - merged_package = merge_ancestors( - ancestor_pom_texts=get_ancestry(pom_text), - package=package, - ) - return merged_package - - -def merge_parent(package, parent_package): - """ - Merge `parent_package` data into `package` and return `package. - """ - mergeable_fields = ( - 'declared_license_expression', - 'homepage_url', - 'parties', - ) - for field in mergeable_fields: - # If `field` is empty on the package we're looking at, populate - # those fields with values from the parent package. - if not getattr(package, field): - value = getattr(parent_package, field) - setattr(package, field, value) - - msg = f'Field `{field}` has been updated using values obtained from the parent POM {parent_package.purl}' - history = package.extra_data.get('history') - if history: - package.extra_data['history'].append(msg) - else: - package.extra_data['history'] = [msg] - - return package - - -def merge_ancestors(ancestor_pom_texts, package): - """ - Merge metadata from `ancestor_pom_text` into `package`. - - The order of POM content in `ancestor_pom_texts` is expected to be in the - order of oldest ancestor to newest. - """ - for ancestor_pom_text in ancestor_pom_texts: - ancestor_package = _parse( - datasource_id='maven_pom', - package_type='maven', - primary_language='Java', - text=ancestor_pom_text, - ) - package = merge_parent(package, ancestor_package) - return package - - -def map_maven_package(package_url, package_content, pipelines, priority=0, reindex_metadata=False): - """ - Add a maven `package_url` to the PackageDB. - - Return an error string if errors have occured in the process. - - if ``reindex_metadata`` is True, only reindex metadata and DO NOT rescan the full package. 
- """ - from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package - - db_package = None - error = '' - - if 'repository_url' in package_url.qualifiers: - base_url = package_url.qualifiers['repository_url'] - else: - base_url = MAVEN_BASE_URL - - pom_text = get_pom_text( - namespace=package_url.namespace, - name=package_url.name, - version=package_url.version, - qualifiers=package_url.qualifiers, - base_url=base_url, - ) - if not pom_text: - msg = f'Package does not exist on maven: {package_url}' - error += msg + '\n' - logger.error(msg) - return db_package, error - - package = _parse( - 'maven_pom', - 'maven', - 'Java', - text=pom_text, - base_url=base_url, - ) - ancestor_pom_texts = get_ancestry(pom_text=pom_text, base_url=base_url) - package = merge_ancestors( - ancestor_pom_texts=ancestor_pom_texts, package=package) - - urls = get_urls( - namespace=package_url.namespace, - name=package_url.name, - version=package_url.version, - qualifiers=package_url.qualifiers, - base_url=base_url, - ) - # In the case of looking up a maven package with qualifiers of - # `classifiers=sources`, the purl of the package created from the pom does - # not have the qualifiers, so we need to set them. Additionally, the download - # url is not properly generated since it would be missing the sources bit - # from the filename. - package.qualifiers = package_url.qualifiers - package.download_url = urls['repository_download_url'] - package.repository_download_url = urls['repository_download_url'] - - # Set package_content value - package.extra_data['package_content'] = package_content - - # If sha1 exists for a jar, we know we can create the package - # Use pom info as base and create packages for binary and source package - - # Check to see if binary is available - sha1 = get_package_sha1(package) - if sha1: - package.sha1 = sha1 - override = reindex_metadata - db_package, _, _, _ = merge_or_create_package( - package, visit_level=50, override=override) - else: - msg = f'Failed to retrieve JAR: {package_url}' - error += msg + '\n' - logger.error(msg) - - if not reindex_metadata: - # Submit package for scanning - if db_package: - add_package_to_scan_queue( - package=db_package, - pipelines=pipelines, - priority=priority - ) - - return db_package, error - - -def map_maven_binary_and_source(package_url, pipelines, priority=0, reindex_metadata=False): - """ - Get metadata for the binary and source release of the Maven package - `package_url` and save it to the PackageDB. - - Return an error string for errors that occur, or empty string if there is no error. 
- """ - error = '' - package, emsg = map_maven_package( - package_url=package_url, - package_content=PackageContentType.BINARY, - pipelines=pipelines, - priority=priority, - reindex_metadata=reindex_metadata, - ) - if emsg: - error += emsg - - source_package_url = package_url - source_package_url.qualifiers['classifier'] = 'sources' - source_package, emsg = map_maven_package( - package_url=source_package_url, - package_content=PackageContentType.SOURCE_ARCHIVE, - pipelines=pipelines, - priority=priority, - reindex_metadata=reindex_metadata, - ) - if emsg: - error += emsg - - if not reindex_metadata and package and source_package: - make_relationship( - from_package=source_package, - to_package=package, - relationship=PackageRelation.Relationship.SOURCE_PACKAGE, - ) - - return error - - -def map_maven_packages(package_url, pipelines): - """ - Given a valid `package_url` with no version, get metadata for the binary and - source release for each version of the Maven package `package_url` and save - it to the PackageDB. - - Return an error string for errors that occur, or empty string if there is no error. - """ - error = '' - namespace = package_url.namespace - name = package_url.name - # Find all versions of this package - query_params = f'g:{namespace}+AND+a:{name}' - url = f'https://search.maven.org/solrsearch/select?q={query_params}&core=gav' - response = requests.get(url) - if response: - package_listings = response.json().get('response', {}).get('docs', []) - for listing in package_listings: - purl = PackageURL( - type='maven', - namespace=listing.get('g'), - name=listing.get('a'), - version=listing.get('v'), - ) - emsg = map_maven_binary_and_source(purl, pipelines) - if emsg: - error += emsg - return error - - -def get_package_sha1(package): - """ - Return the sha1 value for `package` by checking if the sha1 file exists for - `package` on maven and returning the contents if it does. - If the sha1 is invalid, we download the package's JAR and calculate the sha1 - from that. - """ - download_url = package.repository_download_url - sha1_download_url = f'{download_url}.sha1' - response = requests.get(sha1_download_url) - if response.ok: - sha1_contents = response.text.strip().split() - sha1 = sha1_contents[0] - sha1 = validate_sha1(sha1) - if not sha1: - # Download JAR and calculate sha1 if we cannot get it from the repo - response = requests.get(download_url) - if response: - sha1_hash = hashlib.new('sha1', response.content) - sha1 = sha1_hash.hexdigest() - return sha1 - - -@priority_router.route('pkg:maven/.*') -def process_request(purl_str, **kwargs): - """ - Process `priority_resource_uri` containing a maven Package URL (PURL) as a - URI. - - This involves obtaining Package information for the PURL from maven and - using it to create a new PackageDB entry. The package is then added to the - scan queue afterwards. We also get the Package information for the - accompanying source package and add it to the PackageDB and scan queue, if - available. - - Return an error string for errors that occur, or empty string if there is no error. 
- """ - from minecode.model_utils import DEFAULT_PIPELINES - - addon_pipelines = kwargs.get('addon_pipelines', []) - pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) - - try: - package_url = PackageURL.from_string(purl_str) - except ValueError as e: - error = f'error occured when parsing {purl_str}: {e}' - return error - - has_version = bool(package_url.version) - if has_version: - reindex_metadata = kwargs.get("reindex_metadata", False) - error = map_maven_binary_and_source( - package_url, - pipelines, - reindex_metadata=reindex_metadata, - priority=priority, - ) - else: - error = map_maven_packages(package_url, pipelines) - - return error - - -collect_links = re.compile(r'href="([^"]+)"').findall -collect_links_and_artifact_timestamps = re.compile( - r'\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)' -).findall - - -def check_if_file_name_is_linked_on_page(file_name, links, **kwargs): - """ - Return True if `file_name` is in `links` - """ - return any(l.endswith(file_name) for l in links) - - -def check_if_page_has_pom_files(links, **kwargs): - """ - Return True of any entry in `links` ends with .pom. - """ - return any(l.endswith('.pom') for l in links) - - -def check_if_page_has_directories(links, **kwargs): - """ - Return True if any entry, excluding "../", ends with /. - """ - return any(l.endswith('/') for l in links if l != '../') - - -def check_if_package_version_page(links, **kwargs): - """ - Return True if `links` contains pom files and has no directories - """ - return check_if_page_has_pom_files( - links=links - ) and not check_if_page_has_directories(links=links) - - -def check_if_package_page(links, **kwargs): - return check_if_file_name_is_linked_on_page( - file_name='maven-metadata.xml', links=links - ) and not check_if_page_has_pom_files(links=links) - - -def check_if_maven_root(links, **kwargs): - """ - Return True if "archetype-catalog.xml" is in `links`, as the root of a Maven - repo contains "archetype-catalog.xml". - """ - return check_if_file_name_is_linked_on_page( - file_name='archetype-catalog.xml', links=links - ) - - -def check_on_page(url, checker): - """ - Return True if there is a link on `url` that is the same as `file_name`, - False otherwise. - """ - response = requests.get(url) - if response: - links = collect_links(response.text) - return checker(links=links) - return False - - -def is_maven_root(url): - """ - Return True if `url` is the root of a Maven repo, False otherwise. - """ - return check_on_page(url, check_if_maven_root) - - -def is_package_page(url): - """ - Return True if `url` is a package page on a Maven repo, False otherwise. - """ - return check_on_page(url, check_if_package_page) - - -def is_package_version_page(url): - """ - Return True if `url` is a package version page on a Maven repo, False otherwise. - """ - return check_on_page(url, check_if_package_version_page) - - -def url_parts(url): - parsed_url = urlparse(url) - scheme = parsed_url.scheme - netloc = parsed_url.netloc - path_segments = [p for p in parsed_url.path.split('/') if p] - return scheme, netloc, path_segments - - -def create_url(scheme, netloc, path_segments): - url_template = f'{scheme}://{netloc}' - path = '/'.join(path_segments) - return f'{url_template}/{path}' - - -def get_maven_root(url): - """ - Given `url`, that is a URL to namespace, package, or artifact in a Maven - repo, return the URL to the root of that repo. If a Maven root cannot be - determined, return None. 
- - >>> get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') - 'https://repo1.maven.org/maven2' - """ - scheme, netloc, path_segments = url_parts(url) - for i in range(len(path_segments)): - segments = path_segments[: i + 1] - url_segment = create_url(scheme, netloc, segments) - if is_maven_root(url_segment): - return url_segment - return None - - -def determine_namespace_name_version_from_url(url, root_url=None): - """ - Return a 3-tuple containing strings of a Package namespace, name, and - version, determined from `url`, where `url` points to namespace, package, - specific package version, or artifact on a Maven repo. - - Return None if a Maven root cannot be determined from `url`. - - >>> determine_namespace_name_version_from_url('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') - ('net.shibboleth', 'parent', '7.11.0') - """ - if not root_url: - root_url = get_maven_root(url) - if not root_url: - raise Exception(f'Error: not a Maven repository: {url}') - - _, remaining_path_segments = url.split(root_url) - remaining_path_segments = remaining_path_segments.split('/') - remaining_path_segments = [p for p in remaining_path_segments if p] - - namespace_segments = [] - package_name = '' - package_version = '' - for i in range(len(remaining_path_segments)): - segment = remaining_path_segments[i] - segments = remaining_path_segments[: i + 1] - path = '/'.join(segments) - url_segment = f'{root_url}/{path}' - if is_package_page(url_segment): - package_name = segment - elif is_package_version_page(url_segment): - package_version = segment - else: - namespace_segments.append(segment) - namespace = '.'.join(namespace_segments) - return namespace, package_name, package_version - - -def add_to_import_queue(url, root_url): - """ - Create ImportableURI for the Maven repo package page at `url`. - """ - from minecode.models import ImportableURI - - data = None - response = requests.get(url) - if response: - data = response.text - namespace, name, _ = determine_namespace_name_version_from_url( - url, root_url) - purl = PackageURL( - type='maven', - namespace=namespace, - name=name, - ) - importable_uri = ImportableURI.objects.insert(url, data, purl) - if importable_uri: - logger.info(f'Inserted {url} into ImportableURI queue') - - -def filter_only_directories(timestamps_by_links): - """ - Given a mapping of `timestamps_by_links`, where the links are directory names (which end with `/`), - """ - timestamps_by_links_filtered = {} - for link, timestamp in timestamps_by_links.items(): - if link != '../' and link.endswith('/'): - timestamps_by_links_filtered[link] = timestamp - return timestamps_by_links_filtered - - -valid_artifact_extensions = [ - 'ejb3', - 'ear', - 'aar', - 'apk', - 'gem', - 'jar', - 'nar', - # 'pom', - 'so', - 'swc', - 'tar', - 'tar.gz', - 'war', - 'xar', - 'zip', -] - - -def filter_for_artifacts(timestamps_by_links): - """ - Given a mapping of `timestamps_by_links`, where the links are the filenames - of Maven artifacts, return a mapping of filenames whose extension is in - `valid_artifact_extensions` and their timestamps. - """ - timestamps_by_links_filtered = {} - for link, timestamp in timestamps_by_links.items(): - for ext in valid_artifact_extensions: - if link.endswith(ext): - timestamps_by_links_filtered[link] = timestamp - return timestamps_by_links_filtered - - -def collect_links_from_text(text, filter): - """ - Return a mapping of link locations and their timestamps, given HTML `text` - content, that is filtered using `filter`. 
-
-
-def collect_links_from_text(text, filter):
-    """
-    Return a mapping of link locations to their timestamps, collected from
-    the HTML `text` content and filtered using `filter`.
-    """
-    links_and_timestamps = collect_links_and_artifact_timestamps(text)
-    timestamps_by_links = {}
-    for link, timestamp in links_and_timestamps:
-        if timestamp == '-':
-            timestamp = ''
-        timestamps_by_links[link] = timestamp
-
-    timestamps_by_links = filter(timestamps_by_links=timestamps_by_links)
-    return timestamps_by_links
-
-
-def create_absolute_urls_for_links(text, url, filter):
-    """
-    Given the `text` contents from `url`, return a mapping of absolute URLs to
-    links from `url` and their timestamps, filtered by `filter`.
-    """
-    timestamps_by_absolute_links = {}
-    url = url.rstrip('/')
-    timestamps_by_links = collect_links_from_text(text, filter)
-    for link, timestamp in timestamps_by_links.items():
-        if not link.startswith(url):
-            link = f'{url}/{link}'
-        timestamps_by_absolute_links[link] = timestamp
-    return timestamps_by_absolute_links
-
-
-def get_directory_links(url):
-    """
-    Return a mapping of absolute directory URLs to timestamps for the
-    directory hyperlinks on the page at `url`.
-    """
-    timestamps_by_directory_links = {}
-    response = requests.get(url)
-    if response:
-        timestamps_by_directory_links = create_absolute_urls_for_links(
-            response.text, url=url, filter=filter_only_directories
-        )
-    return timestamps_by_directory_links
-
-
-def get_artifact_links(url):
-    """
-    Return a mapping of absolute artifact URLs to timestamps for the
-    artifact hyperlinks on the page at `url`.
-    """
-    timestamps_by_artifact_links = {}
-    response = requests.get(url)
-    if response:
-        timestamps_by_artifact_links = create_absolute_urls_for_links(
-            response.text, url=url, filter=filter_for_artifacts
-        )
-    return timestamps_by_artifact_links
-
-
-def crawl_to_package(url, root_url):
-    """
-    Given a Maven repo `url`, recurse depth-first through its directory links
-    and add each package page found to the import queue.
-    """
-    if is_package_page(url):
-        add_to_import_queue(url, root_url)
-        return
-
-    for link in get_directory_links(url):
-        crawl_to_package(link, root_url)
-
-
-def crawl_maven_repo_from_root(root_url):
-    """
-    Given the `root_url` to a Maven root, traverse the repo depth-first and
-    add packages to the import queue.
-    """
-    crawl_to_package(root_url, root_url)
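Putting these pieces together: collect_links_from_text() pairs each anchor with its listing timestamp, and create_absolute_urls_for_links() joins relative links onto the page URL. A short sketch against a hypothetical Apache-httpd-style index page, matching the expectations in the new tests added by this patch:

    listing = '''
    <a href="1.0.b2/" title="1.0.b2/">1.0.b2/</a>      2005-09-20 05:53    -
    <a href="1.2.01/" title="1.2.01/">1.2.01/</a>      2010-02-03 21:05    -
    '''
    url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/'

    # Relative links are joined onto the page URL to build absolute URLs.
    assert maven.create_absolute_urls_for_links(
        listing, url=url, filter=maven.filter_only_directories
    ) == {
        'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53',
        'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05',
    }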
-
-
-def get_artifact_sha1(artifact_url):
-    """
-    Return the SHA1 value of the Maven artifact located at `artifact_url`.
-    """
-    sha1 = None
-    artifact_sha1_url = f'{artifact_url}.sha1'
-    response = requests.get(artifact_sha1_url)
-    if response:
-        sha1_contents = response.text.strip().split()
-        sha1 = sha1_contents[0]
-        sha1 = validate_sha1(sha1)
-    return sha1
-
-
-def get_classifier_from_artifact_url(
-    artifact_url, package_version_page_url, package_name, package_version
-):
-    """
-    Return the classifier from a Maven artifact URL `artifact_url`, or None
-    if a classifier cannot be determined from `artifact_url`.
-    """
-    classifier = None
-    # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0
-    package_version_page_url = package_version_page_url.rstrip('/')
-    # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0
-    leading_url_portion = f'{package_version_page_url}/{package_name}-{package_version}'
-    # artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar'
-    # ['', '-onejar.jar']
-    _, remaining_url_portion = artifact_url.split(leading_url_portion)
-    # ['-onejar', 'jar']
-    remaining_url_portions = remaining_url_portion.split('.')
-    if remaining_url_portions and remaining_url_portions[0]:
-        # '-onejar'
-        classifier = remaining_url_portions[0]
-        if classifier.startswith('-'):
-            # 'onejar'
-            classifier = classifier[1:]
-    return classifier
-
 
 @visit_router.route('http://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties')
 @visit_router.route('https://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties')
 class MavenNexusPropertiesVisitor(NonPersistentHttpVisitor):
diff --git a/minecode/miners/npm.py b/minecode/miners/npm.py
index b27b78de..d0c2c2eb 100644
--- a/minecode/miners/npm.py
+++ b/minecode/miners/npm.py
@@ -14,22 +14,15 @@
 from packagedcode.npm import split_scoped_package_name
 from packagedcode.npm import NpmPackageJsonHandler
 from packageurl import PackageURL
-import requests
 
 from minecode import seed
 from minecode import map_router
-from minecode import priority_router
 from minecode import visit_router
 from minecode.miners import NonPersistentHttpVisitor
 from minecode.miners import URI
 from minecode.miners import Mapper
-from packagedb.models import PackageContentType
 
-"""
-Collect NPM packages from npm registries.
-"""
-
 logger = logging.getLogger(__name__)
 handler = logging.StreamHandler()
 logger.addHandler(handler)
@@ -112,90 +105,6 @@ def get_uris(self, content):
                 visited=True)
 
 
-def get_package_json(namespace, name, version):
-    """
-    Return the package.json data, decoded from JSON, for the package
-    described by the purl field arguments.
-    """
-    # Create URLs using purl fields
-    url = npm_api_url(
-        namespace=namespace,
-        name=name,
-        version=version,
-    )
-
-    try:
-        response = requests.get(url)
-        response.raise_for_status()
-        return response.json()
-    except requests.exceptions.HTTPError as err:
-        logger.error(f"HTTP error occurred: {err}")
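get_package_json() above wraps a lookup against the public npm registry (per the new tests, it now lives in minecode.collectors.npm). As a rough standalone sketch of the underlying registry convention, bypassing the project's npm_api_url helper:

    import requests

    def fetch_package_json(name, version):
        # Version-specific manifest endpoint of the public npm registry;
        # scoped (@namespace) packages need URL-encoding and are omitted
        # here for simplicity.
        url = f'https://registry.npmjs.org/{name}/{version}'
        response = requests.get(url)
        response.raise_for_status()
        return response.json()

    # fetch_package_json('lodash', '4.17.21')['dist']['tarball'] is the
    # tarball download URL later stored on the PackageDB entry.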
-
-
-def map_npm_package(package_url, pipelines, priority=0):
-    """
-    Add an npm `package_url` to the PackageDB.
-
-    Return an error string if any errors are encountered during the process.
-    """
-    from minecode.model_utils import add_package_to_scan_queue
-    from minecode.model_utils import merge_or_create_package
-
-    package_json = get_package_json(
-        namespace=package_url.namespace,
-        name=package_url.name,
-        version=package_url.version,
-    )
-
-    if not package_json:
-        error = f'Package does not exist on npmjs: {package_url}'
-        logger.error(error)
-        return error
-
-    package = NpmPackageJsonHandler._parse(
-        json_data=package_json
-    )
-    package.extra_data['package_content'] = PackageContentType.SOURCE_ARCHIVE
-
-    db_package, _, _, error = merge_or_create_package(package, visit_level=0)
-
-    # Submit package for scanning
-    if db_package:
-        add_package_to_scan_queue(
-            package=db_package,
-            pipelines=pipelines,
-            priority=priority
-        )
-
-    return error
-
-
-@priority_router.route('pkg:npm/.*')
-def process_request(purl_str, **kwargs):
-    """
-    Process `priority_resource_uri` containing an npm Package URL (PURL) as a
-    URI.
-
-    This involves obtaining Package information for the PURL from npm and
-    using it to create a new PackageDB entry. The package is then added to the
-    scan queue afterwards.
-    """
-    from minecode.model_utils import DEFAULT_PIPELINES
-
-    addon_pipelines = kwargs.get('addon_pipelines', [])
-    pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines)
-    priority = kwargs.get('priority', 0)
-
-    package_url = PackageURL.from_string(purl_str)
-    if not package_url.version:
-        return
-
-    error_msg = map_npm_package(package_url, pipelines, priority)
-
-    if error_msg:
-        return error_msg
-
-
 # FIXME: This route may not work when we have scoped Packages or URLs to a specific version
 # or yarn URLs
 @map_router.route('https://registry.npmjs.org/[^\/]+')
diff --git a/minecode/miners/openssl.py b/minecode/miners/openssl.py
index d9df4a12..f4eff87d 100644
--- a/minecode/miners/openssl.py
+++ b/minecode/miners/openssl.py
@@ -18,13 +18,13 @@
 from minecode import map_router
 from minecode.miners import Mapper
 from minecode.utils import parse_date
-from minecode import priority_router
+
 from minecode import seed
 from minecode import visit_router
 from minecode.utils import is_int
 from minecode.miners import HttpVisitor
 from minecode.miners import URI
-from minecode.miners.generic import map_fetchcode_supported_package
+
 
 logger = logging.getLogger(__name__)
@@ -105,36 +105,6 @@ def get_uris(self, content):
         yield URI(uri=url, source_uri=self.uri, date=date, size=size)
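Both removed blocks above and the OpenSSL block that follows share one priority-queue pattern: a handler registered on a PURL prefix receives the PURL string, parses it, and maps the package. A minimal sketch of that dispatch contract, where 'pkg:example/.*' and map_example_package are hypothetical stand-ins:

    from packageurl import PackageURL

    from minecode import priority_router

    def map_example_package(package_url, pipelines, priority):
        # Stand-in for a real collector: fetch metadata for package_url,
        # create the PackageDB entry, queue it for scanning; return an
        # error string or None.
        return None

    @priority_router.route('pkg:example/.*')
    def process_request(purl_str, **kwargs):
        from minecode.model_utils import DEFAULT_PIPELINES

        pipelines = DEFAULT_PIPELINES + tuple(kwargs.get('addon_pipelines', []))
        priority = kwargs.get('priority', 0)

        try:
            package_url = PackageURL.from_string(purl_str)
        except ValueError as e:
            return f'error occurred when parsing {purl_str}: {e}'

        return map_example_package(package_url, pipelines, priority)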
-
-
-# Indexing OpenSSL PURLs requires a GitHub API token.
-# Please add your GitHub API key to the `.env` file, for example: `GH_TOKEN=your-github-api`.
-@priority_router.route('pkg:openssl/openssl@.*')
-def process_request_dir_listed(purl_str, **kwargs):
-    """
-    Process `priority_resource_uri` containing an OpenSSL Package URL (PURL)
-    supported by fetchcode.
-
-    This involves obtaining Package information for the PURL using
-    https://github.com/aboutcode-org/fetchcode and using it to create a new
-    PackageDB entry. The package is then added to the scan queue afterwards.
-    """
-    from minecode.model_utils import DEFAULT_PIPELINES
-
-    addon_pipelines = kwargs.get('addon_pipelines', [])
-    pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines)
-    priority = kwargs.get('priority', 0)
-
-    try:
-        package_url = PackageURL.from_string(purl_str)
-    except ValueError as e:
-        error = f"error occurred when parsing {purl_str}: {e}"
-        return error
-
-    error_msg = map_fetchcode_supported_package(package_url, pipelines, priority)
-
-    if error_msg:
-        return error_msg
-
-
 @map_router.route('https://ftp.openssl.org/.*')
 class OpenSSLMapper(Mapper):
diff --git a/minecode/miners/openwrt.py b/minecode/miners/openwrt.py
index 0d584e3e..f2388bf3 100644
--- a/minecode/miners/openwrt.py
+++ b/minecode/miners/openwrt.py
@@ -22,11 +22,11 @@
 from minecode import map_router
 from minecode import visit_router
 from minecode.utils import extract_file
+from minecode.collectors.debian import get_dependencies
 from minecode.miners import Mapper
 from minecode.miners import HttpVisitor
 from minecode.miners import NonPersistentHttpVisitor
 from minecode.miners import URI
-from minecode.miners.debian import get_dependencies
 
 logger = logging.getLogger(__name__)
diff --git a/minecode/tests/test_conan.py b/minecode/tests/collectors/test_conan.py
similarity index 95%
rename from minecode/tests/test_conan.py
rename to minecode/tests/collectors/test_conan.py
index db6469eb..e44e5147 100644
--- a/minecode/tests/test_conan.py
+++ b/minecode/tests/collectors/test_conan.py
@@ -17,11 +17,11 @@
 import packagedb
 from minecode.utils_test import JsonBasedTesting
-from minecode.miners import conan
+from minecode.collectors import conan
 
 
 class ConanPriorityQueueTests(JsonBasedTesting, TestCase):
-    test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles")
+    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "testfiles")
 
     def setUp(self):
         super(ConanPriorityQueueTests, self).setUp()
@@ -91,7 +91,7 @@ def test_get_download_info(self):
         self.assertEqual(result_download_url, expected_zlib_download_url)
         self.assertEqual(result_sha256, expected_zlib_sha256)
 
-    @patch("minecode.miners.conan.get_conan_recipe")
+    @patch("minecode.collectors.conan.get_conan_recipe")
     def test_map_conan_package(self, mock_get_conan_recipe):
         mock_get_conan_recipe.return_value = (
             self.zlib_conanfile_contents,
diff --git a/minecode/tests/test_generic.py b/minecode/tests/collectors/test_generic.py
similarity index 98%
rename from minecode/tests/test_generic.py
rename to minecode/tests/collectors/test_generic.py
index cf573d5f..d3fb4492 100644
--- a/minecode/tests/test_generic.py
+++ b/minecode/tests/collectors/test_generic.py
@@ -13,7 +13,7 @@
 from minecode.route import NoRouteAvailable
 from minecode.utils_test import JsonBasedTesting
-from minecode.miners import generic
+from minecode.collectors import generic
 
 from packagedb.models import Package
diff --git a/minecode/tests/test_gnu.py b/minecode/tests/collectors/test_gnu.py
similarity index 93%
rename from minecode/tests/test_gnu.py
rename to minecode/tests/collectors/test_gnu.py
index 39fc3220..c0d89235 100644
--- a/minecode/tests/test_gnu.py
+++ b/minecode/tests/collectors/test_gnu.py
@@ -14,12 +14,12 @@
 from mock import patch
 
 from minecode.utils_test import JsonBasedTesting
-from minecode.miners import gnu
+from minecode.collectors import gnu
 
 from packagedb.models import Package
 
 
 class GnuPriorityQueueTests(JsonBasedTesting, TestCase):
-    test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles")
+    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "testfiles")
 
     def setUp(self):
         super(GnuPriorityQueueTests, self).setUp()
diff --git a/minecode/tests/collectors/test_maven.py b/minecode/tests/collectors/test_maven.py
new file mode 100644
index 00000000..23db11c7
--- /dev/null
+++ b/minecode/tests/collectors/test_maven.py
@@ -0,0 +1,516 @@
+from django.test import TestCase as DjangoTestCase
+from minecode.utils_test import JsonBasedTesting
+from unittest import mock
+from packagedcode.maven import _parse
+from packageurl import PackageURL
+import os
+from minecode.collectors import maven
+from minecode.tests import FIXTURES_REGEN
+import packagedb
+from mock import patch
+
+
+class MavenPriorityQueueTests(JsonBasedTesting, DjangoTestCase):
+    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+
+    def setUp(self):
+        super(MavenPriorityQueueTests, self).setUp()
+
+        self.expected_pom_loc = self.get_test_loc('maven/pom/classworlds-1.1.pom')
+        with open(self.expected_pom_loc) as f:
+            self.expected_pom_contents = f.read()
+
+        self.scan_package = _parse(
+            'maven_pom',
+            'maven',
+            'Java',
+            text=self.expected_pom_contents,
+        )
+
+    def test_get_pom_text(self, regen=FIXTURES_REGEN):
+        pom_contents = maven.get_pom_text(
+            namespace=self.scan_package.namespace,
+            name=self.scan_package.name,
+            version=self.scan_package.version
+        )
+        if regen:
+            with open(self.expected_pom_loc, 'w') as f:
+                f.write(pom_contents)
+        self.assertEqual(self.expected_pom_contents, pom_contents)
+
+        pom_contents = maven.get_pom_text(
+            namespace='',
+            name='does-not-exist',
+            version='1.0',
+        )
+        self.assertFalse(pom_contents)
+
+    def test_get_package_sha1(self):
+        sha1 = maven.get_package_sha1(self.scan_package)
+        expected_sha1 = '60c708f55deeb7c5dfce8a7886ef09cbc1388eca'
+        self.assertEqual(expected_sha1, sha1)
+
+    def test_map_maven_package(self):
+        package_count = packagedb.models.Package.objects.all().count()
+        self.assertEqual(0, package_count)
+        package_url = PackageURL.from_string(self.scan_package.purl)
+        maven.map_maven_package(package_url, packagedb.models.PackageContentType.BINARY, ('test_pipeline',))
+        package_count = packagedb.models.Package.objects.all().count()
+        self.assertEqual(1, package_count)
+        package = packagedb.models.Package.objects.all().first()
+        expected_purl_str = 'pkg:maven/classworlds/classworlds@1.1'
+        self.assertEqual(expected_purl_str, package.purl)
+
+    def test_map_maven_package_custom_repo_url(self):
+        package_count = packagedb.models.Package.objects.all().count()
+        self.assertEqual(0, package_count)
+        custom_repo_purl = "pkg:maven/org.eclipse.core/runtime@20070801?repository_url=https://packages.atlassian.com/mvn/maven-atlassian-external/"
+        package_url = PackageURL.from_string(custom_repo_purl)
+        maven.map_maven_package(package_url, packagedb.models.PackageContentType.BINARY, ('test_pipeline',))
+        package_count = packagedb.models.Package.objects.all().count()
+        self.assertEqual(1, package_count)
+        package = packagedb.models.Package.objects.all().first()
+        expected_repo_url = 'https://packages.atlassian.com/mvn/maven-atlassian-external//org/eclipse/core/runtime/20070801/runtime-20070801.jar'
+        self.assertEqual(expected_repo_url, package.download_url)
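The custom-repo test above relies on the PURL `repository_url` qualifier; as a quick, standalone illustration of how packageurl-python carries such a qualifier (the double slash in the expected download URL comes from joining the qualifier value, which already ends in /, onto the repo path):

    from packageurl import PackageURL

    purl = PackageURL.from_string(
        'pkg:maven/org.eclipse.core/runtime@20070801'
        '?repository_url=https://packages.atlassian.com/mvn/maven-atlassian-external/'
    )
    assert purl.namespace == 'org.eclipse.core'
    assert purl.name == 'runtime'
    # Qualifiers parse into a plain dict; collectors use this value in
    # place of the default https://repo1.maven.org/maven2 base.
    assert purl.qualifiers['repository_url'] == (
        'https://packages.atlassian.com/mvn/maven-atlassian-external/'
    )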
+
+    def test_process_request(self):
+        purl_str = 'pkg:maven/org.apache.twill/twill-core@0.12.0'
+        download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar'
+        purl_sources_str = f'{purl_str}?classifier=sources'
+        sources_download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar'
+        package_count = packagedb.models.Package.objects.all().count()
+        self.assertEqual(0, package_count)
+        maven.process_request(purl_str)
+        package_count = packagedb.models.Package.objects.all().count()
+        self.assertEqual(2, package_count)
+        purls = [
+            (package.purl, package.download_url)
+            for package in packagedb.models.Package.objects.all()
+        ]
+        self.assertIn(
+            (purl_str, download_url), purls
+        )
+        self.assertIn(
+            (purl_sources_str, sources_download_url), purls
+        )
+
+    def test_fetch_parent(self, regen=FIXTURES_REGEN):
+        pom_loc = self.get_test_loc('maven/pom/ant-antlr-1.10.1.pom')
+        with open(pom_loc) as f:
+            pom_text = f.read()
+        parent_pom_text = maven.fetch_parent(pom_text)
+        expected_loc = self.get_test_loc('maven/pom/ant-parent-1.10.1.pom')
+
+        if regen:
+            with open(expected_loc, 'w') as f:
+                f.write(parent_pom_text)
+
+        with open(expected_loc) as f:
+            expected_pom_text = f.read()
+        self.assertEqual(expected_pom_text, parent_pom_text)
+
+    def test_get_ancestry(self):
+        pom_loc = self.get_test_loc('maven/pom/pulsar-client-1x-2.5.1.pom')
+        with open(pom_loc) as f:
+            pom_text = f.read()
+        ancestor_pom_texts = list(maven.get_ancestry(pom_text))
+        expected_ancestor_pom_texts = []
+        for expected_loc in [
+            self.get_test_loc('maven/pom/apache-18.pom'),
+            self.get_test_loc('maven/pom/pulsar-2.5.1.pom'),
+            self.get_test_loc('maven/pom/pulsar-client-1x-base-2.5.1.pom')
+        ]:
+            with open(expected_loc) as f:
+                expected_pom_text = f.read()
+            expected_ancestor_pom_texts.append(expected_pom_text)
+        self.assertEqual(expected_ancestor_pom_texts, ancestor_pom_texts)
+
+    def test_merge_parent(self, regen=FIXTURES_REGEN):
+        pom_loc = self.get_test_loc('maven/pom/ant-antlr-1.10.1.pom')
+        with open(pom_loc) as f:
+            pom_text = f.read()
+        package = _parse(
+            'maven_pom',
+            'maven',
+            'Java',
+            text=pom_text
+        )
+        expected_before_loc = self.get_test_loc('maven/pom/ant-antlr-1.10.1-package_before.json')
+        self.check_expected_results(package.to_dict(), expected_before_loc, regen=regen)
+
+        parent_pom_loc = self.get_test_loc('maven/pom/ant-parent-1.10.1.pom')
+        with open(parent_pom_loc) as f:
+            parent_pom_text = f.read()
+        parent_package = _parse(
+            'maven_pom',
+            'maven',
+            'Java',
+            text=parent_pom_text
+        )
+        package = maven.merge_parent(package, parent_package)
+        expected_after_loc = self.get_test_loc('maven/pom/ant-antlr-1.10.1-package_after.json')
+        self.check_expected_results(package.to_dict(), expected_after_loc, regen=regen)
+
+    def test_merge_ancestors(self, regen=FIXTURES_REGEN):
+        pom_loc = self.get_test_loc('maven/pom/pulsar-client-1x-2.5.1.pom')
+        with open(pom_loc) as f:
+            pom_text = f.read()
+        package = _parse(
+            'maven_pom',
+            'maven',
+            'Java',
+            text=pom_text
+        )
+        expected_before_loc = self.get_test_loc('maven/pom/pulsar-client-1x-2.5.1-package_before.json')
+        self.check_expected_results(package.to_dict(), expected_before_loc, regen=regen)
+
+        ancestor_pom_texts = []
+        for loc in [
+            self.get_test_loc('maven/pom/apache-18.pom'),
+            self.get_test_loc('maven/pom/pulsar-2.5.1.pom'),
+            self.get_test_loc('maven/pom/pulsar-client-1x-base-2.5.1.pom')
+        ]:
+            with open(loc) as f:
+                pom_text = f.read()
+            ancestor_pom_texts.append(pom_text)
+
+        maven.merge_ancestors(ancestor_pom_texts, package)
+        expected_after_loc = self.get_test_loc('maven/pom/pulsar-client-1x-2.5.1-package_after.json')
+        self.check_expected_results(package.to_dict(), expected_after_loc, regen=regen)
+
@mock.patch("minecode.collectors.maven.get_pom_text") + def test_get_merged_ancestor_package_from_maven_package(self, get_pom_text_mock, regen=FIXTURES_REGEN): + get_pom_text_mock.return_value = "" + ancestor_pom_texts = [] + with patch("minecode.collectors.maven.get_ancestry") as mock_get_ancestry: + for loc in [ + self.get_test_loc('maven/pom/apache-18.pom'), + self.get_test_loc('maven/pom/pulsar-2.5.1.pom'), + self.get_test_loc('maven/pom/pulsar-client-1x-base-2.5.1.pom') + ]: + with open(loc) as f: + pom_text = f.read() + ancestor_pom_texts.append(pom_text) + mock_get_ancestry.return_value = ancestor_pom_texts + db_package = packagedb.models.Package.objects.create( + name="pulsar-client", + namespace="org.apache.pulsar", + version="2.5.1", + type="maven", + download_url="https://repo1.maven.org/maven2/org/apache/pulsar/pulsar-client/2.5.1/pulsar-client-2.5.1.jar", + ) + merged_package = maven.get_merged_ancestor_package_from_maven_package(package=db_package) + expected_loc = self.get_test_loc('maven/pom/pulsar-client-merged-ancestor-package.json') + self.check_expected_results(merged_package.to_dict(), expected_loc, regen=regen) + + +class MavenCrawlerFunctionsTest(JsonBasedTesting, DjangoTestCase): + test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + + def test_check_if_file_name_is_linked_on_page(self): + links = ['foo/', 'bar/', 'baz/'] + self.assertTrue( + maven.check_if_file_name_is_linked_on_page('foo/', links) + ) + self.assertFalse( + maven.check_if_file_name_is_linked_on_page('qux/', links) + ) + + def test_check_if_page_has_pom_files(self): + links1 = ['foo/', 'bar.jar', 'bar.pom'] + links2 = ['foo/', 'bar.jar'] + self.assertTrue(maven.check_if_page_has_pom_files(links1)) + self.assertFalse(maven.check_if_page_has_pom_files(links2)) + + def test_check_if_page_has_directories(self): + links1 = ['foo/', 'bar/', 'baz/'] + links2 = ['../', 'bar.pom', 'bar.jar'] + self.assertTrue(maven.check_if_page_has_directories(links1)) + self.assertFalse(maven.check_if_page_has_directories(links2)) + + def test_check_if_package_version_page(self): + links1 = ['../', 'bar.pom', 'bar.jar'] + links2 = ['../', 'foo/', 'bar/', 'baz/'] + self.assertTrue(maven.check_if_package_version_page(links1)) + self.assertFalse(maven.check_if_package_version_page(links2)) + + def test_check_if_package_page(self): + links1 = ['../', 'maven-metadata.xml'] + links2 = ['../', 'bar.pom', 'bar.jar'] + self.assertTrue(maven.check_if_package_page(links1)) + self.assertFalse(maven.check_if_package_page(links2)) + + def test_check_if_maven_root(self): + links1 = ['../', 'archetype-catalog.xml'] + links2 = ['../', 'bar.pom', 'bar.jar'] + self.assertTrue(maven.check_if_maven_root(links1)) + self.assertFalse(maven.check_if_maven_root(links2)) + + @mock.patch('requests.get') + def test_check_on_page(self, mock_request_get): + checker = maven.check_if_page_has_pom_files + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = 'parent-7.11.0.pom' + self.assertTrue(maven.check_on_page('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/', checker)) + + @mock.patch('requests.get') + def test_is_maven_root(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = 'archetype-catalog.xml' + self.assertTrue(maven.is_maven_root('https://repo1.maven.org/maven2/')) + + @mock.patch('requests.get') + def test_is_package_page(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = 
+        self.assertTrue(maven.is_package_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/'))
+
+    @mock.patch('requests.get')
+    def test_is_package_version_page(self, mock_request_get):
+        mock_request_get.return_value.ok = True
+        mock_request_get.return_value.text = '''
+            <a href="../" title="../">../</a>
+            <a href="parent-7.11.0.pom" title="parent-7.11.0.pom">parent-7.11.0.pom</a>
+            '''
+        self.assertTrue(maven.is_package_version_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/'))
+
+    def test_url_parts(self):
+        url = 'https://example.com/foo/bar/baz.jar'
+        scheme, netloc, path_segments = maven.url_parts(url)
+        self.assertEqual('https', scheme)
+        self.assertEqual('example.com', netloc)
+        self.assertEqual(['foo', 'bar', 'baz.jar'], path_segments)
+
+    def test_create_url(self):
+        scheme = 'https'
+        netloc = 'example.com'
+        path_segments = ['foo', 'bar', 'baz.jar']
+        url = 'https://example.com/foo/bar/baz.jar'
+        self.assertEqual(
+            url,
+            maven.create_url(scheme, netloc, path_segments)
+        )
+
+    @mock.patch('requests.get')
+    def test_get_maven_root(self, mock_request_get):
+        mock_request_get.return_value.ok = True
+        mock_request_get.return_value.text = '<a href="archetype-catalog.xml" title="archetype-catalog.xml">archetype-catalog.xml</a>'
+        self.assertEqual(
+            'https://repo1.maven.org/maven2',
+            maven.get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/')
+        )
+
+    @mock.patch('requests.get')
+    def test_determine_namespace_name_version_from_url(self, mock_request_get):
+        url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2'
+        root_url = 'https://repo1.maven.org/maven2'
+
+        package_page_text = '''
+            <a href="1.0.b2/" title="1.0.b2/">1.0.b2/</a>                 2005-09-20 05:53    -
+            <a href="maven-metadata.xml" title="maven-metadata.xml">maven-metadata.xml</a>  2012-06-26 17:01   567
+            '''
+        package_page = mock.Mock(ok=True, text=package_page_text)
+
+        package_version_page_text = '''
+            <a href="../" title="../">../</a>                             -
+            <a href="xml-apis-1.0.b2.pom" title="xml-apis-1.0.b2.pom">xml-apis-1.0.b2.pom</a>  2005-09-20 05:53   2249
+            '''
+        package_version_page = mock.Mock(ok=True, text=package_version_page_text)
+        mock_request_get.side_effect = [
+            mock.Mock(ok=True, text=''),
+            mock.Mock(ok=True, text=''),
+            package_page,
+            mock.Mock(ok=True, text=''),
+            package_version_page
+        ]
+
+        namespace, package_name, package_version = maven.determine_namespace_name_version_from_url(url, root_url)
+        self.assertEqual('xml-apis', namespace)
+        self.assertEqual('xml-apis', package_name)
+        self.assertEqual('1.0.b2', package_version)
+
+    @mock.patch('requests.get')
+    def test_add_to_import_queue(self, mock_request_get):
+        from minecode.models import ImportableURI
+
+        url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/'
+        root_url = 'https://repo1.maven.org/maven2'
+
+        package_page_text = '''
+            <a href="1.0.b2/" title="1.0.b2/">1.0.b2/</a>                 2005-09-20 05:53    -
+            <a href="maven-metadata.xml" title="maven-metadata.xml">maven-metadata.xml</a>  2012-06-26 17:01   567
+            '''
+        package_page = mock.Mock(ok=True, text=package_page_text)
+
+        package_version_page_text = '''
+            <a href="../" title="../">../</a>                             -
+            <a href="xml-apis-1.0.b2.pom" title="xml-apis-1.0.b2.pom">xml-apis-1.0.b2.pom</a>  2005-09-20 05:53   2249
+            '''
+        package_version_page = mock.Mock(ok=True, text=package_version_page_text)
+        mock_request_get.side_effect = [
+            package_page,
+            mock.Mock(ok=True, text=''),
+            mock.Mock(ok=True, text=''),
+            package_page,
+            mock.Mock(ok=True, text=''),
+            package_version_page
+        ]
+
+        self.assertEqual(0, ImportableURI.objects.all().count())
+        maven.add_to_import_queue(url, root_url)
+        self.assertEqual(1, ImportableURI.objects.all().count())
+        importable_uri = ImportableURI.objects.get(uri=url)
+        self.assertEqual('pkg:maven/xml-apis/xml-apis', importable_uri.package_url)
+
+    def test_filter_only_directories(self):
+        timestamps_by_links = {
+            '../': '-',
+            'foo/': '-',
+            'foo.pom': '2023-09-28',
+        }
+        expected = {
+            'foo/': '-',
+        }
+        self.assertEqual(
+            expected,
+            maven.filter_only_directories(timestamps_by_links)
+        )
+
+    def test_filter_for_artifacts(self):
+        timestamps_by_links = {
+            '../': '2023-09-28',
+            'foo.pom': '2023-09-28',
+            'foo.ejb3': '2023-09-28',
+            'foo.ear': '2023-09-28',
+            'foo.aar': '2023-09-28',
+            'foo.apk': '2023-09-28',
+            'foo.gem': '2023-09-28',
+            'foo.jar': '2023-09-28',
+            'foo.nar': '2023-09-28',
+            'foo.so': '2023-09-28',
+            'foo.swc': '2023-09-28',
+            'foo.tar': '2023-09-28',
+            'foo.tar.gz': '2023-09-28',
+            'foo.war': '2023-09-28',
+            'foo.xar': '2023-09-28',
+            'foo.zip': '2023-09-28',
+        }
+        expected = {
+            'foo.ejb3': '2023-09-28',
+            'foo.ear': '2023-09-28',
+            'foo.aar': '2023-09-28',
+            'foo.apk': '2023-09-28',
+            'foo.gem': '2023-09-28',
+            'foo.jar': '2023-09-28',
+            'foo.nar': '2023-09-28',
+            'foo.so': '2023-09-28',
+            'foo.swc': '2023-09-28',
+            'foo.tar': '2023-09-28',
+            'foo.tar.gz': '2023-09-28',
+            'foo.war': '2023-09-28',
+            'foo.xar': '2023-09-28',
+            'foo.zip': '2023-09-28',
+        }
+        self.assertEqual(expected, maven.filter_for_artifacts(timestamps_by_links))
+
+    def test_collect_links_from_text(self):
+        filter = maven.filter_only_directories
+        text = '''
+            <a href="../" title="../">../</a>
+            <a href="1.0.b2/" title="1.0.b2/">1.0.b2/</a>                 2005-09-20 05:53    -
+            <a href="1.2.01/" title="1.2.01/">1.2.01/</a>                 2010-02-03 21:05    -
+            '''
+        expected = {
+            '1.0.b2/': '2005-09-20 05:53',
+            '1.2.01/': '2010-02-03 21:05'
+        }
+        self.assertEqual(
+            expected,
+            maven.collect_links_from_text(text, filter=filter)
+        )
+
+    def test_create_absolute_urls_for_links(self):
+        filter = maven.filter_only_directories
+        text = '''
+            <a href="../" title="../">../</a>
+            <a href="1.0.b2/" title="1.0.b2/">1.0.b2/</a>                 2005-09-20 05:53    -
+            <a href="1.2.01/" title="1.2.01/">1.2.01/</a>                 2010-02-03 21:05    -
+            '''
+        url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/'
+        expected = {
+            'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53',
+            'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05'
+        }
+        self.assertEqual(
+            expected,
+            maven.create_absolute_urls_for_links(text, url, filter=filter)
+        )
+
+    @mock.patch('requests.get')
+    def test_get_directory_links(self, mock_request_get):
+        mock_request_get.return_value.ok = True
+        mock_request_get.return_value.text = '''
+            <a href="../" title="../">../</a>
+            <a href="1.0.b2/" title="1.0.b2/">1.0.b2/</a>                 2005-09-20 05:53    -
+            <a href="1.2.01/" title="1.2.01/">1.2.01/</a>                 2010-02-03 21:05    -
+            '''
+        url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/'
+        expected = {
+            'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53',
+            'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05'
+        }
+        self.assertEqual(expected, maven.get_directory_links(url))
+
+    @mock.patch('requests.get')
+    def test_get_artifact_links(self, mock_request_get):
+        mock_request_get.return_value.ok = True
+        mock_request_get.return_value.text = '''
+            <a href="../" title="../">../</a>
+            <a href="xml-apis-1.0.b2.jar" title="xml-apis-1.0.b2.jar">xml-apis-1.0.b2.jar</a>  2005-09-20 05:53  109318
+            <a href="xml-apis-1.0.b2.pom" title="xml-apis-1.0.b2.pom">xml-apis-1.0.b2.pom</a>  2005-09-20 05:53    2249
+            '''
+        url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/'
+        expected = {
+            'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar': '2005-09-20 05:53',
+        }
+        self.assertEqual(expected, maven.get_artifact_links(url))
+
+    def test_crawl_to_package(self):
+        pass
+
+    def test_crawl_maven_repo_from_root(self):
+        pass
+
+    @mock.patch('requests.get')
+    def test_get_artifact_sha1(self, mock_request_get):
+        sha1 = '3136ca936f64c9d68529f048c2618bd356bf85c9'
+        mock_request_get.return_value.ok = True
+        mock_request_get.return_value.text = sha1
+        self.assertEqual(sha1, maven.get_artifact_sha1('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar.sha1'))
+
+    def test_get_classifier_from_artifact_url(self):
+        artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar'
+        package_version_page_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/'
+        package_name = 'livereload-jvm'
+        package_version = '0.2.0'
+        classifier = maven.get_classifier_from_artifact_url(
+            artifact_url,
+            package_version_page_url,
+            package_name,
+            package_version
+        )
+        self.assertEqual('onejar', classifier)
diff --git a/minecode/tests/collectors/test_npm.py b/minecode/tests/collectors/test_npm.py
new file mode 100644
index 00000000..cc975452
--- /dev/null
+++ b/minecode/tests/collectors/test_npm.py
@@ -0,0 +1,57 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import json
+import os
+
+from django.test import TestCase as DjangoTestCase
+from packagedcode.npm import NpmPackageJsonHandler
+from packageurl import PackageURL
+import packagedb
+from minecode.utils_test import JsonBasedTesting
+from minecode.tests import FIXTURES_REGEN
+from minecode.collectors import npm
+
+
+class NpmPriorityQueueTests(JsonBasedTesting, DjangoTestCase):
+    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+
+    def setUp(self):
+        super(NpmPriorityQueueTests, self).setUp()
+        self.expected_json_loc = self.get_test_loc('npm/lodash_package-expected.json')
+        with open(self.expected_json_loc) as f:
+            self.expected_json_contents = json.load(f)
+
+        self.scan_package = NpmPackageJsonHandler._parse(
+            json_data=self.expected_json_contents,
+        )
+
+    def test_get_package_json(self, regen=FIXTURES_REGEN):
+        json_contents = npm.get_package_json(
+            namespace=self.scan_package.namespace,
+            name=self.scan_package.name,
+            version=self.scan_package.version
+        )
+        if regen:
+            with open(self.expected_json_loc, 'w') as f:
+                json.dump(json_contents, f, indent=3, separators=(',', ':'))
+        self.assertEqual(self.expected_json_contents, json_contents)
+
+    def test_map_npm_package(self):
+        package_count = packagedb.models.Package.objects.all().count()
+        self.assertEqual(0, package_count)
+        package_url = PackageURL.from_string(self.scan_package.purl)
+        npm.map_npm_package(package_url, ('test_pipeline',))
+        package_count = packagedb.models.Package.objects.all().count()
+        self.assertEqual(1, package_count)
+        package = packagedb.models.Package.objects.all().first()
+        expected_purl_str = 'pkg:npm/lodash@4.17.21'
+        expected_download_url = 'https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz'
+        self.assertEqual(expected_purl_str, package.purl)
+        self.assertEqual(expected_download_url, package.download_url)
diff --git a/minecode/tests/test_maven.py b/minecode/tests/test_maven.py
index ce9b03d2..8b5a467e 100644
--- a/minecode/tests/test_maven.py
+++ b/minecode/tests/test_maven.py
@@ -13,7 +13,6 @@
 import re
 
 from mock import patch
-from unittest import mock
 
 from django.test import TestCase as DjangoTestCase
 
@@ -24,12 +23,9 @@
 from minecode.utils_test import mocked_requests_get
 from minecode.utils_test import JsonBasedTesting
 from minecode.utils_test import model_to_dict
-from minecode.miners import maven as maven_visitor
 from minecode.tests import FIXTURES_REGEN
 
 import packagedb
-from packagedcode.maven import _parse
-from packageurl import PackageURL
 
 
 # TODO: add tests from
/maven-indexer/indexer-core/src/test/java/org/acche/maven/index/artifact @@ -50,35 +46,27 @@ class MavenMiscTest(JsonBasedTesting, DjangoTestCase): test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') def test_get_entries(self): - index = self.get_test_loc( - 'maven/index/nexus-maven-repository-index.gz') - fields = list(maven_visitor.ENTRY_FIELDS.keys( - )) + list(maven_visitor.ENTRY_FIELDS_OTHER.keys()) + list(maven_visitor.ENTRY_FIELDS_IGNORED.keys()) + index = self.get_test_loc('maven/index/nexus-maven-repository-index.gz') + fields = list(maven.ENTRY_FIELDS.keys()) + list(maven.ENTRY_FIELDS_OTHER.keys()) + list(maven.ENTRY_FIELDS_IGNORED.keys()) fields = set(fields) - result = list(maven_visitor.get_entries(index, fields=fields)) + result = list(maven.get_entries(index, fields=fields)) expected_loc = self.get_test_loc('maven/index/expected_entries.json') self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_get_entries_increment(self): - index = self.get_test_loc( - 'maven/index/increment/nexus-maven-repository-index.445.gz') - fields = list(maven_visitor.ENTRY_FIELDS.keys( - )) + list(maven_visitor.ENTRY_FIELDS_OTHER.keys()) + list(maven_visitor.ENTRY_FIELDS_IGNORED.keys()) + index = self.get_test_loc('maven/index/increment/nexus-maven-repository-index.445.gz') + fields = list(maven.ENTRY_FIELDS.keys()) + list(maven.ENTRY_FIELDS_OTHER.keys()) + list(maven.ENTRY_FIELDS_IGNORED.keys()) fields = set(fields) - result = list(maven_visitor.get_entries(index, fields=fields)) - expected_loc = self.get_test_loc( - 'maven/index/increment/expected_entries.json') + result = list(maven.get_entries(index, fields=fields)) + expected_loc = self.get_test_loc('maven/index/increment/expected_entries.json') self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_get_entries_buggy(self): - index = self.get_test_loc( - 'maven/index/buggy/nexus-maven-repository-index.gz') - fields = list(maven_visitor.ENTRY_FIELDS.keys( - )) + list(maven_visitor.ENTRY_FIELDS_OTHER.keys()) + list(maven_visitor.ENTRY_FIELDS_IGNORED.keys()) + index = self.get_test_loc('maven/index/buggy/nexus-maven-repository-index.gz') + fields = list(maven.ENTRY_FIELDS.keys()) + list(maven.ENTRY_FIELDS_OTHER.keys()) + list(maven.ENTRY_FIELDS_IGNORED.keys()) fields = set(fields) - result = list(maven_visitor.get_entries(index, fields=fields)) - expected_loc = self.get_test_loc( - 'maven/index/buggy/expected_entries.json') + result = list(maven.get_entries(index, fields=fields)) + expected_loc = self.get_test_loc('maven/index/buggy/expected_entries.json') self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_get_artifacts_full(self): @@ -86,47 +74,36 @@ def test_get_artifacts_full(self): 'maven/index/nexus-maven-repository-index.gz') fields = ( - list(maven_visitor.ENTRY_FIELDS) + - list(maven_visitor.ENTRY_FIELDS_OTHER) + - list(maven_visitor.ENTRY_FIELDS_IGNORED) + list(maven.ENTRY_FIELDS) + + list(maven.ENTRY_FIELDS_OTHER) + + list(maven.ENTRY_FIELDS_IGNORED) ) fields = set(fields) - result = [a.to_dict() for a in maven_visitor.get_artifacts( - index, fields, include_all=True)] + result = [a.to_dict() for a in maven.get_artifacts(index, fields, include_all=True)] expected_loc = self.get_test_loc('maven/index/expected_artifacts.json') self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_get_artifacts_increment(self): - index = self.get_test_loc( - 'maven/index/increment/nexus-maven-repository-index.445.gz') - fields = 
list(maven_visitor.ENTRY_FIELDS.keys( - )) + list(maven_visitor.ENTRY_FIELDS_OTHER.keys()) + list(maven_visitor.ENTRY_FIELDS_IGNORED.keys()) + index = self.get_test_loc('maven/index/increment/nexus-maven-repository-index.445.gz') + fields = list(maven.ENTRY_FIELDS.keys()) + list(maven.ENTRY_FIELDS_OTHER.keys()) + list(maven.ENTRY_FIELDS_IGNORED.keys()) fields = set(fields) - result = [a.to_dict() for a in maven_visitor.get_artifacts( - index, fields, include_all=True)] - expected_loc = self.get_test_loc( - 'maven/index/increment/expected_artifacts.json') + result = [a.to_dict() for a in maven.get_artifacts(index, fields, include_all=True)] + expected_loc = self.get_test_loc('maven/index/increment/expected_artifacts.json') self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_get_artifacts_buggy(self): - index = self.get_test_loc( - 'maven/index/buggy/nexus-maven-repository-index.gz') - fields = list(maven_visitor.ENTRY_FIELDS.keys( - )) + list(maven_visitor.ENTRY_FIELDS_OTHER.keys()) + list(maven_visitor.ENTRY_FIELDS_IGNORED.keys()) + index = self.get_test_loc('maven/index/buggy/nexus-maven-repository-index.gz') + fields = list(maven.ENTRY_FIELDS.keys()) + list(maven.ENTRY_FIELDS_OTHER.keys()) + list(maven.ENTRY_FIELDS_IGNORED.keys()) fields = set(fields) - result = [a.to_dict() for a in maven_visitor.get_artifacts( - index, fields, include_all=True)] - expected_loc = self.get_test_loc( - 'maven/index/buggy/expected_artifacts.json') + result = [a.to_dict() for a in maven.get_artifacts(index, fields, include_all=True)] + expected_loc = self.get_test_loc('maven/index/buggy/expected_artifacts.json') self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_get_artifacts_defaults(self): - index = self.get_test_loc( - 'maven/index/nexus-maven-repository-index.gz') - result = [a.to_dict() for a in maven_visitor.get_artifacts(index)] - expected_loc = self.get_test_loc( - 'maven/index/expected_artifacts-defaults.json') + index = self.get_test_loc('maven/index/nexus-maven-repository-index.gz') + result = [a.to_dict() for a in maven.get_artifacts(index)] + expected_loc = self.get_test_loc('maven/index/expected_artifacts-defaults.json') self.check_expected_results(result, expected_loc) def test_get_artifacts_no_worthyness(self): @@ -136,26 +113,20 @@ def test_get_artifacts_no_worthyness(self): def worth(a): return True - result = [a.to_dict() - for a in maven_visitor.get_artifacts(index, worthyness=worth)] - expected_loc = self.get_test_loc( - 'maven/index/expected_artifacts-all-worthy.json') + result = [a.to_dict() for a in maven.get_artifacts(index, worthyness=worth)] + expected_loc = self.get_test_loc('maven/index/expected_artifacts-all-worthy.json') self.check_expected_results(result, expected_loc) def test_get_artifacts_defaults_increment(self): - index = self.get_test_loc( - 'maven/index/increment/nexus-maven-repository-index.445.gz') - result = [a.to_dict() for a in maven_visitor.get_artifacts(index)] - expected_loc = self.get_test_loc( - 'maven/index/increment/expected_artifacts-defaults.json') + index = self.get_test_loc('maven/index/increment/nexus-maven-repository-index.445.gz') + result = [a.to_dict() for a in maven.get_artifacts(index)] + expected_loc = self.get_test_loc('maven/index/increment/expected_artifacts-defaults.json') self.check_expected_results(result, expected_loc) def test_get_artifacts_defaults_buggy(self): - index = self.get_test_loc( - 'maven/index/buggy/nexus-maven-repository-index.gz') - result = [a.to_dict() for a in 
maven_visitor.get_artifacts(index)] - expected_loc = self.get_test_loc( - 'maven/index/buggy/expected_artifacts-defaults.json') + index = self.get_test_loc('maven/index/buggy/nexus-maven-repository-index.gz') + result = [a.to_dict() for a in maven.get_artifacts(index)] + expected_loc = self.get_test_loc('maven/index/buggy/expected_artifacts-defaults.json') self.check_expected_results(result, expected_loc) def test_build_artifact(self): @@ -164,7 +135,7 @@ def test_build_artifact(self): u'm': u'1318447185654', u'u': u'org.apache|maven|archetypes|1|0-alpha-1-20050407.154541-1.pom'} - result = maven_visitor.build_artifact(entry, include_all=True) + result = maven.build_artifact(entry, include_all=True) result = result.to_dict() expected = dict([ (u'group_id', u'org.apache'), @@ -192,29 +163,25 @@ def test_build_url_and_filename_1(self): test = {'group_id': 'de.alpharogroup', 'artifact_id': 'address-book-domain', 'version': '3.12.0', 'classifier': None, 'extension': 'jar'} expected = 'https://repo1.maven.org/maven2/de/alpharogroup/address-book-domain/3.12.0/address-book-domain-3.12.0.jar', 'address-book-domain-3.12.0.jar' - self.assertEqual( - expected, maven_visitor.build_url_and_filename(**test)) + self.assertEqual(expected, maven.build_url_and_filename(**test)) def test_build_url_and_filename_2(self): test = {'group_id': 'de.alpharogroup', 'artifact_id': 'address-book-data', 'version': '3.12.0', 'classifier': None, 'extension': 'pom'} expected = 'https://repo1.maven.org/maven2/de/alpharogroup/address-book-data/3.12.0/address-book-data-3.12.0.pom', 'address-book-data-3.12.0.pom' - self.assertEqual( - expected, maven_visitor.build_url_and_filename(**test)) + self.assertEqual(expected, maven.build_url_and_filename(**test)) def test_build_url_and_filename_3(self): test = {'group_id': 'de.alpharogroup', 'artifact_id': 'address-book-rest-web', 'version': '3.12.0', 'classifier': None, 'extension': 'war'} expected = 'https://repo1.maven.org/maven2/de/alpharogroup/address-book-rest-web/3.12.0/address-book-rest-web-3.12.0.war', 'address-book-rest-web-3.12.0.war' - self.assertEqual( - expected, maven_visitor.build_url_and_filename(**test)) + self.assertEqual(expected, maven.build_url_and_filename(**test)) def test_build_url_and_filename_4(self): test = {'group_id': 'uk.com.robust-it', 'artifact_id': 'cloning', 'version': '1.9.5', 'classifier': 'sources', 'extension': 'jar'} expected = 'https://repo1.maven.org/maven2/uk/com/robust-it/cloning/1.9.5/cloning-1.9.5-sources.jar', 'cloning-1.9.5-sources.jar' - self.assertEqual( - expected, maven_visitor.build_url_and_filename(**test)) + self.assertEqual(expected, maven.build_url_and_filename(**test)) def test_build_url_and_filename_with_alternate_base(self): test = { @@ -222,14 +189,13 @@ def test_build_url_and_filename_with_alternate_base(self): 'version': '1.9.5', 'classifier': 'sources', 'extension': 'jar', 'base_repo_url': 'maven-index://'} expected = 'maven-index:///uk/com/robust-it/cloning/1.9.5/cloning-1.9.5-sources.jar', 'cloning-1.9.5-sources.jar' - self.assertEqual( - expected, maven_visitor.build_url_and_filename(**test)) + self.assertEqual(expected, maven.build_url_and_filename(**test)) def test_build_maven_xml_url(self): test = {'group_id': 'de.alpharogroup', 'artifact_id': 'address-book-domain'} expected = 'https://repo1.maven.org/maven2/de/alpharogroup/address-book-domain/maven-metadata.xml' - self.assertEqual(expected, maven_visitor.build_maven_xml_url(**test)) + self.assertEqual(expected, maven.build_maven_xml_url(**test)) class 
MavenVisitorTest(JsonBasedTesting, DjangoTestCase): @@ -241,7 +207,7 @@ def test_MavenNexusIndexVisitor_uris(self): 'maven/index/nexus-maven-repository-index.gz') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = maven_visitor.MavenNexusIndexVisitor(uri) + uris, _data, _errors = maven.MavenNexusIndexVisitor(uri) expected_loc = self.get_test_loc('maven/index/expected_uris.json') self.check_expected_uris( uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN) @@ -252,12 +218,9 @@ def test_MavenNexusIndexPropertiesVisitor(self): 'maven/index/increment/nexus-maven-repository-index.properties') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = maven_visitor.MavenNexusPropertiesVisitor( - uri) - expected_loc = self.get_test_loc( - 'maven/index/increment/expected_properties_uris.json') - self.check_expected_uris( - uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN) + uris, _data, _errors = maven.MavenNexusPropertiesVisitor(uri) + expected_loc = self.get_test_loc('maven/index/increment/expected_properties_uris.json') + self.check_expected_uris(uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN) def test_MavenNexusIndexVisitor_uris_increment(self): uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.445.gz' @@ -265,11 +228,9 @@ def test_MavenNexusIndexVisitor_uris_increment(self): 'maven/index/increment/nexus-maven-repository-index.445.gz') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = maven_visitor.MavenNexusIndexVisitor(uri) - expected_loc = self.get_test_loc( - 'maven/index/increment/expected_uris.json') - self.check_expected_uris( - uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN) + uris, _data, _errors = maven.MavenNexusIndexVisitor(uri) + expected_loc = self.get_test_loc('maven/index/increment/expected_uris.json') + self.check_expected_uris(uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN) def test_MavenNexusIndexVisitor_uris_buggy(self): uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz' @@ -277,11 +238,9 @@ def test_MavenNexusIndexVisitor_uris_buggy(self): 'maven/index/buggy/nexus-maven-repository-index.gz') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = maven_visitor.MavenNexusIndexVisitor(uri) - expected_loc = self.get_test_loc( - 'maven/index/buggy/expected_uris.json') - self.check_expected_uris( - uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN) + uris, _data, _errors = maven.MavenNexusIndexVisitor(uri) + expected_loc = self.get_test_loc('maven/index/buggy/expected_uris.json') + self.check_expected_uris(uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN) def test_visit_uri_does_not_fail_on_incorrect_sha1(self): uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz' @@ -313,7 +272,7 @@ def test_MavenPOMVisitor_data(self): test_loc = self.get_test_loc('maven/pom/classworlds-1.1-alpha-2.pom') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, data, _ = maven_visitor.MavenPOMVisitor(uri) + uris, data, _ = maven.MavenPOMVisitor(uri) self.assertEqual(None, uris) expected = open(test_loc, 'rb').read() self.assertEqual(expected, data) @@ -530,9 +489,8 @@ 
def test_visit_maven_medatata_xml_file(self): test_loc = self.get_test_loc('maven/maven-metadata/maven-metadata.xml') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = maven_visitor.MavenMetaDataVisitor(uri) - expected_loc = self.get_test_loc( - 'maven/maven-metadata/expected_maven_xml.json') + uris, _, _ = maven.MavenMetaDataVisitor(uri) + expected_loc = self.get_test_loc('maven/maven-metadata/expected_maven_xml.json') self.check_expected_uris(uris, expected_loc) @@ -544,9 +502,8 @@ def test_visit_maven_medatata_html_index_jcenter_1(self): test_loc = self.get_test_loc('maven/html/jcenter.bintray.com.html') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = maven_visitor.MavenMetaDataVisitor(uri) - expected_loc = self.get_test_loc( - 'maven/html/visitor_expected_jcenter.bintray.com2.html.json') + uris, _, _ = maven.MavenMetaDataVisitor(uri) + expected_loc = self.get_test_loc('maven/html/visitor_expected_jcenter.bintray.com2.html.json') self.check_expected_uris(uris, expected_loc) def test_visit_maven_medatata_html_index_jcenter_2(self): @@ -554,9 +511,8 @@ def test_visit_maven_medatata_html_index_jcenter_2(self): test_loc = self.get_test_loc('maven/html/app.html') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = maven_visitor.MavenMetaDataVisitor(uri) - expected_loc = self.get_test_loc( - 'maven/html/visitor_expected_app.html.json') + uris, _, _ = maven.MavenMetaDataVisitor(uri) + expected_loc = self.get_test_loc('maven/html/visitor_expected_app.html.json') self.check_expected_uris(uris, expected_loc) def test_visit_maven_medatata_html_index_jcenter_3(self): @@ -564,9 +520,8 @@ def test_visit_maven_medatata_html_index_jcenter_3(self): test_loc = self.get_test_loc('maven/html/stateframework-compiler.html') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = maven_visitor.MavenMetaDataVisitor(uri) - expected_loc = self.get_test_loc( - 'maven/html/visitor_expected_stateframework-compiler.html.json') + uris, _, _ = maven.MavenMetaDataVisitor(uri) + expected_loc = self.get_test_loc('maven/html/visitor_expected_stateframework-compiler.html.json') self.check_expected_uris(uris, expected_loc) @@ -579,7 +534,7 @@ def test_visit_and_build_package_from_pom_axis(self): test_loc = self.get_test_loc('maven/mapper/axis-1.4.pom') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = maven_visitor.MavenPOMVisitor(uri) + _, data, _ = maven.MavenPOMVisitor(uri) package = maven.get_package(data).to_dict() expected_loc = self.get_test_loc('maven/mapper/axis-1.4.pom.package.json') self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) @@ -589,7 +544,7 @@ def test_visit_and_build_package_from_pom_commons_pool(self): test_loc = self.get_test_loc('maven/mapper/commons-pool-1.5.7.pom') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = maven_visitor.MavenPOMVisitor(uri) + _, data, _ = maven.MavenPOMVisitor(uri) package = maven.get_package(data).to_dict() expected_loc = self.get_test_loc('maven/mapper/commons-pool-1.5.7.pom.package.json') self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) @@ -599,7 +554,7 @@ def 
test_visit_and_build_package_from_pom_struts(self): test_loc = self.get_test_loc('maven/mapper/struts-menu-2.4.2.pom') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = maven_visitor.MavenPOMVisitor(uri) + _, data, _ = maven.MavenPOMVisitor(uri) package = maven.get_package(data).to_dict() expected_loc = self.get_test_loc('maven/mapper/struts-menu-2.4.2.pom.package.json') self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) @@ -610,7 +565,7 @@ def test_visit_and_build_package_from_pom_mysql(self): 'maven/mapper/mysql-connector-java-5.1.27.pom') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = maven_visitor.MavenPOMVisitor(uri) + _, data, _ = maven.MavenPOMVisitor(uri) package = maven.get_package(data).to_dict() expected_loc = self.get_test_loc('maven/mapper/mysql-connector-java-5.1.27.pom.package.json') self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) @@ -620,7 +575,7 @@ def test_visit_and_build_package_from_pom_xbean(self): test_loc = self.get_test_loc('maven/mapper/xbean-jmx-2.0.pom') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = maven_visitor.MavenPOMVisitor(uri) + _, data, _ = maven.MavenPOMVisitor(uri) package = maven.get_package(data).to_dict() expected_loc = self.get_test_loc('maven/mapper/xbean-jmx-2.0.pom.package.json') self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) @@ -630,7 +585,7 @@ def test_visit_and_build_package_from_pom_maven_all(self): test_loc = self.get_test_loc('maven/mapper/maven-all-1.0-RELEASE.pom') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = maven_visitor.MavenPOMVisitor(uri) + _, data, _ = maven.MavenPOMVisitor(uri) package = maven.get_package(data).to_dict() expected_loc = self.get_test_loc('maven/mapper/maven-all-1.0-RELEASE.pom.package.json') self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) @@ -640,7 +595,7 @@ def test_visit_and_build_package_from_pom_with_unicode(self): test_loc = self.get_test_loc('maven/mapper/commons-jaxrs-1.21.pom') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = maven_visitor.MavenPOMVisitor(uri) + _, data, _ = maven.MavenPOMVisitor(uri) package = maven.get_package(data).to_dict() expected_loc = self.get_test_loc('maven/mapper/commons-jaxrs-1.21.pom.package.json') self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) @@ -760,7 +715,7 @@ def test_MavenNexusIndexVisitor_uris_increment_contain_correct_purl(self): 'maven/index/increment2/nexus-maven-repository-index.457.gz') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = maven_visitor.MavenNexusIndexVisitor(uri) + uris, _data, _errors = maven.MavenNexusIndexVisitor(uri) uris = [u for i, u in enumerate(uris) if i % 500 == 0] expected_loc = self.get_test_loc( 'maven/index/increment2/expected_uris.json') @@ -773,549 +728,12 @@ def test_MavenNexusIndexVisitor_then_get_mini_package_from_index_data(self): 'maven/index/increment2/nexus-maven-repository-index.457.gz') with patch('requests.get') as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = 
maven_visitor.MavenNexusIndexVisitor(uri) + uris, _data, _errors = maven.MavenNexusIndexVisitor(uri) results = [] for i, u in enumerate(uris): # only get a few records if i % 500 == 0: minip = maven.get_mini_package(u.data, u.uri, u.package_url) results.append(minip and minip.to_dict() or minip) - expected_loc = self.get_test_loc( - 'maven/index/increment2/expected_mini_package.json') - self.check_expected_results( - results, expected_loc, regen=FIXTURES_REGEN) - - def test_get_package_from_pom_does_create_a_correct_qualifier(self): - 'https://repo1.maven.org/maven2/org/hspconsortium/reference/hspc-reference-auth-server-webapp/1.9.1/hspc-reference-auth-server-webapp-1.9.1.pom' - - -class MavenPriorityQueueTests(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def setUp(self): - super(MavenPriorityQueueTests, self).setUp() - - self.expected_pom_loc = self.get_test_loc( - 'maven/pom/classworlds-1.1.pom') - with open(self.expected_pom_loc) as f: - self.expected_pom_contents = f.read() - - self.scan_package = _parse( - 'maven_pom', - 'maven', - 'Java', - text=self.expected_pom_contents, - ) - - def test_get_pom_text(self, regen=FIXTURES_REGEN): - pom_contents = maven_visitor.get_pom_text( - namespace=self.scan_package.namespace, - name=self.scan_package.name, - version=self.scan_package.version - ) - if regen: - with open(self.expected_pom_loc, 'w') as f: - f.write(pom_contents) - self.assertEqual(self.expected_pom_contents, pom_contents) - - pom_contents = maven_visitor.get_pom_text( - namespace='', - name='does-not-exist', - version='1.0', - ) - self.assertFalse(pom_contents) - - def test_get_package_sha1(self): - sha1 = maven_visitor.get_package_sha1(self.scan_package) - expected_sha1 = '60c708f55deeb7c5dfce8a7886ef09cbc1388eca' - self.assertEqual(expected_sha1, sha1) - - def test_map_maven_package(self): - package_count = packagedb.models.Package.objects.all().count() - self.assertEqual(0, package_count) - package_url = PackageURL.from_string(self.scan_package.purl) - maven_visitor.map_maven_package( - package_url, packagedb.models.PackageContentType.BINARY, ('test_pipeline')) - package_count = packagedb.models.Package.objects.all().count() - self.assertEqual(1, package_count) - package = packagedb.models.Package.objects.all().first() - expected_purl_str = 'pkg:maven/classworlds/classworlds@1.1' - self.assertEqual(expected_purl_str, package.purl) - - def test_map_maven_package_custom_repo_url(self): - package_count = packagedb.models.Package.objects.all().count() - self.assertEqual(0, package_count) - custom_repo_purl = "pkg:maven/org.eclipse.core/runtime@20070801?repository_url=https://packages.atlassian.com/mvn/maven-atlassian-external/" - package_url = PackageURL.from_string(custom_repo_purl) - maven_visitor.map_maven_package( - package_url, packagedb.models.PackageContentType.BINARY, ('test_pipeline')) - package_count = packagedb.models.Package.objects.all().count() - self.assertEqual(1, package_count) - package = packagedb.models.Package.objects.all().first() - expected_repo_url = 'https://packages.atlassian.com/mvn/maven-atlassian-external//org/eclipse/core/runtime/20070801/runtime-20070801.jar' - self.assertEqual(expected_repo_url, package.download_url) - - def test_process_request(self): - purl_str = 'pkg:maven/org.apache.twill/twill-core@0.12.0' - download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar' - purl_sources_str = f'{purl_str}?classifier=sources' - 
sources_download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar' - package_count = packagedb.models.Package.objects.all().count() - self.assertEqual(0, package_count) - maven_visitor.process_request(purl_str) - package_count = packagedb.models.Package.objects.all().count() - self.assertEqual(2, package_count) - purls = [ - (package.purl, package.download_url) - for package in packagedb.models.Package.objects.all() - ] - self.assertIn( - (purl_str, download_url), purls - ) - self.assertIn( - (purl_sources_str, sources_download_url), purls - ) - - def test_fetch_parent(self, regen=FIXTURES_REGEN): - pom_loc = self.get_test_loc('maven/pom/ant-antlr-1.10.1.pom') - with open(pom_loc) as f: - pom_text = f.read() - parent_pom_text = maven_visitor.fetch_parent(pom_text) - expected_loc = self.get_test_loc('maven/pom/ant-parent-1.10.1.pom') - - if regen: - with open(expected_loc, 'w') as f: - f.write(parent_pom_text) - - with open(expected_loc) as f: - expected_pom_text = f.read() - self.assertEqual(expected_pom_text, parent_pom_text) - - def test_get_ancestry(self): - pom_loc = self.get_test_loc('maven/pom/pulsar-client-1x-2.5.1.pom') - with open(pom_loc) as f: - pom_text = f.read() - ancestor_pom_texts = list(maven_visitor.get_ancestry(pom_text)) - expected_ancestor_pom_texts = [] - for expected_loc in [ - self.get_test_loc('maven/pom/apache-18.pom'), - self.get_test_loc('maven/pom/pulsar-2.5.1.pom'), - self.get_test_loc('maven/pom/pulsar-client-1x-base-2.5.1.pom') - ]: - with open(expected_loc) as f: - expected_pom_text = f.read() - expected_ancestor_pom_texts.append(expected_pom_text) - self.assertEqual(expected_ancestor_pom_texts, ancestor_pom_texts) - - def test_merge_parent(self, regen=FIXTURES_REGEN): - pom_loc = self.get_test_loc('maven/pom/ant-antlr-1.10.1.pom') - with open(pom_loc) as f: - pom_text = f.read() - package = _parse( - 'maven_pom', - 'maven', - 'Java', - text=pom_text - ) - expected_before_loc = self.get_test_loc( - 'maven/pom/ant-antlr-1.10.1-package_before.json') - self.check_expected_results( - package.to_dict(), expected_before_loc, regen=regen) - - parent_pom_loc = self.get_test_loc('maven/pom/ant-parent-1.10.1.pom') - with open(parent_pom_loc) as f: - parent_pom_text = f.read() - parent_package = _parse( - 'maven_pom', - 'maven', - 'Java', - text=parent_pom_text - ) - package = maven_visitor.merge_parent(package, parent_package) - expected_after_loc = self.get_test_loc( - 'maven/pom/ant-antlr-1.10.1-package_after.json') - self.check_expected_results( - package.to_dict(), expected_after_loc, regen=regen) - - def test_merge_ancestors(self, regen=FIXTURES_REGEN): - pom_loc = self.get_test_loc('maven/pom/pulsar-client-1x-2.5.1.pom') - with open(pom_loc) as f: - pom_text = f.read() - package = _parse( - 'maven_pom', - 'maven', - 'Java', - text=pom_text - ) - expected_before_loc = self.get_test_loc( - 'maven/pom/pulsar-client-1x-2.5.1-package_before.json') - self.check_expected_results( - package.to_dict(), expected_before_loc, regen=regen) - - ancestor_pom_texts = [] - for loc in [ - self.get_test_loc('maven/pom/apache-18.pom'), - self.get_test_loc('maven/pom/pulsar-2.5.1.pom'), - self.get_test_loc('maven/pom/pulsar-client-1x-base-2.5.1.pom') - ]: - with open(loc) as f: - pom_text = f.read() - ancestor_pom_texts.append(pom_text) - - maven_visitor.merge_ancestors(ancestor_pom_texts, package) - expected_after_loc = self.get_test_loc( - 'maven/pom/pulsar-client-1x-2.5.1-package_after.json') - self.check_expected_results( - 
package.to_dict(), expected_after_loc, regen=regen) - - @mock.patch("minecode.miners.maven.get_pom_text") - def test_get_merged_ancestor_package_from_maven_package(self, get_pom_text_mock, regen=FIXTURES_REGEN): - get_pom_text_mock.return_value = "" - ancestor_pom_texts = [] - with patch("minecode.miners.maven.get_ancestry") as mock_get_ancestry: - for loc in [ - self.get_test_loc('maven/pom/apache-18.pom'), - self.get_test_loc('maven/pom/pulsar-2.5.1.pom'), - self.get_test_loc('maven/pom/pulsar-client-1x-base-2.5.1.pom') - ]: - with open(loc) as f: - pom_text = f.read() - ancestor_pom_texts.append(pom_text) - mock_get_ancestry.return_value = ancestor_pom_texts - db_package = packagedb.models.Package.objects.create( - name="pulsar-client", - namespace="org.apache.pulsar", - version="2.5.1", - type="maven", - download_url="https://repo1.maven.org/maven2/org/apache/pulsar/pulsar-client/2.5.1/pulsar-client-2.5.1.jar", - ) - merged_package = maven_visitor.get_merged_ancestor_package_from_maven_package( - package=db_package) - expected_loc = self.get_test_loc( - 'maven/pom/pulsar-client-merged-ancestor-package.json') - self.check_expected_results( - merged_package.to_dict(), expected_loc, regen=regen) - - -class MavenCrawlerFunctionsTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_check_if_file_name_is_linked_on_page(self): - links = ['foo/', 'bar/', 'baz/'] - self.assertTrue( - maven_visitor.check_if_file_name_is_linked_on_page('foo/', links) - ) - self.assertFalse( - maven_visitor.check_if_file_name_is_linked_on_page('qux/', links) - ) - - def test_check_if_page_has_pom_files(self): - links1 = ['foo/', 'bar.jar', 'bar.pom'] - links2 = ['foo/', 'bar.jar'] - self.assertTrue(maven_visitor.check_if_page_has_pom_files(links1)) - self.assertFalse(maven_visitor.check_if_page_has_pom_files(links2)) - - def test_check_if_page_has_directories(self): - links1 = ['foo/', 'bar/', 'baz/'] - links2 = ['../', 'bar.pom', 'bar.jar'] - self.assertTrue(maven_visitor.check_if_page_has_directories(links1)) - self.assertFalse(maven_visitor.check_if_page_has_directories(links2)) - - def test_check_if_package_version_page(self): - links1 = ['../', 'bar.pom', 'bar.jar'] - links2 = ['../', 'foo/', 'bar/', 'baz/'] - self.assertTrue(maven_visitor.check_if_package_version_page(links1)) - self.assertFalse(maven_visitor.check_if_package_version_page(links2)) - - def test_check_if_package_page(self): - links1 = ['../', 'maven-metadata.xml'] - links2 = ['../', 'bar.pom', 'bar.jar'] - self.assertTrue(maven_visitor.check_if_package_page(links1)) - self.assertFalse(maven_visitor.check_if_package_page(links2)) - - def test_check_if_maven_root(self): - links1 = ['../', 'archetype-catalog.xml'] - links2 = ['../', 'bar.pom', 'bar.jar'] - self.assertTrue(maven_visitor.check_if_maven_root(links1)) - self.assertFalse(maven_visitor.check_if_maven_root(links2)) - - @mock.patch('requests.get') - def test_check_on_page(self, mock_request_get): - checker = maven_visitor.check_if_page_has_pom_files - mock_request_get.return_value.ok = True - mock_request_get.return_value.text = 'parent-7.11.0.pom' - self.assertTrue(maven_visitor.check_on_page( - 'https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/', checker)) - - @mock.patch('requests.get') - def test_is_maven_root(self, mock_request_get): - mock_request_get.return_value.ok = True - mock_request_get.return_value.text = 'archetype-catalog.xml' - self.assertTrue(maven_visitor.is_maven_root( - 
'https://repo1.maven.org/maven2/')) - - @mock.patch('requests.get') - def test_is_package_page(self, mock_request_get): - mock_request_get.return_value.ok = True - mock_request_get.return_value.text = 'maven-metadata.xml' - self.assertTrue(maven_visitor.is_package_page( - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/')) - - @mock.patch('requests.get') - def test_is_package_version_page(self, mock_request_get): - mock_request_get.return_value.ok = True - mock_request_get.return_value.text = ''' - ../ - parent-7.11.0.pom - ''' - self.assertTrue(maven_visitor.is_package_version_page( - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/')) - - def test_url_parts(self): - url = 'https://example.com/foo/bar/baz.jar' - scheme, netloc, path_segments = maven_visitor.url_parts(url) - self.assertEqual('https', scheme) - self.assertEqual('example.com', netloc) - self.assertEqual(['foo', 'bar', 'baz.jar'], path_segments) - - def test_create_url(self): - scheme = 'https' - netloc = 'example.com' - path_segments = ['foo', 'bar', 'baz.jar'] - url = 'https://example.com/foo/bar/baz.jar' - self.assertEqual( - url, - maven_visitor.create_url(scheme, netloc, path_segments) - ) - - @mock.patch('requests.get') - def test_get_maven_root(self, mock_request_get): - mock_request_get.return_value.ok = True - mock_request_get.return_value.text = 'archetype-catalog.xml' - self.assertEqual( - 'https://repo1.maven.org/maven2', - maven_visitor.get_maven_root( - 'https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') - ) - - @mock.patch('requests.get') - def test_determine_namespace_name_version_from_url(self, mock_request_get): - url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2' - root_url = 'https://repo1.maven.org/maven2' - - package_page_text = ''' - 1.0.b2/ - 2005-09-20 05:53 - - maven-metadata.xml - 2012-06-26 17:01 567 - ''' - package_page = mock.Mock(ok=True, text=package_page_text) - - package_version_page_text = ''' - ../ - - xml-apis-1.0.b2.pom - 2005-09-20 05:53 2249 - ''' - package_version_page = mock.Mock( - ok=True, text=package_version_page_text) - mock_request_get.side_effect = [ - mock.Mock(ok=True, text=''), - mock.Mock(ok=True, text=''), - package_page, - mock.Mock(ok=True, text=''), - package_version_page - ] - - namespace, package_name, package_version = maven_visitor.determine_namespace_name_version_from_url( - url, root_url) - self.assertEqual('xml-apis', namespace) - self.assertEqual('xml-apis', package_name) - self.assertEqual('1.0.b2', package_version) - - @mock.patch('requests.get') - def test_add_to_import_queue(self, mock_request_get): - from minecode.models import ImportableURI - - url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/' - root_url = 'https://repo1.maven.org/maven2' - - package_page_text = ''' - 1.0.b2/ - 2005-09-20 05:53 - - maven-metadata.xml - 2012-06-26 17:01 567 - ''' - package_page = mock.Mock(ok=True, text=package_page_text) - - package_version_page_text = ''' - ../ - - xml-apis-1.0.b2.pom - 2005-09-20 05:53 2249 - ''' - package_version_page = mock.Mock( - ok=True, text=package_version_page_text) - mock_request_get.side_effect = [ - package_page, - mock.Mock(ok=True, text=''), - mock.Mock(ok=True, text=''), - package_page, - mock.Mock(ok=True, text=''), - package_version_page - ] - - self.assertEqual(0, ImportableURI.objects.all().count()) - maven_visitor.add_to_import_queue(url, root_url) - self.assertEqual(1, ImportableURI.objects.all().count()) - importable_uri = ImportableURI.objects.get(uri=url) - 
self.assertEqual('pkg:maven/xml-apis/xml-apis', - importable_uri.package_url) - - def test_filter_only_directories(self): - timestamps_by_links = { - '../': '-', - 'foo/': '-', - 'foo.pom': '2023-09-28', - } - expected = { - 'foo/': '-', - } - self.assertEqual( - expected, - maven_visitor.filter_only_directories(timestamps_by_links) - ) - - def test_filter_for_artifacts(self): - timestamps_by_links = { - '../': '2023-09-28', - 'foo.pom': '2023-09-28', - 'foo.ejb3': '2023-09-28', - 'foo.ear': '2023-09-28', - 'foo.aar': '2023-09-28', - 'foo.apk': '2023-09-28', - 'foo.gem': '2023-09-28', - 'foo.jar': '2023-09-28', - 'foo.nar': '2023-09-28', - 'foo.so': '2023-09-28', - 'foo.swc': '2023-09-28', - 'foo.tar': '2023-09-28', - 'foo.tar.gz': '2023-09-28', - 'foo.war': '2023-09-28', - 'foo.xar': '2023-09-28', - 'foo.zip': '2023-09-28', - } - expected = { - 'foo.ejb3': '2023-09-28', - 'foo.ear': '2023-09-28', - 'foo.aar': '2023-09-28', - 'foo.apk': '2023-09-28', - 'foo.gem': '2023-09-28', - 'foo.jar': '2023-09-28', - 'foo.nar': '2023-09-28', - 'foo.so': '2023-09-28', - 'foo.swc': '2023-09-28', - 'foo.tar': '2023-09-28', - 'foo.tar.gz': '2023-09-28', - 'foo.war': '2023-09-28', - 'foo.xar': '2023-09-28', - 'foo.zip': '2023-09-28', - } - self.assertEqual( - expected, maven_visitor.filter_for_artifacts(timestamps_by_links)) - - def test_collect_links_from_text(self): - filter = maven_visitor.filter_only_directories - text = ''' - ../ - 1.0.b2/ - 2005-09-20 05:53 - - 1.2.01/ - 2010-02-03 21:05 - - ''' - expected = { - '1.0.b2/': '2005-09-20 05:53', - '1.2.01/': '2010-02-03 21:05' - } - self.assertEqual( - expected, - maven_visitor.collect_links_from_text(text, filter=filter) - ) - - def test_create_absolute_urls_for_links(self): - filter = maven_visitor.filter_only_directories - text = ''' - ../ - 1.0.b2/ - 2005-09-20 05:53 - - 1.2.01/ - 2010-02-03 21:05 - - ''' - url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/' - expected = { - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53', - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05' - } - self.assertEqual( - expected, - maven_visitor.create_absolute_urls_for_links( - text, url, filter=filter) - ) - - @mock.patch('requests.get') - def test_get_directory_links(self, mock_request_get): - mock_request_get.return_value.ok = True - mock_request_get.return_value.text = ''' - ../ - 1.0.b2/ - 2005-09-20 05:53 - - 1.2.01/ - 2010-02-03 21:05 - - ''' - url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/' - expected = { - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53', - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05' - } - self.assertEqual(expected, maven_visitor.get_directory_links(url)) - - @mock.patch('requests.get') - def test_get_artifact_links(self, mock_request_get): - mock_request_get.return_value.ok = True - mock_request_get.return_value.text = ''' - ../ - xml-apis-1.0.b2.jar - 2005-09-20 05:53 109318 - xml-apis-1.0.b2.pom - 2005-09-20 05:53 2249 - ''' - url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/' - expected = { - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar': '2005-09-20 05:53', - } - self.assertEqual(expected, maven_visitor.get_artifact_links(url)) - - def test_crawl_to_package(self): - pass - - def test_crawl_maven_repo_from_root(self): - pass - - @mock.patch('requests.get') - def test_get_artifact_sha1(self, mock_request_get): - sha1 = 
'3136ca936f64c9d68529f048c2618bd356bf85c9' - mock_request_get.return_value.ok = True - mock_request_get.return_value.text = sha1 - self.assertEqual(sha1, maven_visitor.get_artifact_sha1( - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar.sha1')) - - def test_get_classifier_from_artifact_url(self): - artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' - package_version_page_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/' - package_name = 'livereload-jvm' - package_version = '0.2.0' - classifier = maven_visitor.get_classifier_from_artifact_url( - artifact_url, - package_version_page_url, - package_name, - package_version - ) - self.assertEqual('onejar', classifier) + expected_loc = self.get_test_loc('maven/index/increment2/expected_mini_package.json') + self.check_expected_results(results, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_npm.py b/minecode/tests/test_npm.py index 06313841..ddbf733f 100644 --- a/minecode/tests/test_npm.py +++ b/minecode/tests/test_npm.py @@ -7,19 +7,12 @@ # See https://aboutcode.org for more information about nexB OSS projects. # - -import codecs import json import os import re -from django.test import TestCase as DjangoTestCase -from mock import Mock from mock import patch -from packagedcode.npm import NpmPackageJsonHandler -from packageurl import PackageURL -import packagedb from minecode import miners from minecode import route from minecode.models import ResourceURI @@ -180,42 +173,3 @@ def test_regex_npm_mapper(self): result = re.match( regex, 'https://registry.npmjs.org/react-mobile-navigation-modal') self.assertTrue(result) - - -class NpmPriorityQueueTests(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def setUp(self): - super(NpmPriorityQueueTests, self).setUp() - self.expected_json_loc = self.get_test_loc( - 'npm/lodash_package-expected.json') - with open(self.expected_json_loc) as f: - self.expected_json_contents = json.load(f) - - self.scan_package = NpmPackageJsonHandler._parse( - json_data=self.expected_json_contents, - ) - - def test_get_package_json(self, regen=FIXTURES_REGEN): - json_contents = npm.get_package_json( - namespace=self.scan_package.namespace, - name=self.scan_package.name, - version=self.scan_package.version - ) - if regen: - with open(self.expected_json_loc, 'w') as f: - json.dump(json_contents, f, indent=3, separators=(',', ':')) - self.assertEqual(self.expected_json_contents, json_contents) - - def test_map_npm_package(self): - package_count = packagedb.models.Package.objects.all().count() - self.assertEqual(0, package_count) - package_url = PackageURL.from_string(self.scan_package.purl) - npm.map_npm_package(package_url, ('test_pipeline')) - package_count = packagedb.models.Package.objects.all().count() - self.assertEqual(1, package_count) - package = packagedb.models.Package.objects.all().first() - expected_purl_str = 'pkg:npm/lodash@4.17.21' - expected_download_url = 'https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz' - self.assertEqual(expected_purl_str, package.purl) - self.assertEqual(expected_download_url, package.download_url) diff --git a/packagedb/api.py b/packagedb/api.py index 63247f0a..2503da7b 100644 --- a/packagedb/api.py +++ b/packagedb/api.py @@ -40,11 +40,10 @@ from univers.versions import InvalidVersion # UnusedImport here! 
-# But importing the miners module triggers routes registration +# But importing the collectors module triggers routes registration from minecode import priority_router -from minecode import miners # NOQA +from minecode import collectors # NOQA from minecode.models import PriorityResourceURI -from minecode.models import ScannableURI from minecode.route import NoRouteAvailable from packagedb.filters import PackageSearchFilter from packagedb.models import Package diff --git a/packagedb/management/commands/fix_purl_values.py b/packagedb/management/commands/fix_purl_values.py index 02d3196f..be34ff80 100644 --- a/packagedb/management/commands/fix_purl_values.py +++ b/packagedb/management/commands/fix_purl_values.py @@ -21,8 +21,8 @@ from minecode.management.commands import VerboseCommand from minecode.utils import MemorySavingQuerysetIterator -from minecode.miners.maven import collect_links_from_text -from minecode.miners.maven import filter_for_artifacts +from minecode.collectors.maven import collect_links_from_text +from minecode.collectors.maven import filter_for_artifacts from packagedb.models import Package DEFAULT_TIMEOUT = 30 diff --git a/packagedb/tests/test_api.py b/packagedb/tests/test_api.py index e91dfe83..d1620333 100644 --- a/packagedb/tests/test_api.py +++ b/packagedb/tests/test_api.py @@ -15,7 +15,6 @@ from django.test import TestCase from django.urls import reverse from django.utils import timezone -from packageurl.contrib.django.utils import purl_to_lookups from rest_framework import status from rest_framework.test import APIClient from univers.versions import MavenVersion diff --git a/purl2vcs/src/purl2vcs/find_source_repo.py b/purl2vcs/src/purl2vcs/find_source_repo.py index fd60d3d9..6fe917a7 100644 --- a/purl2vcs/src/purl2vcs/find_source_repo.py +++ b/purl2vcs/src/purl2vcs/find_source_repo.py @@ -19,7 +19,7 @@ from scancode.api import get_urls as get_urls_from_location from minecode.model_utils import add_package_to_scan_queue -from minecode.miners.maven import get_merged_ancestor_package_from_maven_package +from minecode.collectors.maven import get_merged_ancestor_package_from_maven_package from packagedb.models import Package, PackageContentType, PackageSet logger = logging.getLogger(__name__) From b873555b6295d476125d1c39800499b5a6407b32 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 9 Aug 2024 11:45:58 -0700 Subject: [PATCH 03/12] Move miners test to own directory #515 Signed-off-by: Jono Yang --- minecode/tests/collectors/__init__.py | 8 ++++++++ minecode/tests/miners/__init__.py | 8 ++++++++ minecode/tests/{ => miners}/test_apache.py | 4 ++-- minecode/tests/{ => miners}/test_bitbucket.py | 4 ++-- minecode/tests/{ => miners}/test_bower.py | 4 ++-- minecode/tests/{ => miners}/test_cpan.py | 4 ++-- minecode/tests/{ => miners}/test_cran.py | 4 ++-- minecode/tests/{ => miners}/test_debian.py | 2 +- minecode/tests/{ => miners}/test_dockerhub.py | 2 +- minecode/tests/{ => miners}/test_eclipse.py | 4 ++-- minecode/tests/{ => miners}/test_fdroid.py | 4 ++-- minecode/tests/{ => miners}/test_freebsd.py | 4 ++-- minecode/tests/{ => miners}/test_freedesktop.py | 2 +- minecode/tests/{ => miners}/test_github.py | 4 ++-- minecode/tests/{ => miners}/test_gitlab.py | 6 +++--- minecode/tests/{ => miners}/test_golang.py | 4 ++-- minecode/tests/{ => miners}/test_googlecode.py | 4 ++-- minecode/tests/{ => miners}/test_gstreamer.py | 4 ++-- minecode/tests/{ => miners}/test_haxe.py | 4 ++-- minecode/tests/{ => miners}/test_maven.py | 14 +++++++------- minecode/tests/{ => 
miners}/test_npm.py | 4 ++-- minecode/tests/{ => miners}/test_nuget.py | 4 ++-- minecode/tests/{ => miners}/test_openssl.py | 4 ++-- minecode/tests/{ => miners}/test_openwrt.py | 4 ++-- minecode/tests/{ => miners}/test_packagist.py | 4 ++-- minecode/tests/{ => miners}/test_pypi.py | 4 ++-- minecode/tests/{ => miners}/test_repodata.py | 2 +- minecode/tests/{ => miners}/test_repodata_rpms.py | 2 +- minecode/tests/{ => miners}/test_repomd_parser.py | 2 +- minecode/tests/{ => miners}/test_rubygems.py | 8 ++++---- minecode/tests/{ => miners}/test_sourceforge.py | 4 ++-- minecode/tests/test_models.py | 3 --- minecode/tests/test_run_visit.py | 3 --- 33 files changed, 76 insertions(+), 66 deletions(-) create mode 100644 minecode/tests/collectors/__init__.py create mode 100644 minecode/tests/miners/__init__.py rename minecode/tests/{ => miners}/test_apache.py (98%) rename minecode/tests/{ => miners}/test_bitbucket.py (97%) rename minecode/tests/{ => miners}/test_bower.py (94%) rename minecode/tests/{ => miners}/test_cpan.py (97%) rename minecode/tests/{ => miners}/test_cran.py (94%) rename minecode/tests/{ => miners}/test_debian.py (99%) rename minecode/tests/{ => miners}/test_dockerhub.py (97%) rename minecode/tests/{ => miners}/test_eclipse.py (96%) rename minecode/tests/{ => miners}/test_fdroid.py (91%) rename minecode/tests/{ => miners}/test_freebsd.py (93%) rename minecode/tests/{ => miners}/test_freedesktop.py (96%) rename minecode/tests/{ => miners}/test_github.py (97%) rename minecode/tests/{ => miners}/test_gitlab.py (89%) rename minecode/tests/{ => miners}/test_golang.py (96%) rename minecode/tests/{ => miners}/test_googlecode.py (96%) rename minecode/tests/{ => miners}/test_gstreamer.py (93%) rename minecode/tests/{ => miners}/test_haxe.py (93%) rename minecode/tests/{ => miners}/test_maven.py (98%) rename minecode/tests/{ => miners}/test_npm.py (97%) rename minecode/tests/{ => miners}/test_nuget.py (96%) rename minecode/tests/{ => miners}/test_openssl.py (93%) rename minecode/tests/{ => miners}/test_openwrt.py (96%) rename minecode/tests/{ => miners}/test_packagist.py (90%) rename minecode/tests/{ => miners}/test_pypi.py (98%) rename minecode/tests/{ => miners}/test_repodata.py (97%) rename minecode/tests/{ => miners}/test_repodata_rpms.py (92%) rename minecode/tests/{ => miners}/test_repomd_parser.py (99%) rename minecode/tests/{ => miners}/test_rubygems.py (97%) rename minecode/tests/{ => miners}/test_sourceforge.py (96%) diff --git a/minecode/tests/collectors/__init__.py b/minecode/tests/collectors/__init__.py new file mode 100644 index 00000000..2eb8f9f0 --- /dev/null +++ b/minecode/tests/collectors/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# diff --git a/minecode/tests/miners/__init__.py b/minecode/tests/miners/__init__.py new file mode 100644 index 00000000..2eb8f9f0 --- /dev/null +++ b/minecode/tests/miners/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. 
+# See https://aboutcode.org for more information about nexB OSS projects. +# diff --git a/minecode/tests/test_apache.py b/minecode/tests/miners/test_apache.py similarity index 98% rename from minecode/tests/test_apache.py rename to minecode/tests/miners/test_apache.py index 63758a46..e5d6eaa4 100644 --- a/minecode/tests/test_apache.py +++ b/minecode/tests/miners/test_apache.py @@ -24,7 +24,7 @@ class ApacheVistorTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_ApacheDistIndexVisitor(self): uri = 'http://apache.org/dist/zzz/find-ls.gz' @@ -130,7 +130,7 @@ def test_ApachePodlingsJsonVisitor(self): class ApacheMapperTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_build_package_from_download(self): package = miners.apache.build_package_from_download( diff --git a/minecode/tests/test_bitbucket.py b/minecode/tests/miners/test_bitbucket.py similarity index 97% rename from minecode/tests/test_bitbucket.py rename to minecode/tests/miners/test_bitbucket.py index c4e4b12a..ad9a35f8 100644 --- a/minecode/tests/test_bitbucket.py +++ b/minecode/tests/miners/test_bitbucket.py @@ -29,7 +29,7 @@ class BitbucketVisitorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_BitbucketIndexVisitor(self): uri = 'https://api.bitbucket.org/2.0/repositories?pagelen=10' @@ -84,7 +84,7 @@ def test_BitbucketDetailsVisitorPaginated(self): class BitbucketMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_pattern_match_without_download(self): url = 'https://api.bitbucket.org/2.0/repositories/phlogistonjohn/tweakmsg' diff --git a/minecode/tests/test_bower.py b/minecode/tests/miners/test_bower.py similarity index 94% rename from minecode/tests/test_bower.py rename to minecode/tests/miners/test_bower.py index 9c852bdc..68034124 100644 --- a/minecode/tests/test_bower.py +++ b/minecode/tests/miners/test_bower.py @@ -23,7 +23,7 @@ class BowerVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_visit_findls_file(self): uri = 'https://registry.bower.io/packages' @@ -46,7 +46,7 @@ def test_visit_bower_json_file(self): class BowerMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_build_packages_metafile_from_bowerjson1(self): with open(self.get_test_loc('bower/28msec_bower.json')) as bower_metadata: diff --git a/minecode/tests/test_cpan.py b/minecode/tests/miners/test_cpan.py similarity index 97% rename from minecode/tests/test_cpan.py rename to minecode/tests/miners/test_cpan.py index 0c9e22af..806dfc51 100644 --- a/minecode/tests/test_cpan.py +++ b/minecode/tests/miners/test_cpan.py @@ -23,7 +23,7 @@ class CpanVisitorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = 
os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_metacpanauthorurlvisitors(self): uri = 'https://fastapi.metacpan.org/author/_search?q=email:a*&size=5000' @@ -77,7 +77,7 @@ def test_visit_readme_file(self): class CpanMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_build_from_release_search_json(self): with open(self.get_test_loc('cpan/release_search.json')) as cpan_metadata: diff --git a/minecode/tests/test_cran.py b/minecode/tests/miners/test_cran.py similarity index 94% rename from minecode/tests/test_cran.py rename to minecode/tests/miners/test_cran.py index a0a88bde..d81b30fb 100644 --- a/minecode/tests/test_cran.py +++ b/minecode/tests/miners/test_cran.py @@ -24,7 +24,7 @@ class CranVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_visit_metacpan_api_projects(self): uri = 'https://cloud.r-project.org/web/packages/available_packages_by_date.html' @@ -37,7 +37,7 @@ def test_visit_metacpan_api_projects(self): class CranMapperTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_build_packages_from_directory_listing(self): ResourceURI.objects.create( diff --git a/minecode/tests/test_debian.py b/minecode/tests/miners/test_debian.py similarity index 99% rename from minecode/tests/test_debian.py rename to minecode/tests/miners/test_debian.py index 6b8a8415..2a35fff0 100644 --- a/minecode/tests/test_debian.py +++ b/minecode/tests/miners/test_debian.py @@ -26,7 +26,7 @@ class BaseDebianTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def check_objects_expected(self, results, expected_loc, regen=FIXTURES_REGEN): """ diff --git a/minecode/tests/test_dockerhub.py b/minecode/tests/miners/test_dockerhub.py similarity index 97% rename from minecode/tests/test_dockerhub.py rename to minecode/tests/miners/test_dockerhub.py index ef90145f..8a7f802d 100644 --- a/minecode/tests/test_dockerhub.py +++ b/minecode/tests/miners/test_dockerhub.py @@ -25,7 +25,7 @@ class DockerHubTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') class DockerHubVistorTest(DockerHubTest): diff --git a/minecode/tests/test_eclipse.py b/minecode/tests/miners/test_eclipse.py similarity index 96% rename from minecode/tests/test_eclipse.py rename to minecode/tests/miners/test_eclipse.py index 35f7a8b4..7cbc4245 100644 --- a/minecode/tests/test_eclipse.py +++ b/minecode/tests/miners/test_eclipse.py @@ -25,7 +25,7 @@ class EclipseVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_visit_eclipse_projects(self): uri = 'https://projects.eclipse.org/list-of-projects' @@ -101,7 +101,7 @@ def test_visitor_eclipse_projects_json_download_timeout_error(self): class TestEclipseMap(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') 
+ test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_build_packages(self): with open(self.get_test_loc('eclipse/birt.json')) as eclipse_metadata: diff --git a/minecode/tests/test_fdroid.py b/minecode/tests/miners/test_fdroid.py similarity index 91% rename from minecode/tests/test_fdroid.py rename to minecode/tests/miners/test_fdroid.py index 4611d3bc..ffd7f1c5 100644 --- a/minecode/tests/test_fdroid.py +++ b/minecode/tests/miners/test_fdroid.py @@ -21,7 +21,7 @@ class TestFdroidVisitor(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_FdroidPackageRepoVisitor(self): uri = 'https://f-droid.org/repo/index-v2.json' @@ -38,7 +38,7 @@ def test_FdroidPackageRepoVisitor(self): class TestFdroidMapper(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_build_packages(self): with open(self.get_test_loc('fdroid/index-v2-visited.json')) as fdroid_data: diff --git a/minecode/tests/test_freebsd.py b/minecode/tests/miners/test_freebsd.py similarity index 93% rename from minecode/tests/test_freebsd.py rename to minecode/tests/miners/test_freebsd.py index d28b5539..a16c9a7e 100644 --- a/minecode/tests/test_freebsd.py +++ b/minecode/tests/miners/test_freebsd.py @@ -23,7 +23,7 @@ class FreeBSDVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_visit_freebsd_seed(self): uri = 'https://pkg.freebsd.org' @@ -55,7 +55,7 @@ def test_visit_freebsd_indexvisitor(self): class FreedesktopMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_map_index_file(self): with open(self.get_test_loc('freebsd/mapper_input1')) as freebsd_metadata: diff --git a/minecode/tests/test_freedesktop.py b/minecode/tests/miners/test_freedesktop.py similarity index 96% rename from minecode/tests/test_freedesktop.py rename to minecode/tests/miners/test_freedesktop.py index c9a90878..00515f08 100644 --- a/minecode/tests/test_freedesktop.py +++ b/minecode/tests/miners/test_freedesktop.py @@ -22,7 +22,7 @@ class FreedesktopTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') class FreedesktopVistorTest(FreedesktopTest): diff --git a/minecode/tests/test_github.py b/minecode/tests/miners/test_github.py similarity index 97% rename from minecode/tests/test_github.py rename to minecode/tests/miners/test_github.py index 31ac7938..4bb6c42f 100644 --- a/minecode/tests/test_github.py +++ b/minecode/tests/miners/test_github.py @@ -27,7 +27,7 @@ class GithubVisitorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') @patch('github.MainClass.Github.get_repo') def test_GithubRepoVisitor(self, mock_get_repo): @@ -142,7 +142,7 @@ def test_GithubReposVisitor(self): class GithubMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = 
os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_github_repo_mapper1(self): with open(self.get_test_loc('github/calendar_builder.json')) as json_metadata: diff --git a/minecode/tests/test_gitlab.py b/minecode/tests/miners/test_gitlab.py similarity index 89% rename from minecode/tests/test_gitlab.py rename to minecode/tests/miners/test_gitlab.py index 6f01ad1f..604b5dc4 100644 --- a/minecode/tests/test_gitlab.py +++ b/minecode/tests/miners/test_gitlab.py @@ -22,11 +22,11 @@ class GitlabTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') class GitlabVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') @unittest.skip('The test is to test fetching remotely through http connection') def test_visit_api_header_getheaders(self): @@ -47,7 +47,7 @@ def test_visit_metacpan_api_projects(self): class GitlabMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_map_software_html_page_hal(self): with open(self.get_test_loc('gitlab/microservice-express-mongo.json')) as gitlab_json: diff --git a/minecode/tests/test_golang.py b/minecode/tests/miners/test_golang.py similarity index 96% rename from minecode/tests/test_golang.py rename to minecode/tests/miners/test_golang.py index 0a490f45..6695a271 100644 --- a/minecode/tests/test_golang.py +++ b/minecode/tests/miners/test_golang.py @@ -26,7 +26,7 @@ class GoLangVisitorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_GoLangGoDocAPIVisitor(self): uri = 'https://api.godoc.org/packages' @@ -67,7 +67,7 @@ def test_parse_package_path(self): class GoLangMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_build_golang_package(self): purl = 'pkg:golang/github.com/golang/glog?vcs_repository=https://github.com/golang/glog' diff --git a/minecode/tests/test_googlecode.py b/minecode/tests/miners/test_googlecode.py similarity index 96% rename from minecode/tests/test_googlecode.py rename to minecode/tests/miners/test_googlecode.py index 4f6b0743..f4b344fa 100644 --- a/minecode/tests/test_googlecode.py +++ b/minecode/tests/miners/test_googlecode.py @@ -24,7 +24,7 @@ class GoogleNewAPIVisitorsTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_visit_google_download_zip_visitor(self): uri = 'https://storage.googleapis.com/google-code-archive/google-code-archive.txt.zip' @@ -90,7 +90,7 @@ def test_visit_googleapi_project_json(self): class GoogleNewAPIMappersTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_build_packages_from_v2_projects_json(self): with open(self.get_test_loc('googlecode/v2_api/project.json')) as projectsjson_meta: diff --git a/minecode/tests/test_gstreamer.py 
b/minecode/tests/miners/test_gstreamer.py similarity index 93% rename from minecode/tests/test_gstreamer.py rename to minecode/tests/miners/test_gstreamer.py index 1e64f783..9b41cbb9 100644 --- a/minecode/tests/test_gstreamer.py +++ b/minecode/tests/miners/test_gstreamer.py @@ -22,7 +22,7 @@ class GstreamerVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_visit_gstreamer_source_root(self): uri = 'https://gstreamer.freedesktop.org/src/' @@ -45,7 +45,7 @@ def test_visit_Gstreamer_subpath_contains_file_resources(self): class GstreamerMappersTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_build_package_mapper_regex(self): regex = re.compile( diff --git a/minecode/tests/test_haxe.py b/minecode/tests/miners/test_haxe.py similarity index 93% rename from minecode/tests/test_haxe.py rename to minecode/tests/miners/test_haxe.py index 2158fc73..f729a280 100644 --- a/minecode/tests/test_haxe.py +++ b/minecode/tests/miners/test_haxe.py @@ -22,7 +22,7 @@ class HaxeVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_visit_haxe_projects(self): uri = 'https://lib.haxe.org/all' @@ -55,7 +55,7 @@ def test_visit_haxe_package_json(self): class HaxeMappersTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_build_project_package_json(self): with open(self.get_test_loc('haxe/project_package.json')) as projectsjson_meta: diff --git a/minecode/tests/test_maven.py b/minecode/tests/miners/test_maven.py similarity index 98% rename from minecode/tests/test_maven.py rename to minecode/tests/miners/test_maven.py index 8b5a467e..2ebc6e2d 100644 --- a/minecode/tests/test_maven.py +++ b/minecode/tests/miners/test_maven.py @@ -43,7 +43,7 @@ def sort_deps(results): class MavenMiscTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_get_entries(self): index = self.get_test_loc('maven/index/nexus-maven-repository-index.gz') @@ -199,7 +199,7 @@ def test_build_maven_xml_url(self): class MavenVisitorTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_MavenNexusIndexVisitor_uris(self): uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz' @@ -280,7 +280,7 @@ def test_MavenPOMVisitor_data(self): class MavenEnd2EndTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_MavenNexusIndexVisitor_with_run_visit_then_map_end2end(self): # setup @@ -482,7 +482,7 @@ def test_visit_and_map_with_index(self): class MavenXmlMetadataVisitorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = 
os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_visit_maven_medatata_xml_file(self): uri = 'https://repo1.maven.org/maven2/st/digitru/identity-core/maven-metadata.xml' @@ -495,7 +495,7 @@ def test_visit_maven_medatata_xml_file(self): class MavenHtmlIndexVisitorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_visit_maven_medatata_html_index_jcenter_1(self): uri = 'http://jcenter.bintray.com/' @@ -527,7 +527,7 @@ def test_visit_maven_medatata_html_index_jcenter_3(self): # FIXME: we should not need to call a visitor for testing a mapper class MavenMapperVisitAndMapTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_visit_and_build_package_from_pom_axis(self): uri = 'https://repo1.maven.org/maven2/axis/axis/1.4/axis-1.4.pom' @@ -602,7 +602,7 @@ def test_visit_and_build_package_from_pom_with_unicode(self): class MavenMapperGetPackageTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_get_package_from_pom_1(self): test_loc = self.get_test_loc('maven/parsing/parse/jds-3.0.1.pom') diff --git a/minecode/tests/test_npm.py b/minecode/tests/miners/test_npm.py similarity index 97% rename from minecode/tests/test_npm.py rename to minecode/tests/miners/test_npm.py index ddbf733f..3b8003df 100644 --- a/minecode/tests/test_npm.py +++ b/minecode/tests/miners/test_npm.py @@ -23,7 +23,7 @@ class TestNPMVisit(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') # FIXME: use smaller test files def test_NpmRegistryVisitor(self): @@ -57,7 +57,7 @@ def test_NpmRegistryVisitor_1000records(self): class TestNPMMapper(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_build_packages(self): with open(self.get_test_loc('npm/0flux.json')) as npm_metadata: diff --git a/minecode/tests/test_nuget.py b/minecode/tests/miners/test_nuget.py similarity index 96% rename from minecode/tests/test_nuget.py rename to minecode/tests/miners/test_nuget.py index 92e5ccb9..8a0b7a7b 100644 --- a/minecode/tests/test_nuget.py +++ b/minecode/tests/miners/test_nuget.py @@ -24,7 +24,7 @@ class NugetVisitorsTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_NugetQueryVisitor(self): uri = 'https://api-v2v3search-0.nuget.org/query' @@ -75,7 +75,7 @@ def test_NugetHTMLPackageVisitor(self): class TestNugetMap(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_build_packages(self): with open(self.get_test_loc('nuget/entityframework2.json')) as nuget_metadata: diff --git a/minecode/tests/test_openssl.py b/minecode/tests/miners/test_openssl.py similarity index 93% rename from minecode/tests/test_openssl.py rename to 
minecode/tests/miners/test_openssl.py index 7835aa07..dd414e27 100644 --- a/minecode/tests/test_openssl.py +++ b/minecode/tests/miners/test_openssl.py @@ -26,7 +26,7 @@ class OpenSSLVisitorsTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_OpenSSLVisitor(self): uri = 'https://ftp.openssl.org/' @@ -51,7 +51,7 @@ def test_OpenSSLVisitor_sub_folder(self): class OpenSSLTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_OpenSSL_mapper(self): uri = 'https://ftp.openssl.org/snapshot/openssl-1.0.2-stable-SNAP-20180518.tar.gz' diff --git a/minecode/tests/test_openwrt.py b/minecode/tests/miners/test_openwrt.py similarity index 96% rename from minecode/tests/test_openwrt.py rename to minecode/tests/miners/test_openwrt.py index d10a7113..f9122260 100644 --- a/minecode/tests/test_openwrt.py +++ b/minecode/tests/miners/test_openwrt.py @@ -22,7 +22,7 @@ class OpenWRTVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_visit_openwrt_download_pages(self): uri = 'https://downloads.openwrt.org/chaos_calmer/15.05/' @@ -83,7 +83,7 @@ def test_visitor_openwrt_ipk2(self): class OpenWRTMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') @expectedFailure def test_build_packages_1(self): diff --git a/minecode/tests/test_packagist.py b/minecode/tests/miners/test_packagist.py similarity index 90% rename from minecode/tests/test_packagist.py rename to minecode/tests/miners/test_packagist.py index beb71b49..da5b1efb 100644 --- a/minecode/tests/test_packagist.py +++ b/minecode/tests/miners/test_packagist.py @@ -22,7 +22,7 @@ class PackagistVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_visit_packagistlist(self): uri = 'https://packagist.org/packages/list.json' @@ -35,7 +35,7 @@ def test_visit_packagistlist(self): class TestPackagistMap(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_build_packages(self): with open(self.get_test_loc('packagist/00f100_cakephp-opauth.json')) as packagist_package: diff --git a/minecode/tests/test_pypi.py b/minecode/tests/miners/test_pypi.py similarity index 98% rename from minecode/tests/test_pypi.py rename to minecode/tests/miners/test_pypi.py index b1564093..8d39ca7c 100644 --- a/minecode/tests/test_pypi.py +++ b/minecode/tests/miners/test_pypi.py @@ -32,7 +32,7 @@ class TestPypiVisit(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') ''' import unittest @@ -143,7 +143,7 @@ def __init__(self, uri, data): class TestPypiMap(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = 
os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_build_packages_lxml(self): with open(self.get_test_loc('pypi/lxml-3.2.0.json')) as pypi_meta: diff --git a/minecode/tests/test_repodata.py b/minecode/tests/miners/test_repodata.py similarity index 97% rename from minecode/tests/test_repodata.py rename to minecode/tests/miners/test_repodata.py index ab79eeed..55b9feb6 100644 --- a/minecode/tests/test_repodata.py +++ b/minecode/tests/miners/test_repodata.py @@ -15,7 +15,7 @@ class TestRepoData(FileBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_get_pkg_infos(self): filelists_xml = self.get_test_loc( diff --git a/minecode/tests/test_repodata_rpms.py b/minecode/tests/miners/test_repodata_rpms.py similarity index 92% rename from minecode/tests/test_repodata_rpms.py rename to minecode/tests/miners/test_repodata_rpms.py index 623f538d..6fca7b6c 100644 --- a/minecode/tests/test_repodata_rpms.py +++ b/minecode/tests/miners/test_repodata_rpms.py @@ -20,7 +20,7 @@ class RepodataRPMVisitorsTest(MiningTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_collect_rsync_urls(self): directory_listing_loc = self.get_test_loc( diff --git a/minecode/tests/test_repomd_parser.py b/minecode/tests/miners/test_repomd_parser.py similarity index 99% rename from minecode/tests/test_repomd_parser.py rename to minecode/tests/miners/test_repomd_parser.py index 8bffa285..f74d3178 100644 --- a/minecode/tests/test_repomd_parser.py +++ b/minecode/tests/miners/test_repomd_parser.py @@ -28,7 +28,7 @@ class TestRepomdParser(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_combine_list_of_dicts(self): expected = {'a': '1', 'b': '2', 'c': '3'} diff --git a/minecode/tests/test_rubygems.py b/minecode/tests/miners/test_rubygems.py similarity index 97% rename from minecode/tests/test_rubygems.py rename to minecode/tests/miners/test_rubygems.py index d5165a6f..edb3e91c 100644 --- a/minecode/tests/test_rubygems.py +++ b/minecode/tests/miners/test_rubygems.py @@ -46,7 +46,7 @@ class RubyGemsVisitorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_check_gem_file_visitor_routes(self): routes = [ @@ -113,7 +113,7 @@ def test_RubyGemsPackageArchiveMetadataVisitor(self): class RubyGemsApiMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_build_rubygem_packages_from_api_data_1(self): with open(self.get_test_loc('rubygems/apiv1/0xffffff.api.json')) as api: @@ -173,7 +173,7 @@ def test_RubyGemsApiVersionsJsonMapper(self): class RubyGemsArchiveMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_test_RubyGemsPackageArchiveMetadataMapper(self): test_uri = 'https://rubygems.org/downloads/mysmallidea-address_standardization-0.4.1.gem' @@ -281,7 +281,7 @@ def 
test_build_rubygem_packages_from_metadata_with_deps(self): class RubyEnd2EndTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_visit_and_map_end2end(self): from minecode.management.commands.run_visit import visit_uri diff --git a/minecode/tests/test_sourceforge.py b/minecode/tests/miners/test_sourceforge.py similarity index 96% rename from minecode/tests/test_sourceforge.py rename to minecode/tests/miners/test_sourceforge.py index 4e08ef47..25a53c83 100644 --- a/minecode/tests/test_sourceforge.py +++ b/minecode/tests/miners/test_sourceforge.py @@ -22,7 +22,7 @@ class SourceforgeVisitorsTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_visit_sf_sitemap_index_new(self): uri = 'http://sourceforge.net/sitemap.xml' @@ -73,7 +73,7 @@ def test_visit_sf_project_json_api_new(self): class SourceforgeMappersTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') def test_build_packages(self): with open(self.get_test_loc('sourceforge/odanur.json')) as sourceforge_metadata: diff --git a/minecode/tests/test_models.py b/minecode/tests/test_models.py index d7a5955d..7ef88672 100644 --- a/minecode/tests/test_models.py +++ b/minecode/tests/test_models.py @@ -13,9 +13,6 @@ from django.test import TestCase from django.utils import timezone -from minecode import miners -from minecode import miners - from minecode.models import ResourceURI from packagedb.models import Package from minecode.models import get_canonical diff --git a/minecode/tests/test_run_visit.py b/minecode/tests/test_run_visit.py index d3364692..b38102c6 100644 --- a/minecode/tests/test_run_visit.py +++ b/minecode/tests/test_run_visit.py @@ -7,13 +7,10 @@ # See https://aboutcode.org for more information about nexB OSS projects. # - -from operator import itemgetter from io import StringIO from collections import Counter from django.core import management -from django.forms.models import model_to_dict from minecode.utils_test import MiningTestCase from minecode.management.commands.run_visit import visit_uri From f164569c32eb8d99b230a78490dfbfdbc1d46783 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 13 Aug 2024 13:21:35 -0700 Subject: [PATCH 04/12] Add ruff to testing dependencies #512 #515 Signed-off-by: Jono Yang --- Makefile | 25 +++++++++------------ pyproject.toml | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++ setup.cfg | 1 + 3 files changed, 70 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index bc8eeaea..0e416510 100644 --- a/Makefile +++ b/Makefile @@ -56,27 +56,22 @@ envfile_testing: envfile @echo SCANCODEIO_DB_USER=\"postgres\" >> ${ENV_FILE} @echo SCANCODEIO_DB_PASSWORD=\"postgres\" >> ${ENV_FILE} -isort: - @echo "-> Apply isort changes to ensure proper imports ordering" - ${VENV}/bin/isort . - -black: - @echo "-> Apply black code formatter" - ${VENV}/bin/black . 
- doc8: @echo "-> Run doc8 validation" @${ACTIVATE} doc8 --max-line-length 100 --ignore-path docs/_build/ --quiet docs/ -valid: isort black +valid: + @echo "-> Run Ruff format" + @${ACTIVATE} ruff format --exclude etc/scripts/ --exclude purldb-toolkit/ --exclude purl2vcs/ + @echo "-> Run Ruff linter" + @${ACTIVATE} ruff check --fix --exclude etc/scripts/ --exclude purldb-toolkit/ --exclude purl2vcs/ check: - @echo "-> Run pycodestyle (PEP8) validation" - @${ACTIVATE} pycodestyle --max-line-length=100 --exclude=venv,lib,thirdparty,docs,migrations,settings.py . - @echo "-> Run isort imports ordering validation" - @${ACTIVATE} isort --check-only . - @echo "-> Run black validation" - @${ACTIVATE} black --check ${BLACK_ARGS} + @echo "-> Run Ruff linter validation (pycodestyle, bandit, isort, and more)" + @${ACTIVATE} ruff check --exclude etc/scripts/ --exclude purldb-toolkit/ --exclude purl2vcs/ + @echo "-> Run Ruff format validation" + @${ACTIVATE} ruff format --check --exclude etc/scripts/ --exclude purldb-toolkit/ --exclude purl2vcs/ + @$(MAKE) doc8 clean: @echo "-> Clean the Python env" diff --git a/pyproject.toml b/pyproject.toml index cde79074..65f49137 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,3 +50,62 @@ addopts = [ "--strict-markers", "--doctest-modules" ] + +[tool.ruff] +line-length = 88 +extend-exclude = ["migrations", "var"] +target-version = "py310" + +[tool.ruff.lint] +# Rules: https://docs.astral.sh/ruff/rules/ +select = [ + "E", # pycodestyle + "W", # pycodestyle warnings + "D", # pydocstyle + "F", # Pyflakes + "UP", # pyupgrade + "S", # flake8-bandit + "I", # isort + "C9", # McCabe complexity +] +ignore = [ + "D1", + "D203", # one-blank-line-before-class + "D205", # blank-line-after-summary + "D212", # multi-line-summary-first-line + "D400", # ends-in-period + "D415", # ends-in-punctuation + "E501", # line-too-long + # TODO: we want to address these issues in the codebase, then get rid of + # the following ignores + "C901", # complex-structure + "S101", # assert + "S103", # bad-file-permissions + "S113", # request-without-timeout + "S202", # tarfile-unsafe-members + "S314", # suspicious-xml-element-tree-usage + "S320", # suspicious-xmle-tree-usage + "S324", # hashlib-insecure-hash-function + "S506", # unsafe-yaml-load + "S602", # subprocess-popen-with-shell-equals-true +] + +[tool.ruff.lint.isort] +force-single-line = true +sections = { django = ["django"] } +section-order = [ + "future", + "standard-library", + "django", + "third-party", + "first-party", + "local-folder", +] + +[tool.ruff.lint.mccabe] +max-complexity = 10 + +[tool.ruff.lint.per-file-ignores] +"**/testfiles/**.py" = ["F821"] # Ignore undefined names from test files +"matchcode_project/settings.py" = ["F403", "F405"] # Ignore undefined names from star imports and star imports +"purldb_public_project/settings.py" = ["F403", "F405"] # Ignore undefined names from star imports and star imports diff --git a/setup.cfg b/setup.cfg index 87505db8..898d1042 100644 --- a/setup.cfg +++ b/setup.cfg @@ -80,6 +80,7 @@ testing = black mock flot + ruff docs = Sphinx>=5.0.2 From 57f9cc03259a54f0f6ab1b2318ff186bd0dfe2d7 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 13 Aug 2024 13:25:29 -0700 Subject: [PATCH 05/12] Add ruff and perform initial reformat #512 #515 Signed-off-by: Jono Yang --- clearcode/cdutils.py | 334 ++-- clearcode/load.py | 59 +- clearcode/management/commands/clearload.py | 23 +- clearcode/management/commands/clearsync.py | 110 +- clearcode/management/commands/store_scans.py | 13 +- 
clearcode/models.py | 61 +- clearcode/store_scans.py | 89 +- clearcode/sync.py | 386 ++-- clearcode/tests/test_models.py | 63 +- clearcode/tests/test_sync.py | 12 +- clearindex/harvest.py | 150 +- .../management/commands/run_clearindex.py | 181 +- clearindex/utils.py | 14 +- manage_matchcode.py | 6 +- manage_purldb.py | 5 +- matchcode/api.py | 105 +- matchcode/match.py | 106 +- matchcode/models.py | 161 +- matchcode/tests/__init__.py | 1 - matchcode/tests/test_match.py | 496 ++--- matchcode/tests/test_models.py | 312 ++- matchcode/utils.py | 104 +- matchcode_pipeline/api.py | 52 +- matchcode_pipeline/pipelines/matching.py | 3 +- matchcode_pipeline/pipes/matching.py | 52 +- .../tests/pipes/test_matching.py | 34 +- matchcode_pipeline/tests/test_api.py | 244 ++- matchcode_project/dbrouter.py | 28 +- matchcode_project/settings.py | 51 +- matchcode_project/urls.py | 14 +- minecode/__init__.py | 9 +- minecode/api.py | 121 +- minecode/apps.py | 4 +- minecode/collectors/conan.py | 15 +- minecode/collectors/debian.py | 189 +- minecode/collectors/generic.py | 19 +- minecode/collectors/github.py | 10 +- minecode/collectors/gnu.py | 7 +- minecode/collectors/maven.py | 302 ++- minecode/collectors/npm.py | 26 +- minecode/collectors/openssl.py | 9 +- minecode/command.py | 32 +- minecode/debutils.py | 20 +- minecode/filter.py | 58 +- minecode/indexing.py | 77 +- minecode/ls.py | 86 +- minecode/management/commands/__init__.py | 18 +- .../management/commands/check_licenses.py | 61 +- minecode/management/commands/check_uri.py | 88 +- .../commands/create-scan-queue-worker-user.py | 18 +- minecode/management/commands/create-user.py | 12 +- minecode/management/commands/dump_purls.py | 14 +- .../commands/get_maven_release_dates.py | 40 +- minecode/management/commands/get_status.py | 58 +- minecode/management/commands/import_queue.py | 62 +- .../increase_scannableuri_priority.py | 24 +- .../commands/load_priority_queue.py | 33 +- .../management/commands/make_scannableuris.py | 16 +- minecode/management/commands/manage_scans.py | 80 +- minecode/management/commands/maven_crawler.py | 5 +- .../management/commands/priority_queue.py | 32 +- minecode/management/commands/remap.py | 31 +- minecode/management/commands/run_map.py | 62 +- minecode/management/commands/run_visit.py | 189 +- minecode/management/commands/seed.py | 41 +- .../commands/update_maven_package_data.py | 163 +- minecode/management/user_creation.py | 42 +- minecode/mappings/gcode_keywords.py | 867 ++++----- minecode/mappings/gcode_licenses.py | 37 +- .../mappings/gcode_programming_languages.py | 80 +- minecode/mappings/pypi_trove.py | 127 +- minecode/mappings/sfnet_licenses.py | 157 +- .../mappings/sfnet_programming_languages.py | 190 +- minecode/miners/__init__.py | 100 +- minecode/miners/apache.py | 330 ++-- minecode/miners/bitbucket.py | 151 +- minecode/miners/bower.py | 159 +- minecode/miners/cpan.py | 291 +-- minecode/miners/cran.py | 142 +- minecode/miners/debian.py | 387 ++-- minecode/miners/dockerhub.py | 143 +- minecode/miners/eclipse.py | 244 +-- minecode/miners/fdroid.py | 107 +- minecode/miners/fedora.py | 1 - minecode/miners/freebsd.py | 78 +- minecode/miners/freedesktop.py | 89 +- minecode/miners/github.py | 178 +- minecode/miners/gitlab.py | 77 +- minecode/miners/golang.py | 207 +- minecode/miners/googlecode.py | 231 ++- minecode/miners/gstreamer.py | 64 +- minecode/miners/haxe.py | 97 +- minecode/miners/java_stream.py | 15 +- minecode/miners/maven.py | 509 ++--- minecode/miners/npm.py | 68 +- minecode/miners/nuget.py | 214 ++- 
minecode/miners/openssl.py | 117 +- minecode/miners/openwrt.py | 137 +- minecode/miners/packagist.py | 103 +- minecode/miners/pypi.py | 206 +- minecode/miners/repodata.py | 180 +- minecode/miners/repodata_rpms.py | 42 +- minecode/miners/repomd.py | 65 +- minecode/miners/rubygems.py | 301 +-- minecode/miners/sourceforge.py | 121 +- minecode/miners/ubuntu.py | 2 - minecode/model_utils.py | 218 ++- minecode/models.py | 462 ++--- minecode/permissions.py | 7 +- minecode/route.py | 42 +- minecode/rsync.py | 62 +- minecode/saneyaml.py | 82 +- minecode/seed.py | 11 +- minecode/tasks.py | 3 +- minecode/tests/__init__.py | 1 - minecode/tests/collectors/test_conan.py | 22 +- minecode/tests/collectors/test_generic.py | 31 +- minecode/tests/collectors/test_gnu.py | 8 +- minecode/tests/collectors/test_maven.py | 445 ++--- minecode/tests/collectors/test_npm.py | 24 +- minecode/tests/miners/test_apache.py | 144 +- minecode/tests/miners/test_bitbucket.py | 122 +- minecode/tests/miners/test_bower.py | 56 +- minecode/tests/miners/test_cpan.py | 153 +- minecode/tests/miners/test_cran.py | 60 +- minecode/tests/miners/test_debian.py | 292 +-- minecode/tests/miners/test_dockerhub.py | 66 +- minecode/tests/miners/test_eclipse.py | 100 +- minecode/tests/miners/test_fdroid.py | 35 +- minecode/tests/miners/test_freebsd.py | 51 +- minecode/tests/miners/test_freedesktop.py | 53 +- minecode/tests/miners/test_github.py | 121 +- minecode/tests/miners/test_gitlab.py | 48 +- minecode/tests/miners/test_golang.py | 81 +- minecode/tests/miners/test_googlecode.py | 102 +- minecode/tests/miners/test_gstreamer.py | 53 +- minecode/tests/miners/test_haxe.py | 50 +- minecode/tests/miners/test_maven.py | 747 +++++--- minecode/tests/miners/test_npm.py | 123 +- minecode/tests/miners/test_nuget.py | 97 +- minecode/tests/miners/test_openssl.py | 59 +- minecode/tests/miners/test_openwrt.py | 74 +- minecode/tests/miners/test_packagist.py | 35 +- minecode/tests/miners/test_pypi.py | 173 +- minecode/tests/miners/test_repodata.py | 113 +- minecode/tests/miners/test_repodata_rpms.py | 16 +- minecode/tests/miners/test_repomd_parser.py | 337 ++-- minecode/tests/miners/test_rubygems.py | 270 +-- minecode/tests/miners/test_sourceforge.py | 90 +- minecode/tests/test_api.py | 257 +-- minecode/tests/test_command.py | 14 +- minecode/tests/test_filter.py | 14 +- minecode/tests/test_housekeeping.py | 91 +- minecode/tests/test_indexing.py | 93 +- minecode/tests/test_ls.py | 60 +- minecode/tests/test_migrations.py | 69 +- minecode/tests/test_model_utils.py | 63 +- minecode/tests/test_models.py | 289 ++- minecode/tests/test_priority_queue.py | 24 +- minecode/tests/test_route.py | 214 +-- minecode/tests/test_rsync.py | 318 ++- minecode/tests/test_run_map.py | 363 ++-- minecode/tests/test_run_visit.py | 365 ++-- minecode/tests/test_seed.py | 149 +- minecode/tests/test_tasks.py | 59 +- minecode/tests/test_utils.py | 44 +- minecode/tests/test_version.py | 316 ++- .../conan/zlib/manifest/conanfile.py | 54 +- minecode/utils.py | 188 +- minecode/utils_test.py | 109 +- minecode/version.py | 156 +- packagedb/api.py | 616 +++--- packagedb/api_custom.py | 3 +- packagedb/filters.py | 9 +- packagedb/from_purl.py | 11 +- .../commands/create_source_repo_packages.py | 5 +- .../management/commands/fix_purl_values.py | 85 +- .../management/commands/run_scheduler.py | 1 + .../management/commands/watch_packages.py | 6 +- packagedb/models.py | 449 +++-- packagedb/package_managers.py | 100 +- packagedb/schedules.py | 20 +- packagedb/serializers.py | 449 ++--- 
packagedb/tasks.py | 6 +- packagedb/tests/test_api.py | 1699 ++++++++--------- packagedb/tests/test_filters.py | 25 +- packagedb/tests/test_migrations.py | 18 +- packagedb/tests/test_models.py | 400 ++-- packagedb/tests/test_package_managers.py | 250 ++- packagedb/tests/test_schedules.py | 6 +- packagedb/tests/test_tasks.py | 1 + packagedb/tests/test_throttling.py | 37 +- packagedb/tests/test_views.py | 6 +- packagedb/throttling.py | 9 +- packagedb/to_purl.py | 3 +- purldb_project/__init__.py | 4 +- purldb_project/settings.py | 145 +- purldb_project/urls.py | 51 +- purldb_project/wsgi.py | 4 +- purldb_public_project/__init__.py | 5 +- purldb_public_project/settings.py | 5 +- purldb_public_project/urls.py | 29 +- purldb_public_project/wsgi.py | 5 +- 203 files changed, 12798 insertions(+), 12068 deletions(-) diff --git a/clearcode/cdutils.py b/clearcode/cdutils.py index 0469c4a3..5645cc41 100644 --- a/clearcode/cdutils.py +++ b/clearcode/cdutils.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # @@ -18,23 +17,21 @@ # limitations under the License. import base64 +import os +import time from hashlib import md5 from itertools import zip_longest -import os from os import path -import subprocess -import time -from urllib.parse import urlsplit -from urllib.parse import urlunsplit from urllib.parse import parse_qs from urllib.parse import quote_plus from urllib.parse import unquote_plus +from urllib.parse import urlsplit +from urllib.parse import urlunsplit import attr import click -from packageurl import PackageURL import requests - +from packageurl import PackageURL """ ClearlyDefined utlities. @@ -46,67 +43,65 @@ PACKAGE_TYPES_BY_CD_TYPE = { - 'crate': 'cargo', - 'deb': 'deb', - 'debsrc': 'deb', + "crate": "cargo", + "deb": "deb", + "debsrc": "deb", # Currently used only for maven packages - 'sourcearchive': 'maven', - 'maven': 'maven', - 'composer': 'composer', + "sourcearchive": "maven", + "maven": "maven", + "composer": "composer", # Currently used only for Github repo/packages - 'git': 'github', - 'pod': 'pod', - 'nuget': 'nuget', - 'pypi': 'pypi', - 'gem': 'gem', - 'npm': 'npm', - 'go': 'golang', + "git": "github", + "pod": "pod", + "nuget": "nuget", + "pypi": "pypi", + "gem": "gem", + "npm": "npm", + "go": "golang", } PACKAGE_TYPES_BY_PURL_TYPE = { - 'cargo': 'crate', - 'deb': 'deb', - 'maven': 'maven', - 'composer': 'composer', - 'github': 'git', - 'pod': 'pod', - 'nuget': 'nuget', - 'pypi': 'pypi', - 'gem': 'gem', - 'npm': 'npm', + "cargo": "crate", + "deb": "deb", + "maven": "maven", + "composer": "composer", + "github": "git", + "pod": "pod", + "nuget": "nuget", + "pypi": "pypi", + "gem": "gem", + "npm": "npm", } PROVIDERS_BY_PURL_TYPE = { - 'cargo': 'cratesio', - 'deb': 'debian', - 'maven': 'mavencentral', - 'composer': 'packagist', + "cargo": "cratesio", + "deb": "debian", + "maven": "mavencentral", + "composer": "packagist", # Currently used only for Github repo/packages - 'git': 'github', - 'github': 'github', - 'pod': 'cocoapods', - 'nuget': 'nuget', - 'pypi': 'pypi', - 'gem': 'rubygem', - 'npm': 'npmjs', + "git": "github", + "github": "github", + "pod": "cocoapods", + "nuget": "nuget", + "pypi": "pypi", + "gem": "rubygem", + "npm": "npmjs", } QUALIFIERS_BY_CD_TYPE = { - 'sourcearchive': {'classifier': 'sources'}, - 'debsrc': {'arch': 'source'} + "sourcearchive": {"classifier": "sources"}, + "debsrc": {"arch": "source"}, } @attr.s(slots=True) -class Coordinate(object): - """ - ClearlyDefined coordinates are used to identify 
any tracked component. - """ +class Coordinate: + """ClearlyDefined coordinates are used to identify any tracked component.""" - base_api_url = 'https://dev-api.clearlydefined.io' + base_api_url = "https://dev-api.clearlydefined.io" type = attr.ib() provider = attr.ib() @@ -115,15 +110,15 @@ class Coordinate(object): revision = attr.ib() def __attrs_post_init__(self, *args, **kwargs): - if self.provider == 'debian': - self.namespace = 'debian' + if self.provider == "debian": + self.namespace = "debian" if not self.namespace: - self.namespace = '-' + self.namespace = "-" @classmethod def from_dict(cls, coords): - if 'namespace' not in coords: - coords['namespace'] = '-' + if "namespace" not in coords: + coords["namespace"] = "-" return cls(**coords) def to_dict(self): @@ -169,20 +164,20 @@ def from_path(cls, pth, root=None): >>> assert expected == test """ - pth = pth.strip('/') + pth = pth.strip("/") if root and root in pth: - root = root.strip('/') + root = root.strip("/") _, _, pth = pth.partition(root) - segments = pth.strip('/').split('/') - if len(segments) >= 6 and segments[4] == 'revision': + segments = pth.strip("/").split("/") + if len(segments) >= 6 and segments[4] == "revision": # AZ blob style # /maven/mavencentral/io.dropwizard/dropwizard/revision/2.0.0-rc13.json # /maven/mavencentral/io.dropwizard/dropwizard/revision/2.0.0-rc13/tool/scancode/3.2.2.json start = segments[:4] version = segments[5] - if version.endswith('.json'): - version, _, _ = version.rpartition('.json') + if version.endswith(".json"): + version, _, _ = version.rpartition(".json") segments = start + [version] else: # plain API paths do not have a /revision/ segment @@ -190,47 +185,45 @@ def from_path(cls, pth, root=None): return cls(*segments) def to_api_path(self): - return '{type}/{provider}/{namespace}/{name}/{revision}'.format(**self.to_dict()) + return "{type}/{provider}/{namespace}/{name}/{revision}".format( + **self.to_dict() + ) def to_def_blob_path(self): - return '{type}/{provider}/{namespace}/{name}/revision/{revision}.json'.format(**self.to_dict()) + return "{type}/{provider}/{namespace}/{name}/revision/{revision}.json".format( + **self.to_dict() + ) def to_harvest_blob_path(self, tool, tool_version): - return '{type}/{provider}/{namespace}/{name}/revision/{revision}/tool/{tool}/{tool_version}.json'.format( - tool=tool, tool_version=tool_version, - **self.to_dict()) + return "{type}/{provider}/{namespace}/{name}/revision/{revision}/tool/{tool}/{tool_version}.json".format( + tool=tool, tool_version=tool_version, **self.to_dict() + ) def get_definition_api_url(self, base_api_url=None): - """ - Return a URL to fetch the full definition. - """ - return '{base_url}/definitions/{type}/{provider}/{namespace}/{name}/{revision}'.format( + """Return a URL to fetch the full definition.""" + return "{base_url}/definitions/{type}/{provider}/{namespace}/{name}/{revision}".format( base_url=base_api_url or self.base_api_url, path=self.to_api_path(), - **self.to_dict()) + **self.to_dict(), + ) def get_harvests_api_url(self, base_api_url=None): - """ - Return a URL to fetch all harvests at once. 
- """ - return '{base_url}/harvest/{type}/{provider}/{namespace}/{name}/{revision}?form=raw'.format( + """Return a URL to fetch all harvests at once.""" + return "{base_url}/harvest/{type}/{provider}/{namespace}/{name}/{revision}?form=raw".format( base_url=base_api_url or self.base_api_url, path=self.to_api_path(), - **self.to_dict()) + **self.to_dict(), + ) def to_def_query_api_url(self, include_revision=False, base_api_url=None): - """ - Return a CD API URL for query definitions. - """ - qs = 'type={type}&provider={provider}&name{name}' + """Return a CD API URL for query definitions.""" + qs = "type={type}&provider={provider}&name{name}" if include_revision: - qs += '&revision={revision}' - if self.namespace and self.namespace != '-': - qs += '&namespace={namespace}' - qs = qs.format( - base_url=base_api_url or self.base_api_url, - **self.to_dict()) - return '{base_url}/definitions?{qs}'.format(**locals()) + qs += "&revision={revision}" + if self.namespace and self.namespace != "-": + qs += "&namespace={namespace}" + qs = qs.format(base_url=base_api_url or self.base_api_url, **self.to_dict()) + return "{base_url}/definitions?{qs}".format(**locals()) def to_purl(self): """ @@ -250,15 +243,18 @@ def to_purl(self): """ converted_package_type = PACKAGE_TYPES_BY_CD_TYPE[self.type] - namespace = '' - if self.namespace != '-': + namespace = "" + if self.namespace != "-": namespace = self.namespace - if self.provider == 'debian': - namespace = 'debian' + if self.provider == "debian": + namespace = "debian" qualifiers = {} - if self.type in ('debsrc', 'sourcearchive',): + if self.type in ( + "debsrc", + "sourcearchive", + ): qualifiers = QUALIFIERS_BY_CD_TYPE[self.type] return PackageURL( @@ -293,14 +289,16 @@ def from_purl(cls, purl): package_type = p.type if package_type not in PACKAGE_TYPES_BY_PURL_TYPE: - raise Exception('Package type is not supported by ClearlyDefined: {}'.format(package_type)) + raise Exception( + f"Package type is not supported by ClearlyDefined: {package_type}" + ) # Handle the source types of Maven and Debian packages - if package_type == 'maven' and p.qualifiers.get('classifier', '') == 'sources': - package_type = 'sourcearchive' - provider = 'mavencentral' - elif package_type == 'deb' and p.qualifiers.get('arch', '') == 'source': - package_type = 'debsrc' - provider = 'debian' + if package_type == "maven" and p.qualifiers.get("classifier", "") == "sources": + package_type = "sourcearchive" + provider = "mavencentral" + elif package_type == "deb" and p.qualifiers.get("arch", "") == "source": + package_type = "debsrc" + provider = "debian" else: package_type = PACKAGE_TYPES_BY_PURL_TYPE[package_type] # TODO: Have way to set other providers? @@ -320,19 +318,21 @@ def get_coordinates(data_dir): Yield tuple of (path, Coordinate) from definition directories from `data_dir` at full depth. """ - data_dir = data_dir.strip('/') + data_dir = data_dir.strip("/") for dirpath, dirnames, _filenames in os.walk(data_dir, followlinks=False): for d in dirnames: pth = path.join(dirpath, d) _, _, cdpth = pth.partition(data_dir) - segments = cdpth.strip('/').split('/') + segments = cdpth.strip("/").split("/") # skip paths that have not the full depth required (e.g. 
5 segments) if not len(segments) == 5: continue yield pth, Coordinate.from_path(cdpth) -def _get_response_content(url, retries=2, wait=2, session=requests, verbose=False, _retries=set()): +def _get_response_content( + url, retries=2, wait=2, session=requests, verbose=False, _retries=set() +): """ Return a tuple of (etag, md5, content bytes) with the content as bytes or as decoded text if `as_text` is True) of the response of a GET HTTP request at `url`. @@ -340,7 +340,7 @@ def _get_response_content(url, retries=2, wait=2, session=requests, verbose=Fals `wait` seconds. """ if verbose: - click.echo(' --> Fetching: {url}'.format(**locals())) + click.echo(" --> Fetching: {url}".format(**locals())) response = session.get(url, timeout=600) status_code = response.status_code @@ -350,25 +350,39 @@ def _get_response_content(url, retries=2, wait=2, session=requests, verbose=Fals # to restart from an earlier continuation if url in _retries: _retries.remove(url) - print(' SUCCESS after Failure to fetch:', url) - etag = response.headers.get('etag') + print(" SUCCESS after Failure to fetch:", url) + etag = response.headers.get("etag") content = response.content checksum = md5(content).hexdigest() return etag, checksum, response.content - error_code = requests.codes.get(status_code) or '' + error_code = requests.codes.get(status_code) or "" if status_code >= 500 and retries: # timeout/522 or other server error: let's wait a bit and retry for "retries" number of retries retries -= 1 - print(' Failure to fetch:', url, 'with', status_code, error_code, 'retrying after waiting:', wait, 'seconds.') + print( + " Failure to fetch:", + url, + "with", + status_code, + error_code, + "retrying after waiting:", + wait, + "seconds.", + ) _retries.add(url) time.sleep(wait) return _get_response_content( - url=url, retries=retries, wait=wait, session=session, verbose=verbose) + url=url, retries=retries, wait=wait, session=session, verbose=verbose + ) # all other errors - raise Exception('Failed HTTP request for {url} : error: {status_code} : {error_code}'.format(**locals())) + raise Exception( + "Failed HTTP request for {url} : error: {status_code} : {error_code}".format( + **locals() + ) + ) def get_response_content(url, retries=2, wait=4, session=requests, verbose=False): @@ -378,25 +392,31 @@ def get_response_content(url, retries=2, wait=4, session=requests, verbose=False """ try: return _get_response_content( - url=url, retries=retries, wait=wait, - session=session, verbose=verbose) + url=url, retries=retries, wait=wait, session=session, verbose=verbose + ) except Exception as e: if retries: - print(' Failure to fetch:', url, 'with error:', e, 'and retrying after waiting:', wait, 'seconds.') + print( + " Failure to fetch:", + url, + "with error:", + e, + "and retrying after waiting:", + wait, + "seconds.", + ) # we sleep progressively more after each failure and up to wait seconds time.sleep(int(wait / (retries or 1))) retries -= 1 return get_response_content( - url=url, retries=retries, wait=wait, - session=session, verbose=verbose) + url=url, retries=retries, wait=wait, session=session, verbose=verbose + ) else: raise def split_url(url): - """ - Given a URL, return a tuple of URL elements where `query` is a mapping. 
-    """
+    """Given a URL, return a tuple of URL elements where `query` is a mapping."""
     scheme, netloc, path, query, fragment = urlsplit(url)
     query = parse_qs(query)
     return scheme, netloc, path, query, fragment
@@ -408,25 +428,24 @@ def join_qs(keys_values, do_not_quote=()):
     Quote values unless the name is in the `do_not_quote` set.
     """
     keys_values = {
-        k: (v[0] if v and isinstance(v, list) else v) for k, v in keys_values.items()}
-    return '&'.join('='.join([k, v if k in do_not_quote else quote_plus(v)])
-        for k, v in keys_values.items())
+        k: (v[0] if v and isinstance(v, list) else v) for k, v in keys_values.items()
+    }
+    return "&".join(
+        "=".join([k, v if k in do_not_quote else quote_plus(v)])
+        for k, v in keys_values.items()
+    )
 
 
 def append_path_to_url(url, extra_path):
-    """
-    Return a new `url` with `extra_path` appended to its path.
-    """
+    """Return a new `url` with `extra_path` appended to its path."""
     scheme, netloc, path, query, fragment = split_url(url)
-    path = path.strip('/') + '/' + extra_path.strip('/')
+    path = path.strip("/") + "/" + extra_path.strip("/")
     segments = scheme, netloc, path, join_qs(query), fragment
     return urlunsplit(segments)
 
 
 def update_url(url, qs_mapping, do_not_quote=()):
-    """
-    Return a new `url` with its query string updated from a mapping of key/value pairs.
-    """
+    """Return a new `url` with its query string updated from a mapping of key/value pairs."""
     scheme, netloc, path, query, fragment = split_url(url)
     query.update(qs_mapping)
     segments = scheme, netloc, path, join_qs(query, do_not_quote=do_not_quote), fragment
@@ -434,10 +453,8 @@
 
 def build_cdapi_continuation_url(api_url, continuation_token):
-    """
-    Return a new `api_url` with a CD API `continuation_token`.
-    """
-    return update_url(api_url, {'continuationToken': continuation_token})
+    """Return a new `api_url` with a CD API `continuation_token`."""
+    return update_url(api_url, {"continuationToken": continuation_token})
 
 
 def build_cdapi_continuation_url_from_coordinates(api_url, coordinates):
@@ -458,15 +475,15 @@ def split_cdapi_url(url):
     # get a continuation-free base URL. This assumes that the continuationToken
     # is always the last query string param if it is present.
     scheme, netloc, url, query, fragment = split_url(url)
-    token = query.pop('continuationToken', None)
+    token = query.pop("continuationToken", None)
     if token:
         token = token[0]
-        if '%' in token:
+        if "%" in token:
             token = unquote_plus(token)
     segments = scheme, netloc, url, join_qs(query), fragment
     unparsed = urlunsplit(segments)
     if TRACE:
-        print('split_cdapi_url:', 'unparsed:', unparsed, 'token:', token)
+        print("split_cdapi_url:", "unparsed:", unparsed, "token:", token)
     return unparsed, token
 
 
@@ -483,19 +500,17 @@ def get_coord_from_cdapi_continuation_url(api_url):
 
 
 def get_coord_from_cdapi_continuation(continuation):
-    """
-    Given an encoded continuation token, return a string of CD coordinates.
-    """
+    """Given an encoded continuation token, return a string of CD coordinates."""
     if TRACE:
-        print('get_coord_from_cdapi_continuation: continuation:', continuation)
-    continuation = continuation.replace(' ', '+')
+        print("get_coord_from_cdapi_continuation: continuation:", continuation)
+    continuation = continuation.replace(" ", "+")
 
-    if '%' in continuation:
+    if "%" in continuation:
         continuation = unquote_plus(continuation)
 
     decoded = base64.b64decode(continuation)
     if not isinstance(decoded, str):
-        decoded = decoded.decode('utf-8')
+        decoded = decoded.decode("utf-8")
 
     return decoded
 
@@ -506,10 +521,10 @@ def get_cdapi_continuation_token(coord):
     """
     if isinstance(coord, dict):
         coord = coord2str(coord)
-    coord = coord.replace(' ', '+')
-    encoded = coord.encode('utf-8')
+    coord = coord.replace(" ", "+")
+    encoded = coord.encode("utf-8")
 
-    return base64.b64encode(encoded).decode('utf-8')
+    return base64.b64encode(encoded).decode("utf-8")
 
 
 def str2coord(s):
@@ -521,17 +536,23 @@ def str2coord(s):
     URN: "urn:gem:rubygems:-:mocha:revision:1.7.0:tool:scancode:3.1.0"
     plain: /gem/rubygems/foo/mocha/1.7.0"
     """
-    #TODO: Add doctest
-    is_urn = s.startswith('urn')
-    is_url = s.startswith('cd:')
-    splitter = ':' if is_urn else '/'
+    # TODO: Add doctest
+    is_urn = s.startswith("urn")
+    is_url = s.startswith("cd:")
+    splitter = ":" if is_urn else "/"
     segments = s.strip(splitter).split(splitter)
     if is_urn or is_url:
         segments = segments[1:]
     # ignore extra segments for now beyond the first 5 (such as the PR of a curation)
     segments = segments[:5]
 
-    fields = ('type', 'provider', 'namespace', 'name', 'revision',)
+    fields = (
+        "type",
+        "provider",
+        "namespace",
+        "name",
+        "revision",
+    )
     return dict(zip_longest(fields, segments))
 
 
@@ -547,18 +568,17 @@ def coord2str(coord):
     "name": "license-expression",
     "revision": "70277cdfc186466667cb58ec9f9c7281e68a221b"
     """
-    assert coord, 'Empty or missing coordinate mapping: {}'.format(coord)
-    rev = coord.get('revision')
+    assert coord, f"Empty or missing coordinate mapping: {coord}"
+    rev = coord.get("revision")
     kwargs = dict(
-        t=coord['type'],
-        p=coord['provider'],
-        ns=coord.get('namespace') or '-',
-        n=coord['name'],
+        t=coord["type"],
+        p=coord["provider"],
+        ns=coord.get("namespace") or "-",
+        n=coord["name"],
         r=rev,
     )
     if rev:
-        template = '{t}/{p}/{ns}/{n}/{r}'
+        template = "{t}/{p}/{ns}/{n}/{r}"
     else:
-        template = '{t}/{p}/{ns}/{n}'
+        template = "{t}/{p}/{ns}/{n}"
     return template.format(**kwargs)
-
diff --git a/clearcode/load.py b/clearcode/load.py
index a2889d63..e4e3b0ab 100644
--- a/clearcode/load.py
+++ b/clearcode/load.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 #
 # Copyright (c) nexB Inc. and others. All rights reserved.
 #
@@ -16,16 +15,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
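# A small, illustrative round-trip with the coordinate helpers above
# (hypothetical values, shown only as a sketch):
#
#     >>> str2coord("cd:/gem/rubygems/-/mocha/1.7.0")
#     {'type': 'gem', 'provider': 'rubygems', 'namespace': '-', 'name': 'mocha', 'revision': '1.7.0'}
#     >>> coord2str(str2coord("cd:/gem/rubygems/-/mocha/1.7.0"))
#     'gem/rubygems/-/mocha/1.7.0'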
-import multiprocessing import os -from pathlib import Path import sys from django.db.utils import IntegrityError import click - """ Load ClearlyDefined definitions and harvests from the filesystem @@ -52,30 +48,32 @@ def walk_and_load_from_filesystem(input_dir, cd_root_dir): CDitem.path = npm/npmjs/@actions/github/revision/2.1.1.json.gz CDitem.content = 'the file: 2.1.1.json.gz in bytes' """ - # for now, we count dirs too file_counter = 1 for root, dirs, files in os.walk(input_dir): for filename in files: # output some progress - print(' ', end='\r') - print("Processing file #{}".format(file_counter), end='\r') - file_counter +=1 + print(" ", end="\r") + print(f"Processing file #{file_counter}", end="\r") + file_counter += 1 # TODO: check if the location is actually a CD data item. full_gzip_path = os.path.join(root, filename) - full_json_path = full_gzip_path.rstrip('.gz') + full_json_path = full_gzip_path.rstrip(".gz") # normalize the `path` value by removing the arbitrary parent directory cditem_rel_path = os.path.relpath(full_json_path, cd_root_dir) - with open(full_gzip_path, mode='rb') as f: + with open(full_gzip_path, mode="rb") as f: content = f.read() from clearcode import models + # Save to DB try: - cditem = models.CDitem.objects.create(path=cditem_rel_path, content=content) + cditem = models.CDitem.objects.create( + path=cditem_rel_path, content=content + ) except IntegrityError: # skip if we already have it in the DB continue @@ -87,41 +85,38 @@ def load(input_dir=None, cd_root_dir=None, *arg, **kwargs): creating CDItem objects and loading them into a PostgreSQL database. """ if not input_dir: - sys.exit('Please specify an input directory using the `--input-dir` option.') + sys.exit("Please specify an input directory using the `--input-dir` option.") if not cd_root_dir: - sys.exit('Please specify the cd-root-directory using the --cd-root-dir option.') + sys.exit("Please specify the cd-root-directory using the --cd-root-dir option.") # get proper DB setup walk_and_load_from_filesystem(input_dir, cd_root_dir) - print(' ', end='\r') + print(" ", end="\r") print("Loading complete") @click.command() - -@click.option('--input-dir', - type=click.Path(), metavar='DIR', - help='Load content from this input directory that contains a tree of gzip-compressed JSON CD files') - -@click.option('--cd-root-dir', - type=click.Path(), metavar='DIR', - help='specify root directory that contains a tree of gzip-compressed JSON CD files') - -@click.help_option('-h', '--help') - +@click.option( + "--input-dir", + type=click.Path(), + metavar="DIR", + help="Load content from this input directory that contains a tree of gzip-compressed JSON CD files", +) +@click.option( + "--cd-root-dir", + type=click.Path(), + metavar="DIR", + help="specify root directory that contains a tree of gzip-compressed JSON CD files", +) +@click.help_option("-h", "--help") def cli(input_dir=None, cd_root_dir=None, *arg, **kwargs): """ Handle ClearlyDefined gzipped JSON scans by walking a clearsync directory structure, creating CDItem objects and loading them into a PostgreSQL database. 
""" - load( - input_dir=input_dir, - cd_root_dir=cd_root_dir, - *arg, - **kwargs - ) + load(input_dir=input_dir, cd_root_dir=cd_root_dir, *arg, **kwargs) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/clearcode/management/commands/clearload.py b/clearcode/management/commands/clearload.py index c56c2f48..476b58bb 100644 --- a/clearcode/management/commands/clearload.py +++ b/clearcode/management/commands/clearload.py @@ -19,23 +19,22 @@ class Command(VerboseCommand): def add_arguments(self, parser): parser.add_argument( - '--input-dir', - dest='input_dir', + "--input-dir", + dest="input_dir", default=None, type=str, - help='Load content from this input directory that contains a tree of gzip-compressed JSON CD files') + help="Load content from this input directory that contains a tree of gzip-compressed JSON CD files", + ) parser.add_argument( - '--cd-root-dir', - dest='cd_root_dir', + "--cd-root-dir", + dest="cd_root_dir", default=None, type=str, - help='Specify root directory that contains a tree of gzip-compressed JSON CD files') + help="Specify root directory that contains a tree of gzip-compressed JSON CD files", + ) def handle(self, *args, **options): - input_dir = options.get('input_dir') - cd_root_dir = options.get('cd_root_dir') + input_dir = options.get("input_dir") + cd_root_dir = options.get("cd_root_dir") - load( - input_dir=input_dir, - cd_root_dir=cd_root_dir - ) + load(input_dir=input_dir, cd_root_dir=cd_root_dir) diff --git a/clearcode/management/commands/clearsync.py b/clearcode/management/commands/clearsync.py index ab7cf94a..4b8f74a6 100644 --- a/clearcode/management/commands/clearsync.py +++ b/clearcode/management/commands/clearsync.py @@ -20,76 +20,86 @@ class Command(VerboseCommand): def add_arguments(self, parser): parser.add_argument( - '--output-dir', - dest='output_dir', + "--output-dir", + dest="output_dir", default=None, type=str, - help='Save fetched content as compressed gzipped files to this output directory.') + help="Save fetched content as compressed gzipped files to this output directory.", + ) parser.add_argument( - '--save-to-db', - dest='save_to_db', - action='store_true', - help='Save fetched content as compressed gzipped blobs in the configured database.') + "--save-to-db", + dest="save_to_db", + action="store_true", + help="Save fetched content as compressed gzipped blobs in the configured database.", + ) parser.add_argument( - '--unsorted', - dest='unsorted', - action='store_true', - help='Fetch data without any sorting. The default is to fetch data sorting by latest updated first.') + "--unsorted", + dest="unsorted", + action="store_true", + help="Fetch data without any sorting. The default is to fetch data sorting by latest updated first.", + ) parser.add_argument( - '--base-api-url', - dest='base_api_url', - default='https://api.clearlydefined.io', - help='ClearlyDefined base API URL.') + "--base-api-url", + dest="base_api_url", + default="https://api.clearlydefined.io", + help="ClearlyDefined base API URL.", + ) parser.add_argument( - '--wait', - dest='wait', + "--wait", + dest="wait", default=60, type=int, - help='Set the number of seconds to wait for new or updated definitions ' - 'between two loops.') + help="Set the number of seconds to wait for new or updated definitions " + "between two loops.", + ) parser.add_argument( - '-n', - '--processes', - dest='processes', + "-n", + "--processes", + dest="processes", default=1, type=int, - help='Set the number of parallel processes to use. 
' - 'Disable parallel processing if 0.') + help="Set the number of parallel processes to use. " + "Disable parallel processing if 0.", + ) parser.add_argument( - '--max-def', - dest='max_def', + "--max-def", + dest="max_def", default=0, type=int, - help='Set the maximum number of definitions to fetch.') + help="Set the maximum number of definitions to fetch.", + ) parser.add_argument( - '--only-definitions', - dest='only_definitions', - action='store_true', - help='Only fetch definitions and no other data item.') + "--only-definitions", + dest="only_definitions", + action="store_true", + help="Only fetch definitions and no other data item.", + ) parser.add_argument( - '--log-file', - dest='log_file', + "--log-file", + dest="log_file", default=None, type=str, - help='Path to a file where to log fetched paths, one per line. ' - 'Log entries will be appended to this file if it exists.') + help="Path to a file where to log fetched paths, one per line. " + "Log entries will be appended to this file if it exists.", + ) parser.add_argument( - '--verbose', - dest='verbose', - action='store_true', - help='Display more verbose progress messages.') + "--verbose", + dest="verbose", + action="store_true", + help="Display more verbose progress messages.", + ) def handle(self, *args, **options): - output_dir = options.get('output_dir') - save_to_db = options.get('save_to_db') - base_api_url = options.get('base_api_url') - wait = options.get('wait') - processes = options.get('processes') - unsorted = options.get('unsorted') - log_file = options.get('log_file') - max_def = options.get('max_def') - only_definitions = options.get('only_definitions') - verbose = options.get('verbose') + output_dir = options.get("output_dir") + save_to_db = options.get("save_to_db") + base_api_url = options.get("base_api_url") + wait = options.get("wait") + processes = options.get("processes") + unsorted = options.get("unsorted") + log_file = options.get("log_file") + max_def = options.get("max_def") + only_definitions = options.get("only_definitions") + verbose = options.get("verbose") sync( output_dir=output_dir, @@ -101,5 +111,5 @@ def handle(self, *args, **options): log_file=log_file, max_def=max_def, only_definitions=only_definitions, - verbose=verbose + verbose=verbose, ) diff --git a/clearcode/management/commands/store_scans.py b/clearcode/management/commands/store_scans.py index 47212f1d..d102346e 100644 --- a/clearcode/management/commands/store_scans.py +++ b/clearcode/management/commands/store_scans.py @@ -12,13 +12,16 @@ class Command(VerboseCommand): - help = 'Store scancode scans in git repositories' + help = "Store scancode scans in git repositories" def add_arguments(self, parser): - parser.add_argument('work_dir', type=str) - parser.add_argument('--github_org', type=str, default="") - parser.add_argument('--count', type=int, default=0) + parser.add_argument("work_dir", type=str) + parser.add_argument("--github_org", type=str, default="") + parser.add_argument("--count", type=int, default=0) def handle(self, *args, **options): store_scancode_scans_from_cd_items( - work_dir=options['work_dir'], github_org=options['github_org'], count=options['count']) + work_dir=options["work_dir"], + github_org=options["github_org"], + count=options["count"], + ) diff --git a/clearcode/models.py b/clearcode/models.py index 0b96f2ee..871a4549 100644 --- a/clearcode/models.py +++ b/clearcode/models.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. 
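# Hypothetical invocations of the two management commands above (the manage
# script name is an assumption, not part of this patch):
#   python manage_purldb.py clearsync --save-to-db --processes 4
#   python manage_purldb.py store_scans /var/tmp/scan-repos --github_org example-org --count 100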
# @@ -25,9 +24,8 @@ class VirtualFileStore: - """ - Convenience wrapper to access CDitems as if they would be concrete files. - """ + """Convenience wrapper to access CDitems as if they would be concrete files.""" + @classmethod def walk(self, prefix=None, since=None): """ @@ -48,17 +46,17 @@ class CDitemQuerySet(models.QuerySet): def known_package_types(self): # These are the Package types that can be stored in the PackageDB KNOWN_PACKAGE_TYPES = [ - 'composer', - 'crate', - 'deb', - 'debsrc', - 'gem', - 'git', - 'maven', - 'npm', - 'nuget', - 'pypi', - 'sourcearchive', + "composer", + "crate", + "deb", + "debsrc", + "gem", + "git", + "maven", + "npm", + "nuget", + "pypi", + "sourcearchive", ] q_objs = models.Q() for package_type in KNOWN_PACKAGE_TYPES: @@ -66,10 +64,10 @@ def known_package_types(self): return self.filter(q_objs) def definitions(self): - return self.exclude(path__contains='/tool/') + return self.exclude(path__contains="/tool/") def scancode_harvests(self): - return self.filter(path__contains='tool/scancode') + return self.filter(path__contains="tool/scancode") def mappable(self): return self.filter(last_map_date__isnull=True, map_error__isnull=True) @@ -81,9 +79,7 @@ def mappable_scancode_harvests(self): return self.mappable().scancode_harvests().known_package_types() def modified_after(self, date): - """ - Limit the QuerySet to CDitems that were modified after a given `date`. - """ + """Limit the QuerySet to CDitems that were modified after a given `date`.""" return self.filter(last_modified_date__gt=date) @@ -93,8 +89,11 @@ class CDitem(models.Model): stored in ClearlyDefined blob storage and the value is a GZipped compressed JSON file content, stored as a binary bytes blob. """ - path = models.CharField(primary_key=True, max_length=2048, - help_text='Path to the original file in the ClearlyDefined file storage.' + + path = models.CharField( + primary_key=True, + max_length=2048, + help_text="Path to the original file in the ClearlyDefined file storage.", ) uuid = models.UUIDField( @@ -103,12 +102,10 @@ class CDitem(models.Model): editable=False, ) - content = models.BinaryField( - help_text='Actual gzipped JSON content.' - ) + content = models.BinaryField(help_text="Actual gzipped JSON content.") last_modified_date = models.DateTimeField( - help_text='Date and time that this record was last modified.', + help_text="Date and time that this record was last modified.", auto_now=True, # Automatically set to now on object save() ) @@ -116,24 +113,22 @@ class CDitem(models.Model): null=True, blank=True, db_index=True, - help_text='Timestamp set to the date of the last mapping. ' - 'Used to track mapping status.', + help_text="Timestamp set to the date of the last mapping. " + "Used to track mapping status.", ) map_error = models.TextField( null=True, blank=True, - help_text='Mapping errors messages. When present this means the mapping failed.', + help_text="Mapping errors messages. When present this means the mapping failed.", ) objects = CDitemQuerySet.as_manager() @property def data(self): - """ - Return the data content deserialized from the content field. 
- """ + """Return the data content deserialized from the content field.""" uncompressed_content = gzip.decompress(self.content) if not uncompressed_content: - uncompressed_content = '{}' + uncompressed_content = "{}" return json.loads(uncompressed_content) diff --git a/clearcode/store_scans.py b/clearcode/store_scans.py index fed1e4f4..0304f193 100644 --- a/clearcode/store_scans.py +++ b/clearcode/store_scans.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # @@ -17,18 +16,22 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json +import os from collections import defaultdict -from clearcode.models import CDitem -from clearcode.cdutils import Coordinate -from clearcode.cdutils import str2coord -from django.db.models import Q from hashlib import sha512 -import json -import requests -from packageurl import PackageURL from pathlib import Path + +from django.db.models import Q + +import requests from git import Repo -import os +from packageurl import PackageURL + +from clearcode.cdutils import Coordinate +from clearcode.cdutils import str2coord +from clearcode.models import CDitem + """ The input is a bunch of scans from ClearlyDefined and the output is a bunch of git repositories with commited and @@ -66,16 +69,17 @@ # Create hex values of integers and ignore the 0x prefix repo_names = [hex(hash)[2:].zfill(3) for hash in range(4096)] + def store_scancode_scans_from_cd_items(work_dir, github_org="", count=0): """ - Iterate over CDItem objects with scancode scans. + Iterate over CDItem objects with scancode scans. Save and commit them in git repositories in work dir. Process a maximum of count items and process all items if count is 0 """ - cd_items = CDitem.objects.filter(~Q(content=b''), path__contains="tool/scancode") + cd_items = CDitem.objects.filter(~Q(content=b""), path__contains="tool/scancode") if count: - cd_items = cd_items[:count] + cd_items = cd_items[:count] for purl_hash, cd_items in get_cd_item_by_purl_hash(cd_items=cd_items).items(): commit_count = 0 for cd_item in cd_items: @@ -88,20 +92,24 @@ def store_scancode_scans_from_cd_items(work_dir, github_org="", count=0): scancode_scan = data.get("content") if not scancode_scan: continue - repo = get_or_init_repo(repo_name=purl_hash, work_dir=work_dir, repo_namespace=github_org, user_name=github_org, pull=False) + repo = get_or_init_repo( + repo_name=purl_hash, + work_dir=work_dir, + repo_namespace=github_org, + user_name=github_org, + pull=False, + ) purl = coordinate.to_purl() if add_scancode_scan(scancode_scan=scancode_scan, purl=purl, repo=repo): commit_count += 1 if commit_count % 10 == 0: print(".", end="") - origin = repo.remote(name='origin') + origin = repo.remote(name="origin") origin.push() def get_cd_item_by_purl_hash(cd_items): - """ - Return a mapping of {purl_hash: [CDItem,....]} - """ + """Return a mapping of {purl_hash: [CDItem,....]}""" cd_item_by_purl_hash = defaultdict(list) for cd_item in cd_items: data = cd_item.data @@ -124,7 +132,7 @@ def add_scancode_scan(repo, purl, scancode_scan): purl_data_dir = get_or_create_dir_for_purl(purl=purl, repo=repo) scancode_scan_path = purl_data_dir / "scancode-toolkit-scan.json" with open(scancode_scan_path, "w") as f: - json.dump(scancode_scan,f,indent=2) + json.dump(scancode_scan, f, indent=2) if repo.is_dirty(): repo.index.add([scancode_scan_path]) @@ -138,13 +146,14 @@ def is_valid_coordinate(coordinate): def get_or_create_dir_for_purl(purl, repo): """ - 
Return a path to a directory for this purl,
     in this git repo.
     """
     purl_dir = repo.working_dir / get_purl_path(purl)
     purl_dir.mkdir(parents=True, exist_ok=True)
     return purl_dir
 
+
 def get_purl_path(purl):
     purl_path = Path(purl.type)
     if purl.namespace:
@@ -152,10 +161,8 @@ def get_purl_path(purl):
     return purl_path / purl.name / purl.version
 
 
-def get_purl_hash(purl: PackageURL, length: int=3) -> str:
-    """
-    Return a short lower cased hash of a purl.
-    """
+def get_purl_hash(purl: PackageURL, length: int = 3) -> str:
+    """Return a short lower cased hash of a purl."""
     # This function takes a PackageURL object and an optional length parameter.
     # It returns a short hash of the purl. The length of the hash is determined by the length parameter.
     # The default length is 3. The function first converts the purl to bytes and then computes the sha512 hash of the purl.
@@ -166,10 +173,16 @@ def get_purl_hash(purl: PackageURL, length: int=3) -> str:
     return short_hash.lower()
 
 
-def get_or_init_repo(repo_name: str, work_dir: Path, repo_namespace: str= "", user_name: str = "", pull=False):
+def get_or_init_repo(
+    repo_name: str,
+    work_dir: Path,
+    repo_namespace: str = "",
+    user_name: str = "",
+    pull=False,
+):
     """
-    Return a repo object for repo name and namespace
-    and store it in the work dir. Clone if it does not
+    Return a repo object for repo name and namespace
+    and store it in the work dir. Clone if it does not
     exist; optionally pull the latest if it does exist.
     """
     # TODO: Manage org repo name
@@ -186,7 +199,9 @@ def get_or_init_repo(repo_name: str, work_dir: Path, repo_namespace: str= "", us
     return repo
 
 
-def get_scan_download_url(namespace:str, purl:str, scan_file_name: str = "scancode-toolkit-scan.json"):
+def get_scan_download_url(
+    namespace: str, purl: str, scan_file_name: str = "scancode-toolkit-scan.json"
+):
     purl_hash = get_purl_hash(purl=purl)
     purl_path = get_purl_path(purl)
     return f"https://raw.githubusercontent.com/{namespace}/{purl_hash}/main/{purl_path}/{scan_file_name}"
 
 
 def create_github_repo(repo_name, token=os.getenv("GH_TOKEN")):
     headers = {
-        'Authorization': f'token {token}',
-        'Accept': 'application/vnd.github.v3+json'
+        "Authorization": f"token {token}",
+        "Accept": "application/vnd.github.v3+json",
     }
 
     data = {
-        'name': repo_name,
+        "name": repo_name,
     }
 
-    url = 'https://api.github.com/user/repos'
+    url = "https://api.github.com/user/repos"
 
     response = requests.post(url, headers=headers, json=data)
 
@@ -218,18 +233,18 @@ def get_github_repos(user_name, token=os.getenv("GH_TOKEN")):
     """
     Yield full repo names for a user or org name,
     use the optional ``token`` if provided.
Full repo name is in the form user or org name / repo name
     """
-    headers = {
-        'Accept': 'application/vnd.github.v3+json'
-    }
+    headers = {"Accept": "application/vnd.github.v3+json"}
 
     if token:
-        headers['Authorization'] = f'token {token}'
+        headers["Authorization"] = f"token {token}"
 
-    url = f'https://api.github.com/users/{user_name}/repos'
+    url = f"https://api.github.com/users/{user_name}/repos"
 
     response = requests.get(url, headers=headers)
     # TODO: We need to have a way to handle failures from the GH API
     if not response.status_code == 200:
-        raise Exception(f"HTTP {response.status_code}: Failed to get repos for {user_name}")
+        raise Exception(
+            f"HTTP {response.status_code}: Failed to get repos for {user_name}"
+        )
     data = response.json()
     for repo_data in data:
diff --git a/clearcode/sync.py b/clearcode/sync.py
index 6b39d526..4b1d8098 100644
--- a/clearcode/sync.py
+++ b/clearcode/sync.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 #
 # Copyright (c) nexB Inc. and others. All rights reserved.
 #
@@ -17,21 +16,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from datetime import datetime
 import gzip
 import json
-from multiprocessing import pool
 import os
-from os import path
 import time
+from datetime import datetime
+from multiprocessing import pool
+from os import path
 
-import click
 from django.utils import timezone
+
+import click
 import requests
 
 from clearcode import cdutils
 
-
 """
 Fetch the latest definitions and harvests from ClearlyDefined
 
@@ -67,18 +66,18 @@
 known_types = (
     # fake empty type
     None,
-    'npm',
-    'git',
-    'pypi',
-    'composer',
-    'maven',
-    'gem',
-    'nuget',
-    'sourcearchive',
-    'deb',
-    'debsrc',
-    'crate',
-    'pod',
+    "npm",
+    "git",
+    "pypi",
+    "composer",
+    "maven",
+    "gem",
+    "nuget",
+    "sourcearchive",
+    "deb",
+    "debsrc",
+    "crate",
+    "pod",
 )
 
 
@@ -87,8 +86,14 @@
 
 def fetch_and_save_latest_definitions(
-    base_api_url, cache, output_dir=None, save_to_db=False,
-    by_latest=True, retries=2, verbose=True):
+    base_api_url,
+    cache,
+    output_dir=None,
+    save_to_db=False,
+    by_latest=True,
+    retries=2,
+    verbose=True,
+):
     """
     Fetch ClearlyDefined definitions and paginate through.
     Save these as blobs to data_dir.
@@ -97,26 +102,32 @@ def fetch_and_save_latest_definitions(
     Otherwise, the order is not specified.
     NOTE: these do not contain file details (but the harvests do)
     """
-    assert output_dir or save_to_db, 'You must select one of the --output-dir or --save-to-db options.'
+    assert (
+        output_dir or save_to_db
+    ), "You must select one of the --output-dir or --save-to-db options."
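+    # Build the /definitions listing URL; when by_latest is set, the API is
+    # asked to sort by releaseDate descending so the newest definitions are
+    # returned first, then the pages are walked via continuation tokens.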
-    definitions_url = cdutils.append_path_to_url(base_api_url, extra_path='definitions')
+    definitions_url = cdutils.append_path_to_url(base_api_url, extra_path="definitions")
     if by_latest:
-        definitions_url = cdutils.update_url(definitions_url, qs_mapping=dict(sort='releaseDate', sortDesc='true'))
+        definitions_url = cdutils.update_url(
+            definitions_url, qs_mapping=dict(sort="releaseDate", sortDesc="true")
+        )
 
-    for content in fetch_definitions(api_url=definitions_url, cache=cache, retries=retries, verbose=TRACE):
+    for content in fetch_definitions(
+        api_url=definitions_url, cache=cache, retries=retries, verbose=TRACE
+    ):
         # content is a batch of 100 definitions
-        definitions = content and content.get('data')
+        definitions = content and content.get("data")
         if not definitions:
             if verbose:
-                print(' No more data for: {}'.format(definitions_url))
+                print(f" No more data for: {definitions_url}")
             break
 
         if verbose:
-            first = cdutils.coord2str(definitions[0]['coordinates'])
-            last = cdutils.coord2str(definitions[-1]['coordinates'])
-            print('Fetched definitions from :', first, 'to:', last, flush=True)
+            first = cdutils.coord2str(definitions[0]["coordinates"])
+            last = cdutils.coord2str(definitions[-1]["coordinates"])
+            print("Fetched definitions from :", first, "to:", last, flush=True)
         else:
-            print('.', end='', flush=True)
+            print(".", end="", flush=True)
 
         savers = []
         if save_to_db:
@@ -126,11 +137,14 @@
 
         # we received a batch of definitions: let's save each as a Gzipped JSON
         for definition in definitions:
-            coordinate = cdutils.Coordinate.from_dict(definition['coordinates'])
+            coordinate = cdutils.Coordinate.from_dict(definition["coordinates"])
             for saver in savers:
                 blob_path, _size = save_def(
-                    coordinate=coordinate, content=definition, output_dir=output_dir,
-                    saver=saver)
+                    coordinate=coordinate,
+                    content=definition,
+                    output_dir=output_dir,
+                    saver=saver,
+                )
             yield coordinate, blob_path
 
 
@@ -147,7 +161,7 @@ def fetch_definitions(api_url, cache, retries=1, verbose=True):
     The structure of the REST payload is a list :
         {"data": [{}, ...], "continuationToken": ""}
     """
-    assert '/definitions' in api_url
+    assert "/definitions" in api_url
     content = None
     errors_count = 0
     max_errors = 5
@@ -159,7 +173,9 @@
             content = json.loads(content)
 
         except requests.exceptions.ConnectionError as ex:
-            print('!!!!!!!!!!!!!!!!!! -> Request failed, retrying:', api_url, 'with:', ex)
+            print(
+                "!!!!!!!!!!!!!!!!!! -> Request failed, retrying:", api_url, "with:", ex
+            )
             errors_count += 1
             if errors_count <= max_errors:
                 # wait and retry, sleeping more each time we get some error
@@ -168,14 +184,14 @@
             else:
                 raise
 
-        continuation_token = ''
+        continuation_token = ""
         if content:
             yield content
-            continuation_token = content.get('continuationToken', '')
+            continuation_token = content.get("continuationToken", "")
 
         if not continuation_token:
             if verbose:
-                print(' No more data for: {}'.format(api_url))
+                print(f" No more data for: {api_url}")
             break
 
         api_url = cdutils.build_cdapi_continuation_url(api_url, continuation_token)
@@ -187,9 +203,9 @@ def compress(content):
     `content` is either a string or a JSON-serializable data structure.
""" if isinstance(content, str): - content = content.encode('utf-8') + content = content.encode("utf-8") else: - content = json.dumps(content , separators=(',', ':')).encode('utf-8') + content = json.dumps(content, separators=(",", ":")).encode("utf-8") return gzip.compress(content, compresslevel=9) @@ -198,11 +214,11 @@ def file_saver(content, blob_path, output_dir, **kwargs): Save `content` bytes (or dict or string) as gzip compressed bytes to `file_path`. Return the length of the written payload or 0 if it existed and was not updated. """ - file_path = path.join(output_dir, blob_path + '.gz') + file_path = path.join(output_dir, blob_path + ".gz") compressed = compress(content) if path.exists(file_path): - with open(file_path , 'rb') as ef: + with open(file_path, "rb") as ef: existing = ef.read() if existing == compressed: return 0 @@ -210,9 +226,9 @@ def file_saver(content, blob_path, output_dir, **kwargs): parent_dir = path.dirname(file_path) os.makedirs(parent_dir, exist_ok=True) - with open(file_path , 'wb') as oi: + with open(file_path, "wb") as oi: if TRACE: - print('Saving:', blob_path) + print("Saving:", blob_path) oi.write(compressed) return len(compressed) @@ -233,12 +249,12 @@ def db_saver(content, blob_path, **kwargs): cditem.content = compressed cditem.save() if TRACE: - print('Updating content for:', blob_path) + print("Updating content for:", blob_path) else: return 0 else: if TRACE: - print('Adding content for:', blob_path) + print("Adding content for:", blob_path) return len(compressed) @@ -254,8 +270,7 @@ def save_def(coordinate, content, output_dir, saver=file_saver): return blob_path, saver(content=content, output_dir=output_dir, blob_path=blob_path) -def save_harvest( - coordinate, tool, tool_version, content, output_dir, saver=file_saver): +def save_harvest(coordinate, tool, tool_version, content, output_dir, saver=file_saver): """ Save the scan `content` bytes (or dict or string) for `tool` `tool_version` of `coordinate` object to `output_dir` using blob paths conventions. @@ -267,19 +282,28 @@ def save_harvest( def fetch_and_save_harvests( - coordinate, cache, output_dir=None, save_to_db=False, retries=2, - session=session, verbose=True): + coordinate, + cache, + output_dir=None, + save_to_db=False, + retries=2, + session=session, + verbose=True, +): """ Fetch all the harvests for `coordinate` Coordinate object and save them in `outputdir` using blob-style paths, one file for each harvest/scan. (Note: Return a tuple of (etag, md5, url) for usage as a callback) """ - assert output_dir or save_to_db, 'You must select one of the --output-dir or --save-to-db options.' + assert ( + output_dir or save_to_db + ), "You must select one of the --output-dir or --save-to-db options." 
url = coordinate.get_harvests_api_url() etag, checksum, content = cache.get_content( - url, retries=retries, session=session, with_cache_keys=True) + url, retries=retries, session=session, with_cache_keys=True + ) if content: savers = [] @@ -289,9 +313,9 @@ def fetch_and_save_harvests( savers.append(file_saver) if verbose: - print(' Fetched harvest for:', coordinate.to_api_path(), flush=True) + print(" Fetched harvest for:", coordinate.to_api_path(), flush=True) else: - print('.', end='', flush=True) + print(".", end="", flush=True) for tool, versions in json.loads(content).items(): for tool_version, harvest in versions.items(): @@ -302,15 +326,14 @@ def fetch_and_save_harvests( tool_version=tool_version, content=harvest, output_dir=output_dir, - saver=saver) + saver=saver, + ) return etag, checksum, url -class Cache(object): - """ - A caching object for etags and checksums to avoid refetching things. - """ +class Cache: + """A caching object for etags and checksums to avoid refetching things.""" def __init__(self, max_size=100 * 1000): self.etags_cache = {} @@ -324,16 +347,14 @@ def is_unchanged_remotely(self, url, session=session): """ try: response = session.head(url) - remote_etag = response.headers.get('etag') + remote_etag = response.headers.get("etag") if remote_etag and self.etags_cache.get(url) == remote_etag: return True except: return False def is_fetched(self, checksum, url): - """ - Return True if the content checksum exists for url, using MD5 checksum. - """ + """Return True if the content checksum exists for url, using MD5 checksum.""" return url and checksum and self.checksums_cache.get(checksum) == url def add(self, etag, checksum, url): @@ -346,9 +367,7 @@ def add_args(self, args): self.add(*args) def trim(self): - """ - Trim the cache to its max size. - """ + """Trim the cache to its max size.""" def _resize(cache): extra_items = len(cache) - self.max_size @@ -368,7 +387,8 @@ def get_content(self, url, retries=1, session=session, with_cache_keys=False): return etag, checksum, content = cdutils.get_response_content( - url, retries=retries, session=session) + url, retries=retries, session=session + ) if not content: return @@ -384,26 +404,36 @@ def get_content(self, url, retries=1, session=session, with_cache_keys=False): return content def copy(self): - """ - Return a deep copy of self - """ + """Return a deep copy of self""" cache = Cache(self.max_size) cache.checksums_cache = dict(self.checksums_cache) cache.etags_cache = dict(self.etags_cache) return cache -def sync(output_dir=None, save_to_db=False, - base_api_url='https://api.clearlydefined.io', - wait=60, processes=1, unsorted=False, - log_file=None, max_def=0, only_definitions=False, session=session, - verbose=False, *arg, **kwargs): +def sync( + output_dir=None, + save_to_db=False, + base_api_url="https://api.clearlydefined.io", + wait=60, + processes=1, + unsorted=False, + log_file=None, + max_def=0, + only_definitions=False, + session=session, + verbose=False, + *arg, + **kwargs, +): """ Fetch the latest definitions and harvests from ClearlyDefined and save these as gzipped JSON either as as files in output-dir or in a PostgreSQL database. Loop forever after waiting some seconds between each cycles. """ - assert output_dir or save_to_db, 'You must select at least one of the --output-dir or --save-to-db options.' + assert ( + output_dir or save_to_db + ), "You must select at least one of the --output-dir or --save-to-db options." 
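    # One sync cycle: page through the latest definitions, save each through
    # the configured savers (file and/or database), and optionally fan out the
    # matching harvest fetches to a process pool; the etag/md5 cache avoids
    # refetching unchanged content across cycles.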
fetch_harvests = not only_definitions @@ -421,7 +451,7 @@ def sync(output_dir=None, save_to_db=False, log_file_fn = None if log_file: - log_file_fn = open(log_file, 'a') + log_file_fn = open(log_file, "a") try: if fetch_harvests: @@ -441,7 +471,9 @@ def sync(output_dir=None, save_to_db=False, if def_type: # get latest with a "type" query - def_api_url = cdutils.update_url(base_api_url, qs_mapping=dict(type=def_type)) + def_api_url = cdutils.update_url( + base_api_url, qs_mapping=dict(type=def_type) + ) else: # do nothing if we have no type def_api_url = base_api_url @@ -452,16 +484,17 @@ def sync(output_dir=None, save_to_db=False, save_to_db=save_to_db, cache=cache, by_latest=not unsorted, - verbose=verbose) + verbose=verbose, + ) for coordinate, file_path in definitions: - cycle_defs_count += 1 if log_file: - log_file_fn.write(file_path.partition('.gz')[0] + '\n') + log_file_fn.write(file_path.partition(".gz")[0] + "\n") - if TRACE: print(' Saved def for:', coordinate) + if TRACE: + print(" Saved def for:", coordinate) if fetch_harvests: kwds = dict( @@ -472,17 +505,19 @@ def sync(output_dir=None, save_to_db=False, # subprocess, the data is best not shared to avoid # any sync issue cache=cache.copy(), - verbose=verbose) + verbose=verbose, + ) harvest_fetchers.apply_async( - fetch_and_save_harvests, - kwds=kwds, - callback=cache.add_args) + fetch_and_save_harvests, kwds=kwds, callback=cache.add_args + ) if max_def and max_def <= cycle_defs_count: break - if max_def and (max_def <= cycle_defs_count or max_def <= total_defs_count): + if max_def and ( + max_def <= cycle_defs_count or max_def <= total_defs_count + ): break total_defs_count += cycle_defs_count @@ -490,24 +525,42 @@ def sync(output_dir=None, save_to_db=False, total_duration += cycle_duration if not sleeping: - print('Saved', cycle_defs_count, 'defs and harvests,', - 'in:', int(cycle_duration), 'sec.') - - print('TOTAL cycles:', cycles, - 'with:', total_defs_count, 'defs and combined harvests,', - 'in:', int(total_duration), 'sec.') - - print('Cycle completed at:', datetime.utcnow().isoformat(), - 'Sleeping for', wait, 'seconds...') + print( + "Saved", + cycle_defs_count, + "defs and harvests,", + "in:", + int(cycle_duration), + "sec.", + ) + + print( + "TOTAL cycles:", + cycles, + "with:", + total_defs_count, + "defs and combined harvests,", + "in:", + int(total_duration), + "sec.", + ) + + print( + "Cycle completed at:", + datetime.utcnow().isoformat(), + "Sleeping for", + wait, + "seconds...", + ) else: - print('.', end='') + print(".", end="") sleeping = True time.sleep(wait) cache.trim() except KeyboardInterrupt: - click.secho('\nAborted with Ctrl+C!', fg='red', err=True) + click.secho("\nAborted with Ctrl+C!", fg="red", err=True) return finally: @@ -518,66 +571,97 @@ def sync(output_dir=None, save_to_db=False, harvest_fetchers.close() harvest_fetchers.terminate() - print('TOTAL cycles:', cycles, - 'with:', total_defs_count, 'defs and combined harvests,', - 'in:', int(total_duration), 'sec.') + print( + "TOTAL cycles:", + cycles, + "with:", + total_defs_count, + "defs and combined harvests,", + "in:", + int(total_duration), + "sec.", + ) @click.command() - -@click.option('--output-dir', - type=click.Path(), metavar='DIR', - help='Save fetched content as compressed gzipped files to this output directory.') - -@click.option('--save-to-db', +@click.option( + "--output-dir", + type=click.Path(), + metavar="DIR", + help="Save fetched content as compressed gzipped files to this output directory.", +) +@click.option( + 
"--save-to-db", is_flag=True, - help='Save fetched content as compressed gzipped blobs in the configured database.') - -@click.option('--unsorted', + help="Save fetched content as compressed gzipped blobs in the configured database.", +) +@click.option( + "--unsorted", is_flag=True, - help='Fetch data without any sorting. The default is to fetch data sorting by latest updated first.') - -@click.option('--base-api-url', + help="Fetch data without any sorting. The default is to fetch data sorting by latest updated first.", +) +@click.option( + "--base-api-url", type=str, - default='https://api.clearlydefined.io', show_default=True, - help='ClearlyDefined base API URL.') - -@click.option('--wait', - type=int, metavar='INT', - default=60, show_default=True, - help='Set the number of seconds to wait for new or updated definitions ' - 'between two loops.') - -@click.option('-n', '--processes', - type=int, metavar='INT', - default=1, show_default=True, - help='Set the number of parallel processes to use. ' - 'Disable parallel processing if 0.') - -@click.option('--max-def', - type=int, metavar='INT', + default="https://api.clearlydefined.io", + show_default=True, + help="ClearlyDefined base API URL.", +) +@click.option( + "--wait", + type=int, + metavar="INT", + default=60, + show_default=True, + help="Set the number of seconds to wait for new or updated definitions " + "between two loops.", +) +@click.option( + "-n", + "--processes", + type=int, + metavar="INT", + default=1, + show_default=True, + help="Set the number of parallel processes to use. " + "Disable parallel processing if 0.", +) +@click.option( + "--max-def", + type=int, + metavar="INT", default=0, - help='Set the maximum number of definitions to fetch.') - -@click.option('--only-definitions', - is_flag=True, - help='Only fetch definitions and no other data item.') - -@click.option('--log-file', - type=click.Path(), default=None, - help='Path to a file where to log fetched paths, one per line. ' - 'Log entries will be appended to this file if it exists.') - -@click.option('--verbose', + help="Set the maximum number of definitions to fetch.", +) +@click.option( + "--only-definitions", is_flag=True, - help='Display more verbose progress messages.') - -@click.help_option('-h', '--help') -def cli(output_dir=None, save_to_db=False, - base_api_url='https://api.clearlydefined.io', - wait=60, processes=1, unsorted=False, - log_file=None, max_def=0, only_definitions=False, session=session, - verbose=False, *arg, **kwargs): + help="Only fetch definitions and no other data item.", +) +@click.option( + "--log-file", + type=click.Path(), + default=None, + help="Path to a file where to log fetched paths, one per line. 
" + "Log entries will be appended to this file if it exists.", +) +@click.option("--verbose", is_flag=True, help="Display more verbose progress messages.") +@click.help_option("-h", "--help") +def cli( + output_dir=None, + save_to_db=False, + base_api_url="https://api.clearlydefined.io", + wait=60, + processes=1, + unsorted=False, + log_file=None, + max_def=0, + only_definitions=False, + session=session, + verbose=False, + *arg, + **kwargs, +): """ Fetch the latest definitions and harvests from ClearlyDefined and save these as gzipped JSON either as as files in output-dir or in a PostgreSQL @@ -600,5 +684,5 @@ def cli(output_dir=None, save_to_db=False, ) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/clearcode/tests/test_models.py b/clearcode/tests/test_models.py index 541ac8bf..ddaf6524 100644 --- a/clearcode/tests/test_models.py +++ b/clearcode/tests/test_models.py @@ -10,15 +10,14 @@ class CDitemManagerModifiedAfterTestCase(TestCase): - def setUp(self): - self.cditem0 = CDitem.objects.create(path='npm/name/version') + self.cditem0 = CDitem.objects.create(path="npm/name/version") def test_modified_after_1_day_old(self): test_date = datetime.datetime.now() - datetime.timedelta(days=1) self.assertIsNotNone(CDitem.objects.modified_after(test_date)) self.assertEqual(1, len(CDitem.objects.modified_after(test_date))) - + def test_modified_after_1_week_old(self): test_date = datetime.datetime.now() - datetime.timedelta(days=7) self.assertIsNotNone(CDitem.objects.modified_after(test_date)) @@ -28,7 +27,7 @@ def test_modified_after_1_day_new(self): test_date = datetime.datetime.now() + datetime.timedelta(days=1) self.assertIsNotNone(CDitem.objects.modified_after(test_date)) self.assertEqual(0, len(CDitem.objects.modified_after(test_date))) - + def test_modified_after_1_week_new(self): test_date = datetime.datetime.now() + datetime.timedelta(days=7) self.assertIsNotNone(CDitem.objects.modified_after(test_date)) @@ -38,66 +37,86 @@ def test_modified_after_1_week_new(self): class CDitemManagerTestCase(TestCase): def test_known_package_types(self): # This path starts with npm, which is known - cditem_1 = CDitem.objects.create(path='npm/name/version') + cditem_1 = CDitem.objects.create(path="npm/name/version") # asdf is not a proper type - cditem_2 = CDitem.objects.create(path='asdf/name/version') + cditem_2 = CDitem.objects.create(path="asdf/name/version") cditems = list(CDitem.objects.known_package_types()) self.assertEqual(1, len(cditems)) cditem = cditems[0] self.assertEqual(cditem_1, cditem) def test_definitions(self): - expected_definition = CDitem.objects.create(path='composer/packagist/yoast/wordpress-seo/revision/9.5-RC3.json') + expected_definition = CDitem.objects.create( + path="composer/packagist/yoast/wordpress-seo/revision/9.5-RC3.json" + ) # harvest should not be in cditems - harvest = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json') + harvest = CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json" + ) cditems = list(CDitem.objects.definitions()) self.assertEqual(1, len(cditems)) definition = cditems[0] self.assertEqual(expected_definition, definition) def test_scancode_harvests(self): - expected_harvest = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json') + expected_harvest = CDitem.objects.create( + 
path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json" + ) # unexpected_harvest should not be in cditems - unexpected_harvest = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/licensee/9.13.0.json') + unexpected_harvest = CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/licensee/9.13.0.json" + ) harvests = list(CDitem.objects.scancode_harvests()) self.assertEqual(1, len(harvests)) harvest = harvests[0] self.assertEqual(expected_harvest, harvest) def test_mappable(self): - definition_1 = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json') + definition_1 = CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json" + ) definition_2 = CDitem.objects.create( - path='sourcearchive/mavencentral/io.quarkus/quarkus-jsonb/revision/0.26.1.json', + path="sourcearchive/mavencentral/io.quarkus/quarkus-jsonb/revision/0.26.1.json", last_map_date=timezone.now(), - map_error='error' + map_error="error", + ) + harvest = CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json" ) - harvest = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json') mappables = list(CDitem.objects.mappable()) self.assertEqual(2, len(mappables)) self.assertIn(definition_1, mappables) self.assertIn(harvest, mappables) def test_mappable_definitions(self): - definition_1 = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json') + definition_1 = CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json" + ) definition_2 = CDitem.objects.create( - path='sourcearchive/mavencentral/io.quarkus/quarkus-jsonb/revision/0.26.1.json', + path="sourcearchive/mavencentral/io.quarkus/quarkus-jsonb/revision/0.26.1.json", last_map_date=timezone.now(), - map_error='error' + map_error="error", + ) + harvest = CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json" ) - harvest = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json') mappables = list(CDitem.objects.mappable_definitions()) self.assertEqual(1, len(mappables)) definition = mappables[0] self.assertEqual(definition_1, definition) def test_mappable_scancode_harvests(self): - harvest_1 = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json') + harvest_1 = CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json" + ) harvest_2 = CDitem.objects.create( - path='sourcearchive/mavencentral/io.cucumber/cucumber-core/revision/5.0.0-RC1/tool/scancode/3.2.2.json', + path="sourcearchive/mavencentral/io.cucumber/cucumber-core/revision/5.0.0-RC1/tool/scancode/3.2.2.json", last_map_date=timezone.now(), - map_error='error' + map_error="error", + ) + definition_1 = CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json" ) - definition_1 = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json') mappables = list(CDitem.objects.mappable_scancode_harvests()) self.assertEqual(1, len(mappables)) harvest = mappables[0] diff --git a/clearcode/tests/test_sync.py b/clearcode/tests/test_sync.py index 83bb7ce8..97a409b3 100644 --- a/clearcode/tests/test_sync.py +++ 
b/clearcode/tests/test_sync.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # @@ -21,7 +20,6 @@ import json from django.test import TestCase -from django.utils import timezone from clearcode.models import CDitem from clearcode.sync import db_saver @@ -29,12 +27,12 @@ class SyncDbsaverTestCase(TestCase): def setUp(self): - self.test_path = 'composer/packagist/yoast/wordpress-seo/revision/9.5-RC3.json' - self.test_content = {'test': 'content'} + self.test_path = "composer/packagist/yoast/wordpress-seo/revision/9.5-RC3.json" + self.test_content = {"test": "content"} self.cditem0 = CDitem.objects.create( - path=self.test_path, - content=gzip.compress(json.dumps(self.test_content).encode('utf-8')), + path=self.test_path, + content=gzip.compress(json.dumps(self.test_content).encode("utf-8")), ) def test_db_saver_identical_path(self): @@ -42,5 +40,5 @@ def test_db_saver_identical_path(self): self.assertEqual(1, len(CDitem.objects.all())) def test_db_saver_different_path(self): - db_saver(content=self.test_content, blob_path='new/blob/path.json') + db_saver(content=self.test_content, blob_path="new/blob/path.json") self.assertEqual(2, len(CDitem.objects.all())) diff --git a/clearindex/harvest.py b/clearindex/harvest.py index 2cf4ef42..a152c7db 100644 --- a/clearindex/harvest.py +++ b/clearindex/harvest.py @@ -13,12 +13,10 @@ from django.db import transaction from django.utils import timezone -from packagedb.models import Package -from packagedb.models import Resource - from minecode.model_utils import merge_packages from minecode.utils import stringify_null_purl_fields - +from packagedb.models import Package +from packagedb.models import Resource logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) @@ -31,13 +29,13 @@ def get_resource_license_expressions(file_data): with a newline separating each or None if there are no license_expression statements in the scan data. """ - license_expressions = file_data.get('license_expressions', []) or [] + license_expressions = file_data.get("license_expressions", []) or [] if license_expressions == []: return expressions = set(list(expression for expression in license_expressions)) - return '\n'.join(expressions) + return "\n".join(expressions) def get_resource_copyright_statements(file_data): @@ -45,62 +43,60 @@ def get_resource_copyright_statements(file_data): Return a string that contains all the copyright statements (deduped), with a newline separating each or None if there are no copyright statements in the scan data. """ - copyrights = file_data.get('copyrights', []) or [] + copyrights = file_data.get("copyrights", []) or [] if copyrights == []: return - statements = set(list(copyright.get('value') for copyright in copyrights)) + statements = set(list(copyright.get("value") for copyright in copyrights)) - return '\n'.join(statements) + return "\n".join(statements) -def create_from_harvest(package_scan={}, files_data=[], cditem_path=''): - """ - Return a Package object, created or updated via a ScanCode-Toolkit "package" scan. 
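
# A small illustrative reimplementation (demo_ name, not the module's API) of
# the dedupe-and-join pattern used by get_resource_license_expressions() and
# get_resource_copyright_statements() above: collect values, dedupe with a
# set, join with newlines, and return None when there is nothing to report.
# Sorting is added here for deterministic output; the originals join an
# unordered set.
def demo_join_unique(values):
    values = [v for v in (values or []) if v]
    if not values:
        return None
    return "\n".join(sorted(set(values)))


demo_file_data = {"copyrights": [{"value": "Copyright (c) nexB Inc."}]}
demo_statements = demo_join_unique(
    [c.get("value") for c in demo_file_data.get("copyrights", []) or []]
)
assert demo_statements == "Copyright (c) nexB Inc."
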
- """ +def create_from_harvest(package_scan={}, files_data=[], cditem_path=""): + """Return a Package object, created or updated via a ScanCode-Toolkit "package" scan.""" fields = ( - 'type', - 'namespace', - 'name', - 'version', - 'qualifiers', - 'subpath', - 'primary_language', - 'description', - 'keywords', - 'homepage_url', - 'download_url', - 'size', - 'sha1', - 'md5', - 'sha256', - 'sha512', - 'bug_tracking_url', - 'code_view_url', - 'vcs_url', - 'copyright', - 'license_expression', - 'declared_license', - 'notice_text', - 'source_packages', + "type", + "namespace", + "name", + "version", + "qualifiers", + "subpath", + "primary_language", + "description", + "keywords", + "homepage_url", + "download_url", + "size", + "sha1", + "md5", + "sha256", + "sha512", + "bug_tracking_url", + "code_view_url", + "vcs_url", + "copyright", + "license_expression", + "declared_license", + "notice_text", + "source_packages", ) - package_data = {field_name: package_scan.get( - field_name) for field_name in fields} + package_data = {field_name: package_scan.get(field_name) for field_name in fields} stringify_null_purl_fields(package_data) - pkg_type = package_data.get('type') - namespace = package_data.get('namespace') - name = package_data.get('name') - version = package_data.get('version') - qualifiers = package_data.get('qualifiers') - subpath = package_data.get('subpath') + pkg_type = package_data.get("type") + namespace = package_data.get("namespace") + name = package_data.get("name") + version = package_data.get("version") + qualifiers = package_data.get("qualifiers") + subpath = package_data.get("subpath") - download_url = package_data.get('download_url') + download_url = package_data.get("download_url") if not download_url: logger.error( - 'Null `download_url` value for `package_data`: {}'.format(package_data)) + f"Null `download_url` value for `package_data`: {package_data}" + ) return # This ugly block is needed until https://github.com/nexB/packagedb/issues/14 @@ -113,57 +109,53 @@ def create_from_harvest(package_scan={}, files_data=[], cditem_path=''): version=version, qualifiers=qualifiers, subpath=subpath, - download_url=download_url + download_url=download_url, ) # Merge package records if it already exists merge_packages( - existing_package=package, - new_package_data=package_data, - replace=False + existing_package=package, new_package_data=package_data, replace=False ) package.append_to_history( - 'Updated package from CDitem harvest: {}'.format(cditem_path)) + f"Updated package from CDitem harvest: {cditem_path}" + ) - logger.info( - 'Merged package data from scancode harvest: {}'.format(package)) + logger.info(f"Merged package data from scancode harvest: {package}") except Package.DoesNotExist: try: package = Package.objects.get(download_url=download_url) # Merge package records if it already exists merge_packages( - existing_package=package, - new_package_data=package_data, - replace=False + existing_package=package, new_package_data=package_data, replace=False ) package.append_to_history( - 'Updated package from CDitem harvest: {}'.format(cditem_path)) + f"Updated package from CDitem harvest: {cditem_path}" + ) - logger.info( - 'Merged package data from scancode harvest: {}'.format(package)) + logger.info(f"Merged package data from scancode harvest: {package}") except Package.DoesNotExist: package = Package.objects.create(**package_data) package.append_to_history( - 'Created package from CDitem harvest: {}'.format(cditem_path)) + f"Created package from CDitem harvest: {cditem_path}" 
+ ) - logger.info( - 'Created package from scancode harvest: {}'.format(package)) + logger.info(f"Created package from scancode harvest: {package}") # Now, add resources to the Package. for f in files_data: - path = f.get('path') - is_file = f.get('type', '') == 'file' + path = f.get("path") + is_file = f.get("type", "") == "file" copyright = get_resource_copyright_statements(f) license_expression = get_resource_license_expressions(f) file_data = dict( package=package, path=path, - size=f.get('size'), - sha1=f.get('sha1'), - md5=f.get('md5'), - sha256=f.get('sha256'), - git_sha1=f.get('git_sha1'), + size=f.get("size"), + sha1=f.get("sha1"), + md5=f.get("md5"), + sha256=f.get("sha256"), + git_sha1=f.get("git_sha1"), is_file=is_file, copyright=copyright, license_expression=license_expression, @@ -187,28 +179,26 @@ def map_scancode_harvest(cditem): try: harvest_data = cditem.data except ValueError: - err_msg = 'CDitemError: empty content field for CDitem: {}'.format( - cditem.path) + err_msg = f"CDitemError: empty content field for CDitem: {cditem.path}" logger.error(err_msg) cditem.map_error = err_msg cditem.save() return 0 - content = harvest_data.get('content', {}) or {} - files_data = content.get('files', []) or [] - summary = content.get('summary', {}) or {} - packages = summary.get('packages', []) or [] + content = harvest_data.get("content", {}) or {} + files_data = content.get("files", []) or [] + summary = content.get("summary", {}) or {} + packages = summary.get("packages", []) or [] for package_scan in packages: # Check if there is a valid download url. Missing download_url values are # considered map_errors, as a Package object cannot have a `Null` # download_url value. - download_url = package_scan.get('download_url') + download_url = package_scan.get("download_url") if not download_url: - purl = package_scan.get('purl') - err_msg = 'CDitemError: empty download_url for package: {}'.format( - purl) + purl = package_scan.get("purl") + err_msg = f"CDitemError: empty download_url for package: {purl}" logger.error(err_msg) cditem.map_error = err_msg diff --git a/clearindex/management/commands/run_clearindex.py b/clearindex/management/commands/run_clearindex.py index 9af70702..1f6b341f 100644 --- a/clearindex/management/commands/run_clearindex.py +++ b/clearindex/management/commands/run_clearindex.py @@ -27,13 +27,12 @@ from clearcode.models import CDitem from clearindex import harvest -from minecode.management.commands import get_error_message from minecode.management.commands import VerboseCommand +from minecode.management.commands import get_error_message from minecode.model_utils import merge_packages from minecode.utils import stringify_null_purl_fields from packagedb.models import Package - TRACE = False logger = logging.getLogger(__name__) @@ -48,9 +47,7 @@ def stop_handler(*args, **kwargs): - """ - Signal handler to set global variable to True. 
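
# An assumed minimal shape (illustrative values only) of the scancode harvest
# payload that map_scancode_harvest() above walks: package summaries live
# under content.summary.packages and per-file details under content.files.
demo_harvest_data = {
    "content": {
        "files": [
            {
                "path": "package/index.js",
                "type": "file",
                "license_expressions": ["mit"],
                "copyrights": [{"value": "Copyright (c) example authors"}],
            }
        ],
        "summary": {
            "packages": [
                {
                    "purl": "pkg:npm/name@version",
                    "download_url": "https://example.com/name-version.tgz",
                }
            ]
        },
    }
}

demo_content = demo_harvest_data.get("content", {}) or {}
demo_packages = (demo_content.get("summary", {}) or {}).get("packages", []) or []
assert demo_packages[0]["download_url"]
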
- """ + """Signal handler to set global variable to True.""" global MUST_STOP MUST_STOP = True @@ -62,42 +59,43 @@ def stop_handler(*args, **kwargs): PACKAGE_TYPES_BY_CD_TYPE = { - 'crate': 'cargo', - 'deb': 'deb', - 'debsrc': 'deb', + "crate": "cargo", + "deb": "deb", + "debsrc": "deb", # Currently used only for maven packages - 'sourcearchive': 'maven', - 'maven': 'maven', - 'composer': 'composer', + "sourcearchive": "maven", + "maven": "maven", + "composer": "composer", # Currently used only for Github repo/packages - 'git': 'github', - 'pod': 'pod', - 'nuget': 'nuget', - 'pypi': 'pypi', - 'gem': 'gem', + "git": "github", + "pod": "pod", + "nuget": "nuget", + "pypi": "pypi", + "gem": "gem", } # TODO: Update with more Package types when scancode-toolkit is updated PACKAGE_TYPES_WITH_GET_URLS = { - 'maven': maven.get_urls, - 'npm': npm.get_urls, - 'pypi': pypi.get_pypi_urls, - 'gem': rubygems.get_urls, - 'nuget': nuget.get_urls, + "maven": maven.get_urls, + "npm": npm.get_urls, + "pypi": pypi.get_pypi_urls, + "gem": rubygems.get_urls, + "nuget": nuget.get_urls, } class Command(VerboseCommand): - help = 'Run a mapping worker.' + help = "Run a mapping worker." def add_arguments(self, parser): parser.add_argument( - '--exit-on-empty', - dest='exit_on_empty', + "--exit-on-empty", + dest="exit_on_empty", default=False, - action='store_true', - help='Do not loop forever. Exit when the queue is empty.') + action="store_true", + help="Do not loop forever. Exit when the queue is empty.", + ) def handle(self, *args, **options): """ @@ -107,32 +105,34 @@ def handle(self, *args, **options): global MUST_STOP logger.setLevel(self.get_verbosity(**options)) - exit_on_empty = options.get('exit_on_empty') + exit_on_empty = options.get("exit_on_empty") sleeping = False created_packages_count = 0 - logger.info('Running ClearIndex') + logger.info("Running ClearIndex") while True: if MUST_STOP: - logger.info('Graceful exit of the map loop.') + logger.info("Graceful exit of the map loop.") break mappable_definitions = CDitem.objects.mappable_definitions()[ - :MAP_BATCH_SIZE] + :MAP_BATCH_SIZE + ] mappable_scancode_harvests = CDitem.objects.mappable_scancode_harvests()[ - :MAP_BATCH_SIZE] + :MAP_BATCH_SIZE + ] try: if not mappable_definitions and not mappable_scancode_harvests: if exit_on_empty: - logger.info('No mappable CDitem, exiting...') + logger.info("No mappable CDitem, exiting...") break # Only log a single message when we go to sleep if not sleeping: sleeping = True - logger.info('No mappable CDitem, sleeping...') + logger.info("No mappable CDitem, sleeping...") time.sleep(SLEEP_WHEN_EMPTY) continue @@ -155,7 +155,7 @@ def handle(self, *args, **options): logger.error(e) break - msg = '{}: {} Packages processed.' + msg = "{}: {} Packages processed." 
msg = msg.format(timezone.now(), created_packages_count) logger.info(msg) @@ -177,8 +177,7 @@ def map_definition(cditem): cditem.save() return package except Exception as e: - msg = 'Error: Failed to map while processing CDitem: {}\n'.format( - repr(cditem.path)) + msg = f"Error: Failed to map while processing CDitem: {repr(cditem.path)}\n" msg += get_error_message(e) logger.error(msg) cditem.map_error = msg @@ -186,55 +185,56 @@ def map_definition(cditem): def get_coords_des_and_lic_from_def(definition): - return definition.get('coordinates', {}), definition.get('described', {}), definition.get('licensed', {}) + return ( + definition.get("coordinates", {}), + definition.get("described", {}), + definition.get("licensed", {}), + ) # CD_TYPES_WITH_SOURCE = ('debsrc', 'npm', 'sourcearchive',) def get_or_create_package_from_cditem_definition(cditem): - """ - Create a Package from a CDitem definition or return a Package if it already exists - """ + """Create a Package from a CDitem definition or return a Package if it already exists""" definition = cditem.data if not definition: - raise Exception('No data available for this definition') - coordinates, described, licensed = get_coords_des_and_lic_from_def( - definition) + raise Exception("No data available for this definition") + coordinates, described, licensed = get_coords_des_and_lic_from_def(definition) - download_url = described.get('urls', {}).get('download', '') + download_url = described.get("urls", {}).get("download", "") if not download_url: # We use our data to create a Package in order to form the download_url, since we do not have the download_url for the Package # We need to have a unique download URL for every Package download_url = create_download_url_from_coords(coordinates) if not download_url: - raise Exception('No download URL is available for this definition') + raise Exception("No download URL is available for this definition") - if download_url.startswith('http://central.maven.org'): - split_download_url = download_url.rsplit('http://central.maven.org') + if download_url.startswith("http://central.maven.org"): + split_download_url = download_url.rsplit("http://central.maven.org") if len(split_download_url) == 2: - download_url = 'https://repo1.maven.org' + split_download_url[1] + download_url = "https://repo1.maven.org" + split_download_url[1] stringify_null_purl_fields(coordinates) - namespace = coordinates.get('namespace') - namespace = namespace if namespace != '-' else '' - name = coordinates.get('name') - version = coordinates.get('revision') - package_type = coordinates.get('type') - converted_package_type = PACKAGE_TYPES_BY_CD_TYPE.get( - package_type) or package_type + namespace = coordinates.get("namespace") + namespace = namespace if namespace != "-" else "" + name = coordinates.get("name") + version = coordinates.get("revision") + package_type = coordinates.get("type") + converted_package_type = PACKAGE_TYPES_BY_CD_TYPE.get(package_type) or package_type # TODO: Source packages need to be updated for clearlydefined, link source packages to binary packages - hashes = described.get('hashes', {}) - sha1 = hashes.get('sha1') - sha256 = hashes.get('sha256') - homepage_url = described.get('projectWebsite') - release_date = described.get('releaseDate') - declared_license = licensed.get('declared') + hashes = described.get("hashes", {}) + sha1 = hashes.get("sha1") + sha256 = hashes.get("sha256") + homepage_url = described.get("projectWebsite") + release_date = described.get("releaseDate") + declared_license = 
licensed.get("declared")
     normalized_license_expression = licensing.get_normalized_expression(
-        declared_license)
+        declared_license
+    )
 
     copyrights = get_parties_from_licensed(licensed)
-    copyrights = '\n'.join(copyrights)
+    copyrights = "\n".join(copyrights)
 
     definition_mining_level = 0
     existing_package = None
@@ -258,12 +258,13 @@
             declared_license=declared_license,
             license_expression=normalized_license_expression,
             copyright=copyrights,
-            mining_level=definition_mining_level
+            mining_level=definition_mining_level,
         )
         # log history if package was created
         if created:
             package.append_to_history(
-                'Created package from CDitem definition: {}'.format(cditem.path))
+                f"Created package from CDitem definition: {cditem.path}"
+            )
 
     else:
         # TODO: This is temporary until we fold clearindex into minecode mapping
@@ -286,37 +287,35 @@
         merge_packages(
             existing_package=existing_package,
             new_package_data=new_package_data,
-            replace=True
+            replace=True,
         )
         package = existing_package
         package.append_to_history(
-            'Updated package from CDitem definition: {}'.format(cditem.path))
+            f"Updated package from CDitem definition: {cditem.path}"
+        )
 
     return package
 
 
 def is_scancode_scan(harvest):
-    return harvest.get('_metadata', {}).get('type', '') == 'scancode'
+    return harvest.get("_metadata", {}).get("type", "") == "scancode"
 
 
 def create_download_url_from_coords(coord):
-    """
-    Return a download URL for a supported Package from Coordinates `coord`
-    """
-    ptype = coord.get('type')
-    namespace = coord.get('namespace')
-    name = coord.get('name')
-    version = coord.get('revision')
+    """Return a download URL for a supported Package from Coordinates `coord`"""
+    ptype = coord.get("type")
+    namespace = coord.get("namespace")
+    name = coord.get("name")
+    version = coord.get("revision")
 
     package_type = PACKAGE_TYPES_BY_CD_TYPE.get(ptype)
     if not package_type:
-        raise Exception(
-            'Unsupported ClearlyDefined package type: {}'.format(ptype))
+        raise Exception(f"Unsupported ClearlyDefined package type: {ptype}")
 
     get_urls = PACKAGE_TYPES_WITH_GET_URLS.get(package_type)
     if get_urls:
         urls = get_urls(namespace=namespace, name=name, version=version)
-        return urls['repository_download_url']
+        return urls["repository_download_url"]
 
 
 def str2coord(s):
@@ -330,21 +329,31 @@ def str2coord(s):
     plain: /gem/rubygems/foo/mocha/1.7.0"
     """
     from itertools import izip_longest
-    is_urn = s.startswith('urn')
-    is_url = s.startswith('cd:')
-    splitter = ':' if is_urn else '/'
+
+    is_urn = s.startswith("urn")
+    is_url = s.startswith("cd:")
+    splitter = ":" if is_urn else "/"
     segments = s.strip(splitter).split(splitter)
     if is_urn or is_url:
         segments = segments[1:]
     # ignore extra segments for now beyond the 5 first (such as the PR of a curation)
-    fields = ('type', 'provider', 'namespace', 'name', 'revision',)
+    fields = (
+        "type",
+        "provider",
+        "namespace",
+        "name",
+        "revision",
+    )
 
     return dict(izip_longest(fields, segments))
 
 
 def get_parties_from_licensed(licensed):
-    """
-    Return a list of Copyright statements from `licensed`, if available
-    """
-    return licensed.get('facets', {}).get('core', {}).get('attribution', {}).get('parties', [])
+    """Return a list of Copyright statements from `licensed`, if available"""
+    return (
+        licensed.get("facets", {})
+        .get("core", {})
+        .get("attribution", {})
+        .get("parties", [])
+    )
diff --git a/clearindex/utils.py b/clearindex/utils.py
index dcf19ba4..7c57d8d8 100644
--- 
a/clearindex/utils.py +++ b/clearindex/utils.py @@ -7,19 +7,15 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from unittest import TestCase -import logging import ntpath import os import posixpath -import traceback +from unittest import TestCase -from django.core.management.base import BaseCommand from django.test import TestCase as DjangoTestCase from minecode.utils_test import JsonBasedTesting - """ The conventions used for the tests are: - for tests that require files these are stored in the testfiles directory @@ -30,7 +26,7 @@ class BaseTestCase(TestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") @classmethod def get_test_loc(cls, path): @@ -44,13 +40,11 @@ def get_test_loc(cls, path): class ClearIndexTestCase(JsonBasedTesting, BaseTestCase, DjangoTestCase): - databases = '__all__' + databases = "__all__" def to_os_native_path(path): - """ - Normalize a path to use the native OS path separator. - """ + """Normalize a path to use the native OS path separator.""" path = path.replace(posixpath.sep, os.path.sep) path = path.replace(ntpath.sep, os.path.sep) path = path.rstrip(os.path.sep) diff --git a/manage_matchcode.py b/manage_matchcode.py index 872f8398..bfaed621 100755 --- a/manage_matchcode.py +++ b/manage_matchcode.py @@ -11,10 +11,8 @@ import os import sys - -if __name__ == '__main__': +if __name__ == "__main__": from django.core.management import execute_from_command_line - os.environ.setdefault('DJANGO_SETTINGS_MODULE', - 'matchcode_project.settings') + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "matchcode_project.settings") execute_from_command_line(sys.argv) diff --git a/manage_purldb.py b/manage_purldb.py index 2dbe57a9..62029b84 100755 --- a/manage_purldb.py +++ b/manage_purldb.py @@ -11,9 +11,8 @@ import os import sys - -if __name__ == '__main__': +if __name__ == "__main__": from django.core.management import execute_from_command_line - os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'purldb_project.settings') + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "purldb_project.settings") execute_from_command_line(sys.argv) diff --git a/matchcode/api.py b/matchcode/api.py index 98762547..1edebbc4 100644 --- a/matchcode/api.py +++ b/matchcode/api.py @@ -9,8 +9,13 @@ from django.db.models import Q from django.forms import widgets from django.forms.fields import MultipleChoiceField + from django_filters.filters import MultipleChoiceFilter from django_filters.rest_framework import FilterSet +from matchcode_toolkit.fingerprinting import create_halohash_chunks +from matchcode_toolkit.fingerprinting import hexstring_to_binarray +from matchcode_toolkit.fingerprinting import split_fingerprint +from matchcode_toolkit.halohash import byte_hamming_distance from rest_framework.decorators import action from rest_framework.response import Response from rest_framework.serializers import CharField @@ -21,49 +26,35 @@ from rest_framework.serializers import Serializer from rest_framework.viewsets import ReadOnlyModelViewSet -from matchcode_toolkit.fingerprinting import create_halohash_chunks -from matchcode_toolkit.fingerprinting import hexstring_to_binarray -from matchcode_toolkit.fingerprinting import split_fingerprint -from matchcode_toolkit.halohash import byte_hamming_distance -from matchcode.models import ExactFileIndex -from matchcode.models import ExactPackageArchiveIndex from matchcode.models import ApproximateDirectoryContentIndex from matchcode.models import 
ApproximateDirectoryStructureIndex +from matchcode.models import ExactFileIndex +from matchcode.models import ExactPackageArchiveIndex class BaseFileIndexSerializer(ModelSerializer): - sha1 = CharField(source='fingerprint') + sha1 = CharField(source="fingerprint") package = HyperlinkedRelatedField( - view_name='api:package-detail', - lookup_field='uuid', - read_only=True + view_name="api:package-detail", lookup_field="uuid", read_only=True ) class ExactFileIndexSerializer(BaseFileIndexSerializer): class Meta: model = ExactFileIndex - fields = ( - 'sha1', - 'package' - ) + fields = ("sha1", "package") class ExactPackageArchiveIndexSerializer(BaseFileIndexSerializer): class Meta: model = ExactPackageArchiveIndex - fields = ( - 'sha1', - 'package' - ) + fields = ("sha1", "package") class BaseDirectoryIndexSerializer(ModelSerializer): fingerprint = ReadOnlyField() package = HyperlinkedRelatedField( - view_name='api:package-detail', - lookup_field='uuid', - read_only=True + view_name="api:package-detail", lookup_field="uuid", read_only=True ) @@ -71,8 +62,8 @@ class ApproximateDirectoryContentIndexSerializer(BaseDirectoryIndexSerializer): class Meta: model = ApproximateDirectoryContentIndex fields = ( - 'fingerprint', - 'package', + "fingerprint", + "package", ) @@ -80,8 +71,8 @@ class ApproximateDirectoryStructureIndexSerializer(BaseDirectoryIndexSerializer) class Meta: model = ApproximateDirectoryStructureIndex fields = ( - 'fingerprint', - 'package', + "fingerprint", + "package", ) @@ -89,9 +80,7 @@ class BaseDirectoryIndexMatchSerializer(Serializer): fingerprint = CharField() matched_fingerprint = CharField() package = HyperlinkedRelatedField( - view_name='api:package-detail', - lookup_field='uuid', - read_only=True + view_name="api:package-detail", lookup_field="uuid", read_only=True ) similarity_score = FloatField() @@ -104,22 +93,19 @@ class CharMultipleWidget(widgets.TextInput): def value_from_datadict(self, data, files, name): value = widgets.SelectMultiple().value_from_datadict(data, files, name) - if not value or value == ['']: - return '' + if not value or value == [""]: + return "" return value def format_value(self, value): - """ - Return a value as it should appear when rendered in a template. - """ - return ', '.join(value) + """Return a value as it should appear when rendered in a template.""" + return ", ".join(value) class MultipleCharField(MultipleChoiceField): - """ - Overrides `MultipleChoiceField` to fit in `MultipleCharFilter`. - """ + """Overrides `MultipleChoiceField` to fit in `MultipleCharFilter`.""" + widget = CharMultipleWidget def valid_value(self, value): @@ -127,9 +113,8 @@ def valid_value(self, value): class MultipleCharFilter(MultipleChoiceFilter): - """ - Filters on multiple values for a CharField type using `?field=a&field=b` URL syntax. 
- """ + """Filters on multiple values for a CharField type using `?field=a&field=b` URL syntax.""" + field_class = MultipleCharField @@ -145,7 +130,7 @@ def filter(self, qs, value): predicate = self.get_filter_predicate(value) old_field_name = next(iter(predicate)) - new_field_name = f'{old_field_name}__in' + new_field_name = f"{old_field_name}__in" predicate[new_field_name] = predicate[old_field_name] predicate.pop(old_field_name) @@ -198,9 +183,9 @@ def filter(self, qs, value): chunk1=chunk1, chunk2=chunk2, chunk3=chunk3, - chunk4=chunk4 + chunk4=chunk4, ), - Q.OR + Q.OR, ) return qs.filter(q) @@ -213,17 +198,13 @@ class BaseFileIndexFilterSet(FilterSet): class ExactFileIndexFilterSet(BaseFileIndexFilterSet): class Meta: model = ExactFileIndex - fields = ( - 'sha1', - ) + fields = ("sha1",) class ExactPackageArchiveFilterSet(BaseFileIndexFilterSet): class Meta: model = ExactPackageArchiveIndex - fields = ( - 'sha1', - ) + fields = ("sha1",) class BaseDirectoryIndexFilterSet(FilterSet): @@ -233,21 +214,17 @@ class BaseDirectoryIndexFilterSet(FilterSet): class ApproximateDirectoryContentFilterSet(BaseDirectoryIndexFilterSet): class Meta: model = ApproximateDirectoryContentIndex - fields = ( - 'fingerprint', - ) + fields = ("fingerprint",) class ApproximateDirectoryStructureFilterSet(BaseDirectoryIndexFilterSet): class Meta: model = ApproximateDirectoryStructureIndex - fields = ( - 'fingerprint', - ) + fields = ("fingerprint",) class BaseFileIndexViewSet(ReadOnlyModelViewSet): - lookup_field = 'sha1' + lookup_field = "sha1" class ExactFileIndexViewSet(BaseFileIndexViewSet): @@ -263,11 +240,11 @@ class ExactPackageArchiveIndexViewSet(BaseFileIndexViewSet): class BaseDirectoryIndexViewSet(ReadOnlyModelViewSet): - lookup_field = 'fingerprint' + lookup_field = "fingerprint" @action(detail=False) def match(self, request): - fingerprints = request.query_params.getlist('fingerprint') + fingerprints = request.query_params.getlist("fingerprint") if not fingerprints: return Response() @@ -285,17 +262,15 @@ def match(self, request): similarity_score = (128 - hd) / 128 results.append( { - 'fingerprint': fingerprint, - 'matched_fingerprint': fp, - 'package': match.package, - 'similarity_score': similarity_score, + "fingerprint": fingerprint, + "matched_fingerprint": fp, + "package": match.package, + "similarity_score": similarity_score, } ) serialized_match_results = BaseDirectoryIndexMatchSerializer( - results, - context={'request': request}, - many=True + results, context={"request": request}, many=True ) return Response(serialized_match_results.data) diff --git a/matchcode/match.py b/matchcode/match.py index 46936bc2..c4a15768 100644 --- a/matchcode/match.py +++ b/matchcode/match.py @@ -10,9 +10,10 @@ from functools import reduce from operator import or_ +from django.db.models import Q + import attr from commoncode.resource import VirtualCodebase -from django.db.models import Q from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints from matchcode.models import ApproximateDirectoryContentIndex @@ -52,10 +53,9 @@ def do_match(codebase, match_type): The total number of matches found is returned. 
""" - matcher = get_matchers().get(match_type) if not matcher: - raise Exception('Unknown match type: {}'.format(match_type)) + raise Exception(f"Unknown match type: {match_type}") match_count = matcher(codebase) return match_count @@ -63,12 +63,8 @@ def do_match(codebase, match_type): def run_do_match_from_scan(scan_file_location, match_type): vc = VirtualCodebase( location=scan_file_location, - codebase_attributes=dict( - matches=attr.ib(default=attr.Factory(list)) - ), - resource_attributes=dict( - matched_to=attr.ib(default=attr.Factory(list)) - ) + codebase_attributes=dict(matches=attr.ib(default=attr.Factory(list))), + resource_attributes=dict(matched_to=attr.ib(default=attr.Factory(list))), ) vc = compute_codebase_directory_fingerprints(vc) do_match(vc, match_type) @@ -83,9 +79,11 @@ def package_archive_match(codebase): """ match_count = 0 for resource in codebase.walk(topdown=True): - if (resource.is_dir - or not resource.is_archive - or resource.extra_data.get('matched', False)): + if ( + resource.is_dir + or not resource.is_archive + or resource.extra_data.get("matched", False) + ): continue archive_matches, match_type = get_archive_match(resource) @@ -107,7 +105,7 @@ def approximate_directory_content_match(codebase): """ match_count = 0 for resource in codebase.walk(topdown=True): - if resource.is_file or resource.extra_data.get('matched', False): + if resource.is_file or resource.extra_data.get("matched", False): continue directory_matches, match_type = get_directory_content_match(resource) @@ -115,8 +113,7 @@ def approximate_directory_content_match(codebase): continue match_count += directory_matches.count() - tag_matched_resources(resource, codebase, - directory_matches, match_type) + tag_matched_resources(resource, codebase, directory_matches, match_type) return match_count @@ -128,7 +125,7 @@ def approximate_directory_structure_match(codebase): """ match_count = 0 for resource in codebase.walk(topdown=True): - if resource.is_file or resource.extra_data.get('matched', False): + if resource.is_file or resource.extra_data.get("matched", False): continue directory_matches, match_type = get_directory_structure_match(resource) @@ -136,8 +133,7 @@ def approximate_directory_structure_match(codebase): continue match_count += directory_matches.count() - tag_matched_resources(resource, codebase, - directory_matches, match_type) + tag_matched_resources(resource, codebase, directory_matches, match_type) return match_count @@ -149,7 +145,7 @@ def individual_file_match(codebase): """ match_count = 0 for resource in codebase.walk(topdown=True): - if resource.is_dir or resource.extra_data.get('matched', False): + if resource.is_dir or resource.extra_data.get("matched", False): continue file_matches, match_type = get_file_match(resource) @@ -169,7 +165,7 @@ def approximate_file_match(codebase): """ match_count = 0 for resource in codebase.walk(topdown=True): - if resource.is_dir or resource.extra_data.get('matched', False): + if resource.is_dir or resource.extra_data.get("matched", False): continue file_matches, match_type = get_approximate_file_match(resource) if not file_matches: @@ -181,71 +177,56 @@ def approximate_file_match(codebase): def get_directory_content_match(resource): - """ - Match a directory to a Package using its contents - """ - directory_content_fingerprint = resource.extra_data.get( - 'directory_content', '') + """Match a directory to a Package using its contents""" + directory_content_fingerprint = resource.extra_data.get("directory_content", "") matches = 
ApproximateDirectoryContentIndex.objects.none() - match_type = '' + match_type = "" if directory_content_fingerprint: directory_matches = ApproximateDirectoryContentIndex.match( - directory_content_fingerprint, - resource + directory_content_fingerprint, resource ) matches |= directory_matches - match_type = 'approximate-content' + match_type = "approximate-content" return matches, match_type # TODO: rename match_directory_structure def get_directory_structure_match(resource): - """ - Match a directory to a Package using its structure - """ - directory_structure_fingerprint = resource.extra_data.get( - 'directory_structure', '') + """Match a directory to a Package using its structure""" + directory_structure_fingerprint = resource.extra_data.get("directory_structure", "") matches = ApproximateDirectoryStructureIndex.objects.none() - match_type = '' + match_type = "" if directory_structure_fingerprint: directory_matches = ApproximateDirectoryStructureIndex.match( - directory_structure_fingerprint, - resource + directory_structure_fingerprint, resource ) matches |= directory_matches - match_type = 'approximate-structure' + match_type = "approximate-structure" return matches, match_type def get_archive_match(resource): - """ - Match an Archive resource to a Package - """ + """Match an Archive resource to a Package""" file_matches = ExactPackageArchiveIndex.match(resource.sha1) - return file_matches, 'exact-archive' + return file_matches, "exact-archive" def get_file_match(resource): - """ - Match an individual file back to the Package it is from - """ + """Match an individual file back to the Package it is from""" file_matches = ExactFileIndex.match(resource.sha1) - return file_matches, 'exact-file' + return file_matches, "exact-file" def get_approximate_file_match(resource): - """ - Approximately match an individual file back to the Package it is from - """ - if hasattr(resource, 'halo1'): + """Approximately match an individual file back to the Package it is from""" + if hasattr(resource, "halo1"): resource_content_fingerprint = resource.halo1 else: - resource_content_fingerprint = resource.extra_data.get('halo1', '') + resource_content_fingerprint = resource.extra_data.get("halo1", "") file_matches = ApproximateResourceContentIndex.match( - resource_content_fingerprint, - resource + resource_content_fingerprint, resource ) - return file_matches, 'approximate-file' + return file_matches, "approximate-file" def tag_matched_resource(resource, codebase, purl): @@ -255,7 +236,7 @@ def tag_matched_resource(resource, codebase, purl): """ if purl not in resource.matched_to: resource.matched_to.append(purl) - resource.extra_data['matched'] = True + resource.extra_data["matched"] = True resource.save(codebase) @@ -268,7 +249,7 @@ def tag_matched_resources(resource, codebase, matches, match_type): for match in matches: # Prep matched package data and append to `codebase` matched_package_info = match.package.to_dict() - matched_package_info['match_type'] = match_type + matched_package_info["match_type"] = match_type codebase.attributes.matches.append(matched_package_info) purl = match.package.package_url @@ -280,18 +261,17 @@ def tag_matched_resources(resource, codebase, matches, match_type): # by or), then querying the matched packages resources to see if any of # those suffixes match a package child resource path for child in resource.walk(codebase): - query = reduce(or_, (Q(path=suffix) - for suffix in path_suffixes(child.path)), Q()) + query = reduce( + or_, (Q(path=suffix) for suffix in 
path_suffixes(child.path)), Q() + ) matched_child_resources = match.package.resources.filter(query) if len(matched_child_resources) > 0: tag_matched_resource(child, codebase, purl) def path_suffixes(path): - """ - Yield all the suffixes of `path`, starting from the longest (e.g. more segments). - """ - segments = path.strip('/').split('/') + """Yield all the suffixes of `path`, starting from the longest (e.g. more segments).""" + segments = path.strip("/").split("/") suffixes = (segments[i:] for i in range(len(segments))) for suffix in suffixes: - yield '/'.join(suffix) + yield "/".join(suffix) diff --git a/matchcode/models.py b/matchcode/models.py index 83aa59fa..bca5d9db 100644 --- a/matchcode/models.py +++ b/matchcode/models.py @@ -17,6 +17,7 @@ from django.db import models from django.forms.models import model_to_dict from django.utils.translation import gettext_lazy as _ + from matchcode_toolkit.fingerprinting import create_halohash_chunks from matchcode_toolkit.fingerprinting import hexstring_to_binarray from matchcode_toolkit.fingerprinting import split_fingerprint @@ -38,7 +39,7 @@ def logger_debug(*args): - return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args)) + return logger.debug(" ".join(isinstance(a, str) and a or repr(a) for a in args)) ############################################################################### @@ -48,14 +49,14 @@ class BaseFileIndex(models.Model): sha1 = models.BinaryField( max_length=20, db_index=True, - help_text='Binary form of a SHA1 checksum in lowercase hex for a file', + help_text="Binary form of a SHA1 checksum in lowercase hex for a file", null=False, blank=False, ) package = models.ForeignKey( Package, - help_text='The Package that this file is from', + help_text="The Package that this file is from", null=False, on_delete=models.CASCADE, ) @@ -67,22 +68,14 @@ class Meta: def index(cls, sha1, package): try: sha1_bin = hexstring_to_binarray(sha1) - bfi, created = cls.objects.get_or_create( - package=package, - sha1=sha1_bin - ) + bfi, created = cls.objects.get_or_create(package=package, sha1=sha1_bin) if created: logger.info( - '{} - Inserted {} for Package {}:\t{}'.format( - datetime.utcnow().isoformat(), - bfi.__class__.__name__, - package.download_url, - sha1 - ) + f"{datetime.utcnow().isoformat()} - Inserted {bfi.__class__.__name__} for Package {package.download_url}:\t{sha1}" ) return bfi, created except Exception as e: - msg = f'Error creating FileIndex:\n' + msg = "Error creating FileIndex:\n" msg += get_error_message(e) package.index_error = msg package.save() @@ -90,11 +83,9 @@ def index(cls, sha1, package): @classmethod def match(cls, sha1): - """ - Return a list of matched Packages that contains a file with a SHA1 value of `sha1` - """ + """Return a list of matched Packages that contains a file with a SHA1 value of `sha1`""" if TRACE: - logger_debug(cls.__name__, 'match:', 'sha1:', sha1) + logger_debug(cls.__name__, "match:", "sha1:", sha1) if not sha1: return cls.objects.none() @@ -105,11 +96,11 @@ def match(cls, sha1): for match in matches: package = match.package dct = model_to_dict(package) - logger_debug(cls.__name__, 'match:', 'matched_file:', dct) + logger_debug(cls.__name__, "match:", "matched_file:", dct) return matches def fingerprint(self): - return binascii.hexlify(self.sha1).decode('utf-8') + return binascii.hexlify(self.sha1).decode("utf-8") class ExactPackageArchiveIndex(BaseFileIndex): @@ -134,63 +125,62 @@ def bah128_ranges(indexed_elements_count, range_ratio=0.05): """ return ( 
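
# A worked example of path_suffixes() above, which feeds the Q(path=suffix)
# OR-query in tag_matched_resources(): every trailing portion of a scanned
# path, longest first, is tried against the matched package's resource paths.
def demo_path_suffixes(path):
    segments = path.strip("/").split("/")
    for i in range(len(segments)):
        yield "/".join(segments[i:])


assert list(demo_path_suffixes("project/src/main.c")) == [
    "project/src/main.c",
    "src/main.c",
    "main.c",
]
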
int(indexed_elements_count * (1 - range_ratio)), - int(indexed_elements_count * (1 + range_ratio)) + int(indexed_elements_count * (1 + range_ratio)), ) class ApproximateMatchingHashMixin(models.Model): indexed_elements_count = models.IntegerField( - help_text='Number of elements that went into the fingerprint', + help_text="Number of elements that went into the fingerprint", ) chunk1 = models.BinaryField( max_length=4, db_index=True, - help_text='Binary form of the first 8 (0-7) hex digits of the fingerprint', + help_text="Binary form of the first 8 (0-7) hex digits of the fingerprint", null=False, - blank=False + blank=False, ) chunk2 = models.BinaryField( max_length=4, db_index=True, - help_text='Binary form of the second 8 (8-15) hex digits of the fingerprint', + help_text="Binary form of the second 8 (8-15) hex digits of the fingerprint", null=False, - blank=False + blank=False, ) chunk3 = models.BinaryField( max_length=4, db_index=True, - help_text='Binary form of the third 8 (16-23) hex digits of the fingerprint', + help_text="Binary form of the third 8 (16-23) hex digits of the fingerprint", null=False, - blank=False + blank=False, ) chunk4 = models.BinaryField( max_length=4, db_index=True, - help_text='Binary form of the fourth 8 (24-32) hex digits of the fingerprint', + help_text="Binary form of the fourth 8 (24-32) hex digits of the fingerprint", null=False, - blank=False + blank=False, ) package = models.ForeignKey( Package, - help_text='The Package that this resource is a part of', + help_text="The Package that this resource is a part of", null=False, on_delete=models.CASCADE, ) path = models.CharField( max_length=2000, - help_text=_('The full path value of this resource'), + help_text=_("The full path value of this resource"), ) class Meta: abstract = True - unique_together = ['chunk1', 'chunk2', - 'chunk3', 'chunk4', 'package', 'path'] + unique_together = ["chunk1", "chunk2", "chunk3", "chunk4", "package", "path"] def __str__(self): return self.fingerprint() @@ -207,8 +197,7 @@ def index(cls, fingerprint, resource_path, package): """ try: indexed_elements_count, fp = split_fingerprint(fingerprint) - fp_chunk1, fp_chunk2, fp_chunk3, fp_chunk4 = create_halohash_chunks( - fp) + fp_chunk1, fp_chunk2, fp_chunk3, fp_chunk4 = create_halohash_chunks(fp) bdi, created = cls.objects.get_or_create( indexed_elements_count=indexed_elements_count, chunk1=fp_chunk1, @@ -220,16 +209,11 @@ def index(cls, fingerprint, resource_path, package): ) if created: logger.info( - '{} - Inserted {} for Package {}:\t{}'.format( - datetime.utcnow().isoformat(), - bdi.__class__.__name__, - package.download_url, - fingerprint - ) + f"{datetime.utcnow().isoformat()} - Inserted {bdi.__class__.__name__} for Package {package.download_url}:\t{fingerprint}" ) return bdi, created except Exception as e: - msg = f'Error creating ApproximateMatchingHashMixin:\n' + msg = "Error creating ApproximateMatchingHashMixin:\n" msg += get_error_message(e) package.index_error = msg package.save() @@ -237,17 +221,15 @@ def index(cls, fingerprint, resource_path, package): @classmethod def match(cls, fingerprint, resource=None, exact_match=False): - """ - Return a list of matched Packages - """ + """Return a list of matched Packages""" if TRACE: logger_debug( cls.__name__, - 'match:', - 'fingerprint:', + "match:", + "fingerprint:", fingerprint, - 'resource:', - resource + "resource:", + resource, ) if not fingerprint: @@ -270,28 +252,16 @@ def match(cls, fingerprint, resource=None, exact_match=False): # Step 1: find fingerprints 
with matching chunks range = bah128_ranges(indexed_elements_count) matches = cls.objects.filter( - models.Q( - indexed_elements_count__range=range, - chunk1=chunk1 - ) | - models.Q( - indexed_elements_count__range=range, - chunk2=chunk2 - ) | - models.Q( - indexed_elements_count__range=range, - chunk3=chunk3 - ) | - models.Q( - indexed_elements_count__range=range, - chunk4=chunk4 - ) + models.Q(indexed_elements_count__range=range, chunk1=chunk1) + | models.Q(indexed_elements_count__range=range, chunk2=chunk2) + | models.Q(indexed_elements_count__range=range, chunk3=chunk3) + | models.Q(indexed_elements_count__range=range, chunk4=chunk4) ) if TRACE: for match in matches: dct = model_to_dict(match) - logger_debug(cls.__name__, 'match:', 'matched_package:', dct) + logger_debug(cls.__name__, "match:", "matched_package:", dct) # Step 2: calculate Hamming distance of all matches @@ -309,8 +279,7 @@ def match(cls, fingerprint, resource=None, exact_match=False): # TODO: try other thresholds if this is too restrictive if hd < 8: # Save match to `matches_by_hamming_distance` by adding the matched object to the queryset - matches_by_hamming_distance[hd] |= cls.objects.filter( - pk=match.pk) + matches_by_hamming_distance[hd] |= cls.objects.filter(pk=match.pk) if TRACE: logger_debug(list(matches_by_hamming_distance.items())) @@ -319,9 +288,7 @@ def match(cls, fingerprint, resource=None, exact_match=False): # TODO: consider limiting matches for brevity hamming_distances_and_matches = [] for hamming_distance, matches in sorted(matches_by_hamming_distance.items()): - hamming_distances_and_matches.append( - (hamming_distance, matches) - ) + hamming_distances_and_matches.append((hamming_distance, matches)) if TRACE: for hamming_distance, matches in hamming_distances_and_matches: @@ -329,11 +296,11 @@ def match(cls, fingerprint, resource=None, exact_match=False): dct = model_to_dict(match) logger_debug( cls.__name__, - 'match:', - 'step_3_hamming_distance:', + "match:", + "step_3_hamming_distance:", hamming_distance, - 'step_3_matched_package:', - dct + "step_3_matched_package:", + dct, ) # Step 4: use file heuristics to rank matches from step 3 @@ -355,35 +322,34 @@ def match(cls, fingerprint, resource=None, exact_match=False): if TRACE: logger_debug( cls.__name__, - 'match:', - 'step_4_matched_resource:', - matched_resource + "match:", + "step_4_matched_resource:", + matched_resource, ) # Compute size and name difference if matched_resource.is_file: - size_difference = abs( - resource_size - matched_resource.size) + size_difference = abs(resource_size - matched_resource.size) else: # TODO: index number of files in a directory so we can use # that for size comparison. For now, we are going to # disregard size as a factor. 
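
# A worked example of bah128_ranges() above, which drives the Step 1 chunk
# query: with the default 5% ratio, a fingerprint indexed from 100 elements is
# only compared against candidates indexed from 95 to 105 elements, pruning
# the chunk lookup before any Hamming distances are computed.
def demo_bah128_ranges(indexed_elements_count, range_ratio=0.05):
    return (
        int(indexed_elements_count * (1 - range_ratio)),
        int(indexed_elements_count * (1 + range_ratio)),
    )


assert demo_bah128_ranges(100) == (95, 105)
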
size_difference = 0 name_sequence_matcher = SequenceMatcher( - a=resource.name, b=matched_resource.name) + a=resource.name, b=matched_resource.name + ) name_difference = 1 - name_sequence_matcher.ratio() - rank_attributes = ( hamming_distance, size_difference, name_difference) + rank_attributes = (hamming_distance, size_difference, name_difference) matches_by_rank_attributes[rank_attributes].append(match) if TRACE: logger_debug( cls.__name__, - 'match:', - 'step_4_size_difference:', + "match:", + "step_4_size_difference:", size_difference, - 'step_4_name_difference:', - name_difference + "step_4_name_difference:", + name_difference, ) # Order these from low to high (low being low difference/very similar), first by hamming distance, then by size difference, and finally by name difference. @@ -393,15 +359,9 @@ def match(cls, fingerprint, resource=None, exact_match=False): if TRACE: dct = model_to_dict(match) - logger_debug( - cls.__name__, - 'match:', - 'step_4_best_match:', - dct - ) + logger_debug(cls.__name__, "match:", "step_4_best_match:", dct) - matches = cls.objects.filter( - pk__in=[match.pk for match in ranked_matches]) + matches = cls.objects.filter(pk__in=[match.pk for match in ranked_matches]) return matches def get_chunks(self): @@ -412,11 +372,12 @@ def get_chunks(self): return chunk1, chunk2, chunk3, chunk4 def fingerprint(self): - indexed_element_count_as_hex_bytes = b'%08x' % self.indexed_elements_count + indexed_element_count_as_hex_bytes = b"%08x" % self.indexed_elements_count chunk1, chunk2, chunk3, chunk4 = self.get_chunks() - fingerprint = indexed_element_count_as_hex_bytes + \ - chunk1 + chunk2 + chunk3 + chunk4 - return fingerprint.decode('utf-8') + fingerprint = ( + indexed_element_count_as_hex_bytes + chunk1 + chunk2 + chunk3 + chunk4 + ) + return fingerprint.decode("utf-8") class ApproximateDirectoryStructureIndex(ApproximateMatchingHashMixin): diff --git a/matchcode/tests/__init__.py b/matchcode/tests/__init__.py index d4b312b7..cf38b66f 100644 --- a/matchcode/tests/__init__.py +++ b/matchcode/tests/__init__.py @@ -9,5 +9,4 @@ import os - FIXTURES_REGEN = os.environ.get("MATCHCODE_TEST_FIXTURES_REGEN", False) diff --git a/matchcode/tests/test_match.py b/matchcode/tests/test_match.py index fa9df079..07faada7 100644 --- a/matchcode/tests/test_match.py +++ b/matchcode/tests/test_match.py @@ -28,7 +28,7 @@ class MatchPackagesTestCase(MatchcodeTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") maxDiff = None def setUp(self): @@ -37,122 +37,122 @@ def setUp(self): super(MatchPackagesTestCase, self).setUp() self.test_package1, _ = Package.objects.get_or_create( - filename='abbot-0.12.3.jar', - sha1='51d28a27d919ce8690a40f4f335b9d591ceb16e9', - md5='38206e62a54b0489fb6baa4db5a06093', + filename="abbot-0.12.3.jar", + sha1="51d28a27d919ce8690a40f4f335b9d591ceb16e9", + md5="38206e62a54b0489fb6baa4db5a06093", size=689791, - name='abbot', - version='0.12.3', - download_url='http://repo1.maven.org/maven2/abbot/abbot/0.12.3/abbot-0.12.3.jar', - type='maven', + name="abbot", + version="0.12.3", + download_url="http://repo1.maven.org/maven2/abbot/abbot/0.12.3/abbot-0.12.3.jar", + type="maven", ) self.test_package1_metadata = self.test_package1.to_dict() self.test_package2, _ = Package.objects.get_or_create( - filename='dojoz-0.4.1-1.jar', - sha1='ae9d68fd6a29906606c2d9407d1cc0749ef84588', - md5='508361a1c6273a4c2b8e4945618b509f', + filename="dojoz-0.4.1-1.jar", +
sha1="ae9d68fd6a29906606c2d9407d1cc0749ef84588", + md5="508361a1c6273a4c2b8e4945618b509f", size=876720, - name='dojoz', - version='0.4.1-1', - download_url='https://repo1.maven.org/maven2/org/zkoss/zkforge/dojoz/0.4.1-1/dojoz-0.4.1-1.jar', - type='maven', + name="dojoz", + version="0.4.1-1", + download_url="https://repo1.maven.org/maven2/org/zkoss/zkforge/dojoz/0.4.1-1/dojoz-0.4.1-1.jar", + type="maven", ) self.test_package2_metadata = self.test_package2.to_dict() self.test_package3, _ = Package.objects.get_or_create( - filename='acegi-security-0.51.jar', - sha1='ede156692b33872f5ee9465b7a06d6b2bc9e5e7f', + filename="acegi-security-0.51.jar", + sha1="ede156692b33872f5ee9465b7a06d6b2bc9e5e7f", size=176954, - name='acegi-security', - version='0.51', - download_url='https://repo1.maven.org/maven2/acegisecurity/acegi-security/0.51/acegi-security-0.51.jar', - type='maven' + name="acegi-security", + version="0.51", + download_url="https://repo1.maven.org/maven2/acegisecurity/acegi-security/0.51/acegi-security-0.51.jar", + type="maven", ) self.test_package3_metadata = self.test_package3.to_dict() self.test_package4, _ = Package.objects.get_or_create( - filename='test.tar.gz', - sha1='deadbeef', + filename="test.tar.gz", + sha1="deadbeef", size=42589, - name='test', - version='0.01', - download_url='https://test.com/test.tar.gz', - type='maven' + name="test", + version="0.01", + download_url="https://test.com/test.tar.gz", + type="maven", ) self.test_package4_metadata = self.test_package4.to_dict() # Populate ExactPackageArchiveIndexFingerprint table index_packages_sha1() - load_resources_from_scan(self.get_test_loc( - 'models/match-test.json'), self.test_package4) + load_resources_from_scan( + self.get_test_loc("models/match-test.json"), self.test_package4 + ) index_package_directories(self.test_package4) index_package_files_sha1( - self.test_package4, self.get_test_loc('models/match-test.json')) + self.test_package4, self.get_test_loc("models/match-test.json") + ) # Add approximate file resource self.test_package5, _ = Package.objects.get_or_create( - filename='inflate.tar.gz', - sha1='deadfeed', - type='generic', - name='inflate', - version='1.0.0', - download_url='inflate.com/inflate.tar.gz', + filename="inflate.tar.gz", + sha1="deadfeed", + type="generic", + name="inflate", + version="1.0.0", + download_url="inflate.com/inflate.tar.gz", ) self.test_resource5, _ = Resource.objects.get_or_create( - path='inflate.c', - size=55466, - package=self.test_package5 + path="inflate.c", size=55466, package=self.test_package5 ) - self.test_resource5_fingerprint = '000018fba23a49e4cd40718d1297be719e6564a4' + self.test_resource5_fingerprint = "000018fba23a49e4cd40718d1297be719e6564a4" ApproximateResourceContentIndex.index( self.test_resource5_fingerprint, self.test_resource5.path, - self.test_package5 + self.test_package5, ) def test_do_match_package_archive_match(self): - input_file = self.get_test_loc('models/match-test.json') + input_file = self.get_test_loc("models/match-test.json") vc = run_do_match_from_scan(input_file, EXACT_PACKAGE_ARCHIVE_MATCH) - expected = self.get_test_loc( - 'models/match-test-exact-package-results.json') + expected = self.get_test_loc("models/match-test-exact-package-results.json") self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_do_match_approximate_directory_structure_match(self): - input_file = self.get_test_loc('models/match-test.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = 
self.get_test_loc("models/match-test.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'models/match-test-approximate-directory-structure-results.json') + "models/match-test-approximate-directory-structure-results.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_do_match_approximate_directory_content_match(self): - input_file = self.get_test_loc('models/match-test.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("models/match-test.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'models/match-test-approximate-directory-content-results.json') + "models/match-test-approximate-directory-content-results.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_do_match_package_file_match(self): - input_file = self.get_test_loc('models/match-test.json') + input_file = self.get_test_loc("models/match-test.json") vc = run_do_match_from_scan(input_file, EXACT_FILE_MATCH) - expected = self.get_test_loc( - 'models/match-test-exact-file-results.json') + expected = self.get_test_loc("models/match-test-exact-file-results.json") self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_do_match_approximate_package_file_match(self): input_file = self.get_test_loc( - 'match/approximate-file-matching/approximate-match-test.json') + "match/approximate-file-matching/approximate-match-test.json" + ) vc = run_do_match_from_scan(input_file, APPROXIMATE_FILE_MATCH) expected = self.get_test_loc( - 'match/approximate-file-matching/approximate-match-test-results.json') + "match/approximate-file-matching/approximate-match-test-results.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) class MatchNestedPackagesTestCase(MatchcodeTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") maxDiff = None def setUp(self): @@ -161,316 +161,322 @@ def setUp(self): super(MatchNestedPackagesTestCase, self).setUp() self.test_package1, _ = Package.objects.get_or_create( - filename='plugin-request-2.4.1.tgz', - sha1='7295749caddd3c52be472eef6623a7b441ed17d6', + filename="plugin-request-2.4.1.tgz", + sha1="7295749caddd3c52be472eef6623a7b441ed17d6", size=7269, - name='plugin-request', - version='2.4.1', - download_url='https://registry.npmjs.org/@umijs/plugin-request/-/plugin-request-2.4.1.tgz', - type='npm', + name="plugin-request", + version="2.4.1", + download_url="https://registry.npmjs.org/@umijs/plugin-request/-/plugin-request-2.4.1.tgz", + type="npm", + ) + load_resources_from_scan( + self.get_test_loc("match/nested/plugin-request-2.4.1-ip.json"), + self.test_package1, ) - load_resources_from_scan(self.get_test_loc( - 'match/nested/plugin-request-2.4.1-ip.json'), self.test_package1) index_package_directories(self.test_package1) self.test_package2, _ = Package.objects.get_or_create( - filename='underscore-1.10.9.tgz', - sha1='ba7a9cfc15873e67821611503a34a7c26bf7264f', + filename="underscore-1.10.9.tgz", + sha1="ba7a9cfc15873e67821611503a34a7c26bf7264f", size=26569, - name='underscore', - version='1.10.9', - download_url='https://registry.npmjs.org/@types/underscore/-/underscore-1.10.9.tgz', - type='npm', + name="underscore", + version="1.10.9", + download_url="https://registry.npmjs.org/@types/underscore/-/underscore-1.10.9.tgz", + type="npm", + ) + 
load_resources_from_scan( + self.get_test_loc("match/nested/underscore-1.10.9-ip.json"), + self.test_package2, ) - load_resources_from_scan(self.get_test_loc( - 'match/nested/underscore-1.10.9-ip.json'), self.test_package2) index_package_directories(self.test_package2) def test_do_match_approximate_directory_structure_match(self): - input_file = self.get_test_loc('match/nested/nested.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/nested/nested.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/nested/nested-directory-structure-match-expected.json') + "match/nested/nested-directory-structure-match-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_do_match_approximate_directory_content_match(self): - input_file = self.get_test_loc('match/nested/nested.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/nested/nested.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/nested/nested-directory-content-match-expected.json') + "match/nested/nested-directory-content-match-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) class MatchUtilityFunctionsTestCase(MatchcodeTestCase): def test_path_suffixes(self): - suffixes = list(path_suffixes('/foo/bar/baz/qux')) - expected = ['foo/bar/baz/qux', 'bar/baz/qux', 'baz/qux', 'qux'] + suffixes = list(path_suffixes("/foo/bar/baz/qux")) + expected = ["foo/bar/baz/qux", "bar/baz/qux", "baz/qux", "qux"] self.assertEqual(expected, suffixes) class DirectoryMatchingTestCase(MatchcodeTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") maxDiff = None def setUp(self): super(DirectoryMatchingTestCase, self).setUp() self.test_package1, _ = Package.objects.get_or_create( - filename='abbrev-1.0.3.tgz', - sha1='aa049c967f999222aa42e14434f0c562ef468241', - name='abbrev', - version='1.0.3', - type='npm', - download_url='https://registry.npmjs.org/abbrev/-/abbrev-1.0.3.tgz', - ) - load_resources_from_scan(self.get_test_loc( - 'match/directory-matching/abbrev-1.0.3-i.json'), self.test_package1) + filename="abbrev-1.0.3.tgz", + sha1="aa049c967f999222aa42e14434f0c562ef468241", + name="abbrev", + version="1.0.3", + type="npm", + download_url="https://registry.npmjs.org/abbrev/-/abbrev-1.0.3.tgz", + ) + load_resources_from_scan( + self.get_test_loc("match/directory-matching/abbrev-1.0.3-i.json"), + self.test_package1, + ) index_package_directories(self.test_package1) self.test_package2, _ = Package.objects.get_or_create( - filename='abbrev-1.0.4.tgz', - sha1='bd55ae5e413ba1722ee4caba1f6ea10414a59ecd', - name='abbrev', - version='1.0.4', - type='npm', - download_url='https://registry.npmjs.org/abbrev/-/abbrev-1.0.4.tgz', - ) - load_resources_from_scan(self.get_test_loc( - 'match/directory-matching/abbrev-1.0.4-i.json'), self.test_package2) + filename="abbrev-1.0.4.tgz", + sha1="bd55ae5e413ba1722ee4caba1f6ea10414a59ecd", + name="abbrev", + version="1.0.4", + type="npm", + download_url="https://registry.npmjs.org/abbrev/-/abbrev-1.0.4.tgz", + ) + load_resources_from_scan( + self.get_test_loc("match/directory-matching/abbrev-1.0.4-i.json"), + self.test_package2, + ) index_package_directories(self.test_package2) self.test_package3, _ 
= Package.objects.get_or_create( - filename='abbrev-1.0.5.tgz', - sha1='5d8257bd9ebe435e698b2fa431afde4fe7b10b03', - name='abbrev', - version='1.0.5', - type='npm', - download_url='https://registry.npmjs.org/abbrev/-/abbrev-1.0.5.tgz', - ) - load_resources_from_scan(self.get_test_loc( - 'match/directory-matching/abbrev-1.0.5-i.json'), self.test_package3) + filename="abbrev-1.0.5.tgz", + sha1="5d8257bd9ebe435e698b2fa431afde4fe7b10b03", + name="abbrev", + version="1.0.5", + type="npm", + download_url="https://registry.npmjs.org/abbrev/-/abbrev-1.0.5.tgz", + ) + load_resources_from_scan( + self.get_test_loc("match/directory-matching/abbrev-1.0.5-i.json"), + self.test_package3, + ) index_package_directories(self.test_package3) self.test_package4, _ = Package.objects.get_or_create( - filename='abbrev-1.0.6.tgz', - sha1='b6d632b859b3fa2d6f7e4b195472461b9e32dc30', - name='abbrev', - version='1.0.6', - type='npm', - download_url='https://registry.npmjs.org/abbrev/-/abbrev-1.0.6.tgz', - ) - load_resources_from_scan(self.get_test_loc( - 'match/directory-matching/abbrev-1.0.6-i.json'), self.test_package4) + filename="abbrev-1.0.6.tgz", + sha1="b6d632b859b3fa2d6f7e4b195472461b9e32dc30", + name="abbrev", + version="1.0.6", + type="npm", + download_url="https://registry.npmjs.org/abbrev/-/abbrev-1.0.6.tgz", + ) + load_resources_from_scan( + self.get_test_loc("match/directory-matching/abbrev-1.0.6-i.json"), + self.test_package4, + ) index_package_directories(self.test_package4) self.test_package5, _ = Package.objects.get_or_create( - filename='abbrev-1.0.7.tgz', - sha1='5b6035b2ee9d4fb5cf859f08a9be81b208491843', - name='abbrev', - version='1.0.7', - type='npm', - download_url='https://registry.npmjs.org/abbrev/-/abbrev-1.0.7.tgz', - ) - load_resources_from_scan(self.get_test_loc( - 'match/directory-matching/abbrev-1.0.7-i.json'), self.test_package5) + filename="abbrev-1.0.7.tgz", + sha1="5b6035b2ee9d4fb5cf859f08a9be81b208491843", + name="abbrev", + version="1.0.7", + type="npm", + download_url="https://registry.npmjs.org/abbrev/-/abbrev-1.0.7.tgz", + ) + load_resources_from_scan( + self.get_test_loc("match/directory-matching/abbrev-1.0.7-i.json"), + self.test_package5, + ) index_package_directories(self.test_package5) self.test_package6, _ = Package.objects.get_or_create( - filename='abbrev-1.0.9.tgz', - sha1='91b4792588a7738c25f35dd6f63752a2f8776135', - name='abbrev', - version='1.0.9', - type='npm', - download_url='https://registry.npmjs.org/abbrev/-/abbrev-1.0.9.tgz', - ) - load_resources_from_scan(self.get_test_loc( - 'match/directory-matching/abbrev-1.0.9-i.json'), self.test_package6) + filename="abbrev-1.0.9.tgz", + sha1="91b4792588a7738c25f35dd6f63752a2f8776135", + name="abbrev", + version="1.0.9", + type="npm", + download_url="https://registry.npmjs.org/abbrev/-/abbrev-1.0.9.tgz", + ) + load_resources_from_scan( + self.get_test_loc("match/directory-matching/abbrev-1.0.9-i.json"), + self.test_package6, + ) index_package_directories(self.test_package6) self.test_package7, _ = Package.objects.get_or_create( - filename='abbrev-1.1.0.tgz', - sha1='d0554c2256636e2f56e7c2e5ad183f859428d81f', - name='abbrev', - version='1.1.0', - type='npm', - download_url='https://registry.npmjs.org/abbrev/-/abbrev-1.1.0.tgz', - ) - load_resources_from_scan(self.get_test_loc( - 'match/directory-matching/abbrev-1.1.0-i.json'), self.test_package7) + filename="abbrev-1.1.0.tgz", + sha1="d0554c2256636e2f56e7c2e5ad183f859428d81f", + name="abbrev", + version="1.1.0", + type="npm", + 
download_url="https://registry.npmjs.org/abbrev/-/abbrev-1.1.0.tgz", + ) + load_resources_from_scan( + self.get_test_loc("match/directory-matching/abbrev-1.1.0-i.json"), + self.test_package7, + ) index_package_directories(self.test_package7) self.test_package8, _ = Package.objects.get_or_create( - filename='abbrev-1.1.1.tgz', - sha1='f8f2c887ad10bf67f634f005b6987fed3179aac8', - name='abbrev', - version='1.1.1', - type='npm', - download_url='https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz', - ) - load_resources_from_scan(self.get_test_loc( - 'match/directory-matching/abbrev-1.1.1-i.json'), self.test_package8) + filename="abbrev-1.1.1.tgz", + sha1="f8f2c887ad10bf67f634f005b6987fed3179aac8", + name="abbrev", + version="1.1.1", + type="npm", + download_url="https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz", + ) + load_resources_from_scan( + self.get_test_loc("match/directory-matching/abbrev-1.1.1-i.json"), + self.test_package8, + ) index_package_directories(self.test_package8) def test_match_ApproximateDirectoryStructureIndex_abbrev_1_0_3(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.3-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.3-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.3-i-expected.json') + "match/directory-matching/abbrev-1.0.3-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryStructureIndex_abbrev_1_0_4(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.4-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.4-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.4-i-expected.json') + "match/directory-matching/abbrev-1.0.4-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryStructureIndex_abbrev_1_0_5(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.5-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.5-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.5-i-expected.json') + "match/directory-matching/abbrev-1.0.5-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryStructureIndex_abbrev_1_0_6(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.6-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.6-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.6-i-expected.json') + "match/directory-matching/abbrev-1.0.6-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryStructureIndex_abbrev_1_0_7(self): - input_file = self.get_test_loc( - 
'match/directory-matching/abbrev-1.0.7-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.7-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.7-i-expected.json') + "match/directory-matching/abbrev-1.0.7-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryStructureIndex_abbrev_1_0_9(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.9-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.9-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.9-i-expected.json') + "match/directory-matching/abbrev-1.0.9-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryStructureIndex_abbrev_1_1_0(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.1.0-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.1.0-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.1.0-i-expected.json') + "match/directory-matching/abbrev-1.1.0-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryStructureIndex_abbrev_1_1_1(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.1.1-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.1.1-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.1.1-i-expected.json') + "match/directory-matching/abbrev-1.1.1-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryStructureIndex_get_stdin_3_0_2(self): input_file = self.get_test_loc( - 'match/directory-matching/get-stdin-3.0.2-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + "match/directory-matching/get-stdin-3.0.2-i.json" + ) + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/get-stdin-3.0.2-i-expected.json') + "match/directory-matching/get-stdin-3.0.2-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_abbrev_1_0_3(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.3-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.3-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.3-i-expected.json') + "match/directory-matching/abbrev-1.0.3-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def 
test_match_ApproximateDirectoryContentIndex_abbrev_1_0_4(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.4-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.4-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.4-i-expected.json') + "match/directory-matching/abbrev-1.0.4-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_abbrev_1_0_5(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.5-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.5-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.5-i-expected.json') + "match/directory-matching/abbrev-1.0.5-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_abbrev_1_0_6(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.6-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.6-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.6-i-expected.json') + "match/directory-matching/abbrev-1.0.6-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_abbrev_1_0_7(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.7-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.7-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.7-i-expected.json') + "match/directory-matching/abbrev-1.0.7-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_abbrev_1_0_9(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.9-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.9-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.9-i-expected.json') + "match/directory-matching/abbrev-1.0.9-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_abbrev_1_1_0(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.1.0-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.1.0-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.1.0-i-expected.json') + "match/directory-matching/abbrev-1.1.0-i-expected.json" + ) 
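# A note on the regen=FIXTURES_REGEN argument used by the assertion just below
# (and throughout these tests): FIXTURES_REGEN, defined in
# matchcode/tests/__init__.py earlier in this patch, reads the
# MATCHCODE_TEST_FIXTURES_REGEN environment variable. Exporting it before a
# test run rewrites the expected-results JSON fixtures instead of comparing
# against them. A sketch of the gotcha involved:
import os

# Any non-empty value is truthy here, so even "0" or "false" enables regeneration:
FIXTURES_REGEN = os.environ.get("MATCHCODE_TEST_FIXTURES_REGEN", False)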
self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_abbrev_1_1_1(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.1.1-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.1.1-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.1.1-i-expected.json') + "match/directory-matching/abbrev-1.1.1-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_get_stdin_3_0_2(self): input_file = self.get_test_loc( - 'match/directory-matching/get-stdin-3.0.2-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + "match/directory-matching/get-stdin-3.0.2-i.json" + ) + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/get-stdin-3.0.2-i-expected.json') + "match/directory-matching/get-stdin-3.0.2-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) diff --git a/matchcode/tests/test_models.py b/matchcode/tests/test_models.py index d3a84c6e..a74d6aa2 100644 --- a/matchcode/tests/test_models.py +++ b/matchcode/tests/test_models.py @@ -7,7 +7,6 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -import binascii import os import attr @@ -38,55 +37,55 @@ class BaseModelTest(MatchcodeTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") maxDiff = None def setUp(self): super(BaseModelTest, self).setUp() self.test_package1, _ = Package.objects.get_or_create( - filename='abbot-0.12.3.jar', - sha1='51d28a27d919ce8690a40f4f335b9d591ceb16e9', - md5='38206e62a54b0489fb6baa4db5a06093', + filename="abbot-0.12.3.jar", + sha1="51d28a27d919ce8690a40f4f335b9d591ceb16e9", + md5="38206e62a54b0489fb6baa4db5a06093", size=689791, - name='abbot', - version='0.12.3', - download_url='http://repo1.maven.org/maven2/abbot/abbot/0.12.3/abbot-0.12.3.jar', - type='maven', + name="abbot", + version="0.12.3", + download_url="http://repo1.maven.org/maven2/abbot/abbot/0.12.3/abbot-0.12.3.jar", + type="maven", ) self.test_package1_metadata = self.test_package1.to_dict() self.test_package2, _ = Package.objects.get_or_create( - filename='dojoz-0.4.1-1.jar', - sha1='ae9d68fd6a29906606c2d9407d1cc0749ef84588', - md5='508361a1c6273a4c2b8e4945618b509f', + filename="dojoz-0.4.1-1.jar", + sha1="ae9d68fd6a29906606c2d9407d1cc0749ef84588", + md5="508361a1c6273a4c2b8e4945618b509f", size=876720, - name='dojoz', - version='0.4.1-1', - download_url='https://repo1.maven.org/maven2/org/zkoss/zkforge/dojoz/0.4.1-1/dojoz-0.4.1-1.jar', - type='maven', + name="dojoz", + version="0.4.1-1", + download_url="https://repo1.maven.org/maven2/org/zkoss/zkforge/dojoz/0.4.1-1/dojoz-0.4.1-1.jar", + type="maven", ) self.test_package2_metadata = self.test_package2.to_dict() self.test_package3, _ = Package.objects.get_or_create( - filename='acegi-security-0.51.jar', - sha1='ede156692b33872f5ee9465b7a06d6b2bc9e5e7f', + filename="acegi-security-0.51.jar", + sha1="ede156692b33872f5ee9465b7a06d6b2bc9e5e7f", size=176954, - name='acegi-security', - version='0.51', - download_url='https://repo1.maven.org/maven2/acegisecurity/acegi-security/0.51/acegi-security-0.51.jar', - 
type='maven' + name="acegi-security", + version="0.51", + download_url="https://repo1.maven.org/maven2/acegisecurity/acegi-security/0.51/acegi-security-0.51.jar", + type="maven", ) self.test_package3_metadata = self.test_package3.to_dict() self.test_package4, _ = Package.objects.get_or_create( - filename='test.tar.gz', - sha1='deadbeef', + filename="test.tar.gz", + sha1="deadbeef", size=42589, - name='test', - version='0.01', - download_url='https://test.com/test.tar.gz', - type='maven' + name="test", + version="0.01", + download_url="https://test.com/test.tar.gz", + type="maven", ) self.test_package4_metadata = self.test_package4.to_dict() @@ -94,38 +93,38 @@ def setUp(self): index_packages_sha1() # Populate ExactFileIndexFingerprint table - load_resources_from_scan(self.get_test_loc( - 'models/match-test.json'), self.test_package4) + load_resources_from_scan( + self.get_test_loc("models/match-test.json"), self.test_package4 + ) index_package_directories(self.test_package4) index_package_files_sha1( - self.test_package4, self.get_test_loc('models/match-test.json')) + self.test_package4, self.get_test_loc("models/match-test.json") + ) class ExactPackageArchiveIndexModelTestCase(BaseModelTest): def test_ExactPackageArchiveIndex_index(self): # Test index - sha1 = 'b6bbe0b067469d719708ca38de5c237cb526c3d2' - epai, created = ExactPackageArchiveIndex.index( - sha1, self.test_package1) + sha1 = "b6bbe0b067469d719708ca38de5c237cb526c3d2" + epai, created = ExactPackageArchiveIndex.index(sha1, self.test_package1) self.assertTrue(created) self.assertEqual(sha1, epai.fingerprint()) # Test index of existing sha1 - epai, created = ExactPackageArchiveIndex.index( - sha1, self.test_package1) + epai, created = ExactPackageArchiveIndex.index(sha1, self.test_package1) self.assertFalse(created) self.assertEqual(sha1, epai.fingerprint()) # Test index of invalid sha1 - ExactPackageArchiveIndex.index('not a sha1', self.test_package1) + ExactPackageArchiveIndex.index("not a sha1", self.test_package1) self.assertTrue( - "Error('Non-hexadecimal digit found')" - in self.test_package1.index_error + "Error('Non-hexadecimal digit found')" in self.test_package1.index_error ) def test_ExactPackageArchiveIndex_single_sha1_single_match(self): result = ExactPackageArchiveIndex.match( - '51d28a27d919ce8690a40f4f335b9d591ceb16e9') + "51d28a27d919ce8690a40f4f335b9d591ceb16e9" + ) result = [r.package.to_dict() for r in result] expected = [self.test_package1_metadata] self.assertEqual(expected, result) @@ -134,7 +133,7 @@ def test_ExactPackageArchiveIndex_single_sha1_single_match(self): class ExactFileIndexModelTestCase(BaseModelTest): def test_ExactFileIndex_index(self): # Test index - sha1 = 'b6bbe0b067469d719708ca38de5c237cb526c3d2' + sha1 = "b6bbe0b067469d719708ca38de5c237cb526c3d2" efi, created = ExactFileIndex.index(sha1, self.test_package1) self.assertTrue(created) self.assertEqual(sha1, efi.fingerprint()) @@ -145,22 +144,17 @@ def test_ExactFileIndex_index(self): self.assertEqual(sha1, efi.fingerprint()) # Test index of invalid sha1 - ExactFileIndex.index('not a sha1', self.test_package1) + ExactFileIndex.index("not a sha1", self.test_package1) self.assertTrue( - "Error('Non-hexadecimal digit found')" - in self.test_package1.index_error + "Error('Non-hexadecimal digit found')" in self.test_package1.index_error ) def test_ExactFileIndex_match(self): - scan_location = self.get_test_loc('models/match-test.json') + scan_location = self.get_test_loc("models/match-test.json") codebase = VirtualCodebase( 
location=scan_location, - codebase_attributes=dict( - matches=attr.ib(default=attr.Factory(list)) - ), - resource_attributes=dict( - matched_to=attr.ib(default=attr.Factory(list)) - ) + codebase_attributes=dict(matches=attr.ib(default=attr.Factory(list))), + resource_attributes=dict(matched_to=attr.ib(default=attr.Factory(list))), ) # populate codebase with match results @@ -168,77 +162,76 @@ def test_ExactFileIndex_match(self): matches = ExactFileIndex.match(resource.sha1) for match in matches: p = match.package.to_dict() - p['match_type'] = 'exact' + p["match_type"] = "exact" codebase.attributes.matches.append(p) - resource.matched_to.append(p['purl']) + resource.matched_to.append(p["purl"]) resource.save(codebase) expected = self.get_test_loc( - 'models/exact-file-matching-standalone-test-results.json') + "models/exact-file-matching-standalone-test-results.json" + ) self.check_codebase(codebase, expected, regen=FIXTURES_REGEN) class ApproximateDirectoryMatchingIndexModelTestCase(MatchcodeTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): super(MatchcodeTestCase, self).setUp() self.test_package1, _ = Package.objects.get_or_create( - filename='async-0.2.10.tgz', - sha1='b6bbe0b0674b9d719708ca38de8c237cb526c3d1', - md5='fd313a0e8cc2343569719e80cd7a67ac', + filename="async-0.2.10.tgz", + sha1="b6bbe0b0674b9d719708ca38de8c237cb526c3d1", + md5="fd313a0e8cc2343569719e80cd7a67ac", size=15772, - name='async', - version='0.2.10', - download_url='https://registry.npmjs.org/async/-/async-0.2.10.tgz', - type='npm', + name="async", + version="0.2.10", + download_url="https://registry.npmjs.org/async/-/async-0.2.10.tgz", + type="npm", ) self.test_package1_metadata = self.test_package1.to_dict() - load_resources_from_scan(self.get_test_loc( - 'models/directory-matching/async-0.2.10.tgz-i.json'), self.test_package1) + load_resources_from_scan( + self.get_test_loc("models/directory-matching/async-0.2.10.tgz-i.json"), + self.test_package1, + ) index_package_directories(self.test_package1) self.test_package2, _ = Package.objects.get_or_create( - filename='async-0.2.9.tgz', - sha1='df63060fbf3d33286a76aaf6d55a2986d9ff8619', - md5='895ac62ba7c61086cffdd50ab03c0447', + filename="async-0.2.9.tgz", + sha1="df63060fbf3d33286a76aaf6d55a2986d9ff8619", + md5="895ac62ba7c61086cffdd50ab03c0447", size=15672, - name='async', - version='0.2.9', - download_url='https://registry.npmjs.org/async/-/async-0.2.9.tgz', - type='npm', + name="async", + version="0.2.9", + download_url="https://registry.npmjs.org/async/-/async-0.2.9.tgz", + type="npm", ) self.test_package2_metadata = self.test_package2.to_dict() - load_resources_from_scan(self.get_test_loc( - 'models/directory-matching/async-0.2.9-i.json'), self.test_package2) + load_resources_from_scan( + self.get_test_loc("models/directory-matching/async-0.2.9-i.json"), + self.test_package2, + ) index_package_directories(self.test_package2) def test_ApproximateDirectoryStructureIndex_index(self): # Test index - fingerprint = '000018fad23a49e4cd40718d1297be719e6564a4' - resource_path = 'foo/bar' + fingerprint = "000018fad23a49e4cd40718d1297be719e6564a4" + resource_path = "foo/bar" adsi, created = ApproximateResourceContentIndex.index( - fingerprint, - resource_path, - self.test_package1 + fingerprint, resource_path, self.test_package1 ) self.assertTrue(created) self.assertEqual(fingerprint, adsi.fingerprint()) # Test index of existing fingerprint adsi, created = 
ApproximateResourceContentIndex.index( - fingerprint, - resource_path, - self.test_package1 + fingerprint, resource_path, self.test_package1 ) self.assertFalse(created) self.assertEqual(fingerprint, adsi.fingerprint()) # Test index of invalid fingerprint ApproximateResourceContentIndex.index( - 'not a fingerprint', - resource_path, - self.test_package1 + "not a fingerprint", resource_path, self.test_package1 ) self.assertTrue( "ValueError: invalid literal for int() with base 16: 'not a fi'" @@ -247,11 +240,11 @@ def test_ApproximateDirectoryStructureIndex_index(self): def test_ApproximateDirectoryStructureIndex_match_subdir(self): scan_location = self.get_test_loc( - 'models/directory-matching/async-0.2.9-i.json') + "models/directory-matching/async-0.2.9-i.json" + ) vc = VirtualCodebase( location=scan_location, - resource_attributes=dict( - packages=attr.ib(default=attr.Factory(list))) + resource_attributes=dict(packages=attr.ib(default=attr.Factory(list))), ) codebase = compute_codebase_directory_fingerprints(vc) @@ -259,47 +252,41 @@ def test_ApproximateDirectoryStructureIndex_match_subdir(self): for resource in codebase.walk(topdown=True): if resource.is_file: continue - fp = resource.extra_data.get('directory_structure', '') + fp = resource.extra_data.get("directory_structure", "") matches = ApproximateDirectoryStructureIndex.match( - fingerprint=fp, - resource=resource + fingerprint=fp, resource=resource ) for match in matches: p = match.package.to_dict() - p['match_type'] = 'approximate-directory-structure' + p["match_type"] = "approximate-directory-structure" resource.packages.append(p) resource.save(codebase) expected = self.get_test_loc( - 'models/directory-matching/async-0.2.9-i-expected-structure.json') + "models/directory-matching/async-0.2.9-i-expected-structure.json" + ) self.check_codebase(codebase, expected, regen=FIXTURES_REGEN) def test_ApproximateDirectoryContentIndex_index(self): # Test index - fingerprint = '000018fad23a49e4cd40718d1297be719e6564a4' - resource_path = 'foo/bar' + fingerprint = "000018fad23a49e4cd40718d1297be719e6564a4" + resource_path = "foo/bar" adci, created = ApproximateResourceContentIndex.index( - fingerprint, - resource_path, - self.test_package1 + fingerprint, resource_path, self.test_package1 ) self.assertTrue(created) self.assertEqual(fingerprint, adci.fingerprint()) # Test index of existing fingerprint adci, created = ApproximateResourceContentIndex.index( - fingerprint, - resource_path, - self.test_package1 + fingerprint, resource_path, self.test_package1 ) self.assertFalse(created) self.assertEqual(fingerprint, adci.fingerprint()) # Test index of invalid fingerprint ApproximateResourceContentIndex.index( - 'not a fingerprint', - resource_path, - self.test_package1 + "not a fingerprint", resource_path, self.test_package1 ) self.assertTrue( "ValueError: invalid literal for int() with base 16: 'not a fi'" @@ -308,11 +295,11 @@ def test_ApproximateDirectoryContentIndex_index(self): def test_ApproximateDirectoryContentIndex_match_subdir(self): scan_location = self.get_test_loc( - 'models/directory-matching/async-0.2.9-i.json') + "models/directory-matching/async-0.2.9-i.json" + ) vc = VirtualCodebase( location=scan_location, - resource_attributes=dict( - packages=attr.ib(default=attr.Factory(list))) + resource_attributes=dict(packages=attr.ib(default=attr.Factory(list))), ) codebase = compute_codebase_directory_fingerprints(vc) @@ -320,101 +307,91 @@ def test_ApproximateDirectoryContentIndex_match_subdir(self): for resource in 
codebase.walk(topdown=True): if resource.is_file: continue - fp = resource.extra_data.get('directory_content', '') + fp = resource.extra_data.get("directory_content", "") matches = ApproximateDirectoryContentIndex.match( - fingerprint=fp, - resource=resource + fingerprint=fp, resource=resource ) for match in matches: p = match.package.to_dict() - p['match_type'] = 'approximate-directory-content' + p["match_type"] = "approximate-directory-content" resource.packages.append(p) resource.save(codebase) expected = self.get_test_loc( - 'models/directory-matching/async-0.2.9-i-expected-content.json') + "models/directory-matching/async-0.2.9-i-expected-content.json" + ) self.check_codebase(codebase, expected, regen=FIXTURES_REGEN) class ApproximateResourceMatchingIndexModelTestCase(MatchcodeTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): super(MatchcodeTestCase, self).setUp() # Add approximate file resource self.test_package, _ = Package.objects.get_or_create( - filename='inflate.tar.gz', - sha1='deadfeed', - type='generic', - name='inflate', - version='1.0.0', - download_url='inflate.com/inflate.tar.gz', + filename="inflate.tar.gz", + sha1="deadfeed", + type="generic", + name="inflate", + version="1.0.0", + download_url="inflate.com/inflate.tar.gz", ) self.test_resource, _ = Resource.objects.get_or_create( - path='inflate.c', - name='inflate.c', - size=55466, - package=self.test_package + path="inflate.c", name="inflate.c", size=55466, package=self.test_package ) - self.test_resource_fingerprint = '000018fba23a49e4cd40718d1297be719e6564a4' + self.test_resource_fingerprint = "000018fba23a49e4cd40718d1297be719e6564a4" ApproximateResourceContentIndex.index( - self.test_resource_fingerprint, - self.test_resource.path, - self.test_package + self.test_resource_fingerprint, self.test_resource.path, self.test_package ) # Add approximate file resource self.test_package1, _ = Package.objects.get_or_create( - filename='deep-equal-1.0.1.tgz', - sha1='f5d260292b660e084eff4cdbc9f08ad3247448b5', - type='npm', - name='deep-equal', - version='1.0.1', - download_url='https://registry.npmjs.org/deep-equal/-/deep-equal-1.0.1.tgz', + filename="deep-equal-1.0.1.tgz", + sha1="f5d260292b660e084eff4cdbc9f08ad3247448b5", + type="npm", + name="deep-equal", + version="1.0.1", + download_url="https://registry.npmjs.org/deep-equal/-/deep-equal-1.0.1.tgz", ) self.test_resource1, _ = Resource.objects.get_or_create( - path='package/index.js', - name='index', - extension='js', - package=self.test_package1 + path="package/index.js", + name="index", + extension="js", + package=self.test_package1, ) test_resource1_loc = self.get_test_loc( - 'match/approximate-file-matching/index.js') + "match/approximate-file-matching/index.js" + ) fingerprints = get_file_fingerprint_hashes(test_resource1_loc) - self.test_resource1_fingerprint = fingerprints['halo1'] + self.test_resource1_fingerprint = fingerprints["halo1"] ApproximateResourceContentIndex.index( self.test_resource1_fingerprint, self.test_resource1.path, - self.test_package1 + self.test_package1, ) def test_ApproximateResourceContentIndex_index(self): # Test index - fingerprint = '000018fba23a39e4cd40718d1297be719e6564a4' - resource_path = 'foo/bar' + fingerprint = "000018fba23a39e4cd40718d1297be719e6564a4" + resource_path = "foo/bar" adci, created = ApproximateResourceContentIndex.index( - fingerprint, - resource_path, - self.test_package + fingerprint, resource_path, 
self.test_package ) self.assertTrue(created) self.assertEqual(fingerprint, adci.fingerprint()) # Test index of existing fingerprint adci, created = ApproximateResourceContentIndex.index( - fingerprint, - resource_path, - self.test_package + fingerprint, resource_path, self.test_package ) self.assertFalse(created) self.assertEqual(fingerprint, adci.fingerprint()) # Test index of invalid fingerprint ApproximateResourceContentIndex.index( - 'not a fingerprint', - resource_path, - self.test_package + "not a fingerprint", resource_path, self.test_package ) self.assertTrue( "ValueError: invalid literal for int() with base 16: 'not a fi'" @@ -423,11 +400,11 @@ def test_ApproximateResourceContentIndex_index(self): def test_ApproximateResourceContentIndex_match(self): scan_location = self.get_test_loc( - 'match/approximate-file-matching/approximate-match-test.json') + "match/approximate-file-matching/approximate-match-test.json" + ) codebase = VirtualCodebase( location=scan_location, - resource_attributes=dict( - packages=attr.ib(default=attr.Factory(list))) + resource_attributes=dict(packages=attr.ib(default=attr.Factory(list))), ) # populate codebase with match results @@ -435,40 +412,41 @@ def test_ApproximateResourceContentIndex_match(self): if not (fp := resource.halo1): continue matches = ApproximateResourceContentIndex.match( - fingerprint=fp, - resource=resource + fingerprint=fp, resource=resource ) for match in matches: p = match.package.to_dict() - p['match_type'] = 'approximate-resource-content' + p["match_type"] = "approximate-resource-content" resource.packages.append(p) resource.save(codebase) expected = self.get_test_loc( - 'match/approximate-file-matching/approximate-match-model-test-results.json') + "match/approximate-file-matching/approximate-match-model-test-results.json" + ) self.check_codebase(codebase, expected, regen=FIXTURES_REGEN) def test_ApproximateResourceContentIndex_match_deep_equals(self): test_file_loc = self.get_test_loc( - 'match/approximate-file-matching/index-modified.js') + "match/approximate-file-matching/index-modified.js" + ) fingerprints = get_file_fingerprint_hashes(test_file_loc) - fp = fingerprints['halo1'] + fp = fingerprints["halo1"] matches = ApproximateResourceContentIndex.match(fp) results = [match.package.to_dict() for match in matches] expected_results_loc = self.get_test_loc( - 'match/approximate-file-matching/index-modified.js-expected.json') - self.check_expected_results( - results, expected_results_loc, regen=FIXTURES_REGEN) + "match/approximate-file-matching/index-modified.js-expected.json" + ) + self.check_expected_results(results, expected_results_loc, regen=FIXTURES_REGEN) class MatchcodeModelUtilsTestCase(MatchcodeTestCase): def test_create_halohash_chunks(self): - fingerprint = '49280e141724c001e1080128621a4210' + fingerprint = "49280e141724c001e1080128621a4210" chunk1, chunk2, chunk3, chunk4 = create_halohash_chunks(fingerprint) - expected_chunk1 = hexstring_to_binarray('49280e14') - expected_chunk2 = hexstring_to_binarray('1724c001') - expected_chunk3 = hexstring_to_binarray('e1080128') - expected_chunk4 = hexstring_to_binarray('621a4210') + expected_chunk1 = hexstring_to_binarray("49280e14") + expected_chunk2 = hexstring_to_binarray("1724c001") + expected_chunk3 = hexstring_to_binarray("e1080128") + expected_chunk4 = hexstring_to_binarray("621a4210") self.assertEqual(expected_chunk1, chunk1) self.assertEqual(expected_chunk2, chunk2) self.assertEqual(expected_chunk3, chunk3) diff --git a/matchcode/utils.py b/matchcode/utils.py index 
7168847a..cdbdfa68 100644 --- a/matchcode/utils.py +++ b/matchcode/utils.py @@ -7,28 +7,22 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from collections import OrderedDict -from unittest import TestCase - -import codecs import json import ntpath import os import posixpath +from unittest import TestCase from django.test import TestCase as DjangoTestCase from commoncode.resource import VirtualCodebase from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints from matchcode_toolkit.fingerprinting import hexstring_to_binarray -from rest_framework.utils.serializer_helpers import ReturnDict -from rest_framework.utils.serializer_helpers import ReturnList from scancode.cli_test_utils import purl_with_fake_uuid from matchcode.tests import FIXTURES_REGEN from minecode.utils_test import JsonBasedTestingMixin - ############## TEST UTILITIES ############## """ The conventions used for the tests are: @@ -40,7 +34,7 @@ class BaseTestCase(TestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") @classmethod def get_test_loc(cls, path): @@ -53,9 +47,14 @@ def get_test_loc(cls, path): return location -class CodebaseTester(object): - def check_codebase(self, codebase, expected_codebase_json_loc, - regen=FIXTURES_REGEN, remove_file_date=True): +class CodebaseTester: + def check_codebase( + self, + codebase, + expected_codebase_json_loc, + regen=FIXTURES_REGEN, + remove_file_date=True, + ): """ Check the Resources of the `codebase` Codebase objects are the same as the data in the `expected_codebase_json_loc` JSON file location, @@ -70,41 +69,39 @@ def check_codebase(self, codebase, expected_codebase_json_loc, def serializer(r): rd = r.to_dict(with_info=True) if remove_file_date: - rd.pop('file_date', None) + rd.pop("file_date", None) - for package_data in rd.get('packages', []): + for package_data in rd.get("packages", []): # Normalize package_uid - package_uid = package_data.get('package_uid') + package_uid = package_data.get("package_uid") if package_uid: - package_data['package_uid'] = purl_with_fake_uuid( - package_uid) + package_data["package_uid"] = purl_with_fake_uuid(package_uid) return rd results = list(map(serializer, codebase.walk(topdown=True))) if regen: - with open(expected_codebase_json_loc, 'w') as reg: - json.dump(dict(files=results), reg, - indent=2, separators=(',', ': ')) + with open(expected_codebase_json_loc, "w") as reg: + json.dump(dict(files=results), reg, indent=2, separators=(",", ": ")) expected_vc = VirtualCodebase(location=expected_codebase_json_loc) expected = list(map(serializer, expected_vc.walk(topdown=True))) # NOTE we redump the JSON as a string for a more efficient display of the # failures comparison/diff - expected = json.dumps(expected, indent=2, separators=(',', ': ')) - results = json.dumps(results, indent=2, separators=(',', ': ')) + expected = json.dumps(expected, indent=2, separators=(",", ": ")) + results = json.dumps(results, indent=2, separators=(",", ": ")) self.assertEqual(expected, results) -class MatchcodeTestCase(CodebaseTester, JsonBasedTestingMixin, BaseTestCase, DjangoTestCase): - databases = '__all__' +class MatchcodeTestCase( + CodebaseTester, JsonBasedTestingMixin, BaseTestCase, DjangoTestCase +): + databases = "__all__" def to_os_native_path(path): - """ - Normalize a path to use the native OS path separator. 
- """ + """Normalize a path to use the native OS path separator.""" path = path.replace(posixpath.sep, os.path.sep) path = path.replace(ntpath.sep, os.path.sep) path = path.rstrip(os.path.sep) @@ -113,6 +110,7 @@ def to_os_native_path(path): def load_resources_from_scan(scan_location, package): from packagedb.models import Resource + vc = VirtualCodebase( location=scan_location, ) @@ -123,35 +121,27 @@ def load_resources_from_scan(scan_location, package): size=resource.size, sha1=resource.sha1, md5=resource.md5, - is_file=resource.type == 'file' + is_file=resource.type == "file", ) def index_packages_sha1(): - """ - Reindex all the packages for exact sha1 matching. - """ + """Reindex all the packages for exact sha1 matching.""" from matchcode.models import ExactPackageArchiveIndex from packagedb.models import Package for package in Package.objects.filter(sha1__isnull=False): sha1_in_bin = hexstring_to_binarray(package.sha1) - _ = ExactPackageArchiveIndex.objects.create( - package=package, - sha1=sha1_in_bin - ) + _ = ExactPackageArchiveIndex.objects.create(package=package, sha1=sha1_in_bin) def index_package_files_sha1(package, scan_location): - """ - Index for SHA1 the package files found in the JSON scan at scan_location - """ + """Index for SHA1 the package files found in the JSON scan at scan_location""" from matchcode.models import ExactFileIndex resource_attributes = dict() vc = VirtualCodebase( - location=scan_location, - resource_attributes=resource_attributes + location=scan_location, resource_attributes=resource_attributes ) for resource in vc.walk(topdown=True): @@ -166,12 +156,10 @@ def index_package_files_sha1(package, scan_location): def _create_virtual_codebase_from_package_resources(package): - """ - Return a VirtualCodebase from the resources of `package` - """ + """Return a VirtualCodebase from the resources of `package`""" # Create something that looks like a scancode scan so we can import it into # a VirtualCodebase - package_resources = package.resources.order_by('path') + package_resources = package.resources.order_by("path") if not package_resources: return @@ -179,28 +167,28 @@ def _create_virtual_codebase_from_package_resources(package): for resource in package_resources: files.append( { - 'path': resource.path, - 'size': resource.size, - 'sha1': resource.sha1, - 'md5': resource.md5, - 'type': resource.type, + "path": resource.path, + "size": resource.size, + "sha1": resource.sha1, + "md5": resource.md5, + "type": resource.type, } ) make_new_root = False - sample_file_path = files[0].get('path', '') - root_dir = sample_file_path.split('/')[0] + sample_file_path = files[0].get("path", "") + root_dir = sample_file_path.split("/")[0] for f in files: - file_path = f.get('path', '') + file_path = f.get("path", "") if not file_path.startswith(root_dir): make_new_root = True break if make_new_root: - new_root = '{}-{}'.format(package.name, package.version) + new_root = f"{package.name}-{package.version}" for f in files: - new_path = os.path.join(new_root, f.get('path', '')) - f['path'] = new_path + new_path = os.path.join(new_root, f.get("path", "")) + f["path"] = new_path # Create VirtualCodebase mock_scan = dict(files=files) @@ -226,11 +214,11 @@ def index_resource_fingerprints(codebase, package): indexed_adsi = 0 indexed_arci = 0 for resource in codebase.walk(topdown=False): - directory_content_fingerprint = resource.extra_data.get( - 'directory_content', '') + directory_content_fingerprint = resource.extra_data.get("directory_content", "") 
directory_structure_fingerprint = resource.extra_data.get( - 'directory_structure', '') - resource_content_fingerprint = resource.extra_data.get('halo1', '') + "directory_structure", "" + ) + resource_content_fingerprint = resource.extra_data.get("halo1", "") if directory_content_fingerprint: _, adci_created = ApproximateDirectoryContentIndex.index( diff --git a/matchcode_pipeline/api.py b/matchcode_pipeline/api.py index 0945bd0e..40c892b5 100644 --- a/matchcode_pipeline/api.py +++ b/matchcode_pipeline/api.py @@ -13,7 +13,6 @@ from rest_framework import serializers from rest_framework import viewsets from rest_framework.decorators import action - from scanpipe.api import ExcludeFromListViewMixin from scanpipe.api.serializers import InputSourceSerializer from scanpipe.api.serializers import SerializerExcludeFieldsMixin @@ -80,8 +79,8 @@ class MatchingSerializer(ExcludeFromListViewMixin, serializers.ModelSerializer): class Meta: model = Project fields = ( - 'url', - 'uuid', + "url", + "uuid", "upload_file", "input_urls", "webhook_url", @@ -108,9 +107,9 @@ class Meta: "codebase_relations_summary", ] extra_kwargs = { - 'url': { - 'view_name': 'matching-detail', - 'lookup_field': 'pk', + "url": { + "view_name": "matching-detail", + "lookup_field": "pk", }, } @@ -143,20 +142,17 @@ def validate_input_urls(self, value): """Add support for providing multiple URLs in a single string.""" return [url for entry in value for url in entry.split()] - def create(self, validated_data, matching_pipeline_name='matching'): - """ - Create a new `project` with `upload_file`, using the `matching` pipeline - """ + def create(self, validated_data, matching_pipeline_name="matching"): + """Create a new `project` with `upload_file`, using the `matching` pipeline""" execute_now = True - validated_data['name'] = uuid4() + validated_data["name"] = uuid4() upload_file = validated_data.pop("upload_file", None) input_urls = validated_data.pop("input_urls", []) webhook_url = validated_data.pop("webhook_url", None) downloads, errors = fetch_urls(input_urls) if errors: - raise serializers.ValidationError( - "Could not fetch: " + "\n".join(errors)) + raise serializers.ValidationError("Could not fetch: " + "\n".join(errors)) project = super().create(validated_data) @@ -190,8 +186,8 @@ class D2DSerializer(ExcludeFromListViewMixin, serializers.ModelSerializer): class Meta: model = Project fields = ( - 'url', - 'uuid', + "url", + "uuid", "input_urls", "created_date", "input_sources", @@ -218,9 +214,9 @@ class Meta: "codebase_resources_discrepancies", ] extra_kwargs = { - 'url': { - 'view_name': 'd2d-detail', - 'lookup_field': 'pk', + "url": { + "view_name": "d2d-detail", + "lookup_field": "pk", }, } @@ -255,18 +251,15 @@ def get_codebase_relations_summary(self, project): queryset = project.codebaserelations.all() return count_group_by(queryset, "map_type") - def create(self, validated_data, matching_pipeline_name='d2d'): - """ - Create a new `project` with `input_urls`, using the `d2d` pipeline - """ + def create(self, validated_data, matching_pipeline_name="d2d"): + """Create a new `project` with `input_urls`, using the `d2d` pipeline""" execute_now = True - validated_data['name'] = uuid4() + validated_data["name"] = uuid4() input_urls = validated_data.pop("input_urls", []) errors = check_urls_availability(input_urls) if errors: - raise serializers.ValidationError( - "Could not fetch: " + "\n".join(errors)) + raise serializers.ValidationError("Could not fetch: " + "\n".join(errors)) project = super().create(validated_data) @@ 
-287,8 +280,11 @@ def create(self, validated_data, matching_pipeline_name='d2d'): for url in urls: project.add_input_source(download_url=url) - project.add_pipeline(matching_pipeline_name, selected_groups=[ - "Java", "Javascript", "Elf", "Go"], execute_now=execute_now) + project.add_pipeline( + matching_pipeline_name, + selected_groups=["Java", "Javascript", "Elf", "Go"], + execute_now=execute_now, + ) return project @@ -329,6 +325,7 @@ class MatchingViewSet( - List of mapping containing details about the runs created for this match request. """ + queryset = Project.objects.all() serializer_class = MatchingSerializer filterset_class = ProjectFilterSet @@ -384,6 +381,7 @@ class D2DViewSet( - List of mapping containing details about the runs created for this match request. """ + queryset = Project.objects.all() serializer_class = D2DSerializer filterset_class = ProjectFilterSet diff --git a/matchcode_pipeline/pipelines/matching.py b/matchcode_pipeline/pipelines/matching.py index 2803c657..a30d87ce 100644 --- a/matchcode_pipeline/pipelines/matching.py +++ b/matchcode_pipeline/pipelines/matching.py @@ -22,9 +22,10 @@ from scanpipe.pipelines.load_inventory import LoadInventory from scanpipe.pipelines.scan_codebase import ScanCodebase -from matchcode_pipeline.pipes import matching from scanpipe.pipes import matchcode +from matchcode_pipeline.pipes import matching + class Matching(ScanCodebase, LoadInventory): """ diff --git a/matchcode_pipeline/pipes/matching.py b/matchcode_pipeline/pipes/matching.py index a70dcbad..82a2196b 100644 --- a/matchcode_pipeline/pipes/matching.py +++ b/matchcode_pipeline/pipes/matching.py @@ -113,7 +113,7 @@ def match_purldb_package( """ match_count = 0 sha1_list = list(resources_by_sha1.keys()) - results = Package.objects.using('packagedb').filter(sha1__in=sha1_list) + results = Package.objects.using("packagedb").filter(sha1__in=sha1_list) # Process matched Package data for package in results: package_data = package.to_dict() @@ -147,7 +147,7 @@ def match_purldb_resource( package_data_by_purldb_urls = package_data_by_purldb_urls or {} match_count = 0 sha1_list = list(resources_by_sha1.keys()) - results = Resource.objects.using('packagedb').filter(sha1__in=sha1_list) + results = Resource.objects.using("packagedb").filter(sha1__in=sha1_list) # Process match results for resource in results: # Get package data @@ -170,13 +170,15 @@ def match_purldb_resource_approximately(project, resource): """Match by approximation a single resource in the PurlDB.""" fingerprint = resource.extra_data.get("halo1", "") results = ApproximateResourceContentIndex.match( - fingerprint=fingerprint, - resource=resource + fingerprint=fingerprint, resource=resource ) for result in results: package_data = result.package.to_dict() return create_package_from_purldb_data( - project, [resource], package_data, flag.APPROXIMATE_MATCHED_TO_PURLDB_RESOURCE + project, + [resource], + package_data, + flag.APPROXIMATE_MATCHED_TO_PURLDB_RESOURCE, ) @@ -184,9 +186,7 @@ def match_purldb_directory(project, resource, exact_match=False): """Match a single directory resource in the PurlDB.""" fingerprint = resource.extra_data.get("directory_content", "") results = ApproximateDirectoryContentIndex.match( - fingerprint=fingerprint, - resource=resource, - exact_match=exact_match + fingerprint=fingerprint, resource=resource, exact_match=exact_match ) for result in results: package_data = result.package.to_dict() @@ -236,14 +236,9 @@ def match_purldb_resources( if logger: if resource_count > 0: - logger( - f"Matching 
{resource_count:,d} resources in PurlDB, " - "using SHA1" - ) + logger(f"Matching {resource_count:,d} resources in PurlDB, " "using SHA1") else: - logger( - f"Skipping resource matching as there are {resource_count:,d}" - ) + logger(f"Skipping resource matching as there are {resource_count:,d}") _match_purldb_resources( project=project, @@ -324,11 +319,9 @@ def match_purldb_resources_approximately(project, logger=None): resource, ) - matched_count = ( - project.codebaseresources - .filter(status=flag.APPROXIMATE_MATCHED_TO_PURLDB_RESOURCE) - .count() - ) + matched_count = project.codebaseresources.filter( + status=flag.APPROXIMATE_MATCHED_TO_PURLDB_RESOURCE + ).count() logger( f"{matched_count:,d} resource{pluralize(matched_count, 's')} " f"approximately matched in PurlDB" @@ -362,11 +355,7 @@ def match_purldb_directories(project, exact_directory_match=False, logger=None): for directory in progress.iter(directory_iterator): directory.refresh_from_db() if directory.status != flag.MATCHED_TO_PURLDB_DIRECTORY: - match_purldb_directory( - project, - directory, - exact_directory_match - ) + match_purldb_directory(project, directory, exact_directory_match) matched_count = ( project.codebaseresources.directories() @@ -381,9 +370,8 @@ def match_purldb_directories(project, exact_directory_match=False, logger=None): def match_purldb_resources_post_process(project, logger=None): """Choose the best package for PurlDB matched resources.""" - extract_directories = ( - project.codebaseresources.directories() - .filter(path__regex=r"^.*-extract$") + extract_directories = project.codebaseresources.directories().filter( + path__regex=r"^.*-extract$" ) resources = project.codebaseresources.files().filter( @@ -403,16 +391,12 @@ def match_purldb_resources_post_process(project, logger=None): map_count = 0 for directory in progress.iter(resource_iterator): - map_count += _match_purldb_resources_post_process( - directory.path, resources - ) + map_count += _match_purldb_resources_post_process(directory.path, resources) logger(f"{map_count:,d} resource processed") -def _match_purldb_resources_post_process( - directory_path, codebase_resources -): +def _match_purldb_resources_post_process(directory_path, codebase_resources): # Exclude the content of nested archive. 
interesting_codebase_resources = ( codebase_resources.filter(path__startswith=directory_path) diff --git a/matchcode_pipeline/tests/pipes/test_matching.py b/matchcode_pipeline/tests/pipes/test_matching.py index eb70c119..2360a1e0 100644 --- a/matchcode_pipeline/tests/pipes/test_matching.py +++ b/matchcode_pipeline/tests/pipes/test_matching.py @@ -3,6 +3,7 @@ from pathlib import Path from django.test import TestCase + from scanpipe import pipes from scanpipe.models import Project from scanpipe.pipes import flag @@ -30,7 +31,7 @@ def setUp(self): namespace=package_data1["namespace"], name=package_data1["name"], version=package_data1["version"], - sha1="abcdef" + sha1="abcdef", ) self.directory_content_fingerprint1 = ApproximateDirectoryContentIndex.index( fingerprint="00000003238f6ed2c218090d4da80b3b42160e69", @@ -42,9 +43,7 @@ def setUp(self): package=self.package1, ) self.resource1 = Resource.objects.create( - path="inflate.c", - size=55466, - package=self.package1 + path="inflate.c", size=55466, package=self.package1 ) self.resource_content_fingerprint1 = ApproximateResourceContentIndex.index( fingerprint="000018fba23a49e4cd40718d1297be719e6564a4", @@ -67,7 +66,9 @@ def test_matchcode_pipeline_pipes_matching_get_project_resources_qs(self): make_resource_file(self.project1, "directory100/bar.txt") resources = [package_resource, directory_resource] - resources_qs = matching.get_project_resources_qs(self.project1, resources=resources) + resources_qs = matching.get_project_resources_qs( + self.project1, resources=resources + ) expected_paths = [ "package.jar", "package.jar-extract/", @@ -113,7 +114,9 @@ def test_matchcode_pipeline_pipes_matching_match_purldb_directories(self): to_1 = make_resource_directory( self.project1, "package.jar-extract", - extra_data={"directory_content": "00000003238f6ed2c218090d4da80b3b42160e69"}, + extra_data={ + "directory_content": "00000003238f6ed2c218090d4da80b3b42160e69" + }, ) to_2 = make_resource_file(self.project1, "package.jar-extract/a.class") to_3 = make_resource_file(self.project1, "package.jar-extract/b.class") @@ -124,9 +127,7 @@ def test_matchcode_pipeline_pipes_matching_match_purldb_directories(self): logger=buffer.write, ) - expected = ( - "Matching 1 directory against PurlDB" "1 directory matched in PurlDB" - ) + expected = "Matching 1 directory against PurlDB" "1 directory matched in PurlDB" self.assertEqual(expected, buffer.getvalue()) package = self.project1.discoveredpackages.get() @@ -137,13 +138,13 @@ def test_matchcode_pipeline_pipes_matching_match_purldb_directories(self): self.assertEqual("matched-to-purldb-directory", resource.status) self.assertEqual(package, resource.discovered_packages.get()) - - def test_matchcode_pipeline_pipes_matching_match_purldb_resources_post_process(self): + def test_matchcode_pipeline_pipes_matching_match_purldb_resources_post_process( + self, + ): to_map = self.data_location / "d2d-javascript" / "to" / "main.js.map" to_mini = self.data_location / "d2d-javascript" / "to" / "main.js" to_dir = ( - self.project1.codebase_path - / "project.tar.zst/modules/apps/adaptive-media/" + self.project1.codebase_path / "project.tar.zst/modules/apps/adaptive-media/" "adaptive-media-web-extract/src/main/resources/META-INF/resources/" "adaptive_media/js" ) @@ -202,7 +203,9 @@ def test_matchcode_pipeline_pipes_matching_match_purldb_resources_post_process(s self.assertEqual(2, package1_resource_count) self.assertEqual(0, package2_resource_count) - def 
test_matchcode_pipeline_pipes_matching_match_purldb_resource_approximately(self): + def test_matchcode_pipeline_pipes_matching_match_purldb_resource_approximately( + self, + ): resource = make_resource_file( self.project1, "inflate.c", @@ -217,7 +220,8 @@ def test_matchcode_pipeline_pipes_matching_match_purldb_resource_approximately(s ) expected = ( - "Approximate matching 1 resource against PurlDB" "1 resource approximately matched in PurlDB" + "Approximate matching 1 resource against PurlDB" + "1 resource approximately matched in PurlDB" ) self.assertEqual(expected, buffer.getvalue()) diff --git a/matchcode_pipeline/tests/test_api.py b/matchcode_pipeline/tests/test_api.py index 8783cdd6..7933298f 100644 --- a/matchcode_pipeline/tests/test_api.py +++ b/matchcode_pipeline/tests/test_api.py @@ -13,8 +13,9 @@ from django.contrib.auth.models import User from django.test import TransactionTestCase from django.urls import reverse -from rest_framework.test import APIClient +from rest_framework import status +from rest_framework.test import APIClient from scanpipe.models import CodebaseRelation from scanpipe.models import CodebaseResource from scanpipe.models import DiscoveredDependency @@ -22,21 +23,19 @@ from scanpipe.models import Run from scanpipe.tests import dependency_data1 from scanpipe.tests import package_data1 -from rest_framework import status class MatchCodePipelineAPITest(TransactionTestCase): - databases = {'default', 'packagedb'} - data_location = Path(__file__).parent / 'data' + databases = {"default", "packagedb"} + data_location = Path(__file__).parent / "data" def setUp(self): - self.project1 = Project.objects.create(name='Analysis') + self.project1 = Project.objects.create(name="Analysis") self.resource1 = CodebaseResource.objects.create( project=self.project1, - path='daglib-0.3.2.tar.gz-extract/daglib-0.3.2/PKG-INFO', + path="daglib-0.3.2.tar.gz-extract/daglib-0.3.2/PKG-INFO", ) - self.discovered_package1 = self.resource1.create_and_add_package( - package_data1) + self.discovered_package1 = self.resource1.create_and_add_package(package_data1) self.discovered_dependency1 = DiscoveredDependency.create_from_data( self.project1, dependency_data1 ) @@ -44,16 +43,14 @@ def setUp(self): project=self.project1, from_resource=self.resource1, to_resource=self.resource1, - map_type='java_to_class', + map_type="java_to_class", ) - self.matching_list_url = reverse('matching-list') - self.project1_detail_url = reverse( - 'matching-detail', args=[self.project1.uuid]) + self.matching_list_url = reverse("matching-list") + self.project1_detail_url = reverse("matching-detail", args=[self.project1.uuid]) - self.user = User.objects.create_user( - 'username', 'e@mail.com', 'secret') - self.auth = f'Token {self.user.auth_token.key}' + self.user = User.objects.create_user("username", "e@mail.com", "secret") + self.auth = f"Token {self.user.auth_token.key}" self.csrf_client = APIClient(enforce_csrf_checks=True) self.csrf_client.credentials(HTTP_AUTHORIZATION=self.auth) @@ -62,52 +59,50 @@ def test_matchcode_pipeline_api_matching_list(self): response = self.csrf_client.get(self.matching_list_url) self.assertContains(response, self.project1_detail_url) - self.assertEqual(1, response.data['count']) - self.assertNotContains(response, 'input_root') - self.assertNotContains(response, 'extra_data') - self.assertNotContains(response, 'message_count') - self.assertNotContains(response, 'resource_count') - self.assertNotContains(response, 'package_count') - self.assertNotContains(response, 'dependency_count') 
+ self.assertEqual(1, response.data["count"]) + self.assertNotContains(response, "input_root") + self.assertNotContains(response, "extra_data") + self.assertNotContains(response, "message_count") + self.assertNotContains(response, "resource_count") + self.assertNotContains(response, "package_count") + self.assertNotContains(response, "dependency_count") def test_matchcode_pipeline_api_matching_detail(self): response = self.csrf_client.get(self.project1_detail_url) - self.assertIn(self.project1_detail_url, response.data['url']) - self.assertEqual(str(self.project1.uuid), response.data['uuid']) - self.assertEqual([], response.data['input_sources']) - self.assertEqual([], response.data['runs']) - self.assertEqual(1, response.data['resource_count']) - self.assertEqual(1, response.data['package_count']) - self.assertEqual(1, response.data['dependency_count']) - self.assertEqual(1, response.data['relation_count']) - - expected = {'': 1} - self.assertEqual(expected, response.data['codebase_resources_summary']) + self.assertIn(self.project1_detail_url, response.data["url"]) + self.assertEqual(str(self.project1.uuid), response.data["uuid"]) + self.assertEqual([], response.data["input_sources"]) + self.assertEqual([], response.data["runs"]) + self.assertEqual(1, response.data["resource_count"]) + self.assertEqual(1, response.data["package_count"]) + self.assertEqual(1, response.data["dependency_count"]) + self.assertEqual(1, response.data["relation_count"]) + + expected = {"": 1} + self.assertEqual(expected, response.data["codebase_resources_summary"]) expected = { - 'total': 1, - 'with_missing_resources': 0, - 'with_modified_resources': 0, + "total": 1, + "with_missing_resources": 0, + "with_modified_resources": 0, } - self.assertEqual( - expected, response.data['discovered_packages_summary']) + self.assertEqual(expected, response.data["discovered_packages_summary"]) expected = { - 'total': 1, - 'is_runtime': 1, - 'is_optional': 0, - 'is_resolved': 0, + "total": 1, + "is_runtime": 1, + "is_optional": 0, + "is_resolved": 0, } - self.assertEqual( - expected, response.data['discovered_dependencies_summary']) + self.assertEqual(expected, response.data["discovered_dependencies_summary"]) - expected = {'java_to_class': 1} - self.assertEqual(expected, response.data['codebase_relations_summary']) + expected = {"java_to_class": 1} + self.assertEqual(expected, response.data["codebase_relations_summary"]) - input1 = self.project1.add_input_source( - filename='file1', is_uploaded=True) + input1 = self.project1.add_input_source(filename="file1", is_uploaded=True) input2 = self.project1.add_input_source( - filename='file2', download_url='https://download.url') + filename="file2", download_url="https://download.url" + ) self.project1.save() response = self.csrf_client.get(self.project1_detail_url) expected = [ @@ -128,89 +123,88 @@ def test_matchcode_pipeline_api_matching_detail(self): "uuid": str(input2.uuid), }, ] - self.assertEqual(expected, response.data['input_sources']) + self.assertEqual(expected, response.data["input_sources"]) - @mock.patch('scanpipe.models.Run.execute_task_async') + @mock.patch("scanpipe.models.Run.execute_task_async") def test_matching_pipeline_api_matching_create(self, mock_execute_pipeline_task): # load upload_file contents - test_out_loc = self.data_location / 'test-out.json' + test_out_loc = self.data_location / "test-out.json" with open(test_out_loc) as f: data = { - 'upload_file': f, + "upload_file": f, } # Send match request response = 
self.csrf_client.post(self.matching_list_url, data) self.assertEqual(status.HTTP_201_CREATED, response.status_code) - self.assertEqual(1, len(response.data['runs'])) - self.assertEqual('matching', response.data['runs'][0]['pipeline_name']) + self.assertEqual(1, len(response.data["runs"])) + self.assertEqual("matching", response.data["runs"][0]["pipeline_name"]) mock_execute_pipeline_task.assert_called_once() - created_matching_project_detail_url = response.data['url'] - matching_project_uuid = response.data['uuid'] - results_url = reverse('matching-results', args=[matching_project_uuid]) + created_matching_project_detail_url = response.data["url"] + matching_project_uuid = response.data["uuid"] + results_url = reverse("matching-results", args=[matching_project_uuid]) # Check that the file was uploaded response = self.csrf_client.get(created_matching_project_detail_url) - self.assertEqual( - 'test-out.json', response.data['input_sources'][0]['filename']) + self.assertEqual("test-out.json", response.data["input_sources"][0]["filename"]) - @mock.patch('scanpipe.models.Run.execute_task_async') - def test_matching_pipeline_api_matching_create_multiple_input_urls(self, mock_execute_pipeline_task): + @mock.patch("scanpipe.models.Run.execute_task_async") + def test_matching_pipeline_api_matching_create_multiple_input_urls( + self, mock_execute_pipeline_task + ): # load input_urls data = { - 'input_urls': 'https://registry.npmjs.org/asdf/-/asdf-1.2.2.tgz\r\nhttps://registry.npmjs.org/asdf/-/asdf-1.2.1.tgz', + "input_urls": "https://registry.npmjs.org/asdf/-/asdf-1.2.2.tgz\r\nhttps://registry.npmjs.org/asdf/-/asdf-1.2.1.tgz", } # Send match request response = self.csrf_client.post(self.matching_list_url, data) self.assertEqual(status.HTTP_201_CREATED, response.status_code) - self.assertEqual(1, len(response.data['runs'])) - self.assertEqual('matching', response.data['runs'][0]['pipeline_name']) + self.assertEqual(1, len(response.data["runs"])) + self.assertEqual("matching", response.data["runs"][0]["pipeline_name"]) mock_execute_pipeline_task.assert_called_once() - created_matching_project_detail_url = response.data['url'] - matching_project_uuid = response.data['uuid'] - results_url = reverse('matching-results', args=[matching_project_uuid]) + created_matching_project_detail_url = response.data["url"] + matching_project_uuid = response.data["uuid"] + results_url = reverse("matching-results", args=[matching_project_uuid]) # Check that the file was uploaded response = self.csrf_client.get(created_matching_project_detail_url) - input_sources = response.data['input_sources'] + input_sources = response.data["input_sources"] self.assertEqual(2, len(input_sources)) - self.assertEqual('asdf-1.2.2.tgz', input_sources[0]['filename']) - self.assertEqual('asdf-1.2.1.tgz', input_sources[1]['filename']) + self.assertEqual("asdf-1.2.2.tgz", input_sources[0]["filename"]) + self.assertEqual("asdf-1.2.1.tgz", input_sources[1]["filename"]) def test_matchcode_pipeline_api_run_detail(self): - run1 = self.project1.add_pipeline('matching') - url = reverse('run-detail', args=[run1.uuid]) - project1_detail_url = reverse('run-detail', args=[self.project1.uuid]) + run1 = self.project1.add_pipeline("matching") + url = reverse("run-detail", args=[run1.uuid]) + project1_detail_url = reverse("run-detail", args=[self.project1.uuid]) response = self.csrf_client.get(url) - self.assertEqual(str(run1.uuid), response.data['uuid']) - self.assertIn(project1_detail_url, response.data['project']) - self.assertEqual('matching', 
response.data['pipeline_name']) - self.assertEqual('', response.data['description']) - self.assertEqual('', response.data['scancodeio_version']) - self.assertIsNone(response.data['task_id']) - self.assertIsNone(response.data['task_start_date']) - self.assertIsNone(response.data['task_end_date']) - self.assertEqual('', response.data['task_output']) - self.assertIsNone(response.data['execution_time']) - self.assertEqual(Run.Status.NOT_STARTED, response.data['status']) + self.assertEqual(str(run1.uuid), response.data["uuid"]) + self.assertIn(project1_detail_url, response.data["project"]) + self.assertEqual("matching", response.data["pipeline_name"]) + self.assertEqual("", response.data["description"]) + self.assertEqual("", response.data["scancodeio_version"]) + self.assertIsNone(response.data["task_id"]) + self.assertIsNone(response.data["task_start_date"]) + self.assertIsNone(response.data["task_end_date"]) + self.assertEqual("", response.data["task_output"]) + self.assertIsNone(response.data["execution_time"]) + self.assertEqual(Run.Status.NOT_STARTED, response.data["status"]) class D2DPipelineAPITest(TransactionTestCase): - databases = {'default', 'packagedb'} - data_location = Path(__file__).parent / 'data' + databases = {"default", "packagedb"} + data_location = Path(__file__).parent / "data" def setUp(self): - self.project1 = Project.objects.create(name='Analysis') - self.d2d_list_url = reverse('d2d-list') - self.project1_detail_url = reverse( - 'd2d-detail', args=[self.project1.uuid]) - - self.user = User.objects.create_user( - 'username', 'a@mail.com', 'secret') - self.auth = f'Token {self.user.auth_token.key}' + self.project1 = Project.objects.create(name="Analysis") + self.d2d_list_url = reverse("d2d-list") + self.project1_detail_url = reverse("d2d-detail", args=[self.project1.uuid]) + + self.user = User.objects.create_user("username", "a@mail.com", "secret") + self.auth = f"Token {self.user.auth_token.key}" self.csrf_client = APIClient(enforce_csrf_checks=True) self.csrf_client.credentials(HTTP_AUTHORIZATION=self.auth) @@ -218,44 +212,48 @@ def test_d2d_pipeline_api_d2d_list(self): response = self.csrf_client.get(self.d2d_list_url) self.assertContains(response, self.project1_detail_url) - self.assertEqual(1, response.data['count']) - self.assertNotContains(response, 'input_root') - self.assertNotContains(response, 'extra_data') - self.assertNotContains(response, 'message_count') - self.assertNotContains(response, 'resource_count') - self.assertNotContains(response, 'package_count') - self.assertNotContains(response, 'dependency_count') - - @mock.patch('scanpipe.models.Run.execute_task_async') + self.assertEqual(1, response.data["count"]) + self.assertNotContains(response, "input_root") + self.assertNotContains(response, "extra_data") + self.assertNotContains(response, "message_count") + self.assertNotContains(response, "resource_count") + self.assertNotContains(response, "package_count") + self.assertNotContains(response, "dependency_count") + + @mock.patch("scanpipe.models.Run.execute_task_async") def test_d2d_pipeline_api_d2d_create(self, mock_execute_pipeline_task): # load upload_file contents data = { - 'input_urls': ['https://github.com/nexB/scancode.io/raw/main/scanpipe/tests/data/d2d-elfs/from-data.zip#from', - 'https://github.com/nexB/scancode.io/raw/main/scanpipe/tests/data/d2d-elfs/to-data.zip#to'], + "input_urls": [ + "https://github.com/nexB/scancode.io/raw/main/scanpipe/tests/data/d2d-elfs/from-data.zip#from", + 
"https://github.com/nexB/scancode.io/raw/main/scanpipe/tests/data/d2d-elfs/to-data.zip#to", + ], } # Send match request response = self.csrf_client.post(self.d2d_list_url, data) self.assertEqual(status.HTTP_201_CREATED, response.status_code) - self.assertEqual(1, len(response.data['runs'])) + self.assertEqual(1, len(response.data["runs"])) mock_execute_pipeline_task.assert_called_once() - response = self.csrf_client.get(response.data['url']) - self.assertIn('codebase_resources_discrepancies', response.data) + response = self.csrf_client.get(response.data["url"]) + self.assertIn("codebase_resources_discrepancies", response.data) def test_d2d_pipeline_api_run_detail(self): - run1 = self.project1.add_pipeline('d2d') - url = reverse('run-detail', args=[run1.uuid]) - project1_detail_url = reverse('run-detail', args=[self.project1.uuid]) + run1 = self.project1.add_pipeline("d2d") + url = reverse("run-detail", args=[run1.uuid]) + project1_detail_url = reverse("run-detail", args=[self.project1.uuid]) response = self.csrf_client.get(url) - self.assertEqual(str(run1.uuid), response.data['uuid']) - self.assertIn(project1_detail_url, response.data['project']) - self.assertEqual('d2d', response.data['pipeline_name']) + self.assertEqual(str(run1.uuid), response.data["uuid"]) + self.assertIn(project1_detail_url, response.data["project"]) + self.assertEqual("d2d", response.data["pipeline_name"]) self.assertEqual( - 'Establish relationships between two code trees: deployment and development.', response.data['description']) - self.assertEqual('', response.data['scancodeio_version']) - self.assertIsNone(response.data['task_id']) - self.assertIsNone(response.data['task_start_date']) - self.assertIsNone(response.data['task_end_date']) - self.assertEqual('', response.data['task_output']) - self.assertIsNone(response.data['execution_time']) - self.assertEqual(Run.Status.NOT_STARTED, response.data['status']) + "Establish relationships between two code trees: deployment and development.", + response.data["description"], + ) + self.assertEqual("", response.data["scancodeio_version"]) + self.assertIsNone(response.data["task_id"]) + self.assertIsNone(response.data["task_start_date"]) + self.assertIsNone(response.data["task_end_date"]) + self.assertEqual("", response.data["task_output"]) + self.assertIsNone(response.data["execution_time"]) + self.assertEqual(Run.Status.NOT_STARTED, response.data["status"]) diff --git a/matchcode_project/dbrouter.py b/matchcode_project/dbrouter.py index 582f77d7..ac739a3b 100644 --- a/matchcode_project/dbrouter.py +++ b/matchcode_project/dbrouter.py @@ -8,23 +8,23 @@ # -class PackageDBRouter(object): +class PackageDBRouter: app_labels = [ - 'clearcode', - 'clearindex', - 'minecode', - 'matchcode', - 'packagedb', + "clearcode", + "clearindex", + "minecode", + "matchcode", + "packagedb", ] def db_for_read(self, model, **hints): if model._meta.app_label in self.app_labels: - return 'packagedb' + return "packagedb" return None def db_for_write(self, model, **hints): if model._meta.app_label in self.app_labels: - return 'packagedb' + return "packagedb" return None def allow_relation(self, obj1, obj2, **hints): @@ -37,23 +37,23 @@ def allow_relation(self, obj1, obj2, **hints): def allow_migrate(self, db, app_label, model_name=None, **hints): if app_label in self.app_labels: - return db == 'packagedb' + return db == "packagedb" return None -class ScancodeIORouter(object): +class ScancodeIORouter: app_labels = [ - 'scanpipe', + "scanpipe", ] def db_for_read(self, model, **hints): if 
model._meta.app_label in self.app_labels: - return 'default' + return "default" return None def db_for_write(self, model, **hints): if model._meta.app_label in self.app_labels: - return 'default' + return "default" return None def allow_relation(self, obj1, obj2, **hints): @@ -66,5 +66,5 @@ def allow_relation(self, obj1, obj2, **hints): def allow_migrate(self, db, app_label, model_name=None, **hints): if app_label in self.app_labels: - return db == 'default' + return db == "default" return None diff --git a/matchcode_project/settings.py b/matchcode_project/settings.py index 3e4b2b19..b9bf11cb 100644 --- a/matchcode_project/settings.py +++ b/matchcode_project/settings.py @@ -10,7 +10,6 @@ from pathlib import Path import environ - from scancodeio.settings import * PROJECT_DIR = environ.Path(__file__) - 1 @@ -44,40 +43,40 @@ ) INSTALLED_APPS += [ - 'clearcode', - 'clearindex', - 'matchcode', - 'minecode', - 'packagedb', + "clearcode", + "clearindex", + "matchcode", + "minecode", + "packagedb", ] # Database DATABASES = { - 'default': { - 'ENGINE': env.str('SCANCODEIO_DB_ENGINE', 'django.db.backends.postgresql'), - 'HOST': env.str('SCANCODEIO_DB_HOST', 'localhost'), - 'NAME': env.str('SCANCODEIO_DB_NAME', 'matchcodeio'), - 'USER': env.str('SCANCODEIO_DB_USER', 'matchcodeio'), - 'PASSWORD': env.str('SCANCODEIO_DB_PASSWORD', 'matchcodeio'), - 'PORT': env.str('SCANCODEIO_DB_PORT', '5432'), - 'ATOMIC_REQUESTS': True, + "default": { + "ENGINE": env.str("SCANCODEIO_DB_ENGINE", "django.db.backends.postgresql"), + "HOST": env.str("SCANCODEIO_DB_HOST", "localhost"), + "NAME": env.str("SCANCODEIO_DB_NAME", "matchcodeio"), + "USER": env.str("SCANCODEIO_DB_USER", "matchcodeio"), + "PASSWORD": env.str("SCANCODEIO_DB_PASSWORD", "matchcodeio"), + "PORT": env.str("SCANCODEIO_DB_PORT", "5432"), + "ATOMIC_REQUESTS": True, + }, + "packagedb": { + "ENGINE": env.str("PACKAGEDB_DB_ENGINE", "django.db.backends.postgresql"), + "HOST": env.str("PACKAGEDB_DB_HOST", "localhost"), + "NAME": env.str("PACKAGEDB_DB_NAME", "packagedb"), + "USER": env.str("PACKAGEDB_DB_USER", "packagedb"), + "PASSWORD": env.str("PACKAGEDB_DB_PASSWORD", "packagedb"), + "PORT": env.str("PACKAGEDB_DB_PORT", "5432"), + "ATOMIC_REQUESTS": True, }, - 'packagedb': { - 'ENGINE': env.str('PACKAGEDB_DB_ENGINE', 'django.db.backends.postgresql'), - 'HOST': env.str('PACKAGEDB_DB_HOST', 'localhost'), - 'NAME': env.str('PACKAGEDB_DB_NAME', 'packagedb'), - 'USER': env.str('PACKAGEDB_DB_USER', 'packagedb'), - 'PASSWORD': env.str('PACKAGEDB_DB_PASSWORD', 'packagedb'), - 'PORT': env.str('PACKAGEDB_DB_PORT', '5432'), - 'ATOMIC_REQUESTS': True, - } } DATABASE_ROUTERS = [ - 'matchcode_project.dbrouter.PackageDBRouter', - 'matchcode_project.dbrouter.ScancodeIORouter', + "matchcode_project.dbrouter.PackageDBRouter", + "matchcode_project.dbrouter.ScancodeIORouter", ] -ROOT_URLCONF = 'matchcode_project.urls' +ROOT_URLCONF = "matchcode_project.urls" diff --git a/matchcode_project/urls.py b/matchcode_project/urls.py index 94812be2..72ccd50a 100644 --- a/matchcode_project/urls.py +++ b/matchcode_project/urls.py @@ -10,20 +10,20 @@ from django.urls import include from django.urls import path from django.views.generic import RedirectView + from rest_framework import routers from matchcode_pipeline.api import D2DViewSet from matchcode_pipeline.api import MatchingViewSet from matchcode_pipeline.api import RunViewSet - api_router = routers.DefaultRouter() -api_router.register('matching', MatchingViewSet, basename='matching') -api_router.register('d2d', D2DViewSet, 
basename='d2d') -api_router.register('runs', RunViewSet) +api_router.register("matching", MatchingViewSet, basename="matching") +api_router.register("d2d", D2DViewSet, basename="d2d") +api_router.register("runs", RunViewSet) urlpatterns = [ - path('api/', include(api_router.urls)), - path('', include('scanpipe.urls')), - path('', RedirectView.as_view(url='api/')), + path("api/", include(api_router.urls)), + path("", include("scanpipe.urls")), + path("", RedirectView.as_view(url="api/")), ] diff --git a/minecode/__init__.py b/minecode/__init__.py index f31542f8..83ca57de 100644 --- a/minecode/__init__.py +++ b/minecode/__init__.py @@ -12,14 +12,13 @@ from minecode import route - -default_app_config = 'minecode.apps.MinecodeConfig' +default_app_config = "minecode.apps.MinecodeConfig" sys_platform = str(sys.platform).lower() -ON_WINDOWS = 'win32' in sys_platform -ON_MAC = 'darwin' in sys_platform -ON_LINUX = 'linux' in sys_platform +ON_WINDOWS = "win32" in sys_platform +ON_MAC = "darwin" in sys_platform +ON_LINUX = "linux" in sys_platform # global instances of our routers visit_router = route.Router() diff --git a/minecode/api.py b/minecode/api.py index 8fce1795..a479f01a 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -18,8 +18,11 @@ from django.views.decorators.csrf import csrf_exempt from packageurl import PackageURL -from rest_framework import serializers, status, viewsets -from rest_framework.decorators import action, api_view +from rest_framework import serializers +from rest_framework import status +from rest_framework import viewsets +from rest_framework.decorators import action +from rest_framework.decorators import api_view from rest_framework.permissions import IsAdminUser from rest_framework.response import Response @@ -27,7 +30,9 @@ # But importing the collectors module triggers routes registration from minecode import collectors # NOQA from minecode import priority_router -from minecode.models import PriorityResourceURI, ResourceURI, ScannableURI +from minecode.models import PriorityResourceURI +from minecode.models import ResourceURI +from minecode.models import ScannableURI from minecode.permissions import IsScanQueueWorkerAPIUser from minecode.utils import get_temp_file from minecode.utils import get_webhook_url @@ -45,10 +50,9 @@ class ResourceURIViewSet(viewsets.ModelViewSet): class PriorityResourceURISerializer(serializers.ModelSerializer): - class Meta: model = PriorityResourceURI - fields = '__all__' + fields = "__all__" class PriorityResourceURIViewSet(viewsets.ModelViewSet): @@ -60,25 +64,19 @@ class PriorityResourceURIViewSet(viewsets.ModelViewSet): # TODO: hide debug endpoints under `admin` @action(detail=False, methods=["post"]) def index_package(self, request, *args, **kwargs): - """ - Request the indexing and scanning of Package, given a valid Package URL `purl`. 
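For context, a sketch of how a client could exercise the index_package action below once the viewset is registered; the route prefix here is hypothetical, since the router registration for PriorityResourceURIViewSet is not shown in this hunk:

# Hypothetical client call; the URL prefix depends on the actual
# router registration.
import requests

resp = requests.post(
    "https://example.org/api/priority_resource_uris/index_package/",
    data={"purl": "pkg:npm/lodash@4.17.21"},
)
# On success, per the handler below:
# {"status": "Package index request for pkg:npm/lodash@4.17.21 has been successful."}
print(resp.json())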
- """ - purl = request.data.get('purl') + """Request the indexing and scanning of Package, given a valid Package URL `purl`.""" + purl = request.data.get("purl") # validate purl try: package_url = PackageURL.from_string(purl) except ValueError as e: - message = { - 'status': f'purl validation error: {e}' - } + message = {"status": f"purl validation error: {e}"} return Response(message, status=status.HTTP_400_BAD_REQUEST) # see if its routeable if not priority_router.is_routable(purl): - message = { - 'status': f'Package type `{package_url.type}` is unsupported' - } + message = {"status": f"Package type `{package_url.type}` is unsupported"} return Response(message, status=status.HTTP_400_BAD_REQUEST) # add to queue @@ -86,11 +84,11 @@ def index_package(self, request, *args, **kwargs): if priority_resource_uri: message = { - 'status': f'Package index request for {purl} has been successful.' + "status": f"Package index request for {purl} has been successful." } else: message = { - 'status': f'Package {purl} has already been requested for indexing.' + "status": f"Package {purl} has already been requested for indexing." } # TODO: revisiting a package should be handled on another level, dependent on data we store return Response(message) @@ -99,44 +97,42 @@ def index_package(self, request, *args, **kwargs): class ScannableURISerializer(serializers.ModelSerializer): class Meta: model = ScannableURI - fields = '__all__' + fields = "__all__" class ScannableURIViewSet(viewsets.ModelViewSet): queryset = ScannableURI.objects.all() serializer_class = ScannableURISerializer permission_classes = [IsScanQueueWorkerAPIUser | IsAdminUser] - lookup_field = 'uuid' + lookup_field = "uuid" - @action(detail=False, methods=['get']) + @action(detail=False, methods=["get"]) def get_next_download_url(self, request, *args, **kwargs): - """ - Return download url for next Package on scan queue - """ + """Return download url for next Package on scan queue""" with transaction.atomic(): scannable_uri = ScannableURI.objects.get_next_scannable() if scannable_uri: user = self.request.user - webhook_url = get_webhook_url('index_package_scan', user.id) + webhook_url = get_webhook_url("index_package_scan", user.id) response = { - 'scannable_uri_uuid': scannable_uri.uuid, - 'download_url': scannable_uri.uri, - 'pipelines': scannable_uri.pipelines, - 'webhook_url': webhook_url, + "scannable_uri_uuid": scannable_uri.uuid, + "download_url": scannable_uri.uri, + "pipelines": scannable_uri.pipelines, + "webhook_url": webhook_url, } scannable_uri.scan_status = ScannableURI.SCAN_SUBMITTED scannable_uri.scan_date = timezone.now() scannable_uri.save() else: response = { - 'scannable_uri_uuid': '', - 'download_url': '', - 'pipelines': [], - 'webhook_url': '', + "scannable_uri_uuid": "", + "download_url": "", + "pipelines": [], + "webhook_url": "", } return Response(response) - @action(detail=True, methods=['post']) + @action(detail=True, methods=["post"]) def update_status(self, request, *args, **kwargs): """ Update the status of a ScannableURI with `scan_status` @@ -144,17 +140,16 @@ def update_status(self, request, *args, **kwargs): If `scan_status` is 'failed', then a `scan_log` string is expected and should contain the error messages for that scan. 
""" - scan_status = request.data.get('scan_status') + scan_status = request.data.get("scan_status") if not scan_status: - response = { - 'error': 'missing scan_status' - } + response = {"error": "missing scan_status"} return Response(response, status=status.HTTP_400_BAD_REQUEST) scannable_uri = self.get_object() scannable_uri_uuid = scannable_uri.uuid scannable_uri_status = ScannableURI.SCAN_STATUSES_BY_CODE.get( - scannable_uri.scan_status) + scannable_uri.scan_status + ) if scannable_uri.scan_status in [ ScannableURI.SCAN_INDEXED, @@ -163,36 +158,34 @@ def update_status(self, request, *args, **kwargs): ScannableURI.SCAN_INDEX_FAILED, ]: response = { - 'error': f'cannot update status for scannable_uri {scannable_uri_uuid}: ' - f'scannable_uri has finished with status "{scannable_uri_status}"' + "error": f"cannot update status for scannable_uri {scannable_uri_uuid}: " + f'scannable_uri has finished with status "{scannable_uri_status}"' } return Response(response, status=status.HTTP_400_BAD_REQUEST) if scan_status == scannable_uri_status: response = { - 'error': f'cannot update status for scannable_uri {scannable_uri_uuid}: ' - f'scannable_uri status is already "{scannable_uri_status}"' + "error": f"cannot update status for scannable_uri {scannable_uri_uuid}: " + f'scannable_uri status is already "{scannable_uri_status}"' } return Response(response, status=status.HTTP_400_BAD_REQUEST) - if scan_status == 'failed': - scan_log = request.data.get('scan_log') + if scan_status == "failed": + scan_log = request.data.get("scan_log") scannable_uri.scan_error = scan_log scannable_uri.scan_status = ScannableURI.SCAN_FAILED scannable_uri.wip_date = None scannable_uri.save() response = { - 'status': f'updated scannable_uri {scannable_uri_uuid} scan_status to {scan_status}' + "status": f"updated scannable_uri {scannable_uri_uuid} scan_status to {scan_status}" } return Response(response) - response = { - 'error': f'invalid scan_status: {scan_status}' - } + response = {"error": f"invalid scan_status: {scan_status}"} return Response(response, status=status.HTTP_400_BAD_REQUEST) -@api_view(['POST']) +@api_view(["POST"]) @csrf_exempt def index_package_scan(request, key): """ @@ -210,36 +203,30 @@ def index_package_scan(request, key): User = get_user_model() user = get_object_or_404(User, id=user_id) - results = json_data.get('results') - summary = json_data.get('summary') - project_data = json_data.get('project') - extra_data = project_data.get('extra_data') - scannable_uri_uuid = extra_data.get('scannable_uri_uuid') + results = json_data.get("results") + summary = json_data.get("summary") + project_data = json_data.get("project") + extra_data = project_data.get("extra_data") + scannable_uri_uuid = extra_data.get("scannable_uri_uuid") # Save results to temporary files - scan_results_location = get_temp_file( - file_name='scan_results', - extension='.json' - ) - scan_summary_location = get_temp_file( - file_name='scan_summary', - extension='.json' - ) + scan_results_location = get_temp_file(file_name="scan_results", extension=".json") + scan_summary_location = get_temp_file(file_name="scan_summary", extension=".json") - with open(scan_results_location, 'w') as f: + with open(scan_results_location, "w") as f: json.dump(results, f) - with open(scan_summary_location, 'w') as f: + with open(scan_summary_location, "w") as f: json.dump(summary, f) scannable_uri = get_object_or_404(ScannableURI, uuid=scannable_uri_uuid) scannable_uri.process_scan_results( scan_results_location=scan_results_location, 
scan_summary_location=scan_summary_location, - project_extra_data=extra_data + project_extra_data=extra_data, ) msg = { - 'status': f'scan results for scannable_uri {scannable_uri.uuid} ' - 'have been queued for indexing' + "status": f"scan results for scannable_uri {scannable_uri.uuid} " + "have been queued for indexing" } return Response(msg) diff --git a/minecode/apps.py b/minecode/apps.py index 148452e7..aa8b5641 100644 --- a/minecode/apps.py +++ b/minecode/apps.py @@ -13,5 +13,5 @@ class MinecodeConfig(AppConfig): - name = 'minecode' - verbose_name = _('Minecode') + name = "minecode" + verbose_name = _("Minecode") diff --git a/minecode/collectors/conan.py b/minecode/collectors/conan.py index 35c1af94..9b9728d0 100644 --- a/minecode/collectors/conan.py +++ b/minecode/collectors/conan.py @@ -29,9 +29,7 @@ def get_yaml_response(url): - """ - Fetch YAML content from the url and return it as a dictionary. - """ + """Fetch YAML content from the url and return it as a dictionary.""" try: response = requests.get(url) response.raise_for_status() @@ -62,8 +60,7 @@ def get_conan_recipe(name, version): folder = recipe_location.get("folder") if not folder: - logger.error( - f"No folder found for version {version} of package {name}") + logger.error(f"No folder found for version {version} of package {name}") return None, None conanfile_py_url = f"{base_index_url}/{name}/{folder}/conanfile.py" @@ -85,9 +82,7 @@ def get_conan_recipe(name, version): def get_download_info(conandata, version): - """ - Return download_url and SHA256 hash from `conandata.yml`. - """ + """Return download_url and SHA256 hash from `conandata.yml`.""" sources = conandata.get("sources", {}) pkg_data = sources.get(version, {}) @@ -153,9 +148,9 @@ def process_request(purl_str, **kwargs): from minecode.model_utils import DEFAULT_PIPELINES package_url = PackageURL.from_string(purl_str) - addon_pipelines = kwargs.get('addon_pipelines', []) + addon_pipelines = kwargs.get("addon_pipelines", []) pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) + priority = kwargs.get("priority", 0) if not package_url.version: return diff --git a/minecode/collectors/debian.py b/minecode/collectors/debian.py index 1457e45f..72216078 100644 --- a/minecode/collectors/debian.py +++ b/minecode/collectors/debian.py @@ -1,18 +1,19 @@ +import logging + +import attr +import requests +from debian_inspector.version import Version as DebVersion +from packagedcode import models as scan_models from packagedcode.debian import DebianDscFileHandler from packagedcode.debian_copyright import StandaloneDebianCopyrightFileHandler -from debian_inspector.version import Version as DebVersion -import requests +from packageurl import PackageURL + from minecode import priority_router from minecode.utils import fetch_and_write_file_from_url from minecode.utils import get_package_sha1 -from packagedb.models import make_relationship from packagedb.models import PackageContentType from packagedb.models import PackageRelation -from packageurl import PackageURL -import logging -from packagedcode import models as scan_models -import attr - +from packagedb.models import make_relationship logger = logging.getLogger(__name__) handler = logging.StreamHandler() @@ -27,7 +28,7 @@ UBUNTU_METADATA_URL = "http://changelogs.ubuntu.com/changelogs/pool/main/" -@priority_router.route('pkg:deb/.*') +@priority_router.route("pkg:deb/.*") def process_request(purl_str, **kwargs): """ Process `priority_resource_uri` containing a maven Package URL (PURL) as a @@ 
-44,9 +45,9 @@ def process_request(purl_str, **kwargs):
     from minecode.model_utils import DEFAULT_PIPELINES
 
     source_purl = kwargs.get("source_purl", None)
-    addon_pipelines = kwargs.get('addon_pipelines', [])
+    addon_pipelines = kwargs.get("addon_pipelines", [])
     pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines)
-    priority = kwargs.get('priority', 0)
+    priority = kwargs.get("priority", 0)
 
     try:
         package_url = PackageURL.from_string(purl_str)
@@ -55,7 +56,7 @@ def process_request(purl_str, **kwargs):
             source_package_url = PackageURL.from_string(source_purl)
 
     except ValueError as e:
-        error = f'error occured when parsing purl: {purl_str} source_purl: {source_purl} : {e}'
+        error = f"error occurred when parsing purl: {purl_str} source_purl: {source_purl} : {e}"
         return error
 
     has_version = bool(package_url.version)
@@ -80,7 +81,7 @@ def map_debian_package(debian_package, package_content, pipelines, priority=0):
     from minecode.model_utils import merge_or_create_package
 
     db_package = None
-    error = ''
+    error = ""
     purl = debian_package.package_url
 
     if package_content == PackageContentType.BINARY:
@@ -90,8 +91,8 @@ def map_debian_package(debian_package, package_content, pipelines, priority=0):
 
         response = requests.get(download_url)
         if not response.ok:
-            msg = f'Package metadata does not exist on debian: {download_url}'
-            error += msg + '\n'
+            msg = f"Package metadata does not exist on debian: {download_url}"
+            error += msg + "\n"
             logger.error(msg)
             return db_package, error
 
@@ -123,7 +124,7 @@ def map_debian_package(debian_package, package_content, pipelines, priority=0):
         package.download_url = download_url
 
         # Set package_content value
-        package.extra_data['package_content'] = package_content
+        package.extra_data["package_content"] = package_content
 
         # If sha1 exists for an archive, we know we can create the package
         # Use purl info as base and create packages for binary and source package
@@ -132,8 +133,8 @@ def map_debian_package(debian_package, package_content, pipelines, priority=0):
             package.sha1 = sha1
             db_package, _, _, _ = merge_or_create_package(package, visit_level=50)
         else:
-            msg = f'Failed to retrieve package archive: {purl.to_string()} from url: {download_url}'
-            error += msg + '\n'
+            msg = f"Failed to retrieve package archive: {purl.to_string()} from url: {download_url}"
+            error += msg + "\n"
             logger.error(msg)
 
     # Submit package for scanning
@@ -153,13 +154,13 @@ def get_debian_package_metadata(debian_package):
     If there are errors, return None and a string containing the error information.
     """
-    error = ''
+    error = ""
     metadata_url = debian_package.package_metadata_url
     temp_metadata_file = fetch_and_write_file_from_url(url=metadata_url)
 
     if not temp_metadata_file:
-        msg = f'Package metadata does not exist on debian: {metadata_url}'
-        error += msg + '\n'
+        msg = f"Package metadata does not exist on debian: {metadata_url}"
+        error += msg + "\n"
         logger.error(msg)
         return None, error
 
@@ -181,13 +182,13 @@ def get_debian_package_copyright(debian_package):
     If there are errors, return None and a string containing the error information.
""" - error = '' + error = "" metadata_url = debian_package.package_copyright_url temp_metadata_file = fetch_and_write_file_from_url(url=metadata_url) if not temp_metadata_file: - msg = f'Package metadata does not exist on debian: {metadata_url}' - error += msg + '\n' + msg = f"Package metadata does not exist on debian: {metadata_url}" + error += msg + "\n" logger.error(msg) return None, error @@ -201,15 +202,15 @@ def get_debian_package_copyright(debian_package): def update_license_copyright_fields(package_from, package_to, replace=True): fields_to_update = [ - 'copyright', - 'holder', - 'declared_license_expression', - 'declared_license_expression_spdx', - 'license_detections', - 'other_license_expression', - 'other_license_expression_spdx', - 'other_license_detections', - 'extracted_license_statement' + "copyright", + "holder", + "declared_license_expression", + "declared_license_expression_spdx", + "license_detections", + "other_license_expression", + "other_license_expression_spdx", + "other_license_detections", + "extracted_license_statement", ] for field in fields_to_update: @@ -218,25 +219,27 @@ def update_license_copyright_fields(package_from, package_to, replace=True): setattr(package_to, field, value) -def map_debian_metadata_binary_and_source(package_url, source_package_url, pipelines, priority=0): +def map_debian_metadata_binary_and_source( + package_url, source_package_url, pipelines, priority=0 +): """ Get metadata for the binary and source release of the Debian package `package_url` and save it to the PackageDB. Return an error string for errors that occur, or empty string if there is no error. """ - error = '' + error = "" if "repository_url" in package_url.qualifiers: base_url = package_url.qualifiers["repository_url"] - elif package_url.namespace == 'ubuntu': + elif package_url.namespace == "ubuntu": base_url = UBUNTU_BASE_URL else: base_url = DEBIAN_BASE_URL if "api_data_url" in package_url.qualifiers: metadata_base_url = package_url.qualifiers["api_data_url"] - elif package_url.namespace == 'ubuntu': + elif package_url.namespace == "ubuntu": metadata_base_url = UBUNTU_METADATA_URL else: metadata_base_url = DEBIAN_METADATA_URL @@ -260,7 +263,7 @@ def map_debian_metadata_binary_and_source(package_url, source_package_url, pipel if emsg: error += emsg - package_url.qualifiers['classifier'] = 'sources' + package_url.qualifiers["classifier"] = "sources" source_package, emsg = map_debian_package( debian_package, PackageContentType.SOURCE_ARCHIVE, @@ -296,9 +299,7 @@ class DebianPackage: @classmethod def from_purls(cls, package_urls): - """ - Set the directory URLs for metadata and package archives. - """ + """Set the directory URLs for metadata and package archives.""" debian_package = cls(**package_urls) error = debian_package.set_debian_directories() return debian_package, error @@ -318,25 +319,25 @@ def package_archive_version(self): @property def binary_archive_url(self): - """ - Get the .deb debian binary archive url for this debian package. 
- """ + """Get the .deb debian binary archive url for this debian package.""" purl_version = self.package_archive_version arch = self.package_url.qualifiers.get("arch") if arch: - archive_name =f"{self.package_url.name}_{purl_version}_{arch}.deb" + archive_name = f"{self.package_url.name}_{purl_version}_{arch}.deb" else: - archive_name =f"{self.package_url.name}_{purl_version}.deb" + archive_name = f"{self.package_url.name}_{purl_version}.deb" binary_package_url = self.archive_directory_url + f"{archive_name}" return binary_package_url @property def source_archive_url(self): - """ - Get the debian source tarball archive url for this debian package. - """ + """Get the debian source tarball archive url for this debian package.""" debian_source_archive_formats = [ - ".tar.xz", ".tar.gz", ".orig.tar.xz", ".orig.tar.gz", ".orig.tar.bz2" + ".tar.xz", + ".tar.gz", + ".orig.tar.xz", + ".orig.tar.gz", + ".orig.tar.bz2", ] source_version = self.package_archive_version @@ -349,10 +350,14 @@ def source_archive_url(self): for archive_format in debian_source_archive_formats: if ".orig" in archive_format: - base_version_source = source_version.split('-')[0] - archive_name = f"{source_package_name}_{base_version_source}" + archive_format + base_version_source = source_version.split("-")[0] + archive_name = ( + f"{source_package_name}_{base_version_source}" + archive_format + ) else: - archive_name = f"{source_package_name}_{source_version}" + archive_format + archive_name = ( + f"{source_package_name}_{source_version}" + archive_format + ) source_package_url = self.archive_directory_url + archive_name response = requests.get(source_package_url) if response.ok: @@ -362,9 +367,7 @@ def source_archive_url(self): @property def package_metadata_url(self): - """ - Get the .dsc metadata file url for this debian package. 
- """ + """Get the .dsc metadata file url for this debian package.""" metadata_version = self.package_archive_version if not self.source_package_url: metadata_package_name = self.package_url.name @@ -373,11 +376,17 @@ def package_metadata_url(self): if self.source_package_url.version: metadata_version = self.source_package_url.version - base_version_metadata = metadata_version.split('+')[0] - metadata_dsc_package_url = self.archive_directory_url + f"{metadata_package_name}_{base_version_metadata}.dsc" + base_version_metadata = metadata_version.split("+")[0] + metadata_dsc_package_url = ( + self.archive_directory_url + + f"{metadata_package_name}_{base_version_metadata}.dsc" + ) response = requests.get(metadata_dsc_package_url) if not response.ok: - metadata_dsc_package_url = self.archive_directory_url + f"{metadata_package_name}_{metadata_version}.dsc" + metadata_dsc_package_url = ( + self.archive_directory_url + + f"{metadata_package_name}_{metadata_version}.dsc" + ) return metadata_dsc_package_url @@ -401,11 +410,17 @@ def package_copyright_url(self): if self.source_package_url.version: metadata_version = self.source_package_url.version - copyright_package_url = self.metadata_directory_url + f"{metadata_package_name}_{metadata_version}{copyright_file_string}" + copyright_package_url = ( + self.metadata_directory_url + + f"{metadata_package_name}_{metadata_version}{copyright_file_string}" + ) response = requests.get(copyright_package_url) if not response.ok: - base_version_metadata = metadata_version.split('+')[0] - copyright_package_url = self.metadata_directory_url + f"{metadata_package_name}_{base_version_metadata}{copyright_file_string}" + base_version_metadata = metadata_version.split("+")[0] + copyright_package_url = ( + self.metadata_directory_url + + f"{metadata_package_name}_{base_version_metadata}{copyright_file_string}" + ) return copyright_package_url @@ -414,22 +429,24 @@ def set_debian_directories(self): Compute and set base urls for metadata and archives, to get source/binary """ - error = '' + error = "" archive_base_url = self.archive_base_url metadata_base_url = self.metadata_base_url index_folder = None - if self.package_url.name.startswith('lib'): + if self.package_url.name.startswith("lib"): name_wout_lib = self.package_url.name.replace("lib", "") - index_folder = 'lib' + name_wout_lib[0] + index_folder = "lib" + name_wout_lib[0] else: index_folder = self.package_url.name[0] msg = "No directory exists for package at: " package_directory = f"{archive_base_url}{index_folder}/{self.package_url.name}/" - metadata_directory = f"{metadata_base_url}{index_folder}/{self.package_url.name}/" + metadata_directory = ( + f"{metadata_base_url}{index_folder}/{self.package_url.name}/" + ) response = requests.get(package_directory) if not response.ok: @@ -437,14 +454,18 @@ def set_debian_directories(self): error = msg + str(package_directory) return error - if self.source_package_url.name.startswith('lib'): + if self.source_package_url.name.startswith("lib"): name_wout_lib = self.source_package_url.name.replace("lib", "") - index_folder = 'lib' + name_wout_lib[0] + index_folder = "lib" + name_wout_lib[0] else: index_folder = self.source_package_url.name[0] - package_directory = f"{archive_base_url}{index_folder}/{self.source_package_url.name}/" - metadata_directory = f"{metadata_base_url}{index_folder}/{self.source_package_url.name}/" + package_directory = ( + f"{archive_base_url}{index_folder}/{self.source_package_url.name}/" + ) + metadata_directory = ( + 
f"{metadata_base_url}{index_folder}/{self.source_package_url.name}/" + ) response = requests.get(package_directory) if not response.ok: @@ -459,13 +480,11 @@ def set_debian_directories(self): def get_dependencies(data): - """ - Return a list of DependentPackage extracted from a Debian `data` mapping. - """ + """Return a list of DependentPackage extracted from a Debian `data` mapping.""" scopes = { - 'Build-Depends': dict(is_runtime=False, is_optional=True), - 'Depends': dict(is_runtime=True, is_optional=False), - 'Pre-Depends': dict(is_runtime=True, is_optional=False), + "Build-Depends": dict(is_runtime=False, is_optional=True), + "Depends": dict(is_runtime=True, is_optional=False), + "Pre-Depends": dict(is_runtime=True, is_optional=False), # 'Provides': dict(is_runtime=True, is_optional=False), # 'Recommends': dict(is_runtime=True, is_optional=True), # 'Suggests': dict(is_runtime=True, is_optional=True), @@ -482,27 +501,29 @@ def get_dependencies(data): # break each dep in package names and version constraints # FIXME:!!! for name in dependencies: - purl = PackageURL(type='deb', namespace='debian', name=name) - dep = scan_models.DependentPackage(purl=purl.to_string(), score=scope, **flags) + purl = PackageURL(type="deb", namespace="debian", name=name) + dep = scan_models.DependentPackage( + purl=purl.to_string(), score=scope, **flags + ) dep_pkgs.append(dep) return dep_pkgs def get_vcs_repo(description): - """ - Return a tuple of (vcs_tool, vcs_repo) or (None, None) if no vcs_repo is found. - """ + """Return a tuple of (vcs_tool, vcs_repo) or (None, None) if no vcs_repo is found.""" repos = [] for vcs_tool, vcs_repo in description.items(): vcs_tool = vcs_tool.lower() - if not vcs_tool.startswith('vcs-') or vcs_tool.startswith('vcs-browser'): + if not vcs_tool.startswith("vcs-") or vcs_tool.startswith("vcs-browser"): continue - _, _, vcs_tool = vcs_tool.partition('-') + _, _, vcs_tool = vcs_tool.partition("-") repos.append((vcs_tool, vcs_repo)) if len(repos) > 1: - raise TypeError('Debian description with more than one Vcs repos: %(repos)r' % locals()) + raise TypeError( + "Debian description with more than one Vcs repos: %(repos)r" % locals() + ) if repos: vcs_tool, vcs_repo = repos[0] diff --git a/minecode/collectors/generic.py b/minecode/collectors/generic.py index 85aa4abd..ab20197a 100644 --- a/minecode/collectors/generic.py +++ b/minecode/collectors/generic.py @@ -36,7 +36,7 @@ def map_generic_package(package_url, pipelines, priority=0): from minecode.model_utils import add_package_to_scan_queue from minecode.model_utils import merge_or_create_package - download_url = package_url.qualifiers.get('download_url') + download_url = package_url.qualifiers.get("download_url") package = PackageData( type=package_url.type, namespace=package_url.namespace, @@ -69,19 +69,19 @@ def process_request(purl_str, **kwargs): """ from minecode.model_utils import DEFAULT_PIPELINES - addon_pipelines = kwargs.get('addon_pipelines', []) + addon_pipelines = kwargs.get("addon_pipelines", []) pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) + priority = kwargs.get("priority", 0) try: package_url = PackageURL.from_string(purl_str) except ValueError as e: - error = f'error occured when parsing {purl_str}: {e}' + error = f"error occured when parsing {purl_str}: {e}" return error - download_url = package_url.qualifiers.get('download_url') + download_url = package_url.qualifiers.get("download_url") if not download_url: - error = f'package_url {purl_str} does not contain 
a download_url qualifier' + error = f"package_url {purl_str} does not contain a download_url qualifier" return error error_msg = map_generic_package(package_url, pipelines, priority) @@ -183,9 +183,9 @@ def process_request_fetchcode_generic(purl_str, **kwargs): """ from minecode.model_utils import DEFAULT_PIPELINES - addon_pipelines = kwargs.get('addon_pipelines', []) + addon_pipelines = kwargs.get("addon_pipelines", []) pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) + priority = kwargs.get("priority", 0) try: package_url = PackageURL.from_string(purl_str) @@ -193,8 +193,7 @@ def process_request_fetchcode_generic(purl_str, **kwargs): error = f"error occurred when parsing {purl_str}: {e}" return error - error_msg = map_fetchcode_supported_package( - package_url, pipelines, priority) + error_msg = map_fetchcode_supported_package(package_url, pipelines, priority) if error_msg: return error_msg diff --git a/minecode/collectors/github.py b/minecode/collectors/github.py index b43f98da..fbf6337e 100644 --- a/minecode/collectors/github.py +++ b/minecode/collectors/github.py @@ -8,13 +8,14 @@ # from packageurl import PackageURL + from minecode import priority_router from minecode.collectors.generic import map_fetchcode_supported_package # Indexing GitHub PURLs requires a GitHub API token. # Please add your GitHub API key to the `.env` file, for example: `GH_TOKEN=your-github-api`. -@priority_router.route('pkg:github/.*') +@priority_router.route("pkg:github/.*") def process_request_dir_listed(purl_str, **kwargs): """ Process `priority_resource_uri` containing a GitHub Package URL (PURL). @@ -25,9 +26,9 @@ def process_request_dir_listed(purl_str, **kwargs): """ from minecode.model_utils import DEFAULT_PIPELINES - addon_pipelines = kwargs.get('addon_pipelines', []) + addon_pipelines = kwargs.get("addon_pipelines", []) pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) + priority = kwargs.get("priority", 0) try: package_url = PackageURL.from_string(purl_str) @@ -35,8 +36,7 @@ def process_request_dir_listed(purl_str, **kwargs): error = f"error occurred when parsing {purl_str}: {e}" return error - error_msg = map_fetchcode_supported_package( - package_url, pipelines, priority) + error_msg = map_fetchcode_supported_package(package_url, pipelines, priority) if error_msg: return error_msg diff --git a/minecode/collectors/gnu.py b/minecode/collectors/gnu.py index ccd1a3e9..861d6231 100644 --- a/minecode/collectors/gnu.py +++ b/minecode/collectors/gnu.py @@ -33,16 +33,15 @@ def process_request(purl_str, **kwargs): """ from minecode.model_utils import DEFAULT_PIPELINES - addon_pipelines = kwargs.get('addon_pipelines', []) + addon_pipelines = kwargs.get("addon_pipelines", []) pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) + priority = kwargs.get("priority", 0) package_url = PackageURL.from_string(purl_str) if not package_url.version: return - error_msg = map_fetchcode_supported_package( - package_url, pipelines, priority) + error_msg = map_fetchcode_supported_package(package_url, pipelines, priority) if error_msg: return error_msg diff --git a/minecode/collectors/maven.py b/minecode/collectors/maven.py index b3e7e825..0aed49ef 100644 --- a/minecode/collectors/maven.py +++ b/minecode/collectors/maven.py @@ -7,27 +7,28 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# -from typing import Dict -from urllib.parse import urlparse -from minecode.miners.maven import get_artifacts, is_worthy_artifact, build_url_and_filename -from packagedcode.maven import get_urls -from minecode.utils import fetch_http, get_temp_file -from packagedcode.models import PackageData import hashlib +import logging import re +from urllib.parse import urlparse + import requests -from packagedcode.maven import get_urls +from packagedcode.maven import _parse from packagedcode.maven import get_maven_pom +from packagedcode.maven import get_urls +from packagedcode.models import PackageData +from packageurl import PackageURL + from minecode import priority_router +from minecode.miners.maven import build_url_and_filename +from minecode.miners.maven import get_artifacts +from minecode.miners.maven import is_worthy_artifact +from minecode.utils import fetch_http +from minecode.utils import get_temp_file from minecode.utils import validate_sha1 -from packagedb.models import make_relationship from packagedb.models import PackageContentType from packagedb.models import PackageRelation from packagedb.models import make_relationship -from packageurl import PackageURL -from packagedcode.maven import _parse -import logging - logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -37,12 +38,15 @@ if TRACE: import sys + logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.DEBUG) -MAVEN_BASE_URL = 'https://repo1.maven.org/maven2' -MAVEN_INDEX_URL = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz' +MAVEN_BASE_URL = "https://repo1.maven.org/maven2" +MAVEN_INDEX_URL = ( + "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz" +) class MavenNexusCollector: @@ -59,15 +63,13 @@ def fetch_index(self, uri=MAVEN_INDEX_URL, timeout=10): `timeout` is a default timeout. """ content = fetch_http(uri, timeout=timeout) - temp_file = get_temp_file('NonPersistentHttpVisitor') - with open(temp_file, 'wb') as tmp: + temp_file = get_temp_file("NonPersistentHttpVisitor") + with open(temp_file, "wb") as tmp: tmp.write(content) return temp_file def get_packages(self, content=None): - """ - Yield Package objects from maven index - """ + """Yield Package objects from maven index""" if content: index_location = content else: @@ -86,12 +88,12 @@ def get_packages(self, content=None): continue qualifiers = {} - if extension and extension != 'jar': - qualifiers['type'] = extension + if extension and extension != "jar": + qualifiers["type"] = extension classifier = artifact.classifier if classifier: - qualifiers['classifier'] = classifier + qualifiers["classifier"] = classifier # FIXME: also use the Artifact.src_exist flags too? @@ -100,7 +102,8 @@ def get_packages(self, content=None): # instead togther with the filename... especially we could use # different REPOs. 
jar_download_url, _ = build_url_and_filename( - group_id, artifact_id, version, extension, classifier) + group_id, artifact_id, version, extension, classifier + ) # FIXME: should this be set in the yielded URI too last_mod = artifact.last_modified @@ -112,12 +115,12 @@ def get_packages(self, content=None): qualifiers=qualifiers or None, ) - repository_homepage_url = urls['repository_homepage_url'] - repository_download_url = urls['repository_download_url'] - api_data_url = urls['api_data_url'] + repository_homepage_url = urls["repository_homepage_url"] + repository_download_url = urls["repository_download_url"] + api_data_url = urls["api_data_url"] yield PackageData( - type='maven', + type="maven", namespace=group_id, name=artifact_id, version=version, @@ -138,7 +141,7 @@ def get_pom_text(namespace, name, version, qualifiers={}, base_url=MAVEN_BASE_UR field arguments in a string. """ # Create URLs using purl fields - if qualifiers and not isinstance(qualifiers, Dict): + if qualifiers and not isinstance(qualifiers, dict): return urls = get_urls( namespace=namespace, @@ -150,7 +153,7 @@ def get_pom_text(namespace, name, version, qualifiers={}, base_url=MAVEN_BASE_UR if not urls: return # Get and parse POM info - pom_url = urls['api_data_url'] + pom_url = urls["api_data_url"] # TODO: manage different types of errors (404, etc.) response = requests.get(pom_url) if not response: @@ -159,9 +162,7 @@ def get_pom_text(namespace, name, version, qualifiers={}, base_url=MAVEN_BASE_UR def fetch_parent(pom_text, base_url=MAVEN_BASE_URL): - """ - Return the parent pom text of `pom_text`, or None if `pom_text` has no parent. - """ + """Return the parent pom text of `pom_text`, or None if `pom_text` has no parent.""" if not pom_text: return pom = get_maven_pom(text=pom_text) @@ -223,13 +224,11 @@ def get_merged_ancestor_package_from_maven_package(package, base_url=MAVEN_BASE_ def merge_parent(package, parent_package): - """ - Merge `parent_package` data into `package` and return `package. 
- """ + """Merge `parent_package` data into `package` and return `package.""" mergeable_fields = ( - 'declared_license_expression', - 'homepage_url', - 'parties', + "declared_license_expression", + "homepage_url", + "parties", ) for field in mergeable_fields: # If `field` is empty on the package we're looking at, populate @@ -238,12 +237,12 @@ def merge_parent(package, parent_package): value = getattr(parent_package, field) setattr(package, field, value) - msg = f'Field `{field}` has been updated using values obtained from the parent POM {parent_package.purl}' - history = package.extra_data.get('history') + msg = f"Field `{field}` has been updated using values obtained from the parent POM {parent_package.purl}" + history = package.extra_data.get("history") if history: - package.extra_data['history'].append(msg) + package.extra_data["history"].append(msg) else: - package.extra_data['history'] = [msg] + package.extra_data["history"] = [msg] return package @@ -257,16 +256,18 @@ def merge_ancestors(ancestor_pom_texts, package): """ for ancestor_pom_text in ancestor_pom_texts: ancestor_package = _parse( - datasource_id='maven_pom', - package_type='maven', - primary_language='Java', + datasource_id="maven_pom", + package_type="maven", + primary_language="Java", text=ancestor_pom_text, ) package = merge_parent(package, ancestor_package) return package -def map_maven_package(package_url, package_content, pipelines, priority=0, reindex_metadata=False): +def map_maven_package( + package_url, package_content, pipelines, priority=0, reindex_metadata=False +): """ Add a maven `package_url` to the PackageDB. @@ -274,13 +275,14 @@ def map_maven_package(package_url, package_content, pipelines, priority=0, reind if ``reindex_metadata`` is True, only reindex metadata and DO NOT rescan the full package. """ - from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package + from minecode.model_utils import add_package_to_scan_queue + from minecode.model_utils import merge_or_create_package db_package = None - error = '' + error = "" - if 'repository_url' in package_url.qualifiers: - base_url = package_url.qualifiers['repository_url'] + if "repository_url" in package_url.qualifiers: + base_url = package_url.qualifiers["repository_url"] else: base_url = MAVEN_BASE_URL @@ -292,15 +294,15 @@ def map_maven_package(package_url, package_content, pipelines, priority=0, reind base_url=base_url, ) if not pom_text: - msg = f'Package does not exist on maven: {package_url}' - error += msg + '\n' + msg = f"Package does not exist on maven: {package_url}" + error += msg + "\n" logger.error(msg) return db_package, error package = _parse( - 'maven_pom', - 'maven', - 'Java', + "maven_pom", + "maven", + "Java", text=pom_text, base_url=base_url, ) @@ -320,11 +322,11 @@ def map_maven_package(package_url, package_content, pipelines, priority=0, reind # url is not properly generated since it would be missing the sources bit # from the filename. 
package.qualifiers = package_url.qualifiers - package.download_url = urls['repository_download_url'] - package.repository_download_url = urls['repository_download_url'] + package.download_url = urls["repository_download_url"] + package.repository_download_url = urls["repository_download_url"] # Set package_content value - package.extra_data['package_content'] = package_content + package.extra_data["package_content"] = package_content # If sha1 exists for a jar, we know we can create the package # Use pom info as base and create packages for binary and source package @@ -334,32 +336,34 @@ def map_maven_package(package_url, package_content, pipelines, priority=0, reind if sha1: package.sha1 = sha1 override = reindex_metadata - db_package, _, _, _ = merge_or_create_package(package, visit_level=50, override=override) + db_package, _, _, _ = merge_or_create_package( + package, visit_level=50, override=override + ) else: - msg = f'Failed to retrieve JAR: {package_url}' - error += msg + '\n' + msg = f"Failed to retrieve JAR: {package_url}" + error += msg + "\n" logger.error(msg) if not reindex_metadata: # Submit package for scanning if db_package: add_package_to_scan_queue( - package=db_package, - pipelines=pipelines, - priority=priority + package=db_package, pipelines=pipelines, priority=priority ) return db_package, error -def map_maven_binary_and_source(package_url, pipelines, priority=0, reindex_metadata=False): +def map_maven_binary_and_source( + package_url, pipelines, priority=0, reindex_metadata=False +): """ Get metadata for the binary and source release of the Maven package `package_url` and save it to the PackageDB. Return an error string for errors that occur, or empty string if there is no error. """ - error = '' + error = "" package, emsg = map_maven_package( package_url=package_url, package_content=PackageContentType.BINARY, @@ -371,7 +375,7 @@ def map_maven_binary_and_source(package_url, pipelines, priority=0, reindex_meta error += emsg source_package_url = package_url - source_package_url.qualifiers['classifier'] = 'sources' + source_package_url.qualifiers["classifier"] = "sources" source_package, emsg = map_maven_package( package_url=source_package_url, package_content=PackageContentType.SOURCE_ARCHIVE, @@ -400,21 +404,21 @@ def map_maven_packages(package_url, pipelines): Return an error string for errors that occur, or empty string if there is no error. """ - error = '' + error = "" namespace = package_url.namespace name = package_url.name # Find all versions of this package - query_params = f'g:{namespace}+AND+a:{name}' - url = f'https://search.maven.org/solrsearch/select?q={query_params}&core=gav' + query_params = f"g:{namespace}+AND+a:{name}" + url = f"https://search.maven.org/solrsearch/select?q={query_params}&core=gav" response = requests.get(url) if response: - package_listings = response.json().get('response', {}).get('docs', []) + package_listings = response.json().get("response", {}).get("docs", []) for listing in package_listings: purl = PackageURL( - type='maven', - namespace=listing.get('g'), - name=listing.get('a'), - version=listing.get('v'), + type="maven", + namespace=listing.get("g"), + name=listing.get("a"), + version=listing.get("v"), ) emsg = map_maven_binary_and_source(purl, pipelines) if emsg: @@ -430,7 +434,7 @@ def get_package_sha1(package): from that. 
""" download_url = package.repository_download_url - sha1_download_url = f'{download_url}.sha1' + sha1_download_url = f"{download_url}.sha1" response = requests.get(sha1_download_url) if response.ok: sha1_contents = response.text.strip().split() @@ -440,12 +444,12 @@ def get_package_sha1(package): # Download JAR and calculate sha1 if we cannot get it from the repo response = requests.get(download_url) if response: - sha1_hash = hashlib.new('sha1', response.content) + sha1_hash = hashlib.new("sha1", response.content) sha1 = sha1_hash.hexdigest() return sha1 -@priority_router.route('pkg:maven/.*') +@priority_router.route("pkg:maven/.*") def process_request(purl_str, **kwargs): """ Process `priority_resource_uri` containing a maven Package URL (PURL) as a @@ -461,19 +465,19 @@ def process_request(purl_str, **kwargs): """ from minecode.model_utils import DEFAULT_PIPELINES - addon_pipelines = kwargs.get('addon_pipelines', []) + addon_pipelines = kwargs.get("addon_pipelines", []) pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) + priority = kwargs.get("priority", 0) try: package_url = PackageURL.from_string(purl_str) except ValueError as e: - error = f'error occured when parsing {purl_str}: {e}' + error = f"error occured when parsing {purl_str}: {e}" return error has_version = bool(package_url.version) if has_version: - reindex_metadata=kwargs.get("reindex_metadata", False) + reindex_metadata = kwargs.get("reindex_metadata", False) error = map_maven_binary_and_source( package_url, pipelines, @@ -493,30 +497,22 @@ def process_request(purl_str, **kwargs): def check_if_file_name_is_linked_on_page(file_name, links, **kwargs): - """ - Return True if `file_name` is in `links` - """ + """Return True if `file_name` is in `links`""" return any(l.endswith(file_name) for l in links) def check_if_page_has_pom_files(links, **kwargs): - """ - Return True of any entry in `links` ends with .pom. - """ - return any(l.endswith('.pom') for l in links) + """Return True of any entry in `links` ends with .pom.""" + return any(l.endswith(".pom") for l in links) def check_if_page_has_directories(links, **kwargs): - """ - Return True if any entry, excluding "../", ends with /. - """ - return any(l.endswith('/') for l in links if l != '../') + """Return True if any entry, excluding "../", ends with /.""" + return any(l.endswith("/") for l in links if l != "../") def check_if_package_version_page(links, **kwargs): - """ - Return True if `links` contains pom files and has no directories - """ + """Return True if `links` contains pom files and has no directories""" return check_if_page_has_pom_files( links=links ) and not check_if_page_has_directories(links=links) @@ -524,7 +520,7 @@ def check_if_package_version_page(links, **kwargs): def check_if_package_page(links, **kwargs): return check_if_file_name_is_linked_on_page( - file_name='maven-metadata.xml', links=links + file_name="maven-metadata.xml", links=links ) and not check_if_page_has_pom_files(links=links) @@ -534,7 +530,7 @@ def check_if_maven_root(links, **kwargs): repo contains "archetype-catalog.xml". """ return check_if_file_name_is_linked_on_page( - file_name='archetype-catalog.xml', links=links + file_name="archetype-catalog.xml", links=links ) @@ -551,23 +547,17 @@ def check_on_page(url, checker): def is_maven_root(url): - """ - Return True if `url` is the root of a Maven repo, False otherwise. 
- """ + """Return True if `url` is the root of a Maven repo, False otherwise.""" return check_on_page(url, check_if_maven_root) def is_package_page(url): - """ - Return True if `url` is a package page on a Maven repo, False otherwise. - """ + """Return True if `url` is a package page on a Maven repo, False otherwise.""" return check_on_page(url, check_if_package_page) def is_package_version_page(url): - """ - Return True if `url` is a package version page on a Maven repo, False otherwise. - """ + """Return True if `url` is a package version page on a Maven repo, False otherwise.""" return check_on_page(url, check_if_package_version_page) @@ -575,14 +565,14 @@ def url_parts(url): parsed_url = urlparse(url) scheme = parsed_url.scheme netloc = parsed_url.netloc - path_segments = [p for p in parsed_url.path.split('/') if p] + path_segments = [p for p in parsed_url.path.split("/") if p] return scheme, netloc, path_segments def create_url(scheme, netloc, path_segments): - url_template = f'{scheme}://{netloc}' - path = '/'.join(path_segments) - return f'{url_template}/{path}' + url_template = f"{scheme}://{netloc}" + path = "/".join(path_segments) + return f"{url_template}/{path}" def get_maven_root(url): @@ -617,34 +607,32 @@ def determine_namespace_name_version_from_url(url, root_url=None): if not root_url: root_url = get_maven_root(url) if not root_url: - raise Exception(f'Error: not a Maven repository: {url}') + raise Exception(f"Error: not a Maven repository: {url}") _, remaining_path_segments = url.split(root_url) - remaining_path_segments = remaining_path_segments.split('/') + remaining_path_segments = remaining_path_segments.split("/") remaining_path_segments = [p for p in remaining_path_segments if p] namespace_segments = [] - package_name = '' - package_version = '' + package_name = "" + package_version = "" for i in range(len(remaining_path_segments)): segment = remaining_path_segments[i] segments = remaining_path_segments[: i + 1] - path = '/'.join(segments) - url_segment = f'{root_url}/{path}' + path = "/".join(segments) + url_segment = f"{root_url}/{path}" if is_package_page(url_segment): package_name = segment elif is_package_version_page(url_segment): package_version = segment else: namespace_segments.append(segment) - namespace = '.'.join(namespace_segments) + namespace = ".".join(namespace_segments) return namespace, package_name, package_version def add_to_import_queue(url, root_url): - """ - Create ImportableURI for the Maven repo package page at `url`. 
- """ + """Create ImportableURI for the Maven repo package page at `url`.""" from minecode.models import ImportableURI data = None @@ -653,42 +641,40 @@ def add_to_import_queue(url, root_url): data = response.text namespace, name, _ = determine_namespace_name_version_from_url(url, root_url) purl = PackageURL( - type='maven', + type="maven", namespace=namespace, name=name, ) importable_uri = ImportableURI.objects.insert(url, data, purl) if importable_uri: - logger.info(f'Inserted {url} into ImportableURI queue') + logger.info(f"Inserted {url} into ImportableURI queue") def filter_only_directories(timestamps_by_links): - """ - Given a mapping of `timestamps_by_links`, where the links are directory names (which end with `/`), - """ + """Given a mapping of `timestamps_by_links`, where the links are directory names (which end with `/`),""" timestamps_by_links_filtered = {} for link, timestamp in timestamps_by_links.items(): - if link != '../' and link.endswith('/'): + if link != "../" and link.endswith("/"): timestamps_by_links_filtered[link] = timestamp return timestamps_by_links_filtered valid_artifact_extensions = [ - 'ejb3', - 'ear', - 'aar', - 'apk', - 'gem', - 'jar', - 'nar', + "ejb3", + "ear", + "aar", + "apk", + "gem", + "jar", + "nar", # 'pom', - 'so', - 'swc', - 'tar', - 'tar.gz', - 'war', - 'xar', - 'zip', + "so", + "swc", + "tar", + "tar.gz", + "war", + "xar", + "zip", ] @@ -714,8 +700,8 @@ def collect_links_from_text(text, filter): links_and_timestamps = collect_links_and_artifact_timestamps(text) timestamps_by_links = {} for link, timestamp in links_and_timestamps: - if timestamp == '-': - timestamp = '' + if timestamp == "-": + timestamp = "" timestamps_by_links[link] = timestamp timestamps_by_links = filter(timestamps_by_links=timestamps_by_links) @@ -728,19 +714,17 @@ def create_absolute_urls_for_links(text, url, filter): links from `url` and their timestamps, that is then filtered by `filter`. """ timestamps_by_absolute_links = {} - url = url.rstrip('/') + url = url.rstrip("/") timestamps_by_links = collect_links_from_text(text, filter) for link, timestamp in timestamps_by_links.items(): if not link.startswith(url): - link = f'{url}/{link}' + link = f"{url}/{link}" timestamps_by_absolute_links[link] = timestamp return timestamps_by_absolute_links def get_directory_links(url): - """ - Return a list of absolute directory URLs of the hyperlinks from `url` - """ + """Return a list of absolute directory URLs of the hyperlinks from `url`""" timestamps_by_directory_links = {} response = requests.get(url) if response: @@ -751,9 +735,7 @@ def get_directory_links(url): def get_artifact_links(url): - """ - Return a list of absolute directory URLs of the hyperlinks from `url` - """ + """Return a list of absolute directory URLs of the hyperlinks from `url`""" timestamps_by_artifact_links = [] response = requests.get(url) if response: @@ -764,9 +746,7 @@ def get_artifact_links(url): def crawl_to_package(url, root_url): - """ - Given a maven repo `url`, - """ + """Given a maven repo `url`,""" if is_package_page(url): add_to_import_queue(url, root_url) return @@ -784,11 +764,9 @@ def crawl_maven_repo_from_root(root_url): def get_artifact_sha1(artifact_url): - """ - Return the SHA1 value of the Maven artifact located at `artifact_url`. 
- """ + """Return the SHA1 value of the Maven artifact located at `artifact_url`.""" sha1 = None - artifact_sha1_url = f'{artifact_url}.sha1' + artifact_sha1_url = f"{artifact_url}.sha1" response = requests.get(artifact_sha1_url) if response: sha1_contents = response.text.strip().split() @@ -806,18 +784,18 @@ def get_classifier_from_artifact_url( """ classifier = None # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0 - package_version_page_url = package_version_page_url.rstrip('/') + package_version_page_url = package_version_page_url.rstrip("/") # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0 - leading_url_portion = f'{package_version_page_url}/{package_name}-{package_version}' + leading_url_portion = f"{package_version_page_url}/{package_name}-{package_version}" # artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' # ['', '-onejar.jar'] _, remaining_url_portion = artifact_url.split(leading_url_portion) # ['-onejar', 'jar'] - remaining_url_portions = remaining_url_portion.split('.') + remaining_url_portions = remaining_url_portion.split(".") if remaining_url_portions and remaining_url_portions[0]: # '-onejar' classifier = remaining_url_portions[0] - if classifier.startswith('-'): + if classifier.startswith("-"): # 'onejar' classifier = classifier[1:] return classifier diff --git a/minecode/collectors/npm.py b/minecode/collectors/npm.py index 10ab3575..50593744 100644 --- a/minecode/collectors/npm.py +++ b/minecode/collectors/npm.py @@ -9,13 +9,13 @@ import logging -from packagedb.models import PackageContentType -from minecode import priority_router import requests -from packageurl import PackageURL -from packagedcode.npm import npm_api_url from packagedcode.npm import NpmPackageJsonHandler +from packagedcode.npm import npm_api_url +from packageurl import PackageURL +from minecode import priority_router +from packagedb.models import PackageContentType """ Collect NPM packages from npm registries. 
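# [Editor's note] A minimal, self-contained sketch of the classifier derivation
# implemented by get_classifier_from_artifact_url() in the maven.py hunks above,
# using the example URL quoted in that function's own comments. Illustrative
# only; not part of this patch.
page = "https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0"
artifact_url = f"{page}/livereload-jvm-0.2.0-onejar.jar"
leading_url_portion = f"{page}/livereload-jvm-0.2.0"
# split() yields ["", "-onejar.jar"]; keep the trailing portion
_, remaining = artifact_url.split(leading_url_portion)
classifier = remaining.split(".")[0]  # "-onejar"
if classifier.startswith("-"):
    classifier = classifier[1:]  # "onejar"
assert classifier == "onejar"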
@@ -63,29 +63,25 @@ def map_npm_package(package_url, pipelines, priority=0): ) if not package_json: - error = f'Package does not exist on npmjs: {package_url}' + error = f"Package does not exist on npmjs: {package_url}" logger.error(error) return error - package = NpmPackageJsonHandler._parse( - json_data=package_json - ) - package.extra_data['package_content'] = PackageContentType.SOURCE_ARCHIVE + package = NpmPackageJsonHandler._parse(json_data=package_json) + package.extra_data["package_content"] = PackageContentType.SOURCE_ARCHIVE db_package, _, _, error = merge_or_create_package(package, visit_level=0) # Submit package for scanning if db_package: add_package_to_scan_queue( - package=db_package, - pipelines=pipelines, - priority=priority + package=db_package, pipelines=pipelines, priority=priority ) return error -@priority_router.route('pkg:npm/.*') +@priority_router.route("pkg:npm/.*") def process_request(purl_str, **kwargs): """ Process `priority_resource_uri` containing a npm Package URL (PURL) as a @@ -97,9 +93,9 @@ def process_request(purl_str, **kwargs): """ from minecode.model_utils import DEFAULT_PIPELINES - addon_pipelines = kwargs.get('addon_pipelines', []) + addon_pipelines = kwargs.get("addon_pipelines", []) pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) + priority = kwargs.get("priority", 0) package_url = PackageURL.from_string(purl_str) if not package_url.version: diff --git a/minecode/collectors/openssl.py b/minecode/collectors/openssl.py index aee9a54e..a74cb421 100644 --- a/minecode/collectors/openssl.py +++ b/minecode/collectors/openssl.py @@ -8,13 +8,14 @@ # from packageurl import PackageURL -from minecode import priority_router +from minecode import priority_router from minecode.collectors.generic import map_fetchcode_supported_package + # Indexing OpenSSL PURLs requires a GitHub API token. # Please add your GitHub API key to the `.env` file, for example: `GH_TOKEN=your-github-api`. 
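# [Editor's note] The collectors in this patch (npm above, OpenSSL below) share
# one keyword-argument contract: callers may pass "addon_pipelines" and
# "priority", which are folded into the default pipelines before queueing. A
# hedged usage sketch, with a stand-in tuple for DEFAULT_PIPELINES and a
# hypothetical add-on pipeline name; not part of this patch.
from packageurl import PackageURL

DEFAULT_PIPELINES = ("scan_single_package",)  # stand-in, not the real setting


def build_request_args(purl_str, **kwargs):
    # Mirrors the pattern used by each process_request() in these hunks.
    pipelines = DEFAULT_PIPELINES + tuple(kwargs.get("addon_pipelines", []))
    priority = kwargs.get("priority", 0)
    # PackageURL.from_string() raises ValueError for malformed purls, which
    # the collectors above convert into an error string.
    return PackageURL.from_string(purl_str), pipelines, priority


purl, pipelines, priority = build_request_args(
    "pkg:npm/lodash@4.17.21", addon_pipelines=["fingerprint_codebase"], priority=100
)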
-@priority_router.route('pkg:openssl/openssl@.*') +@priority_router.route("pkg:openssl/openssl@.*") def process_request_dir_listed(purl_str, **kwargs): """ Process `priority_resource_uri` containing a OpenSSL Package URL (PURL) @@ -26,9 +27,9 @@ def process_request_dir_listed(purl_str, **kwargs): """ from minecode.model_utils import DEFAULT_PIPELINES - addon_pipelines = kwargs.get('addon_pipelines', []) + addon_pipelines = kwargs.get("addon_pipelines", []) pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) + priority = kwargs.get("priority", 0) try: package_url = PackageURL.from_string(purl_str) diff --git a/minecode/command.py b/minecode/command.py index b46f29ae..7e22aee1 100644 --- a/minecode/command.py +++ b/minecode/command.py @@ -15,13 +15,12 @@ from minecode import ON_WINDOWS - logger = logging.getLogger(__name__) # FIXME: use commoncode instead -class Command(object): +class Command: """Simple wrapper around a subprocess.""" def __init__(self, command, env=None, cwd=None): @@ -31,15 +30,17 @@ def __init__(self, command, env=None, cwd=None): self.start() def start(self): - self.proc = subprocess.Popen(self.command, - shell=True, - cwd=self.cwd, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=self.env, - universal_newlines=True, - close_fds=not ON_WINDOWS) + self.proc = subprocess.Popen( + self.command, + shell=True, + cwd=self.cwd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=self.env, + universal_newlines=True, + close_fds=not ON_WINDOWS, + ) self.returncode = self.proc.returncode def execute(self): @@ -49,15 +50,16 @@ def stop(self): if not self.proc: return - close_pipe(getattr(self.proc, 'stdin', None)) - close_pipe(getattr(self.proc, 'stderr', None)) - close_pipe(getattr(self.proc, 'stdout', None)) + close_pipe(getattr(self.proc, "stdin", None)) + close_pipe(getattr(self.proc, "stderr", None)) + close_pipe(getattr(self.proc, "stdout", None)) # Ensure process death in all cases, otherwise proc.wait seems to hang # in some cases def kill(sig, fun): if self.proc and self.proc.poll() is None: self.proc.kill() + signal.signal(signal.SIGALRM, kill) # @UndefinedVariable signal.alarm(5) # @UndefinedVariable @@ -71,5 +73,5 @@ def close_pipe(pipe): return try: pipe.close() - except IOError: + except OSError: pass diff --git a/minecode/debutils.py b/minecode/debutils.py index 5c33f531..04e28bdc 100644 --- a/minecode/debutils.py +++ b/minecode/debutils.py @@ -15,35 +15,29 @@ def parse_email(text): """ if not text: return None, None - name, _, email = text.partition('<') - email = email.strip('>') + name, _, email = text.partition("<") + email = email.strip(">") name = name.strip() email = email.strip() return name or None, email or None def comma_separated(text): - """ - Return a list of strings from a comma-separated text. - """ + """Return a list of strings from a comma-separated text.""" if not text: return [] - return [t.strip() for t in text.split(',') if t and t.strip()] + return [t.strip() for t in text.split(",") if t and t.strip()] def fold(value): - """ - Return a folded `value` string. - """ + """Return a folded `value` string.""" if not value: return value - return ''.join(value.split()) + return "".join(value.split()) def line_separated(value): - """ - Return a list of values from a `value` string using line delimiters. 
-    """
+    """Return a list of values from a `value` string using line delimiters."""
     if not value:
         return []
     return [v.strip() for v in value.splitlines(False) if v]
diff --git a/minecode/filter.py b/minecode/filter.py
index 38d89c49..371e9415 100644
--- a/minecode/filter.py
+++ b/minecode/filter.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf8 -*-
 #
 # Copyright (c) nexB Inc. and others. All rights reserved.
 # purldb is a trademark of nexB Inc.
@@ -22,22 +21,23 @@ def sf_net(input_file, output):
     adding new columns and trying to sf_net the data
     """
     download_url_template = (
-        'http://master.dl.sourceforge.net/project'
-        '/%(project_id)s%(filename)s'
+        "http://master.dl.sourceforge.net/project" "/%(project_id)s%(filename)s"
     )
 
-    existing_headers = ('external_id,name,version,license,owners,'
-                        'homepage_url,keywords,description'.split(',')
-                        )
+    existing_headers = (
+        "external_id,name,version,license,owners,"
+        "homepage_url,keywords,description".split(",")
+    )
 
-    new_headers = ('computed_version,release_date_ts,file_download_url,'
-                   'reviewed,curated_name,excluded_reason,curated_owner,'
-                   'owner_type'.split(',')
-                   )
+    new_headers = (
+        "computed_version,release_date_ts,file_download_url,"
+        "reviewed,curated_name,excluded_reason,curated_owner,"
+        "owner_type".split(",")
+    )
 
-    with open(output, 'w') as fo:
+    with open(output, "w") as fo:
         writer = csv.writer(fo, quoting=csv.QUOTE_ALL)
-        with open(input_file, 'r') as fi:
+        with open(input_file) as fi:
             reader = csv.reader(fi)
             for i, l in enumerate(reader):
                 if i == 0:
@@ -48,7 +48,7 @@ def sf_net(input_file, output):
                 project_id = l[0]
                 name = l[1]
                 version_column = l[2]
-                sep = ': released on '
+                sep = ": released on "
                 if sep not in version_column:
                     # write as is if we do not have a file release date
                     # separator
@@ -56,23 +56,23 @@ def sf_net(input_file, output):
                     continue
                 filename, release_date_ts = version_column.split(sep, 1)
                 found_version = version.version_hint(filename)
-                l.append(found_version or '')
-                l.append(release_date_ts or '')
+                l.append(found_version or "")
+                l.append(release_date_ts or "")
                 l.append(download_url_template % locals())
-                l.append('')  # reviewed
-                l.append('')  # curated name
-                excluded_reason = ''
-                if '.' in project_id:
-                    excluded_reason = 'mirror or special project'
+                l.append("")  # reviewed
+                l.append("")  # curated name
+                excluded_reason = ""
+                if "." in project_id:
+                    excluded_reason = "mirror or special project"
                 elif not found_version:
-                    excluded_reason = 'no version'
+                    excluded_reason = "no version"
                 elif not good_name(name):
-                    excluded_reason = 'special chars in name'
+                    excluded_reason = "special chars in name"
                 elif not good_filename(project_id, filename, name):
-                    excluded_reason = 'multi component possible'
+                    excluded_reason = "multi component possible"
                 l.append(excluded_reason)
-                l.append('')  # curated_owner
-                l.append('')  # owner_type
+                l.append("")  # curated_owner
+                l.append("")  # owner_type
                 writer.writerow(l)
 
 
@@ -85,9 +85,11 @@ def good_name(s):
     -- there is a punctuation sign string.punctuation
     -- there is non-ascii letters string.letters + string.digit
     """
-    return (s
-            and all(c not in string.punctuation for c in s)
-            and all(c in string.ascii_lowercase for c in s.lower()))
+    return (
+        s
+        and all(c not in string.punctuation for c in s)
+        and all(c in string.ascii_lowercase for c in s.lower())
+    )
 
 
 def good_filename(pid, fn, name):
diff --git a/minecode/indexing.py b/minecode/indexing.py
index ba6a1e36..797f6d15 100644
--- a/minecode/indexing.py
+++ b/minecode/indexing.py
@@ -31,7 +31,9 @@ def index_package_files(package, scan_data, reindex=False):
     deleted and recreated from `scan_data`.
     """
     if reindex:
-        logger.info(f'Deleting fingerprints and Resources related to {package.package_url}')
+        logger.info(
+            f"Deleting fingerprints and Resources related to {package.package_url}"
+        )
         package.approximatedirectorycontentindex_set.all().delete()
         package.approximatedirectorystructureindex_set.all().delete()
         package.approximateresourcecontentindex_set.all().delete()
@@ -40,21 +42,24 @@ def index_package_files(package, scan_data, reindex=False):
 
     scan_index_errors = []
     try:
-        logger.info(f'Indexing Resources and fingerprints related to {package.package_url} from scan data')
-        for resource in scan_data.get('files', []):
+        logger.info(
+            f"Indexing Resources and fingerprints related to {package.package_url} from scan data"
+        )
+        for resource in scan_data.get("files", []):
             r, _, _ = update_or_create_resource(package, resource)
             path = r.path
             sha1 = r.sha1
             if sha1:
-                _, _ = ExactFileIndex.index(
-                    sha1=sha1,
-                    package=package
-                )
+                _, _ = ExactFileIndex.index(sha1=sha1, package=package)
 
-            resource_extra_data = resource.get('extra_data', {})
-            directory_content_fingerprint = resource_extra_data.get('directory_content', '')
-            directory_structure_fingerprint = resource_extra_data.get('directory_structure', '')
-            halo1 = resource_extra_data.get('halo1', '')
+            resource_extra_data = resource.get("extra_data", {})
+            directory_content_fingerprint = resource_extra_data.get(
+                "directory_content", ""
+            )
+            directory_structure_fingerprint = resource_extra_data.get(
+                "directory_structure", ""
+            )
+            halo1 = resource_extra_data.get("halo1", "")
 
             if directory_content_fingerprint:
                 _, _ = ApproximateDirectoryContentIndex.index(
@@ -85,50 +90,56 @@ def index_package_files(package, scan_data, reindex=False):
     return scan_index_errors
 
 
-def index_package(scannable_uri, package, scan_data, summary_data, project_extra_data, reindex=False):
+def index_package(
+    scannable_uri, package, scan_data, summary_data, project_extra_data, reindex=False
+):
     scan_index_errors = []
     try:
         indexing_errors = index_package_files(package, scan_data, reindex=reindex)
         scan_index_errors.extend(indexing_errors)
-        declared_license_expression = summary_data.get('declared_license_expression')
-        other_license_expressions = summary_data.get('other_license_expressions', [])
-        other_license_expressions = [l['value'] for l in other_license_expressions if l['value']]
+        declared_license_expression = summary_data.get("declared_license_expression")
+        other_license_expressions = summary_data.get("other_license_expressions", [])
+        other_license_expressions = [
+            l["value"] for l in other_license_expressions if l["value"]
+        ]
         other_license_expression = combine_expressions(other_license_expressions)
 
-        copyright = ''
-        declared_holder = summary_data.get('declared_holder')
+        copyright = ""
+        declared_holder = summary_data.get("declared_holder")
         if declared_holder:
-            copyright = f'Copyright (c) {declared_holder}'
+            copyright = f"Copyright (c) {declared_holder}"
 
         checksums_and_size_by_field = {
             k: v
             for k, v in project_extra_data.items()
-            if k in [
-                'md5','sha1', 'size', 'sha256', 'sha512', 'filename'
-            ]
+            if k in ["md5", "sha1", "size", "sha256", "sha512", "filename"]
        }
         values_by_updateable_fields = {
-            'summary': summary_data,
-            'declared_license_expression': declared_license_expression,
-            'other_license_expression': other_license_expression,
-            'copyright': copyright,
-            **checksums_and_size_by_field
+            "summary": summary_data,
+            "declared_license_expression": declared_license_expression,
+            "other_license_expression": other_license_expression,
+            "copyright": copyright,
+            **checksums_and_size_by_field,
         }
 
         # do not override fields with empty values
-        values_by_updateable_fields = {k: v for k, v in values_by_updateable_fields.items() if v}
-
-        _, updated_fields = package.update_fields(save=True, **values_by_updateable_fields)
-        updated_fields = ', '.join(updated_fields)
-        message = f'Updated fields for Package {package.purl}: {updated_fields}'
+        values_by_updateable_fields = {
+            k: v for k, v in values_by_updateable_fields.items() if v
+        }
+
+        _, updated_fields = package.update_fields(
+            save=True, **values_by_updateable_fields
+        )
+        updated_fields = ", ".join(updated_fields)
+        message = f"Updated fields for Package {package.purl}: {updated_fields}"
         logger.info(message)
 
         scannable_uri.scan_status = ScannableURI.SCAN_INDEXED
         scannable_uri.save()
     except Exception:
         traceback_message = traceback.format_exc()
-        error_message = traceback_message + '\n'
+        error_message = traceback_message + "\n"
         # TODO: We should rerun the specific indexers that have failed
         if scan_index_errors:
-            error_message += '\n'.join(scan_index_errors)
+            error_message += "\n".join(scan_index_errors)
         logger.error(error_message)
         scannable_uri.index_error = error_message
         scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED
diff --git a/minecode/ls.py b/minecode/ls.py
index 3d636b8b..14a9ad71 100644
--- a/minecode/ls.py
+++ b/minecode/ls.py
@@ -8,21 +8,21 @@
 #
 
-from datetime import datetime
-from functools import total_ordering
 import logging
 import posixpath
 import stat
+from datetime import datetime
+from functools import total_ordering
 
-from ftputil.stat import UnixParser
 from ftputil.error import ParserError
-
+from ftputil.stat import UnixParser
 
 TRACE = False
 logger = logging.getLogger(__name__)
 if TRACE:
     import sys
+
     logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
     logger.setLevel(logging.DEBUG)
 
@@ -32,20 +32,19 @@
 """
 
 # TODO: use constants for entry types
-DIR = 'd'
-FILE = 'f'
-LINK = 'l'
-SPECIAL = 's'
+DIR = "d"
+FILE = "f"
+LINK = "l"
+SPECIAL = "s"
 
 
 # FIXME: do we really need link and special file support?
 
 
 @total_ordering
-class Entry(object):
-    """
-    Represent a file, directory or link entry in a directory listing.
- """ - __slots__ = 'path', 'type', 'size', 'date', 'target' +class Entry: + """Represent a file, directory or link entry in a directory listing.""" + + __slots__ = "path", "type", "size", "date", "target" def __init__(self, path=None, type=None, size=None, date=None, target=None): # NOQA self.path = path @@ -54,35 +53,37 @@ def __init__(self, path=None, type=None, size=None, date=None, target=None): # self.date = date self.target = target if TRACE: - logger.debug('Entry(): ' + repr(self)) + logger.debug("Entry(): " + repr(self)) def __repr__(self): - base = 'Entry(path=%(path)r, type=%(type)r, size=%(size)r, date=%(date)r' - link_target = ')' + base = "Entry(path=%(path)r, type=%(type)r, size=%(size)r, date=%(date)r" + link_target = ")" if self.type == LINK: - link_target = ', target=%(target)r)' + link_target = ", target=%(target)r)" return (base + link_target) % self.to_dict() def __eq__(self, other): return isinstance(other, Entry) and self.to_dict() == other.to_dict() def __lt__(self, other): - return isinstance(other, Entry) and tuple(self.to_dict().items()) < tuple(other.to_dict().items()) + return isinstance(other, Entry) and tuple(self.to_dict().items()) < tuple( + other.to_dict().items() + ) def __hash__(self): return hash(tuple(self.to_dict().items())) def to_dict(self): return { - 'path': self.path, - 'type': self.type, - 'size': self.size, - 'date': self.date, - 'target': self.target, + "path": self.path, + "type": self.type, + "size": self.size, + "date": self.date, + "target": self.target, } @classmethod - def from_stat(self, stat_result, base_dir='', use_utc_time=True): + def from_stat(self, stat_result, base_dir="", use_utc_time=True): """ Return a new Entry built from a stat-like tuple and a base directory. @@ -119,7 +120,7 @@ def from_stat(self, stat_result, base_dir='', use_utc_time=True): res_type = SPECIAL # rejoin path with base-dir if any - if base_dir and base_dir != '.': + if base_dir and base_dir != ".": base_dir = clean_path(base_dir) path = posixpath.join(base_dir, path) @@ -127,11 +128,9 @@ def from_stat(self, stat_result, base_dir='', use_utc_time=True): def clean_path(path): - """ - Return a path cleaned from leading and trailing slashes and leading ./. - """ - path = path.strip().strip('/') - if path.startswith('./'): + """Return a path cleaned from leading and trailing slashes and leading ./.""" + path = path.strip().strip("/") + if path.startswith("./"): path = path[2:] return path.strip() @@ -141,8 +140,8 @@ def remove_inode(line): Return the line with leading inode number and size in block (which are numbers separated by spaces) are removed. 
""" - _, _, line = line.strip().partition(' ') - _, _, line = line.strip().partition(' ') + _, _, line = line.strip().partition(" ") + _, _, line = line.strip().partition(" ") return line.strip() @@ -161,7 +160,7 @@ def parse_directory_listing(dir_listing, from_find=False): parser = UnixParser() # default in case this would not be a recursive listing: we always need a base dir - base_dir = '' + base_dir = "" for ln, line in enumerate(lines, 1): line = line.strip() if parser.ignores_line(line): @@ -174,26 +173,25 @@ def parse_directory_listing(dir_listing, from_find=False): try: file_stat = parser.parse_line(line) if TRACE: - logger.debug( - 'parse_directory_listing:file_stat: ' + repr(file_stat)) + logger.debug("parse_directory_listing:file_stat: " + repr(file_stat)) dt = datetime.utcfromtimestamp(file_stat.st_mtime) dt = datetime.isoformat(dt) - logger.debug( - 'parse_directory_listing:file_stat:date: ' + repr(dt)) + logger.debug("parse_directory_listing:file_stat:date: " + repr(dt)) - except ParserError as pe: + except ParserError: # this is likely a directory line from an ls -LR listing. Strip # trailing colon and keep track of the base directory - if not line.endswith(':'): + if not line.endswith(":"): raise Exception( - 'Unknown directory listing line format: #%(ln)d: %(line)r' % locals()) - base_dir = line.strip(':') + "Unknown directory listing line format: #%(ln)d: %(line)r" + % locals() + ) + base_dir = line.strip(":") continue - if file_stat._st_name in ('.', '..'): + if file_stat._st_name in (".", ".."): continue - entry = Entry.from_stat( - file_stat, base_dir=base_dir, use_utc_time=False) + entry = Entry.from_stat(file_stat, base_dir=base_dir, use_utc_time=False) if entry: yield entry diff --git a/minecode/management/commands/__init__.py b/minecode/management/commands/__init__.py index 27a8178c..a629f92f 100644 --- a/minecode/management/commands/__init__.py +++ b/minecode/management/commands/__init__.py @@ -9,8 +9,8 @@ import logging -from os import getenv import traceback +from os import getenv from django.conf import settings from django.core.management.base import BaseCommand @@ -25,7 +25,7 @@ class VerboseCommand(BaseCommand): """ def get_verbosity(self, **options): - verbosity = int(options.get('verbosity', 1)) + verbosity = int(options.get("verbosity", 1)) levels = {1: logging.INFO, 2: logging.ERROR, 3: logging.DEBUG} return levels.get(verbosity, logging.CRITICAL) @@ -42,17 +42,13 @@ def stop_handler(cls, *args, **kwargs): def get_error_message(e): - """ - Return an error message with a traceback given an exception. - """ + """Return an error message with a traceback given an exception.""" tb = traceback.format_exc() - msg = e.__class__.__name__ + ' ' + repr(e) - msg += '\n' + tb + msg = e.__class__.__name__ + " " + repr(e) + msg += "\n" + tb return msg def get_settings(var_name): - """ - Return the settings value from the environment or Django settings. 
- """ - return getenv(var_name) or getattr(settings, var_name, None) or '' + """Return the settings value from the environment or Django settings.""" + return getenv(var_name) or getattr(settings, var_name, None) or "" diff --git a/minecode/management/commands/check_licenses.py b/minecode/management/commands/check_licenses.py index 5716c626..62eb32cc 100644 --- a/minecode/management/commands/check_licenses.py +++ b/minecode/management/commands/check_licenses.py @@ -9,18 +9,17 @@ import codecs -from functools import reduce import json import logging import operator import os import sys +from functools import reduce from django.db.models import Q -from packagedb.models import Package - from minecode.management.commands import VerboseCommand +from packagedb.models import Package """ Utility command to find license oddities. @@ -35,19 +34,20 @@ class Command(VerboseCommand): - help = ('Find packages with an ambiguous declared license.') + help = "Find packages with an ambiguous declared license." def add_arguments(self, parser): parser.add_argument( - '-o', '--output', type=str, - help='Define the output file name') + "-o", "--output", type=str, help="Define the output file name" + ) parser.add_argument( - '--types', - dest='types', - default='maven', - action='store', - help='Package types to check, comma-separated [maven]') + "--types", + dest="types", + default="maven", + action="store", + help="Package types to check, comma-separated [maven]", + ) def handle(self, *args, **options): """ @@ -57,37 +57,46 @@ def handle(self, *args, **options): """ logger.setLevel(self.get_verbosity(**options)) - output_filename = options.get('output') + output_filename = options.get("output") - types = options.get('types') - types = [t.strip() for t in types.split(',') if t.strip()] + types = options.get("types") + types = [t.strip() for t in types.split(",") if t.strip()] packages_with_ambiguous_licenses = find_ambiguous_packages(types=types) file_location = os.path.abspath(output_filename) found_counter = dump( - packages=packages_with_ambiguous_licenses, json_location=file_location) + packages=packages_with_ambiguous_licenses, json_location=file_location + ) visited_counter = Package.objects.filter(type__in=types).count() - self.stdout.write('Visited {} packages'.format(visited_counter)) - self.stdout.write('Found {} possible packages'.format(found_counter)) + self.stdout.write(f"Visited {visited_counter} packages") + self.stdout.write(f"Found {found_counter} possible packages") if found_counter > 0: - self.stdout.write( - 'Found packages dumped to: {}'.format(file_location)) + self.stdout.write(f"Found packages dumped to: {file_location}") -def find_ambiguous_packages(types=('maven',), keywords=('unknown', 'proprietary', 'commercial',)): +def find_ambiguous_packages( + types=("maven",), + keywords=( + "unknown", + "proprietary", + "commercial", + ), +): """ Search the package DB and yield the package that declared_license and license_expression contain "unknown", "proprietary" and "commercial" words. 
""" # filter to detect declared_license field filter_expression = [ - Q(declared_license_expression__icontains=word) for word in keywords] + Q(declared_license_expression__icontains=word) for word in keywords + ] # filter to detect license_expression field, add or relationship between these two fields filter_expression.extend( - [Q(other_license_expression__icontains=word) for word in keywords]) + [Q(other_license_expression__icontains=word) for word in keywords] + ) license_filter = reduce(operator.or_, filter_expression) for package in Package.objects.filter(type__in=types).filter(license_filter): @@ -95,13 +104,11 @@ def find_ambiguous_packages(types=('maven',), keywords=('unknown', 'proprietary' def dump(packages, json_location): - """ - Dump the packages as json format at the passing json_location and return the count of the packages. - """ + """Dump the packages as json format at the passing json_location and return the count of the packages.""" if not packages: return 0 packages = [p.to_dict() for p in packages] if packages: - with codecs.open(json_location, mode='wb', encoding='utf-8') as expect: - json.dump(packages, expect, indent=2, separators=(',', ': ')) + with codecs.open(json_location, mode="wb", encoding="utf-8") as expect: + json.dump(packages, expect, indent=2, separators=(",", ": ")) return len(packages) diff --git a/minecode/management/commands/check_uri.py b/minecode/management/commands/check_uri.py index 5308dbff..263ab3ef 100644 --- a/minecode/management/commands/check_uri.py +++ b/minecode/management/commands/check_uri.py @@ -14,10 +14,11 @@ from django.core.management.base import BaseCommand +from minecode import map_router + # NOTE: mappers and visitors are Unused Import here: But importing the mappers # module triggers routes registration from minecode import miners # NOQA -from minecode import map_router from minecode import visit_router from minecode.models import ResourceURI from minecode.route import NoRouteAvailable @@ -30,53 +31,54 @@ class Command(BaseCommand): - help = 'Print diagnostic information on a given URI prefix.' + help = "Print diagnostic information on a given URI prefix." def add_arguments(self, parser): parser.add_argument( - '--uri-prefix', - dest='uri_prefix', - action='store', - help='URI prefix to check.') + "--uri-prefix", + dest="uri_prefix", + action="store", + help="URI prefix to check.", + ) parser.add_argument( - '--limit', - dest='limit', + "--limit", + dest="limit", default=10, - action='store', - help='Maximum number of records to return.') + action="store", + help="Maximum number of records to return.", + ) parser.add_argument( - '--show-data', - dest='show_data', + "--show-data", + dest="show_data", default=False, - action='store_true', - help='URI prefix to check.') + action="store_true", + help="URI prefix to check.", + ) def handle(self, *args, **options): - """ - Check uris and print diagnostic information as JSON. 
-        """
-        uri_prefix = options.get('uri_prefix')
-        limit = options.get('limit', 10)
-        show_data = options.get('show_data')
+        """Check uris and print diagnostic information as JSON."""
+        uri_prefix = options.get("uri_prefix")
+        limit = options.get("limit", 10)
+        show_data = options.get("show_data")
 
         # get the last 10 uris
-        uris = ResourceURI.objects.filter(
-            uri__startswith=uri_prefix).order_by("-id")[:limit]
+        uris = ResourceURI.objects.filter(uri__startswith=uri_prefix).order_by("-id")[
+            :limit
+        ]
 
         # TODO: add if the uri be resolved by visit and/or map router
         for uri in uris:
-
             try:
                 # FIXME: resolve() returns an acutal Visitor object, using module names for now
                 visit_route_resolve = repr(visit_router.resolve(uri.uri))
             except NoRouteAvailable:
-                visit_route_resolve = 'No Route Availible'
+                visit_route_resolve = "No Route Available"
 
             try:
                 # FIXME: resolve() returns an acutal Mapper object, using module names for now
                 map_route_resolve = repr(map_router.resolve(uri.uri))
             except NoRouteAvailable:
-                map_route_resolve = 'No Route Availible'
+                map_route_resolve = "No Route Available"
 
             if uri.last_visit_date:
                 last_visit_date = uri.last_visit_date.isoformat()
@@ -93,24 +95,26 @@ def handle(self, *args, **options):
             else:
                 wip_date = None
 
-            uri_info = dict([
-                ('id', uri.id),
-                ('uri', uri.uri),
-                ('source_uri', uri.source_uri),
-                ('priority', uri.priority),
-                ('mining_level', uri.mining_level),
-                ('visit_route', visit_route_resolve),
-                ('map_route', map_route_resolve),
-                ('is_visitable', uri.is_visitable),
-                ('is_mappable', uri.is_mappable),
-                ('last_visit_date', last_visit_date),
-                ('last_map_date', last_map_date),
-                ('wip_date', wip_date),
-                ('visit_error', uri.visit_error),
-                ('map_error', uri.map_error),
-            ])
+            uri_info = dict(
+                [
+                    ("id", uri.id),
+                    ("uri", uri.uri),
+                    ("source_uri", uri.source_uri),
+                    ("priority", uri.priority),
+                    ("mining_level", uri.mining_level),
+                    ("visit_route", visit_route_resolve),
+                    ("map_route", map_route_resolve),
+                    ("is_visitable", uri.is_visitable),
+                    ("is_mappable", uri.is_mappable),
+                    ("last_visit_date", last_visit_date),
+                    ("last_map_date", last_map_date),
+                    ("wip_date", wip_date),
+                    ("visit_error", uri.visit_error),
+                    ("map_error", uri.map_error),
+                ]
+            )
 
             if show_data:
-                uri_info.update({'data': uri.data})
+                uri_info.update({"data": uri.data})
 
             print(json.dumps(uri_info, indent=2))
diff --git a/minecode/management/commands/create-scan-queue-worker-user.py b/minecode/management/commands/create-scan-queue-worker-user.py
index 7708f963..7111c539 100644
--- a/minecode/management/commands/create-scan-queue-worker-user.py
+++ b/minecode/management/commands/create-scan-queue-worker-user.py
@@ -8,24 +8,24 @@
 #
 
 from django.contrib.auth.models import Group
+
 from minecode.management.user_creation import CreateUserCommand
 
 
 class Command(CreateUserCommand):
-    help = 'Create a user and generate an API key for a scan queue worker'
+    help = "Create a user and generate an API key for a scan queue worker"
 
     def handle(self, *args, **options):
-        username = options['username']
-        interactive = options['interactive']
-        verbosity = options['verbosity']
+        username = options["username"]
+        interactive = options["interactive"]
+        verbosity = options["verbosity"]
 
         user = self.create_user(
-            username=username,
-            interactive=interactive,
-            verbosity=verbosity
+            username=username, interactive=interactive, verbosity=verbosity
         )
 
         # Add user to `scan_queue_workers` group
         scan_queue_workers_group, _ = Group.objects.get_or_create(
-            name='scan_queue_workers')
+            name="scan_queue_workers"
+        )
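# [Editor's note] Illustrative aside, not part of the patch: Django's
# get_or_create() returns an (object, created) tuple, which is why the call
# above unpacks into `scan_queue_workers_group, _`; the boolean "created" flag
# is discarded because the command needs the group whether or not it already
# existed. Equivalent standalone usage:
#
#     from django.contrib.auth.models import Group
#     group, created = Group.objects.get_or_create(name="scan_queue_workers")
#     group.user_set.add(user)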
scan_queue_workers_group.user_set.add(user) - msg = f'User {username} added to `scan_queue_workers` group' + msg = f"User {username} added to `scan_queue_workers` group" self.stdout.write(msg, self.style.SUCCESS) diff --git a/minecode/management/commands/create-user.py b/minecode/management/commands/create-user.py index 47087f27..9b618ba1 100644 --- a/minecode/management/commands/create-user.py +++ b/minecode/management/commands/create-user.py @@ -11,14 +11,12 @@ class Command(CreateUserCommand): - help = 'Create a user and generate an API key for a scan queue worker' + help = "Create a user and generate an API key for a scan queue worker" def handle(self, *args, **options): - username = options['username'] - interactive = options['interactive'] - verbosity = options['verbosity'] + username = options["username"] + interactive = options["interactive"] + verbosity = options["verbosity"] self.create_user( - username=username, - interactive=interactive, - verbosity=verbosity + username=username, interactive=interactive, verbosity=verbosity ) diff --git a/minecode/management/commands/dump_purls.py b/minecode/management/commands/dump_purls.py index a81d8aba..62494819 100644 --- a/minecode/management/commands/dump_purls.py +++ b/minecode/management/commands/dump_purls.py @@ -14,20 +14,20 @@ def dump_purls(package_type, output): - """ - Dump packagedb purls for ``package_type`` as JSON lines in the ``output`` files - """ - compact_separators = (u',', u':',) + """Dump packagedb purls for ``package_type`` as JSON lines in the ``output`` files""" + compact_separators = ( + ",", + ":", + ) out = None for i, package in enumerate(Package.objects.filter(type=package_type).all()): if not output: out = open(f"{output}-{i}.json", "w") - purl = dict(purl=package.package_url, - download_url=package.download_url) + purl = dict(purl=package.package_url, download_url=package.download_url) if not i % 500: print(f"#{i} purl: {package.package_url}") out.write(json.dumps(purl, separators=compact_separators)) - out.write('\n') + out.write("\n") if not i % 1000000: out.close() out = None diff --git a/minecode/management/commands/get_maven_release_dates.py b/minecode/management/commands/get_maven_release_dates.py index 257f7e77..e3ab2f35 100644 --- a/minecode/management/commands/get_maven_release_dates.py +++ b/minecode/management/commands/get_maven_release_dates.py @@ -7,19 +7,18 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# -from dateutil.parser import parse as dateutil_parse -from os.path import dirname import logging import sys +from os.path import dirname import requests +from dateutil.parser import parse as dateutil_parse -from minecode.management.commands import VerboseCommand from minecode.collectors.maven import collect_links_from_text from minecode.collectors.maven import filter_for_artifacts +from minecode.management.commands import VerboseCommand from packagedb.models import Package - logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.INFO) @@ -30,51 +29,56 @@ class Command(VerboseCommand): - help = 'Get and set release_date for Maven Packages' + help = "Get and set release_date for Maven Packages" def handle(self, *args, **options): queryset = Package.objects.filter( - type='maven', + type="maven", release_date=None, - download_url__startswith='https://repo1.maven.org/maven2' + download_url__startswith="https://repo1.maven.org/maven2", ) object_count = queryset.count() chunk_size = 2000 iterator = queryset.iterator(chunk_size=chunk_size) unsaved_objects = [] - logger.info(f'Updating release_date for {object_count} packages') + logger.info(f"Updating release_date for {object_count} packages") for index, package in enumerate(iterator, start=1): download_url = package.download_url package_url = package.package_url logger.info( - f'Updating release_date for package {package_url} ({download_url})') + f"Updating release_date for package {package_url} ({download_url})" + ) package_version_page_url = dirname(download_url) - filename = download_url.rsplit('/')[-1] + filename = download_url.rsplit("/")[-1] response = requests.get(package_version_page_url) if response: timestamps_by_links = collect_links_from_text( - response.text, filter=filter_for_artifacts) + response.text, filter=filter_for_artifacts + ) timestamp = timestamps_by_links.get(filename) if not timestamp: logger.info( - f'\tCould not get release_date for package {package_url} ({download_url})') + f"\tCould not get release_date for package {package_url} ({download_url})" + ) continue timestamp = dateutil_parse(timestamp) package.release_date = timestamp unsaved_objects.append(package) logger.info( - f'\t{package_url} ({download_url}) release_date has been updated to {timestamp}') + f"\t{package_url} ({download_url}) release_date has been updated to {timestamp}" + ) else: logger.info( - f'\t{package_url} not updated: error encountered when visiting {package_version_page_url}') + f"\t{package_url} not updated: error encountered when visiting {package_version_page_url}" + ) if not (index % chunk_size) and unsaved_objects: - logger.info(f'{index:,} / {object_count:,} Packages processed') + logger.info(f"{index:,} / {object_count:,} Packages processed") - logger.info('Updating Package objects...') + logger.info("Updating Package objects...") updated_packages_count = Package.objects.bulk_update( objs=unsaved_objects, - fields=['release_date'], + fields=["release_date"], batch_size=1000, ) - logger.info(f'Updated {updated_packages_count} Package objects') + logger.info(f"Updated {updated_packages_count} Package objects") diff --git a/minecode/management/commands/get_status.py b/minecode/management/commands/get_status.py index 144d2850..c92358be 100644 --- a/minecode/management/commands/get_status.py +++ b/minecode/management/commands/get_status.py @@ -23,29 +23,43 @@ class Command(BaseCommand): - help = 'Print status information for the minecode system.' 
+ help = "Print status information for the minecode system." def handle(self, *args, **options): - counts = dict([ - ('total_packages', Package.objects.count()), - ('total_uri', ResourceURI.objects.count()), - ('unique_uri', ResourceURI.objects.distinct().count()), - - ('visitables', ResourceURI.objects.get_visitables().count()), - ('visited', ResourceURI.objects.visited().count()), - ('successfully_visited', ResourceURI.objects.successfully_visited().count()), - ('unsuccessfully_visited', - ResourceURI.objects.unsuccessfully_visited().count()), - ('never_visited', ResourceURI.objects.never_visited().count()), - ('visit_in_progress', ResourceURI.objects.filter( - wip_date__isnull=False, last_visit_date__isnull=True).count()), - - ('mappables', ResourceURI.objects.get_mappables().count()), - ('mapped', ResourceURI.objects.mapped().count()), - ('successfully_mapped', ResourceURI.objects.successfully_mapped().count()), - ('unsuccessfully_mapped', - ResourceURI.objects.unsuccessfully_mapped().count()), - ('never_mapped', ResourceURI.objects.never_mapped().count()), - ]) + counts = dict( + [ + ("total_packages", Package.objects.count()), + ("total_uri", ResourceURI.objects.count()), + ("unique_uri", ResourceURI.objects.distinct().count()), + ("visitables", ResourceURI.objects.get_visitables().count()), + ("visited", ResourceURI.objects.visited().count()), + ( + "successfully_visited", + ResourceURI.objects.successfully_visited().count(), + ), + ( + "unsuccessfully_visited", + ResourceURI.objects.unsuccessfully_visited().count(), + ), + ("never_visited", ResourceURI.objects.never_visited().count()), + ( + "visit_in_progress", + ResourceURI.objects.filter( + wip_date__isnull=False, last_visit_date__isnull=True + ).count(), + ), + ("mappables", ResourceURI.objects.get_mappables().count()), + ("mapped", ResourceURI.objects.mapped().count()), + ( + "successfully_mapped", + ResourceURI.objects.successfully_mapped().count(), + ), + ( + "unsuccessfully_mapped", + ResourceURI.objects.unsuccessfully_mapped().count(), + ), + ("never_mapped", ResourceURI.objects.never_mapped().count()), + ] + ) print(json.dumps(counts, indent=2)) diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py index 74f2ed67..5b53c28a 100644 --- a/minecode/management/commands/import_queue.py +++ b/minecode/management/commands/import_queue.py @@ -7,30 +7,29 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# -from dateutil.parser import parse as dateutil_parse import logging import signal import sys import time -import requests - from django.db import transaction from django.utils import timezone + +import requests +from dateutil.parser import parse as dateutil_parse +from packagedcode.models import PackageData from packageurl import PackageURL -from minecode.management.commands import get_error_message -from minecode.management.commands import VerboseCommand -from minecode.models import ImportableURI -from minecode.collectors.maven import get_artifact_links -from minecode.collectors.maven import get_classifier_from_artifact_url from minecode.collectors.maven import collect_links_from_text +from minecode.collectors.maven import determine_namespace_name_version_from_url from minecode.collectors.maven import filter_only_directories +from minecode.collectors.maven import get_artifact_links from minecode.collectors.maven import get_artifact_sha1 +from minecode.collectors.maven import get_classifier_from_artifact_url +from minecode.management.commands import VerboseCommand +from minecode.management.commands import get_error_message from minecode.model_utils import merge_or_create_package -from packagedcode.models import PackageData -from minecode.collectors.maven import determine_namespace_name_version_from_url - +from minecode.models import ImportableURI logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) @@ -47,9 +46,7 @@ def stop_handler(*args, **kwargs): - """ - Signal handler to set global variable to True. - """ + """Signal handler to set global variable to True.""" global MUST_STOP MUST_STOP = True @@ -58,7 +55,7 @@ def stop_handler(*args, **kwargs): class Command(VerboseCommand): - help = 'Run a Package request queue.' + help = "Run a Package request queue." def handle(self, *args, **options): """ @@ -66,7 +63,6 @@ def handle(self, *args, **options): processing. Loops forever and sleeps a short while if there are no PriorityResourceURI left to process. 
""" - global MUST_STOP sleeping = False @@ -74,7 +70,7 @@ def handle(self, *args, **options): while True: if MUST_STOP: - logger.info('Graceful exit of the request queue.') + logger.info("Graceful exit of the request queue.") break with transaction.atomic(): @@ -84,7 +80,7 @@ def handle(self, *args, **options): # Only log a single message when we go to sleep if not sleeping: sleeping = True - logger.info('No more processable request, sleeping...') + logger.info("No more processable request, sleeping...") time.sleep(SLEEP_WHEN_EMPTY) continue @@ -92,12 +88,11 @@ def handle(self, *args, **options): sleeping = False # process request - logger.info('Processing {}'.format(importable_uri)) + logger.info(f"Processing {importable_uri}") try: errors = process_request(importable_uri) except Exception as e: - errors = 'Error: Failed to process ImportableURI: {}\n'.format( - repr(importable_uri)) + errors = f"Error: Failed to process ImportableURI: {repr(importable_uri)}\n" errors += get_error_message(e) finally: if errors: @@ -113,7 +108,7 @@ def handle(self, *args, **options): def process_request(importable_uri): uri = importable_uri.uri - uri = uri.rstrip('/') + uri = uri.rstrip("/") data = importable_uri.data if not data: # collect data again if we don't have it @@ -130,22 +125,24 @@ def process_request(importable_uri): namespace, name, _ = determine_namespace_name_version_from_url(uri) timestamps_by_directory_links = collect_links_from_text( - data, filter_only_directories) + data, filter_only_directories + ) # Go into each version directory for directory_link in timestamps_by_directory_links.keys(): - version = directory_link.rstrip('/') - version_page_url = f'{uri}/{version}' + version = directory_link.rstrip("/") + version_page_url = f"{uri}/{version}" timestamps_by_artifact_links = get_artifact_links(version_page_url) for artifact_link, timestamp in timestamps_by_artifact_links.items(): sha1 = get_artifact_sha1(artifact_link) classifier = get_classifier_from_artifact_url( - artifact_link, version_page_url, name, version) + artifact_link, version_page_url, name, version + ) qualifiers = None if classifier: - qualifiers = f'classifier={classifier}' + qualifiers = f"classifier={classifier}" release_date = dateutil_parse(timestamp) package_data = PackageData( - type='maven', + type="maven", namespace=namespace, name=name, version=version, @@ -155,14 +152,13 @@ def process_request(importable_uri): release_date=release_date, ) package, created, merged, map_error = merge_or_create_package( - scanned_package=package_data, - visit_level=50 + scanned_package=package_data, visit_level=50 ) if created: - logger.info(f'Created package {package}') + logger.info(f"Created package {package}") if merged: - logger.info(f'Updated package {package}') + logger.info(f"Updated package {package}") if map_error: - logger.error(f'Error encountered: {map_error}') + logger.error(f"Error encountered: {map_error}") importable_uri.processing_error = map_error importable_uri.save() diff --git a/minecode/management/commands/increase_scannableuri_priority.py b/minecode/management/commands/increase_scannableuri_priority.py index 5724ec56..e6c237d7 100644 --- a/minecode/management/commands/increase_scannableuri_priority.py +++ b/minecode/management/commands/increase_scannableuri_priority.py @@ -2,16 +2,13 @@ # Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. 
#

-from __future__ import absolute_import
-from __future__ import unicode_literals

 import logging
 import sys

-from minecode.models import ScannableURI
-from minecode.management.commands import get_error_message
 from minecode.management.commands import VerboseCommand
-
+from minecode.management.commands import get_error_message
+from minecode.models import ScannableURI

 logger = logging.getLogger(__name__)
 logging.basicConfig(stream=sys.stdout)
@@ -21,16 +18,21 @@

 class Command(VerboseCommand):
     logger = logger

-    help = 'Increase the priority of the Package to be scanned'
+    help = "Increase the priority of the Package to be scanned"

     def add_arguments(self, parser):
-        parser.add_argument('--pattern', '-p', action='store', dest='pattern',
-                            help='Only increase the priority of URIs matching this regex pattern.')
+        parser.add_argument(
+            "--pattern",
+            "-p",
+            action="store",
+            dest="pattern",
+            help="Only increase the priority of URIs matching this regex pattern.",
+        )

     def handle(self, *args, **options):
         logger.setLevel(self.get_verbosity(**options))

-        pattern = options.get('pattern')
+        pattern = options.get("pattern")

         for scannable_uri in ScannableURI.objects.filter(uri__iregex=pattern):
             uri = scannable_uri.uri
@@ -38,8 +40,8 @@ def handle(self, *args, **options):
                 # Priority is arbitrarily set to 100 to immediately increase its processing priority
                 scannable_uri.priority = 100
                 scannable_uri.save()
-                logger.info('Increased priority of: '.format(uri))
+                logger.info(f"Increased priority of: {uri}")
             except Exception as e:
-                msg = 'Error setting priority for: '.format(uri)
+                msg = f"Error setting priority for: {uri}"
                 msg += get_error_message(e)
                 logger.error(msg)
diff --git a/minecode/management/commands/load_priority_queue.py b/minecode/management/commands/load_priority_queue.py
index 8e4fbfde..8af14d3b 100644
--- a/minecode/management/commands/load_priority_queue.py
+++ b/minecode/management/commands/load_priority_queue.py
@@ -11,12 +11,11 @@
 import sys

 import requests
-
 from commoncode.resource import VirtualCodebase
+
 from minecode.management.commands import VerboseCommand
 from minecode.models import PriorityResourceURI

-
 logger = logging.getLogger(__name__)
 logging.basicConfig(stream=sys.stdout)
 logger.setLevel(logging.INFO)
@@ -27,33 +26,35 @@


 class Command(VerboseCommand):
-    help = 'Run a Package request queue.'
+    help = "Run a Package request queue."
def add_arguments(self, parser): parser.add_argument("--input", type=str) def handle(self, *args, **options): - input = options.get('input') + input = options.get("input") if input: vc = VirtualCodebase(location=input) for resource in vc.walk(): if not resource.sha1: continue - maven_api_search_url = f'https://search.maven.org/solrsearch/select?q=1:{resource.sha1}' + maven_api_search_url = ( + f"https://search.maven.org/solrsearch/select?q=1:{resource.sha1}" + ) response = requests.get(maven_api_search_url) if not response.ok: - logger.error( - f"API query failed for: {maven_api_search_url}") + logger.error(f"API query failed for: {maven_api_search_url}") continue contents = response.json() - resp = contents.get('response', {}) - if resp.get('numFound', 0) > 0: - for matched_package in resp.get('docs', []): - namespace = matched_package.get('g', '') - name = matched_package.get('a', '') - version = matched_package.get('v', '') + resp = contents.get("response", {}) + if resp.get("numFound", 0) > 0: + for matched_package in resp.get("docs", []): + namespace = matched_package.get("g", "") + name = matched_package.get("a", "") + version = matched_package.get("v", "") if namespace and name and version: - purl = f'pkg:maven/{namespace}/{name}@{version}' + purl = f"pkg:maven/{namespace}/{name}@{version}" PriorityResourceURI.objects.create( - uri=purl, package_url=purl, sha1=resource.sha1) - logger.info(f'Added {purl} to priority queue') + uri=purl, package_url=purl, sha1=resource.sha1 + ) + logger.info(f"Added {purl} to priority queue") diff --git a/minecode/management/commands/make_scannableuris.py b/minecode/management/commands/make_scannableuris.py index 992b9c76..ae06a7a9 100644 --- a/minecode/management/commands/make_scannableuris.py +++ b/minecode/management/commands/make_scannableuris.py @@ -2,18 +2,15 @@ # Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. # -from __future__ import absolute_import -from __future__ import unicode_literals import logging import sys from django.core.management.base import BaseCommand -from packagedb.models import Package from minecode.management.commands import get_error_message from minecode.models import ScannableURI - +from packagedb.models import Package logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) @@ -23,19 +20,20 @@ class Command(BaseCommand): logger = logger - help = 'Create ScannableURIs from Packages' + help = "Create ScannableURIs from Packages" def handle(self, *args, **options): for package in Package.objects.all(): package_uri = package.download_url try: _, created = ScannableURI.objects.get_or_create( - uri=package_uri, - package=package + uri=package_uri, package=package ) if created: - self.stdout.write('ScannableURI created for: {}'.format(package_uri)) + self.stdout.write( + f"ScannableURI created for: {package_uri}" + ) except Exception as e: - msg = 'Error creating ScannableURI for: {}'.format(package_uri) + msg = f"Error creating ScannableURI for: {package_uri}" msg += get_error_message(e) logger.error(msg) diff --git a/minecode/management/commands/manage_scans.py b/minecode/management/commands/manage_scans.py index a361f227..77ed3246 100644 --- a/minecode/management/commands/manage_scans.py +++ b/minecode/management/commands/manage_scans.py @@ -2,19 +2,16 @@ # Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. 
# -import time import logging import signal import sys +import time from django.db import transaction from django.utils import timezone - -from minecode.models import ScannableURI - from minecode.management.commands import VerboseCommand - +from minecode.models import ScannableURI logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) @@ -26,39 +23,40 @@ class ScanningCommand(VerboseCommand): - """ - Base command class for processing ScannableURIs. - """ + """Base command class for processing ScannableURIs.""" + # subclasses must override logger = None def add_arguments(self, parser): parser.add_argument( - '--exit-on-empty', - dest='exit_on_empty', + "--exit-on-empty", + dest="exit_on_empty", default=False, - action='store_true', - help='Do not loop forever. Exit when the queue is empty.') + action="store_true", + help="Do not loop forever. Exit when the queue is empty.", + ) parser.add_argument( - '--max-uris', - dest='max_uris', + "--max-uris", + dest="max_uris", default=0, - action='store', - help='Limit the number of Scannable URIs processed to a maximum number. ' - '0 means no limit. Used only for testing.') + action="store", + help="Limit the number of Scannable URIs processed to a maximum number. " + "0 means no limit. Used only for testing.", + ) def handle(self, *args, **options): - exit_on_empty = options.get('exit_on_empty') - max_uris = options.get('max_uris', 0) + exit_on_empty = options.get("exit_on_empty") + max_uris = options.get("max_uris", 0) uris_counter = self.process_scans( exit_on_empty=exit_on_empty, max_uris=max_uris, # Pass options to allow subclasses to add their own options - options=options + options=options, ) - self.stdout.write('Processed {} ScannableURI.'.format(uris_counter)) + self.stdout.write(f"Processed {uris_counter} ScannableURI.") @classmethod def process_scans(cls, exit_on_empty=False, max_uris=0, **kwargs): @@ -77,29 +75,35 @@ def process_scans(cls, exit_on_empty=False, max_uris=0, **kwargs): time.sleep(10) if cls.MUST_STOP: - cls.logger.info('Graceful exit of the scan processing loop.') + cls.logger.info("Graceful exit of the scan processing loop.") break if max_uris and uris_counter >= max_uris: - cls.logger.info('max_uris requested reached: exiting scan processing loop.') + cls.logger.info( + "max_uris requested reached: exiting scan processing loop." + ) break scannable_uri = cls.get_next_uri() if not scannable_uri: if exit_on_empty: - cls.logger.info('exit-on-empty requested: No more scannable URIs, exiting...') + cls.logger.info( + "exit-on-empty requested: No more scannable URIs, exiting..." + ) break # Only log a single message when we go to sleep if not sleeping: sleeping = True - cls.logger.info('No more scannable URIs, sleeping for at least {} seconds...'.format(SLEEP_WHEN_EMPTY)) + cls.logger.info( + f"No more scannable URIs, sleeping for at least {SLEEP_WHEN_EMPTY} seconds..." + ) time.sleep(SLEEP_WHEN_EMPTY) continue - cls.logger.info('Processing scannable URI: {}'.format(scannable_uri)) + cls.logger.info(f"Processing scannable URI: {scannable_uri}") cls.process_scan(scannable_uri, **kwargs) uris_counter += 1 @@ -129,11 +133,12 @@ def process_scan(scannable_uri, **kwargs): class Command(ScanningCommand): - logger = logger - help = ('Check scancode.io requested scans for status then fetch and process ' - 'completed scans for indexing and updates.') + help = ( + "Check scancode.io requested scans for status then fetch and process " + "completed scans for indexing and updates." 
+ ) def handle(self, *args, **options): logger.setLevel(self.get_verbosity(**options)) @@ -146,15 +151,24 @@ def get_next_uri(self): return scannable_uri @classmethod - def process_scan(cls, scannable_uri, get_scan_info_save_loc='', get_scan_data_save_loc='', **kwargs): + def process_scan( + cls, + scannable_uri, + get_scan_info_save_loc="", + get_scan_data_save_loc="", + **kwargs, + ): """ Manage a ScannableURI based on its status. - For submitted but not completed scans, check the timestamp of when the scan was submitted, if it has been past some time, then we set the scan as timed out - For timed out scans, we set that as failed and then create a new one? """ - logger.info('Checking scan for URI: {}'.format(scannable_uri)) + logger.info(f"Checking scan for URI: {scannable_uri}") - if scannable_uri.scan_status in (ScannableURI.SCAN_SUBMITTED, ScannableURI.SCAN_IN_PROGRESS): + if scannable_uri.scan_status in ( + ScannableURI.SCAN_SUBMITTED, + ScannableURI.SCAN_IN_PROGRESS, + ): scan_duration = timezone.now() - scannable_uri.scan_date scan_duration_hours = scan_duration.seconds / (60 * 60) @@ -162,7 +176,7 @@ def process_scan(cls, scannable_uri, get_scan_info_save_loc='', get_scan_data_sa scannable_uri.scan_status = ScannableURI.SCAN_TIMEOUT scannable_uri.wip_date = None scannable_uri.save() - logger.info('Scan for URI has timed out: {}'.format(scannable_uri)) + logger.info(f"Scan for URI has timed out: {scannable_uri}") # support graceful death when used as a service diff --git a/minecode/management/commands/maven_crawler.py b/minecode/management/commands/maven_crawler.py index f4be733c..df6da9cf 100644 --- a/minecode/management/commands/maven_crawler.py +++ b/minecode/management/commands/maven_crawler.py @@ -13,7 +13,6 @@ from minecode.collectors.maven import crawl_maven_repo_from_root from minecode.management.commands import VerboseCommand - logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.INFO) @@ -24,8 +23,8 @@ class Command(VerboseCommand): - help = 'Run a Package request queue.' + help = "Run a Package request queue." def handle(self, *args, **options): - maven_root_url = 'https://repo.maven.apache.org/maven2' + maven_root_url = "https://repo.maven.apache.org/maven2" crawl_maven_repo_from_root(root_url=maven_root_url) diff --git a/minecode/management/commands/priority_queue.py b/minecode/management/commands/priority_queue.py index 6c445702..3c88e849 100644 --- a/minecode/management/commands/priority_queue.py +++ b/minecode/management/commands/priority_queue.py @@ -19,12 +19,11 @@ # But importing the collectors module triggers routes registration from minecode import collectors # NOQA from minecode import priority_router -from minecode.management.commands import get_error_message from minecode.management.commands import VerboseCommand +from minecode.management.commands import get_error_message from minecode.models import PriorityResourceURI from minecode.route import NoRouteAvailable - logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.INFO) @@ -40,9 +39,7 @@ def stop_handler(*args, **kwargs): - """ - Signal handler to set global variable to True. - """ + """Signal handler to set global variable to True.""" global MUST_STOP MUST_STOP = True @@ -51,7 +48,7 @@ def stop_handler(*args, **kwargs): class Command(VerboseCommand): - help = 'Run a Package request queue.' + help = "Run a Package request queue." 
def handle(self, *args, **options): """ @@ -59,7 +56,6 @@ def handle(self, *args, **options): processing. Loops forever and sleeps a short while if there are no PriorityResourceURI left to process. """ - global MUST_STOP sleeping = False @@ -67,7 +63,7 @@ def handle(self, *args, **options): while True: if MUST_STOP: - logger.info('Graceful exit of the request queue.') + logger.info("Graceful exit of the request queue.") break with transaction.atomic(): @@ -77,7 +73,7 @@ def handle(self, *args, **options): # Only log a single message when we go to sleep if not sleeping: sleeping = True - logger.info('No more processable request, sleeping...') + logger.info("No more processable request, sleeping...") time.sleep(SLEEP_WHEN_EMPTY) continue @@ -85,12 +81,11 @@ def handle(self, *args, **options): sleeping = False # process request - logger.info('Processing {}'.format(priority_resource_uri)) + logger.info(f"Processing {priority_resource_uri}") try: errors = process_request(priority_resource_uri) except Exception as e: - errors = 'Error: Failed to process PriorityResourceURI: {}\n'.format( - repr(priority_resource_uri)) + errors = f"Error: Failed to process PriorityResourceURI: {repr(priority_resource_uri)}\n" errors += get_error_message(e) finally: if errors: @@ -112,24 +107,23 @@ def process_request(priority_resource_uri, _priority_router=priority_router): try: if TRACE: - logger.debug('visit_uri: uri: {}'.format(purl_to_visit)) + logger.debug(f"visit_uri: uri: {purl_to_visit}") kwargs = dict() if source_purl: - kwargs['source_purl'] = source_purl + kwargs["source_purl"] = source_purl if addon_pipelines: - kwargs['addon_pipelines'] = addon_pipelines + kwargs["addon_pipelines"] = addon_pipelines if priority: - kwargs['priority'] = priority + kwargs["priority"] = priority errors = _priority_router.process(purl_to_visit, **kwargs) if TRACE: new_uris_to_visit = list(new_uris_to_visit or []) - logger.debug( - 'visit_uri: new_uris_to_visit: {}'.format(new_uris_to_visit)) + logger.debug(f"visit_uri: new_uris_to_visit: {new_uris_to_visit}") return errors except NoRouteAvailable: - error = f'No route available for {purl_to_visit}' + error = f"No route available for {purl_to_visit}" logger.error(error) # TODO: For now, when a route is not yet supported, we keep a value for # the wip_date value so the instance is not back in the queue. It will diff --git a/minecode/management/commands/remap.py b/minecode/management/commands/remap.py index 303655b1..68cbc403 100644 --- a/minecode/management/commands/remap.py +++ b/minecode/management/commands/remap.py @@ -22,23 +22,30 @@ class Command(BaseCommand): - help = 'Mark ResourceURIs for remapping to packages.' + help = "Mark ResourceURIs for remapping to packages." 
def handle(self, *args, **options): - q1 = Q(uri__startswith='https://repo1') - q2 = Q(uri__startswith='maven-index://') - q3 = Q(uri__startswith='https://replicate') - q4 = Q(uri__startswith='https://registry') + q1 = Q(uri__startswith="https://repo1") + q2 = Q(uri__startswith="maven-index://") + q3 = Q(uri__startswith="https://replicate") + q4 = Q(uri__startswith="https://registry") for uri in ResourceURI.objects.successfully_mapped().filter(q1 | q2 | q3 | q4): uri.last_map_date = None uri.wip_date = None uri.save() - ResourceURI.objects.successfully_mapped().filter( - uri__contains='maven').update(last_map_date=None) - ResourceURI.objects.successfully_mapped().filter( - uri__contains='npm').update(last_map_date=None) - - ResourceURI.objects.successfully_mapped().exclude(uri__startswith='http://repo1').exclude(uri__startswith='maven-index://').exclude( - uri__startswith='https://replicate').exclude(uri__startswith='https://registry.npmjs.org').update(is_mappable=False) + ResourceURI.objects.successfully_mapped().filter(uri__contains="maven").update( + last_map_date=None + ) + ResourceURI.objects.successfully_mapped().filter(uri__contains="npm").update( + last_map_date=None + ) + + ResourceURI.objects.successfully_mapped().exclude( + uri__startswith="http://repo1" + ).exclude(uri__startswith="maven-index://").exclude( + uri__startswith="https://replicate" + ).exclude(uri__startswith="https://registry.npmjs.org").update( + is_mappable=False + ) diff --git a/minecode/management/commands/run_map.py b/minecode/management/commands/run_map.py index fbf80b9c..d8ff545f 100644 --- a/minecode/management/commands/run_map.py +++ b/minecode/management/commands/run_map.py @@ -16,19 +16,17 @@ from django.db import transaction from django.utils import timezone +from minecode import map_router + # UnusedImport here! # But importing the miners module triggers routes registration - from minecode import miners # NOQA - -from minecode import map_router -from minecode.models import ResourceURI -from minecode.management.commands import get_error_message from minecode.management.commands import VerboseCommand +from minecode.management.commands import get_error_message from minecode.model_utils import merge_or_create_package +from minecode.models import ResourceURI from minecode.models import ScannableURI - TRACE = True logger = logging.getLogger(__name__) @@ -43,9 +41,7 @@ def stop_handler(*args, **kwargs): - """ - Signal handler to set global variable to True. - """ + """Signal handler to set global variable to True.""" global MUST_STOP MUST_STOP = True @@ -57,15 +53,16 @@ def stop_handler(*args, **kwargs): class Command(VerboseCommand): - help = 'Run a mapping worker.' + help = "Run a mapping worker." def add_arguments(self, parser): parser.add_argument( - '--exit-on-empty', - dest='exit_on_empty', + "--exit-on-empty", + dest="exit_on_empty", default=False, - action='store_true', - help='Do not loop forever. Exit when the queue is empty.') + action="store_true", + help="Do not loop forever. 
Exit when the queue is empty.", + ) def handle(self, *args, **options): """ @@ -75,26 +72,26 @@ def handle(self, *args, **options): global MUST_STOP logger.setLevel(self.get_verbosity(**options)) - exit_on_empty = options.get('exit_on_empty') + exit_on_empty = options.get("exit_on_empty") sleeping = False while True: if MUST_STOP: - logger.info('Graceful exit of the map loop.') + logger.info("Graceful exit of the map loop.") break mappables = ResourceURI.objects.get_mappables()[:MAP_BATCH_SIZE] if not mappables: if exit_on_empty: - logger.info('No mappable resource, exiting...') + logger.info("No mappable resource, exiting...") break # Only log a single message when we go to sleep if not sleeping: sleeping = True - logger.info('No mappable resource, sleeping...') + logger.info("No mappable resource, sleeping...") time.sleep(SLEEP_WHEN_EMPTY) continue @@ -102,7 +99,7 @@ def handle(self, *args, **options): sleeping = False for resource_uri in mappables: - logger.info('Mapping {}'.format(resource_uri)) + logger.info(f"Mapping {resource_uri}") map_uri(resource_uri) @@ -114,16 +111,18 @@ def map_uri(resource_uri, _map_router=map_router): # FIXME: returning a string or sequence is UGLY try: mapped_scanned_packages = _map_router.process( - resource_uri.uri, resource_uri=resource_uri) + resource_uri.uri, resource_uri=resource_uri + ) - logger.debug('map_uri: Package URI: {}'.format(resource_uri.uri)) + logger.debug(f"map_uri: Package URI: {resource_uri.uri}") # consume generators mapped_scanned_packages = mapped_scanned_packages and list( - mapped_scanned_packages) + mapped_scanned_packages + ) if not mapped_scanned_packages: - msg = 'No visited scanned packages returned.' + msg = "No visited scanned packages returned." logger.error(msg) resource_uri.last_map_date = timezone.now() resource_uri.map_error = msg @@ -131,8 +130,7 @@ def map_uri(resource_uri, _map_router=map_router): return except Exception as e: - msg = 'Error: Failed to map while processing ResourceURI: {}\n'.format( - repr(resource_uri)) + msg = f"Error: Failed to map while processing ResourceURI: {repr(resource_uri)}\n" msg += get_error_message(e) logger.error(msg) # we had an error, so mapped_scanned_packages is an error string @@ -144,7 +142,7 @@ def map_uri(resource_uri, _map_router=map_router): # if we reached this place, we have mapped_scanned_packages that contains # packages in ScanCode models format that these are ready to save to the DB - map_error = '' + map_error = "" try: with transaction.atomic(): @@ -155,7 +153,8 @@ def map_uri(resource_uri, _map_router=map_router): for scanned_package in mapped_scanned_packages: visit_level = resource_uri.mining_level package, package_created, _, m_err = merge_or_create_package( - scanned_package, visit_level) + scanned_package, visit_level + ) map_error += m_err if package_created: # Add this Package to the scan queue @@ -166,13 +165,12 @@ def map_uri(resource_uri, _map_router=map_router): ) if scannable_uri_created: logger.debug( - ' + Inserted ScannableURI\t: {}'.format(package_uri)) + f" + Inserted ScannableURI\t: {package_uri}" + ) except Exception as e: - msg = 'Error: Failed to map while processing ResourceURI: {}\n'.format( - repr(resource_uri)) - msg += 'While processing scanned_package: {}\n'.format( - repr(scanned_package)) + msg = f"Error: Failed to map while processing ResourceURI: {repr(resource_uri)}\n" + msg += f"While processing scanned_package: {repr(scanned_package)}\n" msg += get_error_message(e) logger.error(msg) # this is enough to save the error to the 
ResourceURI which is done at last diff --git a/minecode/management/commands/run_visit.py b/minecode/management/commands/run_visit.py index 4295400d..d428e95a 100644 --- a/minecode/management/commands/run_visit.py +++ b/minecode/management/commands/run_visit.py @@ -8,11 +8,11 @@ # -from collections import Counter import logging import signal import sys import time +from collections import Counter # FIXME: why use Django cache for this? any benefits and side effects? from django.core.cache import cache as visit_delay_by_hostname @@ -24,17 +24,13 @@ # UnusedImport here! # But importing the miners module triggers routes registration - from minecode import miners # NOQA from minecode import visit_router - -from minecode.management.commands import get_error_message from minecode.management.commands import VerboseCommand - +from minecode.management.commands import get_error_message from minecode.models import ResourceURI from minecode.route import NoRouteAvailable - logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.INFO) @@ -54,15 +50,13 @@ # FIXME: we should rotate UA strings or setup our own UA # this one is for FF Windows 7 agent 32 on win7 64 as of July 2016 -USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0' +USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0" MUST_STOP = False def stop_handler(*args, **kwargs): - """ - Signal handler to set global variable to True. - """ + """Signal handler to set global variable to True.""" global MUST_STOP MUST_STOP = True @@ -71,7 +65,7 @@ def stop_handler(*args, **kwargs): class Command(VerboseCommand): - help = 'Run a visiting worker loop.' + help = "Run a visiting worker loop." # Note: we use the GLOBAL visit_router by default here. # Test subclasses can override this class-level attribute for testing. @@ -79,41 +73,46 @@ class Command(VerboseCommand): def add_arguments(self, parser): parser.add_argument( - '--exit-on-empty', - dest='exit_on_empty', + "--exit-on-empty", + dest="exit_on_empty", default=False, - action='store_true', - help='Do not loop forever. Exit when the queue is empty.') + action="store_true", + help="Do not loop forever. Exit when the queue is empty.", + ) parser.add_argument( - '--max-uris', - dest='max_uris', + "--max-uris", + dest="max_uris", default=0, - action='store', - help='Limit the number of URIs yielded from a visit to a maximum ' - 'number. 0 means no limit. Used only for testing.') + action="store", + help="Limit the number of URIs yielded from a visit to a maximum " + "number. 0 means no limit. Used only for testing.", + ) parser.add_argument( - '--max-loops', - dest='max_loops', + "--max-loops", + dest="max_loops", default=0, - action='store', - help='Limit the number of visit loops to a maximum number. ' - '0 means no limit. Used only for testing.') + action="store", + help="Limit the number of visit loops to a maximum number. " + "0 means no limit. 
Used only for testing.", + ) parser.add_argument( - '--ignore-robots', - dest='ignore_robots', + "--ignore-robots", + dest="ignore_robots", default=False, - action='store_true', - help='Ignore robots.txt politeness.') + action="store_true", + help="Ignore robots.txt politeness.", + ) parser.add_argument( - '--ignore-throttle', - dest='ignore_throttle', + "--ignore-throttle", + dest="ignore_throttle", default=False, - action='store_true', - help='Ignore throttling politeness.') + action="store_true", + help="Ignore throttling politeness.", + ) def handle(self, *args, **options): """ @@ -122,12 +121,12 @@ def handle(self, *args, **options): no ResourceURI left to visit. """ logger.setLevel(self.get_verbosity(**options)) - exit_on_empty = options.get('exit_on_empty') - max_uris = options.get('max_uris', 0) + exit_on_empty = options.get("exit_on_empty") + max_uris = options.get("max_uris", 0) max_uris = int(max_uris) - max_loops = options.get('max_loops', 0) - ignore_robots = options.get('ignore_robots') - ignore_throttle = options.get('ignore_throttle') + max_loops = options.get("max_loops", 0) + ignore_robots = options.get("ignore_robots") + ignore_throttle = options.get("ignore_throttle") visited_counter, inserted_counter = visit_uris( ignore_robots=ignore_robots, @@ -137,13 +136,18 @@ def handle(self, *args, **options): max_uris=max_uris, ) - self.stdout.write('Visited {} URIs'.format(visited_counter)) - self.stdout.write('Inserted {} new URIs'.format(inserted_counter)) + self.stdout.write(f"Visited {visited_counter} URIs") + self.stdout.write(f"Inserted {inserted_counter} new URIs") -def visit_uris(ignore_robots=False, ignore_throttle=False, - exit_on_empty=False, max_loops=0, max_uris=0, - user_agent=USER_AGENT): +def visit_uris( + ignore_robots=False, + ignore_throttle=False, + exit_on_empty=False, + max_loops=0, + max_uris=0, + user_agent=USER_AGENT, +): """ Run an infinite visit loop. Return a tuple of (visited, inserted) counts. @@ -164,7 +168,7 @@ def visit_uris(ignore_robots=False, ignore_throttle=False, while True: if MUST_STOP: - logger.info('Graceful exit of the visit loop.') + logger.info("Graceful exit of the visit loop.") break with transaction.atomic(): @@ -173,13 +177,14 @@ def visit_uris(ignore_robots=False, ignore_throttle=False, if not resource_uri: if exit_on_empty: logger.info( - 'exit-on-empty requested: No more visitable resource, exiting...') + "exit-on-empty requested: No more visitable resource, exiting..." 
+ ) break # Only log a single message when we go to sleep if not sleeping: sleeping = True - logger.info('No more visitable resource, sleeping...') + logger.info("No more visitable resource, sleeping...") time.sleep(SLEEP_WHEN_EMPTY) continue @@ -187,7 +192,7 @@ def visit_uris(ignore_robots=False, ignore_throttle=False, sleeping = False if not ignore_robots and robots.disallowed(resource_uri.uri, user_agent): - msg = 'Denied by robots.txt' + msg = "Denied by robots.txt" logger.error(msg) resource_uri.last_visit_date = timezone.now() resource_uri.wip_date = None @@ -198,8 +203,9 @@ def visit_uris(ignore_robots=False, ignore_throttle=False, if not ignore_throttle: sleep_time = get_sleep_time(resource_uri) if sleep_time: - logger.debug('Respecting revisit delay: wait for {} for {}'.format( - sleep_time, resource_uri.uri)) + logger.debug( + f"Respecting revisit delay: wait for {sleep_time} for {resource_uri.uri}" + ) time.sleep(sleep_time) # Set new value in cache 'visit_delay_by_hostname' right before making the request # TODO: The cache logic should move closer to the requests calls @@ -207,27 +213,33 @@ def visit_uris(ignore_robots=False, ignore_throttle=False, visit_delay_by_hostname.set(uri_hostname, timezone.now()) # visit proper - logger.info('Visiting {}'.format(resource_uri)) + logger.info(f"Visiting {resource_uri}") visited_counter += 1 inserted_counter += visit_uri( - resource_uri=resource_uri, max_uris=max_uris, - uri_counter_by_visitor=uri_counter_by_visitor) + resource_uri=resource_uri, + max_uris=max_uris, + uri_counter_by_visitor=uri_counter_by_visitor, + ) if max_loops and int(visited_counter) > int(max_loops): logger.info( - 'Stopping visits after max_loops: {} visit loops.'.format(max_loops)) + f"Stopping visits after max_loops: {max_loops} visit loops." + ) break return visited_counter, inserted_counter -def visit_uri(resource_uri, max_uris=0, uri_counter_by_visitor=None, _visit_router=visit_router): +def visit_uri( + resource_uri, max_uris=0, uri_counter_by_visitor=None, _visit_router=visit_router +): """ Call a visitor for a single ResourceURI. Process up to `max_uris` records. `_visit_router` is the Router to use for routing. Used for tests only. """ - from requests.exceptions import ConnectionError, Timeout + from requests.exceptions import ConnectionError + from requests.exceptions import Timeout if not resource_uri: return @@ -252,18 +264,18 @@ def visit_uri(resource_uri, max_uris=0, uri_counter_by_visitor=None, _visit_rout return 0 if TRACE: - logger.debug('visit_uri: uri: {}'.format(uri_to_visit)) + logger.debug(f"visit_uri: uri: {uri_to_visit}") # TODO: Consider pass a full visitors.URI plain object rather than a plain string new_uris_to_visit, visited_data, visit_error = _visit_router.process( - uri_to_visit) + uri_to_visit + ) if TRACE: new_uris_to_visit = list(new_uris_to_visit or []) - logger.debug( - 'visit_uri: new_uris_to_visit: {}'.format(new_uris_to_visit)) + logger.debug(f"visit_uri: new_uris_to_visit: {new_uris_to_visit}") except NoRouteAvailable: - logger.error('No route available.') + logger.error("No route available.") # TODO: For now, when a route is not yet supported, we keep a value for # the wip_date value so the instance is not back in the queue. It will # not be selected by a worker again until the wip_date is manually @@ -273,8 +285,8 @@ def visit_uri(resource_uri, max_uris=0, uri_counter_by_visitor=None, _visit_rout return 0 except (ConnectionError, Timeout, Exception) as e: # FIXME: is catching all expections here correct? 
-        msg = 'Visit error for URI: {}'.format(uri_to_visit)
-        msg += '\n'.format(uri_to_visit)
+        msg = f"Visit error for URI: {uri_to_visit}"
+        msg += "\n"
         msg += get_error_message(e)
         visit_errors.append(msg)
         logger.error(msg)
@@ -282,8 +294,8 @@ def visit_uri(resource_uri, max_uris=0, uri_counter_by_visitor=None, _visit_rout
     ########################################
     # Also log visit errors!!!1
     if visit_error:
-        msg = 'Visit error for URI: {}'.format(uri_to_visit)
-        msg += '\n'.format(uri_to_visit)
+        msg = f"Visit error for URI: {uri_to_visit}"
+        msg += "\n"
         msg += get_error_message(e)
         visit_errors.append(msg)
         logger.error(msg)
@@ -304,63 +316,64 @@ def visit_uri(resource_uri, max_uris=0, uri_counter_by_visitor=None, _visit_rout
             uri_str = smart_str(vuri.uri)
             visited_uri = vuri.to_dict()
-            last_modified_date = visited_uri.pop('date')
+            last_modified_date = visited_uri.pop("date")
             if last_modified_date:
-                visited_uri['last_modified_date'] = last_modified_date
+                visited_uri["last_modified_date"] = last_modified_date

             if vuri_count % 1000 == 0:
-                logger.debug(
-                    ' * Processed: {} visited URIs'.format(vuri_count))
+                logger.debug(f" * Processed: {vuri_count} visited URIs")

             try:
                 # insert new if pre-visited
-                pre_visited = visited_uri.pop('visited')
+                pre_visited = visited_uri.pop("visited")
                 if pre_visited:
                     # set last visit date for this pre-visited URI
-                    visited_uri['last_visit_date'] = timezone.now()
+                    visited_uri["last_visit_date"] = timezone.now()
                     new_uri = ResourceURI(**visited_uri)
                     new_uri.save()
-                    logger.debug(
-                        ' + Inserted pre-visited:\t{}'.format(uri_str))
+                    logger.debug(f" + Inserted pre-visited:\t{uri_str}")
                     inserted_count += 1
                     if max_uris:
                         uri_counter_by_visitor[visitor_key] += 1
                 else:
                     # if not pre-visited only insert if not existing
-                    if not ResourceURI.objects.filter(uri=vuri.uri, last_visit_date=None).exists():
-                        visited_uri['last_visit_date'] = None
+                    if not ResourceURI.objects.filter(
+                        uri=vuri.uri, last_visit_date=None
+                    ).exists():
+                        visited_uri["last_visit_date"] = None
                         new_uri = ResourceURI(**visited_uri)
                         new_uri.save()
-                        logger.debug(' + Inserted new:\t{}'.format(uri_str))
+                        logger.debug(f" + Inserted new:\t{uri_str}")
                         inserted_count += 1
                         if max_uris:
                             uri_counter_by_visitor[visitor_key] += 1
                     else:
-                        logger.debug(' + NOT Inserted:\t{}'.format(uri_str))
+                        logger.debug(f" + NOT Inserted:\t{uri_str}")

             except Exception as e:
                 # FIXME: is catching all expections here correct?
-                msg = 'ERROR while processing URI from a visit through: {}'.format(
-                    uri_str)
-                msg += '\n'
+                msg = f"ERROR while processing URI from a visit through: {uri_str}"
+                msg += "\n"
                 msg += repr(visited_uri)
-                msg += '\n'
+                msg += "\n"
                 msg += get_error_message(e)
                 visit_errors.append(msg)
                 logger.error(msg)
                 if len(visit_errors) > 10:
                     logger.error(
-                        ' ! Breaking after processing over 10 vuris errors for: {}'.format(uri_str))
+                        f" ! Breaking after processing over 10 vuris errors for: {uri_str}"
+                    )
                     break

             if max_uris and int(uri_counter_by_visitor[visitor_key]) > int(max_uris):
                 logger.info(
-                    ' ! Breaking after processing max-uris: {} URIs.'.format(max_uris))
+                    f" ! Breaking after processing max-uris: {max_uris} URIs."
+                )
                 break

     except Exception as e:
-        msg = 'Visit error for URI: {}'.format(uri_to_visit)
-        msg += '\n'.format(uri_to_visit)
+        msg = f"Visit error for URI: {uri_to_visit}"
+        msg += "\n"
         msg += get_error_message(e)
         visit_errors.append(msg)
         logger.error(msg)
@@ -370,14 +383,14 @@ def visit_uri(resource_uri, max_uris=0, uri_counter_by_visitor=None, _visit_rout
     resource_uri.last_visit_date = timezone.now()
     resource_uri.wip_date = None
     if visited_data:
-        logger.debug(' + Data collected.')
+        logger.debug(" + Data collected.")
         resource_uri.data = visited_data
     if visit_errors:
-        logger.debug(' ! Errors.')
-        resource_uri.visit_error = '\n'.join(visit_errors)[:5000]
+        logger.debug(" ! Errors.")
+        resource_uri.visit_error = "\n".join(visit_errors)[:5000]
     resource_uri.save()
-    logger.debug(' Inserted\t: {} new URI(s).'.format(inserted_count))
+    logger.debug(f" Inserted\t: {inserted_count} new URI(s).")
     return inserted_count
diff --git a/minecode/management/commands/seed.py b/minecode/management/commands/seed.py
index ef3c3121..5b001b61 100644
--- a/minecode/management/commands/seed.py
+++ b/minecode/management/commands/seed.py
@@ -16,13 +16,10 @@

 # UnusedImport here!
 # But importing the miners module triggers routes registration
-
 from minecode import miners  # NOQA
-
 from minecode import seed
-from minecode.models import ResourceURI
 from minecode.management.commands import VerboseCommand
-
+from minecode.models import ResourceURI

 logger = logging.getLogger(__name__)
 logging.basicConfig(stream=sys.stdout)
@@ -30,12 +27,19 @@


 class Command(VerboseCommand):
-    help = ('Insert ResourceURIs records from Seed '
-            'objects with a URI matching a pattern.')
+    help = (
+        "Insert ResourceURIs records from Seed "
+        "objects with a URI matching a pattern."
+    )

     def add_arguments(self, parser):
-        parser.add_argument('--pattern', '-p', action='store', dest='pattern',
-                            help='Only add seed URIs matching this regex pattern.')
+        parser.add_argument(
+            "--pattern",
+            "-p",
+            action="store",
+            dest="pattern",
+            help="Only add seed URIs matching this regex pattern.",
+        )

     def handle(self, *args, **options):
         """
@@ -44,14 +48,14 @@ def handle(self, *args, **options):
         """
         logger.setLevel(self.get_verbosity(**options))

-        pattern = options.get('pattern')
+        pattern = options.get("pattern")
         seeders = seed.get_active_seeders()
         counter = 0
         for uri in insert_seed_uris(pattern, seeders=seeders):
-            logger.info('Inserting new seed URI: {}'.format(uri))
+            logger.info(f"Inserting new seed URI: {uri}")
             counter += 1
-        self.stdout.write('Inserted {} seed URIs'.format(counter))
+        self.stdout.write(f"Inserted {counter} seed URIs")


 SEED_PRIORITY = 100
@@ -66,15 +70,17 @@ def insert_seed_uris(pattern=None, priority=SEED_PRIORITY, seeders=()):
     for seeder in seeders:
         for uri in seeder.get_seeds():
             if pattern and not re.match(pattern, uri):
-                logger.info('Skipping seeding for: {}. Pattern {}'
-                            'not matched.'.format(uri, pattern))
+                logger.info(
+                    f"Skipping seeding for: {uri}. Pattern {pattern} not matched."
+ ) continue if ResourceURI.objects.filter(uri=uri).exists(): needs_revisit = ResourceURI.objects.needs_revisit( - uri=uri, hours=seeder.revisit_after) + uri=uri, hours=seeder.revisit_after + ) if not needs_revisit: - logger.info('Revisit not needed for: {}'.format(uri)) + logger.info(f"Revisit not needed for: {uri}") continue # FIXME: Currently, we update the existing a new ResourceURI @@ -83,8 +89,7 @@ def insert_seed_uris(pattern=None, priority=SEED_PRIORITY, seeders=()): # to store this datablob on the filesystem and have a single # ResourceURI per `uri` that points to one or more data blobs. seed_uri = ResourceURI.objects.update_or_create( - uri=uri, - priority=priority, - last_visit_date=None) + uri=uri, priority=priority, last_visit_date=None + ) assert seed_uri yield uri diff --git a/minecode/management/commands/update_maven_package_data.py b/minecode/management/commands/update_maven_package_data.py index c62a0be4..f8607cf1 100644 --- a/minecode/management/commands/update_maven_package_data.py +++ b/minecode/management/commands/update_maven_package_data.py @@ -6,15 +6,16 @@ # See https://github.com/aboutcode-org/purldb for support or download. # See https://aboutcode.org for more information about nexB OSS projects. # -from dateutil.parser import parse as dateutil_parse -from os.path import basename import logging import sys import traceback +from os.path import basename from django.db import transaction from django.db.utils import DataError from django.utils import timezone + +from dateutil.parser import parse as dateutil_parse from packageurl import normalize_qualifiers from minecode.collectors.maven import MavenNexusCollector @@ -34,10 +35,7 @@ def update_packages(packages, fields_to_update): try: with transaction.atomic(): - Package.objects.bulk_update( - objs=packages, - fields=fields_to_update - ) + Package.objects.bulk_update(objs=packages, fields=fields_to_update) updated_packages_count = len(packages) except DataError: updated_packages_count = 0 @@ -50,7 +48,7 @@ def update_packages(packages, fields_to_update): except DataError: service = basename(__file__) traceback_message = traceback.format_exc() - message = f'Error updating Package {package.package_uid}:\n\n{traceback_message}' + message = f"Error updating Package {package.package_uid}:\n\n{traceback_message}" ProcessingError.objects.create( service=service, date=timezone.now(), @@ -75,7 +73,9 @@ def create_packages(packages): except DataError: service = basename(__file__) traceback_message = traceback.format_exc() - message = f'Error creating Package {package.purl}:\n\n{traceback_message}' + message = ( + f"Error creating Package {package.purl}:\n\n{traceback_message}" + ) ProcessingError.objects.create( service=service, date=timezone.now(), @@ -105,13 +105,13 @@ def process_packages( updated = False if unsaved_existing_packages: fields_to_update = [ - 'download_url', - 'repository_homepage_url', - 'repository_download_url', - 'api_data_url', - 'release_date', - 'last_modified_date', - 'history', + "download_url", + "repository_homepage_url", + "repository_download_url", + "api_data_url", + "release_date", + "last_modified_date", + "history", ] upc = update_packages(unsaved_existing_packages, fields_to_update) updated_packages_count += upc @@ -121,44 +121,47 @@ def process_packages( if unsaved_existing_packages_lowercased: fields_to_update = [ - 'namespace', - 'name', - 'version', - 'qualifiers', - 'download_url', - 'repository_homepage_url', - 'repository_download_url', - 'api_data_url', - 'release_date', - 
'last_modified_date', - 'history', + "namespace", + "name", + "version", + "qualifiers", + "download_url", + "repository_homepage_url", + "repository_download_url", + "api_data_url", + "release_date", + "last_modified_date", + "history", ] - upc = update_packages( - unsaved_existing_packages_lowercased, fields_to_update) + upc = update_packages(unsaved_existing_packages_lowercased, fields_to_update) updated_packages_count += upc unsaved_existing_packages_lowercased = [] if upc > 0: updated = True if updated: - logger.info(f'Updated {updated_packages_count:,} Maven Packages') + logger.info(f"Updated {updated_packages_count:,} Maven Packages") if unsaved_new_packages: cpc = create_packages(unsaved_new_packages) created_packages_count += cpc unsaved_new_packages = [] if cpc > 0: - logger.info(f'Created {created_packages_count:,} Maven Packages') + logger.info(f"Created {created_packages_count:,} Maven Packages") if packages_to_delete: dpc = delete_packages(packages_to_delete) packages_to_delete = [] deleted_packages_count += dpc if dpc > 0: - logger.info( - f'Deleted {deleted_packages_count:,} Duplicate Maven Packages') + logger.info(f"Deleted {deleted_packages_count:,} Duplicate Maven Packages") - return unsaved_existing_packages, unsaved_existing_packages_lowercased, unsaved_new_packages, packages_to_delete + return ( + unsaved_existing_packages, + unsaved_existing_packages_lowercased, + unsaved_new_packages, + packages_to_delete, + ) def update_package_fields(package, maven_package, field_names): @@ -166,13 +169,13 @@ def update_package_fields(package, maven_package, field_names): for field in field_names: p_val = getattr(package, field) value = getattr(maven_package, field) - if field == 'qualifiers': + if field == "qualifiers": value = normalize_qualifiers(value, encode=True) - if field == 'release_date': + if field == "release_date": value = dateutil_parse(value) if p_val != value: setattr(package, field, value) - if field == 'release_date': + if field == "release_date": p_val = str(p_val) value = str(value) entry = dict( @@ -184,22 +187,23 @@ def update_package_fields(package, maven_package, field_names): if updated_fields: data = { - 'updated_fields': updated_fields, + "updated_fields": updated_fields, } package.append_to_history( - 'Package field values have been updated.', + "Package field values have been updated.", data=data, ) - logger.debug(f'Updated existing Package {package.package_uid}') + logger.debug(f"Updated existing Package {package.package_uid}") return package -def update_maven_packages(maven_package, fields_to_update, lowercased_purl_fields=False): +def update_maven_packages( + maven_package, fields_to_update, lowercased_purl_fields=False +): namespace = maven_package.namespace name = maven_package.name version = maven_package.version - normalized_qualifiers = normalize_qualifiers( - maven_package.qualifiers, encode=True) + normalized_qualifiers = normalize_qualifiers(maven_package.qualifiers, encode=True) if lowercased_purl_fields: namespace = namespace.lower() @@ -208,35 +212,34 @@ def update_maven_packages(maven_package, fields_to_update, lowercased_purl_field normalize_qualifiers = normalize_qualifiers.lower() existing_packages = Package.objects.filter( - type='maven', + type="maven", namespace=namespace, name=name, version=version, - qualifiers=normalized_qualifiers or '' + qualifiers=normalized_qualifiers or "", ) if existing_package.exists(): duplicate_packages = [] for existing_package in existing_packages: if existing_package.download_url != 
maven_package.download_url: logger.debug( - f'Deleted duplicate Package with incorrect download URL {existing_package.package_uid}') + f"Deleted duplicate Package with incorrect download URL {existing_package.package_uid}" + ) duplicate_packages.append(existing_package) duplicate_packages_pks = [p.pk for p in duplicate_packages] existing_package = Package.objects.exclude( pk__in=duplicate_packages_pks ).get_or_none( - type='maven', + type="maven", namespace=namespace, name=name, version=version, - qualifiers=normalized_qualifiers or '' + qualifiers=normalized_qualifiers or "", ) if existing_package: existing_package = update_package_fields( - existing_package, - maven_package, - fields_to_update + existing_package, maven_package, fields_to_update ) return existing_package, duplicate_packages else: @@ -244,17 +247,17 @@ def update_maven_packages(maven_package, fields_to_update, lowercased_purl_field class Command(VerboseCommand): - help = 'Update maven Package values' + help = "Update maven Package values" def add_arguments(self, parser): parser.add_argument( - '--create_package', + "--create_package", type=bool, - help='Create new Maven Packages if it does not exist in our database' + help="Create new Maven Packages if it does not exist in our database", ) def handle(self, *args, **options): - create_package = options.get('create_package', False) + create_package = options.get("create_package", False) updated_packages_count = 0 created_packages_count = 0 deleted_packages_count = 0 @@ -263,11 +266,11 @@ def handle(self, *args, **options): unsaved_existing_packages_lowercased = [] packages_to_delete = [] - logger.info('Updating or Adding new Packages from Maven Index') + logger.info("Updating or Adding new Packages from Maven Index") collector = MavenNexusCollector() for i, maven_package in enumerate(collector.get_packages()): if not i % 1000: - logger.info(f'Processed {i:,} Maven Artifacts') + logger.info(f"Processed {i:,} Maven Artifacts") if not i % 2000: ( unsaved_existing_packages, @@ -288,15 +291,14 @@ def handle(self, *args, **options): ) fields_to_update = [ - 'download_url', - 'repository_homepage_url', - 'repository_download_url', - 'api_data_url', - 'release_date', + "download_url", + "repository_homepage_url", + "repository_download_url", + "api_data_url", + "release_date", ] existing_package, duplicate_packages = update_maven_packages( - maven_package, - fields_to_update + maven_package, fields_to_update ) if existing_package: unsaved_existing_packages.append(existing_package) @@ -304,43 +306,40 @@ def handle(self, *args, **options): continue fields_to_update = [ - 'namespace', - 'name', - 'version', - 'qualifiers', - 'download_url', - 'repository_homepage_url', - 'repository_download_url', - 'api_data_url', - 'release_date', + "namespace", + "name", + "version", + "qualifiers", + "download_url", + "repository_homepage_url", + "repository_download_url", + "api_data_url", + "release_date", ] existing_package_lowercased, duplicate_packages = update_maven_packages( - maven_package, - fields_to_update, - lowercased_purl_fields=True + maven_package, fields_to_update, lowercased_purl_fields=True ) if existing_package_lowercased: - unsaved_existing_packages_lowercased.append( - existing_package_lowercased) + unsaved_existing_packages_lowercased.append(existing_package_lowercased) packages_to_delete.extend(duplicate_packages) continue if Package.objects.filter(download_url=maven_package.download_url).exists(): logger.debug( - f'Skipping creation of {maven_package.purl} - already 
exists') + f"Skipping creation of {maven_package.purl} - already exists" + ) continue if create_package: normalized_qualifiers = normalize_qualifiers( - maven_package.qualifiers, - encode=True + maven_package.qualifiers, encode=True ) new_package = Package( type=maven_package.type, namespace=maven_package.namespace, name=maven_package.name, version=maven_package.version, - qualifiers=normalized_qualifiers or '', + qualifiers=normalized_qualifiers or "", download_url=maven_package.download_url, size=maven_package.size, sha1=maven_package.sha1, @@ -351,7 +350,7 @@ def handle(self, *args, **options): ) new_package.created_date = timezone.now() unsaved_new_packages.append(new_package) - logger.debug(f'Created Package {maven_package.purl}') + logger.debug(f"Created Package {maven_package.purl}") ( unsaved_existing_packages, diff --git a/minecode/management/user_creation.py b/minecode/management/user_creation.py index ee8cba8a..6248514d 100644 --- a/minecode/management/user_creation.py +++ b/minecode/management/user_creation.py @@ -19,7 +19,7 @@ class CreateUserCommand(BaseCommand): - help = 'Create a user and generate an API key for authentication.' + help = "Create a user and generate an API key for authentication." requires_migrations_checks = True def __init__(self, *args, **kwargs): @@ -30,23 +30,20 @@ def __init__(self, *args, **kwargs): ) def add_arguments(self, parser): + parser.add_argument("username", help="Specifies the username for the user.") parser.add_argument( - 'username', help='Specifies the username for the user.') - parser.add_argument( - '--no-input', - action='store_false', - dest='interactive', - help='Do not prompt the user for input of any kind.', + "--no-input", + action="store_false", + dest="interactive", + help="Do not prompt the user for input of any kind.", ) def handle(self, *args, **options): - username = options['username'] - interactive = options['interactive'] - verbosity = options['verbosity'] + username = options["username"] + interactive = options["interactive"] + verbosity = options["verbosity"] self.create_user( - username=username, - interactive=interactive, - verbosity=verbosity + username=username, interactive=interactive, verbosity=verbosity ) def create_user(self, username, interactive, verbosity): @@ -58,12 +55,11 @@ def create_user(self, username, interactive, verbosity): if interactive: password = self.get_password_from_stdin(username) - user = self.UserModel._default_manager.create_user( - username, password=password) + user = self.UserModel._default_manager.create_user(username, password=password) token, _ = Token._default_manager.get_or_create(user=user) if verbosity >= 1: - msg = f'User {username} created with API key: {token.key}' + msg = f"User {username} created with API key: {token.key}" self.stdout.write(msg, self.style.SUCCESS) return user @@ -78,21 +74,21 @@ def get_password_from_stdin(self, username): password = None while password is None: password1 = getpass.getpass() - password2 = getpass.getpass('Password (again): ') + password2 = getpass.getpass("Password (again): ") if password1 != password2: self.stderr.write("Error: Your passwords didn't match.") continue - if password1.strip() == '': + if password1.strip() == "": self.stderr.write("Error: Blank passwords aren't allowed.") continue try: validate_password(password2, self.UserModel(**fake_user_data)) except exceptions.ValidationError as err: - self.stderr.write('\n'.join(err.messages)) + self.stderr.write("\n".join(err.messages)) response = input( - 'Bypass password validation 
and create user anyway? [y/N]: ' + "Bypass password validation and create user anyway? [y/N]: " ) - if response.lower() != 'y': + if response.lower() != "y": continue password = password1 @@ -106,9 +102,9 @@ def _validate_username(self, username): except self.UserModel.DoesNotExist: pass else: - return 'Error: That username is already taken.' + return "Error: That username is already taken." try: self.username_field.clean(username, None) except exceptions.ValidationError as e: - return '; '.join(e.messages) + return "; ".join(e.messages) diff --git a/minecode/mappings/gcode_keywords.py b/minecode/mappings/gcode_keywords.py index 57fe1648..23e84dff 100644 --- a/minecode/mappings/gcode_keywords.py +++ b/minecode/mappings/gcode_keywords.py @@ -13,438 +13,439 @@ See visitors.googlecode.get_project_labels_feed_as_list() for the origin of those values. """ + GCODE_KEYWORDS = { - 'AJAX': 'AJAX', - 'AOP': 'AOP', - 'API': 'API', - 'AWS': 'AWS', - 'Academic': 'Academic', - 'Accessibility': 'Accessibility', - 'Accounting': 'Accounting', - 'ActiveRecord': 'ActiveRecord', - 'Agent': 'Agent', - 'Agile': 'Agile', - 'Air': 'Air', - 'Album': 'Album', - 'Algorithm': 'Algorithms', - 'Analysis': 'Analysis', - 'Analytics': 'Analytics', - 'Android': 'Android', - 'Animation': 'Animation', - 'Annotation': 'Annotation', - 'Apache': 'Apache', - 'AppEngine': 'AppEngine', - 'Applet': 'Applet', - 'Application': 'Application', - 'Apps': 'Apps', - 'Arcade': 'Arcade', - 'Archive': 'Archive', - 'Arm': 'Arm', - 'Asterisk': 'Asterisk', - 'Astronomy': 'Astronomy', - 'Atom': 'Atom', - 'Automation': 'Automation', - 'BSD': 'BSD', - 'Backup': 'Backup', - 'Batch': 'Batch', - 'Bioinformatics': 'Bioinformatics', - 'Biology': 'Biology', - 'Bittorrent': 'BitTorrent', - 'Blender': 'Blender', - 'Blogger': 'Blogger', - 'Blogging': 'Blogging', - 'Bluetooth': 'Bluetooth', - 'Board': 'Board', - 'Boardgame': 'Board Games', - 'Book': 'Book', - 'Books': 'Books', - 'Boost': 'Boost', - 'Browser': 'Browsers', - 'Build': 'Build Tool', - 'Business': 'Business', - 'CGI': 'CGI', - 'CML': 'CML', - 'CMS': 'CMS Systems', - 'CRM': 'CRM', - 'CRUD': 'CRUD', - 'CS': 'CS', - 'CSE': 'CSE', - 'CSV': 'CSV', - 'CakePHP': 'CakePHP', - 'Calculator': 'Calculator', - 'Canvas': 'Canvas', - 'Chess': 'Chess', - 'Chinese': 'Chinese', - 'Chrome': 'Chrome', - 'ChromeOS': 'ChromeOS', - 'Client': 'Client', - 'Cluster': 'Cluster', - 'Cocoa': 'Cocoa', - 'CodeGeneration': 'Code Generation', - 'Codeigniter': 'Codeigniter', - 'Color': 'Color', - 'Communication': 'Communications', - 'Community': 'Community', - 'Component': 'Component', - 'Compression': 'Compression', - 'Computer': 'Computer', - 'Concurrency': 'Concurrency', - 'Console': 'Console', - 'Contacts': 'Contacts', - 'Content': 'Content', - 'ContentManagement': 'ContentManagement', - 'Control': 'Control', - 'Controller': 'Controller', - 'Convert': 'Convert', - 'Crawler': 'Crawler', - 'CrossPlatform': 'CrossPlatform', - 'Cryptography': 'Cryptography', - 'Cuda': 'Cuda', - 'Custom': 'Custom', - 'DAO': 'DAO', - 'DHTML': 'DHTML', - 'DNS': 'DNS', - 'DSL': 'DSL', - 'Dashboard': 'Dashboard', - 'Database': 'Database', - 'Debug': 'Debugger', - 'Delphi': 'Delphi', - 'Demo': 'Demo', - 'Design': 'Design', - 'Desktop': 'Desktop', - 'DevTool': 'DevTool', - 'Developer': 'Developer', - 'Dictionary': 'Dictionary', - 'Distributed': 'Distributed', - 'Django': 'Django', - 'Doc': 'Documentation', - 'Documentation': 'Documentation', - 'Dojo': 'Dojo', - 'DotNet': 'DotNet', - 'Downloader': 'Downloader', - 'Driver': 'Driver', - 'Drupal': 
'Drupal', - 'Dynamic': 'Dynamic', - 'E-commerce': 'E-commerce', - 'EJB': 'EJB', - 'ERP': 'ERP', - 'Ebook': 'Ebook', - 'Eclipse': 'Eclipse', - 'Embedded': 'Embedded', - 'Emulator': 'Emulators', - 'Engineering': 'Engineering', - 'English': 'English', - 'Enterprise': 'Enterprise', - 'Events': 'Events', - 'Evolution': 'Evolution', - 'Extension': 'Extension', - 'FLV': 'FLV', - 'FUSE': 'FUSE', - 'Facebook': 'Facebook', - 'Filesystem': 'Filesystems', - 'Finance': 'Finance', - 'Firefox': 'Firefox', - 'Firewall': 'Firewalls', - 'Flash': 'Flash', - 'Football': 'Football', - 'Forms': 'Forms', - 'Framework': 'Framework', - 'FreeBSD': 'FreeBSD', - 'Functional': 'Functional', - 'GIS': 'GIS', - 'GPU': 'GPU', - 'GTD': 'GTD', - 'GTK': 'GTK', - 'GWT': 'GWT', - 'Gadget': 'Gadget', - 'Gallery': 'Gallery', - 'Game': 'Game', - 'Gdata': 'Gdata', - 'Generator': 'Generator', - 'Gentoo': 'Gentoo', - 'Geo': 'Geo', - 'Gnome': 'Gnome', - 'Grails': 'Grails', - 'Grid': 'Grid', - 'Guice': 'Guice', - 'HTML': 'HTML/XHTML', - 'HTTP': 'HTTP', - 'Hadoop': 'Hadoop', - 'Hardware': 'Hardware', - 'Health': 'Health', - 'Hello': 'Hello', - 'Hibernate': 'Hibernate', - 'Home': 'Home', - 'Hosting': 'Hosting', - 'I18n': 'I18N (Internationalization)', - 'IDE': 'IDE', - 'IM': 'IM', - 'IOC': 'IOC', - 'IP': 'IP', - 'IRC': 'IRC', - 'Images': 'Images', - 'Installer': 'Installer', - 'Integration': 'Integration', - 'Interactive': 'Interactive', - 'Interface': 'Interface', - 'Internet': 'Internet', - 'Itunes': 'Itunes', - 'JBoss': 'JBoss', - 'JEE': 'JEE', - 'JME': 'JME', - 'JPA': 'JPA', - 'JSF': 'JSF', - 'JSON': 'JSON', - 'JSP': 'JSP', - 'Jabber': 'Jabber', - 'Japanese': 'Japanese', - 'JavaFX': 'JavaFX', - 'Jobeet': 'Jobeet', - 'Joomla': 'Joomla', - 'KDE': 'KDE', - 'KML': 'KML', - 'Kernel': 'Kernel', - 'Keyboard': 'Keyboard', - 'LDAP': 'LDAP', - 'LaTex': 'TeX/LaTeX', - 'Lab': 'Lab', - 'Layout': 'Layout', - 'Learn': 'Learn', - 'Library': 'Library', - 'Life': 'Life', - 'Light': 'Light', - 'Linq': 'Linq', - 'Linux': 'Linux', - 'List': 'List', - 'Live': 'Live', - 'Localization': 'Localization', - 'Location': 'Location', - 'Log': 'Logging', - 'Logger': 'Logger', - 'MFC': 'MFC', - 'MIDI': 'MIDI', - 'MMO': 'MMO', - 'MMORPG': 'MMORPG', - 'Mac': 'Mac', - 'Machinelearning': 'Machine Learning', - 'Mail': 'Mail', - 'Manage': 'Manage', - 'Mapping': 'Mapping', - 'Mashup': 'Mashup', - 'Mathematics': 'Mathematics', - 'Matlab': 'Matlab', - 'Maven': 'Maven', - 'Mediawiki': 'Mediawiki', - 'Medical': 'Medical', - 'Memory': 'Memory', - 'Menu': 'Menu', - 'Message': 'Message', - 'Messaging': 'Messaging', - 'Messenger': 'Messenger', - 'Microcontroller': 'Microcontroller', - 'Middleware': 'Middleware', - 'Mod': 'Mod', - 'Modeling': 'Modeling', - 'Module': 'Module', - 'Modules': 'Modules', - 'Monitoring': 'Monitoring', - 'Mono': 'Mono', - 'Multiplayer': 'Multiplayer', - 'Multitouch': 'Multitouch', - 'MySQL': 'MySQL', - 'NHibernate': 'NHibernate', - 'Navigation': 'Navigation', - 'Netbeans': 'Netbeans', - 'Networking': 'Networking', - 'News': 'News', - 'Nintendo': 'Nintendo', - 'Notes': 'Notes', - 'OAuth': 'OAuth', - 'OOP': 'OOP', - 'OWL': 'OWL', - 'Object': 'Object', - 'Ocaml': 'Ocaml', - 'Office': 'Office', - 'Ogre': 'Ogre', - 'Online': 'Online', - 'Ontology': 'Ontology', - 'OpenGL': 'OpenGL', - 'OpenID': 'OpenID', - 'OpenSocial': 'OpenSocial', - 'PDF': 'PDF', - 'PSP': 'PSP', - 'Package': 'Package', - 'Parsing': 'Parsing', - 'Password': 'Password', - 'Pattern': 'Pattern', - 'Performance': 'Performance', - 'Persistence': 'Persistence', - 'PhpBB': 'PhpBB', - 'Picasa': 
'Picasa', - 'Platform': 'Platform', - 'Player': 'Player', - 'Podcast': 'Podcast', - 'Poker': 'Poker', - 'Portable': 'Portable', - 'Portal': 'Portal', - 'PostgreSQL': 'PostgreSQL', - 'Process': 'Process', - 'Projects': 'Projects', - 'Projeto': 'Projeto', - 'Protocol': 'Protocol', - 'Prototype': 'Prototype', - 'Proxy': 'Proxy', - 'Prueba': 'Prueba', - 'Query': 'Query', - 'RCP': 'RCP', - 'RDF': 'RDF', - 'REST': 'REST', - 'RIA': 'RIA', - 'RMI': 'RMI', - 'RPC': 'RPC', - 'RSS': 'RSS', - 'RTS': 'RTS', - 'Rails': 'Rails', - 'Random': 'Random', - 'Realtime': 'Realtime', - 'Report': 'Report', - 'Research': 'Research', - 'Robotics': 'Robotics', - 'RogueLike': 'RogueLike', - 'SDF': 'SDF', - 'SDK': 'SDK', - 'SDL': 'SDL', - 'SEO': 'SEO', - 'SIP': 'SIP', - 'SMS': 'SMS', - 'SMTP': 'SMTP', - 'SQL': 'SQL', - 'SQLServer': 'SQLServer', - 'SSH': 'SSH', - 'SWF': 'SWF', - 'SWT': 'SWT', - 'Sandbox': 'Sandbox', - 'Schedule': 'Schedule', - 'Scheduler': 'Scheduler', - 'Scheduling': 'Scheduling', - 'Scrum': 'Scrum', - 'Seam': 'Seam', - 'SearchEngine': 'SearchEngine', - 'Semantic': 'Semantic', - 'SemanticWeb': 'SemanticWeb', - 'Server': 'Server', - 'Service': 'Service', - 'Services': 'Services', - 'Sharing': 'Sharing', - 'Shooter': 'Shooter', - 'Simple': 'Simple', - 'Simulator': 'Simulator', - 'Sistema': 'Sistema', - 'SlideShow': 'SlideShow', - 'Small': 'Small', - 'SocialNetworking': 'SocialNetworking', - 'Socket': 'Socket', - 'Sockets': 'Sockets', - 'Spider': 'Spider', - 'Spring': 'Spring', - 'Sqlite': 'Sqlite', - 'Statistics': 'Statistics', - 'Storage': 'Storage', - 'Stream': 'Stream', - 'Struts': 'Struts', - 'Student': 'Student', - 'Study': 'Study', - 'Subversion': 'Subversion', - 'Sudoku': 'Sudoku', - 'Svn': 'Svn', - 'Swing': 'Swing', - 'Symfony': 'Symfony', - 'Sync': 'Sync', - 'TCL': 'TCL', - 'TCP': 'TCP', - 'Table': 'Table', - 'Taggi': 'Taggi', - 'Tasks': 'Tasks', - 'Template': 'Templates', - 'Terminal': 'Terminal', - 'Theme': 'Theme', - 'Thesis': 'Thesis', - 'Time': 'Time', - 'Timer': 'Timer', - 'Tool': 'Tool', - 'Toolkit': 'Toolkit', - 'Tracking': 'Tracking', - 'Traffic': 'Traffic', - 'Training': 'Training', - 'Translate': 'Translate', - 'Translation': 'Translation', - 'Travel': 'Travel', - 'Tree': 'Tree', - 'Tutorial': 'Tutorial', - 'Twitter': 'Twitter', - 'UDP': 'UDP', - 'UI': 'UI', - 'UML': 'UML', - 'URL': 'URL', - 'Ubuntu': 'Ubuntu', - 'Unicode': 'Unicode', - 'UnitTesting': 'Unit Test', - 'Unittest': 'Unit Test', - 'University': 'University', - 'Unix': 'Unix', - 'Utility': 'Utility', - 'Vector': 'Vector', - 'Videogame': 'Videogame', - 'Viewer': 'Viewer', - 'Virtual': 'Virtual', - 'Visual': 'Visual', - 'VisualStudio': 'VisualStudio', - 'WPF': 'WPF', - 'Wave': 'Wave', - 'Web': 'Web', - 'Webcam': 'Webcam', - 'Webkit': 'Webkit', - 'Webservice': 'Web Service', - 'Webservices': 'Web Service', - 'Website': 'Website', - 'WiFi': 'WiFi', - 'Wicket': 'Wicket', - 'Widget': 'Widget', - 'Widgets': 'Widgets', - 'Wiki': 'Wiki', - 'Wikipedia': 'Wikipedia', - 'Windows': 'Windows', - 'WoW': 'WoW', - 'Word': 'Word', - 'Work': 'Work', - 'World': 'World', - 'XHTML': 'XHTML', - 'XMPP': 'XMPP', - 'XNA': 'XNA', - 'XSL': 'XSL', - 'XUL': 'XUL', - 'XWindow': 'XWindow', - 'YUI': 'YUI', - 'YouTube': 'YouTube', - 'Zend': 'Zend', - 'ZendFramework': 'ZendFramework', - 'addon': 'Addon', - 'extjs': 'extjs', - 'ffmpeg': 'ffmpeg', - 'iPhone': 'iPhone', - 'j2ee': 'j2ee', - 'j2me': 'j2me', - 'j2se': 'j2se', - 'jQuery': 'jQuery', - 'memcached': 'memcached', - 'mp3': 'MP3', - 'p2p': 'p2p', - 'plugin': 'plugin', - 'pygame': 'pygame', - 'pyqt': 
'pyqt', - 'regex': 'Regex', - 's3': 's3', - 's60': 's60', - 'twisted': 'Twisted', - 'wxwidgets': 'wxwidgets', + "AJAX": "AJAX", + "AOP": "AOP", + "API": "API", + "AWS": "AWS", + "Academic": "Academic", + "Accessibility": "Accessibility", + "Accounting": "Accounting", + "ActiveRecord": "ActiveRecord", + "Agent": "Agent", + "Agile": "Agile", + "Air": "Air", + "Album": "Album", + "Algorithm": "Algorithms", + "Analysis": "Analysis", + "Analytics": "Analytics", + "Android": "Android", + "Animation": "Animation", + "Annotation": "Annotation", + "Apache": "Apache", + "AppEngine": "AppEngine", + "Applet": "Applet", + "Application": "Application", + "Apps": "Apps", + "Arcade": "Arcade", + "Archive": "Archive", + "Arm": "Arm", + "Asterisk": "Asterisk", + "Astronomy": "Astronomy", + "Atom": "Atom", + "Automation": "Automation", + "BSD": "BSD", + "Backup": "Backup", + "Batch": "Batch", + "Bioinformatics": "Bioinformatics", + "Biology": "Biology", + "Bittorrent": "BitTorrent", + "Blender": "Blender", + "Blogger": "Blogger", + "Blogging": "Blogging", + "Bluetooth": "Bluetooth", + "Board": "Board", + "Boardgame": "Board Games", + "Book": "Book", + "Books": "Books", + "Boost": "Boost", + "Browser": "Browsers", + "Build": "Build Tool", + "Business": "Business", + "CGI": "CGI", + "CML": "CML", + "CMS": "CMS Systems", + "CRM": "CRM", + "CRUD": "CRUD", + "CS": "CS", + "CSE": "CSE", + "CSV": "CSV", + "CakePHP": "CakePHP", + "Calculator": "Calculator", + "Canvas": "Canvas", + "Chess": "Chess", + "Chinese": "Chinese", + "Chrome": "Chrome", + "ChromeOS": "ChromeOS", + "Client": "Client", + "Cluster": "Cluster", + "Cocoa": "Cocoa", + "CodeGeneration": "Code Generation", + "Codeigniter": "Codeigniter", + "Color": "Color", + "Communication": "Communications", + "Community": "Community", + "Component": "Component", + "Compression": "Compression", + "Computer": "Computer", + "Concurrency": "Concurrency", + "Console": "Console", + "Contacts": "Contacts", + "Content": "Content", + "ContentManagement": "ContentManagement", + "Control": "Control", + "Controller": "Controller", + "Convert": "Convert", + "Crawler": "Crawler", + "CrossPlatform": "CrossPlatform", + "Cryptography": "Cryptography", + "Cuda": "Cuda", + "Custom": "Custom", + "DAO": "DAO", + "DHTML": "DHTML", + "DNS": "DNS", + "DSL": "DSL", + "Dashboard": "Dashboard", + "Database": "Database", + "Debug": "Debugger", + "Delphi": "Delphi", + "Demo": "Demo", + "Design": "Design", + "Desktop": "Desktop", + "DevTool": "DevTool", + "Developer": "Developer", + "Dictionary": "Dictionary", + "Distributed": "Distributed", + "Django": "Django", + "Doc": "Documentation", + "Documentation": "Documentation", + "Dojo": "Dojo", + "DotNet": "DotNet", + "Downloader": "Downloader", + "Driver": "Driver", + "Drupal": "Drupal", + "Dynamic": "Dynamic", + "E-commerce": "E-commerce", + "EJB": "EJB", + "ERP": "ERP", + "Ebook": "Ebook", + "Eclipse": "Eclipse", + "Embedded": "Embedded", + "Emulator": "Emulators", + "Engineering": "Engineering", + "English": "English", + "Enterprise": "Enterprise", + "Events": "Events", + "Evolution": "Evolution", + "Extension": "Extension", + "FLV": "FLV", + "FUSE": "FUSE", + "Facebook": "Facebook", + "Filesystem": "Filesystems", + "Finance": "Finance", + "Firefox": "Firefox", + "Firewall": "Firewalls", + "Flash": "Flash", + "Football": "Football", + "Forms": "Forms", + "Framework": "Framework", + "FreeBSD": "FreeBSD", + "Functional": "Functional", + "GIS": "GIS", + "GPU": "GPU", + "GTD": "GTD", + "GTK": "GTK", + "GWT": "GWT", + "Gadget": "Gadget", + 
"Gallery": "Gallery", + "Game": "Game", + "Gdata": "Gdata", + "Generator": "Generator", + "Gentoo": "Gentoo", + "Geo": "Geo", + "Gnome": "Gnome", + "Grails": "Grails", + "Grid": "Grid", + "Guice": "Guice", + "HTML": "HTML/XHTML", + "HTTP": "HTTP", + "Hadoop": "Hadoop", + "Hardware": "Hardware", + "Health": "Health", + "Hello": "Hello", + "Hibernate": "Hibernate", + "Home": "Home", + "Hosting": "Hosting", + "I18n": "I18N (Internationalization)", + "IDE": "IDE", + "IM": "IM", + "IOC": "IOC", + "IP": "IP", + "IRC": "IRC", + "Images": "Images", + "Installer": "Installer", + "Integration": "Integration", + "Interactive": "Interactive", + "Interface": "Interface", + "Internet": "Internet", + "Itunes": "Itunes", + "JBoss": "JBoss", + "JEE": "JEE", + "JME": "JME", + "JPA": "JPA", + "JSF": "JSF", + "JSON": "JSON", + "JSP": "JSP", + "Jabber": "Jabber", + "Japanese": "Japanese", + "JavaFX": "JavaFX", + "Jobeet": "Jobeet", + "Joomla": "Joomla", + "KDE": "KDE", + "KML": "KML", + "Kernel": "Kernel", + "Keyboard": "Keyboard", + "LDAP": "LDAP", + "LaTex": "TeX/LaTeX", + "Lab": "Lab", + "Layout": "Layout", + "Learn": "Learn", + "Library": "Library", + "Life": "Life", + "Light": "Light", + "Linq": "Linq", + "Linux": "Linux", + "List": "List", + "Live": "Live", + "Localization": "Localization", + "Location": "Location", + "Log": "Logging", + "Logger": "Logger", + "MFC": "MFC", + "MIDI": "MIDI", + "MMO": "MMO", + "MMORPG": "MMORPG", + "Mac": "Mac", + "Machinelearning": "Machine Learning", + "Mail": "Mail", + "Manage": "Manage", + "Mapping": "Mapping", + "Mashup": "Mashup", + "Mathematics": "Mathematics", + "Matlab": "Matlab", + "Maven": "Maven", + "Mediawiki": "Mediawiki", + "Medical": "Medical", + "Memory": "Memory", + "Menu": "Menu", + "Message": "Message", + "Messaging": "Messaging", + "Messenger": "Messenger", + "Microcontroller": "Microcontroller", + "Middleware": "Middleware", + "Mod": "Mod", + "Modeling": "Modeling", + "Module": "Module", + "Modules": "Modules", + "Monitoring": "Monitoring", + "Mono": "Mono", + "Multiplayer": "Multiplayer", + "Multitouch": "Multitouch", + "MySQL": "MySQL", + "NHibernate": "NHibernate", + "Navigation": "Navigation", + "Netbeans": "Netbeans", + "Networking": "Networking", + "News": "News", + "Nintendo": "Nintendo", + "Notes": "Notes", + "OAuth": "OAuth", + "OOP": "OOP", + "OWL": "OWL", + "Object": "Object", + "Ocaml": "Ocaml", + "Office": "Office", + "Ogre": "Ogre", + "Online": "Online", + "Ontology": "Ontology", + "OpenGL": "OpenGL", + "OpenID": "OpenID", + "OpenSocial": "OpenSocial", + "PDF": "PDF", + "PSP": "PSP", + "Package": "Package", + "Parsing": "Parsing", + "Password": "Password", + "Pattern": "Pattern", + "Performance": "Performance", + "Persistence": "Persistence", + "PhpBB": "PhpBB", + "Picasa": "Picasa", + "Platform": "Platform", + "Player": "Player", + "Podcast": "Podcast", + "Poker": "Poker", + "Portable": "Portable", + "Portal": "Portal", + "PostgreSQL": "PostgreSQL", + "Process": "Process", + "Projects": "Projects", + "Projeto": "Projeto", + "Protocol": "Protocol", + "Prototype": "Prototype", + "Proxy": "Proxy", + "Prueba": "Prueba", + "Query": "Query", + "RCP": "RCP", + "RDF": "RDF", + "REST": "REST", + "RIA": "RIA", + "RMI": "RMI", + "RPC": "RPC", + "RSS": "RSS", + "RTS": "RTS", + "Rails": "Rails", + "Random": "Random", + "Realtime": "Realtime", + "Report": "Report", + "Research": "Research", + "Robotics": "Robotics", + "RogueLike": "RogueLike", + "SDF": "SDF", + "SDK": "SDK", + "SDL": "SDL", + "SEO": "SEO", + "SIP": "SIP", + "SMS": "SMS", + "SMTP": 
"SMTP", + "SQL": "SQL", + "SQLServer": "SQLServer", + "SSH": "SSH", + "SWF": "SWF", + "SWT": "SWT", + "Sandbox": "Sandbox", + "Schedule": "Schedule", + "Scheduler": "Scheduler", + "Scheduling": "Scheduling", + "Scrum": "Scrum", + "Seam": "Seam", + "SearchEngine": "SearchEngine", + "Semantic": "Semantic", + "SemanticWeb": "SemanticWeb", + "Server": "Server", + "Service": "Service", + "Services": "Services", + "Sharing": "Sharing", + "Shooter": "Shooter", + "Simple": "Simple", + "Simulator": "Simulator", + "Sistema": "Sistema", + "SlideShow": "SlideShow", + "Small": "Small", + "SocialNetworking": "SocialNetworking", + "Socket": "Socket", + "Sockets": "Sockets", + "Spider": "Spider", + "Spring": "Spring", + "Sqlite": "Sqlite", + "Statistics": "Statistics", + "Storage": "Storage", + "Stream": "Stream", + "Struts": "Struts", + "Student": "Student", + "Study": "Study", + "Subversion": "Subversion", + "Sudoku": "Sudoku", + "Svn": "Svn", + "Swing": "Swing", + "Symfony": "Symfony", + "Sync": "Sync", + "TCL": "TCL", + "TCP": "TCP", + "Table": "Table", + "Taggi": "Taggi", + "Tasks": "Tasks", + "Template": "Templates", + "Terminal": "Terminal", + "Theme": "Theme", + "Thesis": "Thesis", + "Time": "Time", + "Timer": "Timer", + "Tool": "Tool", + "Toolkit": "Toolkit", + "Tracking": "Tracking", + "Traffic": "Traffic", + "Training": "Training", + "Translate": "Translate", + "Translation": "Translation", + "Travel": "Travel", + "Tree": "Tree", + "Tutorial": "Tutorial", + "Twitter": "Twitter", + "UDP": "UDP", + "UI": "UI", + "UML": "UML", + "URL": "URL", + "Ubuntu": "Ubuntu", + "Unicode": "Unicode", + "UnitTesting": "Unit Test", + "Unittest": "Unit Test", + "University": "University", + "Unix": "Unix", + "Utility": "Utility", + "Vector": "Vector", + "Videogame": "Videogame", + "Viewer": "Viewer", + "Virtual": "Virtual", + "Visual": "Visual", + "VisualStudio": "VisualStudio", + "WPF": "WPF", + "Wave": "Wave", + "Web": "Web", + "Webcam": "Webcam", + "Webkit": "Webkit", + "Webservice": "Web Service", + "Webservices": "Web Service", + "Website": "Website", + "WiFi": "WiFi", + "Wicket": "Wicket", + "Widget": "Widget", + "Widgets": "Widgets", + "Wiki": "Wiki", + "Wikipedia": "Wikipedia", + "Windows": "Windows", + "WoW": "WoW", + "Word": "Word", + "Work": "Work", + "World": "World", + "XHTML": "XHTML", + "XMPP": "XMPP", + "XNA": "XNA", + "XSL": "XSL", + "XUL": "XUL", + "XWindow": "XWindow", + "YUI": "YUI", + "YouTube": "YouTube", + "Zend": "Zend", + "ZendFramework": "ZendFramework", + "addon": "Addon", + "extjs": "extjs", + "ffmpeg": "ffmpeg", + "iPhone": "iPhone", + "j2ee": "j2ee", + "j2me": "j2me", + "j2se": "j2se", + "jQuery": "jQuery", + "memcached": "memcached", + "mp3": "MP3", + "p2p": "p2p", + "plugin": "plugin", + "pygame": "pygame", + "pyqt": "pyqt", + "regex": "Regex", + "s3": "s3", + "s60": "s60", + "twisted": "Twisted", + "wxwidgets": "wxwidgets", } diff --git a/minecode/mappings/gcode_licenses.py b/minecode/mappings/gcode_licenses.py index ca5f6483..6ae85c06 100644 --- a/minecode/mappings/gcode_licenses.py +++ b/minecode/mappings/gcode_licenses.py @@ -18,8 +18,7 @@ if not License.objects.filter(dataspace__name='nexB', name=name).exists()] """ - -''' +""" Code licenses @@ -31,32 +30,30 @@ -''' +""" -''' +""" Possible separate content license -''' +""" GCODE_LICENSES = { - 'Apache License 2.0': 'Apache License 2.0', - 'GNU GPL v2': 'GNU General Public License 2.0', + "Apache License 2.0": "Apache License 2.0", + "GNU GPL v2": "GNU General Public License 2.0", # FIXME: or GPL 1.0? 
- 'Artistic License/GPL': 'Artistic License 2.0', - 'New BSD License': 'BSD-Modified', - 'Eclipse Public License 1.0': 'Eclipse Public License 1.0', - 'GNU GPL v3': 'GNU General Public License 3.0', + "Artistic License/GPL": "Artistic License 2.0", + "New BSD License": "BSD-Modified", + "Eclipse Public License 1.0": "Eclipse Public License 1.0", + "GNU GPL v3": "GNU General Public License 3.0", # FIXME: v3.0 only?? - 'GNU Lesser GPL': 'GNU Lesser General Public License 3.0', - 'MIT License': 'MIT License', - 'Mozilla Public License 1.1': 'Mozilla Public License 1.1', - - 'Other Open Source': None, - 'See source code': None, - - 'Creative Commons 3.0 BY': 'Creative Commons Attribution License 3.0', - 'Creative Commons 3.0 BY-SA': 'Creative Commons Attribution Share Alike License 3.0', + "GNU Lesser GPL": "GNU Lesser General Public License 3.0", + "MIT License": "MIT License", + "Mozilla Public License 1.1": "Mozilla Public License 1.1", + "Other Open Source": None, + "See source code": None, + "Creative Commons 3.0 BY": "Creative Commons Attribution License 3.0", + "Creative Commons 3.0 BY-SA": "Creative Commons Attribution Share Alike License 3.0", } diff --git a/minecode/mappings/gcode_programming_languages.py b/minecode/mappings/gcode_programming_languages.py index e4ef8608..b50a1c92 100644 --- a/minecode/mappings/gcode_programming_languages.py +++ b/minecode/mappings/gcode_programming_languages.py @@ -8,47 +8,45 @@ # -""" -Structure: {'googlecode': 'dje'} -""" +"""Structure: {'googlecode': 'dje'}""" GCODE_PROGRAMMING_LANGUAGES = { - 'ASP': 'ASP', - 'ASP.net': 'ASP', - 'ActionScript': 'ActionScript', - 'Ada': 'Ada', - 'Arduino': 'Arduino', - 'Assembly': 'Assembly', - 'Bash': 'Bash', - 'BASIC': 'Visual Basic', - 'C': 'C', - 'CPlusPlus': 'C++', - 'CSS': 'CSS', - 'CSharp': 'C#', - 'DLanguage': 'D', - 'Erlang': 'Erlang', - 'Flex': 'Flex', - 'Forth': 'Forth', - 'Fortran': 'Fortran', - 'Go': 'Go', - 'Groovy': 'Groovy', - 'Java': 'Java', - 'JavaScript': 'JavaScript', - 'Lisp': 'Lisp', - 'Lua': 'Lua', - 'Mirah': 'Mirah', - 'ObjectivE-C': 'Objective-C', - 'PHP': 'PHP', - 'Pascal': 'Pascal', - 'Perl': 'Perl', - 'Processing': 'Processing', - 'Python': 'Python', - 'Ruby': 'Ruby', - 'SLanguage': 'S', - 'Scala': 'Scala', - 'Scheme': 'Scheme', - 'Shell': 'Shell', - 'Smalltalk': 'Smalltalk', - 'VisualBASIC': 'Visual Basic', - 'XSLT': 'XSLT', + "ASP": "ASP", + "ASP.net": "ASP", + "ActionScript": "ActionScript", + "Ada": "Ada", + "Arduino": "Arduino", + "Assembly": "Assembly", + "Bash": "Bash", + "BASIC": "Visual Basic", + "C": "C", + "CPlusPlus": "C++", + "CSS": "CSS", + "CSharp": "C#", + "DLanguage": "D", + "Erlang": "Erlang", + "Flex": "Flex", + "Forth": "Forth", + "Fortran": "Fortran", + "Go": "Go", + "Groovy": "Groovy", + "Java": "Java", + "JavaScript": "JavaScript", + "Lisp": "Lisp", + "Lua": "Lua", + "Mirah": "Mirah", + "ObjectivE-C": "Objective-C", + "PHP": "PHP", + "Pascal": "Pascal", + "Perl": "Perl", + "Processing": "Processing", + "Python": "Python", + "Ruby": "Ruby", + "SLanguage": "S", + "Scala": "Scala", + "Scheme": "Scheme", + "Shell": "Shell", + "Smalltalk": "Smalltalk", + "VisualBASIC": "Visual Basic", + "XSLT": "XSLT", } diff --git a/minecode/mappings/pypi_trove.py b/minecode/mappings/pypi_trove.py index 4c860c2d..4ad05e7c 100644 --- a/minecode/mappings/pypi_trove.py +++ b/minecode/mappings/pypi_trove.py @@ -12,75 +12,74 @@ See https://pypi.python.org/pypi?%3Aaction=list_classifiers """ - licenses = { - 'License :: Aladdin Free Public License (AFPL)': 'afpl-9.0', - 'License :: CC0 1.0 
Universal (CC0 1.0) Public Domain Dedication': 'cc0-1.0', - 'License :: DFSG approved': 'unknown', - 'License :: Eiffel Forum License (EFL)': 'efl-2.0', - 'License :: Free For Educational Use': 'proprietary', - 'License :: Free For Home Use': 'proprietary', - 'License :: Free for non-commercial use': 'proprietary', - 'License :: Freely Distributable': 'unknown', - 'License :: Free To Use But Restricted': 'proprietary', - 'License :: Freeware': 'proprietary', - 'License :: Netscape Public License (NPL)': 'npl-1.1', - 'License :: Nokia Open Source License (NOKOS)': 'nokos-1.0a', + "License :: Aladdin Free Public License (AFPL)": "afpl-9.0", + "License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication": "cc0-1.0", + "License :: DFSG approved": "unknown", + "License :: Eiffel Forum License (EFL)": "efl-2.0", + "License :: Free For Educational Use": "proprietary", + "License :: Free For Home Use": "proprietary", + "License :: Free for non-commercial use": "proprietary", + "License :: Freely Distributable": "unknown", + "License :: Free To Use But Restricted": "proprietary", + "License :: Freeware": "proprietary", + "License :: Netscape Public License (NPL)": "npl-1.1", + "License :: Nokia Open Source License (NOKOS)": "nokos-1.0a", # 'License :: OSI Approved': '', - 'License :: OSI Approved :: Academic Free License (AFL)': 'afl-3.0', - 'License :: OSI Approved :: Apache Software License': 'apache-2.0', - 'License :: OSI Approved :: Apple Public Source License': 'apsl-2.0', - 'License :: OSI Approved :: Artistic License': 'artistic-2.0', - 'License :: OSI Approved :: Attribution Assurance License': 'attribution', - 'License :: OSI Approved :: BSD License': 'bsd-new', - 'License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)': 'cecill-2.1', - 'License :: OSI Approved :: Common Public License': 'cpl-1.0', - 'License :: OSI Approved :: Eiffel Forum License': 'efl-2.0', - 'License :: OSI Approved :: European Union Public Licence 1.0 (EUPL 1.0)': 'eupl-1.0', - 'License :: OSI Approved :: European Union Public Licence 1.1 (EUPL 1.1)': 'eupl-1.1', - 'License :: OSI Approved :: GNU Affero General Public License v3': 'agpl-3.0', + "License :: OSI Approved :: Academic Free License (AFL)": "afl-3.0", + "License :: OSI Approved :: Apache Software License": "apache-2.0", + "License :: OSI Approved :: Apple Public Source License": "apsl-2.0", + "License :: OSI Approved :: Artistic License": "artistic-2.0", + "License :: OSI Approved :: Attribution Assurance License": "attribution", + "License :: OSI Approved :: BSD License": "bsd-new", + "License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)": "cecill-2.1", + "License :: OSI Approved :: Common Public License": "cpl-1.0", + "License :: OSI Approved :: Eiffel Forum License": "efl-2.0", + "License :: OSI Approved :: European Union Public Licence 1.0 (EUPL 1.0)": "eupl-1.0", + "License :: OSI Approved :: European Union Public Licence 1.1 (EUPL 1.1)": "eupl-1.1", + "License :: OSI Approved :: GNU Affero General Public License v3": "agpl-3.0", # FIXME: we do not have agpl-3.0+ - 'License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)': 'agpl-3.0', - 'License :: OSI Approved :: GNU Free Documentation License (FDL)': 'gfdl-1.3', - 'License :: OSI Approved :: GNU General Public License (GPL)': 'gpl', - 'License :: OSI Approved :: GNU General Public License v2 (GPLv2)': 'gpl-2.0', - 'License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)': 
'gpl-2.0-plus', - 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)': 'gpl-3.0', - 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)': 'gpl-3.0-plus', - 'License :: OSI Approved :: GNU Lesser General Public License v2 (LGPLv2)': 'lgpl-2.0', - 'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)': 'lgpl-2.0-plus', - 'License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)': 'lgpl-3.0', - 'License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)': 'lgpl-3.0-plus', - 'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)': 'lgpl', - 'License :: OSI Approved :: IBM Public License': 'ibmpl-1.0', - 'License :: OSI Approved :: Intel Open Source License': 'intel-bsd-export-control', - 'License :: OSI Approved :: ISC License (ISCL)': 'isc', - 'License :: OSI Approved :: Jabber Open Source License': 'josl-1.0', - 'License :: OSI Approved :: MIT License': 'mit', + "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)": "agpl-3.0", + "License :: OSI Approved :: GNU Free Documentation License (FDL)": "gfdl-1.3", + "License :: OSI Approved :: GNU General Public License (GPL)": "gpl", + "License :: OSI Approved :: GNU General Public License v2 (GPLv2)": "gpl-2.0", + "License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)": "gpl-2.0-plus", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)": "gpl-3.0", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)": "gpl-3.0-plus", + "License :: OSI Approved :: GNU Lesser General Public License v2 (LGPLv2)": "lgpl-2.0", + "License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)": "lgpl-2.0-plus", + "License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)": "lgpl-3.0", + "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)": "lgpl-3.0-plus", + "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)": "lgpl", + "License :: OSI Approved :: IBM Public License": "ibmpl-1.0", + "License :: OSI Approved :: Intel Open Source License": "intel-bsd-export-control", + "License :: OSI Approved :: ISC License (ISCL)": "isc", + "License :: OSI Approved :: Jabber Open Source License": "josl-1.0", + "License :: OSI Approved :: MIT License": "mit", # FIXME: old and not in scancode: https://opensource.org/licenses/mitrepl # 'License :: OSI Approved :: MITRE Collaborative Virtual Workspace License (CVW)': '', - 'License :: OSI Approved :: Motosoto License': 'motosoto-0.9.1', - 'License :: OSI Approved :: Mozilla Public License 1.0 (MPL)': 'mpl-1.0', - 'License :: OSI Approved :: Mozilla Public License 1.1 (MPL 1.1)': 'mpl-1.1', - 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)': 'mpl-2.0', - 'License :: OSI Approved :: Nethack General Public License': 'ngpl', - 'License :: OSI Approved :: Nokia Open Source License': 'nokos-1.0a', - 'License :: OSI Approved :: Open Group Test Suite License': 'opengroup', - 'License :: OSI Approved :: Python License (CNRI Python License)': 'cnri-python-1.6.1', - 'License :: OSI Approved :: Python Software Foundation License': 'python', - 'License :: OSI Approved :: Qt Public License (QPL)': 'qpl-1.0', - 'License :: OSI Approved :: Ricoh Source Code Public License': 'ricoh-1.0', - 'License :: OSI Approved :: Sleepycat License': 'sleepycat', - 'License :: OSI Approved :: Sun Industry Standards Source License 
(SISSL)': 'sun-sissl-1.2', - 'License :: OSI Approved :: Sun Public License': 'spl-1.0', - 'License :: OSI Approved :: University of Illinois/NCSA Open Source License': 'uoi-ncsa', - 'License :: OSI Approved :: Vovida Software License 1.0': 'vsl-1.0', - 'License :: OSI Approved :: W3C License': 'w3c', - 'License :: OSI Approved :: X.Net License': 'xnet', - 'License :: OSI Approved :: zlib/libpng License': 'zlib', - 'License :: OSI Approved :: Zope Public License': 'zpl-2.1', - 'License :: Other/Proprietary License': 'proprietary', - 'License :: Public Domain': 'public-domain', + "License :: OSI Approved :: Motosoto License": "motosoto-0.9.1", + "License :: OSI Approved :: Mozilla Public License 1.0 (MPL)": "mpl-1.0", + "License :: OSI Approved :: Mozilla Public License 1.1 (MPL 1.1)": "mpl-1.1", + "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)": "mpl-2.0", + "License :: OSI Approved :: Nethack General Public License": "ngpl", + "License :: OSI Approved :: Nokia Open Source License": "nokos-1.0a", + "License :: OSI Approved :: Open Group Test Suite License": "opengroup", + "License :: OSI Approved :: Python License (CNRI Python License)": "cnri-python-1.6.1", + "License :: OSI Approved :: Python Software Foundation License": "python", + "License :: OSI Approved :: Qt Public License (QPL)": "qpl-1.0", + "License :: OSI Approved :: Ricoh Source Code Public License": "ricoh-1.0", + "License :: OSI Approved :: Sleepycat License": "sleepycat", + "License :: OSI Approved :: Sun Industry Standards Source License (SISSL)": "sun-sissl-1.2", + "License :: OSI Approved :: Sun Public License": "spl-1.0", + "License :: OSI Approved :: University of Illinois/NCSA Open Source License": "uoi-ncsa", + "License :: OSI Approved :: Vovida Software License 1.0": "vsl-1.0", + "License :: OSI Approved :: W3C License": "w3c", + "License :: OSI Approved :: X.Net License": "xnet", + "License :: OSI Approved :: zlib/libpng License": "zlib", + "License :: OSI Approved :: Zope Public License": "zpl-2.1", + "License :: Other/Proprietary License": "proprietary", + "License :: Public Domain": "public-domain", # not in scancode # 'License :: Repoze Public License': '', } diff --git a/minecode/mappings/sfnet_licenses.py b/minecode/mappings/sfnet_licenses.py index dd8810dd..77267caf 100644 --- a/minecode/mappings/sfnet_licenses.py +++ b/minecode/mappings/sfnet_licenses.py @@ -17,86 +17,85 @@ if not License.objects.filter(dataspace__name='nexB', name=name).exists()] """ - SFNET_LICENSES = { - 'Academic Free License (AFL)': 'Academic Free License 3.0', - 'Adaptive Public License': 'Adaptive Public License', - 'Affero GNU Public License ': 'GNU Affero General Public License 3.0', - 'Apache License V2.0': 'Apache License 2.0', - 'Apache Software License': 'Apache License 2.0', - 'Apple Public Source License': 'Apple Public Source License 2.0', - 'Artistic License': 'Artistic License 2.0', - 'Artistic License 2.0': 'Artistic License 2.0', - 'Attribution Assurance License': 'Attribution Assurance License', - 'BSD License': 'BSD-Modified', - 'Boost Software License (BSL1.0)': 'Boost Software License 1.0', - 'Common Development and Distribution License': 'Common Development and Distribution License 1.1', - 'Common Public Attribution License 1.0 (CPAL)': 'Common Public Attribution License 1.0', - 'Common Public License 1.0': 'Common Public License 1.0', - 'Computer Associates Trusted Open Source License 1.1': 'Computer Associates Trusted Open Source License 1.1', - 'Creative Commons Attribution License': 'Creative 
Commons Attribution License 3.0', - 'Creative Commons Attribution Non-Commercial License V2.0': 'Creative Commons Attribution Non-Commercial 2.0', - 'Creative Commons Attribution ShareAlike License V2.0': 'Creative Commons Attribution Share Alike License 2.0', - 'Creative Commons Attribution ShareAlike License V3.0': 'Creative Commons Attribution Share Alike License 3.0', - 'CUA Office Public License Version 1.0': 'CUA Office Public License 1.0', - 'Eclipse Public License': 'Eclipse Public License 1.0', - 'Educational Community License, Version 2.0': 'Educational Community License 2.0', - 'Eiffel Forum License V2.0': 'Eiffel Forum License 2.0', - 'Eiffel Forum License': 'Eiffel Forum License 2.0', - 'Entessa Public License': 'Entessa Public License v1.0', - 'EU DataGrid Software License': 'EU DataGrid Software License', - 'European Union Public License': 'European Union Public Licence 1.1', - 'Fair License': 'Fair License', - 'GNU General Public License version 2.0 (GPLv2)': 'GNU General Public License 2.0', - 'GNU General Public License version 3.0 (GPLv3)': 'GNU General Public License 3.0', - 'GNU General Public License with Classpath exception (Classpath::License)': 'GNU General Public License 2.0 with Classpath exception', - 'GNU Library or Lesser General Public License version 2.0 (LGPLv2)': 'GNU Library General Public License 2.0', - 'GNU Library or Lesser General Public License version 3.0 (LGPLv3)': 'GNU Lesser General Public License 3.0', - 'Historical Permission Notice and Disclaimer': 'Historical Permission Notice and Disclaimer', - 'IBM Public License': 'IBM Public License', - 'ISC License': 'ISC License (ISCL)', - 'Intel Open Source License': 'Intel Open Source License 1989', - 'Jabber Open Source License': 'Jabber Open Source License 1.0', - 'LaTeX Project Public License': 'LaTeX Project Public License v1.3a', - 'Lucent Public License Version 1.02': 'Lucent Public License 1.02', - 'MIT License': 'MIT License', - 'Microsoft Public License': 'Microsoft Public License', - 'Microsoft Reciprocal License': 'Microsoft Reciprocal License', - 'Mozilla Public License 1.0 (MPL)': 'Mozilla Public License 1.0', - 'Mozilla Public License 1.1 (MPL 1.1)': 'Mozilla Public License 1.1', - 'Mozilla Public License 2.0 (MPL 2.0)': 'Mozilla Public License 2.0', - 'NASA Open Source Agreement': 'NASA Open Source License v1.3', - 'Nethack General Public License': 'Nethack General Public License', - 'Nokia Open Source License': 'Nokia Open Source License 1.0a', - 'Non-Profit Open Software License 3.0 (Non-Profit OSL 3.0)': 'Non-Profit Open Software License 3.0', - 'NTP License': 'NTP License', - 'OCLC Research Public License 2.0': 'OCLC Research Public License 2.0', - 'OSI-Approved Open Source': None, - 'Open Font License 1.1 (OFL 1.1)': 'Open Font License 1.1', - 'Open Group Test Suite License': 'Open Group Test Suite License', - 'Open Software License 3.0 (OSL3.0)': 'Open Software License 3.0', - 'Other License': None, - 'PHP License': 'PHP License 3.01', - 'Public Domain': 'Public Domain', - 'Python License (CNRI Python License)': 'CNRI Open Source License Agreement for Python 1.6.1', - 'Python Software Foundation License': 'Python Software Foundation License v2', - 'Qt Public License (QPL)': 'Q Public License Version 1.0', - 'Reciprocal Public License 1.5 (RPL1.5)': 'Reciprocal Public License 1.5', - 'RealNetworks Public Source License V1.0': 'RealNetworks Public Source License v1.0', - 'Reciprocal Public License': 'Reciprocal Public License 1.5', - 'Ricoh Source Code Public License': 'Ricoh 
Source Code Public License v1.0', - 'Simple Public License 2.0': 'Simple Public License Version 2.0', - 'Sleepycat License': 'Sleepycat License (Berkeley Database License)', - 'Sun Industry Standards Source License (SISSL)': 'Sun Industry Standards Source License 1.2', - 'Sun Public License': 'Sun Public License 1.0', - 'Sybase Open Watcom Public License': 'Sybase Open Watcom Public License v1.0', - 'University of Illinois/NCSA Open Source License': 'University of Illinois/NCSA Open Source License', - 'Vovida Software License 1.0': 'Vovida Software License v. 1.0', - 'W3C License': 'W3C Software Notice and License', - 'Zope Public License': 'Zope Public License 2.1', - 'wxWindows Library Licence': 'wxWindows Library Licence 3.1', - 'X.Net License': 'X.Net Inc. License', - 'zlib/libpng License': 'Libpng License', + "Academic Free License (AFL)": "Academic Free License 3.0", + "Adaptive Public License": "Adaptive Public License", + "Affero GNU Public License ": "GNU Affero General Public License 3.0", + "Apache License V2.0": "Apache License 2.0", + "Apache Software License": "Apache License 2.0", + "Apple Public Source License": "Apple Public Source License 2.0", + "Artistic License": "Artistic License 2.0", + "Artistic License 2.0": "Artistic License 2.0", + "Attribution Assurance License": "Attribution Assurance License", + "BSD License": "BSD-Modified", + "Boost Software License (BSL1.0)": "Boost Software License 1.0", + "Common Development and Distribution License": "Common Development and Distribution License 1.1", + "Common Public Attribution License 1.0 (CPAL)": "Common Public Attribution License 1.0", + "Common Public License 1.0": "Common Public License 1.0", + "Computer Associates Trusted Open Source License 1.1": "Computer Associates Trusted Open Source License 1.1", + "Creative Commons Attribution License": "Creative Commons Attribution License 3.0", + "Creative Commons Attribution Non-Commercial License V2.0": "Creative Commons Attribution Non-Commercial 2.0", + "Creative Commons Attribution ShareAlike License V2.0": "Creative Commons Attribution Share Alike License 2.0", + "Creative Commons Attribution ShareAlike License V3.0": "Creative Commons Attribution Share Alike License 3.0", + "CUA Office Public License Version 1.0": "CUA Office Public License 1.0", + "Eclipse Public License": "Eclipse Public License 1.0", + "Educational Community License, Version 2.0": "Educational Community License 2.0", + "Eiffel Forum License V2.0": "Eiffel Forum License 2.0", + "Eiffel Forum License": "Eiffel Forum License 2.0", + "Entessa Public License": "Entessa Public License v1.0", + "EU DataGrid Software License": "EU DataGrid Software License", + "European Union Public License": "European Union Public Licence 1.1", + "Fair License": "Fair License", + "GNU General Public License version 2.0 (GPLv2)": "GNU General Public License 2.0", + "GNU General Public License version 3.0 (GPLv3)": "GNU General Public License 3.0", + "GNU General Public License with Classpath exception (Classpath::License)": "GNU General Public License 2.0 with Classpath exception", + "GNU Library or Lesser General Public License version 2.0 (LGPLv2)": "GNU Library General Public License 2.0", + "GNU Library or Lesser General Public License version 3.0 (LGPLv3)": "GNU Lesser General Public License 3.0", + "Historical Permission Notice and Disclaimer": "Historical Permission Notice and Disclaimer", + "IBM Public License": "IBM Public License", + "ISC License": "ISC License (ISCL)", + "Intel Open Source License": "Intel 
Open Source License 1989", + "Jabber Open Source License": "Jabber Open Source License 1.0", + "LaTeX Project Public License": "LaTeX Project Public License v1.3a", + "Lucent Public License Version 1.02": "Lucent Public License 1.02", + "MIT License": "MIT License", + "Microsoft Public License": "Microsoft Public License", + "Microsoft Reciprocal License": "Microsoft Reciprocal License", + "Mozilla Public License 1.0 (MPL)": "Mozilla Public License 1.0", + "Mozilla Public License 1.1 (MPL 1.1)": "Mozilla Public License 1.1", + "Mozilla Public License 2.0 (MPL 2.0)": "Mozilla Public License 2.0", + "NASA Open Source Agreement": "NASA Open Source License v1.3", + "Nethack General Public License": "Nethack General Public License", + "Nokia Open Source License": "Nokia Open Source License 1.0a", + "Non-Profit Open Software License 3.0 (Non-Profit OSL 3.0)": "Non-Profit Open Software License 3.0", + "NTP License": "NTP License", + "OCLC Research Public License 2.0": "OCLC Research Public License 2.0", + "OSI-Approved Open Source": None, + "Open Font License 1.1 (OFL 1.1)": "Open Font License 1.1", + "Open Group Test Suite License": "Open Group Test Suite License", + "Open Software License 3.0 (OSL3.0)": "Open Software License 3.0", + "Other License": None, + "PHP License": "PHP License 3.01", + "Public Domain": "Public Domain", + "Python License (CNRI Python License)": "CNRI Open Source License Agreement for Python 1.6.1", + "Python Software Foundation License": "Python Software Foundation License v2", + "Qt Public License (QPL)": "Q Public License Version 1.0", + "Reciprocal Public License 1.5 (RPL1.5)": "Reciprocal Public License 1.5", + "RealNetworks Public Source License V1.0": "RealNetworks Public Source License v1.0", + "Reciprocal Public License": "Reciprocal Public License 1.5", + "Ricoh Source Code Public License": "Ricoh Source Code Public License v1.0", + "Simple Public License 2.0": "Simple Public License Version 2.0", + "Sleepycat License": "Sleepycat License (Berkeley Database License)", + "Sun Industry Standards Source License (SISSL)": "Sun Industry Standards Source License 1.2", + "Sun Public License": "Sun Public License 1.0", + "Sybase Open Watcom Public License": "Sybase Open Watcom Public License v1.0", + "University of Illinois/NCSA Open Source License": "University of Illinois/NCSA Open Source License", + "Vovida Software License 1.0": "Vovida Software License v. 1.0", + "W3C License": "W3C Software Notice and License", + "Zope Public License": "Zope Public License 2.1", + "wxWindows Library Licence": "wxWindows Library Licence 3.1", + "X.Net License": "X.Net Inc. 
License", + "zlib/libpng License": "Libpng License", } SFNET_NAMES = SFNET_LICENSES.keys() diff --git a/minecode/mappings/sfnet_programming_languages.py b/minecode/mappings/sfnet_programming_languages.py index 3a646004..ed036b02 100644 --- a/minecode/mappings/sfnet_programming_languages.py +++ b/minecode/mappings/sfnet_programming_languages.py @@ -8,102 +8,100 @@ # -""" -Structure: {'sf.net': 'dje'} -""" +"""Structure: {'sf.net': 'dje'}""" SFNET_PROGRAMMING_LANGUAGES = { - 'ALGOL 68': 'Algol', - 'APL': 'APL', - 'ASP': 'ASP', - 'ASP.NET': 'ASP', - 'AWK': 'Awk', - 'ActionScript': 'ActionScript', - 'Ada': 'Ada', - 'AppleScript': 'AppleScript', - 'AspectJ': 'AspectJ', - 'Assembly': 'Assembly', - 'AutoIt': 'AutoIt', - 'BASIC': 'Visual Basic', - 'BlitzMax': 'BlitzMax', - 'Boo': 'Boo', - 'C': 'C', - 'C#': 'C#', - 'C++': 'C++', - 'COBOL': 'COBOL', - 'Clarion': 'Clarion', - 'Cold Fusion': 'ColdFusion', - 'Common Lisp': 'Common Lisp', - 'Curl': 'Curl', - 'D': 'D', - 'Delphi/Kylix': 'Delphi/Object Pascal', - 'Dylan': 'Dylan', - 'Eiffel': 'Eiffel', - 'Emacs-Lisp': 'Emacs Lisp', - 'Erlang': 'Erlang', - 'Euler': 'Euler', - 'Euphoria': 'Euphoria', - 'Flex': 'Flex', - 'Forth': 'Forth', - 'Fortran': 'Fortran', - 'Free Pascal': 'Pascal', - 'GLSL (OpenGL Shading Language)': 'GLSL (OpenGL Shading Language)', - 'Groovy': 'Groovy', - 'Haskell': 'Haskell', - 'IDL': 'IDL', - 'JSP': 'Java', - 'Java': 'Java', - 'JavaScript': 'JavaScript', - 'Kaya': 'Kaya', - 'LPC': 'LPC', - 'LabVIEW': 'LabVIEW', - 'Lazarus': 'Pascal', - 'Lisp': 'Lisp', - 'Logo': 'Logo', - 'LotusScript': 'LotusScript', - 'Lua': 'Lua', - 'MATLAB': 'MATLAB', - 'MUMPS': 'MUMPS', - 'Mathematica': 'Mathematica', - 'Modula': 'Modula', - 'OCaml (Objective Caml)': 'OCaml', - 'Oberon': 'Oberon', - 'Object Pascal': 'Delphi/Object Pascal', - 'Objective C': 'Objective-C', - 'Objective-C 2.0': 'Objective-C', - 'Oz': 'Oz', - 'PHP': 'PHP', - 'PL/SQL': 'PL/SQL', - 'PROGRESS': 'Progress 4GL', - 'Pascal': 'Pascal', - 'Perl': 'Perl', - 'Pike': 'Pike', - 'Prolog': 'Prolog', - 'Python': 'Python', - 'REALbasic': 'REALBasic', - 'REBOL': 'REBOL', - 'Rexx': 'REXX', - 'Ruby': 'Ruby', - 'S/R': 'SR', - 'Scala': 'Scala', - 'Scheme': 'Scheme', - 'Scilab': 'Scilab', - 'Scriptol': 'Scriptol', - 'Simulink': 'Simulink', - 'Smalltalk': 'Smalltalk', - 'Standard ML': 'Standard ML', - 'Tcl': 'Tcl', - 'Transcript/Revolution': 'Revolution', - 'Unix Shell': 'Shell', - 'VBScript': 'VBScript', - 'VHDL/Verilog': 'Verilog', - 'Visual Basic': '(Visual) Basic', - 'Visual Basic .NET': 'Visual Basic .NET', - 'Visual Basic for Applications (VBA)': '(Visual) Basic', - 'Visual FoxPro': '(Visual) FoxPro', - 'XBase/Clipper': 'Clipper', - 'XBasic': 'XBasic', - 'XSL (XSLT/XPath/XSL-FO)': 'XSLT', - 'Yacc': 'yacc', - 'haXe': 'haXe', + "ALGOL 68": "Algol", + "APL": "APL", + "ASP": "ASP", + "ASP.NET": "ASP", + "AWK": "Awk", + "ActionScript": "ActionScript", + "Ada": "Ada", + "AppleScript": "AppleScript", + "AspectJ": "AspectJ", + "Assembly": "Assembly", + "AutoIt": "AutoIt", + "BASIC": "Visual Basic", + "BlitzMax": "BlitzMax", + "Boo": "Boo", + "C": "C", + "C#": "C#", + "C++": "C++", + "COBOL": "COBOL", + "Clarion": "Clarion", + "Cold Fusion": "ColdFusion", + "Common Lisp": "Common Lisp", + "Curl": "Curl", + "D": "D", + "Delphi/Kylix": "Delphi/Object Pascal", + "Dylan": "Dylan", + "Eiffel": "Eiffel", + "Emacs-Lisp": "Emacs Lisp", + "Erlang": "Erlang", + "Euler": "Euler", + "Euphoria": "Euphoria", + "Flex": "Flex", + "Forth": "Forth", + "Fortran": "Fortran", + "Free Pascal": "Pascal", + "GLSL (OpenGL Shading 
Language)": "GLSL (OpenGL Shading Language)", + "Groovy": "Groovy", + "Haskell": "Haskell", + "IDL": "IDL", + "JSP": "Java", + "Java": "Java", + "JavaScript": "JavaScript", + "Kaya": "Kaya", + "LPC": "LPC", + "LabVIEW": "LabVIEW", + "Lazarus": "Pascal", + "Lisp": "Lisp", + "Logo": "Logo", + "LotusScript": "LotusScript", + "Lua": "Lua", + "MATLAB": "MATLAB", + "MUMPS": "MUMPS", + "Mathematica": "Mathematica", + "Modula": "Modula", + "OCaml (Objective Caml)": "OCaml", + "Oberon": "Oberon", + "Object Pascal": "Delphi/Object Pascal", + "Objective C": "Objective-C", + "Objective-C 2.0": "Objective-C", + "Oz": "Oz", + "PHP": "PHP", + "PL/SQL": "PL/SQL", + "PROGRESS": "Progress 4GL", + "Pascal": "Pascal", + "Perl": "Perl", + "Pike": "Pike", + "Prolog": "Prolog", + "Python": "Python", + "REALbasic": "REALBasic", + "REBOL": "REBOL", + "Rexx": "REXX", + "Ruby": "Ruby", + "S/R": "SR", + "Scala": "Scala", + "Scheme": "Scheme", + "Scilab": "Scilab", + "Scriptol": "Scriptol", + "Simulink": "Simulink", + "Smalltalk": "Smalltalk", + "Standard ML": "Standard ML", + "Tcl": "Tcl", + "Transcript/Revolution": "Revolution", + "Unix Shell": "Shell", + "VBScript": "VBScript", + "VHDL/Verilog": "Verilog", + "Visual Basic": "(Visual) Basic", + "Visual Basic .NET": "Visual Basic .NET", + "Visual Basic for Applications (VBA)": "(Visual) Basic", + "Visual FoxPro": "(Visual) FoxPro", + "XBase/Clipper": "Clipper", + "XBasic": "XBasic", + "XSL (XSLT/XPath/XSL-FO)": "XSLT", + "Yacc": "yacc", + "haXe": "haXe", } diff --git a/minecode/miners/__init__.py b/minecode/miners/__init__.py index f353e8c5..5ff27551 100644 --- a/minecode/miners/__init__.py +++ b/minecode/miners/__init__.py @@ -7,9 +7,9 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from functools import total_ordering import json import pkgutil +from functools import total_ordering from minecode.utils import fetch_http from minecode.utils import get_temp_file @@ -17,35 +17,48 @@ # FIXME: use attr or use a plain ResourceURI object insteaad @total_ordering -class URI(object): +class URI: """ Describe a URI to visit as returned by Visitors subclasses or visit functions. This mostly mirrors the ResourceURI models as a plain Python object. """ + __slots__ = ( - 'uri', - 'source_uri', - 'package_url', - 'file_name', - 'size', - 'date', - 'md5', - 'sha1', - 'sha256', - 'priority', - 'data', - 'visited', - 'mining_level', - 'visit_error' + "uri", + "source_uri", + "package_url", + "file_name", + "size", + "date", + "md5", + "sha1", + "sha256", + "priority", + "data", + "visited", + "mining_level", + "visit_error", ) - def __init__(self, - uri, source_uri=None, package_url=None, - file_name=None, size=None, date=None, md5=None, sha1=None, sha256=None, - priority=0, - data=None, visited=False, mining_level=0, visit_error=None, **kwargs - ): + def __init__( + self, + uri, + source_uri=None, + package_url=None, + file_name=None, + size=None, + date=None, + md5=None, + sha1=None, + sha256=None, + priority=0, + data=None, + visited=False, + mining_level=0, + visit_error=None, + **kwargs, + ): """ Construct a new URI. A URI represents an address and extra information about this address at some point in time. 
`uri` is a mandatory URI @@ -91,7 +104,7 @@ def to_dict(self, data_is_json=False): ordered_dict = dict() for k in self.__slots__: value = getattr(self, k) - if value and data_is_json and k == 'data': + if value and data_is_json and k == "data": value = json.loads(value) ordered_dict[k] = value return ordered_dict @@ -103,19 +116,21 @@ def __eq__(self, other): return isinstance(other, URI) and self.to_dict() == other.to_dict() def __lt__(self, other): - return (isinstance(other, URI) - and self.to_dict().items() < other.to_dict().items()) + return ( + isinstance(other, URI) and self.to_dict().items() < other.to_dict().items() + ) def __repr__(self): - args = [key + '=%(' + key + ')r' for key in self.__slots__ - if getattr(self, key, None)] - return ('URI(' + ', '.join(args) + ')') % self.to_dict() + args = [ + key + "=%(" + key + ")r" + for key in self.__slots__ + if getattr(self, key, None) + ] + return ("URI(" + ", ".join(args) + ")") % self.to_dict() @classmethod def from_db(cls, resource_uri): - """ - Build a new URI from a ResourceURI model object. - """ + """Build a new URI from a ResourceURI model object.""" kwargs = {} for key in cls.__slots__: value = getattr(resource_uri, key, None) @@ -125,11 +140,12 @@ def from_db(cls, resource_uri): return URI(**kwargs) -class Visitor(object): +class Visitor: """ Abstract base class for visitors. Subclasses must implement the fetch() and get_uris() methods and use a routing decorator for the URIs they can handle. """ + save_data = True def __call__(self, uri): @@ -150,9 +166,7 @@ def __call__(self, uri): return uris_to_visit, self.dumps(content_object), None def fetch(self, uri): - """ - Fetch and return the content content found at a remote URI. - """ + """Fetch and return the content found at a remote URI.""" raise NotImplementedError def get_uris(self, content): @@ -213,17 +227,14 @@ def fetch(self, uri, timeout=10): `timeout` is a default timeout. """ - content = super(NonPersistentHttpVisitor, - self).fetch(uri, timeout=timeout) - temp_file = get_temp_file('NonPersistentHttpVisitor') - with open(temp_file, 'wb') as tmp: + content = super(NonPersistentHttpVisitor, self).fetch(uri, timeout=timeout) + temp_file = get_temp_file("NonPersistentHttpVisitor") + with open(temp_file, "wb") as tmp: tmp.write(content) return temp_file def dumps(self, content): - """ - Return nothing. The content should not be saved. - """ + """Return nothing. The content should not be saved.""" return None @@ -241,12 +252,13 @@ def loads(self, content): return json.loads(content) -class Mapper(object): +class Mapper: """ Abstract base class for mappers. Subclasses must implement the get_packages() method and use a routing decorator for the URIs they can handle. """ + def __call__(self, uri, resource_uri): # Note: we let exceptions bubble up and they will be caught and # processed by the worker loop @@ -265,5 +277,5 @@ def get_packages(self, uri, resource_uri): imported, all submodules will be imported: this triggers the actual registration of miners. This should stay as the last import in this init module. """ -for _, name, _ in pkgutil.walk_packages(__path__, prefix=__name__ + '.'): +for _, name, _ in pkgutil.walk_packages(__path__, prefix=__name__ + "."): __import__(name) diff --git a/minecode/miners/apache.py b/minecode/miners/apache.py index e12619ec..1c56ccca 100644 --- a/minecode/miners/apache.py +++ b/minecode/miners/apache.py @@ -2,26 +2,25 @@ # Copyright (c) 2016 by nexB, Inc. http://www.nexb.com/ - All rights reserved. 
# -from itertools import chain import json import logging +from itertools import chain +import packagedcode.models as scan_models from commoncode import fileutils from packageurl import PackageURL -import packagedcode.models as scan_models from minecode import ls -from minecode import seed from minecode import map_router +from minecode import seed from minecode import visit_router -from minecode.miners import Mapper -from minecode.miners import HttpVisitor +from minecode.miners import URI from minecode.miners import HttpJsonVisitor +from minecode.miners import HttpVisitor +from minecode.miners import Mapper from minecode.miners import NonPersistentHttpVisitor -from minecode.miners import URI from minecode.utils import parse_date - logger = logging.getLogger(__name__) handler = logging.StreamHandler() logger.addHandler(handler) @@ -78,76 +77,84 @@ class ApacheSeed(seed.Seeder): - def get_seeds(self): # note: this is the same as below and does not list archived files # https://archive.apache.org/dist/zzz/find-ls.gz # to get these we need to rsync or use other techniques - yield 'https://apache.org/dist/zzz/find-ls.gz' + yield "https://apache.org/dist/zzz/find-ls.gz" # FIXME: we cannot relate this to a download package: disabled for now # yield 'https://projects.apache.org/json/foundation/projects.json' # yield 'https://projects.apache.org/json/foundation/podlings.json' -CHECKSUM_EXTS = '.sha256', '.sha512', '.md5', '.sha', '.sha1', +CHECKSUM_EXTS = ( + ".sha256", + ".sha512", + ".md5", + ".sha", + ".sha1", +) # only keep downloads with certain extensions for some archives, packages and checksums ARCHIVE_EXTS = ( # archives - '.jar', '.zip', '.tar.gz', '.tgz', '.tar.bz2', '.war', '.tar.xz', '.tgz', '.tar', + ".jar", + ".zip", + ".tar.gz", + ".tgz", + ".tar.bz2", + ".war", + ".tar.xz", + ".tgz", + ".tar", # packages # '.deb', '.rpm', '.msi', '.exe', - '.whl', '.gem', '.nupkg', + ".whl", + ".gem", + ".nupkg", # '.dmg', # '.nbm', ) IGNORED_PATH_CONTAINS = ( - 'META/', # # + "META/", # # # doc - '/documentation/', - '/doc/', # # - '-doc.', # # - '-doc-', # # - - '/docs/', # # - '-docs.', # # - '-docs-', # # - - 'javadoc', # # - 'fulldoc', # # - 'apidoc', # # - '-manual.', - '-asdocs.', # # - + "/documentation/", + "/doc/", # # + "-doc.", # # + "-doc-", # # + "/docs/", # # + "-docs.", # # + "-docs-", # # + "javadoc", # # + "fulldoc", # # + "apidoc", # # + "-manual.", + "-asdocs.", # # # eclipse p2/update sites are redundant # redundant - 'updatesite/', # # - 'eclipse-update-site', # # - 'update/eclipse', # # - 'sling/eclipse', # # - 'eclipse.site-', - + "updatesite/", # # + "eclipse-update-site", # # + "update/eclipse", # # + "sling/eclipse", # # + "eclipse.site-", # large multi-origin binary distributions - '-distro.', - '-bin-withdeps.', - '-bin-with-deps', - + "-distro.", + "-bin-withdeps.", + "-bin-with-deps", # these are larger distributions with third-parties - 'apache-airavata-distribution', - 'apache-airavata-server', - 'apache-mahout-distribution', - '/syncope-standalone-', - - 'binaries/conda', - + "apache-airavata-distribution", + "apache-airavata-server", + "apache-mahout-distribution", + "/syncope-standalone-", + "binaries/conda", # obscure - 'perl/contrib', + "perl/contrib", # index data - 'zzz', + "zzz", # doc - 'ant/manual' + "ant/manual", ) @@ -158,37 +165,41 @@ def get_seeds(self): SOURCE_INDICATORS = ( - '_src.', - '-src.', - '-source.', - '-sources.', - '-source-release', - '/source/', - '/sources/', - '/src/', - '_sources.', + "_src.", + "-src.", + "-source.", + "-sources.", + 
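+    # Illustrative note: a download path such as
+    # "groovy/2.4.15/sources/apache-groovy-src-2.4.15.zip" would be treated
+    # as a source archive because it contains the "/sources/" indicator.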
"-source-release", + "/source/", + "/sources/", + "/src/", + "_sources.", ) -BINARY_INDICATORS = ( -) +BINARY_INDICATORS = () -@visit_router.route('https?://apache.org/dist/zzz/find\-ls\.gz') +@visit_router.route(r"https?://apache.org/dist/zzz/find\-ls\.gz") class ApacheDistIndexVisitor(NonPersistentHttpVisitor): """ Collect URIs for all packages in the "find -ls" index available from Apache dist sites. """ + def get_uris(self, content): import gzip - with gzip.open(content, 'rt') as f: + + with gzip.open(content, "rt") as f: content = f.read() - url_template = 'https://apache.org/dist/{path}' + url_template = "https://apache.org/dist/{path}" - archive_checksum_extensions = tuple(chain.from_iterable( - [[ae + cke for ae in ARCHIVE_EXTS] for cke in CHECKSUM_EXTS])) + archive_checksum_extensions = tuple( + chain.from_iterable( + [[ae + cke for ae in ARCHIVE_EXTS] for cke in CHECKSUM_EXTS] + ) + ) kept_extensions = archive_checksum_extensions + ARCHIVE_EXTS for entry in ls.parse_directory_listing(content, from_find=True): @@ -198,8 +209,9 @@ def get_uris(self, content): path = entry.path # ignore several downloads - if (not path.endswith(kept_extensions) - or any(i in path for i in IGNORED_PATH_CONTAINS)): + if not path.endswith(kept_extensions) or any( + i in path for i in IGNORED_PATH_CONTAINS + ): continue # only checksums need further visit, the archive will be scanned only is_visited = not path.endswith(CHECKSUM_EXTS) @@ -209,7 +221,7 @@ def get_uris(self, content): source_uri=self.uri, uri=url_template.format(path=path), package_url=build_purl(path), - size=entry.size + size=entry.size, ) @@ -221,30 +233,32 @@ def build_purl(uri): """ # FIXME: this is the essence of collecting name and versions for Apache and # this need to be super robust - segments = [p for p in uri.split('/') if p] + segments = [p for p in uri.split("/") if p] version = None project_name = segments[0] # The path typically contains the version but where is highly inconsistent # - bahir/bahir-spark/2.1.1/apache-bahir-2.1.1-src.zip # - groovy/2.4.15/sources/apache-groovy-src-2.4.15.zip # FIXME: this is not correct - if len(segments) > 1 and ('/distribution/' in uri or '/sources/' in uri): + if len(segments) > 1 and ("/distribution/" in uri or "/sources/" in uri): version = segments[1] package_url = PackageURL( - type='apache', + type="apache", # TODO: namespace='', name=project_name, - version=version) + version=version, + ) return package_url -@visit_router.route('https?://(archive\.)apache.org/dist/.*\.(md5|sha1?|sha256|sha512)',) +@visit_router.route( + r"https?://(archive\.)apache.org/dist/.*\.(md5|sha1?|sha256|sha512)", +) class ApacheChecksumVisitor(HttpVisitor): - """ - Collect files that contain archive checksums. 
-    """
+    """Collect files that contain archive checksums."""
+
     def dumps(self, content):
         if content:
             # the format can be md5sum-like this way:
@@ -254,7 +268,7 @@ def dumps(self, content):
             if content:
                 content = content[0]
             else:
-                content = ''
+                content = ""
         return content
@@ -291,14 +305,16 @@ class ApacheProjectsJsonVisitor(HttpJsonVisitor):
         "shortdesc": "An open source Atom implementation"
       },
     """
+
     def get_uris(self, content):
-        url_template = 'https://projects.apache.org/json/projects/{name}.json'
+        url_template = "https://projects.apache.org/json/projects/{name}.json"
         for project_name, project_meta in content.items():
-            package_url = PackageURL(type='apache', name=project_name)
+            package_url = PackageURL(type="apache", name=project_name)
             yield URI(
                 uri=url_template.format(name=project_name),
                 package_url=package_url.to_string(),
-                date=project_meta.get('created'))
+                date=project_meta.get("created"),
+            )
 
 
 # FIXME: we cannot relate this to a download package: disabled for now
@@ -309,6 +325,7 @@ class ApacheSingleProjectJsonVisitor(HttpJsonVisitor):
     return any URI as the json contains the project metadata only, so this
     visitor is getting the json to pass to the mapper.
     """
+
     pass
@@ -328,44 +345,44 @@ class ApachePodlingsJsonVisitor(HttpJsonVisitor):
         "started": "2016-03"
       },
     """
+
     def get_uris(self, content):
         for project_name, project_meta in content.items():
-            if 'homepage' not in project_meta:
+            if "homepage" not in project_meta:
                 continue
 
             package_url = PackageURL(
-                type='apache',
-                namespace='incubator',
-                name=project_name)
+                type="apache", namespace="incubator", name=project_name
+            )
 
             yield URI(
-                uri=project_meta.get('homepage'),
+                uri=project_meta.get("homepage"),
                 package_url=package_url.to_string(),
                 data=project_meta,
                 source_uri=self.uri,
-                visited=True)
+                visited=True,
+            )
 
 
 # common licenses found in JSON
 APACHE_LICENSE_URL = {
-    'http://usefulinc.com/doap/licenses/asl20',
-    'https://usefulinc.com/doap/licenses/asl20',
-    'http://spdx.org/licenses/Apache-2.0',
-    'https://spdx.org/licenses/Apache-2.0',
-    'http://www.apache.org/licenses/LICENSE-2.0',
-    'https://www.apache.org/licenses/LICENSE-2.0',
-    'http://www.apache.org/licenses/LICENSE-2.0.txt',
-    'https://www.apache.org/licenses/LICENSE-2.0.txt',
-    'http://www.apache.org/licenses/',
-    'http://forrest.apache.org/license.html',
-    'https://svn.apache.org/repos/asf/tomee/tomee/trunk/LICENSE',
+    "http://usefulinc.com/doap/licenses/asl20",
+    "https://usefulinc.com/doap/licenses/asl20",
+    "http://spdx.org/licenses/Apache-2.0",
+    "https://spdx.org/licenses/Apache-2.0",
+    "http://www.apache.org/licenses/LICENSE-2.0",
+    "https://www.apache.org/licenses/LICENSE-2.0",
+    "http://www.apache.org/licenses/LICENSE-2.0.txt",
+    "https://www.apache.org/licenses/LICENSE-2.0.txt",
+    "http://www.apache.org/licenses/",
+    "http://forrest.apache.org/license.html",
+    "https://svn.apache.org/repos/asf/tomee/tomee/trunk/LICENSE",
 }
 
 
 # FIXME: this is NOT specific to a download URL but to a project: disabled for now
 # @map_router.route('https://projects.apache.org/json/foundation/projects.json')
 class ApacheProjectJsonMapper(Mapper):
-
     def get_packages(self, uri, resource_uri):
         """
         Yield Packages built from resource_uri record for a single
@@ -382,70 +399,79 @@ def build_packages_from_projects(metadata, uri=None):
    Yield as many Package as there are download URLs.
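+
+    For illustration only, a single hypothetical entry shaped like:
+
+        {"abdera": {
+            "shortdesc": "An open source Atom implementation",
+            "license": "http://usefulinc.com/doap/licenses/asl20",
+            "category": "xml"}}
+
+    would yield one Package named "abdera" with an apache-2.0 declared
+    license expression (that license URL is in APACHE_LICENSE_URL) and the
+    keyword "xml"; the field values here are made up.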
""" for project_name, project_meta in metadata.items(): - short_desc = project_meta.get('shortdesc') - long_desc = project_meta.get('description') + short_desc = project_meta.get("shortdesc") + long_desc = project_meta.get("description") descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) + description = "\n".join(descriptions) common_data = dict( datasource_id="apache_json", - type='apache', + type="apache", name=project_name, description=description, - homepage_url=project_meta.get('homepage'), - bug_tracking_url=project_meta.get('bug-database'), - primary_language=project_meta.get('programming-language'), + homepage_url=project_meta.get("homepage"), + bug_tracking_url=project_meta.get("bug-database"), + primary_language=project_meta.get("programming-language"), ) # FIXME: setting the download-page as the download_url is not right - if project_meta.get('download-page'): - download_url = project_meta.get('download-page') - common_data['download_url'] = download_url - for repo in project_meta.get('repository', []): - common_data['code_view_url'] = repo + if project_meta.get("download-page"): + download_url = project_meta.get("download-page") + common_data["download_url"] = download_url + for repo in project_meta.get("repository", []): + common_data["code_view_url"] = repo # Package code_view_url only support one URL, so break when # finding a code_view_url break - maintainers = project_meta.get('maintainer', []) + maintainers = project_meta.get("maintainer", []) for maintainer in maintainers: - mailbox = maintainer.get('mbox', '').replace('mailto:', '') - name = maintainer.get('name') - party = scan_models.Party(type=scan_models.party_person, name=name, role='maintainer', email=mailbox) - parties = common_data.get('parties') + mailbox = maintainer.get("mbox", "").replace("mailto:", "") + name = maintainer.get("name") + party = scan_models.Party( + type=scan_models.party_person, + name=name, + role="maintainer", + email=mailbox, + ) + parties = common_data.get("parties") if not parties: - common_data['parties'] = [] - common_data['parties'].append(party.to_dict()) + common_data["parties"] = [] + common_data["parties"].append(party.to_dict()) # license is just a URL in the json file, for example: # http://usefulinc.com/doap/licenses/asl20 - license_url = project_meta.get('license') - common_data['extracted_license_statement'] = license_url + license_url = project_meta.get("license") + common_data["extracted_license_statement"] = license_url if license_url in APACHE_LICENSE_URL: - common_data['declared_license_expression'] = 'apache-2.0' - common_data['declared_license_expression_spdx'] = 'Apache-2.0' - common_data['license_detections'] = [] + common_data["declared_license_expression"] = "apache-2.0" + common_data["declared_license_expression_spdx"] = "Apache-2.0" + common_data["license_detections"] = [] keywords = [] - category = project_meta.get('category', '') - for kw in category.split(','): + category = project_meta.get("category", "") + for kw in category.split(","): kw = kw.strip() if kw: keywords.append(kw) - common_data['keywords'] = keywords + common_data["keywords"] = keywords - common_data['primary_language'] = project_meta.get('programming-language') + common_data["primary_language"] = project_meta.get("programming-language") # FIXME: these cannot be related to actual packages with a download URL - releases = project_meta.get('release') + releases = project_meta.get("release") if releases: for release in releases: rdata = 
dict(common_data)
-                rdata['version'] = release.get('revision')
-                if release.get('created') and len(release.get('created')) == 10:
-                    rdata['release_date'] = parse_date(release.get('created'))
+                rdata["version"] = release.get("revision")
+                if release.get("created") and len(release.get("created")) == 10:
+                    rdata["release_date"] = parse_date(release.get("created"))
                 else:
-                    logger.warn('Unexpected date format for release date: {}'.format(release.get('created')))
+                    logger.warning(
+                        "Unexpected date format for release date: {}".format(
+                            release.get("created")
+                        )
+                    )
                 package = scan_models.Package.from_package_data(
                     package_data=rdata,
                     datafile_path=uri,
@@ -453,9 +479,9 @@ def build_packages_from_projects(metadata, uri=None):
                 yield package
         else:
             package = scan_models.Package.from_package_data(
-                    package_data=common_data,
-                    datafile_path=uri,
-                )
+                package_data=common_data,
+                datafile_path=uri,
+            )
             yield package
@@ -463,7 +489,6 @@ def build_packages_from_projects(metadata, uri=None):
 # FIXME: this is casting too wide a net!
 # @map_router.route('http?://[\w\-\.]+.incubator.apache.org/"')
 class ApachePodlingsMapper(Mapper):
-
     def get_packages(self, uri, resource_uri):
         """
         Yield Packages built from resource_uri record for a single
@@ -479,36 +504,33 @@ def build_packages_from_podlings(metadata, purl):
     which is a dictionary keyed by project name and values are project_metadata.
     Yield as many Package as there are download URLs.
     """
-    name = metadata.get('name')
+    name = metadata.get("name")
     if name:
         common_data = dict(
-            type='apache-podling',
+            type="apache-podling",
             name=name,
-            description=metadata.get('description'),
-            homepage_url=metadata.get('homepage'),
+            description=metadata.get("description"),
+            homepage_url=metadata.get("homepage"),
         )
         package = scan_models.Package(**common_data)
         package.set_purl(purl)
         yield package
 
 
-@map_router.route('http?s://(archive\.)?apache\.org/dist/.*')
+@map_router.route(r"http?s://(archive\.)?apache\.org/dist/.*")
 class ApacheDownloadMapper(Mapper):
-
     def get_packages(self, uri, resource_uri):
-        """
-        Yield Packages build from a bare download URI or download checksum URI.
-        """
+        """Yield Packages built from a bare download URI or download checksum URI."""
         if uri.endswith(CHECKSUM_EXTS):
             # 1. create a regular package from the URL stripped from its checksum extension
-            archive_uri, _, checksum_type = uri.rpartition('.')
+            archive_uri, _, checksum_type = uri.rpartition(".")
             pack = build_package_from_download(archive_uri, resource_uri.package_url)
             # 2. collect the checksum inside the file
             # and attach it to the package
             checksum_value = resource_uri.data.strip()
             if checksum_value:
-                checksum_field_name = 'download_{checksum_type}'.format(**locals())
+                checksum_field_name = "download_{checksum_type}".format(**locals())
                 setattr(pack, checksum_field_name, checksum_value)
             yield pack
         else:
@@ -531,7 +553,7 @@ def build_package_from_download(uri, purl=None):
         name = purl.name
     # FIXME: use purl data??
     package = scan_models.Package(
-        type='apache',
+        type="apache",
         namespace=purl.namespace,
         name=name,
         version=version,
@@ -543,17 +565,15 @@ def build_package_from_download(uri, purl=None):
 
 
 # FIXME: there should be only one such method and this one is rather weak
 def get_name_version(uri):
-    """
-    Return name and version extracted from a path.
- """ + """Return name and version extracted from a path.""" # base_url will end being 'https://archive.apache.org/dist' or 'https://apache.org/dist' # path is the uri without base url, for example: # /groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip - _, _, path = uri.partition('apache.org/dist/') + _, _, path = uri.partition("apache.org/dist/") base_name = fileutils.file_base_name(path) version = None - package_name = '' - name_segments = base_name.split('-') + package_name = "" + name_segments = base_name.split("-") for segment in name_segments: try: # To test if each split segment with . is integer. @@ -563,10 +583,10 @@ def get_name_version(uri): # The segment after integer segment should belong to version too. # For example: turbine-4.0-M1, after detecting 4.0, # M1 should be including in version too, so the final version is 4.0-M1 - version = '-'.join([version, segment]) + version = "-".join([version, segment]) continue - is_all_int = all(n.isdigit() for n in segment.split('.')) + is_all_int = all(n.isdigit() for n in segment.split(".")) if is_all_int: version = segment except ValueError: @@ -575,6 +595,6 @@ def get_name_version(uri): if not package_name: package_name = segment else: - package_name = ('-').join([package_name, segment]) + package_name = ("-").join([package_name, segment]) continue return package_name, version diff --git a/minecode/miners/bitbucket.py b/minecode/miners/bitbucket.py index 8e7ae845..e7b95356 100644 --- a/minecode/miners/bitbucket.py +++ b/minecode/miners/bitbucket.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. # @@ -9,13 +8,12 @@ from packagedcode import models as scan_models from packageurl import PackageURL -from minecode import seed from minecode import map_router +from minecode import seed from minecode import visit_router -from minecode.miners import Mapper -from minecode.miners import HttpJsonVisitor from minecode.miners import URI - +from minecode.miners import HttpJsonVisitor +from minecode.miners import Mapper logger = logging.getLogger(__name__) handler = logging.StreamHandler() @@ -68,52 +66,56 @@ class BitbucketSeed(seed.Seeder): - def get_seeds(self): - yield 'https://api.bitbucket.org/2.0/repositories?pagelen=400' + yield "https://api.bitbucket.org/2.0/repositories?pagelen=400" # TODO: review mapper -@visit_router.route('https://api\.bitbucket\.org/2\.0/repositories\?pagelen=.*',) +@visit_router.route( + r"https://api\.bitbucket\.org/2\.0/repositories\?pagelen=.*", +) class BitbucketIndexVisitor(HttpJsonVisitor): """ Collect repository data through paginated API calls. The index contains repo-level data for every repo. """ + def get_uris(self, content): - next_page = content.get('next') + next_page = content.get("next") if next_page: yield URI(uri=next_page, source_uri=self.uri) -@visit_router.route('https://api\.bitbucket\.org/2\.0/repositories/[^\/]*/[^\/\?]*/?') +@visit_router.route(r"https://api\.bitbucket\.org/2\.0/repositories/[^\/]*/[^\/\?]*/?") class BitbucketSingleRepoVisitor(HttpJsonVisitor): """ Collect data for a single repository. Note: this is strictly equivalent to one item of the index paginated calls. 
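+    For example, a single-repo API URL would look like this (namespace/name
+    pair borrowed from the doctests below, for illustration only):
+    https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse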
""" + def get_uris(self, content): return get_repo_uris(content, source_uri=self.uri) -@visit_router.route('https://api.bitbucket.org/2.0/repositories/[^\/]*/[^\/]*/(refs/tags|downloads).*') +@visit_router.route( + r"https://api.bitbucket.org/2.0/repositories/[^\/]*/[^\/]*/(refs/tags|downloads).*" +) class BitbucketDetailsVisitorPaginated(HttpJsonVisitor): - """ - Collect repository details for data that are paginated. - """ + """Collect repository details for data that are paginated.""" + def get_uris(self, content): - next_page = content.get('next') + next_page = content.get("next") if next_page: purl = get_purl(self.uri) yield URI(uri=next_page, source_uri=self.uri, package_url=purl) @visit_router.route( - 'https://api\.bitbucket\.org/2\.0/repositories/[^\/]*/[^\/]*/(watchers|forks|commits).*') + r"https://api\.bitbucket\.org/2\.0/repositories/[^\/]*/[^\/]*/(watchers|forks|commits).*" +) class BitbucketDetailsVisitor(HttpJsonVisitor): - """ - Collect repository details for data that are not paginated. - """ + """Collect repository details for data that are not paginated.""" + pass @@ -130,25 +132,25 @@ def get_repo_ns_name(url_like): >>> get_repo_ns_name('/bastiand/mercurialeclipse/src') ('bastiand', 'mercurialeclipse') """ - if url_like.startswith('https://api.bitbucket.org'): - head, _, path = url_like.partition('2.0/repositories') + if url_like.startswith("https://api.bitbucket.org"): + head, _, path = url_like.partition("2.0/repositories") if head: - segments = [p for p in path.split('/') if p] + segments = [p for p in path.split("/") if p] if len(segments) >= 2: ns = segments[0] name = segments[1] return ns, name - if url_like.startswith('https://bitbucket.org/'): - head, _, path = url_like.partition('bitbucket.org/') + if url_like.startswith("https://bitbucket.org/"): + head, _, path = url_like.partition("bitbucket.org/") if head: - segments = [p for p in path.split('/') if p] + segments = [p for p in path.split("/") if p] if len(segments) >= 2: ns = segments[0] name = segments[1] return ns, name - segments = [p for p in url_like.strip('/').split('/') if p] + segments = [p for p in url_like.strip("/").split("/") if p] if len(segments) >= 2: ns = segments[0] name = segments[1] @@ -156,65 +158,58 @@ def get_repo_ns_name(url_like): def get_purl(url_like): - """ - Return a Package URL string created from a bitbucket url or url-like. - """ + """Return a Package URL string created from a bitbucket url or url-like.""" ns_name = get_repo_ns_name(url_like) if not ns_name: return ns, name = ns_name - return PackageURL(type='bitbucket', namespace=ns, name=name).to_string() + return PackageURL(type="bitbucket", namespace=ns, name=name).to_string() def get_repo_uris(repo_data, source_uri): - """ - Yield URIs from a single repository `repo_data` data. - """ - full_name = repo_data.get('full_name', '').strip() + """Yield URIs from a single repository `repo_data` data.""" + full_name = repo_data.get("full_name", "").strip() package_url = get_purl(full_name) - links = repo_data.get('links', {}) - repo_uri = links.get('html', {}).get('href') + links = repo_data.get("links", {}) + repo_uri = links.get("html", {}).get("href") if not repo_uri: - repo_uri = 'https://bitbucket.org/{full_name}'.format(full_name=full_name) + repo_uri = f"https://bitbucket.org/{full_name}" # Yield URI for latest commits, tags and downloads as candidate packages. 
-    commits_url = links.get('commits', {}).get('href')
+    commits_url = links.get("commits", {}).get("href")
     # we only care about the latest commit
-    commits_url += '?pagelen=1'
+    commits_url += "?pagelen=1"
     yield URI(uri=commits_url, package_url=package_url, source_uri=source_uri)
 
     # for counts only: these should go to the package template
-    for link in ('forks', 'watchers'):
-        url = links.get(link, {}).get('href')
+    for link in ("forks", "watchers"):
+        url = links.get(link, {}).get("href")
         if url:
             # we get a single field and only one page
-            url += '?pagelen=1&fields=size'
+            url += "?pagelen=1&fields=size"
             yield URI(uri=url, package_url=package_url, source_uri=source_uri)
 
-    for link in ('refs/tags', 'downloads'):
-        url = links.get(link, {}).get('href')
+    for link in ("refs/tags", "downloads"):
+        url = links.get(link, {}).get("href")
         if url:
             # paginated, we want them all
-            url += '?pagelen=100'
+            url += "?pagelen=100"
             yield URI(uri=url, package_url=package_url, source_uri=source_uri)
 
 
 @map_router.route(
-    'https://api.bitbucket\.org/2\.0/repositories/.*/downloads/',
+    r"https://api.bitbucket\.org/2\.0/repositories/.*/downloads/",
 )
 class BitbucketDownloadMapper(Mapper):
-    """
-    Build package from download urls if present.
-    """
+    """Build package from download urls if present."""
 
     def get_packages(self, uri, resource_uri):
-        """
-        Yield Package built from resource_uri record for a single package version.
-        """
+        """Yield Package built from resource_uri record for a single package version."""
         downloads_data = json.loads(resource_uri.data)
-        for download_data in downloads_data.get('values', []):
+        for download_data in downloads_data.get("values", []):
             for package in build_bitbucket_download_packages(
-                    download_data, resource_uri.package_url):
+                download_data, resource_uri.package_url
+            ):
                 yield package
 
 
@@ -228,14 +223,14 @@ def build_bitbucket_download_packages(download_data, purl):
         name = purl.name
 
     # FIXME: add these ?
-    filename = download_data.get('name')
-    download_counts = download_data.get('downloads', 0)
+    filename = download_data.get("name")
+    download_counts = download_data.get("downloads", 0)
 
-    download_url = download_data.get('links', {}).get('self', {}).get('href')
-    size = download_data.get('size')
+    download_url = download_data.get("links", {}).get("self", {}).get("href")
+    size = download_data.get("size")
 
     package = scan_models.Package(
-        type='bitbucket',
+        type="bitbucket",
         name=name,
         namespace=namespace,
         download_url=download_url,
@@ -247,9 +242,8 @@ def build_bitbucket_download_packages(download_data, purl):
 # @map_router.route('https://api.bitbucket.org/2.0/repositories/[^\/]*/[^\/]*')
 class BitbucketIndexMapper(Mapper):
-    """
-    Build a Package for a repo.
-    """
+    """Build a Package for a repo."""
+
     def get_packages(self, uri, resource_uri):
         repo = json.loads(resource_uri.data)
         if not repo:
@@ -260,9 +254,8 @@ def get_packages(self, uri, resource_uri):
 # FIXME: disabled as this is for a package template
 # @map_router.route('https://api.bitbucket.org/2.0/repositories/[^\/]*/[^\/]*')
 class BitbucketRepoMapper(Mapper):
-    """
-    Build a Package for a repo.
-    """
+    """Build a Package for a repo."""
+
     def get_packages(self, uri, resource_uri):
         repo = json.loads(resource_uri.data)
         if not repo:
@@ -276,21 +269,23 @@ def build_bitbucket_repo_package(repo_data, purl):
    Notes: this is not version-specific and has no download URL.
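+    For example, the purl string for a made-up repo would render as:
+    pkg:bitbucket/bastiand/mercurialeclipse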
""" purl = PackageURL.from_string(purl) - scm_protocol = repo_data.get('scm') + scm_protocol = repo_data.get("scm") if not scm_protocol: - scm_protocol = 'git' - bb_url = '{protocol}+https://bitbucket.org/{namespace}/{name}'.format(protocol=scm_protocol, **purl.to_dict()) + scm_protocol = "git" + bb_url = "{protocol}+https://bitbucket.org/{namespace}/{name}".format( + protocol=scm_protocol, **purl.to_dict() + ) - owner = repo_data.get('owner') + owner = repo_data.get("owner") owner_party = scan_models.Party( type=scan_models.party_person, - name=owner.get('username'), - role='owner', - url=owner.get('links', {}).get('html', {}).get('href', {}) + name=owner.get("username"), + role="owner", + url=owner.get("links", {}).get("html", {}).get("href", {}), ) - if repo_data.get('has_issues'): - bug_tracking_url = bb_url + '/issues' + if repo_data.get("has_issues"): + bug_tracking_url = bb_url + "/issues" else: bug_tracking_url = None @@ -298,12 +293,12 @@ def build_bitbucket_repo_package(repo_data, purl): type=purl.type, namespace=purl.namespace, name=purl.name, - homepage_url=repo_data.get('website') or bb_url, - code_view_url=bb_url + '/src', + homepage_url=repo_data.get("website") or bb_url, + code_view_url=bb_url + "/src", bug_tracking_url=bug_tracking_url, - description=repo_data.get('description'), + description=repo_data.get("description"), vcs_url=bb_url, - primary_language=repo_data.get('language'), + primary_language=repo_data.get("language"), parties=[owner_party], ) package.set_purl(purl) diff --git a/minecode/miners/bower.py b/minecode/miners/bower.py index 811f6aee..7195294d 100644 --- a/minecode/miners/bower.py +++ b/minecode/miners/bower.py @@ -13,25 +13,22 @@ from packagedcode.models import DependentPackage from packageurl import PackageURL -from minecode import seed from minecode import map_router +from minecode import seed from minecode import visit_router -from minecode.miners import Mapper -from minecode.miners import HttpJsonVisitor from minecode.miners import URI +from minecode.miners import HttpJsonVisitor +from minecode.miners import Mapper class BowerSeed(seed.Seeder): - def get_seeds(self): - yield 'https://registry.bower.io/packages' + yield "https://registry.bower.io/packages" -@visit_router.route('https://registry.bower.io/packages') +@visit_router.route("https://registry.bower.io/packages") class BowerTopJsonVisitor(HttpJsonVisitor): - """ - Collect URIs for all packages from the json returned. - """ + """Collect URIs for all packages from the json returned.""" def get_uris(self, content): """ @@ -49,44 +46,56 @@ def get_uris(self, content): The url could be in the following formats like github, loglg, gitcafe, bitbuckets etc. # FIXME: We should cover all urls beyond the above four categories. 
""" - github_base_url = 'https://raw.githubusercontent.com/{owner}/{name}/master/bower.json' - lolg_base_url = 'https://lolg.it/{owner}/{name}/raw/master/bower.json' - gitcafe_base_url = 'https://coding.net/u/{owner}/p/{name}/git/raw/master/bower.json' - bitbucket_base_url = 'https://bitbucket.org/{owner}/{name}/raw/master/bower.json' + github_base_url = ( + "https://raw.githubusercontent.com/{owner}/{name}/master/bower.json" + ) + lolg_base_url = "https://lolg.it/{owner}/{name}/raw/master/bower.json" + gitcafe_base_url = ( + "https://coding.net/u/{owner}/p/{name}/git/raw/master/bower.json" + ) + bitbucket_base_url = ( + "https://bitbucket.org/{owner}/{name}/raw/master/bower.json" + ) base_url_map = { - 'https://github.com/': github_base_url, - 'https://lolg.it/': lolg_base_url, - 'https://gitcafe.com/': gitcafe_base_url, - 'https://bitbucket.org/': bitbucket_base_url + "https://github.com/": github_base_url, + "https://lolg.it/": lolg_base_url, + "https://gitcafe.com/": gitcafe_base_url, + "https://bitbucket.org/": bitbucket_base_url, } for entry in content: - name = entry.get('name') - url = entry.get('url') + name = entry.get("name") + url = entry.get("url") if name in url: owner = None - package_url = PackageURL(type='bower', name=name).to_string() + package_url = PackageURL(type="bower", name=name).to_string() for host_name, base_url in base_url_map.iteritems(): if url.startswith(host_name): - owner = url[len(host_name): url.index(name) - 1] - yield URI(uri=base_url.format(owner=owner, name=name), package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://raw.githubusercontent.com/.*/master/bower.json', - 'https://lolg.it/.*/master/bower.json', - 'https://coding.net/.*/master/bower.json', - 'https://bitbucket.org/*/master/bower.json') + owner = url[len(host_name) : url.index(name) - 1] + yield URI( + uri=base_url.format(owner=owner, name=name), + package_url=package_url, + source_uri=self.uri, + ) + + +@visit_router.route( + "https://raw.githubusercontent.com/.*/master/bower.json", + "https://lolg.it/.*/master/bower.json", + "https://coding.net/.*/master/bower.json", + "https://bitbucket.org/*/master/bower.json", +) class BowerJsonVisitor(HttpJsonVisitor): - """ - Collect content of the json itself by the visitor. 
- """ + """Collect content of the json itself by the visitor.""" + pass -@map_router.route('https://raw.githubusercontent.com/.*/master/bower.json', - 'https://lolg.it/.*/master/bower.json', - 'https://coding.net/.*/master/bower.json') +@map_router.route( + "https://raw.githubusercontent.com/.*/master/bower.json", + "https://lolg.it/.*/master/bower.json", + "https://coding.net/.*/master/bower.json", +) class BowerJsonMapper(Mapper): - def get_packages(self, uri, resource_uri): """ Yield Package built from resource_uri record for a single @@ -95,16 +104,15 @@ def get_packages(self, uri, resource_uri): """ metadata = resource_uri.data build_packages_from_jsonfile( - metadata, resource_uri.uri, resource_uri.package_url) + metadata, resource_uri.uri, resource_uri.package_url + ) def build_packages_from_jsonfile(metadata, uri=None, purl=None): - """ - Yield Package built from Bower json content - """ + """Yield Package built from Bower json content""" content = json.loads(metadata) - licenses_content = content.get('licenses') + licenses_content = content.get("licenses") extracted_license_statement = set([]) if licenses_content: if isinstance(licenses_content, list): @@ -113,25 +121,27 @@ def build_packages_from_jsonfile(metadata, uri=None, purl=None): else: extracted_license_statement.add(licenses_content) - keywords_content = content.get('keywords', []) - name = content.get('name') + keywords_content = content.get("keywords", []) + name = content.get("name") - devdependencies = content.get('devDependencies') + devdependencies = content.get("devDependencies") dev_dependencies = [] if devdependencies: for key, value in devdependencies.items(): dev_dependencies.append( DependentPackage( - purl=key, extracted_requirement=value, scope='devdependency').to_dict() + purl=key, extracted_requirement=value, scope="devdependency" + ).to_dict() ) - dependencies = content.get('dependencies') + dependencies = content.get("dependencies") dependencies_build = [] if dependencies: for key, value in dependencies.items(): dependencies_build.append( DependentPackage( - purl=key, extracted_requirement=value, scope='runtime').to_dict() + purl=key, extracted_requirement=value, scope="runtime" + ).to_dict() ) if name: @@ -139,40 +149,49 @@ def build_packages_from_jsonfile(metadata, uri=None, purl=None): if vcs_tool and vcs_repo: # Form the vsc_url by # https://spdx.org/spdx-specification-21-web-version#h.49x2ik5 - vcs_repo = vcs_tool + '+' + vcs_repo + vcs_repo = vcs_tool + "+" + vcs_repo common_data = dict( - type='bower', + type="bower", name=name, - description=content.get('description'), - version=content.get('version'), + description=content.get("description"), + version=content.get("version"), vcs_url=vcs_repo, keywords=keywords_content, - homepage_url=content.get('homepage'), - datasource_id='bower_json', + homepage_url=content.get("homepage"), + datasource_id="bower_json", license_detections=[], ) if extracted_license_statement: - common_data['extracted_license_statement'] = list( - extracted_license_statement) + common_data["extracted_license_statement"] = list( + extracted_license_statement + ) - author_content = content.get('author') + author_content = content.get("author") if author_content: - parties = common_data.get('parties') + parties = common_data.get("parties") if not parties: - common_data['parties'] = [] - common_data['parties'].append(scan_models.Party( - name=author_content, role='author',).to_dict()) + common_data["parties"] = [] + common_data["parties"].append( + scan_models.Party( + 
name=author_content,
+                    role="author",
+                ).to_dict()
+            )
         else:
-            parties = common_data.get('parties')
+            parties = common_data.get("parties")
             if not parties:
-                common_data['parties'] = []
-            author_content = content.get('authors', [])
+                common_data["parties"] = []
+            author_content = content.get("authors", [])
             for author in author_content:
-                author_split = author.split(':')
+                author_split = author.split(":")
                 if len(author_split) > 1:
-                    common_data['parties'].append(scan_models.Party(
-                        name=author_split[1].strip(), role='author',).to_dict())
+                    common_data["parties"].append(
+                        scan_models.Party(
+                            name=author_split[1].strip(),
+                            role="author",
+                        ).to_dict()
+                    )
 
         dependencies = []
         if dependencies_build:
@@ -180,7 +199,7 @@ def build_packages_from_jsonfile(metadata, uri=None, purl=None):
         if dev_dependencies:
             dependencies.extend(dev_dependencies)
         if len(dependencies) > 0:
-            common_data['dependencies'] = dependencies
+            common_data["dependencies"] = dependencies
         package = scan_models.Package.from_package_data(
             package_data=common_data,
             datafile_path=uri,
@@ -190,10 +209,8 @@ def build_packages_from_jsonfile(metadata, uri=None, purl=None):
 
 
 def get_vcs_repo(content):
-    """
-    Return the repo type and url.
-    """
-    repo = content.get('repository', {})
+    """Return the repo type and url."""
+    repo = content.get("repository", {})
     if repo:
-        return repo.get('type'), repo.get('url')
+        return repo.get("type"), repo.get("url")
     return None, None
diff --git a/minecode/miners/cpan.py b/minecode/miners/cpan.py
index b6421562..ec4f941c 100644
--- a/minecode/miners/cpan.py
+++ b/minecode/miners/cpan.py
@@ -9,29 +9,31 @@
 
 import json
 
-from bs4 import BeautifulSoup
-from packageurl import PackageURL
 import packagedcode.models as scan_models
 import saneyaml
+from bs4 import BeautifulSoup
+from packageurl import PackageURL
 
-from minecode import seed
 from minecode import map_router
+from minecode import seed
 from minecode import visit_router
-from minecode.miners import Mapper
+from minecode.miners import URI
 from minecode.miners import HttpJsonVisitor
 from minecode.miners import HttpVisitor
-from minecode.miners import URI
+from minecode.miners import Mapper
 from minecode.utils import parse_date
 
 
 class CpanSeed(seed.Seeder):
-
     def get_seeds(self):
-        yield 'http://www.cpan.org/modules/01modules.index.html'
-        author_search_template = 'https://fastapi.metacpan.org/author/_search?q=email:{char}*&size=5000'
-        for char in 'abcdefghijklmnopqrstuvwxyz'.split():
+        yield "http://www.cpan.org/modules/01modules.index.html"
+        author_search_template = (
+            "https://fastapi.metacpan.org/author/_search?q=email:{char}*&size=5000"
+        )
+        # iterate over single letters; str.split() would yield the whole
+        # alphabet as one string, and the {char} field must be passed by name
+        for char in "abcdefghijklmnopqrstuvwxyz":
            yield author_search_template.format(char=char)
 
+
 # The idea of CPAN API visitor is based on
 # https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md
 #
@@ -58,7 +60,9 @@ def get_seeds(self):
 # https://fastapi.metacpan.org/release/_search?q=author:ABERNDT&size=5000
 
 
-@visit_router.route('https://fastapi.metacpan.org/author/_search\?q=email:[a-z]\*&size=5000')
+@visit_router.route(
+    r"https://fastapi.metacpan.org/author/_search\?q=email:[a-z]\*&size=5000"
+)
 class MetaCpanAuthorURLVisitors(HttpJsonVisitor):
     """
     Run search on author's email, and parse the returned json content and form
@@ -70,17 +74,21 @@ class MetaCpanAuthorURLVisitors(HttpJsonVisitor):
     """
 
     def get_uris(self, content):
-        release_visitor_template = 'https://fastapi.metacpan.org/release/_search?q=author:{id}&size=5000'
release_visitor_template = ( + "https://fastapi.metacpan.org/release/_search?q=author:{id}&size=5000" + ) + hits = content.get("hits", {}) + inner_hits = hits.get("hits", []) for hit in inner_hits: - _id = hit.get('_id') + _id = hit.get("_id") if not _id: continue yield URI(uri=release_visitor_template.format(id=_id), source_uri=self.uri) -@visit_router.route('https://fastapi.metacpan.org/release/_search\?q=author:\w+&size=5000') +@visit_router.route( + r"https://fastapi.metacpan.org/release/_search\?q=author:\w+&size=5000" +) class MetaCpanRleaseURLVisitors(HttpJsonVisitor): """ Run the release results by searching the passing AUTHOR ID. The visitor will @@ -88,31 +96,31 @@ class MetaCpanRleaseURLVisitors(HttpJsonVisitor): implementation if the class is empty, it just returns for mapper use of the json content. """ + pass -@visit_router.route('http://www.cpan.org/modules/01modules.index.html') +@visit_router.route("http://www.cpan.org/modules/01modules.index.html") class CpanModulesVisitors(HttpVisitor): - """ - Return URIs by parsing the HTML page of cpan modules page. - """ + """Return URIs by parsing the HTML page of cpan modules page.""" + def get_uris(self, content): """ Return the uris of authors pages, the returning URIs will be an input of CpanProjectHTMLVisitors """ - page = BeautifulSoup(content, 'lxml') - url_template = 'http://www.cpan.org/{path}' - for a in page.find_all(name='a'): - if 'href' not in a.attrs: + page = BeautifulSoup(content, "lxml") + url_template = "http://www.cpan.org/{path}" + for a in page.find_all(name="a"): + if "href" not in a.attrs: continue - url = a['href'] + url = a["href"] if not url: continue - if url.startswith('../authors'): - if url.endswith(('.zip', '.tar.gz')): + if url.startswith("../authors"): + if url.endswith((".zip", ".tar.gz")): # Skip tar.gz since it will be captured by the CpanProjectHTMLVisitors continue else: @@ -120,77 +128,81 @@ def get_uris(self, content): yield URI(uri=url, source_uri=self.uri) -@visit_router.route('http://www.cpan.org/authors/.*/') +@visit_router.route("http://www.cpan.org/authors/.*/") class CpanProjectHTMLVisitors(HttpVisitor): """ Visit the HTML page of cpan project page and return the Packages info, HTML data and error. """ + def get_uris(self, content): """ Return the uris by looking for the tar.gz in the html, and then forming the uri for meta and readme files """ - page = BeautifulSoup(content, 'lxml') - if self.uri.endswith('/'): - url_template = self.uri + '{path}' + page = BeautifulSoup(content, "lxml") + if self.uri.endswith("/"): + url_template = self.uri + "{path}" else: - url_template = self.uri + '/{path}' - for a in page.find_all(name='a'): - if 'href' not in a.attrs: + url_template = self.uri + "/{path}" + for a in page.find_all(name="a"): + if "href" not in a.attrs: continue - url = a['href'] + url = a["href"] if not url: continue - if url.startswith(('/', '?')): + if url.startswith(("/", "?")): continue # Avoid the directory and other non-file links else: name = url - name = name.replace('tar.gz', ''). 
replace('.readme', '').replace('.meta', '') - partions = name.rpartition('-') + name = ( + name.replace("tar.gz", "") + .replace(".readme", "") + .replace(".meta", "") + ) + partions = name.rpartition("-") name = partions[0] version = partions[-1] package_url = None if name and version: - package_url = PackageURL(type='cpan', name=name, version=version).to_string() + package_url = PackageURL( + type="cpan", name=name, version=version + ).to_string() url = url_template.format(path=url) yield URI(uri=url, package_url=package_url, source_uri=self.uri) -@visit_router.route('http://www.cpan.org/.*.meta') +@visit_router.route("http://www.cpan.org/.*.meta") class CpanMetaVisitors(HttpVisitor): """ Visit the meta file and return the meta data of the Package The goal of this visitor is to get the content instead of returning any valid uris. """ + pass -@visit_router.route('http://www.cpan.org/.*.readme') +@visit_router.route("http://www.cpan.org/.*.readme") class CpanReadmeVisitors(HttpVisitor): - """ - Visit the readme file and translate to json and dump it and return for mapper use. - """ + """Visit the readme file and translate to json and dump it and return for mapper use.""" def dumps(self, content): - """ - Return the json by parsing the readme content - """ + """Return the json by parsing the readme content""" # Handle bytes properly in python3 if type(content) == bytes: - content = content.decode('utf-8') + content = content.decode("utf-8") lines = content.splitlines() readme_dict = dict() body = [] head = None for line in lines: - if len(line) > 1 and line.isupper() and line[0] != ' ': + if len(line) > 1 and line.isupper() and line[0] != " ": if head: - readme_dict[head] = '\n'.join(body).lstrip('\n').rstrip('\n') + readme_dict[head] = "\n".join(body).lstrip("\n").rstrip("\n") head = line body = [] else: @@ -198,16 +210,16 @@ def dumps(self, content): return json.dumps(readme_dict) -@map_router.route('https://fastapi.metacpan.org/release/_search\?q=author:\w+&size=5000') +@map_router.route( + r"https://fastapi.metacpan.org/release/_search\?q=author:\w+&size=5000" +) class MetaCpanReleaseSearchMapper(Mapper): - def get_packages(self, uri, resource_uri): - """ - Yield packages by parsing the json returned from release search request. 
- """ + """Yield packages by parsing the json returned from release search request.""" metadata = resource_uri.data build_packages_from_release_json( - metadata, resource_uri.uri, resource_uri.package_url) + metadata, resource_uri.uri, resource_uri.package_url + ) def build_packages_from_release_json(metadata, uri=None): @@ -217,31 +229,32 @@ def build_packages_from_release_json(metadata, uri=None): uri: the uri of the ResourceURI object """ content = json.loads(metadata) - hits = content.get('hits', {}) - inner_hits = hits.get('hits', []) + hits = content.get("hits", {}) + inner_hits = hits.get("hits", []) for hit in inner_hits: - release = hit.get('_source', {}) + release = hit.get("_source", {}) if not release: continue - name = release.get('name') + name = release.get("name") if not name: continue extracted_license_statement = [ - l for l in release.get('license', []) if l and l.strip()] + l for l in release.get("license", []) if l and l.strip() + ] common_data = dict( datasource_id="cpan_release_json", - type='cpan', + type="cpan", name=name, - description=release.get('abstract'), - version=release.get('version'), - download_url=release.get('download_url'), + description=release.get("abstract"), + version=release.get("version"), + download_url=release.get("download_url"), extracted_license_statement=extracted_license_statement, license_detections=[], # the date format passing is like: # "2014-04-20T21:30:13" - release_date=parse_date(release.get('date')), + release_date=parse_date(release.get("date")), ) # Get the homepage_url, declared_license and vcs_repository/vcs_tool under resources section. @@ -258,64 +271,61 @@ def build_packages_from_release_json(metadata, uri=None): # "url" : "git://github.com/plack/Plack.git" # } # }, - resources = release.get('resources') or {} + resources = release.get("resources") or {} - common_data['homepage_url'] = resources.get('homepage') + common_data["homepage_url"] = resources.get("homepage") # Usually the license in root node contains the license name # like perl_5. The license here under resources section is the # url of license for example: http://dev.perl.org/licenses/ So # it's useful to collect both information... 
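+        # For illustration: a hypothetical release with license ["perl_5"]
+        # at the top level and resources["license"] set to
+        # ["http://dev.perl.org/licenses/"] ends up with both values in
+        # extracted_license_statement.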
- license_url = [l for l in resources.get( - 'license', []) if l and l.strip()] + license_url = [l for l in resources.get("license", []) if l and l.strip()] if license_url: - common_data['extracted_license_statement'].extend(license_url) + common_data["extracted_license_statement"].extend(license_url) vcs_tool, vcs_repo = get_vcs_repo1(resources) if vcs_tool and vcs_repo: # Form the vsc_url by # https://spdx.org/spdx-specification-21-web-version#h.49x2ik5 - vcs_repo = vcs_tool + '+' + vcs_repo - common_data['vcs_url'] = vcs_repo + vcs_repo = vcs_tool + "+" + vcs_repo + common_data["vcs_url"] = vcs_repo - bugtracker_section = resources.get('bugtracker', {}) - common_data['bug_tracking_url'] = bugtracker_section.get('web') + bugtracker_section = resources.get("bugtracker", {}) + common_data["bug_tracking_url"] = bugtracker_section.get("web") - if release.get('author'): + if release.get("author"): party = scan_models.Party( - type=scan_models.party_person, - name=release.get('author'), role='author') - common_data['parties'] = common_data.get('parties', []) - common_data['parties'].append(party.to_dict()) + type=scan_models.party_person, name=release.get("author"), role="author" + ) + common_data["parties"] = common_data.get("parties", []) + common_data["parties"].append(party.to_dict()) package = scan_models.Package.from_package_data( package_data=common_data, datafile_path=uri, ) - package_url = PackageURL(type='cpan', name=release.get( - 'name'), version=release.get('version')) + package_url = PackageURL( + type="cpan", name=release.get("name"), version=release.get("version") + ) package.set_purl(package_url.to_string()) yield package def get_vcs_repo1(content): - """ - Return the repo type and url. - """ + """Return the repo type and url.""" repo_type = None repo_url = None - repo = content.get('repository', {}) + repo = content.get("repository", {}) if repo: - url = repo.get('url') + url = repo.get("url") if url: repo_url = url - if '.git' in url: - repo_type = 'git' + if ".git" in url: + repo_type = "git" return repo_type, repo_url -@map_router.route('http://www.cpan.org/.*.meta') +@map_router.route("http://www.cpan.org/.*.meta") class CpanMetaFileMapper(Mapper): - def get_packages(self, uri, resource_uri): """ Yield Package built from resource_uri record for a single @@ -324,7 +334,8 @@ def get_packages(self, uri, resource_uri): """ metadata = resource_uri.data build_packages_from_metafile( - metadata, resource_uri.uri, resource_uri.package_url) + metadata, resource_uri.uri, resource_uri.package_url + ) def build_packages_from_metafile(metadata, uri=None, purl=None): @@ -341,7 +352,7 @@ def build_packages_from_metafile(metadata, uri=None, purl=None): else: content = saneyaml.load(metadata) - licenses_content = content.get('license') + licenses_content = content.get("license") extracted_license_statement = [] if licenses_content: if isinstance(licenses_content, (list,)): @@ -350,45 +361,45 @@ def build_packages_from_metafile(metadata, uri=None, purl=None): else: extracted_license_statement.append(licenses_content) - keywords_content = content.get('keywords', []) + keywords_content = content.get("keywords", []) - download_url = uri.replace('.meta', '.tar.gz') if uri else None + download_url = uri.replace(".meta", ".tar.gz") if uri else None - name = content.get('name') + name = content.get("name") if name: vcs_tool, vcs_repo = get_vcs_repo(content) if vcs_tool and vcs_repo: # Form the vsc_url by # https://spdx.org/spdx-specification-21-web-version#h.49x2ik5 - vcs_repo = vcs_tool + '+' + 
vcs_repo + vcs_repo = vcs_tool + "+" + vcs_repo common_data = dict( datasource_id="cpan_meta_json", - type='cpan', + type="cpan", name=name, - description=content.get('abstract', name), - version=content.get('version'), + description=content.get("abstract", name), + version=content.get("version"), download_url=download_url, extracted_license_statement=extracted_license_statement, vcs_url=vcs_repo, keywords=keywords_content, ) - parties = common_data['parties'] = [] + parties = common_data["parties"] = [] - for author_content in content.get('author', []): + for author_content in content.get("author", []): # The author format is like: Abigail - if '<' in author_content: - author_name, _, author_email = author_content.partition('<') - author_email = author_email.strip('>') + if "<" in author_content: + author_name, _, author_email = author_content.partition("<") + author_email = author_email.strip(">") else: author_name = author_content - author_email = '' + author_email = "" party = scan_models.Party( - role='author', + role="author", type=scan_models.party_person, name=author_name.rstrip(), - email=author_email + email=author_email, ) parties.append(party.to_dict()) @@ -399,15 +410,13 @@ def build_packages_from_metafile(metadata, uri=None, purl=None): def get_vcs_repo(content): - """ - Return the repo type and url. - """ - repo = content.get('resources', {}).get('repository') + """Return the repo type and url.""" + repo = content.get("resources", {}).get("repository") if repo: if isinstance(repo, dict): - repo = repo.get('url', '') - if repo.startswith('git:'): - return 'git', repo + repo = repo.get("url", "") + if repo.startswith("git:"): + return "git", repo return None, None @@ -419,9 +428,8 @@ def is_json(json_content): return True -@map_router.route('http://www.cpan.org/.*.readme') +@map_router.route("http://www.cpan.org/.*.readme") class CpanReadmeFileMapper(Mapper): - def get_packages(self, uri, resource_uri): """ Yield Package built from resource_uri record for a single @@ -430,7 +438,8 @@ def get_packages(self, uri, resource_uri): """ metadata = resource_uri.data build_packages_from_metafile( - metadata, resource_uri.uri, resource_uri.package_url) + metadata, resource_uri.uri, resource_uri.package_url + ) def build_packages_from_readmefile(metadata, uri=None, purl=None): @@ -441,41 +450,45 @@ def build_packages_from_readmefile(metadata, uri=None, purl=None): purl: String value of the package url of the ResourceURI object """ content = json.loads(metadata) - name = content.get('NAME') + name = content.get("NAME") if name: - download_url = uri.replace('.meta', '.tar.gz') if uri else None + download_url = uri.replace(".meta", ".tar.gz") if uri else None vcs_tool, vcs_repo = get_vcs_repo_fromstring(content) if vcs_tool and vcs_repo: # Form the vsc_url by # https://spdx.org/spdx-specification-21-web-version#h.49x2ik5 - vcs_repo = vcs_tool + '+' + vcs_repo - copyr = content.get('COPYRIGHT and LICENSE') + vcs_repo = vcs_tool + "+" + vcs_repo + copyr = content.get("COPYRIGHT and LICENSE") common_data = dict( datasource_id="cpan_readme", - type='cpan', + type="cpan", name=name, - description=content.get('ABSTRACT', name), + description=content.get("ABSTRACT", name), download_url=download_url, vcs_url=vcs_repo, copyright=copyr, - version=content.get('VERSION') + version=content.get("VERSION"), ) - authors = content.get('AUTHOR', []) + authors = content.get("AUTHOR", []) for author_content in authors: - author_split = author_content.split('<') + author_split = author_content.split("<") if 
len(author_split) > 1:
-                party = scan_models.Party(type=scan_models.party_person, name=author_split[0].rstrip(
-                ), role='author', email=author_split[1].replace('>', ''))
+                party = scan_models.Party(
+                    type=scan_models.party_person,
+                    name=author_split[0].rstrip(),
+                    role="author",
+                    email=author_split[1].replace(">", ""),
+                )
-                parties = common_data.get('parties')
+                parties = common_data.get("parties")
                 if not parties:
-                    common_data['parties'] = []
-                common_data['parties'].append(party)
+                    common_data["parties"] = []
+                common_data["parties"].append(party)
 
         keywords_content = []
-        if content.get('KEYWORDS'):
-            keywords_content = [content.get('KEYWORDS')]
-        common_data['keywords'] = keywords_content
+        if content.get("KEYWORDS"):
+            keywords_content = [content.get("KEYWORDS")]
+        common_data["keywords"] = keywords_content
 
         package = scan_models.PackageData.from_data(package_data=common_data)
         package.set_purl(purl)
@@ -483,11 +496,9 @@ def build_packages_from_readmefile(metadata, uri=None, purl=None):
 
 
 def get_vcs_repo_fromstring(content):
-    """
-    Return the repo type and url.
-    """
-    repo = content.get('DEVELOPMENT')
-    if repo and repo.index('<') < repo.index('>') and 'git:' in repo:
-        return 'git', repo[repo.index('<') + 1: repo.index('>')]
+    """Return the repo type and url."""
+    repo = content.get("DEVELOPMENT")
+    # check membership first: str.index() raises ValueError when the
+    # "<...>" markers are absent from the readme text
+    if repo and "git:" in repo and "<" in repo and ">" in repo and repo.index("<") < repo.index(">"):
+        return "git", repo[repo.index("<") + 1 : repo.index(">")]
     else:
         return None, None
diff --git a/minecode/miners/cran.py b/minecode/miners/cran.py
index 1ae46db8..5465c37d 100644
--- a/minecode/miners/cran.py
+++ b/minecode/miners/cran.py
@@ -7,54 +7,55 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
 
+import packagedcode.models as scan_models
 from bs4 import BeautifulSoup
 from packageurl import PackageURL
-import packagedcode.models as scan_models
 
 from minecode import map_router
 from minecode import seed
 from minecode import visit_router
-from minecode.miners import Mapper
-from minecode.miners import HttpVisitor
 from minecode.miners import URI
+from minecode.miners import HttpVisitor
+from minecode.miners import Mapper
 from minecode.utils import parse_date
-
-CRAN_URL = 'https://cloud.r-project.org/'
-CRAN_WEB_URL = CRAN_URL + 'web/'
+CRAN_URL = "https://cloud.r-project.org/"
+CRAN_WEB_URL = CRAN_URL + "web/"
 
 
 class CranSeed(seed.Seeder):
-
     def get_seeds(self):
-        yield 'https://cloud.r-project.org/web/packages/available_packages_by_date.html'
+        yield "https://cloud.r-project.org/web/packages/available_packages_by_date.html"
 
 
-@visit_router.route('https://cloud.r-project.org/web/packages/available_packages_by_date.html')
+@visit_router.route(
+    "https://cloud.r-project.org/web/packages/available_packages_by_date.html"
+)
 class CranPackagesVisitors(HttpVisitor):
-    """
-    Return URIs by parsing the HTML content of the page
-    """
+    """Return URIs by parsing the HTML content of the page"""
+
     def get_uris(self, content):
-        base_url = 'https://cloud.r-project.org/web/packages/{package}/index.html'
-        a_blocks = BeautifulSoup(content, 'lxml').find_all('a')
+        base_url = "https://cloud.r-project.org/web/packages/{package}/index.html"
+        a_blocks = BeautifulSoup(content, "lxml").find_all("a")
         for a in a_blocks:
             package = a.text
-            package_url = PackageURL(type='cran', name=package).to_string()
-            yield URI(uri=base_url.format(package=package), package_url=package_url, source_uri=self.uri)
+            package_url = PackageURL(type="cran", name=package).to_string()
+            yield URI(
+                uri=base_url.format(package=package),
package_url=package_url, + source_uri=self.uri, + ) -@visit_router.route('https://cloud.r-project.org/web/packages/[\w\-\.]/index.html') +@visit_router.route(r"https://cloud.r-project.org/web/packages/[\w\-\.]/index.html") class CranSinglePackageVisitor(HttpVisitor): - """ - Return only the HTML content of the page, and will be parsed in mapper - """ + """Return only the HTML content of the page, and will be parsed in mapper""" + pass -@map_router.route('https://cloud.r-project.org/web/packages/[\w\-\.]/index.html') +@map_router.route(r"https://cloud.r-project.org/web/packages/[\w\-\.]/index.html") class CranMetaFileMapper(Mapper): - def get_packages(self, uri, resource_uri): """ Yield Package built from resource_uri record for a single @@ -62,18 +63,15 @@ def get_packages(self, uri, resource_uri): Yield as many Package as there are download URLs. """ metadata = resource_uri.data - build_packages_from_html( - metadata, resource_uri.uri, resource_uri.package_url) + build_packages_from_html(metadata, resource_uri.uri, resource_uri.package_url) def get_download_url(url): - return url.replace('../../../', CRAN_URL) + return url.replace("../../../", CRAN_URL) def get_dependencies(depends): - """ - Return a dictionary of dependencies keyed by dep_group. - """ + """Return a dictionary of dependencies keyed by dep_group.""" dep_pkgs = [] if not depends: return dep_pkgs @@ -86,12 +84,10 @@ def get_dependencies(depends): def comma_separated(text): - """ - Return a list of strings from a comma-separated text. - """ + """Return a list of strings from a comma-separated text.""" if not text: return [] - return [t.strip() for t in text.split(',') if t and t.strip()] + return [t.strip() for t in text.split(",") if t and t.strip()] def build_packages_from_html(metadata, uri=None, purl=None): @@ -104,78 +100,84 @@ def build_packages_from_html(metadata, uri=None, purl=None): # Parse the name from the url, for example: https://cloud.r-project.org/web/packages/ANN2/index.html common_data = dict( datasource_id="cran_metadata", - type='cran', - name=uri.rpartition('/')[0].rpartition('/')[-1] + type="cran", + name=uri.rpartition("/")[0].rpartition("/")[-1], ) extracted_license_statement = [] download_urls = [] - soup = BeautifulSoup(metadata, 'lxml') - first_pblock = soup.find('p') + soup = BeautifulSoup(metadata, "lxml") + first_pblock = soup.find("p") if first_pblock: - common_data['description'] = first_pblock.string + common_data["description"] = first_pblock.string else: - h2_block = soup.find('h2') + h2_block = soup.find("h2") if h2_block: - common_data['description'] = h2_block.string + common_data["description"] = h2_block.string - tables = soup.find_all('table') + tables = soup.find_all("table") for table in tables: - rows = table.find_all('tr') + rows = table.find_all("tr") for row in rows: col_values = [] - cols = row.find_all('td') + cols = row.find_all("td") for ele in cols: - if ele.find_all('a'): - col_values.append([a['href'].strip() - for a in ele.find_all('a')]) + if ele.find_all("a"): + col_values.append([a["href"].strip() for a in ele.find_all("a")]) col_values.append(ele.text.strip()) if len(cols) >= 2: key = col_values[0] value = col_values[1] - if key == 'Version:': - common_data['version'] = value - elif key == 'URL:': + if key == "Version:": + common_data["version"] = value + elif key == "URL:": if type(value) == list and len(value) > 0: homepages = [] for home_page in value: homepages.append(home_page) - common_data['homepage_url'] = '\n'.join(homepages) + common_data["homepage_url"] = 
"\n".join(homepages) else: - common_data['homepage_url'] = value - elif key == 'License:': + common_data["homepage_url"] = value + elif key == "License:": for license_url in value: extracted_license_statement.append(license_url) - elif key == 'Author:': - parties = common_data.get('parties') + elif key == "Author:": + parties = common_data.get("parties") if not parties: - common_data['parties'] = [] + common_data["parties"] = [] party = scan_models.Party( - type=scan_models.party_person, name=value, role='author') - common_data['parties'].append(party.to_dict()) - elif key == 'Maintainer:': - maintainer_split = value.split('<') + type=scan_models.party_person, name=value, role="author" + ) + common_data["parties"].append(party.to_dict()) + elif key == "Maintainer:": + maintainer_split = value.split("<") if len(maintainer_split) > 1: - parties = common_data.get('parties') + parties = common_data.get("parties") if not parties: - common_data['parties'] = [] - party = scan_models.Party(type=scan_models.party_person, name=maintainer_split[0].rstrip( - ), role='maintainer', email=maintainer_split[1].replace('>', '').replace(' at ', '@')) - common_data['parties'].append(party.to_dict()) - elif 'source' in key or 'binaries' in key: + common_data["parties"] = [] + party = scan_models.Party( + type=scan_models.party_person, + name=maintainer_split[0].rstrip(), + role="maintainer", + email=maintainer_split[1] + .replace(">", "") + .replace(" at ", "@"), + ) + common_data["parties"].append(party.to_dict()) + elif "source" in key or "binaries" in key: if type(value) == list: for url in value: download_urls.append(get_download_url(url)) - elif key == 'Published:': - common_data['release_date'] = parse_date(value) - elif key == 'Imports:': + elif key == "Published:": + common_data["release_date"] = parse_date(value) + elif key == "Imports:": # use the text instead of a href since the text is more accurate if len(col_values) == 3: value = col_values[2] - common_data['dependencies'] = get_dependencies(value) + common_data["dependencies"] = get_dependencies(value) if extracted_license_statement: - common_data['extracted_license_statement'] = extracted_license_statement - common_data['license_detections'] = [] + common_data["extracted_license_statement"] = extracted_license_statement + common_data["license_detections"] = [] if download_urls: # for else statement will have else running always if there is no break statement for download_url in download_urls: diff --git a/minecode/miners/debian.py b/minecode/miners/debian.py index ae2975c9..0eeed102 100644 --- a/minecode/miners/debian.py +++ b/minecode/miners/debian.py @@ -7,31 +7,30 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# -from collections import defaultdict -import attr import gzip import json import logging +from collections import defaultdict -from commoncode import fileutils +import attr import debian_inspector -from debian_inspector import debcon +from commoncode import fileutils from debian_inspector import copyright as debcopy +from debian_inspector import debcon from packagedcode import models as scan_models from packageurl import PackageURL from minecode import debutils from minecode import ls -from minecode import seed from minecode import map_router +from minecode import seed from minecode import visit_router +from minecode.miners import URI from minecode.miners import HttpVisitor from minecode.miners import Mapper from minecode.miners import NonPersistentHttpVisitor -from minecode.miners import URI from minecode.utils import form_vcs_url - logger = logging.getLogger(__name__) handler = logging.StreamHandler() logger.addHandler(handler) @@ -60,79 +59,85 @@ class DebianSeed(seed.Seeder): - def get_seeds(self): - yield 'http://ftp.debian.org/debian/ls-lR.gz' - yield 'http://archive.ubuntu.com/ubuntu/ls-lR.gz' + yield "http://ftp.debian.org/debian/ls-lR.gz" + yield "http://archive.ubuntu.com/ubuntu/ls-lR.gz" def is_collectible(file_name): - """ - Return True if a `file_name` is collectible. - """ + """Return True if a `file_name` is collectible.""" # 'Contents-*.gz' are mapping/indexes of installed files to the actual package that provides them. # TODO: add tests! - return (file_name and ( - file_name in ('Packages.gz', 'Release', 'Sources.gz',) - or file_name.endswith(('.deb', '.dsc',)) - or (file_name.startswith('Contents-') and file_name.endswith('.gz')) - )) + return file_name and ( + file_name + in ( + "Packages.gz", + "Release", + "Sources.gz", + ) + or file_name.endswith( + ( + ".deb", + ".dsc", + ) + ) + or (file_name.startswith("Contents-") and file_name.endswith(".gz")) + ) def is_debian_url(uri): - return 'debian.org' in uri + return "debian.org" in uri def is_ubuntu_url(uri): - return 'ubuntu' in uri + return "ubuntu" in uri @visit_router.route( - 'http://ftp.debian.org/.*/ls\-lR\.gz', - 'http://.*/ubuntu/ls\-lR\.gz', + r"http://ftp.debian.org/.*/ls\-lR\.gz", + r"http://.*/ubuntu/ls\-lR\.gz", # mirrors - 'http://ftp.[a-z][a-z].debian.org/.*/ls\-lR\.gz', + r"http://ftp.[a-z][a-z].debian.org/.*/ls\-lR\.gz", ) class DebianDirectoryIndexVisitor(NonPersistentHttpVisitor): - """ - Collect package URIs from Debian-like repos with an ls-LR directory listing. 
- """ + """Collect package URIs from Debian-like repos with an ls-LR directory listing.""" def get_uris(self, content): - with gzip.open(content, 'rt') as f: + with gzip.open(content, "rt") as f: content = f.read() - url_template = self.uri.replace('ls-lR.gz', '{path}') + url_template = self.uri.replace("ls-lR.gz", "{path}") for entry in ls.parse_directory_listing(content): if entry.type != ls.FILE: continue - path = entry.path.lstrip('/') + path = entry.path.lstrip("/") file_name = fileutils.file_name(path) if not is_collectible(file_name): continue if is_debian_url(self.uri): - namespace = 'debian' + namespace = "debian" elif is_ubuntu_url(self.uri): - namespace = 'ubuntu' + namespace = "ubuntu" else: - logger.error( - 'Unknown Debian URI namespace: {}'.format(self.uri)) + logger.error(f"Unknown Debian URI namespace: {self.uri}") continue - if file_name.endswith(('.deb', '.udeb', '.tar.gz', '.tar.xz', '.tar.bz2', '.tar.lzma')): - name, version, arch = debian_inspector.package.get_nva( - file_name) + if file_name.endswith( + (".deb", ".udeb", ".tar.gz", ".tar.xz", ".tar.bz2", ".tar.lzma") + ): + name, version, arch = debian_inspector.package.get_nva(file_name) package_url = PackageURL( - type='deb', + type="deb", namespace=namespace, name=name, version=str(version), - qualifiers=dict(arch=arch) if arch else None).to_string() + qualifiers=dict(arch=arch) if arch else None, + ).to_string() else: package_url = None @@ -142,7 +147,8 @@ def get_uris(self, content): file_name=file_name, date=entry.date, size=entry.size, - source_uri=self.uri) + source_uri=self.uri, + ) def parse_release(location): @@ -179,26 +185,23 @@ def parse_release(location): def parse_copyright_only(location): - """ - Return a DebianCopyright from the Debian copyright file at `location`. - """ + """Return a DebianCopyright from the Debian copyright file at `location`.""" return debcopy.DebianCopyright.from_file(location) def parse_copyright_allinfo(location): - """ - Return a DebianCopyright from the Debian copyright file at `location`. - """ + """Return a DebianCopyright from the Debian copyright file at `location`.""" return debcopy.DebianCopyright.from_file(location) def parse_license(location): - """ - Return a list of License paragraphs from Debian copyright file at location. - """ + """Return a list of License paragraphs from Debian copyright file at location.""" copyparas = debcopy.DebianCopyright.from_file(location) - return [para for para in copyparas.paragraphs - if isinstance(para, debian_inspector.copyright.CopyrightLicenseParagraph)] + return [ + para + for para in copyparas.paragraphs + if isinstance(para, debian_inspector.copyright.CopyrightLicenseParagraph) + ] def collect_source_packages(location): @@ -228,73 +231,64 @@ def parse_packages_index(location): return debcon.get_paragraphs_data_from_file(location) -@visit_router.route('http://ftp.debian.org/debian/dists/.*/Sources.gz') +@visit_router.route("http://ftp.debian.org/debian/dists/.*/Sources.gz") class DebianSourcesVisitor(NonPersistentHttpVisitor): - """ - Collect package URIs from a Sources gz data file. 
- """ + """Collect package URIs from a Sources gz data file.""" def get_uris(self, content): - base_url = 'http://ftp.debian.org/debian' - with gzip.open(content, 'rb') as f: + base_url = "http://ftp.debian.org/debian" + with gzip.open(content, "rb") as f: text = f.read() for source in debcon.get_paragraphs_data(text): - dir_info = source.get('Directory') + dir_info = source.get("Directory") if not dir_info: continue - package = source.get('Package') - version = source.get('Version') + package = source.get("Package") + version = source.get("Version") package_url = None if package and version: package_url = PackageURL( - type='deb', namespace='debian', name=package, - version=version).to_string() + type="deb", namespace="debian", name=package, version=version + ).to_string() - dir_info = dir_info.lstrip('/') - dir_url = base_url + '/{}'.format(dir_info) + dir_info = dir_info.lstrip("/") + dir_url = base_url + f"/{dir_info}" yield URI(uri=dir_url, package_url=package_url, source_uri=self.uri) # TODO add .xz support -@visit_router.route('http://ftp.debian.org/debian/dists/.*Packages.gz') +@visit_router.route("http://ftp.debian.org/debian/dists/.*Packages.gz") class DebianPackagesVisitor(NonPersistentHttpVisitor): - """ - Collect URIs to actual .deb Packages and the content itself from a Packages gz data file. - """ + """Collect URIs to actual .deb Packages and the content itself from a Packages gz data file.""" def get_uris(self, content): - base_url = 'http://ftp.debian.org/debian' - with gzip.open(content, 'rb') as f: + base_url = "http://ftp.debian.org/debian" + with gzip.open(content, "rb") as f: text = f.read() for package in debcon.get_paragraphs_data(text): - file_info = package.get('Filename') + file_info = package.get("Filename") if not file_info: continue - package = package.get('Package') - version = package.get('Version') + package = package.get("Package") + version = package.get("Version") if package and version: package_url = PackageURL( - type='deb', - namespace='debian', - name=package, - version=version).to_string() + type="deb", namespace="debian", name=package, version=version + ).to_string() else: package_url = None # FIXME: we we do not keep the actual content... we should! - file_info = file_info.lstrip('/') + file_info = file_info.lstrip("/") dir_url = base_url + file_info - yield URI( - uri=dir_url, - package_url=package_url, - source_uri=self.uri) + yield URI(uri=dir_url, package_url=package_url, source_uri=self.uri) -@visit_router.route('http://ftp.debian.org/debian/pool/.*\.dsc') +@visit_router.route(r"http://ftp.debian.org/debian/pool/.*\.dsc") class DebianDescriptionVisitor(HttpVisitor): """ Collect package data from a .dsc Package description file. @@ -307,25 +301,22 @@ def dumps(self, content): return json.dumps(dsc.to_dict()) -@visit_router.route('http://ftp.debian.org/debian/.*/Release') +@visit_router.route("http://ftp.debian.org/debian/.*/Release") class DebianReleaseVisitor(HttpVisitor): - """ - Collect Release file content from a Release data file. - """ + """Collect Release file content from a Release data file.""" + pass -@map_router.route('http://ftp.debian.org/debian/pool/.*\.dsc') +@map_router.route(r"http://ftp.debian.org/debian/pool/.*\.dsc") class DebianDescriptionMapper(Mapper): - def get_packages(self, uri, resource_uri): - """ - Yield packages parsed from a dsc Debian control file mapping. 
- """ + """Yield packages parsed from a dsc Debian control file mapping.""" return parse_description( metadata=json.loads(resource_uri.data), purl=resource_uri.package_url, - base_download_url=None) + base_download_url=None, + ) def get_files(text): @@ -336,9 +327,9 @@ def get_files(text): if text: for line in text.splitlines(False): # we have htree space-separated items, so we perform two partitions - line = ' '.join(line.split()) - checksum, _, rest = line.partition(' ') - size, _, filename = rest.partition(' ') + line = " ".join(line.split()) + checksum, _, rest = line.partition(" ") + size, _, filename = rest.partition(" ") yield checksum, size, filename @@ -351,36 +342,35 @@ def parse_description(metadata, purl=None, base_download_url=None): """ # FIXME: this may not be correct: Source and Binary are package names common_data = dict( - name=metadata['Source'], - version=metadata['Version'], - homepage_url=metadata.get('Homepage'), - code_view_url=metadata.get('Vcs-Browser'), - parties=[] + name=metadata["Source"], + version=metadata["Version"], + homepage_url=metadata.get("Homepage"), + code_view_url=metadata.get("Vcs-Browser"), + parties=[], ) - if metadata.get('Label'): - common_data['keywords'] = [metadata.get('Label')] + if metadata.get("Label"): + common_data["keywords"] = [metadata.get("Label")] vcs_tool, vcs_repo = get_vcs_repo(metadata) if vcs_tool and vcs_repo: vcs_repo = form_vcs_url(vcs_tool, vcs_repo) - common_data['vcs_url'] = vcs_repo + common_data["vcs_url"] = vcs_repo dependencies = get_dependencies(metadata) if dependencies: - common_data['dependencies'] = dependencies + common_data["dependencies"] = dependencies # TODO: add "original maintainer" seen in Ubuntu - maintainer = metadata.get('Maintainer') + maintainer = metadata.get("Maintainer") if maintainer: name, email = debutils.parse_email(maintainer) if name: - party = scan_models.Party( - name=name, role='maintainer', email=email) - common_data['parties'].append(party) + party = scan_models.Party(name=name, role="maintainer", email=email) + common_data["parties"].append(party) @attr.s() - class File(object): + class File: name = attr.ib(default=None) size = attr.ib(default=None) md5 = attr.ib(default=None) @@ -398,24 +388,33 @@ def collect_files(existing_files, field_value, checksum_name): # TODO: what do we do with files? 
# FIXME: we should store them in the package record files = defaultdict(File) - collect_files(existing_files=files, field_value=metadata.get('Files'), checksum_name='md5') - collect_files(existing_files=files, field_value=metadata.get('Checksums-Sha1'), checksum_name='sha1') - collect_files(existing_files=files, field_value=metadata.get('Checksums-Sha256'), checksum_name='sha256') + collect_files( + existing_files=files, field_value=metadata.get("Files"), checksum_name="md5" + ) + collect_files( + existing_files=files, + field_value=metadata.get("Checksums-Sha1"), + checksum_name="sha1", + ) + collect_files( + existing_files=files, + field_value=metadata.get("Checksums-Sha256"), + checksum_name="sha256", + ) # FIXME: craft a download_url download_url = None if base_download_url: download_url = None - common_data['download_url'] = download_url + common_data["download_url"] = download_url package = scan_models.DebianPackage(**common_data) package.set_purl(purl) yield package -@map_router.route('http://ftp.debian.org/debian/dists/.*Sources.gz') +@map_router.route("http://ftp.debian.org/debian/dists/.*Sources.gz") class DebianSourceFileMapper(Mapper): - def get_packages(self, uri, resource_uri): """ Yield ScannedPackages built from resource_uri record for a single @@ -433,94 +432,89 @@ def build_source_file_packages(metadata, purl=None): purl: String value of the package url of the ResourceURI object """ for source in debcon.get_paragraphs_data(metadata): - package_name = source.get('Package') + package_name = source.get("Package") parties = [] - maintainer_names = debutils.comma_separated(source.get('Maintainer', '')) + maintainer_names = debutils.comma_separated(source.get("Maintainer", "")) if maintainer_names: for maintainer in maintainer_names: name, email = debutils.parse_email(maintainer) if name: - party = scan_models.Party( - name=name, role='maintainer', email=email) + party = scan_models.Party(name=name, role="maintainer", email=email) parties.append(party) - contributor_names = debutils.comma_separated(source.get('Uploaders', '')) + contributor_names = debutils.comma_separated(source.get("Uploaders", "")) if contributor_names: for contributor in contributor_names: name, email = debutils.parse_email(contributor) if name: party = scan_models.Party( - name=name, role='contributor', email=email) + name=name, role="contributor", email=email + ) parties.append(party) - dependencies = get_dependencies(source, ['Build-Depends']) + dependencies = get_dependencies(source, ["Build-Depends"]) keywords = set() - keywords.update(debutils.comma_separated(source.get('Binary', ''))) - if source.get('Section'): - keywords.add(source.get('Section')) + keywords.update(debutils.comma_separated(source.get("Binary", ""))) + if source.get("Section"): + keywords.add(source.get("Section")) - files = source.get('Files') + files = source.get("Files") for f in files: - name = f.get('name') + name = f.get("name") package = dict( name=package_name, - version=source.get('Version'), + version=source.get("Version"), dependencies=dependencies, parties=parties, - code_view_url=source.get('Vcs-Browser'), - homepage_url=source.get('Homepage'), + code_view_url=source.get("Vcs-Browser"), + homepage_url=source.get("Homepage"), keywords=list(keywords), ) - download_url = 'http://ftp.debian.org/debian/{path}/{name}'.format( - path=source.get('Directory'), - name=name) + download_url = "http://ftp.debian.org/debian/{path}/{name}".format( + path=source.get("Directory"), name=name + ) - package['download_url'] = download_url + 
package["download_url"] = download_url vcs_tool, vcs_repo = get_vcs_repo(source) if vcs_tool and vcs_repo: vcs_repo = form_vcs_url(vcs_tool, vcs_repo) - package['vcs_url'] = vcs_repo + package["vcs_url"] = vcs_repo - package['md5'] = f.get('md5sum') + package["md5"] = f.get("md5sum") # TODO: Why would we have more than a single SHA1 or SHA256 - sha1s = source.get('Checksums-Sha1', []) + sha1s = source.get("Checksums-Sha1", []) for sha1 in sha1s: - sha1value = sha1.get('sha1') - name = sha1.get('name') + sha1value = sha1.get("sha1") + name = sha1.get("name") if name and sha1value: - package['sha1'] = sha1value - sha256s = source.get('Checksums-Sha256', []) + package["sha1"] = sha1value + sha256s = source.get("Checksums-Sha256", []) for sha256 in sha256s: - sha256value = sha256.get('sha256') - name = sha256.get('name') + sha256value = sha256.get("sha256") + name = sha256.get("name") if name and sha256value: - package['sha256'] = sha256value + package["sha256"] = sha256value package = scan_models.DebianPackage(**package) package.set_purl(purl) yield package -@map_router.route('http://ftp.debian.org/debian/dists/.*Packages.gz') +@map_router.route("http://ftp.debian.org/debian/dists/.*Packages.gz") class DebianPackageFileMapper(Mapper): - def get_packages(self, uri, resource_uri): - """ - Yield Packages from a Debian Packages inex. - """ + """Yield Packages from a Debian Packages inex.""" metadata = resource_uri.data return parse_packages(metadata, resource_uri.package_url) def get_programming_language(tags): - """ - Return the programming language extracted from list of `tags` strings. - """ + """Return the programming language extracted from list of `tags` strings.""" for tag in tags: - key, _, value = tag.partition('::') - if key == 'implemented-in': + key, _, value = tag.partition("::") + if key == "implemented-in": return value @@ -532,42 +526,41 @@ def parse_packages(metadata, purl=None): """ for pack in debcon.get_paragraphs_data(metadata): data = dict( - name=pack['Package'], - version=pack['Version'], - homepage_url=pack.get('Homepage'), - code_view_url=pack.get('Vcs-Browser'), - description=pack.get('Description'), - bug_tracking_url=pack.get('Bugs'), + name=pack["Package"], + version=pack["Version"], + homepage_url=pack.get("Homepage"), + code_view_url=pack.get("Vcs-Browser"), + description=pack.get("Description"), + bug_tracking_url=pack.get("Bugs"), parties=[], - md5=pack.get('MD5sum'), - sha1=pack.get('SHA1'), - sha256=pack.get('SHA256'), + md5=pack.get("MD5sum"), + sha1=pack.get("SHA1"), + sha256=pack.get("SHA256"), ) - filename = pack.get('Filename'), + filename = (pack.get("Filename"),) if filename: - data['download_url'] = 'http://ftp.debian.org/debian/{}'.format(filename) + data["download_url"] = f"http://ftp.debian.org/debian/{filename}" - maintainers = pack.get('Maintainer') + maintainers = pack.get("Maintainer") if maintainers: name, email = debutils.parse_email(maintainers) if name: - party = scan_models.Party( - name=name, role='maintainer', email=email) - data['parties'].append(party) + party = scan_models.Party(name=name, role="maintainer", email=email) + data["parties"].append(party) dependencies = get_dependencies(pack) if dependencies: - data['dependencies'] = dependencies + data["dependencies"] = dependencies - keywords = debutils.comma_separated(pack.get('Tag', '')) + keywords = debutils.comma_separated(pack.get("Tag", "")) - section = pack.get('Section') + section = pack.get("Section") if section: keywords.append(section) - data['keywords'] = keywords + 
data["keywords"] = keywords - data['primary_language'] = get_programming_language(keywords) + data["primary_language"] = get_programming_language(keywords) package = scan_models.DebianPackage(**data) if purl: @@ -580,11 +573,12 @@ def parse_packages(metadata, purl=None): ################################################################################# -@map_router.route('http://ftp.debian.org/debian/dists/.*\.zip', - 'http://ftp.debian.org/debian/dists/.*\.jar', - 'http://ftp.debian.org/debian/dists/.*\.gz') +@map_router.route( + r"http://ftp.debian.org/debian/dists/.*\.zip", + r"http://ftp.debian.org/debian/dists/.*\.jar", + r"http://ftp.debian.org/debian/dists/.*\.gz", +) class DebianArchiveFileMapper(Mapper): - def get_packages(self, uri, resource_uri): return build_packages_from_dist_archive(resource_uri.data, resource_uri.uri) @@ -595,15 +589,15 @@ def build_packages_from_dist_archive(metadata, uri): which is a result by running ls LR command at the Debiain root folder. Yield as many Package as there are download URLs. """ - debian_dist_length = len('http://ftp.debian.org/debian/dists') + debian_dist_length = len("http://ftp.debian.org/debian/dists") # The parent folder URI related to uri file itself. - folder_uri = uri[debian_dist_length: uri.rindex('/')] - debian_dist_length = len('http://ftp.debian.org/debian/dists') + folder_uri = uri[debian_dist_length : uri.rindex("/")] + debian_dist_length = len("http://ftp.debian.org/debian/dists") # project name by trucking the uri - name = uri[debian_dist_length:uri.index('/', debian_dist_length)] + name = uri[debian_dist_length : uri.index("/", debian_dist_length)] folder_length = debian_dist_length + len(name) + 1 # version by analysing the uri - version = uri[folder_length:uri.index('/', folder_length)] + version = uri[folder_length : uri.index("/", folder_length)] common_data = dict( datasource_id="debian_archive_file", name=name, @@ -612,15 +606,14 @@ def build_packages_from_dist_archive(metadata, uri): # FIXME: this is NOT RIGHT def get_resourceuri_by_uri(uri): - """ - Return the Resource URI by searching with passing uri string value. 
- """ + """Return the Resource URI by searching with passing uri string value.""" from minecode.models import ResourceURI + uris = ResourceURI.objects.filter(uri=uri) if uris: return uris[0] - url_template = 'http://ftp.debian.org/debian/dists{name}' + url_template = "http://ftp.debian.org/debian/dists{name}" download_urls = [] for entry in ls.parse_directory_listing(metadata): if entry.type != ls.FILE: @@ -628,18 +621,18 @@ def get_resourceuri_by_uri(uri): path = entry.path if path.startswith(folder_uri): - path = path.lstrip('/') + path = path.lstrip("/") url = url_template.format(name=path) # FIXME: this is NOT RIGHT - if path.endswith('.md5') and url.replace('.md5', '') == uri: + if path.endswith(".md5") and url.replace(".md5", "") == uri: if get_resourceuri_by_uri(url) and get_resourceuri_by_uri(url).md5: - common_data['md5'] = get_resourceuri_by_uri(url).md5 + common_data["md5"] = get_resourceuri_by_uri(url).md5 # FIXME: this is NOT RIGHT - if path.endswith('.sha') and url.replace('.sha', '') == uri: + if path.endswith(".sha") and url.replace(".sha", "") == uri: if get_resourceuri_by_uri(url) and get_resourceuri_by_uri(url).sha1: - common_data['sha1'] = get_resourceuri_by_uri(url).sha1 + common_data["sha1"] = get_resourceuri_by_uri(url).sha1 - if path.endswith(('.jar', 'zip', 'gz')) and url != uri: + if path.endswith((".jar", "zip", "gz")) and url != uri: download_urls.append(url) if download_urls: @@ -648,14 +641,14 @@ def get_resourceuri_by_uri(uri): package_data=common_data, datafile_path=uri, ) - package['download_url'] = download_url + package["download_url"] = download_url yield package else: # yield package without a download_url value package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) + package_data=common_data, + datafile_path=uri, + ) # FIXME: this is NOT RIGHT: purl is not defined package.set_purl(package.purl) yield package diff --git a/minecode/miners/dockerhub.py b/minecode/miners/dockerhub.py index ff8685a5..3824294e 100644 --- a/minecode/miners/dockerhub.py +++ b/minecode/miners/dockerhub.py @@ -14,17 +14,18 @@ from packagedcode import models as scan_models from packageurl import PackageURL -from minecode import seed from minecode import map_router +from minecode import seed from minecode import visit_router -from minecode.miners import HttpVisitor +from minecode.miners import URI from minecode.miners import HttpJsonVisitor +from minecode.miners import HttpVisitor from minecode.miners import Mapper -from minecode.miners import URI def get_search_conditions(): - """ Return a list of combination of char and char, char and number, number and number. + """ + Return a list of combination of char and char, char and number, number and number. By doing this, we can pass the conditions to the query API of docker hub, the API does not support the single char, so we combine two chars as a list. For example: ['aa', 'ab', .....'a1', 'a2'.....'z9'...] @@ -45,75 +46,91 @@ def get_search_conditions(): class DockerHubSeed(seed.Seeder): - def get_seeds(self): - yield 'https://hub.docker.com/explore/?page=1' - search_uril_format = 'https://index.docker.io/v1/search?q={condition}&n=100&page=1' + yield "https://hub.docker.com/explore/?page=1" + search_uril_format = ( + "https://index.docker.io/v1/search?q={condition}&n=100&page=1" + ) for condition in get_search_conditions(): # yield a combination of query conditions, the API accepts at least # two chars for searching conditions. 
yield search_uril_format.format(condition=condition) -@visit_router.route('https://hub.docker.com/explore/\?page=\d?') +@visit_router.route(r"https://hub.docker.com/explore/\?page=\d?") class DockHubExplorePageVisitor(HttpVisitor): - """ - Visit the HTML page of DockerHub Explore Page and yield each uri of the project, and yield the next page of DockHub. - """ + """Visit the HTML page of DockerHub Explore Page and yield each uri of the project, and yield the next page of DockHub.""" def get_uris(self, content): - dockhub_library_html_template = 'https://hub.docker.com/{project}' - dockhub_library_restapi_template = 'https://registry.hub.docker.com/v2/repositories/library/{project}' - dockhub_next_page_template = 'https://hub.docker.com/explore/?page={page}' + dockhub_library_html_template = "https://hub.docker.com/{project}" + dockhub_library_restapi_template = ( + "https://registry.hub.docker.com/v2/repositories/library/{project}" + ) + dockhub_next_page_template = "https://hub.docker.com/explore/?page={page}" page_legal = False - page = BeautifulSoup(content, 'lxml') - for a in page.find_all(name='a'): - if 'href' not in a.attrs: + page = BeautifulSoup(content, "lxml") + for a in page.find_all(name="a"): + if "href" not in a.attrs: continue - href = a['href'] - if href and href.startswith('/_/'): + href = a["href"] + if href and href.startswith("/_/"): page_legal = True project_name = href[1:] - package_url = PackageURL(type='docker', name=project_name.replace('_/', 'library/').rstrip('/')).to_string() - yield URI(uri=dockhub_library_html_template.format(project=project_name), package_url=package_url, source_uri=self.uri) - yield URI(uri=dockhub_library_restapi_template.format(project=href.partition('/_/')[-1]), package_url=package_url, source_uri=self.uri) + package_url = PackageURL( + type="docker", + name=project_name.replace("_/", "library/").rstrip("/"), + ).to_string() + yield URI( + uri=dockhub_library_html_template.format(project=project_name), + package_url=package_url, + source_uri=self.uri, + ) + yield URI( + uri=dockhub_library_restapi_template.format( + project=href.partition("/_/")[-1] + ), + package_url=package_url, + source_uri=self.uri, + ) if page_legal: - current_page = int(self.uri.partition('=')[-1]) + current_page = int(self.uri.partition("=")[-1]) next_page = current_page + 1 - yield URI(uri=dockhub_next_page_template.format(page=next_page), source_uri=self.uri) + yield URI( + uri=dockhub_next_page_template.format(page=next_page), + source_uri=self.uri, + ) -@visit_router.route('https://hub.docker.com/_/[\w\-\.]+/') +@visit_router.route(r"https://hub.docker.com/_/[\w\-\.]+/") class DockHubProjectHTMLVisitor(HttpVisitor): - def dumps(self, content): - """ - Return the json by parsing the HTML project page - """ + """Return the json by parsing the HTML project page""" metadata_dict = dict() - page = BeautifulSoup(content, 'lxml') - for div in page.find_all(name='div'): - for span in div.find_all(name='span'): - if span.string == 'Short Description': + page = BeautifulSoup(content, "lxml") + for div in page.find_all(name="div"): + for span in div.find_all(name="span"): + if span.string == "Short Description": next_sibling = div.next_sibling if next_sibling: - for sibling_span in next_sibling.find_all(name='span'): + for sibling_span in next_sibling.find_all(name="span"): sibling_text = sibling_span.string - metadata_dict['summary'] = sibling_text - for h1 in div.find_all(name='h1'): - if h1.string == 'License': + metadata_dict["summary"] = sibling_text + for h1 in 
div.find_all(name="h1"): + if h1.string == "License": licenses_paras = [] next_sibling = h1.next_sibling - while(next_sibling): + while next_sibling: if next_sibling.string: licenses_paras.append(next_sibling.string) next_sibling = next_sibling.next_sibling if licenses_paras: - metadata_dict['license_text'] = ''.join(licenses_paras) + metadata_dict["license_text"] = "".join(licenses_paras) return json.dumps(metadata_dict) -@visit_router.route('https://registry.hub.docker.com/v2/repositories/library/[\w\-\.]+/') +@visit_router.route( + r"https://registry.hub.docker.com/v2/repositories/library/[\w\-\.]+/" +) class DockHubLibraryRESTJsonVisitor(HttpJsonVisitor): """ Return URIs by parsing the json content of API of Dock Hub library @@ -121,29 +138,35 @@ class DockHubLibraryRESTJsonVisitor(HttpJsonVisitor): """ -@visit_router.route('https://index.docker.io/v1/search\?q=\w\w&n=100&page=\d+') +@visit_router.route(r"https://index.docker.io/v1/search\?q=\w\w&n=100&page=\d+") class DockHubGetAllProjectsFromSearchVisitor(HttpJsonVisitor): def get_uris(self, content): - base_url = 'https://hub.docker.com/v2/repositories/{name}' - num_page = content.get('num_pages') - current_page = content.get('page') + base_url = "https://hub.docker.com/v2/repositories/{name}" + num_page = content.get("num_pages") + current_page = content.get("page") if num_page and current_page: if int(current_page) < int(num_page): next_page = int(current_page) + 1 - yield URI(uri=(self.uri.rpartition('=')[0] + '=' + str(next_page)), source_uri=self.uri) - results = content.get('results', {}) + yield URI( + uri=(self.uri.rpartition("=")[0] + "=" + str(next_page)), + source_uri=self.uri, + ) + results = content.get("results", {}) for result in results: - name = result.get('name') + name = result.get("name") # TODO: This will be used when new Package definition is merged. - star_count = result.get('star_count') + star_count = result.get("star_count") if name: - package_url = PackageURL(type='docker', name=name).to_string() - yield URI(uri=base_url.format(name=name), package_url=package_url, source_uri=self.uri) + package_url = PackageURL(type="docker", name=name).to_string() + yield URI( + uri=base_url.format(name=name), + package_url=package_url, + source_uri=self.uri, + ) -@map_router.route('https://registry.hub.docker.com/v2/repositories/library/[\w\-\.]+/') +@map_router.route(r"https://registry.hub.docker.com/v2/repositories/library/[\w\-\.]+/") class DockerHubLiraryJsonMapper(Mapper): - def get_packages(self, uri, resource_uri): """ Yield Package built from resource_uri record for a single @@ -151,7 +174,9 @@ def get_packages(self, uri, resource_uri): Yield as many Package as there are download URLs. 
""" metadata = resource_uri.data - build_packages_from_jsonfile(metadata, resource_uri.uri, resource_uri.package_url) + build_packages_from_jsonfile( + metadata, resource_uri.uri, resource_uri.package_url + ) def build_packages_from_jsonfile(metadata, uri=None, purl=None): @@ -162,15 +187,15 @@ def build_packages_from_jsonfile(metadata, uri=None, purl=None): purl: String value of the package url of the ResourceURI object """ content = json.loads(metadata) - dockhub_library_htmlpage_template = 'https://hub.docker.com/_/{project}' - name = content.get('name') + dockhub_library_htmlpage_template = "https://hub.docker.com/_/{project}" + name = content.get("name") if name: - short_desc = content.get('description') - long_desc = content.get('full_description') + short_desc = content.get("description") + long_desc = content.get("full_description") descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) + description = "\n".join(descriptions) common_data = dict( - type='docker', + type="docker", name=name, description=description, homepage_url=dockhub_library_htmlpage_template.format(project=name), diff --git a/minecode/miners/eclipse.py b/minecode/miners/eclipse.py index 6166dc14..60c0e525 100644 --- a/minecode/miners/eclipse.py +++ b/minecode/miners/eclipse.py @@ -14,41 +14,40 @@ from packagedcode import models as scan_models from packageurl import PackageURL -from minecode import seed from minecode import map_router +from minecode import seed from minecode import visit_router -from minecode.miners import Mapper +from minecode.miners import URI from minecode.miners import HttpJsonVisitor from minecode.miners import HttpVisitor -from minecode.miners import URI +from minecode.miners import Mapper class EclipseSeed(seed.Seeder): - def get_seeds(self): - yield 'http://projects.eclipse.org/json/projects/all' + yield "http://projects.eclipse.org/json/projects/all" -@visit_router.route('https://projects.eclipse.org/list-of-projects') +@visit_router.route("https://projects.eclipse.org/list-of-projects") class EclipseProjectVisitors(HttpVisitor): - """ - Visit the HTML page of eclipse projects page and return the Packages info, json data and error. - """ + """Visit the HTML page of eclipse projects page and return the Packages info, json data and error.""" def get_uris(self, content): - page = BeautifulSoup(content, 'lxml') - for a in page.find_all(name='a'): - if 'href' not in a.attrs: + page = BeautifulSoup(content, "lxml") + for a in page.find_all(name="a"): + if "href" not in a.attrs: continue - href = a['href'] - if href and href.startswith('https://projects.eclipse.org/projects/'): + href = a["href"] + if href and href.startswith("https://projects.eclipse.org/projects/"): # if the herf content starts with Eclipse single project suffix, generate a URI with the href content - project_name = href.replace('https://projects.eclipse.org/projects/', '') - package_url = PackageURL(type='eclipse', name=project_name).to_string() + project_name = href.replace( + "https://projects.eclipse.org/projects/", "" + ) + package_url = PackageURL(type="eclipse", name=project_name).to_string() yield URI(uri=href, package_url=package_url, source_uri=self.uri) -@visit_router.route('https://projects.eclipse.org/projects/.*') +@visit_router.route("https://projects.eclipse.org/projects/.*") class EclipseSingleProjectVisitor(HttpVisitor): """ Visit the HTML page of single eclipse project. 
@@ -58,118 +57,124 @@ class EclipseSingleProjectVisitor(HttpVisitor): For example:https://projects.eclipse.org/projects/modeling.m2t.accele """ + pass -@visit_router.route('http://git.eclipse.org/c') +@visit_router.route("http://git.eclipse.org/c") class EclipseGitVisitor(HttpVisitor): - """ - Visitor Eclipse Git HTML page and return URIs in the Git HTML page. - """ + """Visitor Eclipse Git HTML page and return URIs in the Git HTML page.""" def get_uris(self, content): - page = BeautifulSoup(content, 'lxml') - for td in page.find_all(name='td'): - if 'class' not in td.attrs: + page = BeautifulSoup(content, "lxml") + for td in page.find_all(name="td"): + if "class" not in td.attrs: continue - if td.attrs.get('class') != ['sublevel-repo']: + if td.attrs.get("class") != ["sublevel-repo"]: continue - for a in td.findChildren(name='a'): - href = a['href'] + for a in td.findChildren(name="a"): + href = a["href"] name = a.contents[0] - package_url = PackageURL(type='eclipse', name=name).to_string() + package_url = PackageURL(type="eclipse", name=name).to_string() yield URI(uri=href, package_url=package_url, source_uri=self.uri) -@visit_router.route('http://www.eclipse.org/downloads/packages/all') +@visit_router.route("http://www.eclipse.org/downloads/packages/all") class EclipsePackagesVisitor(HttpVisitor): - """ - Visit the Eclipse packages HTML page and return URIs parsed from HTML page. - """ + """Visit the Eclipse packages HTML page and return URIs parsed from HTML page.""" def fetch(self, uri, timeout=40): - """ - Fetch and return the content found at a remote uri with an extra timeout - """ + """Fetch and return the content found at a remote uri with an extra timeout""" return HttpVisitor.fetch(self, uri, timeout=timeout) def get_uris(self, content): - page = BeautifulSoup(content, 'lxml') - for td in page.find_all(name='span'): - if 'class' not in td.attrs: + page = BeautifulSoup(content, "lxml") + for td in page.find_all(name="span"): + if "class" not in td.attrs: continue - if td.attrs.get('class') != ['field-content']: + if td.attrs.get("class") != ["field-content"]: continue - a = td.find(name='a') - href = a['href'] + a = td.find(name="a") + href = a["href"] name = a.contents[0] # Skip some of the nodes if it's a HTML tag but not a string if name and isinstance(name, str): - package_url = PackageURL(type='eclipse', name=name).to_string() + package_url = PackageURL(type="eclipse", name=name).to_string() yield URI(uri=href, package_url=package_url, source_uri=self.uri) -@visit_router.route('http://www.eclipse.org/downloads/packages/release/.*') +@visit_router.route("http://www.eclipse.org/downloads/packages/release/.*") class EclipseReleaseVisitor(HttpVisitor): - """ - Visit the Eclipse release HTML page and return expected Package URIs. 
- """ + """Visit the Eclipse release HTML page and return expected Package URIs.""" def get_uris(self, content): - page = BeautifulSoup(content, 'lxml') - suffix_list = ['-win32.zip', '-win64.exe', '-win32-x86_64.zip', '-linux-gtk-x86_64.tar.gz', - '-linux-gtk-x86_64.tar.gz', '-macosx-cocoa-x86_64.tar.gz', '-linux-gtk.tar.gz', '-x86_64.tar.gz'] - for div in page.find_all(name='div'): - for a in div.find_all(name='a'): - url = a.get('href') - if url and 'download.php?file=' in url: + page = BeautifulSoup(content, "lxml") + suffix_list = [ + "-win32.zip", + "-win64.exe", + "-win32-x86_64.zip", + "-linux-gtk-x86_64.tar.gz", + "-linux-gtk-x86_64.tar.gz", + "-macosx-cocoa-x86_64.tar.gz", + "-linux-gtk.tar.gz", + "-x86_64.tar.gz", + ] + for div in page.find_all(name="div"): + for a in div.find_all(name="a"): + url = a.get("href") + if url and "download.php?file=" in url: file_name = fileutils.file_name(url) name = file_name for suffix in suffix_list: - name = name.replace(suffix, '') - package_url = PackageURL(type='eclipse', name=name).to_string() - yield URI(uri=url, file_name=file_name, package_url=package_url, source_uri=self.uri) + name = name.replace(suffix, "") + package_url = PackageURL(type="eclipse", name=name).to_string() + yield URI( + uri=url, + file_name=file_name, + package_url=package_url, + source_uri=self.uri, + ) -@visit_router.route('http://projects.eclipse.org/json/projects/all') +@visit_router.route("http://projects.eclipse.org/json/projects/all") class EclipseProjectsJsonVisitor(HttpJsonVisitor): - """ - Visit the Ecipse json API and return expected project specified URIs. - """ + """Visit the Ecipse json API and return expected project specified URIs.""" def fetch(self, uri, timeout=40): - """ - Fetch and return the content found at a remote uri with an extra timeout - """ + """Fetch and return the content found at a remote uri with an extra timeout""" return HttpJsonVisitor.fetch(self, uri, timeout=timeout) def get_uris(self, content): - url_template = 'http://projects.eclipse.org/json/project/{name}' - projects = content.get('projects', {}) + url_template = "http://projects.eclipse.org/json/project/{name}" + projects = content.get("projects", {}) for project in projects: # TODO: are we sure there is not more data available in this JSON? - package_url = PackageURL(type='eclipse', name=project).to_string() - yield URI(uri=url_template.format(name=project), package_url=package_url, source_uri=self.uri) + package_url = PackageURL(type="eclipse", name=project).to_string() + yield URI( + uri=url_template.format(name=project), + package_url=package_url, + source_uri=self.uri, + ) -@visit_router.route('http://projects.eclipse.org/json/project/.*') +@visit_router.route("http://projects.eclipse.org/json/project/.*") class EclipseSingleProjectJsonVisitor(HttpJsonVisitor): """ Visit json of a single Eclipse project. This is to return the json itself without any URIs, as the URI itself is returned by EclipseProjectsJsonVisitor. """ + pass # FIXME: we should create packages from releases!!!! 
not from projects -@map_router.route('http://projects.eclipse.org/json/project/.*') +@map_router.route("http://projects.eclipse.org/json/project/.*") class EclipseJsonPackageMapper(Mapper): - def get_packages(self, uri, resource_uri): """ Yield Package built from resource_uri record for a single @@ -189,40 +194,39 @@ def build_packages_with_json(metadata, purl=None, uri=None): metadata: json metadata content purl: String value of the package url of the ResourceURI object """ - - projects = metadata['projects'] + projects = metadata["projects"] for project, project_metadata in projects.items(): common_data = dict( datasource_id="eclipse_metadata", - type='eclipse', + type="eclipse", name=project, ) - descriptions = project_metadata.get('description') + descriptions = project_metadata.get("description") if descriptions and len(descriptions) > 0: - common_data['description'] = descriptions[0].get('value') + common_data["description"] = descriptions[0].get("value") else: - common_data['description'] = project_metadata['title'] + common_data["description"] = project_metadata["title"] - homepage_urls = project_metadata.get('website_url') + homepage_urls = project_metadata.get("website_url") if homepage_urls and len(homepage_urls) > 0: - common_data['homepage_url'] = homepage_urls[0].get('url') + common_data["homepage_url"] = homepage_urls[0].get("url") - bug_tracking_urls = project_metadata.get('bugzilla') + bug_tracking_urls = project_metadata.get("bugzilla") if bug_tracking_urls and len(bug_tracking_urls) > 0: - common_data['bug_tracking_url'] = bug_tracking_urls[0].get( - 'query_url') + common_data["bug_tracking_url"] = bug_tracking_urls[0].get("query_url") - if project_metadata.get('licenses'): - common_data['extracted_license_statement'] = [ - l.get('name') for l in project_metadata.get('licenses', [])] - common_data['license_detections'] = [] + if project_metadata.get("licenses"): + common_data["extracted_license_statement"] = [ + l.get("name") for l in project_metadata.get("licenses", []) + ] + common_data["license_detections"] = [] # FIXME: this is a download page and NOT a download URL!!!!! - for download_url in project_metadata.get('download_url', []): - durl = download_url.get('url') + for download_url in project_metadata.get("download_url", []): + durl = download_url.get("url") if durl: - common_data['download_url'] = durl + common_data["download_url"] = durl package = scan_models.Package.from_package_data( package_data=common_data, datafile_path=uri, @@ -231,7 +235,7 @@ def build_packages_with_json(metadata, purl=None, uri=None): yield package -@map_router.route('https://projects.eclipse.org/projects/.*') +@map_router.route("https://projects.eclipse.org/projects/.*") class EclipseHTMLProjectMapper(Mapper): def get_packages(self, uri, resource_uri): """ @@ -248,43 +252,45 @@ def build_packages(html_text, purl=None, uri=None): Yield Package objects built from `html_text`and the `purl` package URL string. 
""" - page = BeautifulSoup(html_text, 'lxml') + page = BeautifulSoup(html_text, "lxml") common_data = dict( datasource_id="eclipse_html", - type='eclipse', + type="eclipse", ) extracted_license_statement = [] - for meta in page.find_all(name='meta'): - if 'name' in meta.attrs and 'dcterms.title' in meta.attrs.get('name'): - common_data['name'] = meta.attrs.get('content') - if 'name' in meta.attrs and 'dcterms.description' in meta.attrs.get('name'): - common_data['description'] = meta.attrs.get('content') - - for div in page.find_all(name='div'): - if 'class' not in div.attrs: + for meta in page.find_all(name="meta"): + if "name" in meta.attrs and "dcterms.title" in meta.attrs.get("name"): + common_data["name"] = meta.attrs.get("content") + if "name" in meta.attrs and "dcterms.description" in meta.attrs.get("name"): + common_data["description"] = meta.attrs.get("content") + + for div in page.find_all(name="div"): + if "class" not in div.attrs: continue - if 'field-name-field-project-licenses' in div.attrs.get('class'): + if "field-name-field-project-licenses" in div.attrs.get("class"): # Visit div element whose class atttribute is field-name-field-project-licenses - for a in div.find_all(name='a'): - if 'href' not in a.attrs: + for a in div.find_all(name="a"): + if "href" not in a.attrs: continue license_name = str(a.contents[0]) extracted_license_statement.append(license_name) if extracted_license_statement: - common_data['extracted_license_statement'] = extracted_license_statement - common_data['license_detections'] = [] + common_data["extracted_license_statement"] = extracted_license_statement + common_data["license_detections"] = [] - for a in page.find_all(name='a'): + for a in page.find_all(name="a"): if a.contents: - if str(a.contents[0]).strip() == 'Website': - common_data['homepage_url'] = a['href'] + if str(a.contents[0]).strip() == "Website": + common_data["homepage_url"] = a["href"] - for a in page.find_all(name='a'): + for a in page.find_all(name="a"): if not a.contents: continue - if str(a.contents[0]).strip() == 'Downloads': - download_data = dict(download_url=a['href'],) + if str(a.contents[0]).strip() == "Downloads": + download_data = dict( + download_url=a["href"], + ) download_data.update(common_data) package = scan_models.Package.from_package_data( package_data=download_data, @@ -293,27 +299,27 @@ def build_packages(html_text, purl=None, uri=None): package.set_purl(purl) yield package - for div in page.find_all(name='div'): - if 'class' not in div.attrs: + for div in page.find_all(name="div"): + if "class" not in div.attrs: continue - if 'field-name-field-latest-releases' not in div.attrs.get('class'): + if "field-name-field-latest-releases" not in div.attrs.get("class"): continue # Visit div element whose class attribute is ield-name-field-latest-releases - tbody = div.find(name='tbody') + tbody = div.find(name="tbody") if not tbody: continue - for tr in tbody.find_all(name='tr'): - for td in tr.find_all(name='td'): - a = td.find(name='a') + for tr in tbody.find_all(name="tr"): + for td in tr.find_all(name="td"): + a = td.find(name="a") if not a: continue - if 'href' not in a.attrs or 'class' in a.attrs: + if "href" not in a.attrs or "class" in a.attrs: continue version = a.contents[0] - href = a['href'] + href = a["href"] download_data = dict( version=version, download_url=href, diff --git a/minecode/miners/fdroid.py b/minecode/miners/fdroid.py index d0b4085d..4a10a162 100644 --- a/minecode/miners/fdroid.py +++ b/minecode/miners/fdroid.py @@ -15,20 +15,20 @@ from 
packagedcode.models import party_person from packageurl import PackageURL -from minecode import seed from minecode import map_router +from minecode import seed from minecode import visit_router -from minecode.miners import Mapper from minecode.miners import URI +from minecode.miners import Mapper from minecode.miners import NonPersistentHttpVisitor - TRACE = False logger = logging.getLogger(__name__) if TRACE: import sys + logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.DEBUG) @@ -59,24 +59,21 @@ class FdroidSeed(seed.Seeder): - def get_seeds(self): - yield 'https://f-droid.org/repo/index-v2.json' + yield "https://f-droid.org/repo/index-v2.json" def build_purl(package_id, version_code, filename): - """ - Return a PackageURL for an F-Droid package. - """ + """Return a PackageURL for an F-Droid package.""" return PackageURL( - type='fdroid', + type="fdroid", name=package_id, version=version_code, - qualifiers=dict(filename=filename) + qualifiers=dict(filename=filename), ) -@visit_router.route('https://f-droid.org/repo/index-v2.json') +@visit_router.route("https://f-droid.org/repo/index-v2.json") class FdroidIndexVisitor(NonPersistentHttpVisitor): """ Collect package metadata URIs from the F-Droid index for each package. @@ -84,30 +81,29 @@ class FdroidIndexVisitor(NonPersistentHttpVisitor): """ def get_uris(self, content): - """ - Yield a URI for each F-Droid package. - """ + """Yield a URI for each F-Droid package.""" json_location = content with open(json_location) as c: content = json.loads(c.read()) - packages = content['packages'] + packages = content["packages"] for package_id, package_data in packages.items(): - purl = PackageURL(type='fdroid', name=package_id).to_string() + purl = PackageURL(type="fdroid", name=package_id).to_string() yield URI( uri=purl, package_url=purl, source_uri=self.uri, - data=json.dumps(package_data, separators=(',', ':'), ensure_ascii=False), + data=json.dumps( + package_data, separators=(",", ":"), ensure_ascii=False + ), # note: visited is True since there nothing more to visit - visited=True + visited=True, ) -@map_router.route('pkg:fdroid/.+') +@map_router.route("pkg:fdroid/.+") class FdroidPackageMapper(Mapper): - def get_packages(self, uri, resource_uri): """ Yield Package(s) built from the index data for all versions of an F-Droid @@ -127,20 +123,20 @@ def build_packages(purl, data): # we map categories to keyword # "categories": ["Time"], - keywords = metadata.get('categories', []) + keywords = metadata.get("categories", []) # "issueTracker": "https://github.com/jdmonin/anstop/issues", - bug_tracking_url = metadata.get('issueTracker') + bug_tracking_url = metadata.get("issueTracker") # "license": "GPL-2.0-only", # this is supposed to be an SPDX expression - extracted_license_statement = metadata.get('license') + extracted_license_statement = metadata.get("license") # "sourceCode": "https://github.com/jdmonin/anstop", - vcs_url = metadata.get('sourceCode') + vcs_url = metadata.get("sourceCode") # "webSite": "https://sourceforge.net/projects/androidspeedo", - homepage_url = metadata.get('webSite') + homepage_url = metadata.get("webSite") description = build_description(metadata, language="en-US") @@ -148,16 +144,18 @@ def build_packages(purl, data): # "authorEmail": "jigsaw-code@google.com", # "authorName": "Jigsaw", # "authorWebSite": "https://jigsaw.google.com/", - author_name = metadata.get('authorName') - author_email = metadata.get('authorEmail') - author_url = metadata.get('authorWebSite') + author_name = metadata.get("authorName") 
+ author_email = metadata.get("authorEmail") + author_url = metadata.get("authorWebSite") if any([author_name, author_email, author_url]): - parties.append(Party( - type=party_person, - name=author_name, - role="author", - email=author_email, - url=author_url) + parties.append( + Party( + type=party_person, + name=author_name, + role="author", + email=author_email, + url=author_url, + ) ) # TODO: add these @@ -176,7 +174,7 @@ def build_packages(purl, data): extracted_license_statement=extracted_license_statement, vcs_url=vcs_url, homepage_url=homepage_url, - repository_homepage_url=f'https://f-droid.org/en/packages/{base_purl.name}', + repository_homepage_url=f"https://f-droid.org/en/packages/{base_purl.name}", description=description, parties=parties, ) @@ -186,22 +184,21 @@ def build_packages(purl, data): # "added": 1344556800000, # "file": { # "name": "/An.stop_10.apk", .... - versions = data['versions'] + versions = data["versions"] for _sha256_of_apk, version_data in versions.items(): # TODO: collect versionName - version_code = str(version_data['manifest']['versionCode']) - logger.debug( - f'build_packages: base_purl: {base_purl} version: {version_code}') - logger.debug(f'build_packages: data: {version_data}') + version_code = str(version_data["manifest"]["versionCode"]) + logger.debug(f"build_packages: base_purl: {base_purl} version: {version_code}") + logger.debug(f"build_packages: data: {version_data}") # TODO: add release_date from "added": 1655164800000, # these must exists since F-Droid builds from sources - src = version_data['src'] - src_filename = src['name'] - src_sha256 = src['sha256'] - src_size = src['size'] + src = version_data["src"] + src_filename = src["name"] + src_sha256 = src["sha256"] + src_size = src["size"] download_url = f'https://f-droid.org/repo/{src_filename.strip("/")}' package_mapping = dict( @@ -219,15 +216,15 @@ def build_packages(purl, data): type=src.type, name=src.name, version=src.version, - qualifiers=dict(download_url=download_url) + qualifiers=dict(download_url=download_url), ) # these must exists or there is no F-Droid package - file = version_data['file'] - filename = file['name'] - sha256 = file['sha256'] - size = file['size'] - download_url = f'https://f-droid.org/repo/{filename}' + file = version_data["file"] + filename = file["name"] + sha256 = file["sha256"] + size = file["size"] + download_url = f"https://f-droid.org/repo/{filename}" package_mappping = dict( version=version_code, @@ -241,7 +238,7 @@ def build_packages(purl, data): yield PackageData.from_data(package_mapping) -def build_description(metadata, language='en-US'): +def build_description(metadata, language="en-US"): r""" Return a description in ``language`` built from a package name, summary and description, one per line. 
@@ -273,20 +270,20 @@ def build_description(metadata, language='en-US'):
     >>> build_description(metadata)
     'Anstop'
     """
-    names = metadata.get('name') or {}
+    names = metadata.get("name") or {}
     name = names.get(language)
 
-    summaries = metadata.get('summary') or {}
+    summaries = metadata.get("summary") or {}
     summary = summaries.get(language)
     if name and summary and summary.startswith(name):
         name = None
 
-    descriptions = metadata.get('description') or {}
+    descriptions = metadata.get("description") or {}
     description = descriptions.get(language)
     if summary and description and description.startswith(summary):
         summary = None
 
     non_empty_parts = [p for p in [name, summary, description] if p]
-    return '\n'.join(non_empty_parts)
+    return "\n".join(non_empty_parts)
diff --git a/minecode/miners/fedora.py b/minecode/miners/fedora.py
index ae93637b..9c20c179 100644
--- a/minecode/miners/fedora.py
+++ b/minecode/miners/fedora.py
@@ -1,4 +1,3 @@
-
 # use this to find all /repodata directories:
 # https://archive.fedoraproject.org/pub/DIRECTORY_SIZES.txt
diff --git a/minecode/miners/freebsd.py b/minecode/miners/freebsd.py
index aa75f2fa..c2c0fbde 100644
--- a/minecode/miners/freebsd.py
+++ b/minecode/miners/freebsd.py
@@ -7,25 +7,24 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
 
-from io import StringIO
 import logging
 import os
+from io import StringIO
 
+import saneyaml
 from bs4 import BeautifulSoup
 from packagedcode.freebsd import CompactManifestHandler
-import saneyaml
 
-from minecode import seed
 from minecode import map_router
+from minecode import seed
 from minecode import visit_router
-from minecode.utils import extract_file
-from minecode.miners import Mapper
+from minecode.miners import URI
 from minecode.miners import HttpVisitor
+from minecode.miners import Mapper
 from minecode.miners import NonPersistentHttpVisitor
-from minecode.miners import URI
+from minecode.utils import extract_file
 from minecode.utils import get_temp_dir
 
-
 logger = logging.getLogger(__name__)
 handler = logging.StreamHandler()
 logger.addHandler(handler)
@@ -33,68 +32,59 @@ class FreeBSDSeed(seed.Seeder):
-
     def get_seeds(self):
-        yield 'https://pkg.freebsd.org'
+        yield "https://pkg.freebsd.org"
 
 
-@visit_router.route('https://pkg.freebsd.org')
+@visit_router.route("https://pkg.freebsd.org")
 class FreeBSDBaseHTMLVisitors(HttpVisitor):
-    """
-    Visit the freeBSD home link and yield uri for each FreeBSD repo
-    """
+    """Visit the FreeBSD home page and yield a URI for each FreeBSD repo"""
 
     def get_uris(self, content):
-        page = BeautifulSoup(content, 'lxml')
-        base_url = 'https://pkg.freebsd.org/{path}/'
-        for a in page.find_all(name='a'):
-            if 'href' not in a.attrs:
+        page = BeautifulSoup(content, "lxml")
+        base_url = "https://pkg.freebsd.org/{path}/"
+        for a in page.find_all(name="a"):
+            if "href" not in a.attrs:
                 continue
-            href = a['href']
+            href = a["href"]
            # the useful sub links look like: FreeBSD:13:aarch64
-            if href and href.startswith('FreeBSD%3A'):
+            if href and href.startswith("FreeBSD%3A"):
                 url = base_url.format(path=href)
                 yield URI(uri=url, source_uri=self.uri)
 
 
-@visit_router.route('https://pkg.freebsd.org/.*/')
+@visit_router.route("https://pkg.freebsd.org/.*/")
 class FreeBSDSubHTMLVisitors(HttpVisitor):
-    """
-    Visit the sub repo URL and yield all uris in the page and in its children page
-    """
+    """Visit the sub repo URL and yield all URIs in the page and in its child pages"""
 
     def get_uris(self, content):
-        page = BeautifulSoup(content, 'lxml')
-        base_url = self.uri + '{path}'
-        for a in page.find_all(name='a'):
-            if 'href' not in a.attrs or 'title' not in a.attrs:
+        page = BeautifulSoup(content, "lxml")
+        base_url = self.uri + "{path}"
+        for a in page.find_all(name="a"):
+            if "href" not in a.attrs or "title" not in a.attrs:
                 # parent link doesn't have title.
                 continue
-            href = a['href']
+            href = a["href"]
             url = base_url.format(path=href)
             yield URI(uri=url, source_uri=self.uri)
 
 
-@visit_router.route('https://pkg.freebsd.org/.*packagesite.txz')
+@visit_router.route("https://pkg.freebsd.org/.*packagesite.txz")
 class FreeBSDIndexVisitors(NonPersistentHttpVisitor):
-    """
-    Extract packagesite.txz index file, get the data of packagesite.yaml file.
-    """
+    """Extract the packagesite.txz index file and return the data of its packagesite.yaml file."""
 
     def dumps(self, content):
-        """
-        Extract the file packagesite.yaml and read the content of the file and return.
-        """
+        """Extract the packagesite.yaml file, then read and return its content."""
         extracted_location = extract_file(content)
-        manifest_file = os.path.join(extracted_location, 'packagesite.yaml')
+        manifest_file = os.path.join(extracted_location, "packagesite.yaml")
         if os.path.exists(manifest_file):
             with open(manifest_file) as file_handler:
                 return file_handler.read()
         else:
-            logger.warn('The packagesite.yaml is not existing in index file:' + content)
+            logger.warning("The packagesite.yaml file does not exist in the index file: " + content)
 
 
-@map_router.route('https://pkg.freebsd.org/.*packagesite.txz')
+@map_router.route("https://pkg.freebsd.org/.*packagesite.txz")
 class FreeBSDIndexMapper(Mapper):
     def get_packages(self, uri, resource_uri):
         """
@@ -114,15 +104,15 @@ def build_packages(metadata, purl=None):
     buf = StringIO(metadata)
     # The metadata passed in is not well-formed yaml or json as a whole, but each line is a valid yaml document, so read line by line and parse with the FreeBSD package parser.
for each_line in buf: - if each_line and each_line.strip() in ('', '{', '}'): + if each_line and each_line.strip() in ("", "{", "}"): continue content = saneyaml.load(each_line) - if content and content.get('name'): - temp_dir = get_temp_dir('freebsd_index') - location = os.path.join(temp_dir, '+COMPACT_MANIFEST') - with open(location, 'w') as manifest: + if content and content.get("name"): + temp_dir = get_temp_dir("freebsd_index") + location = os.path.join(temp_dir, "+COMPACT_MANIFEST") + with open(location, "w") as manifest: manifest.write(each_line) - with open(location, encoding='utf-8') as loc: + with open(location, encoding="utf-8") as loc: yaml_data = saneyaml.load(loc) package = CompactManifestHandler._parse(yaml_data=yaml_data) package.set_purl(purl) diff --git a/minecode/miners/freedesktop.py b/minecode/miners/freedesktop.py index 8dc10e4e..5ca9802a 100644 --- a/minecode/miners/freedesktop.py +++ b/minecode/miners/freedesktop.py @@ -11,49 +11,52 @@ from packagedcode import models as scan_models from packageurl import PackageURL -from minecode import seed from minecode import map_router +from minecode import seed from minecode import visit_router -from minecode.miners import Mapper -from minecode.miners import HttpVisitor from minecode.miners import URI +from minecode.miners import HttpVisitor +from minecode.miners import Mapper from minecode.utils import form_vcs_url class FreedesktopSeed(seed.Seeder): - def get_seeds(self): - yield 'https://www.freedesktop.org/wiki/Software' + yield "https://www.freedesktop.org/wiki/Software" -@visit_router.route('https://www.freedesktop.org/wiki/Software') +@visit_router.route("https://www.freedesktop.org/wiki/Software") class FreedesktopHTMLVisitor(HttpVisitor): - """ - Visit the Freedesktop Software HTML page and return URIs parsed from HTML page. - """ + """Visit the Freedesktop Software HTML page and return URIs parsed from HTML page.""" + def get_uris(self, content): - url_template = 'https://www.freedesktop.org/wiki/Software/{name}' - page = BeautifulSoup(content, 'lxml') - for div in page.find_all(name='div'): - for a in div.find_all(name='a'): - if 'href' not in a.attrs: + url_template = "https://www.freedesktop.org/wiki/Software/{name}" + page = BeautifulSoup(content, "lxml") + for div in page.find_all(name="div"): + for a in div.find_all(name="a"): + if "href" not in a.attrs: continue - href = a['href'] - if href and href.startswith('./'): - project_name = href.replace('./', '').strip('/') - package_url = PackageURL(type='freedesktop', name=project_name).to_string() - yield URI(uri=url_template.format(name=project_name), package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://www.freedesktop.org/wiki/Software/.*') + href = a["href"] + if href and href.startswith("./"): + project_name = href.replace("./", "").strip("/") + package_url = PackageURL( + type="freedesktop", name=project_name + ).to_string() + yield URI( + uri=url_template.format(name=project_name), + package_url=package_url, + source_uri=self.uri, + ) + + +@visit_router.route("https://www.freedesktop.org/wiki/Software/.*") class FreedesktopProjectHTMLVisitor(HttpVisitor): - """ - Visit the Freedesktop Project HTML page. 
- """ + """Visit the Freedesktop Project HTML page.""" + pass -@map_router.route('https://www.freedesktop.org/wiki/Software/.*') +@map_router.route("https://www.freedesktop.org/wiki/Software/.*") class FreedesktopHTMLProjectMapper(Mapper): def get_packages(self, uri, resource_uri): """ @@ -69,35 +72,31 @@ def build_packages(html_text, uri, purl): Yield Package objects built from `html_text` from the `uri` and the `purl` package URL string. """ - purl = PackageURL.from_string(purl) package_data = dict( - type='freedesktop', - name=purl.name, - version=purl.version, - homepage_url=uri + type="freedesktop", name=purl.name, version=purl.version, homepage_url=uri ) - page = BeautifulSoup(html_text, 'lxml') + page = BeautifulSoup(html_text, "lxml") if page.h1: - package_data['description'] = page.h1.string.strip() + package_data["description"] = page.h1.string.strip() - for a in page.find_all(name='a'): - link = a['href'] - if 'freedesktop.org' not in link: + for a in page.find_all(name="a"): + link = a["href"] + if "freedesktop.org" not in link: continue - if '/releases/' in link or '/dist/' in link: - package_data['download_url'] = link + if "/releases/" in link or "/dist/" in link: + package_data["download_url"] = link - if 'https://bugs.freedesktop.org/buglist.cgi' in link: - package_data['bug_tracking_url'] = link + if "https://bugs.freedesktop.org/buglist.cgi" in link: + package_data["bug_tracking_url"] = link - if 'http://cgit.freedesktop.org/' in link and 'tree/' in link: - package_data['code_view_url'] = link + if "http://cgit.freedesktop.org/" in link and "tree/" in link: + package_data["code_view_url"] = link - for li in page.find_all(name='li'): - if li.text and li.text.startswith('git://'): - package_data['vcs_url'] = form_vcs_url('git', li.text) + for li in page.find_all(name="li"): + if li.text and li.text.startswith("git://"): + package_data["vcs_url"] = form_vcs_url("git", li.text) yield scan_models.Package(**package_data) diff --git a/minecode/miners/github.py b/minecode/miners/github.py index 2052b1dc..8d0c9758 100644 --- a/minecode/miners/github.py +++ b/minecode/miners/github.py @@ -7,21 +7,22 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from datetime import date -from datetime import datetime import json import logging +from datetime import date +from datetime import datetime -from github.MainClass import Github -from packageurl import PackageURL import attr import packagedcode.models as scan_models +from github.MainClass import Github +from packageurl import PackageURL from minecode import map_router -from minecode import visit_router, seed +from minecode import seed +from minecode import visit_router +from minecode.miners import URI from minecode.miners import HttpJsonVisitor from minecode.miners import Mapper -from minecode.miners import URI from minecode.utils import form_vcs_url from minecode.utils import parse_date @@ -35,12 +36,11 @@ class GithubSeed(seed.Seeder): - def get_seeds(self): - yield 'https://api.github.com/repositories?since=0' + yield "https://api.github.com/repositories?since=0" -@visit_router.route('https://api.github.com/repositories\?since=\d+') +@visit_router.route(r"https://api.github.com/repositories\?since=\d+") class GithubReposVisitor(HttpJsonVisitor): """ Visitor to run repositories request to get all repositories by increasing since symbol 100 each loop time. 
@@ -49,30 +49,29 @@ class GithubReposVisitor(HttpJsonVisitor):
     """
 
     def get_uris(self, content):
-        repo_request_base = 'https://api.github.com/repositories?since='
+        repo_request_base = "https://api.github.com/repositories?since="
         has_content = False
         if content:
             for entry in content:
                 has_content = True
-                url = entry.get('url')
+                url = entry.get("url")
                 # Take full_name instead of name here since we want to keep more info, especially when forming the package url
                 #  "name": "grit",
                 #  "full_name": "mojombo/grit",
-                name = entry.get('full_name')
+                name = entry.get("full_name")
                 if url:
                     package_url = None
                     if name:
-                        package_url = PackageURL(
-                            type='github', name=name).to_string()
+                        package_url = PackageURL(type="github", name=name).to_string()
                     # Yield URI for GithubSingleRepoVisitor use
                     yield URI(uri=url, package_url=package_url, source_uri=self.uri)
         if not has_content:
             logger.info(
-                'The content of the response is empty, the processing might be finished for URI: {}'.format(self.uri))
+                f"The content of the response is empty, the processing might be finished for URI: {self.uri}"
+            )
         else:
             uri = self.uri
-            current_id = uri.replace(
-                'https://api.github.com/repositories?since=', '')
+            current_id = uri.replace("https://api.github.com/repositories?since=", "")
             current_id = int(current_id)
             # 100 is fixed since each page has 100 entries. Adding 100 moves on to the next page.
             new_id = current_id + 100
@@ -80,7 +79,7 @@ def get_uris(self, content):
         yield URI(uri=new_url, source_uri=self.uri)
 
 
-@visit_router.route('https://api.github.com/repos/[\w\-\.]+/[\w\-\.]+')
+@visit_router.route(r"https://api.github.com/repos/[\w\-\.]+/[\w\-\.]+")
 class GithubSingleRepoVisitor(HttpJsonVisitor):
     """
     Visitor that gets the json for one repo and adds more content fetched through the GitHub API.
@@ -93,7 +92,7 @@ def fetch(self, uri, timeout=None):
         The json itself has lots of URL info, the Github API can get content without accessing the URLs inside the json explicitly.
         The main idea is to fetch download_url...
""" - full_name = uri.replace('https://api.github.com/repos/', '') + full_name = uri.replace("https://api.github.com/repos/", "") g = Github() repo = g.get_repo(full_name) @@ -118,12 +117,12 @@ def fetch(self, uri, timeout=None): ) if repo.owner: - common_data['owner'] = repo.owner.name + common_data["owner"] = repo.owner.name if repo._issues_url: - common_data['issue_url'] = repo._issues_url.value + common_data["issue_url"] = repo._issues_url.value if repo._git_url: - common_data['git_url'] = repo._git_url.value + common_data["git_url"] = repo._git_url.value if repo.organization: repo.origanization = repo.organization.name @@ -131,24 +130,25 @@ def fetch(self, uri, timeout=None): downloads = [] if repo.get_downloads(): for download in list(repo.get_downloads()): - downloads.append(dict( - name=download.name, - url=download.url, - size=download.size, - s3_url=download.s3_url, - created_at=json_serial_date_obj(download.created_at), - download_count=download.download_count, - description=download.description, - redirect=download.redirect, - signature=download.signature, - html_url=download.html_url, - bucket=download.bucket, - acl=download.acl, - accesskeyid=download.accesskeyid, - expirationdate=json_serial_date_obj( - download.expirationdate), - )) - common_data['downloads'] = downloads + downloads.append( + dict( + name=download.name, + url=download.url, + size=download.size, + s3_url=download.s3_url, + created_at=json_serial_date_obj(download.created_at), + download_count=download.download_count, + description=download.description, + redirect=download.redirect, + signature=download.signature, + html_url=download.html_url, + bucket=download.bucket, + acl=download.acl, + accesskeyid=download.accesskeyid, + expirationdate=json_serial_date_obj(download.expirationdate), + ) + ) + common_data["downloads"] = downloads tags = [] if repo.get_tags(): @@ -159,26 +159,30 @@ def fetch(self, uri, timeout=None): zipball_url=tag.zipball_url, ) if tag.commit: - tag_info['sha1'] = tag.commit.sha + tag_info["sha1"] = tag.commit.sha tags.append(tag_info) - common_data['tags'] = tags + common_data["tags"] = tags - if not common_data.get('tags') and not common_data.get('downloads'): + if not common_data.get("tags") and not common_data.get("downloads"): # If there is no downloads and tags, let's make the download_url by forming archive/master.zip at the end # For example, the base html is: https://github.com/collectiveidea/calendar_builder # The final download_url is https://github.com/collectiveidea/calendar_builder/archive/master.zip branches_download_urls = [] - download_url_bases = '{html_url}/archive/{branch_name}.zip' + download_url_bases = "{html_url}/archive/{branch_name}.zip" if repo.get_branches(): for branch in list(repo.get_branches()): - branches_download_urls.append(download_url_bases.format( - html_url=common_data.get('html_url'), branch_name=branch.name)) - common_data['branches_download_urls'] = branches_download_urls - - common_data['labels'] = [] + branches_download_urls.append( + download_url_bases.format( + html_url=common_data.get("html_url"), + branch_name=branch.name, + ) + ) + common_data["branches_download_urls"] = branches_download_urls + + common_data["labels"] = [] if repo.get_labels(): for label in repo.get_labels(): - common_data['labels'].append(label.name) + common_data["labels"].append(label.name) return json.dumps(common_data) @@ -189,9 +193,8 @@ def json_serial_date_obj(obj): return obj.isoformat() -@map_router.route('https://api\.github\.com/repos/([^/]+)/([^/]+)') 
+@map_router.route(r"https://api\.github\.com/repos/([^/]+)/([^/]+)") class GithubMetaFileMapper(Mapper): - def get_packages(self, uri, resource_uri): """ Yield Package built from resource_uri record for a single @@ -201,7 +204,9 @@ def get_packages(self, uri, resource_uri): visited_data = resource_uri.data if not visited_data: return - return build_github_packages(visited_data, resource_uri.uri, resource_uri.package_url) + return build_github_packages( + visited_data, resource_uri.uri, resource_uri.package_url + ) def build_github_packages(visited_data, uri, purl=None): @@ -213,68 +218,75 @@ def build_github_packages(visited_data, uri, purl=None): """ visited_data = json.loads(visited_data) - full_name = visited_data['full_name'] + full_name = visited_data["full_name"] namespace, name = split_org_repo(full_name) # FIXME: when could this ever happen?? - assert name == visited_data['name'], 'build_github_packages: Inconsistent name and org for URI: ' + uri + assert name == visited_data["name"], ( + "build_github_packages: Inconsistent name and org for URI: " + uri + ) - description = visited_data['description'] + description = visited_data["description"] - vcs_url = visited_data.get('git_url'), + vcs_url = (visited_data.get("git_url"),) if vcs_url: - vcs_url = form_vcs_url('git', vcs_url) + vcs_url = form_vcs_url("git", vcs_url) package = scan_models.Package( - type='github', + type="github", namespace=namespace, name=name, description=description, - primary_language=visited_data.get('language'), - homepage_url=visited_data.get('html_url'), + primary_language=visited_data.get("language"), + homepage_url=visited_data.get("html_url"), vcs_url=vcs_url, # this size does not make sense - size=visited_data.get('size'), + size=visited_data.get("size"), ) - if visited_data.get('owner'): + if visited_data.get("owner"): package.parties = [ scan_models.Party( # FIXME: we can add the org or user URL and we can know if this # is an org or a perrsone too. 
type=scan_models.party_person, - name=visited_data.get('owner'), - role='owner') + name=visited_data.get("owner"), + role="owner", + ) ] package.set_purl(purl) - downloads = visited_data.get('downloads') or [] + downloads = visited_data.get("downloads") or [] for download in downloads: - html_url = download.get('html_url') + html_url = download.get("html_url") if html_url: # make a copy package = attr.evolve(package) package.download_url = html_url - package.size = download.get('size') - package.release_date = parse_date(download.get('created_at')) + package.size = download.get("size") + package.release_date = parse_date(download.get("created_at")) yield package - tags = visited_data.get('tags') or [] + tags = visited_data.get("tags") or [] for tag in tags: package = attr.evolve(package) - package.version = tag.get('name') - package_url = PackageURL(type='github', name=package.name, - namespace=namespace, version=tag.get('name')).to_string() - package.sha1 = tag.get('sha1') - if tag.get('tarball_url'): - package.download_url = tag.get('tarball_url') + package.version = tag.get("name") + package_url = PackageURL( + type="github", + name=package.name, + namespace=namespace, + version=tag.get("name"), + ).to_string() + package.sha1 = tag.get("sha1") + if tag.get("tarball_url"): + package.download_url = tag.get("tarball_url") package.set_purl(package_url) yield package - if tag.get('zipball_url'): - package.download_url = tag.get('zipball_url') + if tag.get("zipball_url"): + package.download_url = tag.get("zipball_url") package.set_purl(package_url) yield package - branches_download_urls = visited_data.get('branches_download_urls') or [] + branches_download_urls = visited_data.get("branches_download_urls") or [] for branches_download_url in branches_download_urls: package = attr.evolve(package) package.download_url = branches_download_url @@ -296,11 +308,11 @@ def split_org_repo(url_like): >>> split_org_repo('git://github.com/foo/bar.git') ('foo', 'bar') """ - segments = [s.strip() for s in url_like.split('/') if s.strip()] + segments = [s.strip() for s in url_like.split("/") if s.strip()] if not len(segments) >= 2: - raise ValueError('Not a GitHub-like URL: {}'.format(url_like)) + raise ValueError(f"Not a GitHub-like URL: {url_like}") org = segments[-2] name = segments[-1] - if name.endswith('.git'): - name, _, _ = name .rpartition('.git') + if name.endswith(".git"): + name, _, _ = name.rpartition(".git") return org, name diff --git a/minecode/miners/gitlab.py b/minecode/miners/gitlab.py index 478f4436..637ce681 100644 --- a/minecode/miners/gitlab.py +++ b/minecode/miners/gitlab.py @@ -9,27 +9,27 @@ import json -from packageurl import PackageURL import packagedcode.models as scan_models +from packageurl import PackageURL + +from minecode import map_router from minecode import seed -from minecode.utils import get_http_response from minecode import visit_router -from minecode import map_router -from minecode.miners import Mapper +from minecode.miners import URI from minecode.miners import HttpJsonVisitor from minecode.miners import HttpVisitor -from minecode.miners import URI +from minecode.miners import Mapper from minecode.utils import form_vcs_url +from minecode.utils import get_http_response from minecode.utils import parse_date class GitlabSeed(seed.Seeder): - def get_seeds(self): - yield 'https://gitlab.com/api/v4/projects' + yield "https://gitlab.com/api/v4/projects" -@visit_router.route('https://gitlab.com/api/v4/projects') +@visit_router.route("https://gitlab.com/api/v4/projects") 
class GitlabAPIHeaderVisitor(HttpVisitor):
     """
     Get the header of the API, and parse the page size and total pages from the
@@ -37,25 +37,27 @@ class GitlabAPIHeaderVisitor(HttpVisitor):
 
     def fetch(self, uri, timeout=10):
-        """
-        Return only the headers of the response.
-        """
+        """Return only the headers of the response."""
         return get_http_response(uri, timeout).headers
 
     def get_uris(self, content):
-        new_page_template = 'https://gitlab.com/api/v4/projects?page={next_page}&per_page={per_page}&statistics=true'
+        new_page_template = "https://gitlab.com/api/v4/projects?page={next_page}&per_page={per_page}&statistics=true"
 
-        page_size = content.get('X-Per-Page')
-        total_pages = content.get('X-Total-Pages')
+        page_size = content.get("X-Per-Page")
+        total_pages = content.get("X-Total-Pages")
         if page_size and total_pages:
             total_pages = int(total_pages)
             for i in range(total_pages):
                 # Use the loop to yield the URI of each next page for the visitor.
-                nextpage_url = new_page_template.format(next_page=i + 1, per_page=page_size)
+                nextpage_url = new_page_template.format(
+                    next_page=i + 1, per_page=page_size
+                )
                 yield URI(uri=nextpage_url, source_uri=self.uri, visited=False)
 
 
-@visit_router.route('https://gitlab.com/api/v4/projects\?page=\d+&per_page=\d+&statistics=true')
+@visit_router.route(
+    r"https://gitlab.com/api/v4/projects\?page=\d+&per_page=\d+&statistics=true"
+)
 class GitlabAPIVisitor(HttpJsonVisitor):
     """
     Return URIs from the json content of one API page returned from gitlab api.
@@ -63,7 +65,8 @@ class GitlabAPIVisitor(HttpJsonVisitor):
 
     def get_uris(self, content):
-        """Yield URIs from the json content, the passing content is the json info, the example is:
+        """
+        Yield URIs from the json content; the content passed in is the json data, for example:
         [
             {
                 "id": 6377679,
@@ -80,22 +83,26 @@ def get_uris(self, content):
         ]
         Each element in the list is a dictionary; we use the web_url for the visitor and also return the data.
         """
-
         if not content:
             # If the page is empty, just return
             return
 
         for element in content:
             # The element is one package in the list of the currently returned page.
-            url = element.get('web_url')
+            url = element.get("web_url")
             if url:
-                project_name = url.rpartition('/')[-1]
-                package_url = PackageURL(type='gitlab', name=project_name).to_string()
-                yield URI(uri=url, package_url=package_url, data=element, source_uri=self.uri, visited=False)
-
-
-@map_router.route('https://gitlab.com/.*')
+                project_name = url.rpartition("/")[-1]
+                package_url = PackageURL(type="gitlab", name=project_name).to_string()
+                yield URI(
+                    uri=url,
+                    package_url=package_url,
+                    data=element,
+                    source_uri=self.uri,
+                    visited=False,
+                )
+
+
+@map_router.route("https://gitlab.com/.*")
 class GitLabMapper(Mapper):
-
     def get_packages(self, uri, resource_uri):
         """
         Yield Package built from resource_uri record for a single
@@ -114,20 +121,20 @@ def build_packages_from_json(metadata, purl=None):
     """
     content = json.loads(metadata)
 
-    name = content.get('name')
+    name = content.get("name")
     if name:
         common_data = dict(
-            type='gitlab',
+            type="gitlab",
             name=name,
-            homepage_url=content.get('web_url'),
-            description=content.get('description'),
+            homepage_url=content.get("web_url"),
+            description=content.get("description"),
         )
-        repo_url = content.get('http_url_to_repo')
+        repo_url = content.get("http_url_to_repo")
         if repo_url:
-            repo_url = form_vcs_url('git', repo_url)
-        common_data['vcs_url'] = repo_url
-        common_data['code_view_url'] = repo_url
-        common_data['release_date'] = parse_date(content.get('created_at'))
+            repo_url = form_vcs_url("git", repo_url)
+        common_data["vcs_url"] = repo_url
+        common_data["code_view_url"] = repo_url
+        common_data["release_date"] = parse_date(content.get("created_at"))
         package = scan_models.Package(**common_data)
         package.set_purl(purl)
         yield package
diff --git a/minecode/miners/golang.py b/minecode/miners/golang.py
index d8c174a1..cee9eb7e 100644
--- a/minecode/miners/golang.py
+++ b/minecode/miners/golang.py
@@ -12,30 +12,26 @@
 from packagedcode import models as scan_models
 from packageurl import PackageURL
 
-from minecode import seed
 from minecode import map_router
+from minecode import seed
 from minecode import visit_router
+from minecode.miners import URI
 from minecode.miners import Mapper
 from minecode.miners import NonPersistentHttpVisitor
-from minecode.miners import URI
 from minecode.utils import form_vcs_url
 
 
-class GoLangSeed(seed.Seeder):
+class GoLangSeed(seed.Seeder):
     def get_seeds(self):
-        yield 'https://api.godoc.org/packages'
+        yield "https://api.godoc.org/packages"
 
 
-@visit_router.route('https://api.godoc.org/packages')
+@visit_router.route("https://api.godoc.org/packages")
 class GodocIndexVisitor(NonPersistentHttpVisitor):
-    """
-    Collect Golang URIs for packages available in the Go doc index.
-    """
+    """Collect Golang URIs for packages available in the Go doc index."""
 
     def get_uris(self, content):
-        """
-        Return URIs to search the API further for a package
-        """
+        """Return URIs to search the API further for a package"""
         seen_paths = set()
         for path, package in get_packages(content):
             package_url, path = parse_package_path(path)
@@ -45,35 +41,35 @@ def get_uris(self, content):
 
             # note the addition of a * at the end of the search string...
             # without this the returned data are sparse
-            details_url = 'https://api.godoc.org/search?q={path}*'.format(**locals())
+            details_url = "https://api.godoc.org/search?q={path}*".format(**locals())
 
             host = get_well_known_host(path)
            # If the path belongs to github/bitbucket, yield a repo too
            if host:
                # keep github, bitbucket... as type:
-            repo_type, _, _ = host.lower().partition('.')  # NOQA
-            repo_url = 'https://{namespace}/{name}'.format(**package_url.to_dict())
+                repo_type, _, _ = host.lower().partition(".")  # NOQA
+                repo_url = "https://{namespace}/{name}".format(**package_url.to_dict())
                 repo_purl = PackageURL(
                     type=repo_type,
                     namespace=package_url.namespace,
                     name=package_url.name,
-                    qualifiers=dict(package_url=package_url.to_string())
+                    qualifiers=dict(package_url=package_url.to_string()),
                 ).to_string()
                 yield URI(uri=repo_url, package_url=repo_purl, source_uri=self.uri)
 
-                yield URI(uri=details_url,
-                          package_url=package_url.to_string(),
-                          source_uri=self.uri)
+                yield URI(
+                    uri=details_url,
+                    package_url=package_url.to_string(),
+                    source_uri=self.uri,
+                )
             else:
                 yield URI(uri=details_url, package_url=package_url, source_uri=self.uri)
 
 
-@visit_router.route('https://api\.godoc\.org/search\?q=.*')
+@visit_router.route(r"https://api\.godoc\.org/search\?q=.*")
 class GodocSearchVisitor(NonPersistentHttpVisitor):
-    """
-    Collect URIs and data through the godoc searchi API.
-    """
+    """Collect URIs and data through the godoc search API."""
 
     def get_uris(self, content):
         seen_paths = set()
@@ -91,7 +87,8 @@ def get_uris(self, content):
                     source_uri=self.uri,
                     # the data contains some popcounts and a description
                     data=package,
-                    visited=True)
+                    visited=True,
+                )
 
 
 def get_packages(packages_json_location):
@@ -108,139 +105,127 @@ def get_packages(packages_json_location):
     """
     with open(packages_json_location) as f:
         data = json.load(f)
-    for package in data.get('results', []):
-        path = package['path']
+    for package in data.get("results", []):
+        path = package["path"]
         if path and not is_standard_import(path):
             yield path, package
 
 
 def is_standard_import(path):
-    """
-    Return True if a Go import path is for a standard library import
-    """
+    """Return True if a Go import path is for a standard library import"""
     standard_packages = (
-        'archive',
-        'bufio',
-        'builtin',
-        'bytes',
-        'compress',
-        'container',
-        'context',
-        'crypto',
-        'database',
-        'debug',
-        'encoding',
-        'expvar',
-        'flag',
-        'fmt',
-        'go',
-        'hash',
-        'html',
-        'image',
-        'index',
-        'io',
-        'log',
-        'math',
-        'mime',
-        'net',
-        'os',
-        'path',
-        'plugin',
-        'reflect',
-        'regexp',
-        'runtime',
-        'sort',
-        'strconv',
-        'strings',
-        'sync',
-        'syscall',
-        'testing',
-        'text',
-        'time',
-        'unsafe',
-        'golang.org/x/benchmarks',
-        'golang.org/x/blog',
-        'golang.org/x/build',
-        'golang.org/x/crypto',
-        'golang.org/x/debug',
-        'golang.org/x/image',
-        'golang.org/x/mobile',
-        'golang.org/x/net',
-        'golang.org/x/perf',
-        'golang.org/x/review',
-        'golang.org/x/sync',
-        'golang.org/x/sys',
-        'golang.org/x/text',
-        'golang.org/x/time',
-        'golang.org/x/tools',
-        'golang.org/x/tour',
-        'golang.org/x/exp'
+        "archive",
+        "bufio",
+        "builtin",
+        "bytes",
+        "compress",
+        "container",
+        "context",
+        "crypto",
+        "database",
+        "debug",
+        "encoding",
+        "expvar",
+        "flag",
+        "fmt",
+        "go",
+        "hash",
+        "html",
+        "image",
+        "index",
+        "io",
+        "log",
+        "math",
+        "mime",
+        "net",
+        "os",
+        "path",
+        "plugin",
+        "reflect",
+        "regexp",
+        "runtime",
+        "sort",
+        "strconv",
+        "strings",
+        "sync",
+        "syscall",
+        "testing",
+        "text",
+        "time",
+        "unsafe",
+        "golang.org/x/benchmarks",
+        "golang.org/x/blog",
+        "golang.org/x/build",
+        "golang.org/x/crypto",
+        "golang.org/x/debug",
+        "golang.org/x/image",
+        "golang.org/x/mobile",
+        "golang.org/x/net",
+        "golang.org/x/perf",
+        "golang.org/x/review",
+        "golang.org/x/sync",
+        "golang.org/x/sys",
+        "golang.org/x/text",
+        "golang.org/x/time",
+        "golang.org/x/tools",
+        "golang.org/x/tour",
+        "golang.org/x/exp",
     )
     return path.startswith(standard_packages)
 
 
-repo_hosters = 'bitbucket.org/', 'github.com/', 'gitlab.com/'
+repo_hosters = "bitbucket.org/", "github.com/", "gitlab.com/"
 
 
 def get_well_known_host(path):
-    """
-    Return a host if this path is from a well known hoster or None.
-    """
+    """Return a host if this path is from a well known hoster or None."""
     if path.startswith(repo_hosters):
-        host, _, _ = path.partition('.')
+        host, _, _ = path.partition(".")
         return host
 
 
 def parse_package_path(path):
-    """
-    Return a PackageURL and transformed path given a path to a Go import.
-    """
-    path = path or ''
-    segments = path.split('/')
+    """Return a PackageURL and transformed path given a path to a Go import."""
+    path = path or ""
+    segments = path.split("/")
     host = get_well_known_host(path)
     qualifiers = None
     if host:
         # keep only the first few segments
         segments = segments[:3]
-        repo_url = 'https://' + '/'.join(segments)
+        repo_url = "https://" + "/".join(segments)
         qualifiers = dict(vcs_repository=repo_url)
 
     namespace = None
     if len(segments) > 1:
         namespace = segments[:-1]
-        namespace = '/'.join(namespace)
+        namespace = "/".join(namespace)
 
     name = segments[-1]
-    path = '/'.join(segments)
+    path = "/".join(segments)
 
     package_url = PackageURL(
-        type='golang',
-        namespace=namespace,
-        name=name,
-        qualifiers=qualifiers
+        type="golang", namespace=namespace, name=name, qualifiers=qualifiers
     )
 
     return package_url, path
 
 
-@map_router.route('pkg:golang/.*')
+@map_router.route("pkg:golang/.*")
 class GolangApiDocMapper(Mapper):
-
     def get_packages(self, uri, resource_uri):
         package = json.loads(resource_uri.data)
         yield build_golang_package(package, resource_uri.package_url)
 
 
 def build_golang_package(package_data, purl):
-    """
-    Return a single Golang package
-    """
+    """Return a single Golang package"""
     package_url = PackageURL.from_string(purl)
-    vcs_url = package_url.qualifiers.get('vcs_repository')
-    homepage_url = '/'.join(['https:/', package_url.namespace, package_url.name])
-    vcs_tool = 'git' if 'github.com' in package_url.namespace else None
+    vcs_url = package_url.qualifiers.get("vcs_repository")
+    homepage_url = "/".join(["https:/", package_url.namespace, package_url.name])
+    vcs_tool = "git" if "github.com" in package_url.namespace else None
     if vcs_tool:
         vcs_url = form_vcs_url(vcs_tool, vcs_url)
     # TODO: collect stats and counter from package_data too
@@ -248,8 +233,8 @@ def build_golang_package(package_data, purl):
         name=package_url.name,
         namespace=package_url.namespace,
         type=package_url.type,
-        primary_language='Go',
-        description=package_data.get('synopsis'),
+        primary_language="Go",
+        description=package_data.get("synopsis"),
         homepage_url=homepage_url,
         vcs_url=vcs_url,
     )
diff --git a/minecode/miners/googlecode.py b/minecode/miners/googlecode.py
index 82cccb2c..b041a42b 100644
--- a/minecode/miners/googlecode.py
+++ b/minecode/miners/googlecode.py
@@ -8,39 +8,35 @@
 #
 
 import json
-
-from datetime import datetime
 import os
+from datetime import datetime
 
 from bs4 import BeautifulSoup
-
+from packagedcode import models as scan_models
 from packageurl import PackageURL
 
+from minecode import map_router
 from minecode import seed
 from minecode import visit_router
-from minecode.utils import extract_file
+from minecode.miners import URI
 from minecode.miners import HttpJsonVisitor
-from minecode.miners import NonPersistentHttpVisitor
 from minecode.miners import HttpVisitor
-from minecode.miners import URI
-from packagedcode import models as scan_models
-
-from minecode import map_router
 from minecode.miners import Mapper
+from minecode.miners import NonPersistentHttpVisitor
+from minecode.utils import extract_file
 
 
 class GooglecodeSeed(seed.Seeder):
-
     def get_seeds(self):
-        yield 'https://code.google.com/archive/search?q=domain:code.google.com'
-        yield 'https://storage.googleapis.com/google-code-archive/google-code-archive.txt.zip'
+        yield "https://code.google.com/archive/search?q=domain:code.google.com"
+        yield "https://storage.googleapis.com/google-code-archive/google-code-archive.txt.zip"
 
 
-@visit_router.route('https://storage.googleapis.com/google-code-archive/google-code-archive.txt.zip')
+@visit_router.route(
+    "https://storage.googleapis.com/google-code-archive/google-code-archive.txt.zip"
+)
 class GooglecodeArchiveVisitor(NonPersistentHttpVisitor):
-    """
-    Fetch the googlecode archive file and extract it, and read the text file and get the URLs
-    """
+    """Fetch the googlecode archive file, extract it, then read the text file inside and yield the URLs"""
 
     def get_uris(self, content):
         """
@@ -56,33 +52,45 @@ def get_uris(self, content):
         b/google-code-archive/o/v2%2Fcode.google.com%2Fhg4j%2Fproject.json?alt=media
         """
         extracted_location = extract_file(content)
-        text_file = os.path.join(extracted_location, 'google-code-archive.txt')
-        url_base = 'https://www.googleapis.com/storage/v1/b/{project_info}?alt=media'
+        text_file = os.path.join(extracted_location, "google-code-archive.txt")
+        url_base = "https://www.googleapis.com/storage/v1/b/{project_info}?alt=media"
         if os.path.exists(text_file):
             with open(text_file) as project_file:
                 for project_line in project_file:
                     if not project_line:
                         continue
                     project_line = project_line.strip()
-                    if project_line.startswith('gs://google-code-archive/v2') and project_line.endswith('/project.json'):
-                        project_line = project_line.replace('gs://google-code-archive/v2', '')
-                        package_name = project_line.replace('/project.json', '')
-                        package_url = PackageURL(type='googlecode', name=package_name.strip('/')).to_string()
-                        project_line = 'google-code-archive/o/v2' + project_line.replace('/', '%2F')
+                    if project_line.startswith(
+                        "gs://google-code-archive/v2"
+                    ) and project_line.endswith("/project.json"):
+                        project_line = project_line.replace(
+                            "gs://google-code-archive/v2", ""
+                        )
+                        package_name = project_line.replace("/project.json", "")
+                        package_url = PackageURL(
+                            type="googlecode", name=package_name.strip("/")
+                        ).to_string()
+                        project_line = (
+                            "google-code-archive/o/v2"
+                            + project_line.replace("/", "%2F")
+                        )
                         url = url_base.format(project_info=project_line)
                         yield URI(uri=url, package_url=package_url, source_uri=self.uri)
 
 
-@visit_router.route('https://www.googleapis.com/storage/v1/b/google-code-archive/o/v2.*project.json\?alt=media')
+@visit_router.route(
+    r"https://www.googleapis.com/storage/v1/b/google-code-archive/o/v2.*project.json\?alt=media"
+)
 class GoogleAPIProjectJsonVisitor(HttpJsonVisitor):
-    """
-    Fetch the json of the API URL and this will be used for mapper use.
- """ + """Fetch the json of the API URL and this will be used for mapper use.""" + pass -@visit_router.route('https://code.google.com/archive/search\?q=domain:code.google.com', - 'https://code.google.com/archive/search\?q=domain:code.google.com&page=[0-9]*') +@visit_router.route( + r"https://code.google.com/archive/search\?q=domain:code.google.com", + r"https://code.google.com/archive/search\?q=domain:code.google.com&page=[0-9]*", +) class GoogleProjectPagesVisitor(HttpVisitor): """ Parse the passing google projects list pages, and return all project json url @@ -90,78 +98,93 @@ class GoogleProjectPagesVisitor(HttpVisitor): """ def get_uris(self, content): - """ - Return URIs for pagnitions of project lists - """ - page = BeautifulSoup(content, 'lxml') - projectjson_url_template = 'https://storage.googleapis.com/google-code-archive/v2/code.google.com/{project}/project.json' - for page in page.find_all('a'): - url = page['href'] - if url and 'https://code.google.com/archive/p/' in url: - project_name = url.replace('https://code.google.com/archive/p/', '') + """Return URIs for pagnitions of project lists""" + page = BeautifulSoup(content, "lxml") + projectjson_url_template = "https://storage.googleapis.com/google-code-archive/v2/code.google.com/{project}/project.json" + for page in page.find_all("a"): + url = page["href"] + if url and "https://code.google.com/archive/p/" in url: + project_name = url.replace("https://code.google.com/archive/p/", "") project_api_url = projectjson_url_template.format(project=project_name) - package_url = PackageURL(type='googlecode', name=project_name.strip('/')).to_string() - yield URI(uri=project_api_url, package_url=package_url, source_uri=self.uri) - if page.text.startswith('Next'): + package_url = PackageURL( + type="googlecode", name=project_name.strip("/") + ).to_string() + yield URI( + uri=project_api_url, package_url=package_url, source_uri=self.uri + ) + if page.text.startswith("Next"): yield URI(uri=url, source_uri=self.uri) -@visit_router.route('https://storage.googleapis.com/google-code-archive/v2/code.google.com/.*/project.json') +@visit_router.route( + "https://storage.googleapis.com/google-code-archive/v2/code.google.com/.*/project.json" +) class GoogleProjectJsonVisitor(HttpJsonVisitor): - """ - Collect the project json for mapper use and also return the download page json url. - """ + """Collect the project json for mapper use and also return the download page json url.""" def get_uris(self, content): - """ - Return the download json URL - """ - yield URI(uri=self.uri.replace('project.json', 'downloads-page-1.json')) + """Return the download json URL""" + yield URI(uri=self.uri.replace("project.json", "downloads-page-1.json")) -@visit_router.route('https://storage.googleapis.com/google-code-archive/v2/code.google.com/.*/downloads-page-[0-9]*.json') +@visit_router.route( + "https://storage.googleapis.com/google-code-archive/v2/code.google.com/.*/downloads-page-[0-9]*.json" +) class GoogleDownloadsPageJsonVisitor(HttpJsonVisitor): - """ - Collect download URIs and the next page related to the current download page. - """ + """Collect download URIs and the next page related to the current download page.""" def get_uris(self, content): - """Yield the next download page based on current page number and total page number. + """ + Yield the next download page based on current page number and total page number. 
and yield the download urls in the json, for example: https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/hg4j/hg4j_1.2m2.jar """ url = self.uri - page_num = content.get('pageNumber') - total_pages = content.get('totalPages') - name_template = 'downloads-page-{page}.json' + page_num = content.get("pageNumber") + total_pages = content.get("totalPages") + name_template = "downloads-page-{page}.json" filename = name_template.format(page=str(page_num)) new_filename = name_template.format(page=str(page_num + 1)) assert filename in url if page_num < total_pages: new_page_url = url.replace(filename, new_filename) - yield URI(uri=new_page_url, source_uri=self.uri, ) - - download_url_template = url.replace(filename, '') + '{file_name}' - for download in content.get('downloads', []): - file_name = download.get('filename') - package_url = PackageURL(type='googlecode', name=file_name).to_string() - if '_' in file_name and '.' in file_name: - partitions = file_name.partition('_') + yield URI( + uri=new_page_url, + source_uri=self.uri, + ) + + download_url_template = url.replace(filename, "") + "{file_name}" + for download in content.get("downloads", []): + file_name = download.get("filename") + package_url = PackageURL(type="googlecode", name=file_name).to_string() + if "_" in file_name and "." in file_name: + partitions = file_name.partition("_") package_name = partitions[0] - version = partitions[-1].rpartition('.')[0] - package_url = PackageURL(type='googlecode', name=package_name, version=version).to_string() + version = partitions[-1].rpartition(".")[0] + package_url = PackageURL( + type="googlecode", name=package_name, version=version + ).to_string() download_url = download_url_template.format(file_name=file_name) last_modified_date = None - release_date = download.get('releaseDate') + release_date = download.get("releaseDate") if release_date: last_modified_date = datetime.fromtimestamp(release_date) - yield URI(uri=download_url, package_url=package_url, file_name=file_name, source_uri=self.uri, date=last_modified_date, size=download.get('fileSize'), sha1=download.get('sha1Checksum')) - - -@map_router.route('https://storage.googleapis.com/google-code-archive/v2/code.google.com/.*/project.json') + yield URI( + uri=download_url, + package_url=package_url, + file_name=file_name, + source_uri=self.uri, + date=last_modified_date, + size=download.get("fileSize"), + sha1=download.get("sha1Checksum"), + ) + + +@map_router.route( + "https://storage.googleapis.com/google-code-archive/v2/code.google.com/.*/project.json" +) class GoogleNewAPIV2ProjectJsonMapper(Mapper): - def get_packages(self, uri, resource_uri): """ Yield Packages built from resource_uri record for a single @@ -170,7 +193,9 @@ def get_packages(self, uri, resource_uri): # FIXME: JSON deserialization should be handled eventually by the # framework metadata = json.loads(resource_uri.data) - return build_packages_from_projectsjson_v2(metadata, resource_uri.package_url, uri) + return build_packages_from_projectsjson_v2( + metadata, resource_uri.package_url, uri + ) def build_packages_from_projectsjson_v2(metadata, purl=None, uri=None): @@ -181,28 +206,28 @@ def build_packages_from_projectsjson_v2(metadata, purl=None, uri=None): metadata: json metadata content from API call purl: String value of the package url of the ResourceURI object """ - short_desc = metadata.get('summary') - long_desc = metadata.get('description') + short_desc = metadata.get("summary") + long_desc = metadata.get("description") descriptions = [d 
for d in (short_desc, long_desc) if d and d.strip()]
-    description = '\n'.join(descriptions)
+    description = "\n".join(descriptions)
 
     common_data = dict(
-        datasource_id='googlecode_api_json',
-        type='googlecode',
-        name=metadata.get('name'),
-        description=description
+        datasource_id="googlecode_api_json",
+        type="googlecode",
+        name=metadata.get("name"),
+        description=description,
     )
 
-    license_name = metadata.get('license')
+    license_name = metadata.get("license")
     if license_name:
-        common_data['extracted_license_statement'] = license_name
-        common_data['license_detections'] = []
+        common_data["extracted_license_statement"] = license_name
+        common_data["license_detections"] = []
 
     keywords = []
-    labels = metadata.get('labels')
+    labels = metadata.get("labels") or []
     for label in labels:
         if label:
             keywords.append(label.strip())
-    common_data['keywords'] = keywords
+    common_data["keywords"] = keywords
 
     package = scan_models.Package.from_package_data(
         package_data=common_data,
@@ -212,9 +237,10 @@ def build_packages_from_projectsjson_v2(metadata, purl=None, uri=None):
     yield package
 
 
-@map_router.route('https://www.googleapis.com/storage/v1/b/google-code-archive/o/v2.*project.json\?alt=media')
+@map_router.route(
+    r"https://www.googleapis.com/storage/v1/b/google-code-archive/o/v2.*project.json\?alt=media"
+)
 class GoogleNewAPIV1ProjectJsonMapper(Mapper):
-
     def get_packages(self, uri, resource_uri):
         """
         Yield Packages built from resource_uri record for a single
@@ -223,36 +249,39 @@ def get_packages(self, uri, resource_uri):
         # FIXME: JSON deserialization should be handled eventually by the
         # framework
         metadata = json.loads(resource_uri.data)
-        return build_packages_from_projectsjson_v1(metadata, resource_uri.package_url, uri)
+        return build_packages_from_projectsjson_v1(
+            metadata, resource_uri.package_url, uri
+        )
 
 
 def build_packages_from_projectsjson_v1(metadata, purl=None, uri=None):
-    """Yield Package from the project.json passed by the google code v1 API
+    """
+    Yield Package from the project.json passed by the google code v1 API
     metadata: json metadata content from API call
     purl: String value of the package url of the ResourceURI object
     """
-    if metadata.get('name'):
+    if metadata.get("name"):
         common_data = dict(
             datasource_id="googlecode_json",
-            type='googlecode',
-            name=metadata.get('name'),
-            description=metadata.get('description')
+            type="googlecode",
+            name=metadata.get("name"),
+            description=metadata.get("description"),
         )
 
-        license_name = metadata.get('license')
+        license_name = metadata.get("license")
         if license_name:
-            common_data['extracted_license_statement'] = license_name
-            common_data['license_detections'] = []
+            common_data["extracted_license_statement"] = license_name
+            common_data["license_detections"] = []
 
         keywords = []
-        labels = metadata.get('labels')
+        labels = metadata.get("labels") or []
         for label in labels:
             if label:
                 keywords.append(label.strip())
-        common_data['keywords'] = keywords
+        common_data["keywords"] = keywords
 
-        common_data['vcs_url'] = metadata.get('ancestorRepo')
-        common_data['namespace'] = metadata.get('domain')
+        common_data["vcs_url"] = metadata.get("ancestorRepo")
+        common_data["namespace"] = metadata.get("domain")
 
         # createTime doesn't make sense since the timestamp value is incorrect
         # and parsing it will give a wrong year out of range.
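[Editor's note, not part of the patch] A minimal usage sketch of the v2 builder above, assuming the merged module path and the field names shown in the hunks (the payload values, the purl and the printed attributes are made-up illustrations):

    from minecode.miners.googlecode import build_packages_from_projectsjson_v2

    # Hypothetical, trimmed-down project.json payload; the field names
    # (name, summary, description, license, labels) match the mapper above.
    metadata = {
        "name": "hg4j",
        "summary": "Pure Java API for Mercurial",
        "description": "A framework to access Mercurial repositories from Java.",
        "license": "Apache License 2.0",
        "labels": ["java", "mercurial"],
    }

    # build_packages_from_projectsjson_v2() is a generator of Package objects.
    for package in build_packages_from_projectsjson_v2(
        metadata,
        purl="pkg:googlecode/hg4j",
        uri="https://storage.googleapis.com/google-code-archive/v2/code.google.com/hg4j/project.json",
    ):
        print(package.name, package.keywords)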
diff --git a/minecode/miners/gstreamer.py b/minecode/miners/gstreamer.py
index f291a339..f49a5876 100644
--- a/minecode/miners/gstreamer.py
+++ b/minecode/miners/gstreamer.py
@@ -8,18 +8,16 @@
 #
 
 from bs4 import BeautifulSoup
-
+from commoncode import fileutils
 from commoncode.fileutils import file_base_name
+from packagedcode import models as scan_models
 from packageurl import PackageURL
 
+from minecode import map_router
 from minecode import seed
 from minecode import visit_router
-from minecode.miners import HttpVisitor
 from minecode.miners import URI
-from commoncode import fileutils
-from packagedcode import models as scan_models
-
-from minecode import map_router
+from minecode.miners import HttpVisitor
 from minecode.miners import Mapper
 
 
@@ -27,10 +25,10 @@ class GstreamerSeed(seed.Seeder):
     is_active = False
 
     def get_seeds(self):
-        yield 'https://gstreamer.freedesktop.org/src/'
+        yield "https://gstreamer.freedesktop.org/src/"
 
 
-@visit_router.route('https://gstreamer.freedesktop.org/src/([\w\-\.]+/)*')
+@visit_router.route(r"https://gstreamer.freedesktop.org/src/([\w\-\.]+/)*")
 class GstreamerHTMLVisitor(HttpVisitor):
     """
     Visit the gstreamer HTML page. Yield either a URI to visit next or a URI that stands for a file resource.
@@ -40,38 +38,48 @@ class GstreamerHTMLVisitor(HttpVisitor):
 
     def get_uris(self, content):
-        page = BeautifulSoup(content, 'lxml')
-        url_template = self.uri + '{sub_path}'
-        for a in page.find_all(name='a'):
-            if 'href' not in a.attrs:
+        page = BeautifulSoup(content, "lxml")
+        url_template = self.uri + "{sub_path}"
+        for a in page.find_all(name="a"):
+            if "href" not in a.attrs:
                 continue
-            href = a['href']
+            href = a["href"]
             if href:
                 # For parent folder link or other unrelated links, ignore
-                if href.startswith('/') or href.startswith('?'):
+                if href.startswith("/") or href.startswith("?"):
                     continue
-                if href.endswith('/'):
+                if href.endswith("/"):
                    # If the path is a folder, yield it for the next visitor.
-                    yield URI(uri=url_template.format(sub_path=href), source_uri=self.uri)
+                    yield URI(
+                        uri=url_template.format(sub_path=href), source_uri=self.uri
+                    )
                 else:
                     # If it's the file resource, form the package_url and yield the URI with package url info
                     # For example: gst-openmax-0.10.0.4.tar.bz2
                     file_name = href
                     file_name_without_prefix = file_base_name(file_name)
-                    if '-' in file_name_without_prefix:
-                        project_name_versions = file_name.rpartition('-')
+                    if "-" in file_name_without_prefix:
+                        project_name_versions = file_name.rpartition("-")
                         project_name = project_name_versions[0]
                         version = project_name_versions[-1]
                     else:
                         project_name = file_name
                         version = None
-                    package_url = PackageURL(type='gstreamer', name=project_name, version=version).to_string()
-                    yield URI(uri=url_template.format(sub_path=href), package_url=package_url, file_name=file_name, source_uri=self.uri)
+                    package_url = PackageURL(
+                        type="gstreamer", name=project_name, version=version
+                    ).to_string()
+                    yield URI(
+                        uri=url_template.format(sub_path=href),
+                        package_url=package_url,
+                        file_name=file_name,
+                        source_uri=self.uri,
+                    )
 
 
-@map_router.route('https://gstreamer.freedesktop.org/src/([\w\-\.]+/)*[\w\-\.]+[.tar\.bz2\\.gz|\.tar\.xz]')
+@map_router.route(
+    r"https://gstreamer.freedesktop.org/src/([\w\-\.]+/)*[\w\-\.]+(\.tar\.bz2|\.tar\.gz|\.tar\.xz)"
+)
 class GstreamerURLMapper(Mapper):
-
     def get_packages(self, uri, resource_uri):
         """
         Yield Package built from resource_uri record for a single
@@ -88,17 +96,17 @@ def build_package_from_url(uri, purl=None):
     """
     file_name = fileutils.file_name(uri)
     file_name_without_prefix = file_name
-    prefixes = ('.tar.bz2', '.tar.gz', '.tar.xz')
+    prefixes = (".tar.bz2", ".tar.gz", ".tar.xz")
     for prefix in prefixes:
-        file_name_without_prefix = file_name_without_prefix.replace(prefix, '')
-    if '-' in file_name_without_prefix:
-        project_name, _, version = file_name.rpartition('-')
+        file_name_without_prefix = file_name_without_prefix.replace(prefix, "")
+    if "-" in file_name_without_prefix:
+        project_name, _, version = file_name_without_prefix.rpartition("-")
         common_data = dict(
-            type='gstreamer',
+            type="gstreamer",
             name=project_name,
             version=version,
             download_url=uri,
-            homepage_url='https://gstreamer.freedesktop.org'
+            homepage_url="https://gstreamer.freedesktop.org",
         )
         package = scan_models.Package(**common_data)
         package.set_purl(purl)
diff --git a/minecode/miners/haxe.py b/minecode/miners/haxe.py
index 83a33fdf..cd6e7f1e 100644
--- a/minecode/miners/haxe.py
+++ b/minecode/miners/haxe.py
@@ -10,50 +10,51 @@
 import json
 
 from bs4 import BeautifulSoup
-
 from packagedcode.haxe import HaxelibJsonHandler
 from packageurl import PackageURL
 
+from minecode import map_router
 from minecode import seed
 from minecode import visit_router
-from minecode import map_router
-from minecode.miners import Mapper
+from minecode.miners import URI
 from minecode.miners import HttpJsonVisitor
 from minecode.miners import HttpVisitor
-from minecode.miners import URI
+from minecode.miners import Mapper
 
 
 class HaxeSeed(seed.Seeder):
     is_active = False
 
     def get_seeds(self):
-        yield 'https://lib.haxe.org/all'
+        yield "https://lib.haxe.org/all"
 
 
-@visit_router.route('https://lib.haxe.org/all')
+@visit_router.route("https://lib.haxe.org/all")
 class HaxeProjectsVisitor(HttpVisitor):
-    """
-    Visit the Haxe all projects page and yield uri of each project.
- """ + """Visit the Haxe all projects page and yield uri of each project.""" def get_uris(self, content): """ Parse the HTML to get project name, and format the url with this project name into a version URL. For example: https://lib.haxe.org/p/openfl/versions/ """ - version_url_tempalte = 'https://lib.haxe.org{project_href}versions' - page = BeautifulSoup(content, 'lxml') - for a in page.find_all(name='a'): - if 'href' not in a.attrs: + version_url_tempalte = "https://lib.haxe.org{project_href}versions" + page = BeautifulSoup(content, "lxml") + for a in page.find_all(name="a"): + if "href" not in a.attrs: continue - href = a['href'] - if href and href.startswith('/p/'): - project_name = href.replace('/p', '').rstrip('/') - package_url = PackageURL(type='haxe', name=project_name).to_string() - yield URI(uri=version_url_tempalte.format(project_href=href), package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://lib.haxe.org/p/[\w\-\.]+/versions') + href = a["href"] + if href and href.startswith("/p/"): + project_name = href.replace("/p", "").rstrip("/") + package_url = PackageURL(type="haxe", name=project_name).to_string() + yield URI( + uri=version_url_tempalte.format(project_href=href), + package_url=package_url, + source_uri=self.uri, + ) + + +@visit_router.route(r"https://lib.haxe.org/p/[\w\-\.]+/versions") class HaxeVersionsVisitor(HttpVisitor): """ Visit the version page of a project and yield uri of each version. @@ -61,42 +62,48 @@ class HaxeVersionsVisitor(HttpVisitor): """ def get_uris(self, content): - """ - Yield haxelib json URL based on specified version, for example: https://lib.haxe.org/p/openfl/8.6.4/raw-files/openfl/package.json - """ - version_url_tempalte = 'https://lib.haxe.org/p/{project}/{version}/raw-files/{project}/package.json' - page = BeautifulSoup(content, 'lxml') - for a in page.find_all(name='a'): - if 'href' not in a.attrs: + """Yield haxelib json URL based on specified version, for example: https://lib.haxe.org/p/openfl/8.6.4/raw-files/openfl/package.json""" + version_url_tempalte = "https://lib.haxe.org/p/{project}/{version}/raw-files/{project}/package.json" + page = BeautifulSoup(content, "lxml") + for a in page.find_all(name="a"): + if "href" not in a.attrs: continue - href = a['href'] - if href and href.startswith('/p/') and href.endswith('/'): + href = a["href"] + if href and href.startswith("/p/") and href.endswith("/"): # Parse if the href contains the versino info: - project_version = href.replace('/p/', '').rstrip('/') - project_version = project_version.split('/') + project_version = href.replace("/p/", "").rstrip("/") + project_version = project_version.split("/") if len(project_version) == 2: # if there is only one slash between project and version, openfl/8.6.3 project = project_version[0] version = project_version[1] - package_url = PackageURL(type='haxe', name=project, version=version).to_string() - yield URI(uri=version_url_tempalte.format(project=project, version=version), package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://lib.haxe.org/p/[\w\-\.]+/[\w\-\.]+/raw-files/[\w\-\.]+/package.json') + package_url = PackageURL( + type="haxe", name=project, version=version + ).to_string() + yield URI( + uri=version_url_tempalte.format( + project=project, version=version + ), + package_url=package_url, + source_uri=self.uri, + ) + + +@visit_router.route( + r"https://lib.haxe.org/p/[\w\-\.]+/[\w\-\.]+/raw-files/[\w\-\.]+/package.json" +) class HaxePackageJsonVisitor(HttpJsonVisitor): - """ - Empty 
Visitor to get the package json content only. - """ + """Empty Visitor to get the package json content only.""" + pass -@map_router.route('https://lib.haxe.org/p/[\w\-\.]+/[\w\-\.]+/raw-files/[\w\-\.]+/package.json') +@map_router.route( + r"https://lib.haxe.org/p/[\w\-\.]+/[\w\-\.]+/raw-files/[\w\-\.]+/package.json" +) class HaxePackageJsonMapper(Mapper): - def get_packages(self, uri, resource_uri): - """ - Yield Package built from package json file. - """ + """Yield Package built from package json file.""" # FIXME: JSON deserialization should be handled eventually by the framework metadata = json.loads(resource_uri.data) return build_packages_with_json(metadata, resource_uri.package_url) diff --git a/minecode/miners/java_stream.py b/minecode/miners/java_stream.py index 94a766b2..52480264 100644 --- a/minecode/miners/java_stream.py +++ b/minecode/miners/java_stream.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # The MIT License (MIT) # @@ -23,14 +22,12 @@ # SOFTWARE. -""" -Reading from Java DataInputStream format. -""" +"""Reading from Java DataInputStream format.""" import struct -class DataInputStream(object): +class DataInputStream: def __init__(self, stream): self.stream = stream @@ -42,14 +39,14 @@ def read(self, n=1): return data def read_byte(self): - return struct.unpack('b', self.read(1))[0] + return struct.unpack("b", self.read(1))[0] def read_long(self): - return struct.unpack('>q', self.read(8))[0] + return struct.unpack(">q", self.read(8))[0] def read_utf(self): - utf_length = struct.unpack('>H', self.read(2))[0] + utf_length = struct.unpack(">H", self.read(2))[0] return self.read(utf_length) def read_int(self): - return struct.unpack('>i', self.read(4))[0] + return struct.unpack(">i", self.read(4))[0] diff --git a/minecode/miners/maven.py b/minecode/miners/maven.py index 66e38abb..fc74662e 100644 --- a/minecode/miners/maven.py +++ b/minecode/miners/maven.py @@ -7,36 +7,35 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
#
 
-from collections import namedtuple
 import gzip
 import io
 import json
 import logging
 import os
+from collections import namedtuple
 
+import arrow
+import javaproperties
 import packageurl
-from commoncode.text import as_unicode
-from packagedcode.models import PackageData
 from bs4 import BeautifulSoup
+from commoncode.text import as_unicode
 from dateutil import tz
-import arrow
-from packageurl import PackageURL
 from jawa.util.utf import decode_modified_utf8
-import javaproperties
-
-from packageurl import PackageURL
+from packagedcode.maven import _parse
 from packagedcode.maven import build_filename
 from packagedcode.maven import build_url
-from packagedcode.maven import _parse
+from packagedcode.models import PackageData
+from packageurl import PackageURL
 
+from minecode import map_router
 from minecode import seed
 from minecode import visit_router
-from minecode.miners import java_stream
+from minecode.miners import URI
 from minecode.miners import HttpVisitor
+from minecode.miners import Mapper
 from minecode.miners import NonPersistentHttpVisitor
-from minecode.miners import URI
-from minecode import map_router
+from minecode.miners import java_stream
 from minecode.utils import parse_date
-from minecode.miners import Mapper
 
 """
 This module handles the Maven repositories such as central and other
@@ -55,11 +54,12 @@
 
 if TRACE:
     import sys
+
     logging.basicConfig(stream=sys.stdout)
     logger.setLevel(logging.DEBUG)
 
 
-MAVEN_BASE_URL = 'https://repo1.maven.org/maven2'
+MAVEN_BASE_URL = "https://repo1.maven.org/maven2"
 
 
 class GzipFileWithTrailing(gzip.GzipFile):
@@ -67,9 +67,10 @@ class GzipFileWithTrailing(gzip.GzipFile):
     A subclass of gzip.GzipFile supporting files with trailing garbage. Ignore
     the garbage.
     """
+
     # TODO: what is first_file??
     first_file = True
-    gzip_magic = b'\037\213'
+    gzip_magic = b"\037\213"
     has_trailing_garbage = False
 
     def _read_gzip_header(self):
@@ -81,17 +82,16 @@ def _read_gzip_header(self):
         if is_gzip and not self.first_file:
             self.first_file = False
             self.has_trailing_garbage = True
-            raise EOFError('Trailing garbage found')
+            raise EOFError("Trailing garbage found")
 
         self.first_file = False
         gzip.GzipFile._read_gzip_header(self)
 
 
 class MavenSeed(seed.Seeder):
-
     def get_seeds(self):
-        yield 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz'
-        yield 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties'
+        yield "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz"
+        yield "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties"
        # yield 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.457.gz'
        # yield 'http://jcenter.bintray.com/'
        # yield 'https://repo2.maven.org/maven2/.index/nexus-maven-repository-index.gz'
@@ -105,12 +105,14 @@ def get_seeds(self):
 #     also has a npm mirrors: https://maven-eu.nuxeo.org/nexus/#view-repositories;npmjs~browsestorage
 
 
-@visit_router.route('http://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties')
-@visit_router.route('https://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties')
+@visit_router.route(
+    r"http://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties"
+)
+@visit_router.route(
+    r"https://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties"
+)
 class MavenNexusPropertiesVisitor(NonPersistentHttpVisitor):
-    """
-    Fetch the property files, parse the create the URI for each increment index
-    """
+    """Fetch the properties file, parse it and create one URI for each incremental index"""
 
     def get_uris(self, content):
         """
@@ -122,13 +124,12 @@ def get_uris(self, content):
        Each value points to a fragment incremental index that has the same format as the bigger one.
         """
-
-        base_url = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.{index}.gz'
+        base_url = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.{index}.gz"
 
         with open(content) as config_file:
             properties = javaproperties.load(config_file) or {}
 
         for key, increment_index in properties.items():
-            if key.startswith('nexus.index.incremental'):
+            if key.startswith("nexus.index.incremental"):
                 yield URI(
                     uri=base_url.format(index=increment_index),
                     source_uri=self.uri,
@@ -136,9 +137,10 @@ def get_uris(self, content):
 
 @visit_router.route(
-    'https?://.*/nexus-maven-repository-index.gz',
+    "https?://.*/nexus-maven-repository-index.gz",
     # increments
-    'https?://.*/nexus-maven-repository-index\.\d+\.gz')
+    r"https?://.*/nexus-maven-repository-index\.\d+\.gz",
+)
 class MavenNexusIndexVisitor(NonPersistentHttpVisitor):
     """
     Download and process a Nexus Maven index file.
@@ -156,8 +158,7 @@ def get_uris(self, content):
         """
         index_location = content
 
-        artifacts = get_artifacts(
-            index_location, worthyness=is_worthy_artifact)
+        artifacts = get_artifacts(index_location, worthyness=is_worthy_artifact)
 
         for artifact in artifacts:
             # we cannot do much without these
@@ -170,15 +171,15 @@ def get_uris(self, content):
                 continue
 
             qualifiers = {}
-            if extension and extension != 'jar':
-                qualifiers['type'] = extension
+            if extension and extension != "jar":
+                qualifiers["type"] = extension
 
             classifier = artifact.classifier
             if classifier:
-                qualifiers['classifier'] = classifier
+                qualifiers["classifier"] = classifier
 
             package_url = PackageURL(
-                type='maven',
+                type="maven",
                 namespace=group_id,
                 name=artifact_id,
                 version=version,
@@ -192,19 +193,24 @@ def get_uris(self, content):
            # instead together with the filename... especially we could use
            # different REPOs.
             jar_download_url, file_name = build_url_and_filename(
-                group_id, artifact_id, version, extension, classifier)
+                group_id, artifact_id, version, extension, classifier
+            )
 
             # FIXME: should this be set in the yielded URI too
             last_mod = artifact.last_modified
 
             # We yield a pre-visited URI for each JAR
             mock_maven_index_uri = build_url(
-                group_id, artifact_id, version, file_name,
-                base_url='maven-index://repo1.maven.org')
+                group_id,
+                artifact_id,
+                version,
+                file_name,
+                base_url="maven-index://repo1.maven.org",
+            )
 
             artifact_data = artifact.to_dict()
-            artifact_data['download_url'] = jar_download_url
-            artifact_as_json = json.dumps(artifact_data, separators=(',', ':'))
+            artifact_data["download_url"] = jar_download_url
+            artifact_as_json = json.dumps(artifact_data, separators=(",", ":"))
 
             yield URI(
                # this is the Maven index URI
@@ -222,7 +228,7 @@ def get_uris(self, content):
             )
 
             package_url = PackageURL(
-                type='maven',
+                type="maven",
                 namespace=group_id,
                 name=artifact_id,
                 version=version,
@@ -232,7 +238,8 @@ def get_uris(self, content):
            # the POM of a Jar in the repo. Only for Parent POMs
Only for Parent POMs # therefore we create a download with the pomextension pom_download_url, pom_file_name = build_url_and_filename( - group_id, artifact_id, version, extension='pom', classifier='') + group_id, artifact_id, version, extension="pom", classifier="" + ) yield URI( # this is the Maven index index URI source_uri=self.uri, @@ -247,7 +254,7 @@ def get_uris(self, content): ) -@visit_router.route('https?://jcenter\.bintray\.com/(.+/)*') +@visit_router.route(r"https?://jcenter\.bintray\.com/(.+/)*") class MavenHTMLPageVisitor(HttpVisitor): """ Parse the HTML page and yield all necessary uris from the page and its sub pages. @@ -256,17 +263,17 @@ class MavenHTMLPageVisitor(HttpVisitor): """ def get_uris(self, content): - page = BeautifulSoup(content, 'lxml') - for pre in page.find_all(name='pre'): - for a in pre.find_all(name='a'): - url = a.get('href') + page = BeautifulSoup(content, "lxml") + for pre in page.find_all(name="pre"): + for a in pre.find_all(name="a"): + url = a.get("href") if not url: continue # Remove : symbol since it's a special char for bintray repo. - if url.startswith(':'): + if url.startswith(":"): url = url[1:] filename = None # default is folder, the filename is None. - if not url.endswith('/'): + if not url.endswith("/"): # a file filename = url yield URI( @@ -277,11 +284,9 @@ def get_uris(self, content): ) -@visit_router.route('https?://.*/maven-metadata\.xml') +@visit_router.route(r"https?://.*/maven-metadata\.xml") class MavenMetaDataVisitor(HttpVisitor): - """ - Parse the maven-metadata.xml file and yield uris of jars and pom. - """ + """Parse the maven-metadata.xml file and yield uris of jars and pom.""" def get_uris(self, content): # FIXME this may not be correct. The only thing we can infer from the maven @@ -289,34 +294,32 @@ def get_uris(self, content): # The actual download files likely need to be obtained from directory listing # or infered from parsing the POM??? - base_url = self.uri.partition('maven-metadata.xml')[0] + '{version}/' - pom_url = base_url + '{artifactId}-{version}.pom' + base_url = self.uri.partition("maven-metadata.xml")[0] + "{version}/" + pom_url = base_url + "{artifactId}-{version}.pom" # FIXME: this may not exist and or with another extension?? and this should be PREVISITED - jar_url = base_url + '{artifactId}-{version}.jar' + jar_url = base_url + "{artifactId}-{version}.jar" # FIXME: sources may not exists?? and this should be PREVISITED - source_url = base_url + '{artifactId}-{version}-sources.jar' + source_url = base_url + "{artifactId}-{version}-sources.jar" # FIXME: why use BeautifulSoup for valid XML??? 
- page = BeautifulSoup(content, 'lxml-xml') + page = BeautifulSoup(content, "lxml-xml") - group_id = page.find(name='groupId') - artifact_id = page.find(name='artifactId') + group_id = page.find(name="groupId") + artifact_id = page.find(name="artifactId") if not (group_id and artifact_id): return group_id = group_id.string artifact_id = artifact_id.string - for version in page.find_all('version'): + for version in page.find_all("version"): version = version.string # FIXME: we may not get the proper extensions and classifiers and miss the qualifiers package_url = PackageURL( - type='maven', - namespace=group_id, - name=artifact_id, - version=version).to_string() + type="maven", namespace=group_id, name=artifact_id, version=version + ).to_string() # the JAR proper as previsited yield URI( @@ -344,8 +347,14 @@ def get_uris(self, content): # TODO: consider switching to HTTPS -def build_url_and_filename(group_id, artifact_id, version, extension, classifier, - base_repo_url='https://repo1.maven.org/maven2'): +def build_url_and_filename( + group_id, + artifact_id, + version, + extension, + classifier, + base_repo_url="https://repo1.maven.org/maven2", +): """ Return a tuple of (url, filename) for the download URL of a Maven artifact built from its coordinates. @@ -356,23 +365,25 @@ def build_url_and_filename(group_id, artifact_id, version, extension, classifier # TODO: consider switching to HTTPS -def build_maven_xml_url(group_id, artifact_id, - base_repo_url='https://repo1.maven.org/maven2'): +def build_maven_xml_url( + group_id, artifact_id, base_repo_url="https://repo1.maven.org/maven2" +): """ Return a download URL for a Maven artifact built from its coordinates. """ - group_id = group_id.replace('.', '/') - path = '{group_id}/{artifact_id}'.format(**locals()) - return '{base_repo_url}/{path}/maven-metadata.xml'.format(**locals()) + group_id = group_id.replace(".", "/") + path = "{group_id}/{artifact_id}".format(**locals()) + return "{base_repo_url}/{path}/maven-metadata.xml".format(**locals()) -@visit_router.route('https?://repo1.maven.org/maven2/.*\.pom') +@visit_router.route(r"https?://repo1.maven.org/maven2/.*\.pom") class MavenPOMVisitor(HttpVisitor): """ Visit a POM. The POM XML is stored as data and there is nothing special to do for this visitor. 
""" + pass @@ -402,38 +413,40 @@ def is_worthy_artifact(artifact): ejb-client jar ejb client java test-jar jar jar tests java """ - if artifact.version == 'archetypes': + if artifact.version == "archetypes": # we skip these entirely, they have a different shape return - worthy_ext_pack = set([ - # packaging, classifier, extension - (u'jar', u'sources', u'jar'), - (u'jar', None, u'jar'), - (u'bundle', None, u'jar'), - (u'war', None, u'war'), - (u'zip', u'source-release', u'zip'), - (u'maven-plugin', None, u'jar'), - (u'aar', None, u'aar'), - (u'jar', u'sources-commercial', u'jar'), - (u'zip', u'src', u'zip'), - (u'tar.gz', u'src', u'tar.gz'), - (u'jar', None, u'zip'), - (u'zip', u'project-src', u'zip'), - (u'jar', u'src', u'jar'), - ]) - - return (artifact.packaging, - artifact.classifier, - artifact.extension,) in worthy_ext_pack + worthy_ext_pack = set( + [ + # packaging, classifier, extension + ("jar", "sources", "jar"), + ("jar", None, "jar"), + ("bundle", None, "jar"), + ("war", None, "war"), + ("zip", "source-release", "zip"), + ("maven-plugin", None, "jar"), + ("aar", None, "aar"), + ("jar", "sources-commercial", "jar"), + ("zip", "src", "zip"), + ("tar.gz", "src", "tar.gz"), + ("jar", None, "zip"), + ("zip", "project-src", "zip"), + ("jar", "src", "jar"), + ] + ) + + return ( + artifact.packaging, + artifact.classifier, + artifact.extension, + ) in worthy_ext_pack def is_source(classifier): - """ - Return True if the `artifact` Artifact is a source artifact. + """Return True if the `artifact` Artifact is a source artifact.""" + return classifier and ("source" in classifier or "src" in classifier) - """ - return classifier and ('source' in classifier or 'src' in classifier) ######################################################################## # DOCUMENTAION OF the FIELDS aka. Records: @@ -458,56 +471,54 @@ def is_source(classifier): ENTRY_FIELDS = { - 'u': 'Artifact UINFO: Unique groupId, artifactId, version, classifier, extension (or packaging). using', - 'i': 'Artifact INFO: data using | separator', - '1': 'Artifact SHA1 checksum, hex encoded as in sha1sum', - 'm': 'Artifact record last modified, a long as a string representing a Java time for the entry record', - 'n': 'Artifact name', - 'd': 'Artifact description', + "u": "Artifact UINFO: Unique groupId, artifactId, version, classifier, extension (or packaging). using", + "i": "Artifact INFO: data using | separator", + "1": "Artifact SHA1 checksum, hex encoded as in sha1sum", + "m": "Artifact record last modified, a long as a string representing a Java time for the entry record", + "n": "Artifact name", + "d": "Artifact description", } # we IGNORE these fields for now. They can be included optionally. ENTRY_FIELDS_OTHER = { # rarely present, mostly is repos other than central - 'c': 'Artifact Classes (tokenized on newlines only) a list of LF-separated paths, without .class extension', - - 'sha256': 'sha256 of artifact? part of OSGI?', - + "c": "Artifact Classes (tokenized on newlines only) a list of LF-separated paths, without .class extension", + "sha256": "sha256 of artifact? 
part of OSGI?", # OSGI stuffs, not always there but could be useful metadata - 'Bundle-SymbolicName': 'Bundle-SymbolicName (indexed, stored)', - 'Bundle-Version': 'Bundle-Version (indexed, stored)', - 'Bundle-Description': 'Bundle-Description (indexed, stored)', - 'Bundle-Name': 'Bundle-Name (indexed, stored)', - 'Bundle-License': 'Bundle-License (indexed, stored)', - 'Bundle-DocURL': 'Bundle-DocURL (indexed, stored)', - 'Require-Bundle': 'Require-Bundle (indexed, stored)', + "Bundle-SymbolicName": "Bundle-SymbolicName (indexed, stored)", + "Bundle-Version": "Bundle-Version (indexed, stored)", + "Bundle-Description": "Bundle-Description (indexed, stored)", + "Bundle-Name": "Bundle-Name (indexed, stored)", + "Bundle-License": "Bundle-License (indexed, stored)", + "Bundle-DocURL": "Bundle-DocURL (indexed, stored)", + "Require-Bundle": "Require-Bundle (indexed, stored)", } # we ignore these fields entirely for now. ENTRY_FIELDS_IGNORED = { - - 'IDXINFO': '', - 'DESCRIPTOR': '', - - 'allGroups': '', - 'allGroupsList': '', - 'rootGroups': '', - 'rootGroupsList': '', - + "IDXINFO": "", + "DESCRIPTOR": "", + "allGroups": "", + "allGroupsList": "", + "rootGroups": "", + "rootGroupsList": "", # FIXME: we should deal with these - 'del': 'Deleted marker, will contain UINFO if document is deleted from index', - - 'Export-Package': 'Export-Package (indexed, stored)', - 'Export-Service': 'Export-Service (indexed, stored)', - 'Import-Package': 'Import-Package (indexed, stored)', + "del": "Deleted marker, will contain UINFO if document is deleted from index", + "Export-Package": "Export-Package (indexed, stored)", + "Export-Service": "Export-Service (indexed, stored)", + "Import-Package": "Import-Package (indexed, stored)", # maven-plugin stuffs - 'px': 'MavenPlugin prefix (as keyword, stored)', - 'gx': 'MavenPlugin goals (as keyword, stored)', + "px": "MavenPlugin prefix (as keyword, stored)", + "gx": "MavenPlugin goals (as keyword, stored)", } -def get_artifacts(location, fields=frozenset(ENTRY_FIELDS), - worthyness=is_worthy_artifact, include_all=False): +def get_artifacts( + location, + fields=frozenset(ENTRY_FIELDS), + worthyness=is_worthy_artifact, + include_all=False, +): """ Yield artifact mappings from a Gzipped Maven nexus index data file at location. @@ -521,26 +532,26 @@ def get_artifacts(location, fields=frozenset(ENTRY_FIELDS), _artifact_base_fields = ( - 'group_id', - 'artifact_id', - 'version', - 'packaging', - 'classifier', - 'extension', - 'last_modified', - 'size', - 'sha1', - 'name', - 'description', - 'src_exist', - 'jdoc_exist', - 'sig_exist', + "group_id", + "artifact_id", + "version", + "packaging", + "classifier", + "extension", + "last_modified", + "size", + "sha1", + "name", + "description", + "src_exist", + "jdoc_exist", + "sig_exist", ) _artifact_extended_fields = ( - 'sha256', - 'osgi', - 'classes', + "sha256", + "osgi", + "classes", ) # FIXME: named tuples are suboptimal here for a simple dictionary @@ -550,11 +561,12 @@ def to_dict(self): return self._asdict() -Artifact = namedtuple('Artifact', _artifact_base_fields) +Artifact = namedtuple("Artifact", _artifact_base_fields) Artifact.to_dict = to_dict ArtifactExtended = namedtuple( - 'ArtifactExtended', _artifact_base_fields + _artifact_extended_fields) + "ArtifactExtended", _artifact_base_fields + _artifact_extended_fields +) ArtifactExtended.to_dict = to_dict @@ -563,15 +575,14 @@ def build_artifact(entry, include_all=False): Return a Maven artifact mapping collected from a single entry mapping or None. 
""" - - SEP = '|' - NA = 'NA' - NULL = 'null' + SEP = "|" + NA = "NA" + NULL = "null" # UINFO # See org.apache.maven.index.reader.RecordExpander.expandUinfo # See org.apache.maven.index.creator.MinimalArtifactInfoIndexCreator.updateArtifactInfo - uinfo = entry.get('u') + uinfo = entry.get("u") if not uinfo: # not much we can do without this return @@ -601,7 +612,7 @@ def build_artifact(entry, include_all=False): jdoc_exist = False sig_exist = False - info = entry.get('i') + info = entry.get("i") if info: info = info.split(SEP) @@ -612,7 +623,7 @@ def build_artifact(entry, include_all=False): # this is the artifact last modified # create a date/time stamp string from a long as a string lm = info[1] - if lm and lm.isdigit() and lm != '0': + if lm and lm.isdigit() and lm != "0": last_modified = java_time_ts(int(lm)) size = info[2] @@ -622,7 +633,7 @@ def build_artifact(entry, include_all=False): # not present locally: '0': False, # present locally: '1': True, ==> the only one we care for # not available: '2': False, - PRESENT = '1' + PRESENT = "1" src_exist = info[3] == PRESENT jdoc_exist = info[4] == PRESENT @@ -630,49 +641,69 @@ def build_artifact(entry, include_all=False): extension = info[6] else: # FIXME: is this likely incorrect see worthyness check - if classifier or packaging in ('pom', 'war', 'ear'): + if classifier or packaging in ("pom", "war", "ear"): extension = packaging else: - extension = 'jar' + extension = "jar" sig_exist = info[5] == PRESENT # other MISC fields - sha1 = entry.get('1') - name = entry.get('n') - description = entry.get('d') + sha1 = entry.get("1") + name = entry.get("n") + description = entry.get("d") if not include_all: artifact = Artifact( - group_id=gid, artifact_id=aid, version=version, - packaging=packaging, classifier=classifier, extension=extension, - last_modified=last_modified, size=size, sha1=sha1, - name=name, description=description, - src_exist=src_exist, jdoc_exist=jdoc_exist, sig_exist=sig_exist, + group_id=gid, + artifact_id=aid, + version=version, + packaging=packaging, + classifier=classifier, + extension=extension, + last_modified=last_modified, + size=size, + sha1=sha1, + name=name, + description=description, + src_exist=src_exist, + jdoc_exist=jdoc_exist, + sig_exist=sig_exist, ) else: # TODO: should this be part of the base set? - sha256 = entry.get('sha256') + sha256 = entry.get("sha256") # OSGI: Rarely there. 
Note that we ignore 'Export-', 'Import-', on
        # purpose: these are big and messy for now
        osgi = dict()
        for key, value in entry.items():
-            if key.startswith('Bundle-') and value:
+            if key.startswith("Bundle-") and value:
                # TODO: could also include 'Require-Bundle'
                osgi[key] = value.strip()
 
        # Classes: Rarely there, but eventually useful in the future
        # Can be quite big too
-        classes = entry.get('c', '').splitlines(False)
+        classes = entry.get("c", "").splitlines(False)
 
        artifact = ArtifactExtended(
-            group_id=gid, artifact_id=aid, version=version,
-            packaging=packaging, classifier=classifier, extension=extension,
-            last_modified=last_modified, size=size, sha1=sha1,
-            name=name, description=description,
-            src_exist=src_exist, jdoc_exist=jdoc_exist, sig_exist=sig_exist,
-            sha256=sha256, osgi=osgi, classes=classes
+            group_id=gid,
+            artifact_id=aid,
+            version=version,
+            packaging=packaging,
+            classifier=classifier,
+            extension=extension,
+            last_modified=last_modified,
+            size=size,
+            sha1=sha1,
+            name=name,
+            description=description,
+            src_exist=src_exist,
+            jdoc_exist=jdoc_exist,
+            sig_exist=sig_exist,
+            sha256=sha256,
+            osgi=osgi,
+            classes=classes,
        )
 
    return artifact
@@ -690,7 +721,7 @@ def get_entries(location, fields=frozenset(ENTRY_FIELDS)):
    keys = set()
    keys_update = keys.update
 
-    with GzipFileWithTrailing(location, 'rb') as compressed:
+    with GzipFileWithTrailing(location, "rb") as compressed:
        # using io.BufferedReader for increased perfs
        with io.BufferedReader(compressed, buffer_size=buffer_size) as nexus_index:
            jstream = java_stream.DataInputStream(nexus_index)
@@ -712,10 +743,14 @@ def get_entries(location, fields=frozenset(ENTRY_FIELDS)):
            except EOFError:
                if TRACE_DEEP:
                    print(
-                        'Index version: %(_index_version)r last_modified: %(_last_modified)r' % locals())
+                        "Index version: %(_index_version)r last_modified: %(_last_modified)r"
+                        % locals()
+                    )
                    print(
-                        'Processed %(entries_count)d docs. Last entry: %(entry)r' % locals())
-                    print('Unique keys:')
+                        "Processed %(entries_count)d docs. Last entry: %(entry)r"
+                        % locals()
+                    )
+                    print("Unique keys:")
                    for k in sorted(keys):
                        print(k)
                    break
@@ -728,11 +763,10 @@ def decode_index_header(jstream):
    and last_updated_date is a UTC ISO timestamp string or an empty
    string.
    """
-
-# this.chunkName = chunkName.trim();
-# this.dataInputStream = new DataInputStream( new GZIPInputStream( inputStream, 2 * 1024 ) );
-# this.version = ( (int) dataInputStream.readByte() ) & 0xff;
-# this.timestamp = new Date( dataInputStream.readLong() );
+    # this.chunkName = chunkName.trim();
+    # this.dataInputStream = new DataInputStream( new GZIPInputStream( inputStream, 2 * 1024 ) );
+    # this.version = ( (int) dataInputStream.readByte() ) & 0xff;
+    # this.timestamp = new Date( dataInputStream.readLong() );
 
    supported_format_version = 1
    # one byte
    index_version = jstream.read_byte()
    assert supported_format_version == index_version
    # eight byte
    timestamp = jstream.read_long()
-    last_modified = timestamp != -1 and java_time_ts(timestamp) or ''
+    last_modified = timestamp != -1 and java_time_ts(timestamp) or ""
    return int(index_version), last_modified
@@ -771,7 +805,6 @@ def decode_entry(jstream, fields=()):
    - one int which is the length of the UTF string in bytes
    - the utf-8 string proper using Java conventions
    """
-
    read = jstream.read
    read_int = jstream.read_int
    read_byte = jstream.read_byte
@@ -826,26 +859,27 @@ def java_time_ts(tm):
    timestamp.
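 
    An illustrative example, assuming arrow's default ISO 8601 rendering
    with the UTC offset shown as +00:00 (1503000000000 ms is
    2017-08-17 20:00:00 UTC):
 
        java_time_ts(1503000000000) -> '2017-08-17T20:00:00+00:00'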
""" tzinfo = tz.tzutc() - ar = arrow.get(tm / 1000).replace(tzinfo=tzinfo).to('utc') + ar = arrow.get(tm / 1000).replace(tzinfo=tzinfo).to("utc") return ar.isoformat() + ################################################################################ # These are CLI/shell test and stat utilities ################################################################################ def _spit_json(location, target): - with open(target, 'w') as t: - t.write('[\n') + with open(target, "w") as t: + t.write("[\n") for i, artifact in enumerate(get_artifacts(location)): if i % 1000 == 0: - print('number or artifacts:', i) - t.write(json.dumps(artifact.to_dict(), separators=(',', ':'))) - t.write(',\n') + print("number or artifacts:", i) + t.write(json.dumps(artifact.to_dict(), separators=(",", ":"))) + t.write(",\n") - t.write(']\n') + t.write("]\n") - print('total number or artifacts:', i) + print("total number or artifacts:", i) def _artifact_stats(location): @@ -854,6 +888,7 @@ def _artifact_stats(location): at location. """ from collections import Counter + pom_packs = Counter() pom_classifs = Counter() pom_extensions = Counter() @@ -877,27 +912,27 @@ def _artifact_stats(location): pom_worthy += 1 if i % 10000 == 0: - print('number or artifacts:', i) + print("number or artifacts:", i) print() - print('Total number of artifacts:', i) - print('Total number of worthy artifacts:', pom_worthy) + print("Total number of artifacts:", i) + print("Total number of worthy artifacts:", pom_worthy) - print('Top packaging:') + print("Top packaging:") for n, c in pom_packs.most_common(): - print(n, ':', c) + print(n, ":", c) - print('Top classifiers:') + print("Top classifiers:") for n, c in pom_classifs.most_common(): - print(n, ':', c) + print(n, ":", c) - print('Top extensions:') + print("Top extensions:") for n, c in pom_extensions.most_common(): - print(n, ':', c) + print(n, ":", c) - print('Top Combos: packaging, classifier, extension') + print("Top Combos: packaging, classifier, extension") for n, c in combos.most_common(): - print(n, ':', c) + print(n, ":", c) """ Latest stats on 2017-08-07: @@ -925,6 +960,7 @@ def _entries_stats(location): at location. """ from collections import Counter + field_names = Counter() field_names_update = field_names.update @@ -937,19 +973,19 @@ def _entries_stats(location): field_sets_update([keys]) if i % 10000 == 0: print() - print('number of entries:', i) - print('field names stats:', field_names) + print("number of entries:", i) + print("field names stats:", field_names) print() - print('Total number of entries:', i) + print("Total number of entries:", i) print() - print('All field names:', field_names.most_common()) + print("All field names:", field_names.most_common()) print() - print('All field name sets:', field_sets.most_common()) + print("All field name sets:", field_sets.most_common()) print() -@map_router.route('maven-index://.*') +@map_router.route("maven-index://.*") class MavenIndexArtifactMapper(Mapper): """ Process the minimal artifacts collected for a Maven Jar or POM in an @@ -972,7 +1008,7 @@ def get_mini_package(data, uri, purl): artdata = json.loads(data) # FIXME: this should a slot in Artifact - download_url = artdata.pop('download_url') + download_url = artdata.pop("download_url") # FIXME: what if this is an ArtifactExtended?? 
    artifact = Artifact(**artdata)
@@ -986,10 +1022,10 @@ def get_mini_package(data, uri, purl):
    qualifiers = packageurl.normalize_qualifiers(purl.qualifiers, encode=False)
    if qualifiers:
        assert isinstance(qualifiers, dict)
-    logger.debug('get_mini_package: qualifiers: {}'.format(qualifiers))
+    logger.debug(f"get_mini_package: qualifiers: {qualifiers}")
 
    package = PackageData(
-        type='maven',
+        type="maven",
        namespace=artifact.group_id,
        name=artifact.artifact_id,
        version=artifact.version,
@@ -1000,39 +1036,36 @@ def get_mini_package(data, uri, purl):
        size=artifact.size,
        sha1=artifact.sha1 or None,
    )
-    logger.debug('get_mini_package: package.qualifiers: {}'.format(package.qualifiers))
-    logger.debug('get_mini_package for uri: {}, package: {}'.format(uri, package))
+    logger.debug(f"get_mini_package: package.qualifiers: {package.qualifiers}")
+    logger.debug(f"get_mini_package for uri: {uri}, package: {package}")
    return package
 
 
 # FIXME this should be valid for any POM
-@map_router.route('https?://repo1.maven.org/maven2/.*\.pom')
+@map_router.route(r"https?://repo1.maven.org/maven2/.*\.pom")
 class MavenPomMapper(Mapper):
-    """
-    Map a proper full POM visited as XML.
-    """
-    def get_packages(self, uri, resource_uri):
+    """Map a proper full POM visited as XML."""
 
-        logger.debug('MavenPomMapper.get_packages: uri: {}, resource_uri: {}, purl:'
-                     .format(uri, resource_uri.uri, resource_uri.package_url))
+    def get_packages(self, uri, resource_uri):
+        logger.debug(
+            f"MavenPomMapper.get_packages: uri: {uri}, resource_uri: {resource_uri.uri}, purl: {resource_uri.package_url}"
+        )
        package = get_package(resource_uri.data, resource_uri.package_url)
        if package:
-            logger.debug('MavenPomMapper.get_packages: uri: {}, package: {}'
-                         .format(uri, package))
+            logger.debug(
+                f"MavenPomMapper.get_packages: uri: {uri}, package: {package}"
+            )
            yield package
 
 
-def get_package(text, package_url=None,
-                baseurl='https://repo1.maven.org/maven2'):
-    """
-    Return a ScannedPackage built from a POM XML string `text`.
-    """
+def get_package(text, package_url=None, baseurl="https://repo1.maven.org/maven2"):
+    """Return a ScannedPackage built from a POM XML string `text`."""
    text = as_unicode(text)
    package = _parse(
-        datasource_id='maven_pom',
-        package_type='maven',
-        primary_language='Java',
-        text=text
+        datasource_id="maven_pom",
+        package_type="maven",
+        primary_language="Java",
+        text=text,
    )
    if package:
        # FIXME: this should be part of the parse call
diff --git a/minecode/miners/npm.py b/minecode/miners/npm.py
index d0c2c2eb..3f6fec24 100644
--- a/minecode/miners/npm.py
+++ b/minecode/miners/npm.py
@@ -7,21 +7,20 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
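
A short sketch of how the get_package helper above might be exercised,
before moving on to the npm miner. This assumes packagedcode's maven
parser accepts a minimal standalone POM; the coordinates are made up:

    MINIMAL_POM = """
    <project>
        <modelVersion>4.0.0</modelVersion>
        <groupId>org.example</groupId>
        <artifactId>demo</artifactId>
        <version>1.0</version>
    </project>
    """

    package = get_package(MINIMAL_POM)
    if package:
        # a maven PackageData; its purl should render as
        # pkg:maven/org.example/demo@1.0
        print(package.purl)
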
#
 
-import logging
 import json
+import logging
 
+from packagedcode.npm import NpmPackageJsonHandler
 from packagedcode.npm import npm_api_url
 from packagedcode.npm import split_scoped_package_name
-from packagedcode.npm import NpmPackageJsonHandler
 from packageurl import PackageURL
 
-from minecode import seed
 from minecode import map_router
+from minecode import seed
 from minecode import visit_router
-from minecode.miners import NonPersistentHttpVisitor
 from minecode.miners import URI
 from minecode.miners import Mapper
-
+from minecode.miners import NonPersistentHttpVisitor
 
 logger = logging.getLogger(__name__)
 handler = logging.StreamHandler()
@@ -30,12 +29,13 @@
 
 
 class NpmSeed(seed.Seeder):
-
    def get_seeds(self):
-        yield 'https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=0'
+        yield "https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=0"
 
 
-@visit_router.route('https://replicate.npmjs.com/registry/_changes\?include_docs=true&limit=\d+&since=\d+')
+@visit_router.route(
+    r"https://replicate.npmjs.com/registry/_changes\?include_docs=true&limit=\d+&since=\d+"
+)
 class NpmRegistryVisitor(NonPersistentHttpVisitor):
    """
    Yield one URI for the next batch of changes to re-visit. Yield one URI for
@@ -48,68 +48,66 @@ def get_uris(self, content):
        Yield a URI for the next index sequence to visit and one URI for each
        package fetched in a batch.
        """
-        next_visitable_index_url_template = (
-            'https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since={last_seq}')
+        next_visitable_index_url_template = "https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since={last_seq}"
 
        json_location = content
        with open(json_location) as c:
            content = json.loads(c.read())
 
        try:
-            last_seq = content['last_seq']
+            last_seq = content["last_seq"]
        except KeyError:
            # provide a more meaningful message in case the JSON is incorrect
-            raise Exception(
-                'NpmRegistryVisitor: Missing "last_seq" field: Aborting.')
+            raise Exception('NpmRegistryVisitor: Missing "last_seq" field: Aborting.')
 
        # Always yield an index URI, even if there are no results, to avoid stopping the index visits
-        yield URI(uri=next_visitable_index_url_template.format(last_seq=last_seq), source_uri=self.uri)
+        yield URI(
+            uri=next_visitable_index_url_template.format(last_seq=last_seq),
+            source_uri=self.uri,
+        )
 
        try:
-            results = content['results']
+            results = content["results"]
        except KeyError:
            # provide a more meaningful message in case the JSON is incorrect
-            raise Exception(
-                'NpmRegistryVisitor: Missing "results" field: Aborting.')
+            raise Exception('NpmRegistryVisitor: Missing "results" field: Aborting.')
 
        for result in results:
-            doc = result.get('doc')
+            doc = result.get("doc")
            # verify if this record is a package record (as opposed to
            # some couchdb design document that we would ignore)
-            is_package_record = 'versions' in doc and 'name' in doc
+            is_package_record = "versions" in doc and "name" in doc
            if not is_package_record:
                continue
            # remove the readme field from the data: this is big and mostly
            # useless for now
-            doc.pop('readme', None)
+            doc.pop("readme", None)
 
-            name = doc.get('name')
+            name = doc.get("name")
            namespace, name = split_scoped_package_name(name)
            package_api_url = npm_api_url(namespace, name)
            package_url = PackageURL(
-                type='npm',
-                namespace=namespace,
-                name=name).to_string()
+                type="npm", namespace=namespace, name=name
+            ).to_string()
 
            # here: this is ready for mapping
            yield URI(
                uri=package_api_url,
                package_url=package_url,
                source_uri=self.uri,
-                data=json.dumps(doc, separators=(
-                    ',', ':'), ensure_ascii=False),
+                data=json.dumps(doc, separators=(",", ":"), ensure_ascii=False),
                # note: visited is True since there is nothing more to visit
-                visited=True)
+                visited=True,
+            )
 
 
 # FIXME: This route may not work when we have scoped Packages or URLs to a specific version
 # or yarn URLs
-@map_router.route('https://registry.npmjs.org/[^\/]+')
+@map_router.route(r"https://registry.npmjs.org/[^\/]+")
 class NpmPackageMapper(Mapper):
-
    def get_packages(self, uri, resource_uri):
        """
        Yield NpmPackage built from a resource_uri record that contains many
@@ -124,15 +122,15 @@ def get_packages(self, uri, resource_uri):
 # FIXME: Consider using PURL here
 def build_packages(data):
    """
-     Yield NpmPackage built from data corresponding to a single package name
-     and many npm versions.
+    Yield NpmPackage built from data corresponding to a single package name
+    and many npm versions.
    """
-    versions = data.get('versions', {})
+    versions = data.get("versions", {})
 
-    logger.debug('build_packages: versions: ' + repr(type(versions)))
+    logger.debug("build_packages: versions: " + repr(type(versions)))
    for version, data in versions.items():
-        logger.debug('build_packages: version: ' + repr(version))
-        logger.debug('build_packages: data: ' + repr(data))
+        logger.debug("build_packages: version: " + repr(version))
+        logger.debug("build_packages: data: " + repr(data))
        package = NpmPackageJsonHandler._parse(json_data=data)
        if package:
            yield package
diff --git a/minecode/miners/nuget.py b/minecode/miners/nuget.py
index 0dd3396c..5761328c 100644
--- a/minecode/miners/nuget.py
+++ b/minecode/miners/nuget.py
@@ -10,84 +10,87 @@
 import json
 
 from bs4 import BeautifulSoup
-
 from commoncode import fileutils
-from packageurl import PackageURL
 from packagedcode import models as scan_models
+from packageurl import PackageURL
+
+from minecode import map_router
 from minecode import seed
 from minecode import visit_router
+from minecode.miners import URI
 from minecode.miners import HttpJsonVisitor
 from minecode.miners import HttpVisitor
-from minecode.miners import URI
-from minecode import map_router
 from minecode.miners import Mapper
 
 
 class NugetSeed(seed.Seeder):
-
    def get_seeds(self):
-        yield 'https://api-v2v3search-0.nuget.org/query'
-        yield 'https://www.nuget.org/packages?page=1'
+        yield "https://api-v2v3search-0.nuget.org/query"
+        yield "https://www.nuget.org/packages?page=1"
 
 
-@visit_router.route('https://api-v2v3search-0.nuget.org/query')
+@visit_router.route("https://api-v2v3search-0.nuget.org/query")
 class NugetQueryVisitor(HttpJsonVisitor):
    """
-    'https://api-v2v3search-0.nuget.org/query' is a query URL which has metadata for
-    Nuget packages and we can query for all the packages by using the pagination
-    technique. For example 'https://api-v2v3search-0.nuget.org/query?skip=40' will
-    skip the first 40 packages in the order and returns JSON data for the packages
-    from 40-60.
+    'https://api-v2v3search-0.nuget.org/query' could be the latest version, as the
+    url 'https://api-v3search-0.nuget.org/query' is not accessible now.
    """
+
    def get_uris(self, content):
        """
        Return all the URLs for query results through pagination.
        Starts with number '0', increment count by '20'. The total count
        is found by 'totalHits'.
        """
-        pkgs_count = content.get('totalHits', 0)
+        pkgs_count = content.get("totalHits", 0)
        count = 0
-        url_template = 'https://api-v2v3search-0.nuget.org/query?skip={count}'
+        url_template = "https://api-v2v3search-0.nuget.org/query?skip={count}"
        while count < pkgs_count:
            url = url_template.format(count=str(count))
            yield URI(uri=url, source_uri=self.uri)
            count = count + 20
 
 
-@visit_router.route('https://api-v2v3search-0.nuget.org/query\?skip=\d+')
+@visit_router.route(r"https://api-v2v3search-0.nuget.org/query\?skip=\d+")
 class PackagesPageVisitor(HttpJsonVisitor):
-    """
-    Visit the nuget API resources and return all the package URLs available at the passing`uri`.
-    """
+    """Visit the nuget API resources and return all the package URLs available at the passed `uri`."""
+
    def get_uris(self, content):
-        metadata = content['data']
+        metadata = content["data"]
        for packages in metadata:
-            for version in packages['versions']:
-                pkg_ver = version['version']
-                pkg_url = version['@id']
-                version_template = '{pkg_version}.0.json'
+            for version in packages["versions"]:
+                pkg_ver = version["version"]
+                pkg_url = version["@id"]
+                version_template = "{pkg_version}.0.json"
                version_name = version_template.format(pkg_version=pkg_ver)
-                name = pkg_url.replace('https://api.nuget.org/v3/registration1/', '').partition('/')[0]
-                package_url = PackageURL(type='nuget', name=name, version=pkg_ver).to_string()
+                name = pkg_url.replace(
+                    "https://api.nuget.org/v3/registration1/", ""
+                ).partition("/")[0]
+                package_url = PackageURL(
+                    type="nuget", name=name, version=pkg_ver
+                ).to_string()
                if version_name in pkg_url:
                    # sometimes an extra '0' is appended to the version in the URL
                    # FIXME: this is weird: there must be good reason why this is done???
-                    pkg_url = pkg_url.replace(version_name, pkg_ver + '.json')
+                    pkg_url = pkg_url.replace(version_name, pkg_ver + ".json")
                yield URI(uri=pkg_url, package_url=package_url, source_uri=self.uri)
 
                # Add another case to have registration0 or registration1 in the url, yield the alternative url.
-                if pkg_url.find('/registration0/') > 0:
-                    pkg_url = pkg_url.replace('/registration0/', '/registration1/')
+                if pkg_url.find("/registration0/") > 0:
+                    pkg_url = pkg_url.replace("/registration0/", "/registration1/")
                    yield URI(uri=pkg_url, source_uri=self.uri)
-                elif pkg_url.find('/registration1/') > 0:
-                    pkg_url = pkg_url.replace('/registration1/', '/registration0/')
+                elif pkg_url.find("/registration1/") > 0:
+                    pkg_url = pkg_url.replace("/registration1/", "/registration0/")
                    yield URI(uri=pkg_url, source_uri=self.uri)
 
 
-@visit_router.route('https://api.nuget.org/.+.json')
+@visit_router.route("https://api.nuget.org/.+.json")
 class NugetAPIJsonVisitor(HttpJsonVisitor):
    """
    Visit packageContent of nuget API json and return a
@@ -107,65 +110,69 @@ class NugetAPIJsonVisitor(HttpJsonVisitor):
    The second loop will return the url https://api.nuget.org/v3/catalog0/data/2015.02.07.22.31.06/entityframework.4.3.1.json
    by visiting this url it won't create any new uris; the key is to store the JSON file itself through the visitor so it can be used in the mapper.
""" + def get_uris(self, content): - download_url = content.get('packageContent') + download_url = content.get("packageContent") if download_url: filename = fileutils.file_name(download_url) - withou_prefix = filename.replace('.nupkg', '') - filename_splits = withou_prefix.partition('.') + withou_prefix = filename.replace(".nupkg", "") + filename_splits = withou_prefix.partition(".") name = filename_splits[0] version = None if len(filename_splits) > 1: version = filename_splits[-1] - package_url = PackageURL( - type='nuget', - name=name, - version=version) + package_url = PackageURL(type="nuget", name=name, version=version) yield URI(uri=download_url, package_url=package_url, source_uri=self.uri) - catalog_entry_url = content.get('catalogEntry') + catalog_entry_url = content.get("catalogEntry") if catalog_entry_url: yield URI(uri=catalog_entry_url, source_uri=self.uri) -@visit_router.route('https://www.nuget.org/packages\?page=\d+') +@visit_router.route(r"https://www.nuget.org/packages\?page=\d+") class NugetHTMLPageVisitor(HttpVisitor): - """ - Visitor to yield the URI of the each package page. - """ + """Visitor to yield the URI of the each package page.""" + def get_uris(self, content): - url_format = 'https://www.nuget.org/packages/{name}' - soup = BeautifulSoup(content, 'lxml') + url_format = "https://www.nuget.org/packages/{name}" + soup = BeautifulSoup(content, "lxml") has_package = False - for a in soup.find_all('a'): - if a.get('class') and 'package-title' in a.get('class'): + for a in soup.find_all("a"): + if a.get("class") and "package-title" in a.get("class"): has_package = True - href = a.get('href') + href = a.get("href") if not href: continue # href format is like: "/packages/NUnit/" - name = href.strip('/').partition('/')[-1] + name = href.strip("/").partition("/")[-1] if name: yield URI(uri=url_format.format(name=name), source_uri=self.uri) if has_package: - page_id = self.uri.replace('https://www.nuget.org/packages?page=', '').strip('/') + page_id = self.uri.replace( + "https://www.nuget.org/packages?page=", "" + ).strip("/") next_pageid = int(page_id) + 1 - nextpage_url_format = 'https://www.nuget.org/packages?page={id}' - yield URI(uri=nextpage_url_format.format(id=next_pageid), source_uri=self.uri) + nextpage_url_format = "https://www.nuget.org/packages?page={id}" + yield URI( + uri=nextpage_url_format.format(id=next_pageid), source_uri=self.uri + ) -@visit_router.route('https://www.nuget.org/packages/[\w\-\.]+', - 'https://www.nuget.org/packages/[\w\-\.]+/[\w\-\.]+') +@visit_router.route( + r"https://www.nuget.org/packages/[\w\-\.]+", + r"https://www.nuget.org/packages/[\w\-\.]+/[\w\-\.]+", +) class NugetHTMLPackageVisitor(HttpVisitor): """ Visitor to fetch the package HTML content Example: https://www.nuget.org/packages/log4net or https://www.nuget.org/packages/log4net/2.0.7 """ + pass -@map_router.route('https://api.nuget.org/v3/catalog.+\.json') +@map_router.route(r"https://api.nuget.org/v3/catalog.+\.json") class NugetPackageMapper(Mapper): """ Return NugetPackage object by parsing the ResourceURI stored in db referenced by the @@ -185,31 +192,31 @@ def build_packages_with_json(metadata, purl=None): metadata: json metadata content from API call purl: String value of the package url of the ResourceURI object """ - licenseUrl = metadata.get('licenseUrl') - copyr = metadata.get('copyright') + licenseUrl = metadata.get("licenseUrl") + copyr = metadata.get("copyright") authors = [] - names = metadata.get('authors') + names = metadata.get("authors") if names: - for name 
in names.split(','): - authors.append(scan_models.Party(name=name.strip(), role='author')) + for name in names.split(","): + authors.append(scan_models.Party(name=name.strip(), role="author")) - keywords = metadata.get('tags', []) + keywords = metadata.get("tags", []) # TODO: the content has the SHA512, our model may extend to SHA512 if name: - short_desc = metadata.get('summary') - long_desc = metadata.get('description') + short_desc = metadata.get("summary") + long_desc = metadata.get("description") if long_desc == short_desc: long_desc = None descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) + description = "\n".join(descriptions) package_mapping = dict( - type='nuget', - name=metadata['id'], - version=metadata['version'], - homepage_url=metadata.get('projectUrl'), + type="nuget", + name=metadata["id"], + version=metadata["version"], + homepage_url=metadata.get("projectUrl"), description=description, extracted_license_statement=licenseUrl, license_detections=[], @@ -222,7 +229,7 @@ def build_packages_with_json(metadata, purl=None): yield package -@map_router.route('https://api.nuget.org/packages/.*\.nupkg') +@map_router.route(r"https://api.nuget.org/packages/.*\.nupkg") class NugetNUPKGDownloadMapper(Mapper): """ Return NugetPackage object by parsing the download URL. @@ -233,22 +240,24 @@ def get_packages(self, uri, resource_uri): if not resource_uri.data: return pkg_data = json.loads(resource_uri.data) - return build_packages_with_nupkg_download_url(pkg_data, resource_uri.package_url, resource_uri.uri) + return build_packages_with_nupkg_download_url( + pkg_data, resource_uri.package_url, resource_uri.uri + ) def build_packages_with_nupkg_download_url(metadata, purl, uri): if purl: package = scan_models.PackageData( - type='nuget', - name=purl.name, - download_url=uri + type="nuget", name=purl.name, download_url=uri ) package.set_purl(purl) yield package -@map_router.route('https://www.nuget.org/packages/[\w\-\.]+', - 'https://www.nuget.org/packages/[\w\-\.]+/[\w\-\.]+') +@map_router.route( + r"https://www.nuget.org/packages/[\w\-\.]+", + r"https://www.nuget.org/packages/[\w\-\.]+/[\w\-\.]+", +) class NugetHTMLPackageMapper(Mapper): """ Return NugetPackage object by parsing the package HTML content. @@ -256,12 +265,9 @@ class NugetHTMLPackageMapper(Mapper): """ def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri data. 
-        """
+        """Yield Package built from resource_uri data."""
        metadata = resource_uri.data
-        build_packages_from_html(
-            metadata, resource_uri.uri, resource_uri.package_url)
+        return build_packages_from_html(metadata, resource_uri.uri, resource_uri.package_url)
 
 
 def build_packages_from_html(metadata, uri, purl=None):
@@ -271,54 +277,60 @@ def build_packages_from_html(metadata, uri, purl=None):
    uri: the uri of the ResourceURI object
    purl: String value of the package url of the ResourceURI object
    """
-    download_url_format = 'https://www.nuget.org/api/v2/package/{name}/{version}'
-    soup = BeautifulSoup(metadata, 'lxml')
-    h1 = soup.find('h1')
+    download_url_format = "https://www.nuget.org/api/v2/package/{name}/{version}"
+    soup = BeautifulSoup(metadata, "lxml")
+    h1 = soup.find("h1")
    if h1 and h1.contents:
        license_value = None
        name = str(h1.contents[0]).strip()
-        for a in soup.find_all('a'):
-            if a.get('data-track') and a.get('data-track') == 'outbound-license-url':
+        for a in soup.find_all("a"):
+            if a.get("data-track") and a.get("data-track") == "outbound-license-url":
                license_value = a.string
                if license_value:
                    license_value = str(license_value).strip()
 
        copyright_value = None
-        h2s = soup.find_all('h2')
+        h2s = soup.find_all("h2")
        for h2 in h2s:
            # Copyright will be after the copyright h2 node
            # The example is like this:
            #   <h2>Copyright</h2>
            #   <p>Copyright 2004-2017 The Apache Software Foundation</p>
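            # With markup like that, find_all("h2") locates the "Copyright"
            # heading and find_next_sibling("p") returns the paragraph that
            # carries the actual copyright text, as the code below does.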
- if h2.string and h2.string == 'Copyright': - next_element = h2.find_next_sibling('p') + if h2.string and h2.string == "Copyright": + next_element = h2.find_next_sibling("p") if next_element: copyright_value = next_element.string description = None - for m in soup.find_all('meta'): - if m.get('property') and m.get('property') == 'og:description' and m.get('content'): - description = m.get('content') - - for tbody in soup.find_all('tbody'): - if tbody.get('class') and tbody.get('class')[0] == 'no-border': - for a in tbody.find_all('a'): + for m in soup.find_all("meta"): + if ( + m.get("property") + and m.get("property") == "og:description" + and m.get("content") + ): + description = m.get("content") + + for tbody in soup.find_all("tbody"): + if tbody.get("class") and tbody.get("class")[0] == "no-border": + for a in tbody.find_all("a"): version = a.string if not version or not version.strip(): continue version = version.strip() - download_url = download_url_format.format(name=name, version=version) + download_url = download_url_format.format( + name=name, version=version + ) package_mapping = dict( datasource_id="nuget_metadata_json", name=name, - type='nuget', + type="nuget", version=version, homepage_url=uri, description=description, download_url=download_url, extracted_license_statement=license_value, license_detections=[], - copyright=copyright_value + copyright=copyright_value, ) package = scan_models.Package.from_package_data( package_data=package_mapping, diff --git a/minecode/miners/openssl.py b/minecode/miners/openssl.py index f4eff87d..6140dd58 100644 --- a/minecode/miners/openssl.py +++ b/minecode/miners/openssl.py @@ -7,25 +7,22 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from bs4 import BeautifulSoup -from datetime import datetime import logging +from datetime import datetime +from bs4 import BeautifulSoup from commoncode import fileutils -from packageurl import PackageURL from packagedcode import models as scan_models +from packageurl import PackageURL from minecode import map_router -from minecode.miners import Mapper -from minecode.utils import parse_date - from minecode import seed from minecode import visit_router -from minecode.utils import is_int -from minecode.miners import HttpVisitor from minecode.miners import URI - - +from minecode.miners import HttpVisitor +from minecode.miners import Mapper +from minecode.utils import is_int +from minecode.utils import parse_date logger = logging.getLogger(__name__) handler = logging.StreamHandler() @@ -34,80 +31,93 @@ class OpenSSLSeed(seed.Seeder): - def get_seeds(self): - yield 'https://ftp.openssl.org/' + yield "https://ftp.openssl.org/" -@visit_router.route('https://ftp.openssl.org/', - 'https://ftp.openssl.org/.*/') +@visit_router.route("https://ftp.openssl.org/", "https://ftp.openssl.org/.*/") class OpenSSLVisitor(HttpVisitor): - """ - Collect package metadata URIs from the open SSL HTML site. - """ + """Collect package metadata URIs from the open SSL HTML site.""" def get_uris(self, content): - """ - Return URIs objects and the corresponding size, file date info. 
-        """
-        page = BeautifulSoup(content, 'lxml')
-        for a in page.find_all(name='a'):
-            if 'href' not in a.attrs:
+        """Return URI objects and the corresponding size and file date info."""
+        page = BeautifulSoup(content, "lxml")
+        for a in page.find_all(name="a"):
+            if "href" not in a.attrs:
                continue
-            href = a['href']
+            href = a["href"]
            if not href:
                continue
-            if href.startswith('?') or href.startswith('/'):
+            if href.startswith("?") or href.startswith("/"):
                # if href is not a valid resource, ignore it, for example, if it's a
                # link to the parent page etc.
                continue
            url = self.uri + href
-            next_sibling = a.parent.findNext('td')
+            next_sibling = a.parent.findNext("td")
            date = None
            if next_sibling and next_sibling.contents:
                date = next_sibling.contents[0].strip()
                # The passing date format is like: 2014-11-19 17:48
-                date = datetime.strptime(date, '%Y-%m-%d %H:%M')
+                date = datetime.strptime(date, "%Y-%m-%d %H:%M")
            if next_sibling:
-                next_next = next_sibling.findNext('td')
+                next_next = next_sibling.findNext("td")
                if next_next and next_next.contents:
                    size = next_next.contents[0].strip()
                    if size and is_int(size):
                        # By default, if the unit is not shown, it means k.
                        size = str(int(size) * 1024)
-                    if size.endswith(('M', 'm')):
+                    if size.endswith(("M", "m")):
                        # If the size is in megabytes, the number may be a float
                        # instead of an int, for example 5.1M
                        size = str(
-                            int(float(size.replace('M', '').replace('m', '')) * 1024 * 1024))
-                    elif size.endswith('G') or size.endswith('G'):
+                            int(
+                                float(size.replace("M", "").replace("m", ""))
+                                * 1024
+                                * 1024
+                            )
+                        )
+                    elif size.endswith(("G", "g")):
                        # if the size is in gigabytes
                        size = str(
-                            int(float(size.replace('G', '').replace('g', '')) * 1024 * 1024 * 1024))
-                    if size == '-':
+                            int(
+                                float(size.replace("G", "").replace("g", ""))
+                                * 1024
+                                * 1024
+                                * 1024
+                            )
+                        )
+                    if size == "-":
                        # if it's a folder, ignore the size
                        size = None
            file_name = None
-            if not url.endswith('/'):
+            if not url.endswith("/"):
                file_name = fileutils.file_name(url)
            if file_name:
                # If it's a file, pass the url to the mapper by setting visited
                # to True
                package_url = None
                version = None
-                if 'tar.gz' in file_name:
-                    version = file_name.replace('openssl-', '').partition('.tar.gz')[0]
-                    package_url = PackageURL(type='generic', name='openssl', version=version).to_string()
-                yield URI(uri=url, source_uri=self.uri, package_url=package_url, date=date, file_name=file_name, size=size)
+                if "tar.gz" in file_name:
+                    version = file_name.replace("openssl-", "").partition(".tar.gz")[0]
+                    package_url = PackageURL(
+                        type="generic", name="openssl", version=version
+                    ).to_string()
+                yield URI(
+                    uri=url,
+                    source_uri=self.uri,
+                    package_url=package_url,
+                    date=date,
+                    file_name=file_name,
+                    size=size,
+                )
            else:
                yield URI(uri=url, source_uri=self.uri, date=date, size=size)
 
 
-@map_router.route('https://ftp.openssl.org/.*')
+@map_router.route("https://ftp.openssl.org/.*")
 class OpenSSLMapper(Mapper):
-
    def get_packages(self, uri, resource_uri):
        """
        Yield ScannedPackage built from resource_uri record for a single package
@@ -124,24 +134,31 @@ def build_packages(resource_uri, purl=None):
    """
    uri = resource_uri.uri
    file_name = fileutils.file_name(uri)
-    version = file_name.replace('.tar.gz', '').replace('openssl-', '').replace('.tar.gz', '').replace(
-        '.asc', '').replace('.md5', '').replace('.sha1', '').replace('.sha256', '')
+    version = (
+        file_name.replace(".tar.gz", "")
+        .replace("openssl-", "")
+        .replace(".tar.gz", "")
+        .replace(".asc", "")
+        .replace(".md5", "")
+        .replace(".sha1", "")
.replace(".sha256", "") + ) common_data = dict( datasource_id="openssl_metadeta", - type='generic', + type="generic", name=file_name, - description='The OpenSSL Project is a collaborative effort to develop a robust, commercial-grade, fully featured, and Open Source toolkit implementing the Transport Layer Security (TLS) protocols (including SSLv3) as well as a full-strength general purpose cryptographic library.', + description="The OpenSSL Project is a collaborative effort to develop a robust, commercial-grade, fully featured, and Open Source toolkit implementing the Transport Layer Security (TLS) protocols (including SSLv3) as well as a full-strength general purpose cryptographic library.", version=version, size=resource_uri.size, release_date=parse_date(resource_uri.last_modified_date), - extracted_license_statement='OpenSSL License', + extracted_license_statement="OpenSSL License", license_detections=[], - homepage_url='https://www.openssl.org/', + homepage_url="https://www.openssl.org/", download_url=uri, - copyright='Copyright (c) 1998-2018 The OpenSSL Project\nCopyright (c) 1995-1998 Eric A. Young, Tim J. Hudson\nAll rights reserved.', - vcs_url='git+https://github.com/openssl/openssl.git', - code_view_url='https://github.com/openssl/openssl', - bug_tracking_url='https://github.com/openssl/openssl/issues', + copyright="Copyright (c) 1998-2018 The OpenSSL Project\nCopyright (c) 1995-1998 Eric A. Young, Tim J. Hudson\nAll rights reserved.", + vcs_url="git+https://github.com/openssl/openssl.git", + code_view_url="https://github.com/openssl/openssl", + bug_tracking_url="https://github.com/openssl/openssl/issues", ) package = scan_models.Package.from_package_data( package_data=common_data, diff --git a/minecode/miners/openwrt.py b/minecode/miners/openwrt.py index f2388bf3..97d89214 100644 --- a/minecode/miners/openwrt.py +++ b/minecode/miners/openwrt.py @@ -9,8 +9,8 @@ import gzip import json -import os import logging +import os from bs4 import BeautifulSoup from debian_inspector import debcon @@ -18,16 +18,15 @@ from packageurl import PackageURL from minecode import debutils -from minecode import seed from minecode import map_router +from minecode import seed from minecode import visit_router -from minecode.utils import extract_file from minecode.collectors.debian import get_dependencies -from minecode.miners import Mapper +from minecode.miners import URI from minecode.miners import HttpVisitor +from minecode.miners import Mapper from minecode.miners import NonPersistentHttpVisitor -from minecode.miners import URI - +from minecode.utils import extract_file logger = logging.getLogger(__name__) handler = logging.StreamHandler() @@ -36,81 +35,89 @@ class OpenWrtSeed(seed.Seeder): - def get_seeds(self): - yield 'https://downloads.openwrt.org/chaos_calmer/15.05/' + yield "https://downloads.openwrt.org/chaos_calmer/15.05/" -@visit_router.route('https://downloads.openwrt.org/.*/') +@visit_router.route("https://downloads.openwrt.org/.*/") class OpenWrtDownloadPagesVisitor(HttpVisitor): - """ - Visit the OpwnWRT download HTML page and return URIs parsed from HTML page. 
-    """
+    """Visit the OpenWrt download HTML page and return URIs parsed from the HTML page."""
+
    def get_uris(self, content):
-        page = BeautifulSoup(content, 'lxml')
-        for td in page.find_all(name='td'):
-            a = td.find(name='a')
+        page = BeautifulSoup(content, "lxml")
+        for td in page.find_all(name="td"):
+            a = td.find(name="a")
            if not a:
                continue
-            href = a['href']
-            if href == '../':  # Ignore the parent url
+            href = a["href"]
+            if href == "../":  # Ignore the parent url
                continue
            # Add the uri for the next loop if it ends with "/", which means it's a
            # folder resource uri
-            if href.endswith('/'):
-                package_url = PackageURL(type='openwrt', name=href.replace('/', '')).to_string()
-                yield URI(uri=self.uri + href, package_url=package_url, source_uri=self.uri)
-            elif href.endswith(('Packages', 'Packages.gz', '.ipk')):
+            if href.endswith("/"):
+                package_url = PackageURL(
+                    type="openwrt", name=href.replace("/", "")
+                ).to_string()
+                yield URI(
+                    uri=self.uri + href, package_url=package_url, source_uri=self.uri
+                )
+            elif href.endswith(("Packages", "Packages.gz", ".ipk")):
                yield URI(uri=self.uri + href, source_uri=self.uri)
 
 
-@visit_router.route('https://downloads.openwrt.org/.*/Packages\.gz')
+@visit_router.route(r"https://downloads.openwrt.org/.*/Packages\.gz")
 class OpenWrtPackageIndexVisitor(NonPersistentHttpVisitor):
-    """
-    Visit the OpwnWRT Packages.gz Index file and collect uris.
-    """
+    """Visit the OpenWrt Packages.gz index file and collect uris."""
+
    def get_uris(self, content):
-        with gzip.open(content, 'rb') as f:
+        with gzip.open(content, "rb") as f:
            content = f.read()
 
        for package in debcon.get_paragraphs_data(content):
-            file_info = package.get('Filename')
+            file_info = package.get("Filename")
            if not file_info:
                continue
-            version = package.get('Version')
-            md5sum = package.get('MD5Sum')
-            sha256sum = package.get('SHA256sum')
-            package_name = package.get('Package')
+            version = package.get("Version")
+            md5sum = package.get("MD5Sum")
+            sha256sum = package.get("SHA256sum")
+            package_name = package.get("Package")
            package_url = None
            if package_name and version:
-                package_url = PackageURL(type='openwrt', name=package_name, version=version).to_string()
-            file_info = file_info.lstrip('/')
-            dir_url = self.uri.replace('Packages.gz', '') + file_info
-            yield URI(uri=dir_url, package_url=package_url, data=json.dumps(str(package)), source_uri=self.uri, md5=md5sum, sha256=sha256sum,)
-
-
-@visit_router.route('https://downloads.openwrt.org/.*\.ipk')
+                package_url = PackageURL(
+                    type="openwrt", name=package_name, version=version
+                ).to_string()
+            file_info = file_info.lstrip("/")
+            dir_url = self.uri.replace("Packages.gz", "") + file_info
+            yield URI(
+                uri=dir_url,
+                package_url=package_url,
+                data=json.dumps(str(package)),
+                source_uri=self.uri,
+                md5=md5sum,
+                sha256=sha256sum,
+            )
+
+
+@visit_router.route(r"https://downloads.openwrt.org/.*\.ipk")
 class OpenWrtIpkPackageArchiveVisitor(NonPersistentHttpVisitor):
-    """
-    Visit the OpwnWRT Packages.gz and collect uris.
-    """
+    """Visit an OpenWrt .ipk package archive and collect uris."""
+
    def dumps(self, content):
        """
        Extract an ipk package archive and its control.tar.gz. Parse the
        control file and return a JSON string from these data.
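 
        For example, a control file carrying "Package: libfoo" and
        "Version: 1.0-1" fields (hypothetical values) would be serialized
        to a JSON string carrying those control fields.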
""" extracted_location = extract_file(content) - control_targz = os.path.join(extracted_location, 'control.tar.gz') + control_targz = os.path.join(extracted_location, "control.tar.gz") control_extracted_folder = extract_file(control_targz) - control_location = os.path.join(control_extracted_folder, 'control') + control_location = os.path.join(control_extracted_folder, "control") parsed = debcon.Debian822.from_file(control_location) return json.dumps(parsed) -@map_router.route('https://downloads.openwrt.org/.*\.ipk') +@map_router.route(r"https://downloads.openwrt.org/.*\.ipk") class OpenwrtIpkMetadataMapper(Mapper): - def get_packages(self, uri, resource_uri): """ Yield ScannedPackage built from resource_uri record for a single package @@ -127,39 +134,39 @@ def build_packages(metadata, purl=None, uri=None): purl: String value of the package url of the ResourceURI object """ common_data = dict( - type='openwrt', - datasource_id='openwrt_metadata', - name=metadata.get('Package'), - version=metadata.get('Version'), - description=metadata.get('Description'), - size=metadata.get('Installed-Size'), + type="openwrt", + datasource_id="openwrt_metadata", + name=metadata.get("Package"), + version=metadata.get("Version"), + description=metadata.get("Description"), + size=metadata.get("Installed-Size"), ) - dependencies = get_dependencies(metadata, ['Depends']) + dependencies = get_dependencies(metadata, ["Depends"]) if dependencies: - common_data['dependencies'] = dependencies + common_data["dependencies"] = dependencies - maintainers = metadata.get('Maintainer') + maintainers = metadata.get("Maintainer") if maintainers: name, email = debutils.parse_email(maintainers) if name: - parties = common_data.get('parties') + parties = common_data.get("parties") if not parties: - common_data['parties'] = [] - party = scan_models.Party(name=name, role='maintainer', email=email) - common_data['parties'].append(party) + common_data["parties"] = [] + party = scan_models.Party(name=name, role="maintainer", email=email) + common_data["parties"].append(party) - lic = metadata.get('License') + lic = metadata.get("License") if lic: - common_data['declared_license'] = lic + common_data["declared_license"] = lic - common_data['keywords'] = [] - section = metadata.get('Section') + common_data["keywords"] = [] + section = metadata.get("Section") if section: - common_data['keywords'].append(section) - architecture = metadata.get('Architecture') + common_data["keywords"].append(section) + architecture = metadata.get("Architecture") if architecture: - common_data['keywords'].append(architecture) + common_data["keywords"].append(architecture) package = scan_models.Package.from_package_data( package_data=common_data, datafile_path=uri, diff --git a/minecode/miners/packagist.py b/minecode/miners/packagist.py index 964f2f11..b5e8e3b4 100644 --- a/minecode/miners/packagist.py +++ b/minecode/miners/packagist.py @@ -13,15 +13,14 @@ from packagedcode.models import DependentPackage from packageurl import PackageURL -from minecode import seed from minecode import map_router +from minecode import seed from minecode import visit_router -from minecode.miners import Mapper -from minecode.miners import HttpJsonVisitor from minecode.miners import URI +from minecode.miners import HttpJsonVisitor +from minecode.miners import Mapper from minecode.utils import form_vcs_url - """ Collect packagist packages @@ -30,12 +29,11 @@ class PackagistSeed(seed.Seeder): - def get_seeds(self): - yield 'https://packagist.org/packages/list.json' + yield 
"https://packagist.org/packages/list.json" -@visit_router.route('https://packagist.org/packages/list.json') +@visit_router.route("https://packagist.org/packages/list.json") class PackagistListVisitor(HttpJsonVisitor): """ Collect list json resource and yield URIs for searching with package url. @@ -44,31 +42,33 @@ class PackagistListVisitor(HttpJsonVisitor): """ def get_uris(self, content): - search_url_template = 'https://packagist.org/p/{vendor}/{package}.json' - packages_entries = content.get('packageNames', {}) + search_url_template = "https://packagist.org/p/{vendor}/{package}.json" + packages_entries = content.get("packageNames", {}) for package in packages_entries: # FIXME: what does it mean to have no / in the URL? - if '/' not in package: + if "/" not in package: continue - vp = package.split('/') + vp = package.split("/") vendor = vp[0] package = vp[1] - package_url = PackageURL(type='composer', name=package).to_string() - yield URI(uri=search_url_template.format(vendor=vendor, package=package), package_url=package_url, source_uri=self.uri) + package_url = PackageURL(type="composer", name=package).to_string() + yield URI( + uri=search_url_template.format(vendor=vendor, package=package), + package_url=package_url, + source_uri=self.uri, + ) -@visit_router.route('https://packagist.org/p/.*json') +@visit_router.route("https://packagist.org/p/.*json") class PackageVisitor(HttpJsonVisitor): - """ - Collect JSON for a package. - """ + """Collect JSON for a package.""" + # FIXME: what about having a download URL to fetch the real package??? pass -@map_router.route('https://packagist.org/p/.*json') +@map_router.route("https://packagist.org/p/.*json") class PackagistPackageMapper(Mapper): - def get_packages(self, uri, resource_uri): """ Yield Package built from resource_uri record for a single @@ -85,59 +85,64 @@ def build_packages_with_json(metadata, purl=None, uri=None): metadata: json metadata content purl: String value of the package url of the ResourceURI object """ - - package = metadata.get('package') + package = metadata.get("package") if package: - primary_language = package.get('language') - for version_content in package.get('versions').values(): + primary_language = package.get("language") + for version_content in package.get("versions").values(): common = dict( - datasource_id='php_composer_json', - type='composer', - name=version_content.get('name'), - description=version_content.get('description'), + datasource_id="php_composer_json", + type="composer", + name=version_content.get("name"), + description=version_content.get("description"), primary_language=primary_language, ) - common['version'] = version_content.get('version') - common['keywords'] = version_content.get('keywords') - common['homepage_url'] = version_content.get('homepage') + common["version"] = version_content.get("version") + common["keywords"] = version_content.get("keywords") + common["homepage_url"] = version_content.get("homepage") - source = version_content.get('source') + source = version_content.get("source") if source: - if source.get('type') == 'git' and source.get('url'): - common['vcs_url'] = form_vcs_url('git', source.get('url')) + if source.get("type") == "git" and source.get("url"): + common["vcs_url"] = form_vcs_url("git", source.get("url")) else: pass # Packagist only has the github repo - dist = version_content.get('dist') + dist = version_content.get("dist") if dist: - common['download_url'] = dist.get('url') - common['sha1'] = dist.get('shasum') + common["download_url"] = dist.get("url") + 
common["sha1"] = dist.get("shasum") - for author in version_content.get('authors', []): - parties = common.get('parties') + for author in version_content.get("authors", []): + parties = common.get("parties") if not parties: - common['parties'] = [] - common['parties'].append( - scan_models.Party(name=author.get('name'), role='author', url=author.get( - 'homepage'), email=author.get('email')).to_dict() + common["parties"] = [] + common["parties"].append( + scan_models.Party( + name=author.get("name"), + role="author", + url=author.get("homepage"), + email=author.get("email"), + ).to_dict() ) extracted_license_statement = set([]) - for lic in version_content.get('license'): + for lic in version_content.get("license"): extracted_license_statement.add(lic) if extracted_license_statement: - common['extracted_license_statement'] = list( - extracted_license_statement) - common['license_detections'] = [] + common["extracted_license_statement"] = list( + extracted_license_statement + ) + common["license_detections"] = [] dependencies = [] - for name, version in version_content.get('require', {}).items(): + for name, version in version_content.get("require", {}).items(): dependencies.append( DependentPackage( - purl=name, extracted_requirement=version, scope='runtime').to_dict() + purl=name, extracted_requirement=version, scope="runtime" + ).to_dict() ) if dependencies: - common['dependencies'] = dependencies + common["dependencies"] = dependencies # FIXME: We should create a composer package package = scan_models.Package.from_package_data( package_data=common, diff --git a/minecode/miners/pypi.py b/minecode/miners/pypi.py index 56f6244c..98876a21 100644 --- a/minecode/miners/pypi.py +++ b/minecode/miners/pypi.py @@ -14,17 +14,16 @@ from packagedcode import models as scan_models from packageurl import PackageURL -from minecode import seed from minecode import map_router +from minecode import seed from minecode import visit_router -from minecode.utils import get_temp_file -from minecode.miners import Mapper -from minecode.miners import HttpJsonVisitor from minecode.miners import URI +from minecode.miners import HttpJsonVisitor +from minecode.miners import Mapper from minecode.miners import Visitor +from minecode.utils import get_temp_file from minecode.utils import parse_date - """ Visitors for Pypi and Pypi-like Python package repositories. @@ -43,94 +42,108 @@ class PypiSeed(seed.Seeder): - def get_seeds(self): - yield 'https://pypi.python.org/pypi/' + yield "https://pypi.python.org/pypi/" -@visit_router.route('https://pypi.python.org/pypi/') +@visit_router.route("https://pypi.python.org/pypi/") class PypiIndexVisitor(Visitor): - """ - Collect package metadata URIs from the top level pypi index for each package. - """ + """Collect package metadata URIs from the top level pypi index for each package.""" + def fetch(self, uri, timeout=None): - """ - Specialized fetching using XML RPCs. - """ + """Specialized fetching using XML RPCs.""" packages = xmlrpc.client.ServerProxy(uri).list_packages() content = list(packages) - temp_file = get_temp_file('PypiIndexVisitor') - with codecs.open(temp_file, mode='wb', encoding='utf-8') as expect: - json.dump(content, expect, indent=2, separators=(',', ':')) + temp_file = get_temp_file("PypiIndexVisitor") + with codecs.open(temp_file, mode="wb", encoding="utf-8") as expect: + json.dump(content, expect, indent=2, separators=(",", ":")) return temp_file def dumps(self, content): - """ - The content is huge json and should not be dumped. 
- """ + """The content is huge json and should not be dumped.""" return None def get_uris(self, content): - with codecs.open(content, mode='rb', encoding='utf-8') as contentfile: + with codecs.open(content, mode="rb", encoding="utf-8") as contentfile: packages_list = json.load(contentfile) - url_template = 'https://pypi.python.org/pypi/{name}/json' + url_template = "https://pypi.python.org/pypi/{name}/json" for name in packages_list: - package_url = PackageURL(type='pypi', name=name).to_string() - yield URI(uri=url_template.format(name=name), package_url=package_url, source_uri=self.uri) + package_url = PackageURL(type="pypi", name=name).to_string() + yield URI( + uri=url_template.format(name=name), + package_url=package_url, + source_uri=self.uri, + ) -@visit_router.route('https://pypi.python.org/pypi/[^/]+/json') +@visit_router.route("https://pypi.python.org/pypi/[^/]+/json") class PypiPackageVisitor(HttpJsonVisitor): """ Collect package metadata URIs for all release of a single Pypi package. The url will contain only the package name, for example: https://pypi.org/pypi/vmock/json By parsing the content, the goal is to form the json with version/release: https://pypi.org/pypi/vmock/0.1/json """ - def get_uris(self, content): - url_template = 'https://pypi.python.org/pypi/{name}/{release}/json' - info = content.get('info', {}) - name = info.get('name') + def get_uris(self, content): + url_template = "https://pypi.python.org/pypi/{name}/{release}/json" + info = content.get("info", {}) + name = info.get("name") if name: - for release in content['releases']: - package_url = PackageURL(type='pypi', name=name, version=release).to_string() - yield URI(uri=url_template.format(name=name, release=release), package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://pypi.python.org/pypi/[^/]+/[^/]+/json') + for release in content["releases"]: + package_url = PackageURL( + type="pypi", name=name, version=release + ).to_string() + yield URI( + uri=url_template.format(name=name, release=release), + package_url=package_url, + source_uri=self.uri, + ) + + +@visit_router.route("https://pypi.python.org/pypi/[^/]+/[^/]+/json") class PypiPackageReleaseVisitor(HttpJsonVisitor): """ Collect package download URIs for all packages archives of one Pypi package release. The example is: https://pypi.org/pypi/vmock/0.1/json """ + def get_uris(self, content): # TODO: this is likely best ignored entirely??? 
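+        # Illustrative sketch of the JSON this visitor consumes; the field
+        # names match the lookups below, but the values are made up:
+        #
+        #   {
+        #       "info": {"name": "vmock", "version": "0.1", "download_url": "UNKNOWN"},
+        #       "urls": [
+        #           {"url": "https://.../vmock-0.1.tar.gz",
+        #            "filename": "vmock-0.1.tar.gz", "size": 12345,
+        #            "upload_time": "2010-01-01T00:00:00", "md5_digest": "abc123"}
+        #       ]
+        #   }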
# A download_url may be provided for an off-Pypi-download - info = content.get('info', {}) - name = info.get('name') + info = content.get("info", {}) + name = info.get("name") version = None - download_url = info.get('download_url') - if download_url and download_url != 'UNKNOWN': - version = info.get('version') - package_url = PackageURL(type='pypi', name=name, version=version).to_string() + download_url = info.get("download_url") + if download_url and download_url != "UNKNOWN": + version = info.get("version") + package_url = PackageURL( + type="pypi", name=name, version=version + ).to_string() yield URI(uri=download_url, package_url=package_url, source_uri=self.uri) # Common on-Pypi-download URLs are in the urls block - for download in content.get('urls', {}): - url = download.get('url') + for download in content.get("urls", {}): + url = download.get("url") if not url: continue - package_url = PackageURL(type='pypi', name=name, version=version).to_string() - yield URI(url, package_url=package_url, file_name=download.get('filename'), - size=download.get('size'), date=download.get('upload_time'), - md5=download.get('md5_digest'), source_uri=self.uri) - - -@map_router.route('https://pypi.python.org/pypi/[^/]+/[^/]+/json') + package_url = PackageURL( + type="pypi", name=name, version=version + ).to_string() + yield URI( + url, + package_url=package_url, + file_name=download.get("filename"), + size=download.get("size"), + date=download.get("upload_time"), + md5=download.get("md5_digest"), + source_uri=self.uri, + ) + + +@map_router.route("https://pypi.python.org/pypi/[^/]+/[^/]+/json") class PypiPackageMapper(Mapper): - def get_packages(self, uri, resource_uri): """ Yield ScannedPackages built from resource_uri record for a single @@ -155,56 +168,63 @@ def build_packages(metadata, purl=None): purl: String value of the package url of the ResourceURI object """ - info = metadata['info'] + info = metadata["info"] # mapping of information that are common to all the downloads of a version - short_desc = info.get('summary') - long_desc = info.get('description') + short_desc = info.get("summary") + long_desc = info.get("description") descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) + description = "\n".join(descriptions) common_data = dict( - name=info['name'], - version=info['version'], + name=info["name"], + version=info["version"], description=description, - homepage_url=info.get('home_page'), - bug_tracking_url=info.get('bugtrack_url'), + homepage_url=info.get("home_page"), + bug_tracking_url=info.get("bugtrack_url"), ) - author = info.get('author') - email = info.get('author_email') + author = info.get("author") + email = info.get("author_email") if author or email: - parties = common_data.get('parties') + parties = common_data.get("parties") if not parties: - common_data['parties'] = [] - common_data['parties'].append(scan_models.Party( - type=scan_models.party_person, name=author, role='author', email=email)) + common_data["parties"] = [] + common_data["parties"].append( + scan_models.Party( + type=scan_models.party_person, name=author, role="author", email=email + ) + ) - maintainer = info.get('maintainer') - email = info.get('maintainer_email') + maintainer = info.get("maintainer") + email = info.get("maintainer_email") if maintainer or email: - parties = common_data.get('parties') + parties = common_data.get("parties") if not parties: - common_data['parties'] = [] - common_data['parties'].append(scan_models.Party( - 
type=scan_models.party_person, name=maintainer, role='maintainer', email=email)) + common_data["parties"] = [] + common_data["parties"].append( + scan_models.Party( + type=scan_models.party_person, + name=maintainer, + role="maintainer", + email=email, + ) + ) extracted_license_statement = [] - lic = info.get('license') - if lic and lic != 'UNKNOWN': + lic = info.get("license") + if lic and lic != "UNKNOWN": extracted_license_statement.append(lic) - classifiers = info.get('classifiers') + classifiers = info.get("classifiers") if classifiers and not extracted_license_statement: - licenses = [ - lic for lic in classifiers if lic.lower().startswith('license')] + licenses = [lic for lic in classifiers if lic.lower().startswith("license")] for lic in licenses: extracted_license_statement.append(lic) - common_data['extracted_license_statement'] = extracted_license_statement + common_data["extracted_license_statement"] = extracted_license_statement - kw = info.get('keywords') + kw = info.get("keywords") if kw: - common_data['keywords'] = [k.strip() - for k in kw.split(',') if k.strip()] + common_data["keywords"] = [k.strip() for k in kw.split(",") if k.strip()] # FIXME: we should either support "extra" data in a ScannedPackage or just ignore this kind of FIXME comments for now @@ -219,37 +239,37 @@ def build_packages(metadata, purl=None): # A download_url may be provided for off Pypi download: we yield a package if relevant # FIXME: do not prioritize the download_url outside Pypi over actual exact Pypi donwload URL - download_url = info.get('download_url') - if download_url and download_url != 'UNKNOWN': + download_url = info.get("download_url") + if download_url and download_url != "UNKNOWN": download_data = dict( - datasource_id='pypi_sdist_pkginfo', - type='pypi', + datasource_id="pypi_sdist_pkginfo", + type="pypi", download_url=download_url, ) download_data.update(common_data) package = scan_models.PackageData.from_data(download_data) # TODO: Consider creating a DatafileHandler for PyPI API metadata - package.datasource_id = 'pypi_api_metadata' + package.datasource_id = "pypi_api_metadata" package.set_purl(purl) yield package # yield a package for each download URL - for download in metadata['urls']: - url = download.get('url') + for download in metadata["urls"]: + url = download.get("url") if not url: continue download_data = dict( download_url=url, - size=download.get('size'), - release_date=parse_date(download.get('upload_time')), - datasource_id='pypi_sdist_pkginfo', - type='pypi', + size=download.get("size"), + release_date=parse_date(download.get("upload_time")), + datasource_id="pypi_sdist_pkginfo", + type="pypi", ) # TODO: Check for other checksums - download_data['md5'] = download.get('md5_digest') + download_data["md5"] = download.get("md5_digest") download_data.update(common_data) package = scan_models.PackageData.from_data(download_data) - package.datasource_id = 'pypi_api_metadata' + package.datasource_id = "pypi_api_metadata" package.set_purl(purl) yield package diff --git a/minecode/miners/repodata.py b/minecode/miners/repodata.py index 87053e1d..e40eb788 100644 --- a/minecode/miners/repodata.py +++ b/minecode/miners/repodata.py @@ -2,9 +2,6 @@ # Copyright (c) nexB Inc. and others. All rights reserved. 
# -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals import logging import posixpath @@ -19,9 +16,7 @@ def remove_list_repetitions(input_list): - """ - Removes the repeated items in a list and returns a list with unique values - """ + """Removes the repeated items in a list and returns a list with unique values""" output = [] for item in input_list: if item not in output: @@ -37,9 +32,12 @@ def combine_dicts_using_pkgid(all_dicts): """ all_package_info = [] for package_info in all_dicts: - if package_info['pkgid']: - all_package_info.append(combine_list_of_dicts( - [a for a in all_dicts if a['pkgid'] == package_info['pkgid']])) + if package_info["pkgid"]: + all_package_info.append( + combine_list_of_dicts( + [a for a in all_dicts if a["pkgid"] == package_info["pkgid"]] + ) + ) return remove_list_repetitions(all_package_info) @@ -63,19 +61,17 @@ def convert_tuples_to_dict(input, attr_name=None): infos = {} if input: if not attr_name: - attr_name = '' + attr_name = "" else: - attr_name = '_' + attr_name + attr_name = "_" + attr_name for attrib, value in input: infos[attrib + attr_name] = value return infos def is_absolute(url): - """ - Return 'True' if the URL is absolute. - """ - schemes = ('http://', 'ftp://', 'https://') + """Return 'True' if the URL is absolute.""" + schemes = ("http://", "ftp://", "https://") return url.startswith(schemes) @@ -86,8 +82,8 @@ def build_rpm_download_url(base_url, href): """ if is_absolute(href): return href - if href.startswith('/'): - href = href.lstrip('/') + if href.startswith("/"): + href = href.lstrip("/") return posixpath.join(base_url, href) @@ -115,15 +111,17 @@ def get_url_for_tag(location, data_type): """ repomd = etree.parse(location).getroot() - for data_tag in repomd.findall('{http://linux.duke.edu/metadata/repo}data'): + for data_tag in repomd.findall("{http://linux.duke.edu/metadata/repo}data"): for attrib, value in data_tag.items(): - if attrib == 'type' and value == data_type: + if attrib == "type" and value == data_type: download_location = data_tag.find( - '{http://linux.duke.edu/metadata/repo}location') + "{http://linux.duke.edu/metadata/repo}location" + ) relative_url_info = convert_tuples_to_dict( - download_location.items(), 'location') + download_location.items(), "location" + ) if relative_url_info: - return relative_url_info['href_location'] + return relative_url_info["href_location"] def get_value_from_tuple_pairs(tuples, key): @@ -153,24 +151,26 @@ def filelistsxml_parser(location): """ infos = [] filelistsxml = etree.parse(location).getroot() - for package in filelistsxml.findall('{http://linux.duke.edu/metadata/filelists}package'): - version = package.find( - '{http://linux.duke.edu/metadata/filelists}version') + for package in filelistsxml.findall( + "{http://linux.duke.edu/metadata/filelists}package" + ): + version = package.find("{http://linux.duke.edu/metadata/filelists}version") package_info = dict(package.items() + version.items()) directory_listing = package.findall( - '{http://linux.duke.edu/metadata/filelists}file') + "{http://linux.duke.edu/metadata/filelists}file" + ) directories = [] files = [] for name in directory_listing: items = name.items() if items: - file_type = get_value_from_tuple_pairs(items, 'type') - if file_type == 'dir': - directories.append({'name': name.text}) + file_type = get_value_from_tuple_pairs(items, "type") + if file_type == "dir": + directories.append({"name": name.text}) else: - files.append({'name': name.text}) - 
package_info['directories'] = directories - package_info['files'] = files + files.append({"name": name.text}) + package_info["directories"] = directories + package_info["files"] = files infos.append(package_info) return infos @@ -196,67 +196,54 @@ def primaryxml_parser(location): """ pkgs_infos = [] primaryxml = etree.parse(location).getroot() - for package in primaryxml.findall('{http://linux.duke.edu/metadata/common}package'): + for package in primaryxml.findall("{http://linux.duke.edu/metadata/common}package"): package_info = dict(package.items()) tags_infos = [] - description = package.find( - '{http://linux.duke.edu/metadata/common}description') - summary = package.find( - '{http://linux.duke.edu/metadata/common}summary') - packager = package.find( - '{http://linux.duke.edu/metadata/common}packager') - url = package.find('{http://linux.duke.edu/metadata/common}url') - size = package.find('{http://linux.duke.edu/metadata/common}size') - time = package.find('{http://linux.duke.edu/metadata/common}time') + description = package.find("{http://linux.duke.edu/metadata/common}description") + summary = package.find("{http://linux.duke.edu/metadata/common}summary") + packager = package.find("{http://linux.duke.edu/metadata/common}packager") + url = package.find("{http://linux.duke.edu/metadata/common}url") + size = package.find("{http://linux.duke.edu/metadata/common}size") + time = package.find("{http://linux.duke.edu/metadata/common}time") download_location = package.find( - '{http://linux.duke.edu/metadata/common}location') - checksum = package.find( - '{http://linux.duke.edu/metadata/common}checksum') - - rpm_format = package.find( - '{http://linux.duke.edu/metadata/common}format') - buildhost = rpm_format.find( - '{http://linux.duke.edu/metadata/rpm}buildhost') - rpm_group = rpm_format.find( - '{http://linux.duke.edu/metadata/rpm}group') + "{http://linux.duke.edu/metadata/common}location" + ) + checksum = package.find("{http://linux.duke.edu/metadata/common}checksum") + + rpm_format = package.find("{http://linux.duke.edu/metadata/common}format") + buildhost = rpm_format.find("{http://linux.duke.edu/metadata/rpm}buildhost") + rpm_group = rpm_format.find("{http://linux.duke.edu/metadata/rpm}group") header_range = rpm_format.find( - '{http://linux.duke.edu/metadata/rpm}header-range') - rpm_license = rpm_format.find( - '{http://linux.duke.edu/metadata/rpm}license') - rpm_vendor = rpm_format.find( - '{http://linux.duke.edu/metadata/rpm}vendor') - source_rpm = rpm_format.find( - '{http://linux.duke.edu/metadata/rpm}sourcerpm') - - package_info['description'] = get_tag_text(description) - package_info['summary'] = get_tag_text(summary) - package_info['url'] = get_tag_text(url) - package_info['checksum'] = get_tag_text(checksum) - package_info['pkgid'] = get_tag_text(checksum) - package_info['buildhost'] = get_tag_text(buildhost) - package_info['group'] = get_tag_text(rpm_group) - package_info['license'] = get_tag_text(rpm_license) - package_info['sourcerpm'] = get_tag_text(source_rpm) - tags_infos.append(convert_tuples_to_dict(packager.items(), 'packager')) - tags_infos.append(convert_tuples_to_dict(size.items(), 'size')) - tags_infos.append(convert_tuples_to_dict(time.items(), 'time')) + "{http://linux.duke.edu/metadata/rpm}header-range" + ) + rpm_license = rpm_format.find("{http://linux.duke.edu/metadata/rpm}license") + rpm_vendor = rpm_format.find("{http://linux.duke.edu/metadata/rpm}vendor") + source_rpm = rpm_format.find("{http://linux.duke.edu/metadata/rpm}sourcerpm") + + 
package_info["description"] = get_tag_text(description) + package_info["summary"] = get_tag_text(summary) + package_info["url"] = get_tag_text(url) + package_info["checksum"] = get_tag_text(checksum) + package_info["pkgid"] = get_tag_text(checksum) + package_info["buildhost"] = get_tag_text(buildhost) + package_info["group"] = get_tag_text(rpm_group) + package_info["license"] = get_tag_text(rpm_license) + package_info["sourcerpm"] = get_tag_text(source_rpm) + tags_infos.append(convert_tuples_to_dict(packager.items(), "packager")) + tags_infos.append(convert_tuples_to_dict(size.items(), "size")) + tags_infos.append(convert_tuples_to_dict(time.items(), "time")) tags_infos.append(convert_tuples_to_dict(download_location.items())) - tags_infos.append( - convert_tuples_to_dict(header_range.items(), 'header_range')) - tags_infos.append(convert_tuples_to_dict(rpm_vendor.items(), 'vendor')) - - requires = rpm_format.find( - '{http://linux.duke.edu/metadata/rpm}requires') - provides = rpm_format.find( - '{http://linux.duke.edu/metadata/rpm}provides') + tags_infos.append(convert_tuples_to_dict(header_range.items(), "header_range")) + tags_infos.append(convert_tuples_to_dict(rpm_vendor.items(), "vendor")) + + requires = rpm_format.find("{http://linux.duke.edu/metadata/rpm}requires") + provides = rpm_format.find("{http://linux.duke.edu/metadata/rpm}provides") if requires is not None: - required_rpms = [ - convert_tuples_to_dict(rpm.items()) for rpm in requires] - package_info['required_rpms'] = required_rpms + required_rpms = [convert_tuples_to_dict(rpm.items()) for rpm in requires] + package_info["required_rpms"] = required_rpms if provides is not None: - provided_rpms = [ - convert_tuples_to_dict(rpm.items()) for rpm in provides] - package_info['provided_rpms'] = provided_rpms + provided_rpms = [convert_tuples_to_dict(rpm.items()) for rpm in provides] + package_info["provided_rpms"] = provided_rpms package_info = combine_list_of_dicts([package_info] + tags_infos) pkgs_infos.append(package_info) @@ -280,28 +267,27 @@ def otherxml_parser(location): """ otherxml = etree.parse(location).getroot() infos = [] - for package in otherxml.findall('{http://linux.duke.edu/metadata/other}package'): - version = package.find('{http://linux.duke.edu/metadata/other}version') + for package in otherxml.findall("{http://linux.duke.edu/metadata/other}package"): + version = package.find("{http://linux.duke.edu/metadata/other}version") package_info = dict(package.items() + version.items()) - changelogs = package.findall( - '{http://linux.duke.edu/metadata/other}changelog') - package_info['changelogs'] = [] + changelogs = package.findall("{http://linux.duke.edu/metadata/other}changelog") + package_info["changelogs"] = [] for changelog in changelogs: if changelog.items(): change_info = convert_tuples_to_dict(changelog.items()) - change_info['changelog'] = changelog.text - package_info['changelogs'].append(change_info) + change_info["changelog"] = changelog.text + package_info["changelogs"].append(change_info) else: - package_info['changelogs'].append( - {'changelog': changelog.text}) + package_info["changelogs"].append({"changelog": changelog.text}) infos.append(package_info) return infos def get_pkg_infos(filelists_xml, primary_xml, other_xml): - primaryxml_dicts = primaryxml_parser(primary_xml) otherxml_dicts = otherxml_parser(other_xml) filelistsxml_dicts = filelistsxml_parser(filelists_xml) - return combine_dicts_using_pkgid(primaryxml_dicts + otherxml_dicts + filelistsxml_dicts) + return combine_dicts_using_pkgid( + 
primaryxml_dicts + otherxml_dicts + filelistsxml_dicts + ) diff --git a/minecode/miners/repodata_rpms.py b/minecode/miners/repodata_rpms.py index f4c4e883..f0ea9f32 100644 --- a/minecode/miners/repodata_rpms.py +++ b/minecode/miners/repodata_rpms.py @@ -2,12 +2,9 @@ # Copyright (c) 2016 by nexB, Inc. http://www.nexb.com/ - All rights reserved. # -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals -from minecode import seed from minecode import rsync +from minecode import seed from minecode import visit_router from minecode.miners import URI @@ -17,23 +14,22 @@ """ rsync_urls = ( - 'rsync://mirrors.kernel.org/centos/', - 'rsync://yum.postgresql.org', - 'rsync://www.fedora.is/fedora/', - 'rsync://rsync.opensuse.org/', + "rsync://mirrors.kernel.org/centos/", + "rsync://yum.postgresql.org", + "rsync://www.fedora.is/fedora/", + "rsync://rsync.opensuse.org/", ) class RPMRepoDataSeed(seed.Seeder): - def get_seeds(self): - yield 'rsync://mirrors.kernel.org/centos/' - yield 'rsync://yum.postgresql.org' - yield 'rsync://www.fedora.is/fedora/' - yield 'rsync://rsync.opensuse.org/' + yield "rsync://mirrors.kernel.org/centos/" + yield "rsync://yum.postgresql.org" + yield "rsync://www.fedora.is/fedora/" + yield "rsync://rsync.opensuse.org/" -def collect_rsync_urls(directory_listing, base_url, file_names=('repomd.xml',)): +def collect_rsync_urls(directory_listing, base_url, file_names=("repomd.xml",)): """ Given an rsync URI that may contain files with path ending with any of the 'path_ends' tuple yield URIs using the 'base_url' as the base. @@ -42,13 +38,21 @@ def collect_rsync_urls(directory_listing, base_url, file_names=('repomd.xml',)): for entry in rsync.directory_entries(directory_listing): # FIXME: why this assert? 
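+        # Illustrative note: rsync directory entries are expected to carry
+        # repository-relative paths, for example (made-up entry):
+        #
+        #   entry = {"path": "7/os/x86_64/repodata/repomd.xml"}
+        #
+        # so that base_url + entry["path"] below forms a fetchable URL.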
- assert not entry['path'].startswith('/') - if entry['path'].endswith(file_names): - entry = base_url + entry['path'] + assert not entry["path"].startswith("/") + if entry["path"].endswith(file_names): + entry = base_url + entry["path"] yield URI(uri=entry) @visit_router.route(*rsync_urls) -def collect_repomd_urls(uri, file_names=('repomd.xml',)): +def collect_repomd_urls(uri, file_names=("repomd.xml",)): directory_listing = rsync.fetch_directory(uri) - return collect_rsync_urls(directory_listing, base_url=uri.replace('rsync://', 'http://'), file_names=file_names), None, None + return ( + collect_rsync_urls( + directory_listing, + base_url=uri.replace("rsync://", "http://"), + file_names=file_names, + ), + None, + None, + ) diff --git a/minecode/miners/repomd.py b/minecode/miners/repomd.py index 1a092c9c..73c23fce 100644 --- a/minecode/miners/repomd.py +++ b/minecode/miners/repomd.py @@ -11,21 +11,17 @@ import logging import os -from minecode import rsync - from commoncode import fileutils from packagedcode.models import PackageData from packagedcode.rpm import EVR -from minecode import seed from minecode import map_router from minecode import visit_router +from minecode.miners import URI +from minecode.miners import repodata from minecode.utils import extract_file from minecode.utils import fetch_http from minecode.utils import get_temp_file -from minecode.miners import URI -from minecode.miners import repodata - logger = logging.getLogger(__name__) @@ -43,41 +39,44 @@ def download(uri): """ name = fileutils.file_name(uri) file_ext = fileutils.file_extension(name) - name = name.replace(file_ext, '') + name = name.replace(file_ext, "") content = fetch_http(uri) - temp_file = get_temp_file(file_name='minecode-fetched-file-' + name, extension=file_ext) - with open(temp_file, 'wb') as tmp: + temp_file = get_temp_file( + file_name="minecode-fetched-file-" + name, extension=file_ext + ) + with open(temp_file, "wb") as tmp: tmp.write(content) file_name = tmp.name return file_name def generate_rpm_objects(package_infos, base_url): - """ - Yield Packages from an iterable of RPM infos given a base_url. - """ + """Yield Packages from an iterable of RPM infos given a base_url.""" # FIXME: what does package_infos mean? wheer does it come from? for infos in package_infos: package_data = dict( # FIXME: need to add id back? this is id is some hash which is local to the repo. # id=infos.get('pkgid'), - type='rpm', - name=infos.get('name'), - version=EVR(epoch=infos.get('epoch'), version=infos.get( - 'ver'), release=infos.get('rel')).to_string(), - description=infos.get('description'), - homepage_url=infos.get('url'), - download_url=repodata.build_rpm_download_url( - base_url, infos.get('href')), - extracted_license_statement = infos.get('license', '') + type="rpm", + name=infos.get("name"), + version=EVR( + epoch=infos.get("epoch"), + version=infos.get("ver"), + release=infos.get("rel"), + ).to_string(), + description=infos.get("description"), + homepage_url=infos.get("url"), + download_url=repodata.build_rpm_download_url(base_url, infos.get("href")), + extracted_license_statement=infos.get("license", ""), ) package = PackageData.from_data(package_data) - if infos.get('source_rpm'): - src_rpm = PackageData(name=infos.get('source_rpm')) + if infos.get("source_rpm"): + src_rpm = PackageData(name=infos.get("source_rpm")) package.related_packages = [src_rpm] yield package + # TODO: refactor, this does not make sense, each are different URIs? 
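+# For reference, a repomd.xml index points to the per-type metadata files that
+# fetch_repomd_subfile() retrieves; a trimmed, illustrative example:
+#
+#   <repomd xmlns="http://linux.duke.edu/metadata/repo">
+#     <data type="primary">
+#       <location href="repodata/xxx-primary.xml.gz"/>
+#     </data>
+#     <data type="filelists">...</data>
+#     <data type="other">...</data>
+#   </repomd>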
# FIXME: the doc and semantics are cryptic too @@ -92,17 +91,15 @@ def fetch_repomd_subfile(base_url, repomd_xml, subfile): return os.path.join(target_location, os.listdir(target_location)[0]) -@visit_router.route('.+/repomd.xml') +@visit_router.route(".+/repomd.xml") def collect_rpm_packages_from_repomd(uri): - """ - Collect RPM data from yum repository repomd.xml. - """ + """Collect RPM data from yum repository repomd.xml.""" base_url = fileutils.parent_directory(fileutils.parent_directory(uri)) repomd_xml = download(uri) - filelists_xml = fetch_repomd_subfile(base_url, repomd_xml, 'filelists') - primary_xml = fetch_repomd_subfile(base_url, repomd_xml, 'primary') - other_xml = fetch_repomd_subfile(base_url, repomd_xml, 'other') + filelists_xml = fetch_repomd_subfile(base_url, repomd_xml, "filelists") + primary_xml = fetch_repomd_subfile(base_url, repomd_xml, "primary") + other_xml = fetch_repomd_subfile(base_url, repomd_xml, "other") pkg_infos = repodata.get_pkg_infos(filelists_xml, primary_xml, other_xml) @@ -114,11 +111,9 @@ def collect_rpm_packages_from_repomd(uri): return uris, json.dumps([r.to_dict() for r in rpms]), None -@map_router.route('.+/repomd.xml') +@map_router.route(".+/repomd.xml") def map_repomd_data(uris, resource_uri): - """ - Returns a list of RpmPackage objects collected from visitors. - """ + """Returns a list of RpmPackage objects collected from visitors.""" if not resource_uri.data: return packages = [] @@ -126,6 +121,6 @@ def map_repomd_data(uris, resource_uri): # 'name' is required for every package # FIXME: how could we obtain a package without a name??? # FIXME: This cannot work unless we use **pkg_data - if pkg_data.get('name'): + if pkg_data.get("name"): packages.append(PackageData(pkg_data)) return packages diff --git a/minecode/miners/rubygems.py b/minecode/miners/rubygems.py index 8662c5b5..3a9e94d2 100644 --- a/minecode/miners/rubygems.py +++ b/minecode/miners/rubygems.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # purldb is a trademark of nexB Inc. @@ -14,25 +13,24 @@ import logging import os -from rubymarshal import reader -from rubymarshal.classes import UsrMarshal +import saneyaml from packagedcode import models as scan_models from packagedcode.models import DependentPackage from packagedcode.models import PackageData from packageurl import PackageURL -import saneyaml +from rubymarshal import reader +from rubymarshal.classes import UsrMarshal -from minecode import seed from minecode import map_router +from minecode import seed from minecode import visit_router -from minecode.utils import extract_file -from minecode.miners import Mapper +from minecode.miners import URI from minecode.miners import HttpJsonVisitor +from minecode.miners import Mapper from minecode.miners import NonPersistentHttpVisitor -from minecode.miners import URI +from minecode.utils import extract_file from minecode.utils import parse_date - logger = logging.getLogger(__name__) handler = logging.StreamHandler() logger.addHandler(handler) @@ -42,57 +40,55 @@ # FIXME: we are missing several API calls: # http://guides.rubygems.org/rubygems-org-api/ -class RubyGemsSeed(seed.Seeder): +class RubyGemsSeed(seed.Seeder): def get_seeds(self): # We keep only specs.4.8.gz and exclude latest_spec.4.8.gz, # since specs.4.8.gz covers all uris in latest spec. 
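+        # The index is a gzipped Ruby Marshal dump; each entry is roughly a
+        # (name, version, platform) tuple, e.g. ("rails", <Gem::Version 7.0.4>,
+        # "ruby"), which RubyGemsIndexVisitor below unpacks with
+        # rubymarshal.reader.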
- yield 'http://rubygems.org/specs.4.8.gz' + yield "http://rubygems.org/specs.4.8.gz" class GemVersion(UsrMarshal): - def version(self): - return self.values['version'] + return self.values["version"] -@visit_router.route('https?://rubygems\.org/specs\.4\.8\.gz') +@visit_router.route(r"https?://rubygems\.org/specs\.4\.8\.gz") class RubyGemsIndexVisitor(NonPersistentHttpVisitor): - """ - Collect REST APIs URIs from RubyGems index file. - """ + """Collect REST APIs URIs from RubyGems index file.""" def get_uris(self, content): - with gzip.open(content, 'rb') as idx: + with gzip.open(content, "rb") as idx: index = idx.read() # TODO: use a purl!!! for name, version, platform in reader.loads(index): - json_url = 'https://rubygems.org/api/v1/versions/{name}.json'.format( - **locals()) + json_url = "https://rubygems.org/api/v1/versions/{name}.json".format( + **locals() + ) - package_url = PackageURL(type='gem', name=name).to_string() + package_url = PackageURL(type="gem", name=name).to_string() yield URI(uri=json_url, package_url=package_url, source_uri=self.uri) # note: this list only has ever a single value version = version.values[0] if isinstance(version, bytes): - version = version.decode('utf-8') + version = version.decode("utf-8") - download_url = 'https://rubygems.org/downloads/{name}-{version}' + download_url = "https://rubygems.org/downloads/{name}-{version}" if isinstance(platform, bytes): - platform = platform.decode('utf-8') - if platform != 'ruby': - download_url += '-{platform}' + platform = platform.decode("utf-8") + if platform != "ruby": + download_url += "-{platform}" - download_url += '.gem' + download_url += ".gem" download_url = download_url.format(**locals()) - package_url = PackageURL(type='gem', name=name, version=version).to_string() + package_url = PackageURL(type="gem", name=name, version=version).to_string() yield URI(uri=download_url, package_url=package_url, source_uri=self.uri) -@visit_router.route('https?://rubygems\.org/api/v1/versions/[\w\-\.]+.json') +@visit_router.route(r"https?://rubygems\.org/api/v1/versions/[\w\-\.]+.json") class RubyGemsApiManyVersionsVisitor(HttpJsonVisitor): """ Collect the json content of each version. @@ -101,20 +97,24 @@ class RubyGemsApiManyVersionsVisitor(HttpJsonVisitor): """ def get_uris(self, content): - """ - Yield URI of the gems url and data. - """ + """Yield URI of the gems url and data.""" # FIXME: return actual data too!!! 
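+        # Worked example of the slicing and URL building below, with
+        # illustrative values: for self.uri
+        # "https://rubygems.org/api/v1/versions/rails.json" the slice yields
+        # name "rails"; a version entry whose "number" is "7.0.4" then maps to
+        # "https://rubygems.org/downloads/rails-7.0.4.gem".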
for version_details in content: # get the gems name by parsing from the uri name = self.uri[ - self.uri.index('/versions/') + len('/versions/'):-len('.json')] - version = version_details.get('number') - gem_name = '%(name)s-%(version)s' % locals() - package_url = PackageURL(type='gem', name=name, version=version).to_string() - download_url = 'https://rubygems.org/downloads/%(gem_name)s.gem' % locals() - yield URI(uri=download_url, source_uri=self.uri, package_url=package_url, - data=json.dumps(version_details)) + self.uri.index("/versions/") + len("/versions/") : -len(".json") + ] + version = version_details.get("number") + gem_name = "%(name)s-%(version)s" % locals() + package_url = PackageURL(type="gem", name=name, version=version).to_string() + download_url = "https://rubygems.org/downloads/%(gem_name)s.gem" % locals() + yield URI( + uri=download_url, + source_uri=self.uri, + package_url=package_url, + data=json.dumps(version_details), + ) + # TODO: add API dependencies # https://rubygems.org/api/v1/dependencies.json?gems=file_validators @@ -122,11 +122,9 @@ def get_uris(self, content): # GET - /api/v2/rubygems/[GEM NAME]/versions/[VERSION NUMBER].(json|yaml) -@visit_router.route('https?://rubygems.org/downloads/[\w\-\.]+.gem') +@visit_router.route(r"https?://rubygems.org/downloads/[\w\-\.]+.gem") class RubyGemsPackageArchiveMetadataVisitor(NonPersistentHttpVisitor): - """ - Fetch a Rubygems gem archive, extract it and return its metadata file content. - """ + """Fetch a Rubygems gem archive, extract it and return its metadata file content.""" def dumps(self, content): return get_gem_metadata(content) @@ -139,27 +137,27 @@ def get_gem_metadata(location): """ # Extract the compressed file first. extracted_location = extract_file(location) - metadata_gz = os.path.join(extracted_location, 'metadata.gz') + metadata_gz = os.path.join(extracted_location, "metadata.gz") # Extract the embedded metadata gz file extract_parent_location = extract_file(metadata_gz) # Get the first file in the etracted folder which is the meta file location - meta_extracted_file = os.path.join(extract_parent_location, os.listdir(extract_parent_location)[0]) + meta_extracted_file = os.path.join( + extract_parent_location, os.listdir(extract_parent_location)[0] + ) with open(meta_extracted_file) as meta_file: return meta_file.read() -@map_router.route('https*://rubygems\.org/api/v1/versions/[\w\-\.]+.json') +@map_router.route(r"https*://rubygems\.org/api/v1/versions/[\w\-\.]+.json") class RubyGemsApiVersionsJsonMapper(Mapper): - """ - Mapper to build Rubygems Packages from JSON API data. 
- """ + """Mapper to build Rubygems Packages from JSON API data.""" def get_packages(self, uri, resource_uri): metadata = json.loads(resource_uri.data) - _, sep, namejson = uri.partition('versions/') + _, sep, namejson = uri.partition("versions/") if not sep: return - name, sep, _ = namejson.rpartition('.json') + name, sep, _ = namejson.rpartition(".json") if not sep: return return build_rubygem_packages_from_api_data(metadata, name) @@ -174,51 +172,50 @@ def build_rubygem_packages_from_api_data(metadata, name, purl=None): purl: String value of the package url of the ResourceURI object """ for version_details in metadata: - short_desc = version_details.get('summary') - long_desc = version_details.get('description') + short_desc = version_details.get("summary") + long_desc = version_details.get("description") if long_desc == short_desc: long_desc = None descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) + description = "\n".join(descriptions) package = dict( - type='gem', + type="gem", name=name, description=description, - version=version_details.get('number'), + version=version_details.get("number"), ) # FIXME: we are missing deps and more things such as download URL and more - if version_details.get('sha'): - package['sha256'] = version_details.get('sha') + if version_details.get("sha"): + package["sha256"] = version_details.get("sha") - package['release_date'] = parse_date( - version_details.get('created_at') or '') or None + package["release_date"] = ( + parse_date(version_details.get("created_at") or "") or None + ) - author = version_details.get('authors') + author = version_details.get("authors") if author: - parties = package.get('parties') + parties = package.get("parties") if not parties: - package['parties'] = [] - party = scan_models.Party(name=author, role='author') - package['parties'].append(party) + package["parties"] = [] + party = scan_models.Party(name=author, role="author") + package["parties"].append(party) extracted_license_statement = [] - licenses = version_details.get('licenses') + licenses = version_details.get("licenses") if licenses: for lic in licenses: extracted_license_statement.append(lic) if extracted_license_statement: - package['extracted_license_statement'] = extracted_license_statement + package["extracted_license_statement"] = extracted_license_statement package = PackageData.from_data(package) package.set_purl(purl) yield package -@map_router.route('https?://rubygems.org/downloads/[\w\-\.]+.gem') +@map_router.route(r"https?://rubygems.org/downloads/[\w\-\.]+.gem") class RubyGemsPackageArchiveMetadataMapper(Mapper): - """ - Mapper to build on e Package from the metadata file found inside a gem. 
- """ + """Mapper to build on e Package from the metadata file found inside a gem.""" def get_packages(self, uri, resource_uri): metadata = resource_uri.data @@ -236,48 +233,48 @@ def build_rubygem_packages_from_metadata(metadata, download_url=None, purl=None) if not content: return - name = content.get('name') - short_desc = content.get('summary') - long_desc = content.get('description') + name = content.get("name") + short_desc = content.get("summary") + long_desc = content.get("description") if long_desc == short_desc: long_desc = None descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) + description = "\n".join(descriptions) package = dict( - type='gem', + type="gem", name=name, description=description, - homepage_url=content.get('homepage'), + homepage_url=content.get("homepage"), ) if download_url: - package['download_url'] = download_url + package["download_url"] = download_url extracted_license_statement = [] - licenses = content.get('licenses') + licenses = content.get("licenses") if licenses: for lic in licenses: extracted_license_statement.append(lic) if extracted_license_statement: - package['extracted_license_statement'] = extracted_license_statement + package["extracted_license_statement"] = extracted_license_statement - authors = content.get('authors') + authors = content.get("authors") for author in authors: - parties = package.get('parties') + parties = package.get("parties") if not parties: - package['parties'] = [] - party = scan_models.Party(name=author, role='author') - package['parties'].append(party) + package["parties"] = [] + party = scan_models.Party(name=author, role="author") + package["parties"].append(party) # Release date in the form of `2010-02-01 00:00:00 -05:00` - release_date = content.get('date', '').split() - package['release_date'] = parse_date(release_date[0]) + release_date = content.get("date", "").split() + package["release_date"] = parse_date(release_date[0]) - package['dependencies'] = get_dependencies_from_meta(content) or [] + package["dependencies"] = get_dependencies_from_meta(content) or [] # This is a two level nenest item - version1 = content.get('version') or {} - version = version1.get('version') or None - package['version'] = version + version1 = content.get("version") or {} + version = version1.get("version") or None + package["version"] = version package = PackageData.from_data(package) package.set_purl(purl) yield package @@ -288,20 +285,20 @@ def get_dependencies_from_meta(content): Return a mapping of dependencies keyed by group based on the gem YAML metadata data structure. """ - dependencies = content.get('dependencies') or [] + dependencies = content.get("dependencies") or [] if not dependencies: return [] group = [] for dependency in dependencies: - name = dependency.get('name') or None + name = dependency.get("name") or None if not name: continue - requirement = dependency.get('requirement') or {} + requirement = dependency.get("requirement") or {} # FIXME when upating to the ScanCode package model - scope = dependency.get('type') - scope = scope and scope.lstrip(':') + scope = dependency.get("type") + scope = scope and scope.lstrip(":") # note that as weird artifact of our saneyaml YAML parsing, we are # getting both identical requirements and version_requirements mapping. 
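+        # Worked example of the constraint joining handled below, with
+        # illustrative values: requirements such as
+        #
+        #   [[">=", {"version": "3.0"}], ["<", {"version": "4.0"}]]
+        #
+        # join pairwise into ">= 3.0" and "< 4.0" and then into the single
+        # version_constraint ">= 3.0, < 4.0", while a bare
+        # [">=", {"version": "0"}] allows any version and is skipped as a
+        # no-op.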
@@ -310,23 +307,27 @@ def get_dependencies_from_meta(content): # [u'>=', {'version': '0'}] # ] # } - requirements = requirement.get('requirements') or [] + requirements = requirement.get("requirements") or [] version_constraint = [] # each requirement is [u'>=', {'version': '0'}] for constraint, req_version in requirements: - req_version = req_version.get('version') or None + req_version = req_version.get("version") or None # >= 0 allows for any version: we ignore these type of contrainsts # as this is the same as no constraints. We also ignore lack of # constraints and versions - if ((constraint == '>=' and req_version == '0') - or not (constraint and req_version)): + if (constraint == ">=" and req_version == "0") or not ( + constraint and req_version + ): continue - version_constraint.append(' '.join([constraint, req_version])) - version_constraint = ', '.join(version_constraint) or None + version_constraint.append(" ".join([constraint, req_version])) + version_constraint = ", ".join(version_constraint) or None - group.append(DependentPackage( - purl=name, extracted_requirement=version_constraint, scope=scope)) + group.append( + DependentPackage( + purl=name, extracted_requirement=version_constraint, scope=scope + ) + ) return group @@ -336,19 +337,19 @@ def get_dependencies_from_api(content): Return a mapping of dependencies keyed by group based on the RubyGems API data structure. """ - dependencies = content.get('dependencies') or [] + dependencies = content.get("dependencies") or [] if not dependencies: return {} group = [] for dependency in dependencies: - name = dependency.get('name') or None + name = dependency.get("name") or None if not name: continue - requirement = dependency.get('requirement') or {} - scope = dependency.get('type') - scope = scope and scope.lstrip(':') + requirement = dependency.get("requirement") or {} + scope = dependency.get("type") + scope = scope and scope.lstrip(":") # note that as weird artifact of our saneyaml YAML parsing, we are # getting both identical requirements and version_requirements mapping. @@ -357,57 +358,61 @@ def get_dependencies_from_api(content): # [u'>=', {'version': '0'}] # ] # } - requirements = requirement.get('requirements') or [] + requirements = requirement.get("requirements") or [] version_constraint = [] # each requirement is [u'>=', {'version': '0'}] for constraint, req_version in requirements: - req_version = req_version.get('version') or None + req_version = req_version.get("version") or None # >= 0 allows for any version: we ignore these type of contrainsts # as this is the same as no constraints. 
We also ignore lack of # constraints and versions - if ((constraint == '>=' and req_version == '0') - or not (constraint and req_version)): + if (constraint == ">=" and req_version == "0") or not ( + constraint and req_version + ): continue - version_constraint.append(' '.join([constraint, req_version])) - version_constraint = ', '.join(version_constraint) or None + version_constraint.append(" ".join([constraint, req_version])) + version_constraint = ", ".join(version_constraint) or None - group.append(DependentPackage( - purl=name, extracted_requirement=version_constraint, scope=scope)) + group.append( + DependentPackage( + purl=name, extracted_requirement=version_constraint, scope=scope + ) + ) return group # Structure: {gem_spec: license.key} LICENSES_MAPPING = { - 'None': None, - 'Apache 2.0': 'apache-2.0', - 'Apache License 2.0': 'apache-2.0', - 'Apache-2.0': 'apache-2.0', - 'Apache': 'apache-2.0', - 'GPL': 'gpl-2.0', - 'GPL-2': 'gpl-2.0', - 'GNU GPL v2': 'gpl-2.0', - 'GPLv2+': 'gpl-2.0-plus', - 'GPLv2': 'gpl-2.0', - 'GPLv3': 'gpl-3.0', - 'MIT': 'mit', - 'Ruby': 'ruby', - "same as ruby's": 'ruby', - 'Ruby 1.8': 'ruby', - 'Artistic 2.0': 'artistic-2.0', - 'Perl Artistic v2': 'artistic-2.0', - '2-clause BSDL': 'bsd-simplified', - 'BSD': 'bsd-new', - 'BSD-3': 'bsd-new', - 'ISC': 'isc', - 'SIL Open Font License': 'ofl-1.0', - 'New Relic': 'new-relic', - 'GPL2': 'gpl-2.0', - 'BSD-2-Clause': 'bsd-simplified', - 'BSD 2-Clause': 'bsd-simplified', - 'LGPL-3': 'lgpl-3.0', - 'LGPL-2.1+': 'lgpl-2.1-plus', - 'LGPLv2.1+': 'lgpl-2.1-plus', - 'LGPL': 'lgpl', - 'Unlicense': 'unlicense', + "None": None, + "Apache 2.0": "apache-2.0", + "Apache License 2.0": "apache-2.0", + "Apache-2.0": "apache-2.0", + "Apache": "apache-2.0", + "GPL": "gpl-2.0", + "GPL-2": "gpl-2.0", + "GNU GPL v2": "gpl-2.0", + "GPLv2+": "gpl-2.0-plus", + "GPLv2": "gpl-2.0", + "GPLv3": "gpl-3.0", + "MIT": "mit", + "Ruby": "ruby", + "same as ruby's": "ruby", + "Ruby 1.8": "ruby", + "Artistic 2.0": "artistic-2.0", + "Perl Artistic v2": "artistic-2.0", + "2-clause BSDL": "bsd-simplified", + "BSD": "bsd-new", + "BSD-3": "bsd-new", + "ISC": "isc", + "SIL Open Font License": "ofl-1.0", + "New Relic": "new-relic", + "GPL2": "gpl-2.0", + "BSD-2-Clause": "bsd-simplified", + "BSD 2-Clause": "bsd-simplified", + "LGPL-3": "lgpl-3.0", + "LGPL-2.1+": "lgpl-2.1-plus", + "LGPLv2.1+": "lgpl-2.1-plus", + "LGPL": "lgpl", + "Unlicense": "unlicense", } diff --git a/minecode/miners/sourceforge.py b/minecode/miners/sourceforge.py index 3f7deac5..9746c333 100644 --- a/minecode/miners/sourceforge.py +++ b/minecode/miners/sourceforge.py @@ -11,19 +11,17 @@ import re from bs4 import BeautifulSoup - from packagedcode import models as scan_models from packageurl import PackageURL -from minecode import seed from minecode import map_router +from minecode import seed from minecode import visit_router -from minecode.miners import Mapper +from minecode.miners import URI from minecode.miners import HttpJsonVisitor from minecode.miners import HttpVisitor +from minecode.miners import Mapper from minecode.miners import NonPersistentHttpVisitor -from minecode.miners import URI - logger = logging.getLogger(__name__) handler = logging.StreamHandler() @@ -32,12 +30,11 @@ class SourceforgeSeed(seed.Seeder): - def get_seeds(self): - yield 'https://sourceforge.net/sitemap.xml' + yield "https://sourceforge.net/sitemap.xml" -@visit_router.route('https?://sourceforge.net/sitemap.xml') +@visit_router.route("https?://sourceforge.net/sitemap.xml") class 
SourceforgeSitemapIndexVisitor(NonPersistentHttpVisitor): """ Collect sub-sitemaps from the main sitemap. Return on URI for each sub- @@ -49,10 +46,8 @@ class SourceforgeSitemapIndexVisitor(NonPersistentHttpVisitor): """ def get_uris(self, content): - """ - Collect all the sitemaps URIs from master sitemap. - """ - locs = BeautifulSoup(open(content), 'lxml').find_all('loc') + """Collect all the sitemaps URIs from master sitemap.""" + locs = BeautifulSoup(open(content), "lxml").find_all("loc") # Content passing from NonPersistentHttpVisitor is a temp file path # instead of file content, so opening to get a file handler is # necessary. @@ -60,42 +55,48 @@ def get_uris(self, content): yield URI(uri=loc.text, source_uri=self.uri) -@visit_router.route('https?://sourceforge.net/sitemap-\d+.xml') +@visit_router.route(r"https?://sourceforge.net/sitemap-\d+.xml") class SourceforgeSitemapPageVisitor(HttpVisitor): - def get_uris(self, content): - """ - Collect all the projects URIs from a sub-sitemaps. - """ - sitemap_locs = BeautifulSoup(content, 'lxml').find_all('loc') - regex = re.compile( - r"^https?://sourceforge.net/projects/[a-z0-9.-]+/?$") + """Collect all the projects URIs from a sub-sitemaps.""" + sitemap_locs = BeautifulSoup(content, "lxml").find_all("loc") + regex = re.compile(r"^https?://sourceforge.net/projects/[a-z0-9.-]+/?$") for loc in sitemap_locs: if loc.text and re.match(regex, loc.text): - project_json_baseurl = 'https://sourceforge.net/api/project/name/{}/json' - project_name = loc.text.partition( - 'https://sourceforge.net/projects/')[-1].strip('/') + project_json_baseurl = ( + "https://sourceforge.net/api/project/name/{}/json" + ) + project_name = loc.text.partition("https://sourceforge.net/projects/")[ + -1 + ].strip("/") project_json_url = project_json_baseurl.format(project_name) - package_url = PackageURL(type='sourceforge', name=project_name).to_string() + package_url = PackageURL( + type="sourceforge", name=project_name + ).to_string() # The priority in the xml has different view with the priority in visitor, so skip it. - yield URI(uri=project_json_url, package_url=package_url, source_uri=self.uri) + yield URI( + uri=project_json_url, package_url=package_url, source_uri=self.uri + ) -@visit_router.route('https?://sourceforge.net/api/project/name/[a-z0-9.-]+/json', - 'https?://sourceforge.net/rest/p/[a-z0-9.-]+' - ) +@visit_router.route( + "https?://sourceforge.net/api/project/name/[a-z0-9.-]+/json", + "https?://sourceforge.net/rest/p/[a-z0-9.-]+", +) class SourceforgeProjectJsonVisitor(HttpJsonVisitor): """ Collect Sourceforge project data through the JSON API. The implementation is empty since it will inherit the implementation from HttpJsonVisitor and it returns json data for mapper. 
""" + pass -@map_router.route('https?://sourceforge.net/api/project/name/[a-z0-9.-]+/json', - 'https?://sourceforge.net/rest/p/[a-z0-9.-]+') +@map_router.route( + "https?://sourceforge.net/api/project/name/[a-z0-9.-]+/json", + "https?://sourceforge.net/rest/p/[a-z0-9.-]+", +) class SourceforgeProjectJsonAPIMapper(Mapper): - def get_packages(self, uri, resource_uri): """ Yield Package built from resource_uri record for a single @@ -112,64 +113,64 @@ def build_packages_from_metafile(metadata, purl=None, uri=None): metadata: json metadata content purl: String value of the package url of the ResourceURI object """ - short_desc = metadata.get('summary') - long_desc = metadata.get('short_description') + short_desc = metadata.get("summary") + long_desc = metadata.get("short_description") descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) - name = metadata.get('shortname') + description = "\n".join(descriptions) + name = metadata.get("shortname") # short name is more reasonable here for name, since it's an abbreviation # for the project and unique if not name: - name = metadata.get('name') + name = metadata.get("name") if name: common_data = dict( - datasource_id='sourceforge_metadata', - type='sourceforge', - name=metadata.get('shortname', metadata.get('name')), + datasource_id="sourceforge_metadata", + type="sourceforge", + name=metadata.get("shortname", metadata.get("name")), description=description, - homepage_url=metadata.get( - 'external_homepage', metadata.get('url')), + homepage_url=metadata.get("external_homepage", metadata.get("url")), license_detections=[], ) - devs = metadata.get('developers') or [] + devs = metadata.get("developers") or [] for dev in devs: - parties = common_data.get('parties') + parties = common_data.get("parties") if not parties: - common_data['parties'] = [] - if dev.get('name'): - common_data['parties'].append( - scan_models.Party(name=dev.get( - 'name'), role='contributor', url=dev.get('url')).to_dict() + common_data["parties"] = [] + if dev.get("name"): + common_data["parties"].append( + scan_models.Party( + name=dev.get("name"), role="contributor", url=dev.get("url") + ).to_dict() ) - categories = metadata.get('categories', {}) - languages = categories.get('language', []) + categories = metadata.get("categories", {}) + languages = categories.get("language", []) langs = [] for lang in languages: - lshort = lang.get('shortname') + lshort = lang.get("shortname") if lshort: langs.append(lshort) - langs = ', '.join(langs) - common_data['primary_language'] = langs or None + langs = ", ".join(langs) + common_data["primary_language"] = langs or None extracted_license_statement = [] - licenses = categories.get('license') or [] + licenses = categories.get("license") or [] for l in licenses: - license_name = l.get('fullname') + license_name = l.get("fullname") # full name is first priority than shortname since shortname is like gpl, it doesn't show detailed gpl version etc. 
if license_name: - extracted_license_statement.append(l.get('shortname')) + extracted_license_statement.append(l.get("shortname")) if license_name: extracted_license_statement.append(license_name) if extracted_license_statement: - common_data['extracted_license_statement'] = extracted_license_statement + common_data["extracted_license_statement"] = extracted_license_statement keywords = [] - topics = categories.get('topic', []) + topics = categories.get("topic", []) for topic in topics: - keywords.append(topic.get('shortname')) - common_data['keywords'] = keywords or None + keywords.append(topic.get("shortname")) + common_data["keywords"] = keywords or None package = scan_models.Package.from_package_data( package_data=common_data, datafile_path=uri, diff --git a/minecode/miners/ubuntu.py b/minecode/miners/ubuntu.py index 3a8d9e74..9106db0b 100644 --- a/minecode/miners/ubuntu.py +++ b/minecode/miners/ubuntu.py @@ -2,8 +2,6 @@ # Copyright (c) 2014 by nexB, Inc. http://www.nexb.com/ - All rights reserved. # -from __future__ import absolute_import -from __future__ import unicode_literals # http://askubuntu.com/questions/139032/how-to-programmatically-fetch-a-list-of-applications-from-the-software-center diff --git a/minecode/model_utils.py b/minecode/model_utils.py index 74b2f975..218e5717 100644 --- a/minecode/model_utils.py +++ b/minecode/model_utils.py @@ -2,10 +2,14 @@ import logging import sys -from minecode.models import ScannableURI +from django.utils import timezone + from commoncode import fileutils +from packagedcode.models import PackageData from packageurl import normalize_qualifiers +from minecode.models import ScannableURI +from minecode.utils import stringify_null_purl_fields from packagedb.models import DependentPackage from packagedb.models import Package from packagedb.models import PackageContentType @@ -14,9 +18,6 @@ from packagedb.models import Resource from packagedb.serializers import DependentPackageSerializer from packagedb.serializers import PartySerializer -from packagedcode.models import PackageData -from minecode.utils import stringify_null_purl_fields -from django.utils import timezone TRACE = False @@ -28,23 +29,25 @@ # This is the list of default pipelines to run when we scan a Package for # indexing DEFAULT_PIPELINES = ( - 'scan_single_package', - 'fingerprint_codebase', + "scan_single_package", + "fingerprint_codebase", ) # This is the list of supported addon pipelines to run when we scan a Package for # indexing. SUPPORTED_ADDON_PIPELINES = ( - 'collect_strings_gettext', - 'collect_symbols_ctags', - 'collect_symbols_pygments', - 'collect_symbols_tree_sitter', - 'inspect_elf_binaries', - 'scan_for_virus', + "collect_strings_gettext", + "collect_symbols_ctags", + "collect_symbols_pygments", + "collect_symbols_tree_sitter", + "inspect_elf_binaries", + "scan_for_virus", ) -def add_package_to_scan_queue(package, pipelines=DEFAULT_PIPELINES, priority=0, reindex_uri=False): +def add_package_to_scan_queue( + package, pipelines=DEFAULT_PIPELINES, priority=0, reindex_uri=False +): """ Add a Package `package` to the scan queue to run the list of provided `pipelines` with a given `priority`.
A ScannableURI with a `priority` of 100 @@ -53,7 +56,7 @@ def add_package_to_scan_queue(package, pipelines=DEFAULT_PIPELINES, priority=0, If `reindex_uri` is True, force rescanning of the package """ if not pipelines: - raise Exception('pipelines required to add package to scan queue') + raise Exception("pipelines required to add package to scan queue") uri = package.download_url _, scannable_uri_created = ScannableURI.objects.get_or_create( uri=uri, @@ -63,7 +66,7 @@ def add_package_to_scan_queue(package, pipelines=DEFAULT_PIPELINES, priority=0, priority=priority, ) if scannable_uri_created: - logger.debug(' + Inserted ScannableURI\t: {}'.format(uri)) + logger.debug(f" + Inserted ScannableURI\t: {uri}") def merge_packages(existing_package, new_package_data, replace=False): @@ -84,22 +87,22 @@ def merge_packages(existing_package, new_package_data, replace=False): # We remove `purl` from `existing_mapping` because we use the other purl # fields (type, namespace, name, version, etc.) to generate the purl. - existing_mapping.pop('purl') + existing_mapping.pop("purl") # FIXME REMOVE this workaround when a ScanCode bug is fixed with # https://github.com/aboutcode-org/scancode-toolkit/commit/9b687e6f9bbb695a10030a81be7b93c8b1d816c2 - qualifiers = new_package_data.get('qualifiers') + qualifiers = new_package_data.get("qualifiers") if isinstance(qualifiers, dict): # somehow we get a dict for the new value instead of a string # this is not likely the best place to fix this - new_package_data['qualifiers'] = normalize_qualifiers(qualifiers, encode=True) + new_package_data["qualifiers"] = normalize_qualifiers(qualifiers, encode=True) new_mapping = new_package_data fields_to_skip = ( - 'package_uid', - 'declared_license_expression_spdx', - 'other_license_expression_spdx', + "package_uid", + "declared_license_expression_spdx", + "other_license_expression_spdx", ) updated_fields = [] @@ -107,10 +110,16 @@ def merge_packages(existing_package, new_package_data, replace=False): new_value = new_mapping.get(existing_field) if TRACE: logger.debug( - '\n'.join([ - 'existing_field:', repr(existing_field), - ' existing_value:', repr(existing_value), - ' new_value:', repr(new_value)]) + "\n".join( + [ + "existing_field:", + repr(existing_field), + " existing_value:", + repr(existing_value), + " new_value:", + repr(new_value), + ] + ) ) # FIXME: handle Booleans???
though there are none for now @@ -118,48 +127,54 @@ def merge_packages(existing_package, new_package_data, replace=False): # If the checksum from `new_package` is different from the existing # checksum in `existing_package`, there is a big data # inconsistency issue and an Exception is raised - if ((existing_field in ('md5', 'sha1', 'sha256', 'sha512') and - existing_value and - new_value and - existing_value != new_value): + if ( + existing_field in ("md5", "sha1", "sha256", "sha512") + and existing_value + and new_value + and existing_value != new_value + ): raise Exception( - '\n'.join([ - 'Mismatched {} for {}:'.format(existing_field, existing_package.uri), - ' existing_value: {}'.format(existing_value), - ' new_value: {}'.format(new_value) - ]) + "\n".join( + [ + f"Mismatched {existing_field} for {existing_package.uri}:", + f" existing_value: {existing_value}", + f" new_value: {new_value}", + ] + ) ) if not new_value: if TRACE: - logger.debug(' No new value: skipping') + logger.debug(" No new value: skipping") continue if not existing_value or replace: if TRACE and not existing_value: - logger.debug( - ' No existing value: set to new: {}'.format(new_value)) + logger.debug(f" No existing value: set to new: {new_value}") if TRACE and replace: logger.debug( - ' Existing value and replace: set to new: {}'.format(new_value)) + f" Existing value and replace: set to new: {new_value}" + ) - if existing_field == 'parties': + if existing_field == "parties": # If `existing_field` is `parties`, then we update the `Party` table parties = new_value existing_parties = Party.objects.filter(package=existing_package) - serialized_existing_parties = PartySerializer(existing_parties, many=True).data + serialized_existing_parties = PartySerializer( + existing_parties, many=True + ).data if replace: # Delete existing Party objects existing_parties.delete() for party in parties: _party, _created = Party.objects.get_or_create( package=existing_package, - type=party['type'], - role=party['role'], - name=party['name'], - email=party['email'], - url=party['url'], + type=party["type"], + role=party["role"], + name=party["name"], + email=party["email"], + url=party["url"], ) entry = dict( field=existing_field, @@ -168,23 +183,27 @@ def merge_packages(existing_package, new_package_data, replace=False): ) updated_fields.append(entry) continue - elif existing_field == 'dependencies': + elif existing_field == "dependencies": # If `existing_field` is `dependencies`, then we update the `DependentPackage` table dependencies = new_value - existing_dependencies = DependentPackage.objects.filter(package=existing_package) - serialized_existing_dependencies = DependentPackageSerializer(existing_dependencies, many=True).data + existing_dependencies = DependentPackage.objects.filter( + package=existing_package + ) + serialized_existing_dependencies = DependentPackageSerializer( + existing_dependencies, many=True + ).data if replace: # Delete existing DependentPackage objects existing_dependencies.delete() for dependency in dependencies: _dep, _created = DependentPackage.objects.get_or_create( package=existing_package, - purl=dependency['purl'], - extracted_requirement=dependency['extracted_requirement'], - scope=dependency['scope'], - is_runtime=dependency['is_runtime'], - is_optional=dependency['is_optional'], - is_resolved=dependency['is_resolved'], + purl=dependency["purl"], + extracted_requirement=dependency["extracted_requirement"], + scope=dependency["scope"], + is_runtime=dependency["is_runtime"], +
is_optional=dependency["is_optional"], + is_resolved=dependency["is_resolved"], ) entry = dict( field=existing_field, @@ -193,9 +212,9 @@ def merge_packages(existing_package, new_package_data, replace=False): ) updated_fields.append(entry) continue - elif existing_field == 'package_content': + elif existing_field == "package_content": # get new_value from extra_data - new_value = new_mapping.extra_data.get('package_content') + new_value = new_mapping.extra_data.get("package_content") if not new_value: continue elif existing_field in fields_to_skip: @@ -206,9 +225,7 @@ def merge_packages(existing_package, new_package_data, replace=False): # `existing_field` is a regular field on the Package model and can # be updated normally. entry = dict( - field=existing_field, - old_value=existing_value, - new_value=new_value + field=existing_field, old_value=existing_value, new_value=new_value ) updated_fields.append(entry) setattr(existing_package, existing_field, new_value) @@ -216,7 +233,7 @@ def merge_packages(existing_package, new_package_data, replace=False): existing_package.save() if TRACE: - logger.debug(' Nothing done') + logger.debug(" Nothing done") return updated_fields @@ -235,30 +252,30 @@ def merge_or_create_package(scanned_package, visit_level, override=False): created = False merged = False package = None - map_error = '' + map_error = "" mining_level = visit_level if override: # this will force the data override - visit_level =+1 + visit_level = +1 if not isinstance(scanned_package, PackageData): - msg = 'Not a ScanCode PackageData type:' + repr(scanned_package) - map_error += msg + '\n' + msg = "Not a ScanCode PackageData type:" + repr(scanned_package) + map_error += msg + "\n" logger.error(msg) raise RuntimeError(msg) if not scanned_package.download_url: # TODO: there could be valid cases where we have no download URL # and still want to create a package??? - msg = 'No download_url for package:' + repr(scanned_package) - map_error += msg + '\n' + msg = "No download_url for package:" + repr(scanned_package) + map_error += msg + "\n" logger.error(msg) return package, created, merged, map_error package_uri = scanned_package.download_url - logger.debug('Package URI: {}'.format(package_uri)) - history = scanned_package.extra_data.get('history', []) + logger.debug(f"Package URI: {package_uri}") + history = scanned_package.extra_data.get("history", []) stored_package = None # Check if we already have an existing PackageDB record to update @@ -286,7 +303,8 @@ def merge_or_create_package(scanned_package, visit_level, override=False): updated_fields = merge_packages( existing_package=stored_package, new_package_data=scanned_package.to_dict(), - replace=False) + replace=False, + ) # for a foreign key, such as dependencies and parties, we will adopt the # same logic. In this case, parties or dependencies coming from a scanned # package are only added if there is no parties or dependencies in the @@ -300,7 +318,8 @@ def merge_or_create_package(scanned_package, visit_level, override=False): updated_fields = merge_packages( existing_package=stored_package, new_package_data=scanned_package.to_dict(), - replace=True) + replace=True, + ) # for a foreign key, such as dependencies and parties, we will adopt the # same logic. In this case, parties or dependencies coming from a scanned # package will override existing values. 
If there are parties in the scanned @@ -311,15 +330,17 @@ def merge_or_create_package(scanned_package, visit_level, override=False): if updated_fields: data = { - 'updated_fields': updated_fields, + "updated_fields": updated_fields, } - stored_package.append_to_history('Package field values have been updated.', data=data) + stored_package.append_to_history( + "Package field values have been updated.", data=data + ) # TODO: append updated_fields information to the package's history stored_package.last_modified_date = timezone.now() stored_package.save() - logger.debug(' + Updated package\t: {}'.format(package_uri)) + logger.debug(f" + Updated package\t: {package_uri}") package = stored_package merged = True @@ -337,7 +358,7 @@ def merge_or_create_package(scanned_package, visit_level, override=False): version=scanned_package.version, ) existing_related_package = existing_related_packages.first() - package_content = scanned_package.extra_data.get('package_content') + package_content = scanned_package.extra_data.get("package_content") package_data = dict( # FIXME: we should get the file_name in the @@ -380,7 +401,9 @@ def merge_or_create_package(scanned_package, visit_level, override=False): stringify_null_purl_fields(package_data) created_package = Package.objects.create(**package_data) - created_package.append_to_history('New Package created from URI: {}'.format(package_uri)) + created_package.append_to_history( + f"New Package created from URI: {package_uri}" + ) # This is used in the case of Maven packages created from the priority queue for h in history: @@ -388,12 +411,9 @@ def merge_or_create_package(scanned_package, visit_level, override=False): if existing_related_package: related_package_sets_count = existing_related_package.package_sets.count() - if ( - related_package_sets_count == 0 - or ( - related_package_sets_count > 0 - and created_package.package_content == PackageContentType.BINARY - ) + if related_package_sets_count == 0 or ( + related_package_sets_count > 0 + and created_package.package_content == PackageContentType.BINARY ): # Binary packages can only be part of one set package_set = PackageSet.objects.create() @@ -433,7 +453,7 @@ def merge_or_create_package(scanned_package, visit_level, override=False): created_package.save() package = created_package created = True - logger.debug(' + Inserted package\t: {}'.format(package_uri)) + logger.debug(f" + Inserted package\t: {package_uri}") return package, created, merged, map_error @@ -450,9 +470,9 @@ def update_or_create_resource(package, resource_data): updated = False created = False resource = None - path = resource_data.get('path') + path = resource_data.get("path") - extra_data = copy.deepcopy(resource_data.get('extra_data', {})) + extra_data = copy.deepcopy(resource_data.get("extra_data", {})) extra_data.pop("directory_content", None) extra_data.pop("directory_structure", None) @@ -463,21 +483,21 @@ def update_or_create_resource(package, resource_data): resource = Resource( package=package, path=path, - is_file=resource_data.get('type') == 'file', - name=resource_data.get('name'), - extension=resource_data.get('extension'), - size=resource_data.get('size'), - md5=resource_data.get('md5'), - sha1=resource_data.get('sha1'), - sha256=resource_data.get('sha256'), - mime_type=resource_data.get('mime_type'), - file_type=resource_data.get('file_type'), - programming_language=resource_data.get('programming_language'), - is_binary=resource_data.get('is_binary'), - is_text=resource_data.get('is_text'), - 
is_archive=resource_data.get('is_archive'), - is_media=resource_data.get('is_media'), - is_key_file=resource_data.get('is_key_file'), + is_file=resource_data.get("type") == "file", + name=resource_data.get("name"), + extension=resource_data.get("extension"), + size=resource_data.get("size"), + md5=resource_data.get("md5"), + sha1=resource_data.get("sha1"), + sha256=resource_data.get("sha256"), + mime_type=resource_data.get("mime_type"), + file_type=resource_data.get("file_type"), + programming_language=resource_data.get("programming_language"), + is_binary=resource_data.get("is_binary"), + is_text=resource_data.get("is_text"), + is_archive=resource_data.get("is_archive"), + is_media=resource_data.get("is_media"), + is_key_file=resource_data.get("is_key_file"), extra_data=extra_data, ) created = True diff --git a/minecode/models.py b/minecode/models.py index 3163cbfa..40263cf1 100644 --- a/minecode/models.py +++ b/minecode/models.py @@ -7,10 +7,10 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from datetime import timedelta import logging import sys import uuid +from datetime import timedelta from django.conf import settings from django.db import models @@ -19,12 +19,11 @@ import django_rq from minecode import map_router -from minecode import visit_router # UnusedImport here! # But importing the miners module triggers routes registration from minecode import miners # NOQA - +from minecode import visit_router from packagedb.models import Package logger = logging.getLogger(__name__) @@ -44,13 +43,11 @@ def get_canonical(uri): in the URI it is removed from the canonical output. """ import urlpy + normalized = urlpy.parse(uri).canonical().defrag().sanitize().punycode() # Taken from an old version of urlpy (latest does not have the PORTS dict # See: https://github.com/seomoz/url-py/blob/1d0efdda102cc48ce9dbcc41154296cea1d28c1f/url.py#L46 - PORTS = { - 'http': 80, - 'https': 443 - } + PORTS = {"http": 80, "https": 443} if normalized.port == PORTS.get(normalized.scheme, None): normalized.remove_default_port() return normalized.unicode @@ -61,46 +58,47 @@ class BaseURI(models.Model): A base abstract model to store URI for crawling, scanning and indexing. Also used as a processing "to do" queue for visiting and mapping these URIs. """ + uri = models.CharField( max_length=2048, db_index=True, - help_text='URI for this resource. This is the unmodified original URI.', + help_text="URI for this resource. This is the unmodified original URI.", ) canonical = models.CharField( max_length=3000, db_index=True, - help_text='Canonical form of the URI for this resource that must be ' - 'unique across all ResourceURI.', + help_text="Canonical form of the URI for this resource that must be " + "unique across all ResourceURI.", ) source_uri = models.CharField( max_length=2048, null=True, blank=True, - help_text='Optional: real source remote URI for this visit.' - 'For example for a package repository index is a typical source ' - 'via which a first level of package data is fetched. And it is ' - 'not the URI in the uri field. It is just the source of the fetch' - 'Or the source may be a mirror URI used for fetching.' + help_text="Optional: real source remote URI for this visit. " + "For example, for a package repository, the index is a typical source " + "via which a first level of package data is fetched. And it is " + "not the URI in the uri field. 
It is just the source of the fetch. " + "Or the source may be a mirror URI used for fetching.", ) priority = models.PositiveIntegerField( # Using default because NULL is ordered first on Postgres. default=0, db_index=True, - help_text='Absolute procdssing priority of a URI (default to zero), ' - 'higher number means higher priority, zero means lowest ' - 'priority.', + help_text="Absolute processing priority of a URI (default to zero), " + "higher number means higher priority, zero means lowest " + "priority.", ) wip_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Work In Progress. This is a timestamp set at the start of a ' - 'visit or mapping or indexing or null when no processing is ' - 'in progress.', + help_text="Work In Progress. This is a timestamp set at the start of a " + "visit or mapping or indexing or null when no processing is " + "in progress.", ) file_name = models.CharField( @@ -108,8 +106,8 @@ class BaseURI(models.Model): null=True, blank=True, db_index=True, - help_text='File name of a resource sometimes part of the URI proper ' - 'and sometimes only available through an HTTP header.', + help_text="File name of a resource sometimes part of the URI proper " + "and sometimes only available through an HTTP header.", ) # FIXME: 2147483647 is the max size which means we cannot store more than 2GB files size = models.PositiveIntegerField( null=True, blank=True, db_index=True, - help_text='Size in bytes of the file represented by this ResourceURI.', + help_text="Size in bytes of the file represented by this ResourceURI.", ) sha1 = models.CharField( @@ -125,8 +123,8 @@ class BaseURI(models.Model): null=True, blank=True, db_index=True, - help_text='SHA1 checksum hex-encoded (as in the sha1sum command) of the ' - 'content of the file represented by this ResourceURI.', + help_text="SHA1 checksum hex-encoded (as in the sha1sum command) of the " + "content of the file represented by this ResourceURI.", ) md5 = models.CharField( @@ -134,8 +132,8 @@ class BaseURI(models.Model): null=True, blank=True, db_index=True, - help_text='MD5 checksum hex-encoded (as in the md5sum command) of the ' - 'content of the file represented by this ResourceURI.', + help_text="MD5 checksum hex-encoded (as in the md5sum command) of the " + "content of the file represented by this ResourceURI.", ) sha256 = models.CharField( @@ -143,18 +141,18 @@ class BaseURI(models.Model): null=True, blank=True, db_index=True, - help_text='SHA256 checksum hex-encoded (as in the sha256sum command) of the ' - 'content of the file represented by this ResourceURI.', + help_text="SHA256 checksum hex-encoded (as in the sha256sum command) of the " + "content of the file represented by this ResourceURI.", ) last_modified_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the last modified date of the remote ' - 'resource represented by this URI such as the modified date ' - 'of a file, the lastmod value on a sitemap or the modified ' - 'date returned by an HTTP resource.', + help_text="Timestamp set to the last modified date of the remote " + "resource represented by this URI such as the modified date " + "of a file, the lastmod value on a sitemap or the modified " + "date returned by an HTTP resource.", ) class Meta: @@ -174,15 +172,14 @@ def normalize_fields(self, exclude=None): sha1 = self.sha1 if sha1 and len(sha1) != 40: logger.warning( - 'ResourceURI.normalize_fields() for URI: "{}" - ' - 'Invalid SHA1 length: "{}": SHA1 ignored!'
- .format(self.uri, sha1)) + f'ResourceURI.normalize_fields() for URI: "{self.uri}" - ' + f'Invalid SHA1 length: "{sha1}": SHA1 ignored!' + ) self.sha1 = None # TODO: Use the QuerySet.as_manager() for more flexibility and chaining. class ResourceURIManager(models.Manager): - def insert(self, uri, **extra_fields): """ Create and return a new ResourceURI after computing its canonical URI @@ -213,8 +210,7 @@ def needs_revisit(self, uri, hours): if existing: return False - revisitable = self.get_revisitables( - hours=hours).filter(uri=uri).exists() + revisitable = self.get_revisitables(hours=hours).filter(uri=uri).exists() if revisitable: return True else: @@ -235,15 +231,11 @@ def visited(self): return self.filter(wip_date__isnull=True, last_visit_date__isnull=False) def successfully_visited(self): - """ - Limit the QuerySet to ResourceURIs that were visited successfully. - """ + """Limit the QuerySet to ResourceURIs that were visited successfully.""" return self.visited().filter(has_visit_error=False) def unsuccessfully_visited(self): - """ - Limit the QuerySet to ResourceURIs that were visited with errors. - """ + """Limit the QuerySet to ResourceURIs that were visited with errors.""" return self.visited().filter(has_visit_error=True) def get_revisitables(self, hours): @@ -251,12 +243,11 @@ def get_revisitables(self, hours): Limit the QuerySet to ResourceURIs that have not been visited since the number of `hours`, and therefore considered revisitable. """ - revisitables = self.visited().filter( - last_visit_date__lt=timezone.now() - timedelta(hours=hours) - ).exclude( - is_mappable=True, last_map_date__isnull=True - ).exclude( - is_visitable=False + revisitables = ( + self.visited() + .filter(last_visit_date__lt=timezone.now() - timedelta(hours=hours)) + .exclude(is_mappable=True, last_map_date__isnull=True) + .exclude(is_visitable=False) ) return revisitables @@ -278,7 +269,7 @@ def get_visitables(self): visitables = never_visited # NOTE: this matches an index for efficient ordering - visitables = visitables.order_by('-priority', '-uri') + visitables = visitables.order_by("-priority", "-uri") return visitables def get_next_visitable(self): @@ -295,7 +286,6 @@ def get_next_visitable(self): ResourceURI. ResourceURI that have not yet been visited are sorted by decreasing priority. """ - # We use select_for_update to ensure an atomic query. We ignore # locked rows by using skip_locked=True available since Django # 1.11. @@ -317,7 +307,7 @@ # Mark the URI as wip: Callers mark this done by resetting # wip_date to null resource_uri.wip_date = timezone.now() - resource_uri.save(update_fields=['wip_date']) + resource_uri.save(update_fields=["wip_date"]) return resource_uri def never_mapped(self): @@ -325,7 +315,9 @@ Limit the QuerySet to ResourceURIs that have never been mapped. This is usually the state of a ResourceURI after its successful visit. """ - return self.successfully_visited().filter(last_map_date__isnull=True, wip_date__isnull=True) + return self.successfully_visited().filter( + last_map_date__isnull=True, wip_date__isnull=True + ) def mapped(self): """ @@ -335,15 +327,11 @@ return self.filter(wip_date__isnull=True, last_map_date__isnull=False) def successfully_mapped(self): - """ - Limit the QuerySet to ResourceURIs that were mapped successfully.
- """ + """Limit the QuerySet to ResourceURIs that were mapped successfully.""" return self.mapped().filter(has_map_error=False) def unsuccessfully_mapped(self): - """ - Limit the QuerySet to ResourceURIs that were mapped with errors. - """ + """Limit the QuerySet to ResourceURIs that were mapped with errors.""" return self.mapped().filter(has_map_error=True) def get_mappables(self): @@ -354,7 +342,7 @@ def get_mappables(self): """ qs = self.never_mapped().filter(is_mappable__exact=True, has_map_error=False) # NOTE: this matches an index for efficient ordering - qs = qs.order_by('-priority') + qs = qs.order_by("-priority") return qs @@ -377,9 +365,9 @@ class ResourceURI(BaseURI): mining_level = models.PositiveIntegerField( default=0, - help_text='A numeric indication of the depth and breadth of data ' - 'collected through this ResourceURI visit. Higher means ' - 'more and deeper data.', + help_text="A numeric indication of the depth and breadth of data " + "collected through this ResourceURI visit. Higher means " + "more and deeper data.", ) # This is a text blob that contains either HTML, JSON or anything @@ -388,9 +376,9 @@ class ResourceURI(BaseURI): data = models.TextField( null=True, blank=True, - help_text='Text content of the file represented by this ' - 'ResourceURI. This contains the data that was fetched or ' - 'extracted from a remote ResourceURI such as HTML or JSON.', + help_text="Text content of the file represented by this " + "ResourceURI. This contains the data that was fetched or " + "extracted from a remote ResourceURI such as HTML or JSON.", ) package_url = models.CharField( @@ -398,105 +386,99 @@ class ResourceURI(BaseURI): null=True, blank=True, db_index=True, - help_text="""Package URL for this resource. It stands for a package "mostly universal" URL.""" + help_text="""Package URL for this resource. It stands for a package "mostly universal" URL.""", ) last_visit_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the date of the last visit. Used to track visit status.', + help_text="Timestamp set to the date of the last visit. Used to track visit status.", ) is_visitable = models.BooleanField( db_index=True, default=False, - help_text='When set to True (Yes), this field indicates that ' - 'this URI is visitable in the sense that there is a visitor ' - 'route available to process it.' + help_text="When set to True (Yes), this field indicates that " + "this URI is visitable in the sense that there is a visitor " + "route available to process it.", ) has_visit_error = models.BooleanField( db_index=True, default=False, - help_text='When set to True (Yes), this field indicates that ' - 'an error has occured when visiting this URI.' + help_text="When set to True (Yes), this field indicates that " + "an error has occured when visiting this URI.", ) visit_error = models.TextField( null=True, blank=True, - help_text='Visit errors messages. When present this means the visit failed.', + help_text="Visit errors messages. When present this means the visit failed.", ) last_map_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the date of the last mapping. ' - 'Used to track mapping status.', + help_text="Timestamp set to the date of the last mapping. 
" + "Used to track mapping status.", ) is_mappable = models.BooleanField( db_index=True, default=False, - help_text='When set to True (Yes), this field indicates that ' - 'this URI is mappable in the sense that there is a mapper ' - 'route available to process it.' + help_text="When set to True (Yes), this field indicates that " + "this URI is mappable in the sense that there is a mapper " + "route available to process it.", ) has_map_error = models.BooleanField( db_index=True, default=False, - help_text='When set to True (Yes), this field indicates that ' - 'an error has occured when mapping this URI.' + help_text="When set to True (Yes), this field indicates that " + "an error has occured when mapping this URI.", ) map_error = models.TextField( null=True, blank=True, - help_text='Mapping errors messages. When present this means the mapping failed.', + help_text="Mapping errors messages. When present this means the mapping failed.", ) objects = ResourceURIManager() class Meta: - verbose_name = 'Resource URI' - unique_together = ['canonical', 'last_visit_date'] + verbose_name = "Resource URI" + unique_together = ["canonical", "last_visit_date"] indexes = [ # to get the next visitable models.Index( fields=[ - 'is_visitable', - 'last_visit_date', - 'wip_date', - 'has_visit_error', + "is_visitable", + "last_visit_date", + "wip_date", + "has_visit_error", ] ), # to get the next mappable models.Index( fields=[ - 'is_mappable', - 'last_visit_date', - 'wip_date', - 'last_map_date', - 'has_visit_error', - 'has_map_error', + "is_mappable", + "last_visit_date", + "wip_date", + "last_map_date", + "has_visit_error", + "has_map_error", ] ), # ordered by for the main queue query e.g. '-priority' - models.Index( - fields=[ - '-priority' - ] - ) + models.Index(fields=["-priority"]), ] def _set_defauts(self): - """ - Set defaults for computed fields. - """ + """Set defaults for computed fields.""" uri = self.uri if not self.canonical: self.canonical = get_canonical(uri) @@ -504,9 +486,7 @@ def _set_defauts(self): self.is_mappable = map_router.is_routable(uri) def save(self, *args, **kwargs): - """ - Save, adding defaults for computed fields and validating fields. - """ + """Save, adding defaults for computed fields and validating fields.""" self._set_defauts() self.normalize_fields() self.has_map_error = True if self.map_error else False @@ -515,17 +495,15 @@ def save(self, *args, **kwargs): class ScannableURIManager(models.Manager): - def get_scannables(self): """ Return an ordered query set of all scannable ScannableURIs. Note: this does not evaluate the query set and does not lock the database for update. """ - qs = self.filter(scan_status__exact=ScannableURI.SCAN_NEW, - scan_error=None) + qs = self.filter(scan_status__exact=ScannableURI.SCAN_NEW, scan_error=None) # NOTE: this matches an index for efficient ordering - qs = qs.order_by('-priority') + qs = qs.order_by("-priority") return qs def get_next_scannable(self): @@ -584,7 +562,7 @@ def __get_next_candidate(self, qs): # Mark the URI as wip: Callers mark this done by resetting # wip_date to null canidate_uri.wip_date = timezone.now() - canidate_uri.save(update_fields=['wip_date']) + canidate_uri.save(update_fields=["wip_date"]) return canidate_uri def get_processables(self): @@ -595,15 +573,17 @@ def get_processables(self): Note: this does not evaluate the query set and does not lock the database for update. 
""" - qs = self.filter(scan_status__in=[ - ScannableURI.SCAN_SUBMITTED, - ScannableURI.SCAN_IN_PROGRESS, - ScannableURI.SCAN_COMPLETED - ], - wip_date=None, scan_error=None, + qs = self.filter( + scan_status__in=[ + ScannableURI.SCAN_SUBMITTED, + ScannableURI.SCAN_IN_PROGRESS, + ScannableURI.SCAN_COMPLETED, + ], + wip_date=None, + scan_error=None, ) # NOTE: this matches an index for efficient ordering - qs = qs.order_by('-scan_status', '-priority') + qs = qs.order_by("-scan_status", "-priority") return qs def get_next_processable(self): @@ -618,33 +598,38 @@ def get_next_processable(self): return self.__get_next_candidate(self.get_processables()) def statistics(self): - """ - Return a statistics mapping with summary counts of ScannableURI grouped by status. - """ - statuses = list(self.values('scan_status').annotate( - count=models.Count('scan_status')).order_by('scan_status'),) + """Return a statistics mapping with summary counts of ScannableURI grouped by status.""" + statuses = list( + self.values("scan_status") + .annotate(count=models.Count("scan_status")) + .order_by("scan_status"), + ) for stat in statuses: - stat['scan_status'] = ScannableURI.SCAN_STATUSES_BY_CODE[stat['scan_status']] + stat["scan_status"] = ScannableURI.SCAN_STATUSES_BY_CODE[ + stat["scan_status"] + ] stats = { - 'total': self.count(), - 'processables': self.get_processables().count(), - 'scannables': self.get_scannables().count(), - 'by_status': statuses, + "total": self.count(), + "processables": self.get_processables().count(), + "scannables": self.get_scannables().count(), + "by_status": statuses, } most_recent = dict( - most_recent_submitted=self._recent( - scan_status=ScannableURI.SCAN_SUBMITTED), - most_recent_indexed=self._recent( - scan_status=ScannableURI.SCAN_INDEXED), + most_recent_submitted=self._recent(scan_status=ScannableURI.SCAN_SUBMITTED), + most_recent_indexed=self._recent(scan_status=ScannableURI.SCAN_INDEXED), most_recent_failed=self._recent( - scan_status=ScannableURI.SCAN_FAILED, extra_value="scan_error",), + scan_status=ScannableURI.SCAN_FAILED, + extra_value="scan_error", + ), most_recent_in_progress=self._recent( - scan_status=ScannableURI.SCAN_IN_PROGRESS), - most_recent_completed=self._recent( - scan_status=ScannableURI.SCAN_COMPLETED), + scan_status=ScannableURI.SCAN_IN_PROGRESS + ), + most_recent_completed=self._recent(scan_status=ScannableURI.SCAN_COMPLETED), most_recent_index_errors=self._recent( - scan_status=ScannableURI.SCAN_INDEX_FAILED, extra_value="index_error",), + scan_status=ScannableURI.SCAN_INDEX_FAILED, + extra_value="index_error", + ), ) stats.update(most_recent) return stats @@ -655,8 +640,9 @@ def _recent(self, scan_status, extra_value=None, most_recent=10): ``scan_status``. Include an optional ``extra value`` field name. 
""" - recent_uris = self.filter(scan_status=scan_status).order_by( - '-scan_date')[:most_recent] + recent_uris = self.filter(scan_status=scan_status).order_by("-scan_date")[ + :most_recent + ] for scauri in recent_uris: recent = dict( # this is NOT a field requiring this loop @@ -701,6 +687,7 @@ class ScannableURI(BaseURI): - update the matching index for the PackageDB as needed with fingerprints from the scan - set status and timestamps as needed """ + uuid = models.UUIDField( default=uuid.uuid4, unique=True, @@ -711,14 +698,14 @@ class ScannableURI(BaseURI): null=True, blank=True, db_index=True, - help_text='Timestamp set to the date when a scan was taken by a worker', + help_text="Timestamp set to the date when a scan was taken by a worker", ) pipelines = models.JSONField( default=list, blank=True, editable=False, - help_text='A list of ScanCode.io pipeline names to be run for this scan', + help_text="A list of ScanCode.io pipeline names to be run for this scan", ) SCAN_NEW = 0 @@ -731,53 +718,51 @@ class ScannableURI(BaseURI): SCAN_INDEX_FAILED = 7 SCAN_STATUS_CHOICES = [ - (SCAN_NEW, 'new'), - (SCAN_SUBMITTED, 'submitted'), - (SCAN_IN_PROGRESS, 'in progress'), - (SCAN_COMPLETED, 'scanned'), - (SCAN_INDEXED, 'indexed'), - (SCAN_FAILED, 'failed'), - (SCAN_TIMEOUT, 'timeout'), - (SCAN_INDEX_FAILED, 'scan index failed'), + (SCAN_NEW, "new"), + (SCAN_SUBMITTED, "submitted"), + (SCAN_IN_PROGRESS, "in progress"), + (SCAN_COMPLETED, "scanned"), + (SCAN_INDEXED, "indexed"), + (SCAN_FAILED, "failed"), + (SCAN_TIMEOUT, "timeout"), + (SCAN_INDEX_FAILED, "scan index failed"), ] SCAN_STATUSES_BY_CODE = dict(SCAN_STATUS_CHOICES) SCAN_STATUS_CODES_BY_SCAN_STATUS = { - status: code - for code, status - in SCAN_STATUS_CHOICES + status: code for code, status in SCAN_STATUS_CHOICES } scan_status = models.IntegerField( default=SCAN_NEW, choices=SCAN_STATUS_CHOICES, db_index=True, - help_text='Status of the scan for this URI.', + help_text="Status of the scan for this URI.", ) reindex_uri = models.BooleanField( default=False, null=True, blank=True, - help_text='Flag indicating whether or not this URI should be rescanned and reindexed.', + help_text="Flag indicating whether or not this URI should be rescanned and reindexed.", ) scan_error = models.TextField( null=True, blank=True, - help_text='Scan errors messages. When present this means the scan failed.', + help_text="Scan errors messages. When present this means the scan failed.", ) index_error = models.TextField( null=True, blank=True, - help_text='Indexing errors messages. When present this means the indexing failed.', + help_text="Indexing errors messages. When present this means the indexing failed.", ) package = models.ForeignKey( Package, - help_text='The Package that this ScannableURI is for', + help_text="The Package that this ScannableURI is for", on_delete=models.CASCADE, null=False, ) @@ -785,38 +770,29 @@ class ScannableURI(BaseURI): objects = ScannableURIManager() class Meta: - verbose_name = 'Scannable URI' + verbose_name = "Scannable URI" indexes = [ # to get the scannables models.Index( fields=[ - 'scan_status', - 'scan_date', + "scan_status", + "scan_date", ] ), # ordered by for the main queue query e.g. '-priority' - models.Index( - fields=[ - '-priority' - ] - ) + models.Index(fields=["-priority"]), ] def save(self, *args, **kwargs): - """ - Save, adding defaults for computed fields and validating fields. 
- """ + """Save, adding defaults for computed fields and validating fields.""" if not self.canonical: self.canonical = get_canonical(self.uri) self.normalize_fields() super(ScannableURI, self).save(*args, **kwargs) def process_scan_results( - self, - scan_results_location, - scan_summary_location, - project_extra_data + self, scan_results_location, scan_summary_location, project_extra_data ): from minecode import tasks @@ -845,7 +821,6 @@ def process_scan_results( # TODO: Use the QuerySet.as_manager() for more flexibility and chaining. class PriorityResourceURIManager(models.Manager): - def insert(self, uri, **extra_fields): """ Create and return a new PriorityResourceURI after computing its canonical URI @@ -855,26 +830,17 @@ def insert(self, uri, **extra_fields): """ # TODO: be able to create a request for an existing purl if the previous request has been completed already - priority_resource_uris = self.filter( - uri=uri, - package_url=uri, - **extra_fields - ) - if ( - priority_resource_uris.count() == 0 - or all(p.processed_date for p in priority_resource_uris) + priority_resource_uris = self.filter(uri=uri, package_url=uri, **extra_fields) + if priority_resource_uris.count() == 0 or all( + p.processed_date for p in priority_resource_uris ): priority_resource_uri = self.create( - uri=uri, - package_url=uri, - **extra_fields + uri=uri, package_url=uri, **extra_fields ) return priority_resource_uri def in_progress(self): - """ - Limit the QuerySet to PriorityResourceURI being processed. - """ + """Limit the QuerySet to PriorityResourceURI being processed.""" return self.filter(wip_date__isnull=False) def never_processed(self): @@ -882,17 +848,12 @@ def never_processed(self): Limit the QuerySet to PriorityResourceURIs that have never been processed. This is usually the state of a PriorityResourceURI after upon creation. """ - return self.filter( - processed_date__isnull=True, - wip_date__isnull=True - ).order_by( - 'request_date' + return self.filter(processed_date__isnull=True, wip_date__isnull=True).order_by( + "request_date" ) def get_requests(self): - """ - Return an ordered query set of all processable PriorityResourceURIs. - """ + """Return an ordered query set of all processable PriorityResourceURIs.""" never_processed = self.never_processed() return never_processed @@ -906,12 +867,13 @@ def get_next_request(self): NOTE: this method can only be called from within a transaction.atomic block. """ - priority_resource_uri = self.get_requests( - ).select_for_update(skip_locked=True).first() + priority_resource_uri = ( + self.get_requests().select_for_update(skip_locked=True).first() + ) if not priority_resource_uri: return priority_resource_uri.wip_date = timezone.now() - priority_resource_uri.save(update_fields=['wip_date']) + priority_resource_uri.save(update_fields=["wip_date"]) return priority_resource_uri @@ -936,15 +898,15 @@ class PriorityResourceURI(BaseURI): max_length=2048, null=True, blank=True, - help_text='URI for this resource. This is the unmodified original URI.', + help_text="URI for this resource. 
This is the unmodified original URI.", ) canonical = models.CharField( max_length=3000, null=True, blank=True, - help_text='Canonical form of the URI for this resource that must be ' - 'unique across all ResourceURI.', + help_text="Canonical form of the URI for this resource that must be " + "unique across all ResourceURI.", ) # This is a text blob that contains either HTML, JSON or anything @@ -953,9 +915,9 @@ class PriorityResourceURI(BaseURI): data = models.TextField( null=True, blank=True, - help_text='Text content of the file represented by this ' - 'ResourceURI. This contains the data that was fetched or ' - 'extracted from a remote ResourceURI such as HTML or JSON.', + help_text="Text content of the file represented by this " + "ResourceURI. This contains the data that was fetched or " + "extracted from a remote ResourceURI such as HTML or JSON.", ) package_url = models.CharField( @@ -963,59 +925,56 @@ class PriorityResourceURI(BaseURI): null=True, blank=True, db_index=True, - help_text="""Package URL for this resource. It stands for a package "mostly universal" URL.""" + help_text="""Package URL for this resource. It stands for a package "mostly universal" URL.""", ) request_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the date of when this Package info was requested.', + help_text="Timestamp set to the date when this Package info was requested.", ) processed_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the date of when this Package info was requested.', + help_text="Timestamp set to the date when this Package info was processed.", ) has_processing_error = models.BooleanField( db_index=True, default=False, - help_text='When set to True (Yes), this field indicates that ' - 'an error has occured when processing this URI.' + help_text="When set to True (Yes), this field indicates that " + "an error has occurred when processing this URI.", ) processing_error = models.TextField( null=True, blank=True, - help_text='Processing errors messages. When present this means the processing failed.', + help_text="Processing error messages. When present this means the processing failed.", ) addon_pipelines = models.JSONField( default=list, blank=True, editable=False, - help_text='A list of addon ScanCode.io pipeline to run.', + help_text="A list of addon ScanCode.io pipelines to run.", ) objects = PriorityResourceURIManager() class Meta: - verbose_name = 'Priority Resource URI' + verbose_name = "Priority Resource URI" def save(self, *args, **kwargs): - """ - Save, adding defaults for computed fields and validating fields. - """ + """Save, adding defaults for computed fields and validating fields.""" self.normalize_fields() super(PriorityResourceURI, self).save(*args, **kwargs) # TODO: Use the QuerySet.as_manager() for more flexibility and chaining. 
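As an aside, here is a minimal usage sketch of the request-queue pattern these managers implement (the purl value is hypothetical and not part of this patch):

from django.db import transaction

from minecode.models import PriorityResourceURI

# insert() only creates a new request when no pending, unprocessed request
# exists for this URI/purl.
PriorityResourceURI.objects.insert("pkg:pypi/requests@2.31.0")

# A worker then claims the next request; get_next_request() must run inside
# a transaction.atomic block because it uses select_for_update(skip_locked=True),
# and it marks the claimed row as work-in-progress by setting wip_date.
with transaction.atomic():
    request = PriorityResourceURI.objects.get_next_request()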
class ImportableURIManager(models.Manager): - def insert(self, uri, data, package_url, **extra_fields): """ Create and return a new ImportableURI @@ -1023,26 +982,17 @@ def insert(self, uri, data, package_url, **extra_fields): """ # TODO: be able to create a request for an existing purl if the previous request has been completed already - importable_uris = self.filter( - uri=uri, - **extra_fields - ) - if ( - importable_uris.count() == 0 - or all(p.processed_date for p in importable_uris) + importable_uris = self.filter(uri=uri, **extra_fields) + if importable_uris.count() == 0 or all( + p.processed_date for p in importable_uris ): importable_uri = self.create( - uri=uri, - data=data, - package_url=package_url, - **extra_fields + uri=uri, data=data, package_url=package_url, **extra_fields ) return importable_uri def in_progress(self): - """ - Limit the QuerySet to ImportableURI being processed. - """ + """Limit the QuerySet to ImportableURI being processed.""" return self.filter(wip_date__isnull=False) def never_processed(self): @@ -1050,17 +1000,12 @@ def never_processed(self): Limit the QuerySet to ImportableURIs that have never been processed. This is usually the state of an ImportableURI upon creation. """ - return self.filter( - processed_date__isnull=True, - wip_date__isnull=True - ).order_by( - 'request_date' + return self.filter(processed_date__isnull=True, wip_date__isnull=True).order_by( + "request_date" ) def get_requests(self): - """ - Return an ordered query set of all processable ImportableURIs. - """ + """Return an ordered query set of all processable ImportableURIs.""" never_processed = self.never_processed() return never_processed @@ -1078,9 +1023,10 @@ def get_next_request(self): if not importable_uri: return importable_uri.wip_date = timezone.now() - importable_uri.save(update_fields=['wip_date']) + importable_uri.save(update_fields=["wip_date"]) return importable_uri + # TODO: have a second queue for crawling the maven repo that tracks which pages and namespaces we visited; # when we hit a package page, we add it to the queue that creates skinny packages for the package we visited. @@ -1091,7 +1037,7 @@ class ImportableURI(BaseURI): null=True, blank=True, db_index=True, - help_text="""Package URL for this resource. It stands for a package "mostly universal" URL.""" + help_text="""Package URL for this resource. It stands for a package "mostly universal" URL.""", ) # This is a text blob that contains either HTML, JSON or anything @@ -1100,47 +1046,45 @@ class ImportableURI(BaseURI): data = models.TextField( null=True, blank=True, - help_text='Text content of the file represented by this ' - 'ResourceURI. 
This contains the data that was fetched or " + "extracted from a remote ResourceURI such as HTML or JSON.", ) request_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the date of when this Package info was requested.', + help_text="Timestamp set to the date when this Package info was requested.", ) processed_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the date of when this Package info was processed.', + help_text="Timestamp set to the date when this Package info was processed.", ) has_processing_error = models.BooleanField( db_index=True, default=False, - help_text='When set to True (Yes), this field indicates that ' - 'an error has occured when processing this URI.' + help_text="When set to True (Yes), this field indicates that " + "an error has occurred when processing this URI.", ) processing_error = models.TextField( null=True, blank=True, - help_text='Processing errors messages. When present this means the processing failed.', + help_text="Processing error messages. When present this means the processing failed.", ) objects = ImportableURIManager() class Meta: - verbose_name = 'Importable URI' + verbose_name = "Importable URI" def save(self, *args, **kwargs): - """ - Save, adding defaults for computed fields and validating fields. - """ + """Save, adding defaults for computed fields and validating fields.""" self.normalize_fields() super(ImportableURI, self).save(*args, **kwargs) @@ -1150,21 +1094,19 @@ class ProcessingError(BaseURI): max_length=100, null=True, blank=True, - help_text='The name of the service running where the error occured.' + help_text="The name of the service running where the error occurred.", ) date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the date of when this error occured.', + help_text="Timestamp set to the date when this error occurred.", ) error_message = models.TextField( - null=True, - blank=True, - help_text='The message associated with this error' + null=True, blank=True, help_text="The message associated with this error." ) class Meta: - verbose_name = 'Processing Error' + verbose_name = "Processing Error" diff --git a/minecode/permissions.py b/minecode/permissions.py index 83815cea..f4a3ce65 100644 --- a/minecode/permissions.py +++ b/minecode/permissions.py @@ -2,8 +2,7 @@ class IsScanQueueWorkerAPIUser(permissions.BasePermission): - """ - Allow access to a user who is a part of the `scan_queue_workers` group - """ + """Allow access to a user who is a part of the `scan_queue_workers` group""" + def has_permission(self, request, view): - return request.user.groups.filter(name='scan_queue_workers').exists() + return request.user.groups.filter(name="scan_queue_workers").exists() diff --git a/minecode/route.py b/minecode/route.py index 59a0cd97..f034bb81 100644 --- a/minecode/route.py +++ b/minecode/route.py @@ -8,10 +8,9 @@ # -from functools import wraps import inspect import re - +from functools import wraps """ Given a URI regex (or some string), this module can route execution to a @@ -36,7 +35,7 @@ """ -class Rule(object): +class Rule: """ A rule is a mapping between a pattern (typically a URI) and a callable (typically a function). @@ -48,8 +47,8 @@ class Rule(object): def __init__(self, pattern, endpoint): # To ensure the pattern will match entirely, we wrap the pattern # with start of line ^ and end of line $. 
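# For instance (illustrative values, not part of this patch): a Rule built
# from the pattern "https?://example.com/pkg/.*" matches the whole string
# "https://example.com/pkg/foo" but not a longer string that merely contains
# it, because the compiled regex is anchored with ^ and $.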
- self.pattern = pattern.lstrip('^').rstrip('$') - self.pattern_match = re.compile('^' + self.pattern + '$').match + self.pattern = pattern.lstrip("^").rstrip("$") + self.pattern_match = re.compile("^" + self.pattern + "$").match # ensure the endpoint is callable assert callable(endpoint) @@ -61,35 +60,26 @@ def __init__(self, pattern, endpoint): self.endpoint = endpoint def __repr__(self): - return 'Rule(r"""{}""", {}.{})'.format( - self.pattern, self.endpoint.__module__, self.endpoint.__name__) + return f'Rule(r"""{self.pattern}""", {self.endpoint.__module__}.{self.endpoint.__name__})' def match(self, string): - """ - Match a string with the rule pattern, return True is matching. - """ + """Match a string with the rule pattern, return True if matching.""" return self.pattern_match(string) class RouteAlreadyDefined(TypeError): - """ - Raised when this route Rule already exists in the route map. - """ + """Raised when this route Rule already exists in the route map.""" class NoRouteAvailable(TypeError): - """ - Raised when there are no route available. - """ + """Raised when there is no route available.""" class MultipleRoutesDefined(TypeError): - """ - Raised when there are more than one route possible. - """ + """Raised when more than one route is possible.""" -class Router(object): +class Router: """ A router is: - a container for a route map, consisting of several rules, stored in an @@ -104,9 +94,7 @@ class Router(object): """ def __init__(self, route_map=None): - """ - 'route_map' is an ordered mapping of pattern -> Rule. - """ + """'route_map' is an ordered mapping of pattern -> Rule.""" self.route_map = route_map or dict() # lazy cached pre-compiled regex match() for all route patterns self._is_routable = None @@ -134,11 +122,14 @@ def route(self, *patterns): Decorator to make a callable 'endpoint' routed to one or more patterns. Example: + ------- >>> my_router = Router() >>> @my_router.route('http://nexb.com', 'http://deja.com') ... def somefunc(uri): ... 
pass + """ + def decorator(endpoint): assert patterns for pat in patterns: @@ -147,6 +138,7 @@ def decorator(endpoint): @wraps(endpoint) def decorated(*args, **kwargs): return self.process(*args, **kwargs) + return decorated return decorator @@ -186,7 +178,7 @@ def resolve(self, string): # this can happen when multiple patterns match the same string # we raise an exception with enough debugging information pats = repr([r.pattern for r in candidates]) - msg = '%(string)r matches multiple patterns %(pats)r' % locals() + msg = "%(string)r matches multiple patterns %(pats)r" % locals() raise MultipleRoutesDefined(msg) return candidates[0].endpoint @@ -201,7 +193,7 @@ def is_routable(self, string): if not self._is_routable: # build an alternation regex - routables = '^(' + '|'.join(pat for pat in self.route_map) + ')$' + routables = "^(" + "|".join(pat for pat in self.route_map) + ")$" self._is_routable = re.compile(routables, re.UNICODE).match return bool(self._is_routable(string)) diff --git a/minecode/rsync.py b/minecode/rsync.py index 164b4e5d..bb765669 100644 --- a/minecode/rsync.py +++ b/minecode/rsync.py @@ -23,7 +23,7 @@ # logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) # logger.setLevel(logging.DEBUG) -RSYNC_COMMAND = 'rsync' +RSYNC_COMMAND = "rsync" def modules(input_file): @@ -38,22 +38,22 @@ def modules(input_file): for line in inp: if not line: continue - if line.startswith(' '): + if line.startswith(" "): # this is the motd section continue line = line.strip() if line: - name, _desc = line.split('\t', 1) + name, _desc = line.split("\t", 1) yield name.strip() -octals = re.compile(r'#(\d{3})').findall +octals = re.compile(r"#(\d{3})").findall def decode_path(p): """Decode an rsync path with octal encodings""" for oc in set(octals(p)): - p = p.replace('#' + oc, octal2char(oc)) + p = p.replace("#" + oc, octal2char(oc)) return p @@ -63,30 +63,29 @@ def octal2char(s): def decode_ts(s): - """ - Convert an rsync timestamp (which is local tz) to an UTC ISO timestamp. - """ + """Convert an rsync timestamp (which is local tz) to a UTC ISO timestamp.""" tzinfo = tz.tzutc() - ar = arrow.get(s, 'YYYY/MM/DD HH:mm:ss').replace(tzinfo=tzinfo).to('utc') + ar = arrow.get(s, "YYYY/MM/DD HH:mm:ss").replace(tzinfo=tzinfo).to("utc") return ar.isoformat() + # note: there is a large number of possible file types, but we do not care for # them: only files and dirs matter; links, block devices, pipes, fifos, etc. do not. # i.e. we keep only - and d rsync_line = re.compile( - r'^(?P<type>[\-d])' - r'(?P<perm>.{9})' - r' +' - r'(?P<size>[\d,]+)' - r' ' - r'(?P<ts>\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})' # YYYY/MM/DD HH:mm:ss - r' +' - r'(?P<path>.+$)' + r"^(?P<type>[\-d])" + r"(?P<perm>.{9})" + r" +" + r"(?P<size>[\d,]+)" + r" " + r"(?P<ts>\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})" # YYYY/MM/DD HH:mm:ss + r" +" + r"(?P<path>.+$)" ).match -Entry = collections.namedtuple('Entry', 'type perm size date path') +Entry = collections.namedtuple("Entry", "type perm size date path") def entry(line): @@ -94,20 +93,20 @@ Return an Entry constructed from an rsync directory listing line. Assumes universal line endings. 
""" - line = line.rstrip('\n') + line = line.rstrip("\n") if not line: return - if 'skipping directory' in line: + if "skipping directory" in line: return rline = rsync_line(line) if not rline: return - typ = rline.group('type') - perm = rline.group('perm') - size = int(rline.group('size').replace(',', '')) - ts = rline.group('ts') + typ = rline.group("type") + perm = rline.group("perm") + size = int(rline.group("size").replace(",", "")) + ts = rline.group("ts") date = decode_ts(ts) - path = rline.group('path') + path = rline.group("path") path = decode_path(path) return dict(Entry(typ, perm, size, date, path)._asdict()) @@ -133,12 +132,11 @@ def fetch_directory(uri, recurse=True): Return the location of a tempfile containing an rsync dir listing for uri. Recursive if recurse is True. Raise an Exception with error details. """ - temp_file = get_temp_file( - file_name='minecode-rsync-dir-', extension='.rsync') - with open(temp_file, 'w') as tmp: + temp_file = get_temp_file(file_name="minecode-rsync-dir-", extension=".rsync") + with open(temp_file, "w") as tmp: file_name = tmp.name - ends = not uri.endswith('/') and '/' or '' - recursive = recurse and '--recursive' or '--no-recursive' + ends = not uri.endswith("/") and "/" or "" + recursive = recurse and "--recursive" or "--no-recursive" cmd = 'rsync --no-motd %(recursive)s -d "%(uri)s%(ends)s"' % locals() rsync = command.Command(cmd) out, err = rsync.execute() @@ -146,9 +144,9 @@ def fetch_directory(uri, recurse=True): for o in out: tmp.write(o) - err = '\n'.join([e for e in err]) + err = "\n".join([e for e in err]) rc = rsync.returncode if err or rc: - raise Exception('%(cmd) failed. rc:%(tc)d err: %(err)s' % locals()) + raise Exception("%(cmd) failed. rc:%(tc)d err: %(err)s" % locals()) else: return file_name diff --git a/minecode/saneyaml.py b/minecode/saneyaml.py index 499c1eee..e55fc638 100644 --- a/minecode/saneyaml.py +++ b/minecode/saneyaml.py @@ -13,11 +13,11 @@ import yaml try: - from yaml import CSafeLoader as SafeLoader from yaml import CSafeDumper as SafeDumper + from yaml import CSafeLoader as SafeLoader except ImportError: - from yaml import SafeLoader from yaml import SafeDumper + from yaml import SafeLoader """ @@ -48,9 +48,7 @@ def load(s): def dump(obj): - """ - Return a safe and sane YAML unicode string representation from `obj`. - """ + """Return a safe and sane YAML unicode string representation from `obj`.""" return yaml.dump( obj, Dumper=SaneDumper, @@ -62,29 +60,25 @@ def dump(obj): encoding=None, indent=4, width=90, - line_break='\n', + line_break="\n", explicit_start=False, explicit_end=False, ) class SaneLoader(SafeLoader): - """ - A safe loader configured with many sane defaults. - """ + """A safe loader configured with many sane defaults.""" def ignore_aliases(self, data): return True def string_loader(loader, node): - """ - Ensure that a scalar type (a value) is returned as a plain unicode string. - """ + """Ensure that a scalar type (a value) is returned as a plain unicode string.""" return loader.construct_scalar(node) -SaneLoader.add_constructor(u'tag:yaml.org,2002:str', string_loader) +SaneLoader.add_constructor("tag:yaml.org,2002:str", string_loader) # Load as strings most scalar types: nulls, ints, (such as in version # 01) floats (such version 2.20) and timestamps conversion (in @@ -94,20 +88,18 @@ def string_loader(loader, node): # must handle type conversion explicitly from unicode to other types # in the loaded objects. 
-SaneLoader.add_constructor(u'tag:yaml.org,2002:null', string_loader)
-SaneLoader.add_constructor(u'tag:yaml.org,2002:timestamp', string_loader)
-SaneLoader.add_constructor(u'tag:yaml.org,2002:float', string_loader)
-SaneLoader.add_constructor(u'tag:yaml.org,2002:int', string_loader)
-SaneLoader.add_constructor(u'tag:yaml.org,2002:null', string_loader)
+SaneLoader.add_constructor("tag:yaml.org,2002:null", string_loader)
+SaneLoader.add_constructor("tag:yaml.org,2002:timestamp", string_loader)
+SaneLoader.add_constructor("tag:yaml.org,2002:float", string_loader)
+SaneLoader.add_constructor("tag:yaml.org,2002:int", string_loader)
+SaneLoader.add_constructor("tag:yaml.org,2002:null", string_loader)
 
 # keep boolean conversion
 # SaneLoader.add_constructor(u'tag:yaml.org,2002:boolean', string_loader)
 
 
 def ordered_loader(loader, node):
-    """
-    Ensure that YAML maps ordered is preserved and loaded in an dict now always ordered
-    """
+    """Ensure that YAML map order is preserved; maps load into a dict (dicts are now always ordered)."""
     assert isinstance(node, yaml.MappingNode)
     omap = dict()
     yield omap
@@ -118,8 +110,8 @@ def ordered_loader(loader, node):
         omap[key] = value
 
 
-SaneLoader.add_constructor(u'tag:yaml.org,2002:map', ordered_loader)
-SaneLoader.add_constructor(u'tag:yaml.org,2002:omap', ordered_loader)
+SaneLoader.add_constructor("tag:yaml.org,2002:map", ordered_loader)
+SaneLoader.add_constructor("tag:yaml.org,2002:omap", ordered_loader)
 
 # Fall back to mapping for anything else, e.g. ignore tags such as
 # !!Python, ruby and other dangerous mappings: treat them as a mapping
@@ -128,39 +120,31 @@ class SaneDumper(SafeDumper):
     def increase_indent(self, flow=False, indentless=False):
-        """
-        Ensure that lists items are always indented.
-        """
+        """Ensure that list items are always indented."""
         return super(SaneDumper, self).increase_indent(flow, indentless=False)
 
     def ignore_aliases(self, data):
-        """
-        Avoid having aliases created from re-used Python objects.
-        """
+        """Avoid having aliases created from re-used Python objects."""
         return True
 
 
 def ordered_dumper(dumper, data):
-    """
-    Ensure that maps are always dumped in the items order.
-    """
-    return dumper.represent_mapping(u'tag:yaml.org,2002:map', data.items())
+    """Ensure that maps are always dumped in their items' order."""
+    return dumper.represent_mapping("tag:yaml.org,2002:map", data.items())
 
 
 SaneDumper.add_representer(dict, ordered_dumper)
 
 
 def null_dumper(dumper, value):
-    """
-    Always dump nulls as empty string.
-    """
-    return dumper.represent_scalar(u'tag:yaml.org,2002:null', u'')
+    """Always dump nulls as an empty string."""
+    return dumper.represent_scalar("tag:yaml.org,2002:null", "")
 
 
 SafeDumper.add_representer(type(None), null_dumper)
 
 
-def string_dumper(dumper, value, _tag=u'tag:yaml.org,2002:str'):
+def string_dumper(dumper, value, _tag="tag:yaml.org,2002:str"):
     """
     Ensure that all scalars are dumped as UTF-8 unicode, folded and quoted in
     the sanest and most readable way.
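 
     For example (an illustrative sketch of the intended behavior, given the
     representers registered below):
 
         dump({'count': 1})       # would emit roughly: count: 1
         dump({'note': 'a\nb'})   # would use the literal "|" block style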
@@ -169,12 +153,12 @@ def string_dumper(dumper, value, _tag=u'tag:yaml.org,2002:str'):
         value = repr(value)
 
     if isinstance(value, str):
-        value = value.decode('utf-8')
+        value = value.decode("utf-8")
 
     style = None
-    multilines = '\n' in value
+    multilines = "\n" in value
     if multilines:
-        literal_style = '|'
+        literal_style = "|"
         style = literal_style
     return dumper.represent_scalar(_tag, value, style=style)
 
@@ -183,19 +167,17 @@ def string_dumper(dumper, value, _tag=u'tag:yaml.org,2002:str'):
 SaneDumper.add_representer(str, string_dumper)
 
 # treat number as strings, not as numbers
-SaneDumper.add_representer(int, partial(
-    string_dumper, _tag=u'tag:yaml.org,2002:int'))
-SaneDumper.add_representer(float, partial(
-    string_dumper, _tag=u'tag:yaml.org,2002:float'))
+SaneDumper.add_representer(int, partial(string_dumper, _tag="tag:yaml.org,2002:int"))
+SaneDumper.add_representer(
+    float, partial(string_dumper, _tag="tag:yaml.org,2002:float")
+)
 
 
 def boolean_dumper(dumper, value):
-    """
-    Dump booleans as yes or no strings.
-    """
-    value = u'yes' if value else u'no'
+    """Dump booleans as yes or no strings."""
+    value = "yes" if value else "no"
     style = None
-    return dumper.represent_scalar(u'tag:yaml.org,2002:bool', value, style=style)
+    return dumper.represent_scalar("tag:yaml.org,2002:bool", value, style=style)
 
 
 SaneDumper.add_representer(bool, boolean_dumper)
diff --git a/minecode/seed.py b/minecode/seed.py
index 61892ef3..9a805960 100644
--- a/minecode/seed.py
+++ b/minecode/seed.py
@@ -18,7 +18,7 @@
 unicode = str  # NOQA
 
 
-class Seeder(object):
+class Seeder:
     """
     Abstract base class for seeding URIs to visit. Each visitor should create a
     subclass of Seeder and implement the get_seeds method to yield the top level
@@ -31,9 +31,7 @@ class Seeder:
     revisit_after = 240  # hours
 
     def get_seeds(self):
-        """
-        Yield seed URIs strings. Subclass must override.
-        """
+        """Yield seed URI strings. Subclasses must override."""
         raise NotImplementedError()
 
 
@@ -48,7 +46,7 @@ def get_active_seeders(seeders=()):
         seeders = get_configured_seeders()
     for seeder in seeders:
         if isinstance(seeder, (bytes, unicode)):
-            module_name, _, class_name = seeder.rpartition('.')
+            module_name, _, class_name = seeder.rpartition(".")
             module = importlib.import_module(module_name)
             yield getattr(module, class_name)()
         else:
@@ -62,5 +60,6 @@ def get_configured_seeders():
     environment.
     """
     from minecode.management.commands import get_settings
+
     # ACTIVE_SEEDERS is a list of fully qualified Seeder subclass strings
-    return get_settings('ACTIVE_SEEDERS') or []
+    return get_settings("ACTIVE_SEEDERS") or []
diff --git a/minecode/tasks.py b/minecode/tasks.py
index 072ab06f..d8550625 100644
--- a/minecode/tasks.py
+++ b/minecode/tasks.py
@@ -29,7 +29,6 @@ def process_scan_results(
     `scan_results_location` and `scan_summary_location` are deleted after the
     indexing process has finished.
""" - with open(scan_results_location) as f: scan_data = json.load(f) with open(scan_summary_location) as f: @@ -38,7 +37,7 @@ def process_scan_results( try: scannable_uri = ScannableURI.objects.get(uuid=scannable_uri_uuid) except ScannableURI.DoesNotExist: - raise Exception(f'ScannableURI {scannable_uri_uuid} does not exist!') + raise Exception(f"ScannableURI {scannable_uri_uuid} does not exist!") indexing_errors = index_package( scannable_uri, diff --git a/minecode/tests/__init__.py b/minecode/tests/__init__.py index 8598b637..c7703d25 100644 --- a/minecode/tests/__init__.py +++ b/minecode/tests/__init__.py @@ -9,5 +9,4 @@ import os - FIXTURES_REGEN = os.environ.get("MINECODE_TEST_FIXTURES_REGEN", False) diff --git a/minecode/tests/collectors/test_conan.py b/minecode/tests/collectors/test_conan.py index e44e5147..1c20e33f 100644 --- a/minecode/tests/collectors/test_conan.py +++ b/minecode/tests/collectors/test_conan.py @@ -9,27 +9,28 @@ import os +from unittest.mock import patch -import saneyaml from django.test import TestCase -from mock import patch + +import saneyaml from packageurl import PackageURL import packagedb -from minecode.utils_test import JsonBasedTesting from minecode.collectors import conan +from minecode.utils_test import JsonBasedTesting class ConanPriorityQueueTests(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "testfiles") + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def setUp(self): super(ConanPriorityQueueTests, self).setUp() self.package_url1 = PackageURL.from_string("pkg:conan/zlib@1.3.1") - zlib_conanfile_loc = self.get_test_loc( - "conan/zlib/manifest/conanfile.py") - zlib_conandata_loc = self.get_test_loc( - "conan/zlib/manifest/conandata.yml") + zlib_conanfile_loc = self.get_test_loc("conan/zlib/manifest/conanfile.py") + zlib_conandata_loc = self.get_test_loc("conan/zlib/manifest/conandata.yml") zlib_config_loc = self.get_test_loc("conan/zlib/manifest/config.yml") with open(zlib_conanfile_loc) as f: @@ -41,8 +42,7 @@ def setUp(self): with open(zlib_conandata_loc) as f: self.zlib_conandata_contents = f.read() - self.zlib_conandata_contents_dict = saneyaml.load( - self.zlib_conandata_contents) + self.zlib_conandata_contents_dict = saneyaml.load(self.zlib_conandata_contents) @patch("requests.get") def test_get_conan_recipe(self, mock_get): @@ -101,7 +101,7 @@ def test_map_conan_package(self, mock_get_conan_recipe): package_count = packagedb.models.Package.objects.all().count() self.assertEqual(package_count, 0) - conan.map_conan_package(self.package_url1, ('test_pipelines')) + conan.map_conan_package(self.package_url1, ("test_pipelines")) package_count = packagedb.models.Package.objects.all().count() self.assertEqual(package_count, 1) package = packagedb.models.Package.objects.all().first() diff --git a/minecode/tests/collectors/test_generic.py b/minecode/tests/collectors/test_generic.py index d3fb4492..ec58d2a6 100644 --- a/minecode/tests/collectors/test_generic.py +++ b/minecode/tests/collectors/test_generic.py @@ -8,12 +8,12 @@ # from django.test import TestCase as DjangoTestCase -from packagedcode.maven import _parse + from packageurl import PackageURL +from minecode.collectors import generic from minecode.route import NoRouteAvailable from minecode.utils_test import JsonBasedTesting -from minecode.collectors import generic from packagedb.models import Package @@ -22,7 +22,7 @@ def test_process_request(self): package_count = Package.objects.all().count() 
self.assertEqual(0, package_count) - purl = 'pkg:generic/test@1.0.0?download_url=http://example.com/test.tar.gz' + purl = "pkg:generic/test@1.0.0?download_url=http://example.com/test.tar.gz" error_msg = generic.process_request(purl) self.assertEqual(None, error_msg) @@ -30,10 +30,9 @@ def test_process_request(self): self.assertEqual(1, package_count) package = Package.objects.first() - self.assertEqual('test', package.name) - self.assertEqual('1.0.0', package.version) - self.assertEqual('http://example.com/test.tar.gz', - package.download_url) + self.assertEqual("test", package.name) + self.assertEqual("1.0.0", package.version) + self.assertEqual("http://example.com/test.tar.gz", package.download_url) def test_process_request_no_download_url(self): package_count = Package.objects.all().count() @@ -48,29 +47,27 @@ def test_map_generic_package(self): package_count = Package.objects.all().count() self.assertEqual(0, package_count) - purl = 'pkg:generic/test@1.0.0?download_url=http://example.com/test.tar.gz' + purl = "pkg:generic/test@1.0.0?download_url=http://example.com/test.tar.gz" package_url = PackageURL.from_string(purl) - error_msg = generic.map_generic_package(package_url, ('test_pipeline')) + error_msg = generic.map_generic_package(package_url, ("test_pipeline")) - self.assertEqual('', error_msg) + self.assertEqual("", error_msg) package_count = Package.objects.all().count() self.assertEqual(1, package_count) package = Package.objects.first() - self.assertEqual('test', package.name) - self.assertEqual('1.0.0', package.version) - self.assertEqual('http://example.com/test.tar.gz', - package.download_url) + self.assertEqual("test", package.name) + self.assertEqual("1.0.0", package.version) + self.assertEqual("http://example.com/test.tar.gz", package.download_url) def test_map_fetchcode_supported_package(self): package_count = Package.objects.all().count() self.assertEqual(0, package_count) purl = PackageURL.from_string("pkg:generic/udhcp@0.9.1") - error_msg = generic.map_fetchcode_supported_package( - purl, ('test_pipeline')) + error_msg = generic.map_fetchcode_supported_package(purl, ("test_pipeline")) - self.assertEqual('', error_msg) + self.assertEqual("", error_msg) package_count = Package.objects.all().count() self.assertEqual(1, package_count) diff --git a/minecode/tests/collectors/test_gnu.py b/minecode/tests/collectors/test_gnu.py index c0d89235..ef2fc331 100644 --- a/minecode/tests/collectors/test_gnu.py +++ b/minecode/tests/collectors/test_gnu.py @@ -9,17 +9,19 @@ import os +from unittest.mock import patch from django.test import TestCase -from mock import patch -from minecode.utils_test import JsonBasedTesting from minecode.collectors import gnu +from minecode.utils_test import JsonBasedTesting from packagedb.models import Package class GnuPriorityQueueTests(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "testfiles") + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def setUp(self): super(GnuPriorityQueueTests, self).setUp() diff --git a/minecode/tests/collectors/test_maven.py b/minecode/tests/collectors/test_maven.py index 23db11c7..322beed6 100644 --- a/minecode/tests/collectors/test_maven.py +++ b/minecode/tests/collectors/test_maven.py @@ -1,29 +1,34 @@ -from django.test import TestCase as DjangoTestCase -from minecode.utils_test import JsonBasedTesting +import os from unittest import mock +from unittest.mock import patch + +from django.test import TestCase as 
DjangoTestCase + from packagedcode.maven import _parse from packageurl import PackageURL -import os + +import packagedb from minecode.collectors import maven from minecode.tests import FIXTURES_REGEN -import packagedb -from mock import patch +from minecode.utils_test import JsonBasedTesting class MavenPriorityQueueTests(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def setUp(self): super(MavenPriorityQueueTests, self).setUp() - self.expected_pom_loc = self.get_test_loc('maven/pom/classworlds-1.1.pom') + self.expected_pom_loc = self.get_test_loc("maven/pom/classworlds-1.1.pom") with open(self.expected_pom_loc) as f: self.expected_pom_contents = f.read() self.scan_package = _parse( - 'maven_pom', - 'maven', - 'Java', + "maven_pom", + "maven", + "Java", text=self.expected_pom_contents, ) @@ -31,34 +36,36 @@ def test_get_pom_text(self, regen=FIXTURES_REGEN): pom_contents = maven.get_pom_text( namespace=self.scan_package.namespace, name=self.scan_package.name, - version=self.scan_package.version + version=self.scan_package.version, ) if regen: - with open(self.expected_pom_loc, 'w') as f: + with open(self.expected_pom_loc, "w") as f: f.write(pom_contents) self.assertEqual(self.expected_pom_contents, pom_contents) pom_contents = maven.get_pom_text( - namespace='', - name='does-not-exist', - version='1.0', + namespace="", + name="does-not-exist", + version="1.0", ) self.assertFalse(pom_contents) def test_get_package_sha1(self): sha1 = maven.get_package_sha1(self.scan_package) - expected_sha1 = '60c708f55deeb7c5dfce8a7886ef09cbc1388eca' + expected_sha1 = "60c708f55deeb7c5dfce8a7886ef09cbc1388eca" self.assertEqual(expected_sha1, sha1) def test_map_maven_package(self): package_count = packagedb.models.Package.objects.all().count() self.assertEqual(0, package_count) package_url = PackageURL.from_string(self.scan_package.purl) - maven.map_maven_package(package_url, packagedb.models.PackageContentType.BINARY, ('test_pipeline')) + maven.map_maven_package( + package_url, packagedb.models.PackageContentType.BINARY, ("test_pipeline") + ) package_count = packagedb.models.Package.objects.all().count() self.assertEqual(1, package_count) package = packagedb.models.Package.objects.all().first() - expected_purl_str = 'pkg:maven/classworlds/classworlds@1.1' + expected_purl_str = "pkg:maven/classworlds/classworlds@1.1" self.assertEqual(expected_purl_str, package.purl) def test_map_maven_package_custom_repo_url(self): @@ -66,18 +73,20 @@ def test_map_maven_package_custom_repo_url(self): self.assertEqual(0, package_count) custom_repo_purl = "pkg:maven/org.eclipse.core/runtime@20070801?repository_url=https://packages.atlassian.com/mvn/maven-atlassian-external/" package_url = PackageURL.from_string(custom_repo_purl) - maven.map_maven_package(package_url, packagedb.models.PackageContentType.BINARY, ('test_pipeline')) + maven.map_maven_package( + package_url, packagedb.models.PackageContentType.BINARY, ("test_pipeline") + ) package_count = packagedb.models.Package.objects.all().count() self.assertEqual(1, package_count) package = packagedb.models.Package.objects.all().first() - expected_repo_url = 'https://packages.atlassian.com/mvn/maven-atlassian-external//org/eclipse/core/runtime/20070801/runtime-20070801.jar' + expected_repo_url = "https://packages.atlassian.com/mvn/maven-atlassian-external//org/eclipse/core/runtime/20070801/runtime-20070801.jar" 
self.assertEqual(expected_repo_url, package.download_url) def test_process_request(self): - purl_str = 'pkg:maven/org.apache.twill/twill-core@0.12.0' - download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar' - purl_sources_str = f'{purl_str}?classifier=sources' - sources_download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar' + purl_str = "pkg:maven/org.apache.twill/twill-core@0.12.0" + download_url = "https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar" + purl_sources_str = f"{purl_str}?classifier=sources" + sources_download_url = "https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar" package_count = packagedb.models.Package.objects.all().count() self.assertEqual(0, package_count) maven.process_request(purl_str) @@ -87,22 +96,18 @@ def test_process_request(self): (package.purl, package.download_url) for package in packagedb.models.Package.objects.all() ] - self.assertIn( - (purl_str, download_url), purls - ) - self.assertIn( - (purl_sources_str, sources_download_url), purls - ) + self.assertIn((purl_str, download_url), purls) + self.assertIn((purl_sources_str, sources_download_url), purls) def test_fetch_parent(self, regen=FIXTURES_REGEN): - pom_loc = self.get_test_loc('maven/pom/ant-antlr-1.10.1.pom') + pom_loc = self.get_test_loc("maven/pom/ant-antlr-1.10.1.pom") with open(pom_loc) as f: pom_text = f.read() parent_pom_text = maven.fetch_parent(pom_text) - expected_loc = self.get_test_loc('maven/pom/ant-parent-1.10.1.pom') + expected_loc = self.get_test_loc("maven/pom/ant-parent-1.10.1.pom") if regen: - with open(expected_loc, 'w') as f: + with open(expected_loc, "w") as f: f.write(parent_pom_text) with open(expected_loc) as f: @@ -110,15 +115,15 @@ def test_fetch_parent(self, regen=FIXTURES_REGEN): self.assertEqual(expected_pom_text, parent_pom_text) def test_get_ancestry(self): - pom_loc = self.get_test_loc('maven/pom/pulsar-client-1x-2.5.1.pom') + pom_loc = self.get_test_loc("maven/pom/pulsar-client-1x-2.5.1.pom") with open(pom_loc) as f: pom_text = f.read() ancestor_pom_texts = list(maven.get_ancestry(pom_text)) expected_ancestor_pom_texts = [] for expected_loc in [ - self.get_test_loc('maven/pom/apache-18.pom'), - self.get_test_loc('maven/pom/pulsar-2.5.1.pom'), - self.get_test_loc('maven/pom/pulsar-client-1x-base-2.5.1.pom') + self.get_test_loc("maven/pom/apache-18.pom"), + self.get_test_loc("maven/pom/pulsar-2.5.1.pom"), + self.get_test_loc("maven/pom/pulsar-client-1x-base-2.5.1.pom"), ]: with open(expected_loc) as f: expected_pom_text = f.read() @@ -126,67 +131,62 @@ def test_get_ancestry(self): self.assertEqual(expected_ancestor_pom_texts, ancestor_pom_texts) def test_merge_parent(self, regen=FIXTURES_REGEN): - pom_loc = self.get_test_loc('maven/pom/ant-antlr-1.10.1.pom') + pom_loc = self.get_test_loc("maven/pom/ant-antlr-1.10.1.pom") with open(pom_loc) as f: pom_text = f.read() - package = _parse( - 'maven_pom', - 'maven', - 'Java', - text=pom_text + package = _parse("maven_pom", "maven", "Java", text=pom_text) + expected_before_loc = self.get_test_loc( + "maven/pom/ant-antlr-1.10.1-package_before.json" ) - expected_before_loc = self.get_test_loc('maven/pom/ant-antlr-1.10.1-package_before.json') self.check_expected_results(package.to_dict(), expected_before_loc, regen=regen) - parent_pom_loc = self.get_test_loc('maven/pom/ant-parent-1.10.1.pom') + parent_pom_loc = 
self.get_test_loc("maven/pom/ant-parent-1.10.1.pom") with open(parent_pom_loc) as f: parent_pom_text = f.read() - parent_package = _parse( - 'maven_pom', - 'maven', - 'Java', - text=parent_pom_text - ) + parent_package = _parse("maven_pom", "maven", "Java", text=parent_pom_text) package = maven.merge_parent(package, parent_package) - expected_after_loc = self.get_test_loc('maven/pom/ant-antlr-1.10.1-package_after.json') + expected_after_loc = self.get_test_loc( + "maven/pom/ant-antlr-1.10.1-package_after.json" + ) self.check_expected_results(package.to_dict(), expected_after_loc, regen=regen) def test_merge_ancestors(self, regen=FIXTURES_REGEN): - pom_loc = self.get_test_loc('maven/pom/pulsar-client-1x-2.5.1.pom') + pom_loc = self.get_test_loc("maven/pom/pulsar-client-1x-2.5.1.pom") with open(pom_loc) as f: pom_text = f.read() - package = _parse( - 'maven_pom', - 'maven', - 'Java', - text=pom_text + package = _parse("maven_pom", "maven", "Java", text=pom_text) + expected_before_loc = self.get_test_loc( + "maven/pom/pulsar-client-1x-2.5.1-package_before.json" ) - expected_before_loc = self.get_test_loc('maven/pom/pulsar-client-1x-2.5.1-package_before.json') self.check_expected_results(package.to_dict(), expected_before_loc, regen=regen) ancestor_pom_texts = [] for loc in [ - self.get_test_loc('maven/pom/apache-18.pom'), - self.get_test_loc('maven/pom/pulsar-2.5.1.pom'), - self.get_test_loc('maven/pom/pulsar-client-1x-base-2.5.1.pom') + self.get_test_loc("maven/pom/apache-18.pom"), + self.get_test_loc("maven/pom/pulsar-2.5.1.pom"), + self.get_test_loc("maven/pom/pulsar-client-1x-base-2.5.1.pom"), ]: with open(loc) as f: pom_text = f.read() ancestor_pom_texts.append(pom_text) maven.merge_ancestors(ancestor_pom_texts, package) - expected_after_loc = self.get_test_loc('maven/pom/pulsar-client-1x-2.5.1-package_after.json') + expected_after_loc = self.get_test_loc( + "maven/pom/pulsar-client-1x-2.5.1-package_after.json" + ) self.check_expected_results(package.to_dict(), expected_after_loc, regen=regen) @mock.patch("minecode.collectors.maven.get_pom_text") - def test_get_merged_ancestor_package_from_maven_package(self, get_pom_text_mock, regen=FIXTURES_REGEN): + def test_get_merged_ancestor_package_from_maven_package( + self, get_pom_text_mock, regen=FIXTURES_REGEN + ): get_pom_text_mock.return_value = "" ancestor_pom_texts = [] with patch("minecode.collectors.maven.get_ancestry") as mock_get_ancestry: for loc in [ - self.get_test_loc('maven/pom/apache-18.pom'), - self.get_test_loc('maven/pom/pulsar-2.5.1.pom'), - self.get_test_loc('maven/pom/pulsar-client-1x-base-2.5.1.pom') + self.get_test_loc("maven/pom/apache-18.pom"), + self.get_test_loc("maven/pom/pulsar-2.5.1.pom"), + self.get_test_loc("maven/pom/pulsar-client-1x-base-2.5.1.pom"), ]: with open(loc) as f: pom_text = f.read() @@ -199,293 +199,296 @@ def test_get_merged_ancestor_package_from_maven_package(self, get_pom_text_mock, type="maven", download_url="https://repo1.maven.org/maven2/org/apache/pulsar/pulsar-client/2.5.1/pulsar-client-2.5.1.jar", ) - merged_package = maven.get_merged_ancestor_package_from_maven_package(package=db_package) - expected_loc = self.get_test_loc('maven/pom/pulsar-client-merged-ancestor-package.json') - self.check_expected_results(merged_package.to_dict(), expected_loc, regen=regen) + merged_package = maven.get_merged_ancestor_package_from_maven_package( + package=db_package + ) + expected_loc = self.get_test_loc( + "maven/pom/pulsar-client-merged-ancestor-package.json" + ) + self.check_expected_results( + 
merged_package.to_dict(), expected_loc, regen=regen + ) class MavenCrawlerFunctionsTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def test_check_if_file_name_is_linked_on_page(self): - links = ['foo/', 'bar/', 'baz/'] - self.assertTrue( - maven.check_if_file_name_is_linked_on_page('foo/', links) - ) - self.assertFalse( - maven.check_if_file_name_is_linked_on_page('qux/', links) - ) + links = ["foo/", "bar/", "baz/"] + self.assertTrue(maven.check_if_file_name_is_linked_on_page("foo/", links)) + self.assertFalse(maven.check_if_file_name_is_linked_on_page("qux/", links)) def test_check_if_page_has_pom_files(self): - links1 = ['foo/', 'bar.jar', 'bar.pom'] - links2 = ['foo/', 'bar.jar'] + links1 = ["foo/", "bar.jar", "bar.pom"] + links2 = ["foo/", "bar.jar"] self.assertTrue(maven.check_if_page_has_pom_files(links1)) self.assertFalse(maven.check_if_page_has_pom_files(links2)) def test_check_if_page_has_directories(self): - links1 = ['foo/', 'bar/', 'baz/'] - links2 = ['../', 'bar.pom', 'bar.jar'] + links1 = ["foo/", "bar/", "baz/"] + links2 = ["../", "bar.pom", "bar.jar"] self.assertTrue(maven.check_if_page_has_directories(links1)) self.assertFalse(maven.check_if_page_has_directories(links2)) def test_check_if_package_version_page(self): - links1 = ['../', 'bar.pom', 'bar.jar'] - links2 = ['../', 'foo/', 'bar/', 'baz/'] + links1 = ["../", "bar.pom", "bar.jar"] + links2 = ["../", "foo/", "bar/", "baz/"] self.assertTrue(maven.check_if_package_version_page(links1)) self.assertFalse(maven.check_if_package_version_page(links2)) def test_check_if_package_page(self): - links1 = ['../', 'maven-metadata.xml'] - links2 = ['../', 'bar.pom', 'bar.jar'] + links1 = ["../", "maven-metadata.xml"] + links2 = ["../", "bar.pom", "bar.jar"] self.assertTrue(maven.check_if_package_page(links1)) self.assertFalse(maven.check_if_package_page(links2)) def test_check_if_maven_root(self): - links1 = ['../', 'archetype-catalog.xml'] - links2 = ['../', 'bar.pom', 'bar.jar'] + links1 = ["../", "archetype-catalog.xml"] + links2 = ["../", "bar.pom", "bar.jar"] self.assertTrue(maven.check_if_maven_root(links1)) self.assertFalse(maven.check_if_maven_root(links2)) - @mock.patch('requests.get') + @mock.patch("requests.get") def test_check_on_page(self, mock_request_get): checker = maven.check_if_page_has_pom_files mock_request_get.return_value.ok = True mock_request_get.return_value.text = '
parent-7.11.0.pom' - self.assertTrue(maven.check_on_page('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/', checker)) + self.assertTrue( + maven.check_on_page( + "https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/", checker + ) + ) - @mock.patch('requests.get') + @mock.patch("requests.get") def test_is_maven_root(self, mock_request_get): mock_request_get.return_value.ok = True mock_request_get.return_value.text = 'archetype-catalog.xml' - self.assertTrue(maven.is_maven_root('https://repo1.maven.org/maven2/')) + self.assertTrue(maven.is_maven_root("https://repo1.maven.org/maven2/")) - @mock.patch('requests.get') + @mock.patch("requests.get") def test_is_package_page(self, mock_request_get): mock_request_get.return_value.ok = True mock_request_get.return_value.text = 'maven-metadata.xml' - self.assertTrue(maven.is_package_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/')) + self.assertTrue( + maven.is_package_page("https://repo1.maven.org/maven2/xml-apis/xml-apis/") + ) - @mock.patch('requests.get') + @mock.patch("requests.get") def test_is_package_version_page(self, mock_request_get): mock_request_get.return_value.ok = True - mock_request_get.return_value.text = ''' + mock_request_get.return_value.text = """ ../ parent-7.11.0.pom - ''' - self.assertTrue(maven.is_package_version_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/')) + """ + self.assertTrue( + maven.is_package_version_page( + "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/" + ) + ) def test_url_parts(self): - url = 'https://example.com/foo/bar/baz.jar' + url = "https://example.com/foo/bar/baz.jar" scheme, netloc, path_segments = maven.url_parts(url) - self.assertEqual('https', scheme) - self.assertEqual('example.com', netloc) - self.assertEqual(['foo', 'bar', 'baz.jar'], path_segments) + self.assertEqual("https", scheme) + self.assertEqual("example.com", netloc) + self.assertEqual(["foo", "bar", "baz.jar"], path_segments) def test_create_url(self): - scheme = 'https' - netloc = 'example.com' - path_segments = ['foo', 'bar', 'baz.jar'] - url = 'https://example.com/foo/bar/baz.jar' - self.assertEqual( - url, - maven.create_url(scheme, netloc, path_segments) - ) + scheme = "https" + netloc = "example.com" + path_segments = ["foo", "bar", "baz.jar"] + url = "https://example.com/foo/bar/baz.jar" + self.assertEqual(url, maven.create_url(scheme, netloc, path_segments)) - @mock.patch('requests.get') + @mock.patch("requests.get") def test_get_maven_root(self, mock_request_get): mock_request_get.return_value.ok = True mock_request_get.return_value.text = 'archetype-catalog.xml' self.assertEqual( - 'https://repo1.maven.org/maven2', - maven.get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') + "https://repo1.maven.org/maven2", + maven.get_maven_root( + "https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/" + ), ) - @mock.patch('requests.get') + @mock.patch("requests.get") def test_determine_namespace_name_version_from_url(self, mock_request_get): - url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2' - root_url = 'https://repo1.maven.org/maven2' + url = "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2" + root_url = "https://repo1.maven.org/maven2" - package_page_text = ''' + package_page_text = """ 1.0.b2/ 2005-09-20 05:53 - maven-metadata.xml 2012-06-26 17:01 567 - ''' + """ package_page = mock.Mock(ok=True, text=package_page_text) - package_version_page_text = ''' + package_version_page_text = """ ../ - xml-apis-1.0.b2.pom 
2005-09-20 05:53 2249 - ''' + """ package_version_page = mock.Mock(ok=True, text=package_version_page_text) mock_request_get.side_effect = [ - mock.Mock(ok=True, text=''), - mock.Mock(ok=True, text=''), + mock.Mock(ok=True, text=""), + mock.Mock(ok=True, text=""), package_page, - mock.Mock(ok=True, text=''), - package_version_page + mock.Mock(ok=True, text=""), + package_version_page, ] - namespace, package_name, package_version = maven.determine_namespace_name_version_from_url(url, root_url) - self.assertEqual('xml-apis', namespace) - self.assertEqual('xml-apis', package_name) - self.assertEqual('1.0.b2', package_version) + namespace, package_name, package_version = ( + maven.determine_namespace_name_version_from_url(url, root_url) + ) + self.assertEqual("xml-apis", namespace) + self.assertEqual("xml-apis", package_name) + self.assertEqual("1.0.b2", package_version) - @mock.patch('requests.get') + @mock.patch("requests.get") def test_add_to_import_queue(self, mock_request_get): from minecode.models import ImportableURI - url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/' - root_url = 'https://repo1.maven.org/maven2' + url = "https://repo1.maven.org/maven2/xml-apis/xml-apis/" + root_url = "https://repo1.maven.org/maven2" - package_page_text = ''' + package_page_text = """ 1.0.b2/ 2005-09-20 05:53 - maven-metadata.xml 2012-06-26 17:01 567 - ''' + """ package_page = mock.Mock(ok=True, text=package_page_text) - package_version_page_text = ''' + package_version_page_text = """ ../ - xml-apis-1.0.b2.pom 2005-09-20 05:53 2249 - ''' + """ package_version_page = mock.Mock(ok=True, text=package_version_page_text) mock_request_get.side_effect = [ package_page, - mock.Mock(ok=True, text=''), - mock.Mock(ok=True, text=''), + mock.Mock(ok=True, text=""), + mock.Mock(ok=True, text=""), package_page, - mock.Mock(ok=True, text=''), - package_version_page + mock.Mock(ok=True, text=""), + package_version_page, ] self.assertEqual(0, ImportableURI.objects.all().count()) - maven.add_to_import_queue(url, root_url ) + maven.add_to_import_queue(url, root_url) self.assertEqual(1, ImportableURI.objects.all().count()) importable_uri = ImportableURI.objects.get(uri=url) - self.assertEqual('pkg:maven/xml-apis/xml-apis', importable_uri.package_url) + self.assertEqual("pkg:maven/xml-apis/xml-apis", importable_uri.package_url) def test_filter_only_directories(self): timestamps_by_links = { - '../': '-', - 'foo/': '-', - 'foo.pom': '2023-09-28', + "../": "-", + "foo/": "-", + "foo.pom": "2023-09-28", } expected = { - 'foo/': '-', + "foo/": "-", } - self.assertEqual( - expected, - maven.filter_only_directories(timestamps_by_links) - ) + self.assertEqual(expected, maven.filter_only_directories(timestamps_by_links)) def test_filter_for_artifacts(self): timestamps_by_links = { - '../': '2023-09-28', - 'foo.pom': '2023-09-28', - 'foo.ejb3': '2023-09-28', - 'foo.ear': '2023-09-28', - 'foo.aar': '2023-09-28', - 'foo.apk': '2023-09-28', - 'foo.gem': '2023-09-28', - 'foo.jar': '2023-09-28', - 'foo.nar': '2023-09-28', - 'foo.so': '2023-09-28', - 'foo.swc': '2023-09-28', - 'foo.tar': '2023-09-28', - 'foo.tar.gz': '2023-09-28', - 'foo.war': '2023-09-28', - 'foo.xar': '2023-09-28', - 'foo.zip': '2023-09-28', + "../": "2023-09-28", + "foo.pom": "2023-09-28", + "foo.ejb3": "2023-09-28", + "foo.ear": "2023-09-28", + "foo.aar": "2023-09-28", + "foo.apk": "2023-09-28", + "foo.gem": "2023-09-28", + "foo.jar": "2023-09-28", + "foo.nar": "2023-09-28", + "foo.so": "2023-09-28", + "foo.swc": "2023-09-28", + "foo.tar": "2023-09-28", + 
"foo.tar.gz": "2023-09-28", + "foo.war": "2023-09-28", + "foo.xar": "2023-09-28", + "foo.zip": "2023-09-28", } expected = { - 'foo.ejb3': '2023-09-28', - 'foo.ear': '2023-09-28', - 'foo.aar': '2023-09-28', - 'foo.apk': '2023-09-28', - 'foo.gem': '2023-09-28', - 'foo.jar': '2023-09-28', - 'foo.nar': '2023-09-28', - 'foo.so': '2023-09-28', - 'foo.swc': '2023-09-28', - 'foo.tar': '2023-09-28', - 'foo.tar.gz': '2023-09-28', - 'foo.war': '2023-09-28', - 'foo.xar': '2023-09-28', - 'foo.zip': '2023-09-28', + "foo.ejb3": "2023-09-28", + "foo.ear": "2023-09-28", + "foo.aar": "2023-09-28", + "foo.apk": "2023-09-28", + "foo.gem": "2023-09-28", + "foo.jar": "2023-09-28", + "foo.nar": "2023-09-28", + "foo.so": "2023-09-28", + "foo.swc": "2023-09-28", + "foo.tar": "2023-09-28", + "foo.tar.gz": "2023-09-28", + "foo.war": "2023-09-28", + "foo.xar": "2023-09-28", + "foo.zip": "2023-09-28", } self.assertEqual(expected, maven.filter_for_artifacts(timestamps_by_links)) def test_collect_links_from_text(self): filter = maven.filter_only_directories - text = ''' + text = """ ../ 1.0.b2/ 2005-09-20 05:53 - 1.2.01/ 2010-02-03 21:05 - - ''' - expected = { - '1.0.b2/': '2005-09-20 05:53', - '1.2.01/': '2010-02-03 21:05' - } - self.assertEqual( - expected, - maven.collect_links_from_text(text, filter=filter) - ) + """ + expected = {"1.0.b2/": "2005-09-20 05:53", "1.2.01/": "2010-02-03 21:05"} + self.assertEqual(expected, maven.collect_links_from_text(text, filter=filter)) def test_create_absolute_urls_for_links(self): filter = maven.filter_only_directories - text = ''' + text = """ ../ 1.0.b2/ 2005-09-20 05:53 - 1.2.01/ 2010-02-03 21:05 - - ''' - url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/' + """ + url = "https://repo1.maven.org/maven2/xml-apis/xml-apis/" expected = { - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53', - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05' + "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/": "2005-09-20 05:53", + "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/": "2010-02-03 21:05", } self.assertEqual( - expected, - maven.create_absolute_urls_for_links(text, url, filter=filter) + expected, maven.create_absolute_urls_for_links(text, url, filter=filter) ) - @mock.patch('requests.get') + @mock.patch("requests.get") def test_get_directory_links(self, mock_request_get): mock_request_get.return_value.ok = True - mock_request_get.return_value.text = ''' + mock_request_get.return_value.text = """ ../ 1.0.b2/ 2005-09-20 05:53 - 1.2.01/ 2010-02-03 21:05 - - ''' - url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/' + """ + url = "https://repo1.maven.org/maven2/xml-apis/xml-apis/" expected = { - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53', - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05' + "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/": "2005-09-20 05:53", + "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/": "2010-02-03 21:05", } self.assertEqual(expected, maven.get_directory_links(url)) - @mock.patch('requests.get') + @mock.patch("requests.get") def test_get_artifact_links(self, mock_request_get): mock_request_get.return_value.ok = True - mock_request_get.return_value.text = ''' + mock_request_get.return_value.text = """ ../ xml-apis-1.0.b2.jar 2005-09-20 05:53 109318 xml-apis-1.0.b2.pom 2005-09-20 05:53 2249 - ''' - url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/' + """ + url = 
"https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/" expected = { - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar': '2005-09-20 05:53', + "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar": "2005-09-20 05:53", } self.assertEqual(expected, maven.get_artifact_links(url)) @@ -495,22 +498,26 @@ def test_crawl_to_package(self): def test_crawl_maven_repo_from_root(self): pass - @mock.patch('requests.get') + @mock.patch("requests.get") def test_get_artifact_sha1(self, mock_request_get): - sha1 = '3136ca936f64c9d68529f048c2618bd356bf85c9' + sha1 = "3136ca936f64c9d68529f048c2618bd356bf85c9" mock_request_get.return_value.ok = True mock_request_get.return_value.text = sha1 - self.assertEqual(sha1, maven.get_artifact_sha1('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar.sha1')) + self.assertEqual( + sha1, + maven.get_artifact_sha1( + "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar.sha1" + ), + ) def test_get_classifier_from_artifact_url(self): - artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' - package_version_page_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/' - package_name = 'livereload-jvm' - package_version = '0.2.0' + artifact_url = "https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar" + package_version_page_url = ( + "https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/" + ) + package_name = "livereload-jvm" + package_version = "0.2.0" classifier = maven.get_classifier_from_artifact_url( - artifact_url, - package_version_page_url, - package_name, - package_version + artifact_url, package_version_page_url, package_name, package_version ) - self.assertEqual('onejar', classifier) + self.assertEqual("onejar", classifier) diff --git a/minecode/tests/collectors/test_npm.py b/minecode/tests/collectors/test_npm.py index cc975452..5a378be8 100644 --- a/minecode/tests/collectors/test_npm.py +++ b/minecode/tests/collectors/test_npm.py @@ -11,20 +11,24 @@ import os from django.test import TestCase as DjangoTestCase + from packagedcode.npm import NpmPackageJsonHandler from packageurl import PackageURL + import packagedb -from minecode.utils_test import JsonBasedTesting -from minecode.tests import FIXTURES_REGEN from minecode.collectors import npm +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting class NpmPriorityQueueTests(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def setUp(self): super(NpmPriorityQueueTests, self).setUp() - self.expected_json_loc = self.get_test_loc('npm/lodash_package-expected.json') + self.expected_json_loc = self.get_test_loc("npm/lodash_package-expected.json") with open(self.expected_json_loc) as f: self.expected_json_contents = json.load(f) @@ -36,22 +40,22 @@ def test_get_package_json(self, regen=FIXTURES_REGEN): json_contents = npm.get_package_json( namespace=self.scan_package.namespace, name=self.scan_package.name, - version=self.scan_package.version + version=self.scan_package.version, ) if regen: - with open(self.expected_json_loc, 'w') as f: - json.dump(json_contents, f, indent=3, separators=(',', ':')) + with open(self.expected_json_loc, "w") as f: + json.dump(json_contents, f, indent=3, 
separators=(",", ":")) self.assertEqual(self.expected_json_contents, json_contents) def test_map_npm_package(self): package_count = packagedb.models.Package.objects.all().count() self.assertEqual(0, package_count) package_url = PackageURL.from_string(self.scan_package.purl) - npm.map_npm_package(package_url, ('test_pipeline')) + npm.map_npm_package(package_url, ("test_pipeline")) package_count = packagedb.models.Package.objects.all().count() self.assertEqual(1, package_count) package = packagedb.models.Package.objects.all().first() - expected_purl_str = 'pkg:npm/lodash@4.17.21' - expected_download_url = 'https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz' + expected_purl_str = "pkg:npm/lodash@4.17.21" + expected_download_url = "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz" self.assertEqual(expected_purl_str, package.purl) self.assertEqual(expected_download_url, package.download_url) diff --git a/minecode/tests/miners/test_apache.py b/minecode/tests/miners/test_apache.py index e5d6eaa4..5b403243 100644 --- a/minecode/tests/miners/test_apache.py +++ b/minecode/tests/miners/test_apache.py @@ -7,181 +7,189 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from collections import OrderedDict import json import os import re +from collections import OrderedDict +from unittest.mock import patch from django.test import TestCase as DjangoTestCase -from mock import Mock -from mock import patch from minecode import miners -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting from minecode.miners import apache from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class ApacheVistorTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_ApacheDistIndexVisitor(self): - uri = 'http://apache.org/dist/zzz/find-ls.gz' - test_loc = self.get_test_loc('apache/find-ls.gz') - with patch('requests.get') as mock_http_get: + uri = "http://apache.org/dist/zzz/find-ls.gz" + test_loc = self.get_test_loc("apache/find-ls.gz") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = apache.ApacheDistIndexVisitor(uri) - expected_loc = self.get_test_loc( - 'apache/find-ls.gz_uris-expected.json') + expected_loc = self.get_test_loc("apache/find-ls.gz_uris-expected.json") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_ApacheChecksumVisitor(self): - uri = 'http://archive.apache.org/dist/abdera/1.1.3/apache-abdera-1.1.3-src.zip.md5' - test_loc = self.get_test_loc('apache/apache-abdera-1.1.3-src.zip.md5') - with patch('requests.get') as mock_http_get: + uri = "http://archive.apache.org/dist/abdera/1.1.3/apache-abdera-1.1.3-src.zip.md5" + test_loc = self.get_test_loc("apache/apache-abdera-1.1.3-src.zip.md5") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, data, _ = apache.ApacheChecksumVisitor(uri) self.assertEqual(None, uris) - self.assertEqual(b'0b5f2c334916c289f06c03f8577a9879', data) + self.assertEqual(b"0b5f2c334916c289f06c03f8577a9879", data) def test_ApacheChecksumVisitor_2(self): - uri = 'http://archive.apache.org/dist/groovy/2.4.6/distribution/apache-groovy-docs-2.4.6.zip.md5' - test_loc = 
self.get_test_loc('apache/apache-groovy-docs-2.4.6.zip.md5') - with patch('requests.get') as mock_http_get: + uri = "http://archive.apache.org/dist/groovy/2.4.6/distribution/apache-groovy-docs-2.4.6.zip.md5" + test_loc = self.get_test_loc("apache/apache-groovy-docs-2.4.6.zip.md5") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, data, _ = apache.ApacheChecksumVisitor(uri) self.assertEqual(None, uris) - self.assertEqual(b'c7a2d3becea1d28b518528f8204b8d2a', data) + self.assertEqual(b"c7a2d3becea1d28b518528f8204b8d2a", data) def test_ApacheProjectsJsonVisitor(self): - uri = 'https://projects.apache.org/json/foundation/projects.json' - test_loc = self.get_test_loc('apache/projects.json') - with patch('requests.get') as mock_http_get: + uri = "https://projects.apache.org/json/foundation/projects.json" + test_loc = self.get_test_loc("apache/projects.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # note: remove the "()" below once this visitor route is made active again uris, result, _ = apache.ApacheProjectsJsonVisitor()(uri) - expected_loc = self.get_test_loc('apache/projects_uris-expected.json') + expected_loc = self.get_test_loc("apache/projects_uris-expected.json") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) self.check_expected_results(result, test_loc, regen=FIXTURES_REGEN) def test_ApacheSingleProjectJsonVisitor(self): - uri = 'https://projects.apache.org/json/projects/ant-dotnet.json' - test_loc = self.get_test_loc('apache/ant-dotnet.json') - with patch('requests.get') as mock_http_get: + uri = "https://projects.apache.org/json/projects/ant-dotnet.json" + test_loc = self.get_test_loc("apache/ant-dotnet.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # note: remove the "()" below once this visitor route is made active again _, result, _ = apache.ApacheSingleProjectJsonVisitor()(uri) - expected_loc = self.get_test_loc('apache/ant-dotnet_expected.json') + expected_loc = self.get_test_loc("apache/ant-dotnet_expected.json") self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_ApacheSingleProjectJsonVisitor_error1_json(self): - uri = 'https://projects.apache.org/json/projects/felix.json' - test_loc = self.get_test_loc('apache/felix.json') - with patch('requests.get') as mock_http_get: + uri = "https://projects.apache.org/json/projects/felix.json" + test_loc = self.get_test_loc("apache/felix.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # note: remove the "()" below once this visitor route is made active again _, result, _ = apache.ApacheSingleProjectJsonVisitor()(uri) - expected_loc = self.get_test_loc('apache/felix_expected.json') + expected_loc = self.get_test_loc("apache/felix_expected.json") self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_ApacheSingleProjectJsonVisitor_error2_json(self): - uri = 'https://projects.apache.org/json/projects/attic-mrunit.json' - test_loc = self.get_test_loc('apache/attic-mrunit.json') - with patch('requests.get') as mock_http_get: + uri = "https://projects.apache.org/json/projects/attic-mrunit.json" + test_loc = self.get_test_loc("apache/attic-mrunit.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # note: remove the "()" below once 
this visitor route is made active again _, result, _ = apache.ApacheSingleProjectJsonVisitor()(uri) - expected_loc = self.get_test_loc('apache/attic-mrunit_expected.json') + expected_loc = self.get_test_loc("apache/attic-mrunit_expected.json") self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_ApacheSingleProjectJsonVisitor_error3_json(self): - uri = 'https://projects.apache.org/json/projects/metamodel.json' - test_loc = self.get_test_loc('apache/metamodel.json') - with patch('requests.get') as mock_http_get: + uri = "https://projects.apache.org/json/projects/metamodel.json" + test_loc = self.get_test_loc("apache/metamodel.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # note: remove the "()" below once this visitor route is made active again _, result, _ = apache.ApacheSingleProjectJsonVisitor()(uri) - expected_loc = self.get_test_loc('apache/metamodel_expected.json') + expected_loc = self.get_test_loc("apache/metamodel_expected.json") self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_ApachePodlingsJsonVisitor(self): - uri = 'https://projects.apache.org/json/foundation/podlings.json' - test_loc = self.get_test_loc('apache/podlings.json') - with patch('requests.get') as mock_http_get: + uri = "https://projects.apache.org/json/foundation/podlings.json" + test_loc = self.get_test_loc("apache/podlings.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # note: remove the "()" below once this visitor route is made active again uris, result, _ = apache.ApachePodlingsJsonVisitor()(uri) - expected_loc = self.get_test_loc('apache/podlings_expected_uris.json') + expected_loc = self.get_test_loc("apache/podlings_expected_uris.json") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) - expected_loc = self.get_test_loc('apache/podlings_expected.json') + expected_loc = self.get_test_loc("apache/podlings_expected.json") self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) class ApacheMapperTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_build_package_from_download(self): package = miners.apache.build_package_from_download( - 'http://archive.apache.org/dist/groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip', - 'pkg:apache/groovy@2.4.6') - expected_loc = self.get_test_loc('apache/map-groovy_expected.json') + "http://archive.apache.org/dist/groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip", + "pkg:apache/groovy@2.4.6", + ) + expected_loc = self.get_test_loc("apache/map-groovy_expected.json") self.check_expected_results( - package.to_dict(), expected_loc, regen=FIXTURES_REGEN) + package.to_dict(), expected_loc, regen=FIXTURES_REGEN + ) def test_build_package_from_download2(self): package = miners.apache.build_package_from_download( - 'http://archive.apache.org/dist/turbine/maven/turbine-webapp-2.3.3-1.0.0-source-release.zip', - 'pkg:apache/turbine-webapp@2.3.3-1.0.0-source-release') - expected_loc = self.get_test_loc( - 'apache/map-turbine-webapp_expected.json') + "http://archive.apache.org/dist/turbine/maven/turbine-webapp-2.3.3-1.0.0-source-release.zip", + "pkg:apache/turbine-webapp@2.3.3-1.0.0-source-release", + ) + expected_loc = self.get_test_loc("apache/map-turbine-webapp_expected.json") 
         self.check_expected_results(
-            package.to_dict(), expected_loc, regen=FIXTURES_REGEN)
+            package.to_dict(), expected_loc, regen=FIXTURES_REGEN
+        )
 
     # TODO: add tests for checksums
 
     def test_build_packages_from_projects_json(self):
-        with open(self.get_test_loc('apache/projects.json')) as projectsjson_meta:
+        with open(self.get_test_loc("apache/projects.json")) as projectsjson_meta:
             metadata = json.load(projectsjson_meta, object_pairs_hook=OrderedDict)
         packages = miners.apache.build_packages_from_projects(metadata)
         packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc('apache/projects_expected.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+        expected_loc = self.get_test_loc("apache/projects_expected.json")
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)
 
     def test_build_packages_from_one_podling_json(self):
-        with open(self.get_test_loc('apache/podling_amaterasu.json')) as podlings_meta:
+        with open(self.get_test_loc("apache/podling_amaterasu.json")) as podlings_meta:
             metadata = json.load(podlings_meta, object_pairs_hook=OrderedDict)
-        packages = miners.apache.build_packages_from_podlings(metadata, purl='pkg:apache-podlings/amaterasu')
+        packages = miners.apache.build_packages_from_podlings(
+            metadata, purl="pkg:apache-podlings/amaterasu"
+        )
         packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc(
-            'apache/podling_amaterasu_expected.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+        expected_loc = self.get_test_loc("apache/podling_amaterasu_expected.json")
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)
 
     # TODO: add real mapper class tests
 
     def test_regex_1(self):
-        regex = re.compile(r'^https?://(archive\.)?apache\.org/dist/.*$')
+        regex = re.compile(r"^https?://(archive\.)?apache\.org/dist/.*$")
         result = re.match(
-            regex, 'http://archive.apache.org/dist/groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip')
+            regex,
+            "http://archive.apache.org/dist/groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip",
+        )
         self.assertTrue(result)
 
     def test_regex_2(self):
-        regex = re.compile(r'^https?://(archive\.)?apache\.org/dist/.*$')
+        regex = re.compile(r"^https?://(archive\.)?apache\.org/dist/.*$")
         result = re.match(
-            regex, 'https://apache.org/dist/chemistry/opencmis/1.1.0/chemistry-opencmis-dist-1.1.0-server-webapps.zip')
+            regex,
+            "https://apache.org/dist/chemistry/opencmis/1.1.0/chemistry-opencmis-dist-1.1.0-server-webapps.zip",
+        )
         self.assertTrue(result)
diff --git a/minecode/tests/miners/test_bitbucket.py b/minecode/tests/miners/test_bitbucket.py
index ad9a35f8..f123f343 100644
--- a/minecode/tests/miners/test_bitbucket.py
+++ b/minecode/tests/miners/test_bitbucket.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 #
 # Copyright (c) nexB Inc. and others. All rights reserved.
 # purldb is a trademark of nexB Inc.
@@ -8,140 +7,139 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
# -from collections import OrderedDict import json import os import re - -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode.miners.bitbucket import build_bitbucket_download_packages -from minecode.miners.bitbucket import build_bitbucket_repo_package +from collections import OrderedDict +from unittest.mock import patch from minecode.miners.bitbucket import BitbucketDetailsVisitorPaginated from minecode.miners.bitbucket import BitbucketIndexVisitor from minecode.miners.bitbucket import BitbucketSingleRepoVisitor - +from minecode.miners.bitbucket import build_bitbucket_download_packages +from minecode.miners.bitbucket import build_bitbucket_repo_package from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class BitbucketVisitorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_BitbucketIndexVisitor(self): - uri = 'https://api.bitbucket.org/2.0/repositories?pagelen=10' - test_loc = self.get_test_loc('bitbucket/visit/index-repositories.json') + uri = "https://api.bitbucket.org/2.0/repositories?pagelen=10" + test_loc = self.get_test_loc("bitbucket/visit/index-repositories.json") - with patch('requests.get') as mock_http_get: + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, data, _ = BitbucketIndexVisitor(uri) expected_uri_loc = self.get_test_loc( - 'bitbucket/visit/index-repositories_expected_uris.json') + "bitbucket/visit/index-repositories_expected_uris.json" + ) self.check_expected_uris(uris, expected_uri_loc, regen=FIXTURES_REGEN) expected_data_loc = self.get_test_loc( - 'bitbucket/visit/index-repositories_expected_data.json') - self.check_expected_results( - data, expected_data_loc, regen=FIXTURES_REGEN) + "bitbucket/visit/index-repositories_expected_data.json" + ) + self.check_expected_results(data, expected_data_loc, regen=FIXTURES_REGEN) def test_BitbucketSingleRepoVisitor(self): - uri = 'https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/' - test_loc = self.get_test_loc('bitbucket/visit/singlerepo.json') + uri = "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/" + test_loc = self.get_test_loc("bitbucket/visit/singlerepo.json") - with patch('requests.get') as mock_http_get: + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, data, _ = BitbucketSingleRepoVisitor(uri) expected_data_loc = self.get_test_loc( - 'bitbucket/visit/singlerepo_expected_data.json') - self.check_expected_results( - data, expected_data_loc, regen=FIXTURES_REGEN) + "bitbucket/visit/singlerepo_expected_data.json" + ) + self.check_expected_results(data, expected_data_loc, regen=FIXTURES_REGEN) expected_uris_loc = self.get_test_loc( - 'bitbucket/visit/singlerepo_expected_uris.json') + "bitbucket/visit/singlerepo_expected_uris.json" + ) self.check_expected_uris(uris, expected_uris_loc, regen=FIXTURES_REGEN) def test_BitbucketDetailsVisitorPaginated(self): - uri = 'https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/refs/tags?pagelen=2' - test_loc = self.get_test_loc('bitbucket/visit/paginated_tags.json') + uri = 
"https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/refs/tags?pagelen=2" + test_loc = self.get_test_loc("bitbucket/visit/paginated_tags.json") - with patch('requests.get') as mock_http_get: + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, data, _ = BitbucketDetailsVisitorPaginated(uri) expected_data_loc = self.get_test_loc( - 'bitbucket/visit/paginated_tags_expected_data.json') - self.check_expected_results( - data, expected_data_loc, regen=FIXTURES_REGEN) + "bitbucket/visit/paginated_tags_expected_data.json" + ) + self.check_expected_results(data, expected_data_loc, regen=FIXTURES_REGEN) expected_uris_loc = self.get_test_loc( - 'bitbucket/visit/paginated_tags_expected_uris.json') + "bitbucket/visit/paginated_tags_expected_uris.json" + ) self.check_expected_uris(uris, expected_uris_loc, regen=FIXTURES_REGEN) class BitbucketMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_pattern_match_without_download(self): - url = 'https://api.bitbucket.org/2.0/repositories/phlogistonjohn/tweakmsg' - pattern = 'https://api.bitbucket.org/2.0/repositories/.*(?= 5.0.37.2)' in result['Build-Depends']) - self.assertTrue('cmake' in result['Build-Depends']) + self.assertEqual("lastfm-python-mirbuild", result["Source"]) + self.assertEqual("python", result["Section"]) + self.assertEqual("optional", result["Priority"]) + self.assertEqual("3.9.1", result["Standards-Version"]) + self.assertTrue("debhelper (>= 5.0.37.2)" in result["Build-Depends"]) + self.assertTrue("cmake" in result["Build-Depends"]) @expectedFailure def test_debcon_get_paragraph_data_from_file_control_invalid(self): - control_file = self.get_test_loc('debian/debutils/control_invalid') + control_file = self.get_test_loc("debian/debutils/control_invalid") result = debcon.get_paragraph_data_from_file(control_file) self.assertEqual({}, result) @expectedFailure def test_debcon_get_paragraph_data_from_file_with_non_existing_path(self): - control_file = 'path_invalid' + control_file = "path_invalid" with self.assertRaises(Exception) as context: debcon.get_paragraph_data_from_file(control_file) - self.assertTrue('No such file or directory' in context.exception) + self.assertTrue("No such file or directory" in context.exception) def test_parse_deb822_dsc(self): - dsc_file = self.get_test_loc('debian/debutils/3dldf_2.0.3+dfsg-2.dsc') + dsc_file = self.get_test_loc("debian/debutils/3dldf_2.0.3+dfsg-2.dsc") result = debcon.get_paragraph_data_from_file(dsc_file) expected_loc = self.get_test_loc( - 'debian/debutils/3dldf_2.0.3+dfsg-2.dsc-expected') + "debian/debutils/3dldf_2.0.3+dfsg-2.dsc-expected" + ) self.check_expected_deb822(result, expected_loc, regen=FIXTURES_REGEN) ################################################################# def test_parse_email(self): - content = 'Debian TeX Maintainers ' + content = "Debian TeX Maintainers " name, email = debutils.parse_email(content) - self.assertEqual('Debian TeX Maintainers', name) - self.assertEqual('debian-tex-maint@lists.debian.org', email) + self.assertEqual("Debian TeX Maintainers", name) + self.assertEqual("debian-tex-maint@lists.debian.org", email) def test_parse_email_2(self): # Space left Purposefully - content = ' Debian TeX Maintainers ' + content = " Debian TeX Maintainers " name, email = debutils.parse_email(content) - self.assertEqual('Debian 
TeX Maintainers', name) + self.assertEqual("Debian TeX Maintainers", name) self.assertEqual(None, email) def test_parse_email_3(self): # Space left Purposefully - content = '< debian-tex-maint@lists.debian.org >' + content = "< debian-tex-maint@lists.debian.org >" name, email = debutils.parse_email(content) self.assertEqual(None, name) self.assertEqual("debian-tex-maint@lists.debian.org", email) def test_comma_separated(self): - tags = 'implemented-in::perl, role::program, use::converting, works-with::pim' + tags = "implemented-in::perl, role::program, use::converting, works-with::pim" result = list(debutils.comma_separated(tags)) - self.assertEqual([u'implemented-in::perl', u'role::program', - u'use::converting', u'works-with::pim'], result) + self.assertEqual( + [ + "implemented-in::perl", + "role::program", + "use::converting", + "works-with::pim", + ], + result, + ) class DebianReleaseTest(BaseDebianTest): - def test_parse_release(self): - release_file = self.get_test_loc('debian/release/Release') + release_file = self.get_test_loc("debian/release/Release") result = list(debian.parse_release(release_file)) - expected_loc = self.get_test_loc('debian/release/Release_expected') + expected_loc = self.get_test_loc("debian/release/Release_expected") self.check_expected_deb822(result, expected_loc) def test_parse_release_with_md5(self): - release_file = self.get_test_loc('debian/release/Release_with_md5') + release_file = self.get_test_loc("debian/release/Release_with_md5") result = list(debian.parse_release(release_file)) - expected_loc = self.get_test_loc('debian/release/Release_with_md5_expected') + expected_loc = self.get_test_loc("debian/release/Release_with_md5_expected") self.check_expected_deb822(result, expected_loc) @expectedFailure def test_visit_debian_release(self): - uri = 'http://ftp.debian.org/debian/dists/Debian8.3/Release' - test_loc = self.get_test_loc('debian/release/visited_Release') - with patch('requests.get') as mock_http_get: + uri = "http://ftp.debian.org/debian/dists/Debian8.3/Release" + test_loc = self.get_test_loc("debian/release/visited_Release") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = debian.DebianReleaseVisitor(uri) result = json.loads(data) - release_file = self.get_test_loc( - 'debian/release/visited_Release-expected.json') + release_file = self.get_test_loc("debian/release/visited_Release-expected.json") self.check_expected_deb822(result, release_file) class DebianCopyrightTest(BaseDebianTest): - # TODO: There is an exception for the current debian copyright parser @expectedFailure def test_parse_copyright_only_basic(self): - copyright_file = self.get_test_loc('debian/copyright/basic_copyright') + copyright_file = self.get_test_loc("debian/copyright/basic_copyright") copyrights = [info for info in debian.parse_copyright_only(copyright_file)] - self.assertTrue('Copyright 1998 John Doe ' in copyrights) - self.assertTrue('Copyright 1998 Jane Doe ' in copyrights) + self.assertTrue("Copyright 1998 John Doe " in copyrights) + self.assertTrue("Copyright 1998 Jane Doe " in copyrights) @expectedFailure def test_parse_copyright_only_with_incorrect_file(self): - copyright_file = self.get_test_loc( - 'debian/copyright/invalid_copyright') + copyright_file = self.get_test_loc("debian/copyright/invalid_copyright") with self.assertRaises(Exception) as context: [info for info in debian.parse_copyright_only(copyright_file)] - self.assertTrue('no paragraphs in input' in context.exception) + 
self.assertTrue("no paragraphs in input" in context.exception) @expectedFailure def test_parse_copyright_only_with_incorrect_path(self): - copyright_file = 'path_invalid' + copyright_file = "path_invalid" with self.assertRaises(Exception) as context: [info for info in debian.parse_copyright_only(copyright_file)] - self.assertTrue('No such file or directory' in context.exception) + self.assertTrue("No such file or directory" in context.exception) @expectedFailure def test_parse_copyright_allinfo_basic(self): - copyright_file = self.get_test_loc('debian/copyright/basic_copyright') - copyright_data = [info for info in debian.parse_copyright_allinfo(copyright_file)] + copyright_file = self.get_test_loc("debian/copyright/basic_copyright") + copyright_data = [ + info for info in debian.parse_copyright_allinfo(copyright_file) + ] expected = [ - {'files': (u'*',), - 'license': u'GPL-2+', - 'copyright': 'Copyright 1998 John Doe ' - }, - {'files': (u'debian/*',), - 'license': u'GPL-2+', - 'copyright': 'Copyright 1998 Jane Doe ' - } + { + "files": ("*",), + "license": "GPL-2+", + "copyright": "Copyright 1998 John Doe ", + }, + { + "files": ("debian/*",), + "license": "GPL-2+", + "copyright": "Copyright 1998 Jane Doe ", + }, ] self.assertEqual(expected, copyright_data) @expectedFailure def test_parse_copyright_allinfo_with_invalid_file(self): - copyright_file = self.get_test_loc( - 'debian/copyright/invalid_copyright') + copyright_file = self.get_test_loc("debian/copyright/invalid_copyright") with self.assertRaises(Exception) as context: [info for info in debian.parse_copyright_allinfo(copyright_file)] - self.assertTrue('no paragraphs in input' in context.exception) + self.assertTrue("no paragraphs in input" in context.exception) @expectedFailure def test_parse_copyright_allinfo_with_incorrect_path(self): - copyright_file = 'path_invalid' + copyright_file = "path_invalid" with self.assertRaises(Exception) as context: [info for info in debian.parse_copyright_allinfo(copyright_file)] - self.assertTrue('No such file or directory' in context.exception) + self.assertTrue("No such file or directory" in context.exception) @expectedFailure def test_parse_license_basic(self): - copyright_file = self.get_test_loc('debian/copyright/basic_copyright') + copyright_file = self.get_test_loc("debian/copyright/basic_copyright") licenses, licensetexts = debian.parse_license(copyright_file) expected = { - 'GPL-2+': [ + "GPL-2+": [ "This program is free software; you can redistribute it\n" "and/or modify it under the terms of the GNU General Public\n" "License as published by the Free Software Foundation; either\n" @@ -251,174 +254,179 @@ def test_parse_license_basic(self): "On Debian systems, the full text of the GNU General Public\n" "License version 2 can be found in the file\n" "`/usr/share/common-licenses/GPL-2'." 
- ]} + ] + } self.assertEqual(expected, licenses) self.assertEqual([], licensetexts) @expectedFailure def test_parse_license_with_invalid_file(self): - copyright_file = self.get_test_loc( - 'debian/copyright/invalid_copyright') + copyright_file = self.get_test_loc("debian/copyright/invalid_copyright") with self.assertRaises(Exception) as context: debian.parse_license(copyright_file) - self.assertTrue('no paragraphs in input' in context.exception) + self.assertTrue("no paragraphs in input" in context.exception) @expectedFailure def test_parse_license_with_incorrect_path(self): - copyright_file = 'path_invalid' + copyright_file = "path_invalid" with self.assertRaises(Exception) as context: debian.parse_license(copyright_file) - self.assertTrue('No such file or directory' in context.exception) + self.assertTrue("No such file or directory" in context.exception) class DebianSourcesTest(BaseDebianTest): - def test_collect_source_packages(self): - index_file = self.get_test_loc('debian/sources/debian_Sources') + index_file = self.get_test_loc("debian/sources/debian_Sources") source_info = [info for info in debian.collect_source_packages(index_file)] - expected_loc = self.get_test_loc('debian/sources/debian_Sources_visit_expected') + expected_loc = self.get_test_loc("debian/sources/debian_Sources_visit_expected") self.check_objects_expected(source_info, expected_loc, regen=FIXTURES_REGEN) def test_collect_source_packages_ubuntu(self): - index_file = self.get_test_loc('debian/sources/ubuntu_Sources') + index_file = self.get_test_loc("debian/sources/ubuntu_Sources") source_info = [info for info in debian.collect_source_packages(index_file)] - expected_loc = self.get_test_loc('debian/sources/ubuntu_Sources_visit_expected') + expected_loc = self.get_test_loc("debian/sources/ubuntu_Sources_visit_expected") self.check_objects_expected(source_info, expected_loc, regen=FIXTURES_REGEN) @expectedFailure def test_DebianSourcesVisitor(self): - uri = 'http://ftp.debian.org/debian/dists/jessie-backports/main/source/Sources.gz' - test_loc = self.get_test_loc('debian/sources/Sources.gz') - with patch('requests.get') as mock_http_get: + uri = ( + "http://ftp.debian.org/debian/dists/jessie-backports/main/source/Sources.gz" + ) + test_loc = self.get_test_loc("debian/sources/Sources.gz") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = debian.DebianSourcesVisitor(uri) - expected_loc = self.get_test_loc('debian/sources/Sources.gz-expected.json') + expected_loc = self.get_test_loc("debian/sources/Sources.gz-expected.json") self.check_expected_uris(list(uris), expected_loc) @expectedFailure def test_DebianSourcesVisitor_with_invalid_file(self): - uri = 'http://ftp.debian.org/debian/dists/jessie-backports/main/source/invalid_files/Sources.gz' - test_loc = self.get_test_loc('debian/invalid_files/ls-lR.gz') - with patch('requests.get') as mock_http_get: + uri = "http://ftp.debian.org/debian/dists/jessie-backports/main/source/invalid_files/Sources.gz" + test_loc = self.get_test_loc("debian/invalid_files/ls-lR.gz") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _data, _ = debian.DebianSourcesVisitor(uri) self.assertEqual(0, len(list(uris))) @expectedFailure def test_build_source_file_packages(self): - with open(self.get_test_loc('debian/sources/debian_Sources')) as packs: + with open(self.get_test_loc("debian/sources/debian_Sources")) as packs: packages = 
debian.build_source_file_packages(packs.read()) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( - 'debian/sources/debian_Sources_mapped-expected-packages.json') + "debian/sources/debian_Sources_mapped-expected-packages.json" + ) self.check_expected_results(packages, expected_loc) class DebianPackagesTest(BaseDebianTest): - def test_parse_packages_index(self): - index_file = self.get_test_loc('debian/packages/debian_Packages') + index_file = self.get_test_loc("debian/packages/debian_Packages") package_info = [info for info in debian.parse_packages_index(index_file)] - expected_loc = self.get_test_loc('debian/packages/debian_Packages-visit-expected.json') + expected_loc = self.get_test_loc( + "debian/packages/debian_Packages-visit-expected.json" + ) self.check_objects_expected(package_info, expected_loc, regen=FIXTURES_REGEN) @expectedFailure def test_parse_packages_from_debian_Packages(self): - with open(self.get_test_loc('debian/packages/debian_Packages')) as packs: + with open(self.get_test_loc("debian/packages/debian_Packages")) as packs: packages = debian.parse_packages(packs.read()) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( - 'debian/packages/debian_Packages-expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "debian/packages/debian_Packages-expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) @expectedFailure def test_parse_packages_from_ubuntu_Packages(self): - with open(self.get_test_loc('debian/packages/ubuntu_Packages')) as packs: + with open(self.get_test_loc("debian/packages/ubuntu_Packages")) as packs: packages = debian.parse_packages(packs.read()) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( - 'debian/packages/ubuntu_Packages-expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "debian/packages/ubuntu_Packages-expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) @expectedFailure def test_parse_packages_from_installed_status(self): - with open(self.get_test_loc('debian/status/simple_status')) as packs: + with open(self.get_test_loc("debian/status/simple_status")) as packs: packages = debian.parse_packages(packs.read()) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( - 'debian/packages/ubuntu_Packages-expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "debian/packages/ubuntu_Packages-expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) class DebianLSLRTest(BaseDebianTest): - def test_DebianDirectoryIndexVisitor_from_debian(self): - uri = 'http://ftp.debian.org/debian/ls-lR.gz' - test_loc = self.get_test_loc('debian/lslr/ls-lR_debian') + uri = "http://ftp.debian.org/debian/ls-lR.gz" + test_loc = self.get_test_loc("debian/lslr/ls-lR_debian") temp_gz_location = self.get_tmp_gz_file(test_loc) - with patch('requests.get') as mock_http_get: + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, temp_gz_location) uris, _, _ = debian.DebianDirectoryIndexVisitor(uri) - expected_loc = self.get_test_loc('debian/lslr/ls-lR_debian.gz-expected.json') + expected_loc = self.get_test_loc("debian/lslr/ls-lR_debian.gz-expected.json") self.check_expected_uris(list(uris), expected_loc) def test_DebianDirectoryIndexVisitor_from_ubuntu(self): - uri = 
'http://archive.ubuntu.com/ubuntu/ls-lR.gz' - test_loc = self.get_test_loc('debian/lslr/ls-lR_ubuntu') + uri = "http://archive.ubuntu.com/ubuntu/ls-lR.gz" + test_loc = self.get_test_loc("debian/lslr/ls-lR_ubuntu") temp_gz_location = self.get_tmp_gz_file(test_loc) - with patch('requests.get') as mock_http_get: + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, temp_gz_location) uris, _, _ = debian.DebianDirectoryIndexVisitor(uri) - expected_loc = self.get_test_loc( - 'debian/lslr/ls-lR_ubuntu.gz-expected.json') + expected_loc = self.get_test_loc("debian/lslr/ls-lR_ubuntu.gz-expected.json") self.check_expected_uris(list(uris), expected_loc) class DebianDescriptionTest(BaseDebianTest): - @expectedFailure def test_DebianDescriptionVisitor(self): - uri = 'http://ftp.debian.org/debian/pool/main/7/7kaa/7kaa_2.14.3-1.dsc' - test_loc = self.get_test_loc('debian/dsc/7kaa_2.14.3-1.dsc') - with patch('requests.get') as mock_http_get: + uri = "http://ftp.debian.org/debian/pool/main/7/7kaa/7kaa_2.14.3-1.dsc" + test_loc = self.get_test_loc("debian/dsc/7kaa_2.14.3-1.dsc") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = debian.DebianDescriptionVisitor(uri) result = json.loads(data) - dsc_file = self.get_test_loc('debian/dsc/description_expected.json') + dsc_file = self.get_test_loc("debian/dsc/description_expected.json") self.check_expected_deb822(result, dsc_file) @expectedFailure def test_parse_description(self): - with open(self.get_test_loc('debian/dsc/description.json')) as debian_description_meta: + with open( + self.get_test_loc("debian/dsc/description.json") + ) as debian_description_meta: metadata = json.load(debian_description_meta) packages = debian.parse_description(metadata) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'debian/dsc/description-expected.json') + expected_loc = self.get_test_loc("debian/dsc/description-expected.json") self.check_expected_results(packages, expected_loc) class DebianMapperTest(BaseDebianTest): - @expectedFailure def test_get_dependencies(self): test = { - 'build1': 'build', - 'build2': 'build2', - 'build3': 'buildnot', + "build1": "build", + "build2": "build2", + "build3": "buildnot", } - keys = ['build1', 'build2'] + keys = ["build1", "build2"] result = debian.get_dependencies(test, keys) self.assertEqual(2, len(result)) - self.assertEqual('build', result[0].purl) + self.assertEqual("build", result[0].purl) self.assertEqual(None, result[0].requirement) - self.assertEqual('build2', result[1].purl) + self.assertEqual("build2", result[1].purl) self.assertEqual(None, result[1].requirement) def test_get_programming_language(self): - tags = ['role::program', 'implemented-in::perl', 'use::converting', 'works-with::pim'] + tags = [ + "role::program", + "implemented-in::perl", + "use::converting", + "works-with::pim", + ] result = debian.get_programming_language(tags) - self.assertEqual('perl', result) + self.assertEqual("perl", result) diff --git a/minecode/tests/miners/test_dockerhub.py b/minecode/tests/miners/test_dockerhub.py index 8a7f802d..4785cc08 100644 --- a/minecode/tests/miners/test_dockerhub.py +++ b/minecode/tests/miners/test_dockerhub.py @@ -10,80 +10,70 @@ import json import os from collections import OrderedDict +from unittest.mock import patch - -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - 
+from minecode import miners from minecode.miners import dockerhub from minecode.tests import FIXTURES_REGEN -from minecode import miners +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class DockerHubTest(JsonBasedTesting): - - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) class DockerHubVistorTest(DockerHubTest): - def test_searching_condition(self): combinations = dockerhub.get_search_conditions() - expected_file = self.get_test_loc('dockerhub/conditions_expected') - self.check_expected_results( - combinations, expected_file, regen=FIXTURES_REGEN) + expected_file = self.get_test_loc("dockerhub/conditions_expected") + self.check_expected_results(combinations, expected_file, regen=FIXTURES_REGEN) def test_seeds(self): seed = dockerhub.DockerHubSeed() seeds = list(seed.get_seeds()) - expected_file = self.get_test_loc('dockerhub/seeds_expected') + expected_file = self.get_test_loc("dockerhub/seeds_expected") self.check_expected_results(seeds, expected_file, regen=FIXTURES_REGEN) def test_visit_dockerhub_exlpore_page(self): - uri = 'https://hub.docker.com/explore/?page=1' - test_loc = self.get_test_loc('dockerhub/Explore_DockerHub_Page1.html') - with patch('requests.get') as mock_http_get: + uri = "https://hub.docker.com/explore/?page=1" + test_loc = self.get_test_loc("dockerhub/Explore_DockerHub_Page1.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = dockerhub.DockHubExplorePageVisitor(uri) - expected_loc = self.get_test_loc( - 'dockerhub/visitor_explore_page1_expected') + expected_loc = self.get_test_loc("dockerhub/visitor_explore_page1_expected") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_visit_dockerhub_project(self): - uri = 'https://hub.docker.com/_/elixir/' - test_loc = self.get_test_loc('dockerhub/library_elixir.html') - with patch('requests.get') as mock_http_get: + uri = "https://hub.docker.com/_/elixir/" + test_loc = self.get_test_loc("dockerhub/library_elixir.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = dockerhub.DockHubProjectHTMLVisitor(uri) result = json.loads(data, object_pairs_hook=OrderedDict) - expected_file = self.get_test_loc( - 'dockerhub/visitor_library_elixir_expected') - self.check_expected_results( - result, expected_file, regen=FIXTURES_REGEN) + expected_file = self.get_test_loc("dockerhub/visitor_library_elixir_expected") + self.check_expected_results(result, expected_file, regen=FIXTURES_REGEN) def test_visit_dockerhub_search_api(self): - uri = 'https://index.docker.io/v1/search?q=1a&n=100&page=2' - test_loc = self.get_test_loc('dockerhub/search.json') - with patch('requests.get') as mock_http_get: + uri = "https://index.docker.io/v1/search?q=1a&n=100&page=2" + test_loc = self.get_test_loc("dockerhub/search.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = dockerhub.DockHubLibraryRESTJsonVisitor(uri) - expected_loc = self.get_test_loc('dockerhub/visitor_search_expected') + expected_loc = self.get_test_loc("dockerhub/visitor_search_expected") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) class DockerHubMapperTest(DockerHubTest): - def test_build_packages_fromjson(self): - with 
open(self.get_test_loc('dockerhub/elixir.json')) as dockerhub_metadata: + with open(self.get_test_loc("dockerhub/elixir.json")) as dockerhub_metadata: metadata = dockerhub_metadata.read() packages = miners.dockerhub.build_packages_from_jsonfile( - metadata, 'https://registry.hub.docker.com/v2/repositories/library') + metadata, "https://registry.hub.docker.com/v2/repositories/library" + ) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'dockerhub/expected_dockerhubmapper.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("dockerhub/expected_dockerhubmapper.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_eclipse.py b/minecode/tests/miners/test_eclipse.py index 7cbc4245..5466bbb9 100644 --- a/minecode/tests/miners/test_eclipse.py +++ b/minecode/tests/miners/test_eclipse.py @@ -10,114 +10,120 @@ import json import os import unittest +from unittest.mock import patch -from mock import Mock -from mock import patch import requests -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - from minecode import miners from minecode.miners import URI from minecode.miners import eclipse from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class EclipseVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_visit_eclipse_projects(self): - uri = 'https://projects.eclipse.org/list-of-projects' - test_loc = self.get_test_loc('eclipse/projects.eclipse.org.html') - with patch('requests.get') as mock_http_get: + uri = "https://projects.eclipse.org/list-of-projects" + test_loc = self.get_test_loc("eclipse/projects.eclipse.org.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = eclipse.EclipseProjectVisitors(uri) - expected_loc = self.get_test_loc('eclipse/eclipse_projects_expected') + expected_loc = self.get_test_loc("eclipse/eclipse_projects_expected") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_visit_eclipse_project(self): - uri = 'https://projects.eclipse.org/projects/modeling.m2t.acceleo' - test_loc = self.get_test_loc( - 'eclipse/Acceleo_projects.eclipse.org.html') - with patch('requests.get') as mock_http_get: + uri = "https://projects.eclipse.org/projects/modeling.m2t.acceleo" + test_loc = self.get_test_loc("eclipse/Acceleo_projects.eclipse.org.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = eclipse.EclipseSingleProjectVisitor(uri) - with open(self.get_test_loc('eclipse/acceleo_expected.html'), 'rb') as data_file: + with open( + self.get_test_loc("eclipse/acceleo_expected.html"), "rb" + ) as data_file: self.assertEqual(data_file.read(), data) def test_visit_eclipse_git_repo(self): - uri = 'http://git.eclipse.org/c' - test_loc = self.get_test_loc('eclipse/Eclipse_Git_repositories.html') - with patch('requests.get') as mock_http_get: + uri = "http://git.eclipse.org/c" + test_loc = self.get_test_loc("eclipse/Eclipse_Git_repositories.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, 
test_loc) uris, _, _ = eclipse.EclipseGitVisitor(uri) - expected_loc = self.get_test_loc('eclipse/eclipse_git_repos_expected') + expected_loc = self.get_test_loc("eclipse/eclipse_git_repos_expected") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_visit_eclipse_packages(self): - uri = 'http://www.eclipse.org/downloads/packages/all' - test_loc = self.get_test_loc('eclipse/All_Releases_Packages.html') - with patch('requests.get') as mock_http_get: + uri = "http://www.eclipse.org/downloads/packages/all" + test_loc = self.get_test_loc("eclipse/All_Releases_Packages.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = eclipse.EclipsePackagesVisitor(uri) - expected_loc = self.get_test_loc('eclipse/eclipse_packages_expected') + expected_loc = self.get_test_loc("eclipse/eclipse_packages_expected") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_visit_eclipse_package_releases(self): - uri = 'http://www.eclipse.org/downloads/packages/release/Neon/R' - test_loc = self.get_test_loc('eclipse/Neon_R.html') - with patch('requests.get') as mock_http_get: + uri = "http://www.eclipse.org/downloads/packages/release/Neon/R" + test_loc = self.get_test_loc("eclipse/Neon_R.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = eclipse.EclipseReleaseVisitor(uri) - expected_loc = self.get_test_loc('eclipse/Neon_R-expected.json') + expected_loc = self.get_test_loc("eclipse/Neon_R-expected.json") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_visit_eclipse_projects_json(self): - uri = 'http://projects.eclipse.org/json/projects/all' - test_loc = self.get_test_loc('eclipse/birt.json') - with patch('requests.get') as mock_http_get: + uri = "http://projects.eclipse.org/json/projects/all" + test_loc = self.get_test_loc("eclipse/birt.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, data, _error = eclipse.EclipseProjectsJsonVisitor(uri) expected_uris = [ - URI(uri=u'http://projects.eclipse.org/json/project/birt', - source_uri=u'http://projects.eclipse.org/json/projects/all', - package_url=u'pkg:eclipse/birt')] + URI( + uri="http://projects.eclipse.org/json/project/birt", + source_uri="http://projects.eclipse.org/json/projects/all", + package_url="pkg:eclipse/birt", + ) + ] self.assertEqual(expected_uris, list(uris)) - expected_loc = self.get_test_loc('eclipse/birt-expected.json') + expected_loc = self.get_test_loc("eclipse/birt-expected.json") self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN) - @unittest.skip('This requires a live internet connection to test requests timeouts') + @unittest.skip("This requires a live internet connection to test requests timeouts") def test_visitor_eclipse_projects_json_download_timeout_error(self): - uri = 'http://projects.eclipse.org/json/projects/all' + uri = "http://projects.eclipse.org/json/projects/all" try: eclipse.EclipseProjectsJsonVisitor(uri) except requests.Timeout: self.fail( "Time out error happens when download the url, " - "this should be fixed by increaseing the timeout.") + "this should be fixed by increasing the timeout."
+ ) class TestEclipseMap(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_build_packages(self): - with open(self.get_test_loc('eclipse/birt.json')) as eclipse_metadata: + with open(self.get_test_loc("eclipse/birt.json")) as eclipse_metadata: metadata = json.load(eclipse_metadata) packages = miners.eclipse.build_packages_with_json(metadata) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('eclipse/eclipse_birt_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("eclipse/eclipse_birt_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_build_eclipse_html_packages(self): - with open(self.get_test_loc('eclipse/Acceleo_projects.eclipse.org.html')) as eclipse_metadata: + with open( + self.get_test_loc("eclipse/Acceleo_projects.eclipse.org.html") + ) as eclipse_metadata: metadata = eclipse_metadata.read() packages = miners.eclipse.build_packages(metadata) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'eclipse/Acceleo_projects_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("eclipse/Acceleo_projects_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_fdroid.py b/minecode/tests/miners/test_fdroid.py index ffd7f1c5..f81543e2 100644 --- a/minecode/tests/miners/test_fdroid.py +++ b/minecode/tests/miners/test_fdroid.py @@ -9,39 +9,42 @@ import json import os +from unittest.mock import patch -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode.miners import fdroid from minecode.miners import URI +from minecode.miners import fdroid from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class TestFdroidVisitor(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_FdroidPackageRepoVisitor(self): - uri = 'https://f-droid.org/repo/index-v2.json' - test_loc = self.get_test_loc('fdroid/index-v2.json') - with patch('requests.get') as mock_http_get: + uri = "https://f-droid.org/repo/index-v2.json" + test_loc = self.get_test_loc("fdroid/index-v2.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, data, _errors = fdroid.FdroidIndexVisitor(uri) # this is a non-persistent visitor, lets make sure we dont return any data assert not data expected_loc = self.get_test_loc( - 'fdroid/index-v2-expected-visit.json',) + "fdroid/index-v2-expected-visit.json", + ) self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) class TestFdroidMapper(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_build_packages(self): - with open(self.get_test_loc('fdroid/index-v2-visited.json')) as fdroid_data: + with open(self.get_test_loc("fdroid/index-v2-visited.json")) 
as fdroid_data: visited_uris = json.load(fdroid_data) visited_uris = [URI(**uri) for uri in visited_uris] purl_data = [(u.package_url, json.loads(u.data)) for u in visited_uris] @@ -52,7 +55,5 @@ def test_build_packages(self): packages.extend(pkgs) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'fdroid/index-v2-visited-expected-mapped.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("fdroid/index-v2-visited-expected-mapped.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_freebsd.py b/minecode/tests/miners/test_freebsd.py index a16c9a7e..5f25171a 100644 --- a/minecode/tests/miners/test_freebsd.py +++ b/minecode/tests/miners/test_freebsd.py @@ -9,60 +9,59 @@ import os -import yaml - -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting +from unittest.mock import patch from minecode import miners from minecode.miners import freebsd from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class FreeBSDVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_visit_freebsd_seed(self): - uri = 'https://pkg.freebsd.org' - test_loc = self.get_test_loc('freebsd/FreeBSD.org.html') - with patch('requests.get') as mock_http_get: + uri = "https://pkg.freebsd.org" + test_loc = self.get_test_loc("freebsd/FreeBSD.org.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = freebsd.FreeBSDBaseHTMLVisitors(uri) - expected_loc = self.get_test_loc('freebsd/FreeBSD.org.html_expected') + expected_loc = self.get_test_loc("freebsd/FreeBSD.org.html_expected") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_visit_freebsd_subHTML(self): - uri = 'https://pkg.freebsd.org/FreeBSD:10:i386/release_0/' - test_loc = self.get_test_loc('freebsd/FreeBSD-10-i386_release_0_.html') - with patch('requests.get') as mock_http_get: + uri = "https://pkg.freebsd.org/FreeBSD:10:i386/release_0/" + test_loc = self.get_test_loc("freebsd/FreeBSD-10-i386_release_0_.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = freebsd.FreeBSDSubHTMLVisitors(uri) expected_loc = self.get_test_loc( - 'freebsd/FreeBSD-10-i386_release_0_.html_expected') + "freebsd/FreeBSD-10-i386_release_0_.html_expected" + ) self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_visit_freebsd_indexvisitor(self): - uri = 'https://pkg.freebsd.org/FreeBSD:10:i386/release_0/packagesite.txz' - test_loc = self.get_test_loc('freebsd/packagesite.txz') - with patch('requests.get') as mock_http_get: + uri = "https://pkg.freebsd.org/FreeBSD:10:i386/release_0/packagesite.txz" + test_loc = self.get_test_loc("freebsd/packagesite.txz") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = freebsd.FreeBSDIndexVisitors(uri) - expected_loc = self.get_test_loc('freebsd/indexfile_expected') + expected_loc = self.get_test_loc("freebsd/indexfile_expected") self.check_expected_results(data, expected_loc, 
regen=FIXTURES_REGEN) class FreedesktopMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_map_index_file(self): - with open(self.get_test_loc('freebsd/mapper_input1')) as freebsd_metadata: + with open(self.get_test_loc("freebsd/mapper_input1")) as freebsd_metadata: metadata = freebsd_metadata.read() packages = miners.freebsd.build_packages(metadata) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'freebsd/indexfile_expected_mapper.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("freebsd/indexfile_expected_mapper.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_freedesktop.py b/minecode/tests/miners/test_freedesktop.py index 00515f08..4932d02a 100644 --- a/minecode/tests/miners/test_freedesktop.py +++ b/minecode/tests/miners/test_freedesktop.py @@ -8,60 +8,55 @@ # import os - -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting +from unittest.mock import patch from minecode import miners from minecode.miners import freedesktop from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class FreedesktopTest(JsonBasedTesting): - - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) class FreedesktopVistorTest(FreedesktopTest): - def test_visit_software_html_page(self): - uri = 'https://www.freedesktop.org/wiki/Software' - test_loc = self.get_test_loc('freedesktop/Software.html') - with patch('requests.get') as mock_http_get: + uri = "https://www.freedesktop.org/wiki/Software" + test_loc = self.get_test_loc("freedesktop/Software.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = freedesktop.FreedesktopHTMLVisitor(uri) - expected_loc = self.get_test_loc( - 'freedesktop/freedesktop_software_expected') + expected_loc = self.get_test_loc("freedesktop/freedesktop_software_expected") self.check_expected_uris(uris, expected_loc) class FreedesktopMapperTest(FreedesktopTest): - def test_map_software_html_page_hal(self): - with open(self.get_test_loc('freedesktop/hal.html')) as freedesktop_metadata: + with open(self.get_test_loc("freedesktop/hal.html")) as freedesktop_metadata: metadata = freedesktop_metadata.read() packages = miners.freedesktop.build_packages( metadata, - 'https://www.freedesktop.org/wiki/Software/hal', - purl='pkg:freedesktop/hal') + "https://www.freedesktop.org/wiki/Software/hal", + purl="pkg:freedesktop/hal", + ) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'freedesktop/hal_project_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("freedesktop/hal_project_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_map_software_html_page_libinput(self): - with open(self.get_test_loc('freedesktop/libinput.html')) as freedesktop_metadata: + with open( + self.get_test_loc("freedesktop/libinput.html") + ) 
as freedesktop_metadata: metadata = freedesktop_metadata.read() packages = miners.freedesktop.build_packages( metadata, - 'https://www.freedesktop.org/wiki/Software/libinput/', - purl='pkg:freedesktop/libinput') + "https://www.freedesktop.org/wiki/Software/libinput/", + purl="pkg:freedesktop/libinput", + ) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'freedesktop/libinput_project_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("freedesktop/libinput_project_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_github.py b/minecode/tests/miners/test_github.py index 4bb6c42f..0ac02059 100644 --- a/minecode/tests/miners/test_github.py +++ b/minecode/tests/miners/test_github.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # purldb is a trademark of nexB Inc. @@ -9,44 +8,42 @@ # import os - -from mock import MagicMock -from mock import Mock -from mock import patch +from unittest.mock import MagicMock +from unittest.mock import patch from github.Download import Download -from github.MainClass import Github from github.Repository import Repository -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - from minecode import miners from minecode.miners import github from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class GithubVisitorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) - @patch('github.MainClass.Github.get_repo') + @patch("github.MainClass.Github.get_repo") def test_GithubRepoVisitor(self, mock_get_repo): repository = MagicMock(spec=Repository) - repository.name = 'grit' + repository.name = "grit" repository.size = 7954 repository.id = 1 - repository.description = '**Grit is no longer maintained. Check out libgit2/rugged.** Grit gives you object oriented read/write access to Git repositories via Ruby.' - repository.language = 'Ruby' - repository.homepage = 'http://grit.rubyforge.org/' + repository.description = "**Grit is no longer maintained. Check out libgit2/rugged.** Grit gives you object oriented read/write access to Git repositories via Ruby." 
+ repository.language = "Ruby" + repository.homepage = "http://grit.rubyforge.org/" repository._issues_url = None repository._git_url = None - repository.html_url = 'https://github.com/mojombo/grit' + repository.html_url = "https://github.com/mojombo/grit" repository.svn_url = None repository.etag = None - repository.clone_url = 'https://github.com/mojombo/grit.git' + repository.clone_url = "https://github.com/mojombo/grit.git" repository.watchers = None - repository.full_name = 'mojombo/grit' - repository.ssh_url = 'git@github.com:mojombo/grit.git' + repository.full_name = "mojombo/grit" + repository.ssh_url = "git@github.com:mojombo/grit.git" repository.owner = None repository.blobs_url = None repository.master_branch = None @@ -54,17 +51,17 @@ def test_GithubRepoVisitor(self, mock_get_repo): repository.pushed_at = None download = MagicMock(spec=Download) - download.name = 'grit-1.0.1.gem' + download.name = "grit-1.0.1.gem" download.redirect = None download.description = None - download.url = 'https://api.github.com/repos/mojombo/grit/downloads/5' + download.url = "https://api.github.com/repos/mojombo/grit/downloads/5" download.size = 1861632 download.s3_url = None download.created_at = None download.download_count = 187 download.redirect = None download.signature = None - download.html_url = 'https://github.com/downloads/mojombo/grit/grit-1.0.1.gem' + download.html_url = "https://github.com/downloads/mojombo/grit/grit-1.0.1.gem" download.bucket = None download.acl = None download.accesskeyid = None @@ -72,44 +69,44 @@ def test_GithubRepoVisitor(self, mock_get_repo): repository.get_downloads.return_value = iter([download]) tag = MagicMock() - tag.name = 'tags' - tag.zipball_url = 'https://api.github.com/repos/mojombo/grit/zipball/v2.5.0' - tag.tarball_url = 'https://api.github.com/repos/mojombo/grit/tarball/v2.5.0' - tag.name = 'v2.5.0' + tag.name = "tags" + tag.zipball_url = "https://api.github.com/repos/mojombo/grit/zipball/v2.5.0" + tag.tarball_url = "https://api.github.com/repos/mojombo/grit/tarball/v2.5.0" + tag.name = "v2.5.0" tag.commit = None repository.get_tags.return_value = iter([tag]) label = MagicMock() - label.name = 'label 1' + label.name = "label 1" repository.get_labels.return_value = iter([label]) mock_get_repo.return_value = repository - uri = 'https://api.github.com/repos/mojombo/grit' + uri = "https://api.github.com/repos/mojombo/grit" _, data, _ = github.GithubSingleRepoVisitor(uri) - expected_loc = self.get_test_loc('github/mojombo_grit_expected.json') + expected_loc = self.get_test_loc("github/mojombo_grit_expected.json") self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN) - @patch('github.MainClass.Github.get_repo') + @patch("github.MainClass.Github.get_repo") def test_GithubRepoVisitor_without_tag_without_download(self, mock_get_repo): repository = MagicMock(spec=Repository) - repository.name = 'calendar_builder' + repository.name = "calendar_builder" repository.size = 188 repository.id = 367 repository.description = None - repository.language = 'Ruby' + repository.language = "Ruby" repository.homepage = None repository._issues_url = None repository._git_url = None - repository.html_url = 'https://github.com/collectiveidea/calendar_builder' + repository.html_url = "https://github.com/collectiveidea/calendar_builder" repository.svn_url = None repository.etag = '"e10b78ff74a199fcf802be4afc333275"' - repository.clone_url = 'git@github.com:collectiveidea/calendar_builder.git' + repository.clone_url = 
"git@github.com:collectiveidea/calendar_builder.git" repository.watchers = None - repository.full_name = 'collectiveidea/calendar_builder' - repository.ssh_url = 'git@github.com:collectiveidea/calendar_builder.git' + repository.full_name = "collectiveidea/calendar_builder" + repository.ssh_url = "git@github.com:collectiveidea/calendar_builder.git" repository.owner = None - repository.blobs_url = 'https://api.github.com/repos/collectiveidea/calendar_builder/git/blobs{/sha}' + repository.blobs_url = "https://api.github.com/repos/collectiveidea/calendar_builder/git/blobs{/sha}" repository.master_branch = None repository.updated_at = None repository.pushed_at = None @@ -119,47 +116,51 @@ def test_GithubRepoVisitor_without_tag_without_download(self, mock_get_repo): repository.get_labels.return_value = None master_branch = MagicMock() - master_branch.name = 'master' + master_branch.name = "master" refactoring_branch = MagicMock() - refactoring_branch.name = 'refactoring' - repository.get_branches.return_value = iter( - [master_branch, refactoring_branch]) + refactoring_branch.name = "refactoring" + repository.get_branches.return_value = iter([master_branch, refactoring_branch]) mock_get_repo.return_value = repository - uri = 'https://api.github.com/repos/collectiveidea/calendar_builder' + uri = "https://api.github.com/repos/collectiveidea/calendar_builder" _, data, _ = github.GithubSingleRepoVisitor(uri) - expected_loc = self.get_test_loc( - 'github/calendar_builder-expected.json') + expected_loc = self.get_test_loc("github/calendar_builder-expected.json") self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN) def test_GithubReposVisitor(self): - uri = 'https://api.github.com/repositories?since=0' - test_loc = self.get_test_loc('github/repo_since0.json') - with patch('requests.get') as mock_http_get: + uri = "https://api.github.com/repositories?since=0" + test_loc = self.get_test_loc("github/repo_since0.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = github.GithubReposVisitor(uri) - expected_loc = self.get_test_loc('github/repo_since0_expected.json') + expected_loc = self.get_test_loc("github/repo_since0_expected.json") self.check_expected_results(data, expected_loc) class GithubMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_github_repo_mapper1(self): - with open(self.get_test_loc('github/calendar_builder.json')) as json_metadata: + with open(self.get_test_loc("github/calendar_builder.json")) as json_metadata: metadata = json_metadata.read() - packages = miners.github.build_github_packages(metadata, 'https://api.github.com/repos/collectiveidea/calendar_builder') + packages = miners.github.build_github_packages( + metadata, "https://api.github.com/repos/collectiveidea/calendar_builder" + ) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'github/mapper_calendar_builder_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("github/mapper_calendar_builder_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_github_repo_mapper2(self): - with open(self.get_test_loc('github/mojombo_grit_from_visitor_4mapper_input.json')) as json_metadata: + with open( + 
self.get_test_loc("github/mojombo_grit_from_visitor_4mapper_input.json") + ) as json_metadata: metadata = json_metadata.read() - packages = miners.github.build_github_packages(metadata, 'https://api.github.com/repos/mojombo/grit') + packages = miners.github.build_github_packages( + metadata, "https://api.github.com/repos/mojombo/grit" + ) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( - 'github/mojombo_grit_result_mapper_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "github/mojombo_grit_result_mapper_expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_gitlab.py b/minecode/tests/miners/test_gitlab.py index 604b5dc4..70cc6153 100644 --- a/minecode/tests/miners/test_gitlab.py +++ b/minecode/tests/miners/test_gitlab.py @@ -9,52 +9,56 @@ import os import unittest +from unittest.mock import patch -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - +from minecode import miners from minecode.miners import gitlab from minecode.tests import FIXTURES_REGEN -from minecode import miners +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class GitlabTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) class GitlabVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) - @unittest.skip('The test is to test fetching remotely through http connection') + @unittest.skip("The test is to test fetching remotely through http connection") def test_visit_api_header_getheaders(self): - uri = 'https://gitlab.com/api/v4/projects' + uri = "https://gitlab.com/api/v4/projects" uris, _, _ = gitlab.GitlabAPIHeaderVisitor(uri) - expected_loc = self.get_test_loc('gitlab/expected_projects.json') + expected_loc = self.get_test_loc("gitlab/expected_projects.json") self.check_expected_uris(uris, expected_loc) def test_visit_metacpan_api_projects(self): - uri = 'https://gitlab.com/api/v4/projects?page=1&per_page=70&statistics=true' - test_loc = self.get_test_loc('gitlab/projects_visitor.json') - with patch('requests.get') as mock_http_get: + uri = "https://gitlab.com/api/v4/projects?page=1&per_page=70&statistics=true" + test_loc = self.get_test_loc("gitlab/projects_visitor.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = gitlab.GitlabAPIVisitor(uri) - expected_loc = self.get_test_loc( - 'gitlab/expected_projects_visitor.json') + expected_loc = self.get_test_loc("gitlab/expected_projects_visitor.json") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) class GitlabMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_map_software_html_page_hal(self): - with open(self.get_test_loc('gitlab/microservice-express-mongo.json')) as gitlab_json: + with open( + self.get_test_loc("gitlab/microservice-express-mongo.json") + ) as gitlab_json: metadata = 
gitlab_json.read() packages = miners.gitlab.build_packages_from_json(metadata) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( - 'gitlab/microservice-express-mongo_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "gitlab/microservice-express-mongo_expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_golang.py b/minecode/tests/miners/test_golang.py index 6695a271..285fe8e0 100644 --- a/minecode/tests/miners/test_golang.py +++ b/minecode/tests/miners/test_golang.py @@ -9,92 +9,91 @@ import json import os - -from mock import Mock -from mock import patch +from unittest.mock import patch from packageurl import PackageURL -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - from minecode.miners.golang import GodocIndexVisitor from minecode.miners.golang import GodocSearchVisitor -from minecode.miners.golang import parse_package_path from minecode.miners.golang import build_golang_package +from minecode.miners.golang import parse_package_path from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class GoLangVisitorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_GoLangGoDocAPIVisitor(self): - uri = 'https://api.godoc.org/packages' - test_loc = self.get_test_loc('golang/packages.json') - with patch('requests.get') as mock_http_get: + uri = "https://api.godoc.org/packages" + test_loc = self.get_test_loc("golang/packages.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = GodocIndexVisitor(uri) - expected_loc = self.get_test_loc('golang/packages_expected_uris.json') + expected_loc = self.get_test_loc("golang/packages_expected_uris.json") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_GodocSearchVisitor(self): - uri = 'https://api.godoc.org/search?q=github.com/golang' - test_loc = self.get_test_loc('golang/godoc_search.json') - with patch('requests.get') as mock_http_get: + uri = "https://api.godoc.org/search?q=github.com/golang" + test_loc = self.get_test_loc("golang/godoc_search.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = GodocSearchVisitor(uri) - expected_loc = self.get_test_loc( - 'golang/godoc_search_expected_uris.json') + expected_loc = self.get_test_loc("golang/godoc_search_expected_uris.json") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_GodocSearchVisitor_with_non_github_urls(self): - uri = 'https://api.godoc.org/search?q=github.com/golang*' - test_loc = self.get_test_loc('golang/godoc_search_off_github.json') - with patch('requests.get') as mock_http_get: + uri = "https://api.godoc.org/search?q=github.com/golang*" + test_loc = self.get_test_loc("golang/godoc_search_off_github.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = GodocSearchVisitor(uri) expected_loc = self.get_test_loc( - 'golang/godoc_search_off_github_expected_uris.json') + "golang/godoc_search_off_github_expected_uris.json" + ) self.check_expected_uris(uris, 
expected_loc, regen=FIXTURES_REGEN) def test_parse_package_path(self): - test_path = 'github.com/lambdasoup/go-netlink/log' + test_path = "github.com/lambdasoup/go-netlink/log" purl = PackageURL.from_string( - 'pkg:golang/github.com/lambdasoup/go-netlink' - '?vcs_repository=https://github.com/lambdasoup/go-netlink') - expected = purl, 'github.com/lambdasoup/go-netlink' + "pkg:golang/github.com/lambdasoup/go-netlink" + "?vcs_repository=https://github.com/lambdasoup/go-netlink" + ) + expected = purl, "github.com/lambdasoup/go-netlink" assert expected == parse_package_path(test_path) class GoLangMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_build_golang_package(self): - purl = 'pkg:golang/github.com/golang/glog?vcs_repository=https://github.com/golang/glog' - with open(self.get_test_loc('golang/glog.json')) as f: + purl = "pkg:golang/github.com/golang/glog?vcs_repository=https://github.com/golang/glog" + with open(self.get_test_loc("golang/glog.json")) as f: package_data = json.load(f) package = build_golang_package(package_data, purl) package = package.to_dict() - expected_loc = self.get_test_loc('golang/glog_expected.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("golang/glog_expected.json") + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) def test_build_golang_package_bitbucket(self): - purl = 'pkg:bitbucket/bitbucket.org/zombiezen/yaml?vcs_repository=https://bitbucket.org/zombiezen/yaml' - with open(self.get_test_loc('golang/math3.json')) as f: + purl = "pkg:bitbucket/bitbucket.org/zombiezen/yaml?vcs_repository=https://bitbucket.org/zombiezen/yaml" + with open(self.get_test_loc("golang/math3.json")) as f: package_data = json.load(f) package = build_golang_package(package_data, purl) package = package.to_dict() - expected_loc = self.get_test_loc('golang/math3_expected.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("golang/math3_expected.json") + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) def test_build_golang_package_non_well_known(self): - purl = 'pkg:golang/winterdrache.de/bindings/sdl' - with open(self.get_test_loc('golang/winter.json')) as f: + purl = "pkg:golang/winterdrache.de/bindings/sdl" + with open(self.get_test_loc("golang/winter.json")) as f: package_data = json.load(f) package = build_golang_package(package_data, purl) package = package.to_dict() - expected_loc = self.get_test_loc('golang/winter_expected.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("golang/winter_expected.json") + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_googlecode.py b/minecode/tests/miners/test_googlecode.py index f4b344fa..9938fbe0 100644 --- a/minecode/tests/miners/test_googlecode.py +++ b/minecode/tests/miners/test_googlecode.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # purldb is a trademark of nexB Inc. 
@@ -10,104 +9,123 @@ import json import os - -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting +from unittest.mock import patch from minecode import miners from minecode.miners import URI from minecode.miners import googlecode from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class GoogleNewAPIVisitorsTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_visit_google_download_zip_visitor(self): - uri = 'https://storage.googleapis.com/google-code-archive/google-code-archive.txt.zip' - test_loc = self.get_test_loc('googlecode/google-code-archive.txt.zip') - with patch('requests.get') as mock_http_get: + uri = "https://storage.googleapis.com/google-code-archive/google-code-archive.txt.zip" + test_loc = self.get_test_loc("googlecode/google-code-archive.txt.zip") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = googlecode.GooglecodeArchiveVisitor(uri) expected_loc = self.get_test_loc( - 'googlecode/expected_google-code-archive.txt.zip.json') + "googlecode/expected_google-code-archive.txt.zip.json" + ) self.check_expected_uris(uris, expected_loc) def test_visit_google_projectpages(self): - uri = 'https://code.google.com/archive/search?q=domain:code.google.com' - test_loc = self.get_test_loc( - 'googlecode/v2_api/GoogleCodeProjectHosting.htm') - with patch('requests.get') as mock_http_get: + uri = "https://code.google.com/archive/search?q=domain:code.google.com" + test_loc = self.get_test_loc("googlecode/v2_api/GoogleCodeProjectHosting.htm") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = googlecode.GoogleDownloadsPageJsonVisitor(uri) expected_loc = self.get_test_loc( - 'googlecode/v2_api/expected_googleprojects.json') + "googlecode/v2_api/expected_googleprojects.json" + ) self.check_expected_uris(uris, expected_loc) def test_visit_google_projectpage2(self): - uri = 'https://code.google.com/archive/search?q=domain:code.google.com&page=2' + uri = "https://code.google.com/archive/search?q=domain:code.google.com&page=2" test_loc = self.get_test_loc( - 'googlecode/v2_api/GoogleCodeProjectHosting_page2.htm') - with patch('requests.get') as mock_http_get: + "googlecode/v2_api/GoogleCodeProjectHosting_page2.htm" + ) + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = googlecode.GoogleDownloadsPageJsonVisitor(uri) expected_loc = self.get_test_loc( - 'googlecode/v2_api/expected_googleproject_page2.json') + "googlecode/v2_api/expected_googleproject_page2.json" + ) self.check_expected_uris(uris, expected_loc) def test_visit_google_download_json(self): - uri = 'https://storage.googleapis.com/google-code-archive/v2/code.google.com/hg4j/project.json' - test_loc = self.get_test_loc('googlecode/v2_api/project.json') - with patch('requests.get') as mock_http_get: + uri = "https://storage.googleapis.com/google-code-archive/v2/code.google.com/hg4j/project.json" + test_loc = self.get_test_loc("googlecode/v2_api/project.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ 
= googlecode.GoogleProjectJsonVisitor(uri) self.assertEqual( - [URI(uri=u'https://storage.googleapis.com/google-code-archive/v2/code.google.com/hg4j/downloads-page-1.json')], list(uris)) + [ + URI( + uri="https://storage.googleapis.com/google-code-archive/v2/code.google.com/hg4j/downloads-page-1.json" + ) + ], + list(uris), + ) def test_visit_google_json(self): - uri = 'https://storage.googleapis.com/google-code-archive/v2/code.google.com/hg4j/downloads-page-1.json' - test_loc = self.get_test_loc('googlecode/v2_api/downloads-page-1.json') - with patch('requests.get') as mock_http_get: + uri = "https://storage.googleapis.com/google-code-archive/v2/code.google.com/hg4j/downloads-page-1.json" + test_loc = self.get_test_loc("googlecode/v2_api/downloads-page-1.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = googlecode.GoogleDownloadsPageJsonVisitor(uri) expected_loc = self.get_test_loc( - 'googlecode/v2_api/hg4j_download_expected.json') + "googlecode/v2_api/hg4j_download_expected.json" + ) self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_visit_googleapi_project_json(self): - uri = 'https://www.googleapis.com/storage/v1/b/google-code-archive/o/v2%2Fapache-extras.org%2F124799961-qian%2Fproject.json?alt=media' + uri = "https://www.googleapis.com/storage/v1/b/google-code-archive/o/v2%2Fapache-extras.org%2F124799961-qian%2Fproject.json?alt=media" test_loc = self.get_test_loc( - 'googlecode/v2_apache-extras.org_124799961-qian_project.json') - with patch('requests.get') as mock_http_get: + "googlecode/v2_apache-extras.org_124799961-qian_project.json" + ) + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = googlecode.GoogleDownloadsPageJsonVisitor(uri) expected_loc = self.get_test_loc( - 'googlecode/expected_v2_apache-extras.org_124799961-qian_project2.json') + "googlecode/expected_v2_apache-extras.org_124799961-qian_project2.json" + ) self.check_expected_results(data, expected_loc) class GoogleNewAPIMappersTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_build_packages_from_v2_projects_json(self): - with open(self.get_test_loc('googlecode/v2_api/project.json')) as projectsjson_meta: + with open( + self.get_test_loc("googlecode/v2_api/project.json") + ) as projectsjson_meta: metadata = json.load(projectsjson_meta) packages = miners.googlecode.build_packages_from_projectsjson_v2(metadata) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( - 'googlecode/v2_api/package_expected_project.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "googlecode/v2_api/package_expected_project.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_build_packages_from_v1_projects_json(self): - with open(self.get_test_loc('googlecode/v2_apache-extras.org_124799961-qian_project.json')) as projectsjson_meta: + with open( + self.get_test_loc( + "googlecode/v2_apache-extras.org_124799961-qian_project.json" + ) + ) as projectsjson_meta: metadata = json.load(projectsjson_meta) packages = miners.googlecode.build_packages_from_projectsjson_v1(metadata) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( - 
'googlecode/mapper_expected_v2_apache-extras.org_124799961-qian_project.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "googlecode/mapper_expected_v2_apache-extras.org_124799961-qian_project.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_gstreamer.py b/minecode/tests/miners/test_gstreamer.py index 9b41cbb9..14b528b3 100644 --- a/minecode/tests/miners/test_gstreamer.py +++ b/minecode/tests/miners/test_gstreamer.py @@ -9,55 +9,58 @@ import os import re +from unittest.mock import patch -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - +from minecode import miners from minecode.miners import gstreamer from minecode.tests import FIXTURES_REGEN -from minecode import miners +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class GstreamerVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_visit_gstreamer_source_root(self): - uri = 'https://gstreamer.freedesktop.org/src/' - test_loc = self.get_test_loc('gstreamer/src_root.html') - with patch('requests.get') as mock_http_get: + uri = "https://gstreamer.freedesktop.org/src/" + test_loc = self.get_test_loc("gstreamer/src_root.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = gstreamer.GstreamerHTMLVisitor(uri) - expected_loc = self.get_test_loc('gstreamer/src_root.html-expected') + expected_loc = self.get_test_loc("gstreamer/src_root.html-expected") self.check_expected_uris(uris, expected_loc) def test_visit_Gstreamer_subpath_contains_file_resources(self): - uri = 'https://gstreamer.freedesktop.org/src/gst-openmax/pre/' - test_loc = self.get_test_loc('gstreamer/src_gst-openmax_pre.html') - with patch('requests.get') as mock_http_get: + uri = "https://gstreamer.freedesktop.org/src/gst-openmax/pre/" + test_loc = self.get_test_loc("gstreamer/src_gst-openmax_pre.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = gstreamer.GstreamerHTMLVisitor(uri) - expected_loc = self.get_test_loc( - 'gstreamer/src_gst-openmax_pre.html-expected') + expected_loc = self.get_test_loc("gstreamer/src_gst-openmax_pre.html-expected") self.check_expected_uris(uris, expected_loc) class GstreamerMappersTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_build_package_mapper_regex(self): + # the old pattern ended in a character class, which matches a single character; an alternation group over the archive extensions is what was evidently intended, so the requoted pattern below uses one regex = re.compile( - r'^https://gstreamer.freedesktop.org/src/([\w\-\.]+/)*[\w\-\.]+[.tar\.bz2|\.sha1sum|\.md5|\.gz|\.tar\.xz|\.asc]$') + r"^https://gstreamer.freedesktop.org/src/([\w\-\.]+/)*[\w\-\.]+(\.tar\.bz2|\.sha1sum|\.md5|\.gz|\.tar\.xz|\.asc)$" + ) result = re.match( - regex, 'https://gstreamer.freedesktop.org/src/gst-openmax/pre/gst-openmax-0.10.0.2.tar.bz2') + regex, + "https://gstreamer.freedesktop.org/src/gst-openmax/pre/gst-openmax-0.10.0.2.tar.bz2", + ) self.assertTrue(result) def test_build_package_from_url(self): - packages = 
miners.gstreamer.build_package_from_url('https://gstreamer.freedesktop.org/src/gst-openmax/pre/gst-openmax-0.10.0.2.tar.bz2') + packages = miners.gstreamer.build_package_from_url( + "https://gstreamer.freedesktop.org/src/gst-openmax/pre/gst-openmax-0.10.0.2.tar.bz2" + ) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'gstreamer/mapper_build_from_url-expected') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("gstreamer/mapper_build_from_url-expected") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_haxe.py b/minecode/tests/miners/test_haxe.py index f729a280..7156f19e 100644 --- a/minecode/tests/miners/test_haxe.py +++ b/minecode/tests/miners/test_haxe.py @@ -9,59 +9,57 @@ import json import os - -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting +from unittest.mock import patch from minecode import miners from minecode.miners import haxe from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class HaxeVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_visit_haxe_projects(self): - uri = 'https://lib.haxe.org/all' - test_loc = self.get_test_loc('haxe/all_haxelibs.html') - with patch('requests.get') as mock_http_get: + uri = "https://lib.haxe.org/all" + test_loc = self.get_test_loc("haxe/all_haxelibs.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = haxe.HaxeProjectsVisitor(uri) - expected_loc = self.get_test_loc('haxe/all_haxelibs.html-expected') + expected_loc = self.get_test_loc("haxe/all_haxelibs.html-expected") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_visit_haxe_versions(self): - uri = 'https://lib.haxe.org/p/openfl/versions' - test_loc = self.get_test_loc('haxe/all_versions_openfl.html') - with patch('requests.get') as mock_http_get: + uri = "https://lib.haxe.org/p/openfl/versions" + test_loc = self.get_test_loc("haxe/all_versions_openfl.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = haxe.HaxeVersionsVisitor(uri) - expected_loc = self.get_test_loc( - 'haxe/all_versions_openfl.html-expected') + expected_loc = self.get_test_loc("haxe/all_versions_openfl.html-expected") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_visit_haxe_package_json(self): - uri = 'https://lib.haxe.org/p/openfl/8.5.1/raw-files/openfl/package.json' - test_loc = self.get_test_loc('haxe/openfl-8.5.1-package.json') - with patch('requests.get') as mock_http_get: + uri = "https://lib.haxe.org/p/openfl/8.5.1/raw-files/openfl/package.json" + test_loc = self.get_test_loc("haxe/openfl-8.5.1-package.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = haxe.HaxePackageJsonVisitor(uri) - expected_loc = self.get_test_loc( - 'haxe/openfl-8.5.1-package.json-expected') + expected_loc = self.get_test_loc("haxe/openfl-8.5.1-package.json-expected") self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN) 
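+# Editorial sketch, not code from the original modules: every visitor test in
+# these files shares one offline pattern. requests.get is patched so that
+# mocked_requests_get(uri, test_loc) serves the fixture file at test_loc as
+# the canned HTTP response, e.g.:
+#
+#     with patch("requests.get") as mock_http_get:
+#         mock_http_get.return_value = mocked_requests_get(uri, test_loc)
+#         uris, data, errors = haxe.HaxePackageJsonVisitor(uri)
+#
+# When FIXTURES_REGEN is set, the regen flag makes the check_expected_*
+# helpers regenerate the expected-JSON fixtures instead of asserting.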
class HaxeMappersTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_build_project_package_json(self): - with open(self.get_test_loc('haxe/project_package.json')) as projectsjson_meta: + with open(self.get_test_loc("haxe/project_package.json")) as projectsjson_meta: metadata = json.load(projectsjson_meta) packages = miners.haxe.build_packages_with_json(metadata) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('haxe/project_package.json-expected') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("haxe/project_package.json-expected") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_maven.py b/minecode/tests/miners/test_maven.py index 2ebc6e2d..e3aaa597 100644 --- a/minecode/tests/miners/test_maven.py +++ b/minecode/tests/miners/test_maven.py @@ -7,25 +7,23 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from operator import itemgetter import json import os import re - -from mock import patch +from operator import itemgetter +from unittest.mock import patch from django.test import TestCase as DjangoTestCase +import packagedb from minecode.management.commands.run_map import map_uri from minecode.management.commands.run_visit import visit_uri from minecode.miners import maven from minecode.models import ResourceURI -from minecode.utils_test import mocked_requests_get +from minecode.tests import FIXTURES_REGEN from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get from minecode.utils_test import model_to_dict -from minecode.tests import FIXTURES_REGEN -import packagedb - # TODO: add tests from /maven-indexer/indexer-core/src/test/java/org/acche/maven/index/artifact @@ -35,221 +33,318 @@ def sort_deps(results): FIXME: UGLY TEMP WORKAROUND: we sort the results because of a PyMaven bug See https://github.com/sassoftware/pymaven/issues/11 """ - if 'dependencies' in results: - results['dependencies'].sort() - elif results and 'metadata' in results[0]: + if "dependencies" in results: + results["dependencies"].sort() + elif results and "metadata" in results[0]: for result in results: - result['metadata']['dependencies'].sort() + result["metadata"]["dependencies"].sort() class MavenMiscTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_get_entries(self): - index = self.get_test_loc('maven/index/nexus-maven-repository-index.gz') - fields = list(maven.ENTRY_FIELDS.keys()) + list(maven.ENTRY_FIELDS_OTHER.keys()) + list(maven.ENTRY_FIELDS_IGNORED.keys()) + index = self.get_test_loc("maven/index/nexus-maven-repository-index.gz") + fields = ( + list(maven.ENTRY_FIELDS.keys()) + + list(maven.ENTRY_FIELDS_OTHER.keys()) + + list(maven.ENTRY_FIELDS_IGNORED.keys()) + ) fields = set(fields) result = list(maven.get_entries(index, fields=fields)) - expected_loc = self.get_test_loc('maven/index/expected_entries.json') + expected_loc = self.get_test_loc("maven/index/expected_entries.json") self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_get_entries_increment(self): - index = 
self.get_test_loc('maven/index/increment/nexus-maven-repository-index.445.gz') - fields = list(maven.ENTRY_FIELDS.keys()) + list(maven.ENTRY_FIELDS_OTHER.keys()) + list(maven.ENTRY_FIELDS_IGNORED.keys()) + index = self.get_test_loc( + "maven/index/increment/nexus-maven-repository-index.445.gz" + ) + fields = ( + list(maven.ENTRY_FIELDS.keys()) + + list(maven.ENTRY_FIELDS_OTHER.keys()) + + list(maven.ENTRY_FIELDS_IGNORED.keys()) + ) fields = set(fields) result = list(maven.get_entries(index, fields=fields)) - expected_loc = self.get_test_loc('maven/index/increment/expected_entries.json') + expected_loc = self.get_test_loc("maven/index/increment/expected_entries.json") self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_get_entries_buggy(self): - index = self.get_test_loc('maven/index/buggy/nexus-maven-repository-index.gz') - fields = list(maven.ENTRY_FIELDS.keys()) + list(maven.ENTRY_FIELDS_OTHER.keys()) + list(maven.ENTRY_FIELDS_IGNORED.keys()) + index = self.get_test_loc("maven/index/buggy/nexus-maven-repository-index.gz") + fields = ( + list(maven.ENTRY_FIELDS.keys()) + + list(maven.ENTRY_FIELDS_OTHER.keys()) + + list(maven.ENTRY_FIELDS_IGNORED.keys()) + ) fields = set(fields) result = list(maven.get_entries(index, fields=fields)) - expected_loc = self.get_test_loc('maven/index/buggy/expected_entries.json') + expected_loc = self.get_test_loc("maven/index/buggy/expected_entries.json") self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_get_artifacts_full(self): - index = self.get_test_loc( - 'maven/index/nexus-maven-repository-index.gz') + index = self.get_test_loc("maven/index/nexus-maven-repository-index.gz") fields = ( - list(maven.ENTRY_FIELDS) + - list(maven.ENTRY_FIELDS_OTHER) + - list(maven.ENTRY_FIELDS_IGNORED) + list(maven.ENTRY_FIELDS) + + list(maven.ENTRY_FIELDS_OTHER) + + list(maven.ENTRY_FIELDS_IGNORED) ) fields = set(fields) - result = [a.to_dict() for a in maven.get_artifacts(index, fields, include_all=True)] - expected_loc = self.get_test_loc('maven/index/expected_artifacts.json') + result = [ + a.to_dict() for a in maven.get_artifacts(index, fields, include_all=True) + ] + expected_loc = self.get_test_loc("maven/index/expected_artifacts.json") self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_get_artifacts_increment(self): - index = self.get_test_loc('maven/index/increment/nexus-maven-repository-index.445.gz') - fields = list(maven.ENTRY_FIELDS.keys()) + list(maven.ENTRY_FIELDS_OTHER.keys()) + list(maven.ENTRY_FIELDS_IGNORED.keys()) + index = self.get_test_loc( + "maven/index/increment/nexus-maven-repository-index.445.gz" + ) + fields = ( + list(maven.ENTRY_FIELDS.keys()) + + list(maven.ENTRY_FIELDS_OTHER.keys()) + + list(maven.ENTRY_FIELDS_IGNORED.keys()) + ) fields = set(fields) - result = [a.to_dict() for a in maven.get_artifacts(index, fields, include_all=True)] - expected_loc = self.get_test_loc('maven/index/increment/expected_artifacts.json') + result = [ + a.to_dict() for a in maven.get_artifacts(index, fields, include_all=True) + ] + expected_loc = self.get_test_loc( + "maven/index/increment/expected_artifacts.json" + ) self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_get_artifacts_buggy(self): - index = self.get_test_loc('maven/index/buggy/nexus-maven-repository-index.gz') - fields = list(maven.ENTRY_FIELDS.keys()) + list(maven.ENTRY_FIELDS_OTHER.keys()) + list(maven.ENTRY_FIELDS_IGNORED.keys()) + index = 
self.get_test_loc("maven/index/buggy/nexus-maven-repository-index.gz") + fields = ( + list(maven.ENTRY_FIELDS.keys()) + + list(maven.ENTRY_FIELDS_OTHER.keys()) + + list(maven.ENTRY_FIELDS_IGNORED.keys()) + ) fields = set(fields) - result = [a.to_dict() for a in maven.get_artifacts(index, fields, include_all=True)] - expected_loc = self.get_test_loc('maven/index/buggy/expected_artifacts.json') + result = [ + a.to_dict() for a in maven.get_artifacts(index, fields, include_all=True) + ] + expected_loc = self.get_test_loc("maven/index/buggy/expected_artifacts.json") self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_get_artifacts_defaults(self): - index = self.get_test_loc('maven/index/nexus-maven-repository-index.gz') + index = self.get_test_loc("maven/index/nexus-maven-repository-index.gz") result = [a.to_dict() for a in maven.get_artifacts(index)] - expected_loc = self.get_test_loc('maven/index/expected_artifacts-defaults.json') + expected_loc = self.get_test_loc("maven/index/expected_artifacts-defaults.json") self.check_expected_results(result, expected_loc) def test_get_artifacts_no_worthyness(self): - index = self.get_test_loc( - 'maven/index/nexus-maven-repository-index.gz') + index = self.get_test_loc("maven/index/nexus-maven-repository-index.gz") def worth(a): return True result = [a.to_dict() for a in maven.get_artifacts(index, worthyness=worth)] - expected_loc = self.get_test_loc('maven/index/expected_artifacts-all-worthy.json') + expected_loc = self.get_test_loc( + "maven/index/expected_artifacts-all-worthy.json" + ) self.check_expected_results(result, expected_loc) def test_get_artifacts_defaults_increment(self): - index = self.get_test_loc('maven/index/increment/nexus-maven-repository-index.445.gz') + index = self.get_test_loc( + "maven/index/increment/nexus-maven-repository-index.445.gz" + ) result = [a.to_dict() for a in maven.get_artifacts(index)] - expected_loc = self.get_test_loc('maven/index/increment/expected_artifacts-defaults.json') + expected_loc = self.get_test_loc( + "maven/index/increment/expected_artifacts-defaults.json" + ) self.check_expected_results(result, expected_loc) def test_get_artifacts_defaults_buggy(self): - index = self.get_test_loc('maven/index/buggy/nexus-maven-repository-index.gz') + index = self.get_test_loc("maven/index/buggy/nexus-maven-repository-index.gz") result = [a.to_dict() for a in maven.get_artifacts(index)] - expected_loc = self.get_test_loc('maven/index/buggy/expected_artifacts-defaults.json') + expected_loc = self.get_test_loc( + "maven/index/buggy/expected_artifacts-defaults.json" + ) self.check_expected_results(result, expected_loc) def test_build_artifact(self): entry = { - u'i': u'0-alpha-1-20050407.154541-1.pom|1131488721000|-1|2|2|0|pom', - u'm': u'1318447185654', - u'u': u'org.apache|maven|archetypes|1|0-alpha-1-20050407.154541-1.pom'} + "i": "0-alpha-1-20050407.154541-1.pom|1131488721000|-1|2|2|0|pom", + "m": "1318447185654", + "u": "org.apache|maven|archetypes|1|0-alpha-1-20050407.154541-1.pom", + } result = maven.build_artifact(entry, include_all=True) result = result.to_dict() - expected = dict([ - (u'group_id', u'org.apache'), - (u'artifact_id', u'maven'), - (u'version', u'archetypes'), - (u'packaging', u'0-alpha-1-20050407.154541-1.pom'), - (u'classifier', u'1'), - (u'extension', u'pom'), - (u'last_modified', '2005-11-08T22:25:21+00:00'), - (u'size', None), - (u'sha1', None), - (u'name', None), - (u'description', None), - (u'src_exist', False), - (u'jdoc_exist', False), - (u'sig_exist', False), 
- (u'sha256', None), - (u'osgi', dict()), - (u'classes', []) - ]) + expected = dict( + [ + ("group_id", "org.apache"), + ("artifact_id", "maven"), + ("version", "archetypes"), + ("packaging", "0-alpha-1-20050407.154541-1.pom"), + ("classifier", "1"), + ("extension", "pom"), + ("last_modified", "2005-11-08T22:25:21+00:00"), + ("size", None), + ("sha1", None), + ("name", None), + ("description", None), + ("src_exist", False), + ("jdoc_exist", False), + ("sig_exist", False), + ("sha256", None), + ("osgi", dict()), + ("classes", []), + ] + ) self.assertEqual(expected.items(), result.items()) def test_build_url_and_filename_1(self): - test = {'group_id': 'de.alpharogroup', 'artifact_id': 'address-book-domain', - 'version': '3.12.0', 'classifier': None, 'extension': 'jar'} - expected = 'https://repo1.maven.org/maven2/de/alpharogroup/address-book-domain/3.12.0/address-book-domain-3.12.0.jar', 'address-book-domain-3.12.0.jar' + test = { + "group_id": "de.alpharogroup", + "artifact_id": "address-book-domain", + "version": "3.12.0", + "classifier": None, + "extension": "jar", + } + expected = ( + "https://repo1.maven.org/maven2/de/alpharogroup/address-book-domain/3.12.0/address-book-domain-3.12.0.jar", + "address-book-domain-3.12.0.jar", + ) self.assertEqual(expected, maven.build_url_and_filename(**test)) def test_build_url_and_filename_2(self): - test = {'group_id': 'de.alpharogroup', 'artifact_id': 'address-book-data', - 'version': '3.12.0', 'classifier': None, 'extension': 'pom'} - expected = 'https://repo1.maven.org/maven2/de/alpharogroup/address-book-data/3.12.0/address-book-data-3.12.0.pom', 'address-book-data-3.12.0.pom' + test = { + "group_id": "de.alpharogroup", + "artifact_id": "address-book-data", + "version": "3.12.0", + "classifier": None, + "extension": "pom", + } + expected = ( + "https://repo1.maven.org/maven2/de/alpharogroup/address-book-data/3.12.0/address-book-data-3.12.0.pom", + "address-book-data-3.12.0.pom", + ) self.assertEqual(expected, maven.build_url_and_filename(**test)) def test_build_url_and_filename_3(self): - test = {'group_id': 'de.alpharogroup', 'artifact_id': 'address-book-rest-web', - 'version': '3.12.0', 'classifier': None, 'extension': 'war'} - expected = 'https://repo1.maven.org/maven2/de/alpharogroup/address-book-rest-web/3.12.0/address-book-rest-web-3.12.0.war', 'address-book-rest-web-3.12.0.war' + test = { + "group_id": "de.alpharogroup", + "artifact_id": "address-book-rest-web", + "version": "3.12.0", + "classifier": None, + "extension": "war", + } + expected = ( + "https://repo1.maven.org/maven2/de/alpharogroup/address-book-rest-web/3.12.0/address-book-rest-web-3.12.0.war", + "address-book-rest-web-3.12.0.war", + ) self.assertEqual(expected, maven.build_url_and_filename(**test)) def test_build_url_and_filename_4(self): - test = {'group_id': 'uk.com.robust-it', 'artifact_id': 'cloning', - 'version': '1.9.5', 'classifier': 'sources', 'extension': 'jar'} - expected = 'https://repo1.maven.org/maven2/uk/com/robust-it/cloning/1.9.5/cloning-1.9.5-sources.jar', 'cloning-1.9.5-sources.jar' + test = { + "group_id": "uk.com.robust-it", + "artifact_id": "cloning", + "version": "1.9.5", + "classifier": "sources", + "extension": "jar", + } + expected = ( + "https://repo1.maven.org/maven2/uk/com/robust-it/cloning/1.9.5/cloning-1.9.5-sources.jar", + "cloning-1.9.5-sources.jar", + ) self.assertEqual(expected, maven.build_url_and_filename(**test)) def test_build_url_and_filename_with_alternate_base(self): test = { - 'group_id': 'uk.com.robust-it', 'artifact_id': 'cloning', - 
'version': '1.9.5', 'classifier': 'sources', 'extension': 'jar', - 'base_repo_url': 'maven-index://'} - expected = 'maven-index:///uk/com/robust-it/cloning/1.9.5/cloning-1.9.5-sources.jar', 'cloning-1.9.5-sources.jar' + "group_id": "uk.com.robust-it", + "artifact_id": "cloning", + "version": "1.9.5", + "classifier": "sources", + "extension": "jar", + "base_repo_url": "maven-index://", + } + expected = ( + "maven-index:///uk/com/robust-it/cloning/1.9.5/cloning-1.9.5-sources.jar", + "cloning-1.9.5-sources.jar", + ) self.assertEqual(expected, maven.build_url_and_filename(**test)) def test_build_maven_xml_url(self): - test = {'group_id': 'de.alpharogroup', - 'artifact_id': 'address-book-domain'} - expected = 'https://repo1.maven.org/maven2/de/alpharogroup/address-book-domain/maven-metadata.xml' + test = {"group_id": "de.alpharogroup", "artifact_id": "address-book-domain"} + expected = "https://repo1.maven.org/maven2/de/alpharogroup/address-book-domain/maven-metadata.xml" self.assertEqual(expected, maven.build_maven_xml_url(**test)) class MavenVisitorTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_MavenNexusIndexVisitor_uris(self): - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz' - test_loc = self.get_test_loc( - 'maven/index/nexus-maven-repository-index.gz') - with patch('requests.get') as mock_http_get: + uri = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz" + test_loc = self.get_test_loc("maven/index/nexus-maven-repository-index.gz") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _data, _errors = maven.MavenNexusIndexVisitor(uri) - expected_loc = self.get_test_loc('maven/index/expected_uris.json') + expected_loc = self.get_test_loc("maven/index/expected_uris.json") self.check_expected_uris( - uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN) + uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN + ) def test_MavenNexusIndexPropertiesVisitor(self): - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties' + uri = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties" test_loc = self.get_test_loc( - 'maven/index/increment/nexus-maven-repository-index.properties') - with patch('requests.get') as mock_http_get: + "maven/index/increment/nexus-maven-repository-index.properties" + ) + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _data, _errors = maven.MavenNexusPropertiesVisitor(uri) - expected_loc = self.get_test_loc('maven/index/increment/expected_properties_uris.json') - self.check_expected_uris(uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc( + "maven/index/increment/expected_properties_uris.json" + ) + self.check_expected_uris( + uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN + ) def test_MavenNexusIndexVisitor_uris_increment(self): - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.445.gz' + uri = ( + "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.445.gz" + ) test_loc = self.get_test_loc( - 'maven/index/increment/nexus-maven-repository-index.445.gz') - with patch('requests.get') as mock_http_get: + 
"maven/index/increment/nexus-maven-repository-index.445.gz" + ) + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _data, _errors = maven.MavenNexusIndexVisitor(uri) - expected_loc = self.get_test_loc('maven/index/increment/expected_uris.json') - self.check_expected_uris(uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("maven/index/increment/expected_uris.json") + self.check_expected_uris( + uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN + ) def test_MavenNexusIndexVisitor_uris_buggy(self): - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz' + uri = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz" test_loc = self.get_test_loc( - 'maven/index/buggy/nexus-maven-repository-index.gz') - with patch('requests.get') as mock_http_get: + "maven/index/buggy/nexus-maven-repository-index.gz" + ) + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _data, _errors = maven.MavenNexusIndexVisitor(uri) - expected_loc = self.get_test_loc('maven/index/buggy/expected_uris.json') - self.check_expected_uris(uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("maven/index/buggy/expected_uris.json") + self.check_expected_uris( + uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN + ) def test_visit_uri_does_not_fail_on_incorrect_sha1(self): - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz' + uri = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz" resource_uri = ResourceURI.objects.insert(uri=uri) before = [p.id for p in ResourceURI.objects.all()] test_loc = self.get_test_loc( - 'maven/index/buggy/nexus-maven-repository-index.gz') - with patch('requests.get') as mock_http_get: + "maven/index/buggy/nexus-maven-repository-index.gz" + ) + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) visit_uri(resource_uri) @@ -258,39 +353,38 @@ def test_visit_uri_does_not_fail_on_incorrect_sha1(self): else: visited = ResourceURI.objects.all() - results = [model_to_dict(rec, fields=['uri', 'sha1']) - for rec in visited] - results = sorted(results, key=itemgetter('uri')) - expected_loc = self.get_test_loc( - 'maven/index/buggy/expected_visited_uris.json') - self.check_expected_results( - results, expected_loc, regen=FIXTURES_REGEN) + results = [model_to_dict(rec, fields=["uri", "sha1"]) for rec in visited] + results = sorted(results, key=itemgetter("uri")) + expected_loc = self.get_test_loc("maven/index/buggy/expected_visited_uris.json") + self.check_expected_results(results, expected_loc, regen=FIXTURES_REGEN) visited.delete() def test_MavenPOMVisitor_data(self): - uri = 'https://repo1.maven.org/maven2/classworlds/classworlds/1.1-alpha-2/classworlds-1.1-alpha-2.pom' - test_loc = self.get_test_loc('maven/pom/classworlds-1.1-alpha-2.pom') - with patch('requests.get') as mock_http_get: + uri = "https://repo1.maven.org/maven2/classworlds/classworlds/1.1-alpha-2/classworlds-1.1-alpha-2.pom" + test_loc = self.get_test_loc("maven/pom/classworlds-1.1-alpha-2.pom") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, data, _ = maven.MavenPOMVisitor(uri) self.assertEqual(None, uris) - expected = open(test_loc, 'rb').read() + expected = open(test_loc, "rb").read() 
self.assertEqual(expected, data) class MavenEnd2EndTest(JsonBasedTesting, DjangoTestCase): - - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_MavenNexusIndexVisitor_with_run_visit_then_map_end2end(self): # setup before = sorted(p.id for p in ResourceURI.objects.all()) - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.445.gz' + uri = ( + "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.445.gz" + ) resource_uri = ResourceURI.objects.insert(uri=uri) - test_index = self.get_test_loc( - 'maven/index/nexus-maven-repository-index.gz') - with patch('requests.get') as mock_http_get: + test_index = self.get_test_loc("maven/index/nexus-maven-repository-index.gz") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_index) visit_uri(resource_uri) @@ -299,22 +393,22 @@ def test_MavenNexusIndexVisitor_with_run_visit_then_map_end2end(self): else: visited = ResourceURI.objects.all() - results = list(model_to_dict(rec, exclude=['id']) for rec in visited) - results = sorted(results, key=itemgetter('uri')) - expected_loc = self.get_test_loc( - 'maven/end2end/expected_visited_uris.json') - self.check_expected_results( - results, expected_loc, regen=FIXTURES_REGEN) + results = list(model_to_dict(rec, exclude=["id"]) for rec in visited) + results = sorted(results, key=itemgetter("uri")) + expected_loc = self.get_test_loc("maven/end2end/expected_visited_uris.json") + self.check_expected_results(results, expected_loc, regen=FIXTURES_REGEN) pre_visited_uris = ResourceURI.objects.filter( - uri__contains='maven-index://').exclude(id__in=before) + uri__contains="maven-index://" + ).exclude(id__in=before) self.assertTrue( - all(ru.last_visit_date and not ru.last_map_date - for ru in pre_visited_uris)) + all(ru.last_visit_date and not ru.last_map_date for ru in pre_visited_uris) + ) package_ids_before = sorted( - p.id for p in packagedb.models.Package.objects.all()) + p.id for p in packagedb.models.Package.objects.all() + ) # now onto mapping the previsited URIs # setup @@ -323,32 +417,34 @@ def test_MavenNexusIndexVisitor_with_run_visit_then_map_end2end(self): map_uri(res_uri) newly_mapped = packagedb.models.Package.objects.filter( - download_url__startswith='https://repo1.maven.org/maven2').exclude(id__in=package_ids_before) + download_url__startswith="https://repo1.maven.org/maven2" + ).exclude(id__in=package_ids_before) # check that the saved packages are there as planned self.assertEqual(19, newly_mapped.count()) package_results = list(pac.to_dict() for pac in newly_mapped) - expected_loc = self.get_test_loc( - 'maven/end2end/expected_mapped_packages.json') - self.check_expected_results(package_results, expected_loc, fields_to_remove=[ - 'package_sets'], regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("maven/end2end/expected_mapped_packages.json") + self.check_expected_results( + package_results, + expected_loc, + fields_to_remove=["package_sets"], + regen=FIXTURES_REGEN, + ) # check that the map status has been updated correctly - visited_then_mapped = ResourceURI.objects.filter( - uri__contains='maven-index://') + visited_then_mapped = ResourceURI.objects.filter(uri__contains="maven-index://") self.assertTrue(all(ru.last_map_date for ru in visited_then_mapped)) def test_visit_and_map_using_pom_with_unicode(self): - uri = 
'https://repo1.maven.org/maven2/edu/psu/swe/commons/commons-jaxrs/1.22/commons-jaxrs-1.22.pom' - test_loc = self.get_test_loc( - 'maven/end2end_unicode/commons-jaxrs-1.22.pom') + uri = "https://repo1.maven.org/maven2/edu/psu/swe/commons/commons-jaxrs/1.22/commons-jaxrs-1.22.pom" + test_loc = self.get_test_loc("maven/end2end_unicode/commons-jaxrs-1.22.pom") before_uri = [p.id for p in ResourceURI.objects.all()] before_pkg = [p.id for p in packagedb.models.Package.objects.all()] resource_uri = ResourceURI.objects.insert(uri=uri) - with patch('requests.get') as mock_http_get: + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # visit test proper: this should insert all the test_uris visit_uri(resource_uri) @@ -359,24 +455,22 @@ def test_visit_and_map_using_pom_with_unicode(self): else: visited = ResourceURI.objects.all() - uri_results = sorted(model_to_dict( - rec, exclude=['id']) for rec in visited) + uri_results = sorted(model_to_dict(rec, exclude=["id"]) for rec in visited) expected_loc = self.get_test_loc( - 'maven/end2end_unicode/expected_visited_commons-jaxrs-1.22.json') - self.check_expected_results( - uri_results, expected_loc, regen=FIXTURES_REGEN) + "maven/end2end_unicode/expected_visited_commons-jaxrs-1.22.json" + ) + self.check_expected_results(uri_results, expected_loc, regen=FIXTURES_REGEN) if before_pkg: - mapped = packagedb.models.Package.objects.exclude( - id__in=before_pkg) + mapped = packagedb.models.Package.objects.exclude(id__in=before_pkg) else: mapped = packagedb.models.Package.objects.all() package_results = sorted(pac.to_dict() for pac in mapped) expected_loc = self.get_test_loc( - 'maven/end2end_unicode/expected_mapped_commons-jaxrs-1.22.json') - self.check_expected_results( - package_results, expected_loc, regen=FIXTURES_REGEN) + "maven/end2end_unicode/expected_mapped_commons-jaxrs-1.22.json" + ) + self.check_expected_results(package_results, expected_loc, regen=FIXTURES_REGEN) def test_visit_and_map_using_pom_with_unicode_multisteps(self): # this test deals with a single POM and the results from @@ -387,346 +481,411 @@ def test_visit_and_map_using_pom_with_unicode_multisteps(self): # this is a pre-visited as from the Maven index URI index_uri_test_loc = self.get_test_loc( - 'maven/end2end_multisteps/commons-jaxrs-1.21-index-data.json') - index_uri = json.load(open(index_uri_test_loc, 'rb')) + "maven/end2end_multisteps/commons-jaxrs-1.21-index-data.json" + ) + index_uri = json.load(open(index_uri_test_loc, "rb")) idx_resource_uri = ResourceURI.objects.insert(**index_uri) map_uri(idx_resource_uri) if before_pkg: - mapped = packagedb.models.Package.objects.exclude( - id__in=before_pkg) + mapped = packagedb.models.Package.objects.exclude(id__in=before_pkg) else: mapped = packagedb.models.Package.objects.all() - package_results = sorted((pac.to_dict() - for pac in mapped), key=lambda d: list(d.keys())) + package_results = sorted( + (pac.to_dict() for pac in mapped), key=lambda d: list(d.keys()) + ) expected_loc = self.get_test_loc( - 'maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-index.json') - self.check_expected_results(package_results, expected_loc, fields_to_remove=[ - 'package_sets'], regen=FIXTURES_REGEN) + "maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-index.json" + ) + self.check_expected_results( + package_results, + expected_loc, + fields_to_remove=["package_sets"], + regen=FIXTURES_REGEN, + ) # Step 2: map a POM # this is a pre-visited URI as from a POM 
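+        # editorial note: the same package is mapped twice, first from the
+        # minimal index data in Step 1 and then from the richer parsed POM
+        # here; the two expected fixtures capture each stage separately.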
pom_uri_test_loc = self.get_test_loc( - 'maven/end2end_multisteps/commons-jaxrs-1.21-pom-data.json') - pom_uri = json.load(open(pom_uri_test_loc, 'rb')) + "maven/end2end_multisteps/commons-jaxrs-1.21-pom-data.json" + ) + pom_uri = json.load(open(pom_uri_test_loc, "rb")) pom_resource_uri = ResourceURI.objects.insert(**pom_uri) map_uri(pom_resource_uri) if before_pkg: - mapped = packagedb.models.Package.objects.exclude( - id__in=before_pkg) + mapped = packagedb.models.Package.objects.exclude(id__in=before_pkg) else: mapped = packagedb.models.Package.objects.all() - package_results = sorted((pac.to_dict() - for pac in mapped), key=lambda d: list(d.keys())) + package_results = sorted( + (pac.to_dict() for pac in mapped), key=lambda d: list(d.keys()) + ) expected_loc = self.get_test_loc( - 'maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-pom.json') - self.check_expected_results(package_results, expected_loc, fields_to_remove=[ - 'package_sets'], regen=FIXTURES_REGEN) + "maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-pom.json" + ) + self.check_expected_results( + package_results, + expected_loc, + fields_to_remove=["package_sets"], + regen=FIXTURES_REGEN, + ) def test_visit_and_map_with_index(self): - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties' + uri = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties" test_loc = self.get_test_loc( - 'maven/end2end_index/nexus-maven-repository-index.properties') + "maven/end2end_index/nexus-maven-repository-index.properties" + ) before_uri = [p.id for p in ResourceURI.objects.all()] before_pkg = [p.id for p in packagedb.models.Package.objects.all()] resource_uri = ResourceURI.objects.insert(uri=uri) - with patch('requests.get') as mock_http_get: + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # visit test proper: this should insert all the test_uris visit_uri(resource_uri) if before_uri: - visited = ResourceURI.objects.exclude( - id__in=before_uri).order_by('uri') + visited = ResourceURI.objects.exclude(id__in=before_uri).order_by("uri") else: - visited = ResourceURI.objects.all().order_by('uri') + visited = ResourceURI.objects.all().order_by("uri") - uri_results = list(model_to_dict( - rec, exclude=['id']) for rec in visited) + uri_results = list(model_to_dict(rec, exclude=["id"]) for rec in visited) expected_loc = self.get_test_loc( - 'maven/end2end_index/expected_visited_index.json') - self.check_expected_results( - uri_results, expected_loc, regen=FIXTURES_REGEN) + "maven/end2end_index/expected_visited_index.json" + ) + self.check_expected_results(uri_results, expected_loc, regen=FIXTURES_REGEN) - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.543.gz' + uri = ( + "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.543.gz" + ) # Use a small index file for test cases test_loc = self.get_test_loc( - 'maven/end2end_index/nexus-maven-repository-index.163.gz') + "maven/end2end_index/nexus-maven-repository-index.163.gz" + ) resource_uri = ResourceURI.objects.get(uri=uri) - with patch('requests.get') as mock_http_get: + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # visit test proper: this should insert all the test_uris visit_uri(resource_uri) if before_uri: - visited = ResourceURI.objects.exclude( - id__in=before_uri).order_by('uri') + visited = 
ResourceURI.objects.exclude(id__in=before_uri).order_by("uri")
         else:
-            visited = ResourceURI.objects.all().order_by('uri')
+            visited = ResourceURI.objects.all().order_by("uri")
-        uri_results = list(model_to_dict(
-            rec, exclude=['id']) for rec in visited)
+        uri_results = list(model_to_dict(rec, exclude=["id"]) for rec in visited)
         expected_loc = self.get_test_loc(
-            'maven/end2end_index/expected_visited_increment_index.json')
-        self.check_expected_results(
-            uri_results, expected_loc, regen=FIXTURES_REGEN)
+            "maven/end2end_index/expected_visited_increment_index.json"
+        )
+        self.check_expected_results(uri_results, expected_loc, regen=FIXTURES_REGEN)


 class MavenXmlMetadataVisitorTest(JsonBasedTesting):
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     def test_visit_maven_medatata_xml_file(self):
-        uri = 'https://repo1.maven.org/maven2/st/digitru/identity-core/maven-metadata.xml'
-        test_loc = self.get_test_loc('maven/maven-metadata/maven-metadata.xml')
-        with patch('requests.get') as mock_http_get:
+        uri = (
+            "https://repo1.maven.org/maven2/st/digitru/identity-core/maven-metadata.xml"
+        )
+        test_loc = self.get_test_loc("maven/maven-metadata/maven-metadata.xml")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _, _ = maven.MavenMetaDataVisitor(uri)
-        expected_loc = self.get_test_loc('maven/maven-metadata/expected_maven_xml.json')
+        expected_loc = self.get_test_loc("maven/maven-metadata/expected_maven_xml.json")
         self.check_expected_uris(uris, expected_loc)


 class MavenHtmlIndexVisitorTest(JsonBasedTesting):
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     def test_visit_maven_medatata_html_index_jcenter_1(self):
-        uri = 'http://jcenter.bintray.com/'
-        test_loc = self.get_test_loc('maven/html/jcenter.bintray.com.html')
-        with patch('requests.get') as mock_http_get:
+        uri = "http://jcenter.bintray.com/"
+        test_loc = self.get_test_loc("maven/html/jcenter.bintray.com.html")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _, _ = maven.MavenMetaDataVisitor(uri)
-        expected_loc = self.get_test_loc('maven/html/visitor_expected_jcenter.bintray.com2.html.json')
+        expected_loc = self.get_test_loc(
+            "maven/html/visitor_expected_jcenter.bintray.com2.html.json"
+        )
         self.check_expected_uris(uris, expected_loc)

     def test_visit_maven_medatata_html_index_jcenter_2(self):
-        uri = 'http://jcenter.bintray.com/Action/app/'
-        test_loc = self.get_test_loc('maven/html/app.html')
-        with patch('requests.get') as mock_http_get:
+        uri = "http://jcenter.bintray.com/Action/app/"
+        test_loc = self.get_test_loc("maven/html/app.html")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _, _ = maven.MavenMetaDataVisitor(uri)
-        expected_loc = self.get_test_loc('maven/html/visitor_expected_app.html.json')
+        expected_loc = self.get_test_loc("maven/html/visitor_expected_app.html.json")
         self.check_expected_uris(uris, expected_loc)

     def test_visit_maven_medatata_html_index_jcenter_3(self):
-        uri = "http://jcenter.bintray.com/'com/virtualightning'/stateframework-compiler/"
-        test_loc = self.get_test_loc('maven/html/stateframework-compiler.html')
-        with patch('requests.get') as mock_http_get:
+        uri = (
+            "http://jcenter.bintray.com/'com/virtualightning'/stateframework-compiler/"
+        )
+        test_loc = self.get_test_loc("maven/html/stateframework-compiler.html")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _, _ = maven.MavenMetaDataVisitor(uri)
-        expected_loc = self.get_test_loc('maven/html/visitor_expected_stateframework-compiler.html.json')
+        expected_loc = self.get_test_loc(
+            "maven/html/visitor_expected_stateframework-compiler.html.json"
+        )
         self.check_expected_uris(uris, expected_loc)


 # FIXME: we should not need to call a visitor for testing a mapper
 class MavenMapperVisitAndMapTest(JsonBasedTesting, DjangoTestCase):
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     def test_visit_and_build_package_from_pom_axis(self):
-        uri = 'https://repo1.maven.org/maven2/axis/axis/1.4/axis-1.4.pom'
-        test_loc = self.get_test_loc('maven/mapper/axis-1.4.pom')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://repo1.maven.org/maven2/axis/axis/1.4/axis-1.4.pom"
+        test_loc = self.get_test_loc("maven/mapper/axis-1.4.pom")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _ = maven.MavenPOMVisitor(uri)
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/mapper/axis-1.4.pom.package.json')
+        expected_loc = self.get_test_loc("maven/mapper/axis-1.4.pom.package.json")
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_visit_and_build_package_from_pom_commons_pool(self):
-        uri = 'https://repo1.maven.org/maven2/commons-pool/commons-pool/1.5.7/commons-pool-1.5.7.pom'
-        test_loc = self.get_test_loc('maven/mapper/commons-pool-1.5.7.pom')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://repo1.maven.org/maven2/commons-pool/commons-pool/1.5.7/commons-pool-1.5.7.pom"
+        test_loc = self.get_test_loc("maven/mapper/commons-pool-1.5.7.pom")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _ = maven.MavenPOMVisitor(uri)
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/mapper/commons-pool-1.5.7.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/mapper/commons-pool-1.5.7.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_visit_and_build_package_from_pom_struts(self):
-        uri = 'https://repo1.maven.org/maven2/struts-menu/struts-menu/2.4.2/struts-menu-2.4.2.pom'
-        test_loc = self.get_test_loc('maven/mapper/struts-menu-2.4.2.pom')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://repo1.maven.org/maven2/struts-menu/struts-menu/2.4.2/struts-menu-2.4.2.pom"
+        test_loc = self.get_test_loc("maven/mapper/struts-menu-2.4.2.pom")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _ = maven.MavenPOMVisitor(uri)
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/mapper/struts-menu-2.4.2.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/mapper/struts-menu-2.4.2.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_visit_and_build_package_from_pom_mysql(self):
-        uri = 'https://repo1.maven.org/maven2/mysql/mysql-connector-java/5.1.27/mysql-connector-java-5.1.27.pom'
-        test_loc = self.get_test_loc(
-            'maven/mapper/mysql-connector-java-5.1.27.pom')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://repo1.maven.org/maven2/mysql/mysql-connector-java/5.1.27/mysql-connector-java-5.1.27.pom"
+        test_loc = self.get_test_loc("maven/mapper/mysql-connector-java-5.1.27.pom")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _ = maven.MavenPOMVisitor(uri)
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/mapper/mysql-connector-java-5.1.27.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/mapper/mysql-connector-java-5.1.27.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_visit_and_build_package_from_pom_xbean(self):
-        uri = 'https://repo1.maven.org/maven2/xbean/xbean-jmx/2.0/xbean-jmx-2.0.pom'
-        test_loc = self.get_test_loc('maven/mapper/xbean-jmx-2.0.pom')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://repo1.maven.org/maven2/xbean/xbean-jmx/2.0/xbean-jmx-2.0.pom"
+        test_loc = self.get_test_loc("maven/mapper/xbean-jmx-2.0.pom")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _ = maven.MavenPOMVisitor(uri)
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/mapper/xbean-jmx-2.0.pom.package.json')
+        expected_loc = self.get_test_loc("maven/mapper/xbean-jmx-2.0.pom.package.json")
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_visit_and_build_package_from_pom_maven_all(self):
-        uri = 'https://repo1.maven.org/maven2/date/yetao/maven/maven-all/1.0-RELEASE/maven-all-1.0-RELEASE.pom'
-        test_loc = self.get_test_loc('maven/mapper/maven-all-1.0-RELEASE.pom')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://repo1.maven.org/maven2/date/yetao/maven/maven-all/1.0-RELEASE/maven-all-1.0-RELEASE.pom"
+        test_loc = self.get_test_loc("maven/mapper/maven-all-1.0-RELEASE.pom")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _ = maven.MavenPOMVisitor(uri)
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/mapper/maven-all-1.0-RELEASE.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/mapper/maven-all-1.0-RELEASE.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_visit_and_build_package_from_pom_with_unicode(self):
-        uri = 'https://repo1.maven.org/maven2/edu/psu/swe/commons/commons-jaxrs/1.21/commons-jaxrs-1.21.pom'
-        test_loc = self.get_test_loc('maven/mapper/commons-jaxrs-1.21.pom')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://repo1.maven.org/maven2/edu/psu/swe/commons/commons-jaxrs/1.21/commons-jaxrs-1.21.pom"
+        test_loc = self.get_test_loc("maven/mapper/commons-jaxrs-1.21.pom")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _ = maven.MavenPOMVisitor(uri)
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/mapper/commons-jaxrs-1.21.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/mapper/commons-jaxrs-1.21.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)


 class MavenMapperGetPackageTest(JsonBasedTesting, DjangoTestCase):
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     def test_get_package_from_pom_1(self):
-        test_loc = self.get_test_loc('maven/parsing/parse/jds-3.0.1.pom')
+        test_loc = self.get_test_loc("maven/parsing/parse/jds-3.0.1.pom")
         data = open(test_loc).read()
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/parsing/parse/jds-3.0.1.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/parsing/parse/jds-3.0.1.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_2(self):
         test_loc = self.get_test_loc(
-            'maven/parsing/parse/springmvc-rest-docs-maven-plugin-1.0-RC1.pom')
+            "maven/parsing/parse/springmvc-rest-docs-maven-plugin-1.0-RC1.pom"
+        )
         data = open(test_loc).read()
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/parsing/parse/springmvc-rest-docs-maven-plugin-1.0-RC1.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/parsing/parse/springmvc-rest-docs-maven-plugin-1.0-RC1.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_3(self):
-        test_loc = self.get_test_loc('maven/parsing/parse/jds-2.17.0718b.pom')
+        test_loc = self.get_test_loc("maven/parsing/parse/jds-2.17.0718b.pom")
         data = open(test_loc).read()
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/parsing/parse/jds-2.17.0718b.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/parsing/parse/jds-2.17.0718b.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_4(self):
-        test_loc = self.get_test_loc(
-            'maven/parsing/parse/maven-javanet-plugin-1.7.pom')
+        test_loc = self.get_test_loc("maven/parsing/parse/maven-javanet-plugin-1.7.pom")
         data = open(test_loc).read()
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/parsing/parse/maven-javanet-plugin-1.7.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/parsing/parse/maven-javanet-plugin-1.7.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_5(self):
-        test_loc = self.get_test_loc('maven/parsing/loop/coreplugin-1.0.0.pom')
+        test_loc = self.get_test_loc("maven/parsing/loop/coreplugin-1.0.0.pom")
         data = open(test_loc).read()
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/parsing/loop/coreplugin-1.0.0.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/parsing/loop/coreplugin-1.0.0.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_6(self):
-        test_loc = self.get_test_loc(
-            'maven/parsing/loop/argus-webservices-2.7.0.pom')
+        test_loc = self.get_test_loc("maven/parsing/loop/argus-webservices-2.7.0.pom")
         data = open(test_loc).read()
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/parsing/loop/argus-webservices-2.7.0.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/parsing/loop/argus-webservices-2.7.0.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_7(self):
-        test_loc = self.get_test_loc('maven/parsing/loop/pkg-2.0.13.1005.pom')
+        test_loc = self.get_test_loc("maven/parsing/loop/pkg-2.0.13.1005.pom")
         data = open(test_loc).read()
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/parsing/loop/pkg-2.0.13.1005.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/parsing/loop/pkg-2.0.13.1005.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_8(self):
-        test_loc = self.get_test_loc(
-            'maven/parsing/loop/ojcms-beans-0.1-beta.pom')
+        test_loc = self.get_test_loc("maven/parsing/loop/ojcms-beans-0.1-beta.pom")
         data = open(test_loc).read()
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/parsing/loop/ojcms-beans-0.1-beta.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/parsing/loop/ojcms-beans-0.1-beta.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_9(self):
-        test_loc = self.get_test_loc(
-            'maven/parsing/loop/jacuzzi-annotations-0.2.1.pom')
+        test_loc = self.get_test_loc("maven/parsing/loop/jacuzzi-annotations-0.2.1.pom")
         data = open(test_loc).read()
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/parsing/loop/jacuzzi-annotations-0.2.1.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/parsing/loop/jacuzzi-annotations-0.2.1.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_10(self):
-        test_loc = self.get_test_loc(
-            'maven/parsing/loop/argus-webservices-2.8.0.pom')
+        test_loc = self.get_test_loc("maven/parsing/loop/argus-webservices-2.8.0.pom")
         data = open(test_loc).read()
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/parsing/loop/argus-webservices-2.8.0.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/parsing/loop/argus-webservices-2.8.0.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_11(self):
-        test_loc = self.get_test_loc(
-            'maven/parsing/loop/jacuzzi-database-0.2.1.pom')
+        test_loc = self.get_test_loc("maven/parsing/loop/jacuzzi-database-0.2.1.pom")
         data = open(test_loc).read()
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/parsing/loop/jacuzzi-database-0.2.1.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/parsing/loop/jacuzzi-database-0.2.1.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_12(self):
-        test_loc = self.get_test_loc(
-            'maven/parsing/empty/common-object-1.0.2.pom')
+        test_loc = self.get_test_loc("maven/parsing/empty/common-object-1.0.2.pom")
         data = open(test_loc).read()
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/parsing/empty/common-object-1.0.2.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/parsing/empty/common-object-1.0.2.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_get_package_from_pom_13(self):
-        test_loc = self.get_test_loc('maven/parsing/empty/osgl-http-1.1.2.pom')
+        test_loc = self.get_test_loc("maven/parsing/empty/osgl-http-1.1.2.pom")
         data = open(test_loc).read()
         package = maven.get_package(data).to_dict()
-        expected_loc = self.get_test_loc('maven/parsing/empty/osgl-http-1.1.2.pom.package.json')
+        expected_loc = self.get_test_loc(
+            "maven/parsing/empty/osgl-http-1.1.2.pom.package.json"
+        )
         self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN)

     def test_regex_maven_pom_mapper_1(self):
-        regex = re.compile(r'^https?://repo1.maven.org/maven2/.*\.pom$')
+        regex = re.compile(r"^https?://repo1.maven.org/maven2/.*\.pom$")
         result = re.match(
-            regex, 'https://repo1.maven.org/maven2/com/google/appengine/appengine-api-1.0-sdk/1.2.0/appengine-api-1.0-sdk-1.2.0.pom')
+            regex,
+            "https://repo1.maven.org/maven2/com/google/appengine/appengine-api-1.0-sdk/1.2.0/appengine-api-1.0-sdk-1.2.0.pom",
+        )
         self.assertTrue(result)

     def test_MavenNexusIndexVisitor_uris_increment_contain_correct_purl(self):
-        uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.457.gz'
+        uri = (
+            "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.457.gz"
+        )
         test_loc = self.get_test_loc(
-            'maven/index/increment2/nexus-maven-repository-index.457.gz')
-        with patch('requests.get') as mock_http_get:
+            "maven/index/increment2/nexus-maven-repository-index.457.gz"
+        )
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _data, _errors = maven.MavenNexusIndexVisitor(uri)
         uris = [u for i, u in enumerate(uris) if i % 500 == 0]
-        expected_loc = self.get_test_loc(
-            'maven/index/increment2/expected_uris.json')
+        expected_loc = self.get_test_loc("maven/index/increment2/expected_uris.json")
         self.check_expected_uris(
-            uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN)
+            uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN
+        )

     def test_MavenNexusIndexVisitor_then_get_mini_package_from_index_data(self):
-        uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.457.gz'
+        uri = (
+            "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.457.gz"
+        )
         test_loc = self.get_test_loc(
-            'maven/index/increment2/nexus-maven-repository-index.457.gz')
-        with patch('requests.get') as mock_http_get:
+            "maven/index/increment2/nexus-maven-repository-index.457.gz"
+        )
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _data, _errors = maven.MavenNexusIndexVisitor(uri)
         results = []
@@ -735,5 +894,7 @@ def test_MavenNexusIndexVisitor_then_get_mini_package_from_index_data(self):
             if i % 500 == 0:
                 minip = maven.get_mini_package(u.data, u.uri, u.package_url)
                 results.append(minip and minip.to_dict() or minip)
-        expected_loc = self.get_test_loc('maven/index/increment2/expected_mini_package.json')
+        expected_loc = self.get_test_loc(
+            "maven/index/increment2/expected_mini_package.json"
+        )
         self.check_expected_results(results, expected_loc, regen=FIXTURES_REGEN)
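[Editor's example] Every visitor test in these files uses the same offline pattern: requests.get is patched so the visitor reads a local fixture instead of the network. A minimal self-contained sketch of that idea follows; FakeResponse and fetch are hypothetical stand-ins for illustration, not minecode code:

    import json
    from unittest.mock import patch

    class FakeResponse:
        # Just enough of a requests.Response surface for code that reads .content.
        def __init__(self, content):
            self.content = content

    def fetch(url):
        # Hypothetical code under test: performs a live HTTP GET.
        import requests
        return requests.get(url).content

    def test_fetch_is_offline():
        # Serve canned bytes instead of hitting the network, the same trick
        # as the mocked_requests_get(uri, test_loc) fixtures in these suites.
        with patch("requests.get") as mock_http_get:
            mock_http_get.return_value = FakeResponse(b'{"ok": true}')
            assert fetch("https://example.invalid/anything") == b'{"ok": true}'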
diff --git a/minecode/tests/miners/test_npm.py b/minecode/tests/miners/test_npm.py
index 3b8003df..bb409f3b 100644
--- a/minecode/tests/miners/test_npm.py
+++ b/minecode/tests/miners/test_npm.py
@@ -10,86 +10,86 @@
 import json
 import os
 import re
-
-from mock import patch
+from unittest.mock import patch

 from minecode import miners
 from minecode import route
+from minecode.miners import npm
 from minecode.models import ResourceURI
+from minecode.tests import FIXTURES_REGEN
 from minecode.utils_test import JsonBasedTesting
 from minecode.utils_test import mocked_requests_get
-from minecode.miners import npm
-from minecode.tests import FIXTURES_REGEN


 class TestNPMVisit(JsonBasedTesting):
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     # FIXME: use smaller test files
     def test_NpmRegistryVisitor(self):
-        uri = 'https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=2300000'
-        test_loc = self.get_test_loc('npm/replicate_doc1.json')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=2300000"
+        test_loc = self.get_test_loc("npm/replicate_doc1.json")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, data, _errors = npm.NpmRegistryVisitor(uri)
             # this is a non-persistent visitor, lets make sure we dont return any data
             assert not data
-        expected_loc = self.get_test_loc('npm/expected_doclimit_visitor.json')
+        expected_loc = self.get_test_loc("npm/expected_doclimit_visitor.json")
         self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN)

     def test_NpmRegistryVisitor_OverLimit(self):
-        uri = 'https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=2300000'
-        test_loc = self.get_test_loc('npm/over_limit.json')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=2300000"
+        test_loc = self.get_test_loc("npm/over_limit.json")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _data, _errors = npm.NpmRegistryVisitor(uri)
-        expected_loc = self.get_test_loc('npm/expected_over_limit.json')
+        expected_loc = self.get_test_loc("npm/expected_over_limit.json")
         self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN)

     def test_NpmRegistryVisitor_1000records(self):
-        uri = 'https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=77777'
-        test_loc = self.get_test_loc('npm/1000_records.json')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=77777"
+        test_loc = self.get_test_loc("npm/1000_records.json")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _data, _errors = npm.NpmRegistryVisitor(uri)
-        expected_loc = self.get_test_loc('npm/expected_1000_records.json')
+        expected_loc = self.get_test_loc("npm/expected_1000_records.json")
         self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN)


 class TestNPMMapper(JsonBasedTesting):
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     def test_build_packages(self):
-        with open(self.get_test_loc('npm/0flux.json')) as npm_metadata:
+        with open(self.get_test_loc("npm/0flux.json")) as npm_metadata:
             metadata = json.load(npm_metadata)
         packages = miners.npm.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc('npm/0flux_npm_expected.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+        expected_loc = self.get_test_loc("npm/0flux_npm_expected.json")
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

     def test_build_package2(self):
-        with open(self.get_test_loc('npm/2112.json')) as npm_metadata:
+        with open(self.get_test_loc("npm/2112.json")) as npm_metadata:
             metadata = json.load(npm_metadata)
         packages = miners.npm.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc('npm/npm_2112_expected.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+        expected_loc = self.get_test_loc("npm/npm_2112_expected.json")
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

     def test_build_package3(self):
-        with open(self.get_test_loc('npm/microdata.json')) as npm_metadata:
+        with open(self.get_test_loc("npm/microdata.json")) as npm_metadata:
             metadata = json.load(npm_metadata)
         packages = miners.npm.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc('npm/microdata-node_expected.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+        expected_loc = self.get_test_loc("npm/microdata-node_expected.json")
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

     def test_build_package_with_visitor_data(self):
-        uri = 'https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=77777'
-        test_loc = self.get_test_loc('npm/1000_records.json')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=77777"
+        test_loc = self.get_test_loc("npm/1000_records.json")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _data, _errors = npm.NpmRegistryVisitor(uri)
             uris_list = list(uris)
@@ -98,22 +98,20 @@ def test_build_package_with_visitor_data(self):
             metadata = uris_list[29].data
             packages = miners.npm.build_packages(json.loads(metadata))
             packages = [p.to_dict() for p in packages]
-            expected_loc = self.get_test_loc('npm/29_record_expected.json')
-            self.check_expected_results(
-                packages, expected_loc, regen=FIXTURES_REGEN)
+            expected_loc = self.get_test_loc("npm/29_record_expected.json")
+            self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

             # Randomly pick a record from 0-1000
             metadata = uris_list[554].data
             packages = miners.npm.build_packages(json.loads(metadata))
             packages = [p.to_dict() for p in packages]
-            expected_loc = self.get_test_loc('npm/554_record_expected.json')
-            self.check_expected_results(
-                packages, expected_loc, regen=FIXTURES_REGEN)
+            expected_loc = self.get_test_loc("npm/554_record_expected.json")
+            self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

     def test_build_package_with_ticket_439(self):
-        uri = 'https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=10&since=7333426'
-        test_loc = self.get_test_loc('npm/ticket_439.json')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=10&since=7333426"
+        test_loc = self.get_test_loc("npm/ticket_439.json")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _data, _errors = npm.NpmRegistryVisitor(uri)
             uris_list = list(uris)
@@ -123,14 +121,13 @@ def test_build_package_with_ticket_439(self):
             metadata = uris_list[1].data
             packages = miners.npm.build_packages(json.loads(metadata))
             packages = [p.to_dict() for p in packages]
-            expected_loc = self.get_test_loc('npm/expected_ticket_439.json')
-            self.check_expected_results(
-                packages, expected_loc, regen=FIXTURES_REGEN)
+            expected_loc = self.get_test_loc("npm/expected_ticket_439.json")
+            self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

     def test_build_package_verify_ticket_440(self):
-        uri = 'https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=10&since=7632607'
-        test_loc = self.get_test_loc('npm/ticket_440_records.json')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=10&since=7632607"
+        test_loc = self.get_test_loc("npm/ticket_440_records.json")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _data, _errors = npm.NpmRegistryVisitor(uri)
             uris_list = list(uris)
@@ -140,36 +137,34 @@ def test_build_package_verify_ticket_440(self):
             metadata = uris_list[1].data
             packages = miners.npm.build_packages(json.loads(metadata))
             packages = [p.to_dict() for p in packages]
-            expected_loc = self.get_test_loc('npm/expected_ticket_440.json')
-            self.check_expected_results(
-                packages, expected_loc, regen=FIXTURES_REGEN)
+            expected_loc = self.get_test_loc("npm/expected_ticket_440.json")
+            self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

     def test_npm_mapper(self):
-        test_uri = 'https://registry.npmjs.org/angular-compare-validator'
+        test_uri = "https://registry.npmjs.org/angular-compare-validator"
         router = route.Router()
         router.append(test_uri, miners.npm.NpmPackageMapper)
-        test_loc = self.get_test_loc('npm/mapper/index.json')
-        with open(test_loc, 'rb') as test_file:
-            test_data = test_file.read().decode('utf-8')
+        test_loc = self.get_test_loc("npm/mapper/index.json")
+        with open(test_loc, "rb") as test_file:
+            test_data = test_file.read().decode("utf-8")
         test_res_uri = ResourceURI(uri=test_uri, data=test_data)
         packages = miners.npm.NpmPackageMapper(test_uri, test_res_uri)
         packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc('npm/mapper/index.expected.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+        expected_loc = self.get_test_loc("npm/mapper/index.expected.json")
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

     def test_build_package_for_jsonp_filter(self):
-        with open(self.get_test_loc('npm/jsonp-filter.json')) as npm_metadata:
+        with open(self.get_test_loc("npm/jsonp-filter.json")) as npm_metadata:
             metadata = json.load(npm_metadata)
         packages = miners.npm.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc('npm/jsonp-filter-expected.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+        expected_loc = self.get_test_loc("npm/jsonp-filter-expected.json")
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

     def test_regex_npm_mapper(self):
-        regex = re.compile(r'^https://registry.npmjs.org/[^\/]+$')
+        regex = re.compile(r"^https://registry.npmjs.org/[^\/]+$")
         result = re.match(
-            regex, 'https://registry.npmjs.org/react-mobile-navigation-modal')
+            regex, "https://registry.npmjs.org/react-mobile-navigation-modal"
+        )
         self.assertTrue(result)
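[Editor's example] The check_expected_* helpers used above compare results against committed JSON files and can regenerate them when the regen flag is set. A rough sketch of how such a helper can work; this is an illustration of the pattern, not minecode's actual implementation:

    import json

    def check_expected_results(results, expected_loc, regen=False):
        # When regen is true, rewrite the expected file from the current
        # results; otherwise load it and compare. This is how large suites
        # keep hundreds of JSON expectations maintainable.
        if regen:
            with open(expected_loc, "w") as out:
                json.dump(results, out, indent=2)
        with open(expected_loc) as inp:
            expected = json.load(inp)
        assert expected == results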
diff --git a/minecode/tests/miners/test_nuget.py b/minecode/tests/miners/test_nuget.py
index 8a0b7a7b..5f7e659a 100644
--- a/minecode/tests/miners/test_nuget.py
+++ b/minecode/tests/miners/test_nuget.py
@@ -10,103 +10,104 @@
 import json
 import os
 import re
-
-from mock import Mock
-from mock import patch
-
-from minecode.utils_test import mocked_requests_get
-from minecode.utils_test import JsonBasedTesting
+from unittest.mock import patch

 from minecode import miners
 from minecode.miners import nuget
 from minecode.tests import FIXTURES_REGEN
+from minecode.utils_test import JsonBasedTesting
+from minecode.utils_test import mocked_requests_get


 class NugetVisitorsTest(JsonBasedTesting):
-
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     def test_NugetQueryVisitor(self):
-        uri = 'https://api-v2v3search-0.nuget.org/query'
-        test_loc = self.get_test_loc('nuget/query.json')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://api-v2v3search-0.nuget.org/query"
+        test_loc = self.get_test_loc("nuget/query.json")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _data, _errors = nuget.NugetQueryVisitor(uri)
-        expected_loc = self.get_test_loc('nuget/nuget_query_expected')
+        expected_loc = self.get_test_loc("nuget/nuget_query_expected")
         self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN)

     def test_PackagesPageVisitor(self):
-        uri = 'https://api-v2v3search-0.nuget.org/query?skip=0'
-        test_loc = self.get_test_loc('nuget/query_search.json')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://api-v2v3search-0.nuget.org/query?skip=0"
+        test_loc = self.get_test_loc("nuget/query_search.json")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _data, _errors = nuget.PackagesPageVisitor(uri)
-        expected_loc = self.get_test_loc('nuget/nuget_page_json_expected')
+        expected_loc = self.get_test_loc("nuget/nuget_page_json_expected")
         self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN)

     def test_NugetAPIJsonVisitor(self):
-        uri = 'https://api.nuget.org/v3/registration1/entityframework/6.1.3.json'
-        test_loc = self.get_test_loc('nuget/entityframework.json')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://api.nuget.org/v3/registration1/entityframework/6.1.3.json"
+        test_loc = self.get_test_loc("nuget/entityframework.json")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _data, _errors = nuget.NugetAPIJsonVisitor(uri)
-        expected_loc = self.get_test_loc(
-            'nuget/nuget_downlloadvisitor_json_expected')
+        expected_loc = self.get_test_loc("nuget/nuget_downlloadvisitor_json_expected")
         self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN)

     def test_NugetHTMLPageVisitor(self):
-        uri = 'https://www.nuget.org/packages?page=1'
-        test_loc = self.get_test_loc('nuget/packages.html')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://www.nuget.org/packages?page=1"
+        test_loc = self.get_test_loc("nuget/packages.html")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _, _ = nuget.NugetHTMLPageVisitor(uri)
-        expected_loc = self.get_test_loc('nuget/packages.html.expected.json')
+        expected_loc = self.get_test_loc("nuget/packages.html.expected.json")
         self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN)

     def test_NugetHTMLPackageVisitor(self):
-        uri = 'https://www.nuget.org/packages/log4net'
-        test_loc = self.get_test_loc('nuget/log4net.html')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://www.nuget.org/packages/log4net"
+        test_loc = self.get_test_loc("nuget/log4net.html")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _errors = nuget.NugetHTMLPackageVisitor(uri)
-        self.assertTrue(b'Apache-2.0 License ' in data)
-        self.assertTrue(b'log4net is a tool to help the programmer' in data)
+        self.assertTrue(b"Apache-2.0 License " in data)
+        self.assertTrue(b"log4net is a tool to help the programmer" in data)


 class TestNugetMap(JsonBasedTesting):
-
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     def test_build_packages(self):
-        with open(self.get_test_loc('nuget/entityframework2.json')) as nuget_metadata:
+        with open(self.get_test_loc("nuget/entityframework2.json")) as nuget_metadata:
             metadata = json.load(nuget_metadata)
         packages = miners.nuget.build_packages_with_json(metadata)
         packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc('nuget/nuget_mapper_expected.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+        expected_loc = self.get_test_loc("nuget/nuget_mapper_expected.json")
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

     def test_regex_1(self):
-        regex = re.compile(r'^https://api.nuget.org/packages/.*\.nupkg$')
+        regex = re.compile(r"^https://api.nuget.org/packages/.*\.nupkg$")
         result = re.match(
-            regex, 'https://api.nuget.org/packages/entityframework.4.3.1.nupkg')
+            regex, "https://api.nuget.org/packages/entityframework.4.3.1.nupkg"
+        )
         self.assertTrue(result)

     def test_regex_2(self):
-        regex = re.compile(r'^https://api.nuget.org/v3/catalog.+\.json$')
+        regex = re.compile(r"^https://api.nuget.org/v3/catalog.+\.json$")
         result = re.match(
-            regex, 'https://api.nuget.org/v3/catalog0/data/2015.02.07.22.31.06/entityframework.4.3.1.json')
+            regex,
+            "https://api.nuget.org/v3/catalog0/data/2015.02.07.22.31.06/entityframework.4.3.1.json",
+        )
         self.assertTrue(result)

     def test_build_packages_from_html(self):
-        uri = 'https://www.nuget.org/packages/log4net'
-        test_loc = self.get_test_loc('nuget/log4net.html')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://www.nuget.org/packages/log4net"
+        test_loc = self.get_test_loc("nuget/log4net.html")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _errors = nuget.NugetHTMLPackageVisitor(uri)
-            packages = miners.nuget.build_packages_from_html(data, uri,)
+            packages = miners.nuget.build_packages_from_html(
+                data,
+                uri,
+            )
             packages = [p.to_dict() for p in packages]
-            expected_loc = self.get_test_loc(
-                'nuget/nuget_mapper_log4net_expected.json')
-            self.check_expected_results(
-                packages, expected_loc, regen=FIXTURES_REGEN)
+            expected_loc = self.get_test_loc("nuget/nuget_mapper_log4net_expected.json")
+            self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)
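[Editor's example] The visitors exercised throughout these tests are callables returning a (uris, data, errors) triple, and each test unpacks only what it asserts on. A toy visitor showing the shape of that contract; this is hypothetical and for illustration only:

    def toy_visitor(uri):
        # A visitor yields new URIs to crawl, optional fetched data, and a
        # list of errors; non-persistent visitors return data as None.
        uris = [uri.rstrip("/") + "/child.json"]
        data = None
        errors = []
        return uris, data, errors

    uris, data, errors = toy_visitor("https://example.invalid/repo/")
    assert data is None and not errors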
diff --git a/minecode/tests/miners/test_openssl.py b/minecode/tests/miners/test_openssl.py
index dd414e27..a42c4def 100644
--- a/minecode/tests/miners/test_openssl.py
+++ b/minecode/tests/miners/test_openssl.py
@@ -7,62 +7,61 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #

-from datetime import datetime
 import os
-
-from mock import Mock
-from mock import patch
+from datetime import datetime
+from unittest.mock import patch

 from django.test import TestCase as DjangoTestCase

-from minecode.utils_test import mocked_requests_get
-from minecode.utils_test import JsonBasedTesting
-
+from minecode.miners import openssl
 from minecode.miners.openssl import build_packages
 from minecode.models import ResourceURI
-from minecode.miners import openssl
 from minecode.tests import FIXTURES_REGEN
+from minecode.utils_test import JsonBasedTesting
+from minecode.utils_test import mocked_requests_get


 class OpenSSLVisitorsTest(JsonBasedTesting):
-
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     def test_OpenSSLVisitor(self):
-        uri = 'https://ftp.openssl.org/'
-        test_loc = self.get_test_loc('openssl/Index.html')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://ftp.openssl.org/"
+        test_loc = self.get_test_loc("openssl/Index.html")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _data, _errors = openssl.OpenSSLVisitor(uri)
-        expected_loc = self.get_test_loc(
-            'openssl/expected_uri_openssl_index.json')
+        expected_loc = self.get_test_loc("openssl/expected_uri_openssl_index.json")
         self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN)

     def test_OpenSSLVisitor_sub_folder(self):
-        uri = 'https://ftp.openssl.org/source/'
-        test_loc = self.get_test_loc('openssl/Indexof_source.html')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://ftp.openssl.org/source/"
+        test_loc = self.get_test_loc("openssl/Indexof_source.html")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _data, _errors = openssl.OpenSSLVisitor(uri)
         expected_loc = self.get_test_loc(
-            'openssl/expected_uri_openssl_sourceindex.json')
+            "openssl/expected_uri_openssl_sourceindex.json"
+        )
         self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN)


 class OpenSSLTest(JsonBasedTesting, DjangoTestCase):
-
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     def test_OpenSSL_mapper(self):
-        uri = 'https://ftp.openssl.org/snapshot/openssl-1.0.2-stable-SNAP-20180518.tar.gz'
-        last_modified_date = '2014-11-19 17:49'
-        last_modified_date = datetime.strptime(
-            last_modified_date, '%Y-%m-%d %H:%M')
+        uri = (
+            "https://ftp.openssl.org/snapshot/openssl-1.0.2-stable-SNAP-20180518.tar.gz"
+        )
+        last_modified_date = "2014-11-19 17:49"
+        last_modified_date = datetime.strptime(last_modified_date, "%Y-%m-%d %H:%M")
         resource_uri = ResourceURI.objects.insert(
-            uri=uri, size='527', last_modified_date=last_modified_date)
+            uri=uri, size="527", last_modified_date=last_modified_date
+        )
         packages = build_packages(resource_uri)
         packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc(
-            'openssl/openssl_mapper_expected.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+        expected_loc = self.get_test_loc("openssl/openssl_mapper_expected.json")
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)
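[Editor's example] The OpenSSL mapper test above feeds a ResourceURI whose last_modified_date is parsed from the "YYYY-MM-DD HH:MM" directory-listing format. For reference, the parsing step on its own, using the same format string as test_OpenSSL_mapper:

    from datetime import datetime

    # Parse a directory-listing timestamp into a datetime object.
    last_modified_date = datetime.strptime("2014-11-19 17:49", "%Y-%m-%d %H:%M")
    assert last_modified_date.year == 2014 and last_modified_date.minute == 49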
diff --git a/minecode/tests/miners/test_openwrt.py b/minecode/tests/miners/test_openwrt.py
index f9122260..8bf124a1 100644
--- a/minecode/tests/miners/test_openwrt.py
+++ b/minecode/tests/miners/test_openwrt.py
@@ -10,97 +10,103 @@
 import json
 import os
 from unittest.case import expectedFailure
-
-from mock import patch
-
-from minecode.utils_test import mocked_requests_get
-from minecode.utils_test import JsonBasedTesting
+from unittest.mock import patch

 from minecode import miners
 from minecode.miners import openwrt
 from minecode.tests import FIXTURES_REGEN
+from minecode.utils_test import JsonBasedTesting
+from minecode.utils_test import mocked_requests_get


 class OpenWRTVistorTest(JsonBasedTesting):
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     def test_visit_openwrt_download_pages(self):
-        uri = 'https://downloads.openwrt.org/chaos_calmer/15.05/'
-        test_loc = self.get_test_loc(
-            'openwrt/Index_of_chaos_calmer_15.05_.html')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://downloads.openwrt.org/chaos_calmer/15.05/"
+        test_loc = self.get_test_loc("openwrt/Index_of_chaos_calmer_15.05_.html")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _, _ = openwrt.OpenWrtDownloadPagesVisitor(uri)
-        expected_loc = self.get_test_loc('openwrt/chaos_calmer_15.05_expected')
+        expected_loc = self.get_test_loc("openwrt/chaos_calmer_15.05_expected")
         self.check_expected_uris(uris, expected_loc)

     def test_visitor_openwrt_download_pages2(self):
-        uri = 'https://downloads.openwrt.org/chaos_calmer/15.05/adm5120/rb1xx/packages/base/'
+        uri = "https://downloads.openwrt.org/chaos_calmer/15.05/adm5120/rb1xx/packages/base/"
         test_loc = self.get_test_loc(
-            'openwrt/Index_of_chaos_calmer_15.05_adm5120_rb1xx_packages_base_.html')
-        with patch('requests.get') as mock_http_get:
+            "openwrt/Index_of_chaos_calmer_15.05_adm5120_rb1xx_packages_base_.html"
+        )
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _, _ = openwrt.OpenWrtDownloadPagesVisitor(uri)
-        expected_loc = self.get_test_loc(
-            'openwrt/chaos_calmer_15.05_expected_2')
+        expected_loc = self.get_test_loc("openwrt/chaos_calmer_15.05_expected_2")
         self.check_expected_uris(uris, expected_loc)

     @expectedFailure
     def test_visitor_openwrt_packages_gz(self):
-        uri = 'https://downloads.openwrt.org/chaos_calmer/15.05/adm5120/rb1xx/packages/base/Packages.gz'
-        test_loc = self.get_test_loc('openwrt/Packages.gz')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://downloads.openwrt.org/chaos_calmer/15.05/adm5120/rb1xx/packages/base/Packages.gz"
+        test_loc = self.get_test_loc("openwrt/Packages.gz")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _, _ = openwrt.OpenWrtPackageIndexVisitor(uri)
-        expected_loc = self.get_test_loc('openwrt/Packages_gz_expected')
+        expected_loc = self.get_test_loc("openwrt/Packages_gz_expected")
         self.check_expected_uris(uris, expected_loc)

     @expectedFailure
     def test_visitor_openwrt_ipk(self):
-        uri = 'https://downloads.openwrt.org/chaos_calmer/15.05/adm5120/rb1xx/packages/base/6to4_12-2_all.ipk'
-        test_loc = self.get_test_loc('openwrt/6to4_12-2_all.ipk')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://downloads.openwrt.org/chaos_calmer/15.05/adm5120/rb1xx/packages/base/6to4_12-2_all.ipk"
+        test_loc = self.get_test_loc("openwrt/6to4_12-2_all.ipk")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _ = openwrt.OpenWrtPackageIndexVisitor(uri)
         result = json.loads(data)
-        json_file = self.get_test_loc('openwrt/6to4_12-2_all_ipk_expected')
+        json_file = self.get_test_loc("openwrt/6to4_12-2_all_ipk_expected")
         self.check_expected_results(result, json_file, regen=FIXTURES_REGEN)

     @expectedFailure
     def test_visitor_openwrt_ipk2(self):
-        uri = 'https://downloads.openwrt.org/kamikaze/7.09/brcm-2.4/packages/wpa-cli_0.5.7-1_mipsel.ipk'
-        test_loc = self.get_test_loc('openwrt/wpa-cli_0.5.7-1_mipsel.ipk')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://downloads.openwrt.org/kamikaze/7.09/brcm-2.4/packages/wpa-cli_0.5.7-1_mipsel.ipk"
+        test_loc = self.get_test_loc("openwrt/wpa-cli_0.5.7-1_mipsel.ipk")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             _, data, _ = openwrt.OpenWrtPackageIndexVisitor(uri)
         result = json.loads(data)
-        json_file = self.get_test_loc(
-            'openwrt/wpa-cli_0.5.7-1_mipsel.ipk_expected')
+        json_file = self.get_test_loc("openwrt/wpa-cli_0.5.7-1_mipsel.ipk_expected")
         self.check_expected_results(result, json_file)


 class OpenWRTMapperTest(JsonBasedTesting):
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     @expectedFailure
     def test_build_packages_1(self):
-        with open(self.get_test_loc('openwrt/6to4_12-2_all_ipk_expected')) as openwrt_ipk_meta:
+        with open(
+            self.get_test_loc("openwrt/6to4_12-2_all_ipk_expected")
+        ) as openwrt_ipk_meta:
             metadata = json.load(openwrt_ipk_meta)
         packages = miners.openwrt.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
-            'openwrt/6to4_12-2_all_ipk_expected_mapper.json')
+            "openwrt/6to4_12-2_all_ipk_expected_mapper.json"
+        )
         self.check_expected_results(packages, expected_loc)

     @expectedFailure
     def test_build_packages_2(self):
-        with open(self.get_test_loc('openwrt/wpa-cli_0.5.7-1_mipsel.ipk_expected')) as openwrt_ipk_meta:
+        with open(
+            self.get_test_loc("openwrt/wpa-cli_0.5.7-1_mipsel.ipk_expected")
+        ) as openwrt_ipk_meta:
             metadata = json.load(openwrt_ipk_meta)
         packages = miners.openwrt.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
-            'openwrt/wpa-cli_0.5.7-1_mipsel.ipk_expected_mapper.json')
+            "openwrt/wpa-cli_0.5.7-1_mipsel.ipk_expected_mapper.json"
+        )
         self.check_expected_results(packages, expected_loc)
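[Editor's example] Several OpenWrt tests above are marked @expectedFailure, which inverts the pass/fail accounting rather than skipping the test. A minimal standalone illustration of that unittest feature:

    import unittest

    class ExpectedFailureDemo(unittest.TestCase):
        @unittest.expectedFailure
        def test_known_broken(self):
            # Reported as an expected failure while the code under test is
            # broken; it flips to "unexpected success" once it is fixed.
            self.assertEqual(1, 2)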
diff --git a/minecode/tests/miners/test_packagist.py b/minecode/tests/miners/test_packagist.py
index da5b1efb..4103e810 100644
--- a/minecode/tests/miners/test_packagist.py
+++ b/minecode/tests/miners/test_packagist.py
@@ -9,40 +9,43 @@
 import json
 import os
-
-from mock import Mock
-from mock import patch
-
-from minecode.utils_test import mocked_requests_get
-from minecode.utils_test import JsonBasedTesting
+from unittest.mock import patch

 from minecode import miners
 from minecode.miners import packagist
 from minecode.tests import FIXTURES_REGEN
+from minecode.utils_test import JsonBasedTesting
+from minecode.utils_test import mocked_requests_get


 class PackagistVistorTest(JsonBasedTesting):
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     def test_visit_packagistlist(self):
-        uri = 'https://packagist.org/packages/list.json'
-        test_loc = self.get_test_loc('packagist/list.json')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://packagist.org/packages/list.json"
+        test_loc = self.get_test_loc("packagist/list.json")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _, _ = packagist.PackagistListVisitor(uri)
-        expected_loc = self.get_test_loc('packagist/packagist_list_expected')
+        expected_loc = self.get_test_loc("packagist/packagist_list_expected")
         self.check_expected_uris(uris, expected_loc)


 class TestPackagistMap(JsonBasedTesting):
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     def test_build_packages(self):
-        with open(self.get_test_loc('packagist/00f100_cakephp-opauth.json')) as packagist_package:
+        with open(
+            self.get_test_loc("packagist/00f100_cakephp-opauth.json")
+        ) as packagist_package:
             metadata = json.load(packagist_package)
         packages = miners.packagist.build_packages_with_json(metadata)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
-            'packagist/packaglist_00f100_cakephp-opauth_expected.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+            "packagist/packaglist_00f100_cakephp-opauth_expected.json"
+        )
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)
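[Editor's example] The PyPI index test below patches xmlrpc.client.ServerProxy as a decorator, so the mock class is handed to the test as an argument and its return_value stands in for the proxy instance. The same mechanics in isolation:

    import xmlrpc.client
    from unittest.mock import patch

    @patch("xmlrpc.client.ServerProxy")
    def list_names(mock_serverproxyclass):
        # Whatever a caller constructs is the mock's return_value instance.
        instance = mock_serverproxyclass.return_value
        instance.list_packages.return_value = iter(["pkg-a", "pkg-b"])
        server = xmlrpc.client.ServerProxy("https://example.invalid/rpc")
        return list(server.list_packages())

    assert list_names() == ["pkg-a", "pkg-b"]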
diff --git a/minecode/tests/miners/test_pypi.py b/minecode/tests/miners/test_pypi.py
index 8d39ca7c..d6143c17 100644
--- a/minecode/tests/miners/test_pypi.py
+++ b/minecode/tests/miners/test_pypi.py
@@ -10,29 +10,24 @@
 import json
 import os
+from unittest.mock import patch

 from django.test import TestCase as DjangoTestCase
-from mock import MagicMock
-from mock import Mock
-from mock import patch
-
-from packagedb.models import Package
-
-from minecode.utils_test import mocked_requests_get
-from minecode.utils_test import JsonBasedTesting
-
-from minecode import miners

 from minecode import miners
-from minecode.miners import URI
+from minecode.management.commands.run_map import map_uri
 from minecode.models import ResourceURI
 from minecode.route import Router
 from minecode.tests import FIXTURES_REGEN
-from minecode.management.commands.run_map import map_uri
+from minecode.utils_test import JsonBasedTesting
+from minecode.utils_test import mocked_requests_get
+from packagedb.models import Package


 class TestPypiVisit(JsonBasedTesting, DjangoTestCase):
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     '''
     import unittest
@@ -51,90 +46,90 @@ def test_first(self, mock_xmlrpc):
         res = server.multiply(2, 3)
         self.assertEqual(res, 6)
     '''
-    @patch('xmlrpc.client.ServerProxy')
+
+    @patch("xmlrpc.client.ServerProxy")
     def test_PypiIndexVisitor(self, mock_serverproxyclass):
-        package_list = ["0",
-                        "0-._.-._.-._.-._.-._.-._.-0",
-                        "0.0.1",
-                        "00print_lol",
-                        "vmnet",
-                        "vmo",
-                        "vmock",
-                        "vmonere",
-                        "VMPC", ]
+        package_list = [
+            "0",
+            "0-._.-._.-._.-._.-._.-._.-0",
+            "0.0.1",
+            "00print_lol",
+            "vmnet",
+            "vmo",
+            "vmock",
+            "vmonere",
+            "VMPC",
+        ]
         instance = mock_serverproxyclass.return_value
         instance.list_packages.return_value = iter(package_list)
-        uri = 'https://pypi.python.org/pypi/'
+        uri = "https://pypi.python.org/pypi/"
         uris, _data, _error = miners.pypi.PypiIndexVisitor(uri)
         self.assertIsNone(_data)
-        expected_loc = self.get_test_loc('pypi/pypiindexvisitor-expected.json')
+        expected_loc = self.get_test_loc("pypi/pypiindexvisitor-expected.json")
         self.check_expected_uris(uris, expected_loc)

     def test_PypiPackageVisitor(self):
-        uri = 'https://pypi.python.org/pypi/CAGE/json'
-        test_loc = self.get_test_loc('pypi/cage.json')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://pypi.python.org/pypi/CAGE/json"
+        test_loc = self.get_test_loc("pypi/cage.json")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _data, _error = miners.pypi.PypiPackageVisitor(uri)
-        expected_loc = self.get_test_loc('pypi/expected_uris-cage.json')
+        expected_loc = self.get_test_loc("pypi/expected_uris-cage.json")
         self.check_expected_uris(uris, expected_loc)

     def test_PypiPackageVisitor_2(self):
-        uri = 'https://pypi.python.org/pypi/boolean.py/json'
-        test_loc = self.get_test_loc('pypi/boolean.py.json')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://pypi.python.org/pypi/boolean.py/json"
+        test_loc = self.get_test_loc("pypi/boolean.py.json")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, _data, _errors = miners.pypi.PypiPackageVisitor(uri)
-        expected_loc = self.get_test_loc('pypi/expected_uris-boolean.py.json')
+        expected_loc = self.get_test_loc("pypi/expected_uris-boolean.py.json")
         self.check_expected_uris(uris, expected_loc)

     def test_PypiPackageReleaseVisitor_cage12(self):
-        uri = 'https://pypi.python.org/pypi/CAGE/1.1.2/json'
-        test_loc = self.get_test_loc('pypi/cage_1.1.2.json')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://pypi.python.org/pypi/CAGE/1.1.2/json"
+        test_loc = self.get_test_loc("pypi/cage_1.1.2.json")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, data, _error = miners.pypi.PypiPackageReleaseVisitor(uri)
-        expected_loc = self.get_test_loc('pypi/expected_uris-cage_1.1.2.json')
+        expected_loc = self.get_test_loc("pypi/expected_uris-cage_1.1.2.json")
         self.check_expected_uris(uris, expected_loc)
-        expected_loc = self.get_test_loc('pypi/expected_data-cage_1.1.2.json')
+        expected_loc = self.get_test_loc("pypi/expected_data-cage_1.1.2.json")
         self.check_expected_results(data, expected_loc)

     def test_PypiPackageReleaseVisitor_cage13(self):
-        uri = 'https://pypi.python.org/pypi/CAGE/1.1.3/json'
-        test_loc = self.get_test_loc('pypi/cage_1.1.3.json')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://pypi.python.org/pypi/CAGE/1.1.3/json"
+        test_loc = self.get_test_loc("pypi/cage_1.1.3.json")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, data, _error = miners.pypi.PypiPackageReleaseVisitor(uri)
-        expected_loc = self.get_test_loc('pypi/expected_uris-cage_1.1.3.json')
+        expected_loc = self.get_test_loc("pypi/expected_uris-cage_1.1.3.json")
         self.check_expected_uris(uris, expected_loc)
-        expected_loc = self.get_test_loc('pypi/expected_data-cage_1.1.3.json')
+        expected_loc = self.get_test_loc("pypi/expected_data-cage_1.1.3.json")
         self.check_expected_results(data, expected_loc)

     def test_PypiPackageReleaseVisitor_boolean(self):
-        uri = 'https://pypi.python.org/pypi/boolean.py/2.0.dev3/json'
-        test_loc = self.get_test_loc('pypi/boolean.py-2.0.dev3.json')
-        with patch('requests.get') as mock_http_get:
+        uri = "https://pypi.python.org/pypi/boolean.py/2.0.dev3/json"
+        test_loc = self.get_test_loc("pypi/boolean.py-2.0.dev3.json")
+        with patch("requests.get") as mock_http_get:
             mock_http_get.return_value = mocked_requests_get(uri, test_loc)
             uris, data, _error = miners.pypi.PypiPackageReleaseVisitor(uri)
-        expected_loc = self.get_test_loc(
-            'pypi/expected_uris-boolean.py-2.0.dev3.json')
+        expected_loc = self.get_test_loc("pypi/expected_uris-boolean.py-2.0.dev3.json")
         self.check_expected_uris(uris, expected_loc)
-        expected_loc = self.get_test_loc(
-            'pypi/expected_data-boolean.py-2.0.dev3.json')
+        expected_loc = self.get_test_loc("pypi/expected_data-boolean.py-2.0.dev3.json")
         self.check_expected_results(data, expected_loc)


-class MockResourceURI(object):
-
+class MockResourceURI:
     def __init__(self, uri, data):
         self.uri = uri
         self.data = data
@@ -142,90 +137,86 @@ def __init__(self, uri, data):


 class TestPypiMap(JsonBasedTesting, DjangoTestCase):
-
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     def test_build_packages_lxml(self):
-        with open(self.get_test_loc('pypi/lxml-3.2.0.json')) as pypi_meta:
+        with open(self.get_test_loc("pypi/lxml-3.2.0.json")) as pypi_meta:
             metadata = json.load(pypi_meta)
         packages = miners.pypi.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc('pypi/expected-lxml-3.2.0.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+        expected_loc = self.get_test_loc("pypi/expected-lxml-3.2.0.json")
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

     def test_build_packages_boolean(self):
-        with open(self.get_test_loc('pypi/boolean.py-2.0.dev3.json')) as pypi_meta:
+        with open(self.get_test_loc("pypi/boolean.py-2.0.dev3.json")) as pypi_meta:
             metadata = json.load(pypi_meta)
         packages = miners.pypi.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc(
-            'pypi/expected-boolean.py-2.0.dev3.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+        expected_loc = self.get_test_loc("pypi/expected-boolean.py-2.0.dev3.json")
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

     def test_build_packages_cage13(self):
-        with open(self.get_test_loc('pypi/cage_1.1.3.json')) as pypi_meta:
+        with open(self.get_test_loc("pypi/cage_1.1.3.json")) as pypi_meta:
             metadata = json.load(pypi_meta)
         packages = miners.pypi.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc('pypi/expected-CAGE-1.1.3.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+        expected_loc = self.get_test_loc("pypi/expected-CAGE-1.1.3.json")
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

     def test_build_packages_cage12(self):
-        with open(self.get_test_loc('pypi/cage_1.1.2.json')) as pypi_meta:
+        with open(self.get_test_loc("pypi/cage_1.1.2.json")) as pypi_meta:
             metadata = json.load(pypi_meta)
         packages = miners.pypi.build_packages(metadata)
         packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc('pypi/expected-CAGE-1.1.2.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+        expected_loc = self.get_test_loc("pypi/expected-CAGE-1.1.2.json")
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

     def test_PypiPackageMapper_cage(self):
-        data = open(self.get_test_loc('pypi/cage_1.1.2.json')).read()
-        uri = 'https://pypi.python.org/pypi/CAGE/1.1.2/json'
+        data = open(self.get_test_loc("pypi/cage_1.1.2.json")).read()
+        uri = "https://pypi.python.org/pypi/CAGE/1.1.2/json"
         resuri = MockResourceURI(uri, data)
         packages = miners.pypi.PypiPackageMapper(uri, resuri)
         packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc('pypi/expected-CAGE-1.1.2.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+        expected_loc = self.get_test_loc("pypi/expected-CAGE-1.1.2.json")
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

     def test_PypiPackageMapper_lxml(self):
-        data = open(self.get_test_loc('pypi/lxml-3.2.0.json')).read()
-        uri = 'https://pypi.python.org/pypi/lxml/3.2.0/json'
+        data = open(self.get_test_loc("pypi/lxml-3.2.0.json")).read()
+        uri = "https://pypi.python.org/pypi/lxml/3.2.0/json"
         resuri = MockResourceURI(uri, data)
         packages = miners.pypi.PypiPackageMapper(uri, resuri)
         packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc('pypi/expected-lxml-3.2.0.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+        expected_loc = self.get_test_loc("pypi/expected-lxml-3.2.0.json")
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

     def test_pypi_map(self):
         # setup: add a mappable URI
-        with open(self.get_test_loc('pypi/map/3to2-1.1.1.json')) as mappable:
+        with open(self.get_test_loc("pypi/map/3to2-1.1.1.json")) as mappable:
             resuri = ResourceURI(**json.load(mappable))
         resuri.save()

         # sanity check
         packages = miners.pypi.PypiPackageMapper(resuri.uri, resuri)
         packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc('pypi/map/expected-3to2-1.1.1.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+        expected_loc = self.get_test_loc("pypi/map/expected-3to2-1.1.1.json")
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)

         # build a mock router
         router = Router()
-        router.append('https://pypi.python.org/pypi/3to2/1.1.1/json', miners.pypi.PypiPackageMapper)
+        router.append(
+            "https://pypi.python.org/pypi/3to2/1.1.1/json",
+            miners.pypi.PypiPackageMapper,
+        )

         # sanity check
-        expected_mapped_package_uri = 'https://pypi.python.org/packages/8f/ab/58a363eca982c40e9ee5a7ca439e8ffc5243dde2ae660ba1ffdd4868026b/3to2-1.1.1.zip'
-        self.assertEqual(0, Package.objects.filter(
-            download_url=expected_mapped_package_uri).count())
+        expected_mapped_package_uri = "https://pypi.python.org/packages/8f/ab/58a363eca982c40e9ee5a7ca439e8ffc5243dde2ae660ba1ffdd4868026b/3to2-1.1.1.zip"
+        self.assertEqual(
+            0, Package.objects.filter(download_url=expected_mapped_package_uri).count()
+        )

         # test proper
         map_uri(resuri, _map_router=router)
-        mapped = Package.objects.filter(
-            download_url=expected_mapped_package_uri)
+        mapped = Package.objects.filter(download_url=expected_mapped_package_uri)
         self.assertEqual(1, mapped.count())
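[Editor's example] test_pypi_map above wires a Router that maps a URI pattern to a mapper callable and then lets map_uri run the route. A stripped-down router with the same resolve-and-call behavior; this is a hypothetical sketch, much simpler than minecode.route.Router:

    class TinyRouter:
        # Maps exact URI strings to handler callables.
        def __init__(self):
            self.routes = {}

        def append(self, pattern, handler):
            self.routes[pattern] = handler

        def process(self, uri, *args):
            # Look up the handler registered for this URI and invoke it.
            return self.routes[uri](uri, *args)

    router = TinyRouter()
    router.append("https://example.invalid/pkg/json", lambda uri: ["mapped:" + uri])
    assert router.process("https://example.invalid/pkg/json") == [
        "mapped:https://example.invalid/pkg/json"
    ]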
diff --git a/minecode/tests/miners/test_repodata.py b/minecode/tests/miners/test_repodata.py
index 55b9feb6..959f1321 100644
--- a/minecode/tests/miners/test_repodata.py
+++ b/minecode/tests/miners/test_repodata.py
@@ -15,84 +15,71 @@


 class TestRepoData(FileBasedTesting):
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )

     def test_get_pkg_infos(self):
-        filelists_xml = self.get_test_loc(
-            'repodata_rpms/repodata/filelists.xml')
-        primary_xml = self.get_test_loc('repodata_rpms/repodata/primary.xml')
-        other_xml = self.get_test_loc('repodata_rpms/repodata/other.xml')
+        filelists_xml = self.get_test_loc("repodata_rpms/repodata/filelists.xml")
+        primary_xml = self.get_test_loc("repodata_rpms/repodata/primary.xml")
+        other_xml = self.get_test_loc("repodata_rpms/repodata/other.xml")
         expected = [
             {
-                u'build_time': '1442515098',
-                u'buildhost': 'c1bk.rdu2.centos.org',
-                u'href': 'python-ceilometerclient-1.5.0-1.el7.src.rpm',
-                u'pkgid': '36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5',
-                u'group': 'Development/Languages',
-                u'end_header_range': '4876',
-                u'archive_size': '99648',
-                u'package_size': '101516',
-                'epoch': '0',
-                u'changelogs': [
+                "build_time": "1442515098",
+                "buildhost": "c1bk.rdu2.centos.org",
+                "href": "python-ceilometerclient-1.5.0-1.el7.src.rpm",
+                "pkgid": "36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5",
+                "group": "Development/Languages",
+                "end_header_range": "4876",
+                "archive_size": "99648",
+                "package_size": "101516",
+                "epoch": "0",
+                "changelogs": [
                     {
-                        u'date': '1387195200',
-                        u'changelog': '- Update to upstream 1.0.8\n- New dependency: python-six',
-                        u'author': 'Jakub Ruzicka 1.0.8-1'
+                        "date": "1387195200",
+                        "changelog": "- Update to upstream 1.0.8\n- New dependency: python-six",
+                        "author": "Jakub Ruzicka 1.0.8-1",
                     }
                 ],
-                'rel': '1.el7',
-                'type': 'rpm',
-                u'files': [
-                    {
-                        u'name': 'python-ceilometerclient-1.5.0.tar.gz'
-                    },
-                    {
-                        u'name': 'python-ceilometerclient.spec'
-                    }
+                "rel": "1.el7",
+                "type": "rpm",
+                "files": [
+                    {"name": "python-ceilometerclient-1.5.0.tar.gz"},
+                    {"name": "python-ceilometerclient.spec"},
                 ],
-                u'description': None,
-                u'installed_size': '99230',
-                u'file_time': '1446590411',
-                'arch': 'src',
-                'name': 'python-ceilometerclient',
-                u'license': 'ASL 2.0',
-                u'url': 'https://github.com/openstack/python-ceilometerclient',
-                u'checksum': '36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5',
-                u'directories': [],
-                u'summary': 'Python API and CLI for OpenStack Ceilometer',
-                u'start_header_range': '880',
-                u'required_rpms': [
-                    {
-                        u'name': 'python-d2to1'
-                    },
-                    {
-                        u'ver': '2.5.0',
-                        u'epoch': '0',
-                        u'flags': 'GE',
-                        u'name': 'python-oslo-sphinx'
-                    },
-                    {
-                        u'name': 'python-pbr'
-                    },
+                "description": None,
+                "installed_size": "99230",
+                "file_time": "1446590411",
+                "arch": "src",
+                "name": "python-ceilometerclient",
+                "license": "ASL 2.0",
+                "url": "https://github.com/openstack/python-ceilometerclient",
+                "checksum": "36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5",
+                "directories": [],
+                "summary": "Python API and CLI for OpenStack Ceilometer",
+                "start_header_range": "880",
+                "required_rpms": [
+                    {"name": "python-d2to1"},
                     {
-                        u'name': 'python-setuptools'
+                        "ver": "2.5.0",
+                        "epoch": "0",
+                        "flags": "GE",
+                        "name": "python-oslo-sphinx",
                     },
-                    {
-                        u'name': 'python-sphinx'
-                    },
-                    {
-                        u'name': 'python2-devel'
-                    }
+                    {"name": "python-pbr"},
+                    {"name": "python-setuptools"},
+                    {"name": "python-sphinx"},
+                    {"name": "python2-devel"},
                 ],
-                u'sourcerpm': None,
-                'ver': '1.5.0'
+                "sourcerpm": None,
+                "ver": "1.5.0",
             }
         ]
         result = repodata.get_pkg_infos(filelists_xml, primary_xml, other_xml)
         self.assertEqual(expected, result)

     def test_get_url_for_tag(self):
-        expected = 'repodata/4c31e7e12c7aa42cf4d7d0b6ab7166fad76b5e40ea18f911e4a820cfa68d1541-filelists.xml.gz'
-        repomdxml_file = self.get_test_loc('repodata_rpms/repodata/repomd.xml')
-        output = repodata.get_url_for_tag(repomdxml_file, 'filelists')
+        expected = "repodata/4c31e7e12c7aa42cf4d7d0b6ab7166fad76b5e40ea18f911e4a820cfa68d1541-filelists.xml.gz"
+        repomdxml_file = self.get_test_loc("repodata_rpms/repodata/repomd.xml")
+        output = repodata.get_url_for_tag(repomdxml_file, "filelists")
         self.assertEqual(expected, output)
diff --git a/minecode/tests/miners/test_repodata_rpms.py b/minecode/tests/miners/test_repodata_rpms.py
index 6fca7b6c..d61aedc8 100644
--- a/minecode/tests/miners/test_repodata_rpms.py
+++ b/minecode/tests/miners/test_repodata_rpms.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 #
 # Copyright (c) nexB Inc. and others. All rights reserved.
 # purldb is a trademark of nexB Inc.
@@ -9,24 +8,21 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #

-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import unicode_literals

 import os

-from minecode.utils_test import MiningTestCase
 from minecode.miners import repodata_rpms
+from minecode.utils_test import MiningTestCase


 class RepodataRPMVisitorsTest(MiningTestCase):
-    BASE_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    BASE_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "testfiles")

     def test_collect_rsync_urls(self):
-        directory_listing_loc = self.get_test_loc(
-            'repodata_rpms/centos_dir_listing')
-        base_url = 'http://mirrors.kernel.org/centos/'
+        directory_listing_loc = self.get_test_loc("repodata_rpms/centos_dir_listing")
+        base_url = "http://mirrors.kernel.org/centos/"
         uris = repodata_rpms.collect_rsync_urls(
-            directory_listing_loc, base_url, file_names=('repomd.xml',))
+            directory_listing_loc, base_url, file_names=("repomd.xml",)
+        )
         uris = list(uris)
         self.assertEqual(1, len(uris))
class TestRepomdParser(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_combine_list_of_dicts(self): - expected = {'a': '1', 'b': '2', 'c': '3'} - output = combine_list_of_dicts([{'a': '1'}, {'b': '2'}, {'c': '3'}]) + expected = {"a": "1", "b": "2", "c": "3"} + output = combine_list_of_dicts([{"a": "1"}, {"b": "2"}, {"c": "3"}]) self.assertEqual(expected, output) def test_generate_rpm_objects(self): - packages = [{'name': 'python-ceilometerclient', 'arch': 'src', 'ver': '1.5.0', - 'rel': '1.el7', 'href': '/python-ceilometerclient-1.5.0-1.el7.src.rpm'}] - repomdxml_url = 'http://vault.centos.org/7.1.1503/cloud/Source/openstack-liberty' + packages = [ + { + "name": "python-ceilometerclient", + "arch": "src", + "ver": "1.5.0", + "rel": "1.el7", + "href": "/python-ceilometerclient-1.5.0-1.el7.src.rpm", + } + ] + repomdxml_url = ( + "http://vault.centos.org/7.1.1503/cloud/Source/openstack-liberty" + ) rpms = list(generate_rpm_objects(packages, repomdxml_url)) self.assertEqual(1, len(rpms)) rpm = rpms[0] - self.assertEqual('python-ceilometerclient', rpm.name) - self.assertEqual( - EVR(version='1.5.0', release='1.el7').to_string(), rpm.version) + self.assertEqual("python-ceilometerclient", rpm.name) + self.assertEqual(EVR(version="1.5.0", release="1.el7").to_string(), rpm.version) def test_collect_rpm_packages_from_repomd_cloudera(self): uri2loc = { - 'http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/repomd.xml': - self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera/repomd.xml'), - 'http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/filelists.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera/filelists.xml.gz'), - 'http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/other.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera/other.xml.gz'), - 'http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/primary.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera/primary.xml.gz'), + "http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/repomd.xml": self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera/repomd.xml" + ), + "http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/filelists.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera/filelists.xml.gz" + ), + "http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/other.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera/other.xml.gz" + ), + "http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/primary.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera/primary.xml.gz" + ), } - uri = 'http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/repomd.xml' - with patch('requests.get') as mock_http_get: - mock_http_get.side_effect = lambda * args, **kwargs: mocked_requests_get_for_uris( - uri2loc, *args, **kwargs) + uri = "http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/repomd.xml" + with patch("requests.get") as mock_http_get: + mock_http_get.side_effect = ( + lambda *args, **kwargs: mocked_requests_get_for_uris( + uri2loc, *args, **kwargs + ) + ) _uris, packages, _error = collect_rpm_packages_from_repomd(uri) expected_loc = self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera/expected.json') - self.check_expected_results( - packages, expected_loc, 
regen=FIXTURES_REGEN) + "repodata_rpms/repomd_parser/cloudera/expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_collect_rpm_packages_from_repomd_centos(self): uri2loc = { - 'http://vault.centos.org/3.8/updates/x86_64/repodata/repomd.xml': - self.get_test_loc( - 'repodata_rpms/repomd_parser/centos/repomd.xml'), - 'http://vault.centos.org/3.8/updates/x86_64/repodata/filelists.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/centos/filelists.xml.gz'), - 'http://vault.centos.org/3.8/updates/x86_64/repodata/other.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/centos/other.xml.gz'), - 'http://vault.centos.org/3.8/updates/x86_64/repodata/primary.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/centos/primary.xml.gz'), + "http://vault.centos.org/3.8/updates/x86_64/repodata/repomd.xml": self.get_test_loc( + "repodata_rpms/repomd_parser/centos/repomd.xml" + ), + "http://vault.centos.org/3.8/updates/x86_64/repodata/filelists.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/centos/filelists.xml.gz" + ), + "http://vault.centos.org/3.8/updates/x86_64/repodata/other.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/centos/other.xml.gz" + ), + "http://vault.centos.org/3.8/updates/x86_64/repodata/primary.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/centos/primary.xml.gz" + ), } - uri = 'http://vault.centos.org/3.8/updates/x86_64/repodata/repomd.xml' - with patch('requests.get') as mock_http_get: - mock_http_get.side_effect = lambda * args, **kwargs: mocked_requests_get_for_uris( - uri2loc, *args, **kwargs) + uri = "http://vault.centos.org/3.8/updates/x86_64/repodata/repomd.xml" + with patch("requests.get") as mock_http_get: + mock_http_get.side_effect = ( + lambda *args, **kwargs: mocked_requests_get_for_uris( + uri2loc, *args, **kwargs + ) + ) uris, packages, _error = collect_rpm_packages_from_repomd(uri) expected_uris = [ - URI(uri='http://vault.centos.org/3.8/updates/x86_64/RPMS/wireshark-0.99.2-EL3.1.x86_64.rpm'), - URI(uri='http://vault.centos.org/3.8/updates/x86_64/RPMS/wireshark-gnome-0.99.2-EL3.1.x86_64.rpm'), - URI(uri='http://vault.centos.org/3.8/updates/x86_64/RPMS/XFree86-100dpi-fonts-4.3.0-111.EL.x86_64.rpm') + URI( + uri="http://vault.centos.org/3.8/updates/x86_64/RPMS/wireshark-0.99.2-EL3.1.x86_64.rpm" + ), + URI( + uri="http://vault.centos.org/3.8/updates/x86_64/RPMS/wireshark-gnome-0.99.2-EL3.1.x86_64.rpm" + ), + URI( + uri="http://vault.centos.org/3.8/updates/x86_64/RPMS/XFree86-100dpi-fonts-4.3.0-111.EL.x86_64.rpm" + ), ] self.assertEqual(expected_uris, uris) expected_loc = self.get_test_loc( - 'repodata_rpms/repomd_parser/centos/expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "repodata_rpms/repomd_parser/centos/expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_collect_rpm_packages_from_repomd_cloudera_2(self): uri2loc = { - 'http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/repomd.xml': - self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera2/repomd.xml'), - 'http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/filelists.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera2/filelists.xml.gz'), - 'http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/primary.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera2/primary.xml.gz'), - 
'http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/other.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera2/other.xml.gz'), + "http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/repomd.xml": self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera2/repomd.xml" + ), + "http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/filelists.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera2/filelists.xml.gz" + ), + "http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/primary.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera2/primary.xml.gz" + ), + "http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/other.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera2/other.xml.gz" + ), } - uri = 'http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/repomd.xml' - with patch('requests.get') as mock_http_get: - mock_http_get.side_effect = lambda * args, **kwargs: mocked_requests_get_for_uris( - uri2loc, *args, **kwargs) + uri = "http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/repomd.xml" + with patch("requests.get") as mock_http_get: + mock_http_get.side_effect = ( + lambda *args, **kwargs: mocked_requests_get_for_uris( + uri2loc, *args, **kwargs + ) + ) _uris, packages, _error = collect_rpm_packages_from_repomd(uri) expected_loc = self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera2/expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "repodata_rpms/repomd_parser/cloudera2/expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_collect_rpm_packages_from_repomd_postgresql(self): uri2loc = { - 'http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/repomd.xml': - self.get_test_loc( - 'repodata_rpms/repomd_parser/postgresql/repomd.xml'), - 'http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/d5b4a2d13632cceb2a13a42fdb2887a22c1e262e6eeeb7270a80beec453392cd-filelists.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/postgresql/d5b4a2d13632cceb2a13a42fdb2887a22c1e262e6eeeb7270a80beec453392cd-filelists.xml.gz'), - 'http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/fc8c4fa6295d68abddcf5bba71435ecf585c439b86d7e75e0ba9bf3951f914b5-other.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/postgresql/fc8c4fa6295d68abddcf5bba71435ecf585c439b86d7e75e0ba9bf3951f914b5-other.xml.gz'), - 'http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/d5cb2a54df0aa000ac2a007b1d9b0d1f2e6a924d2d97584acbe654e59aa993e8-primary.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/postgresql/d5cb2a54df0aa000ac2a007b1d9b0d1f2e6a924d2d97584acbe654e59aa993e8-primary.xml.gz'), + "http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/repomd.xml": self.get_test_loc( + "repodata_rpms/repomd_parser/postgresql/repomd.xml" + ), + "http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/d5b4a2d13632cceb2a13a42fdb2887a22c1e262e6eeeb7270a80beec453392cd-filelists.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/postgresql/d5b4a2d13632cceb2a13a42fdb2887a22c1e262e6eeeb7270a80beec453392cd-filelists.xml.gz" + ), + "http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/fc8c4fa6295d68abddcf5bba71435ecf585c439b86d7e75e0ba9bf3951f914b5-other.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/postgresql/fc8c4fa6295d68abddcf5bba71435ecf585c439b86d7e75e0ba9bf3951f914b5-other.xml.gz" + ), + 
"http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/d5cb2a54df0aa000ac2a007b1d9b0d1f2e6a924d2d97584acbe654e59aa993e8-primary.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/postgresql/d5cb2a54df0aa000ac2a007b1d9b0d1f2e6a924d2d97584acbe654e59aa993e8-primary.xml.gz" + ), } - uri = 'http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/repomd.xml' - with patch('requests.get') as mock_http_get: - mock_http_get.side_effect = lambda * args, **kwargs: mocked_requests_get_for_uris( - uri2loc, *args, **kwargs) + uri = "http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/repomd.xml" + with patch("requests.get") as mock_http_get: + mock_http_get.side_effect = ( + lambda *args, **kwargs: mocked_requests_get_for_uris( + uri2loc, *args, **kwargs + ) + ) uris, packages, error = collect_rpm_packages_from_repomd(uri) self.assertEqual(None, error) expected_uris = [ - URI(uri='http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/skytools-92-debuginfo-3.1.5-1.rhel6.x86_64.rpm'), - URI(uri='http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repmgr92-2.0.2-4.rhel6.x86_64.rpm'), - URI(uri='http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/pgagent_92-3.2.1-1.rhel6.x86_64.rpm') + URI( + uri="http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/skytools-92-debuginfo-3.1.5-1.rhel6.x86_64.rpm" + ), + URI( + uri="http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repmgr92-2.0.2-4.rhel6.x86_64.rpm" + ), + URI( + uri="http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/pgagent_92-3.2.1-1.rhel6.x86_64.rpm" + ), ] self.assertEqual(expected_uris, uris) expected_loc = self.get_test_loc( - 'repodata_rpms/repomd_parser/postgresql/expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "repodata_rpms/repomd_parser/postgresql/expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_collect_rpm_packages_from_repomd_opensuse(self): uri2loc = { - 'http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/repomd.xml': - self.get_test_loc( - 'repodata_rpms/repomd_parser/opensuse/repomd.xml'), - 'http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/09ed18eaa761fe64c863137db5c51fdb4e60fbb29d6c9b0c424e3119ba4875cd-filelists.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/opensuse/09ed18eaa761fe64c863137db5c51fdb4e60fbb29d6c9b0c424e3119ba4875cd-filelists.xml.gz'), - 'http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/9c100bbff252834349ca677813f333881ce9d2ca9db8091ce387156ba7a22859-other.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/opensuse/9c100bbff252834349ca677813f333881ce9d2ca9db8091ce387156ba7a22859-other.xml.gz'), - 'http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/314da4321afcff987bd3e28672e60f1a2324f2698480b84812f7ec0a1aef4041-primary.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/opensuse/314da4321afcff987bd3e28672e60f1a2324f2698480b84812f7ec0a1aef4041-primary.xml.gz'), + "http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/repomd.xml": self.get_test_loc( + "repodata_rpms/repomd_parser/opensuse/repomd.xml" + ), + "http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/09ed18eaa761fe64c863137db5c51fdb4e60fbb29d6c9b0c424e3119ba4875cd-filelists.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/opensuse/09ed18eaa761fe64c863137db5c51fdb4e60fbb29d6c9b0c424e3119ba4875cd-filelists.xml.gz" + ), + 
"http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/9c100bbff252834349ca677813f333881ce9d2ca9db8091ce387156ba7a22859-other.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/opensuse/9c100bbff252834349ca677813f333881ce9d2ca9db8091ce387156ba7a22859-other.xml.gz" + ), + "http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/314da4321afcff987bd3e28672e60f1a2324f2698480b84812f7ec0a1aef4041-primary.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/opensuse/314da4321afcff987bd3e28672e60f1a2324f2698480b84812f7ec0a1aef4041-primary.xml.gz" + ), } - uri = 'http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/repomd.xml' - with patch('requests.get') as mock_http_get: - mock_http_get.side_effect = lambda * args, **kwargs: mocked_requests_get_for_uris( - uri2loc, *args, **kwargs) + uri = "http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/repomd.xml" + with patch("requests.get") as mock_http_get: + mock_http_get.side_effect = ( + lambda *args, **kwargs: mocked_requests_get_for_uris( + uri2loc, *args, **kwargs + ) + ) _uris, packages, _error = collect_rpm_packages_from_repomd(uri) expected_loc = self.get_test_loc( - 'repodata_rpms/repomd_parser/opensuse/expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "repodata_rpms/repomd_parser/opensuse/expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_collect_rpm_packages_from_repomd_pgpool(self): uri2loc = { - 'http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/repomd.xml': - self.get_test_loc( - 'repodata_rpms/repomd_parser/pgpool/repomd.xml'), - 'http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/filelists.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/pgpool/filelists.xml.gz'), - 'http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/other.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/pgpool/other.xml.gz'), - 'http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/primary.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/pgpool/primary.xml.gz'), + "http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/repomd.xml": self.get_test_loc( + "repodata_rpms/repomd_parser/pgpool/repomd.xml" + ), + "http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/filelists.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/pgpool/filelists.xml.gz" + ), + "http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/other.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/pgpool/other.xml.gz" + ), + "http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/primary.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/pgpool/primary.xml.gz" + ), } - uri = 'http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/repomd.xml' - with patch('requests.get') as mock_http_get: - mock_http_get.side_effect = lambda * args, **kwargs: mocked_requests_get_for_uris( - uri2loc, *args, **kwargs) + uri = "http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/repomd.xml" + with patch("requests.get") as mock_http_get: + mock_http_get.side_effect = ( + lambda *args, **kwargs: mocked_requests_get_for_uris( + uri2loc, *args, **kwargs + ) + ) _uris, packages, _error = collect_rpm_packages_from_repomd(uri) expected_loc = self.get_test_loc( - 'repodata_rpms/repomd_parser/pgpool/expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + 
"repodata_rpms/repomd_parser/pgpool/expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_combine_dicts_using_pkgid(self): all_dicts = [ - {'pkgid': '36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5', - 'name': 'python-ceilometerclient'}, - {'pkgid': '36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5', 'ver': '1.5.0'}, - {'pkgid': '36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5', 'rel': '1.el7'} + { + "pkgid": "36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5", + "name": "python-ceilometerclient", + }, + { + "pkgid": "36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5", + "ver": "1.5.0", + }, + { + "pkgid": "36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5", + "rel": "1.el7", + }, ] expected = [ - {'pkgid': '36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5', - 'name': 'python-ceilometerclient', - 'rel': '1.el7', - 'ver': '1.5.0'} + { + "pkgid": "36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5", + "name": "python-ceilometerclient", + "rel": "1.el7", + "ver": "1.5.0", + } ] output = combine_dicts_using_pkgid(all_dicts) self.assertEqual(expected, output) diff --git a/minecode/tests/miners/test_rubygems.py b/minecode/tests/miners/test_rubygems.py index edb3e91c..5d5b3b99 100644 --- a/minecode/tests/miners/test_rubygems.py +++ b/minecode/tests/miners/test_rubygems.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # purldb is a trademark of nexB Inc. @@ -12,32 +11,28 @@ import codecs import json import os +from unittest.mock import patch -from mock import Mock -from mock import patch - -from commoncode.fileutils import file_name from django.test import TestCase as DjangoTestCase -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting -from minecode.utils_test import model_to_dict +from commoncode.fileutils import file_name from minecode import miners from minecode import route -from minecode.models import ResourceURI from minecode import visit_router -from minecode.miners.rubygems import build_rubygem_packages_from_api_data -from minecode.miners.rubygems import build_rubygem_packages_from_metadata -from minecode.miners.rubygems import RubyGemsApiVersionsJsonMapper -from minecode.miners.rubygems import RubyGemsPackageArchiveMetadataMapper - -from minecode.miners.rubygems import get_gem_metadata from minecode.miners.rubygems import RubyGemsApiManyVersionsVisitor +from minecode.miners.rubygems import RubyGemsApiVersionsJsonMapper from minecode.miners.rubygems import RubyGemsIndexVisitor +from minecode.miners.rubygems import RubyGemsPackageArchiveMetadataMapper from minecode.miners.rubygems import RubyGemsPackageArchiveMetadataVisitor +from minecode.miners.rubygems import build_rubygem_packages_from_api_data +from minecode.miners.rubygems import build_rubygem_packages_from_metadata +from minecode.miners.rubygems import get_gem_metadata +from minecode.models import ResourceURI from minecode.tests import FIXTURES_REGEN - +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get +from minecode.utils_test import model_to_dict # # TODO: also parse Gemspec @@ -46,154 +41,161 @@ class RubyGemsVisitorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), 
"testfiles" + ) def test_check_gem_file_visitor_routes(self): routes = [ - 'https://rubygems.org/downloads/m2r-2.1.0.gem', # https - 'http://rubygems.org/downloads/m2r-2.1.0.gem', # http - 'https://rubygems.org/downloads/O365RubyEasy-0.0.1.gem', # upper + "https://rubygems.org/downloads/m2r-2.1.0.gem", # https + "http://rubygems.org/downloads/m2r-2.1.0.gem", # http + "https://rubygems.org/downloads/O365RubyEasy-0.0.1.gem", # upper ] for route in routes: self.assertTrue(visit_router.resolve(route)) def test_RubyGemsIndexVisitor_latest(self): - uri = 'http://rubygems.org/specs.4.8.gz' - test_loc = self.get_test_loc('rubygems/index/latest_specs.4.8.gz') - with patch('requests.get') as mock_http_get: + uri = "http://rubygems.org/specs.4.8.gz" + test_loc = self.get_test_loc("rubygems/index/latest_specs.4.8.gz") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = RubyGemsIndexVisitor(uri) expected_loc = self.get_test_loc( - 'rubygems/index/latest_specs.4.8.gz.expected.json') + "rubygems/index/latest_specs.4.8.gz.expected.json" + ) uris_list = list(uris) self.assertTrue(len(uris_list) > 1000) - self.check_expected_uris( - uris_list[0:1000], expected_loc, regen=FIXTURES_REGEN) + self.check_expected_uris(uris_list[0:1000], expected_loc, regen=FIXTURES_REGEN) def test_RubyGemsApiVersionVisitor(self): - uri = 'https://rubygems.org/api/v1/versions/0xffffff.json' - test_loc = self.get_test_loc('rubygems/apiv1/0xffffff.api.json') - with patch('requests.get') as mock_http_get: + uri = "https://rubygems.org/api/v1/versions/0xffffff.json" + test_loc = self.get_test_loc("rubygems/apiv1/0xffffff.api.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = RubyGemsApiManyVersionsVisitor(uri) - expected_loc = self.get_test_loc( - 'rubygems/apiv1/expected_0xffffff.api.json') + expected_loc = self.get_test_loc("rubygems/apiv1/expected_0xffffff.api.json") self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN) def test_RubyGemsApiVersionVisitor2(self): - uri = 'https://rubygems.org/api/v1/versions/a1630ty_a1630ty.json' - test_loc = self.get_test_loc('rubygems/apiv1/a1630ty_a1630ty.api.json') - with patch('requests.get') as mock_http_get: + uri = "https://rubygems.org/api/v1/versions/a1630ty_a1630ty.json" + test_loc = self.get_test_loc("rubygems/apiv1/a1630ty_a1630ty.api.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = RubyGemsApiManyVersionsVisitor(uri) expected_loc = self.get_test_loc( - 'rubygems/apiv1/expected_a1630ty_a1630ty.api.json') + "rubygems/apiv1/expected_a1630ty_a1630ty.api.json" + ) self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN) def test_RubyGemsApiVersionVisitor3(self): - uri = 'https://rubygems.org/api/v1/versions/zuck.json' - test_loc = self.get_test_loc('rubygems/apiv1/zuck.api.json') - with patch('requests.get') as mock_http_get: + uri = "https://rubygems.org/api/v1/versions/zuck.json" + test_loc = self.get_test_loc("rubygems/apiv1/zuck.api.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = RubyGemsApiManyVersionsVisitor(uri) - expected_loc = self.get_test_loc( - 'rubygems/apiv1/expected_zuck.api.json') + expected_loc = self.get_test_loc("rubygems/apiv1/expected_zuck.api.json") self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN) def 
test_RubyGemsPackageArchiveMetadataVisitor(self): - uri = 'https://rubygems.org/downloads/a_okay-0.1.0.gem' - test_loc = self.get_test_loc('rubygems/a_okay-0.1.0.gem', copy=True) - with patch('requests.get') as mock_http_get: + uri = "https://rubygems.org/downloads/a_okay-0.1.0.gem" + test_loc = self.get_test_loc("rubygems/a_okay-0.1.0.gem", copy=True) + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = RubyGemsPackageArchiveMetadataVisitor(uri) - expected_loc = self.get_test_loc('rubygems/a_okay-0.1.0.gem.metadata') + expected_loc = self.get_test_loc("rubygems/a_okay-0.1.0.gem.metadata") with open(expected_loc) as expect_file: self.assertEqual(expect_file.read(), data) class RubyGemsApiMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_build_rubygem_packages_from_api_data_1(self): - with open(self.get_test_loc('rubygems/apiv1/0xffffff.api.json')) as api: + with open(self.get_test_loc("rubygems/apiv1/0xffffff.api.json")) as api: apidata = json.load(api) - packages = build_rubygem_packages_from_api_data(apidata, '0xffffff') + packages = build_rubygem_packages_from_api_data(apidata, "0xffffff") packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'rubygems/apiv1/0xffffff.api.package.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("rubygems/apiv1/0xffffff.api.package.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_build_rubygem_packages_from_api_data_2(self): - with open(self.get_test_loc('rubygems/apiv1/zuck.api.json')) as api: + with open(self.get_test_loc("rubygems/apiv1/zuck.api.json")) as api: apidata = json.load(api) - packages = build_rubygem_packages_from_api_data(apidata, 'zuck') + packages = build_rubygem_packages_from_api_data(apidata, "zuck") packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'rubygems/apiv1/zuck.api.package.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("rubygems/apiv1/zuck.api.package.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_build_rubygem_packages_from_api_data_3(self): - with open(self.get_test_loc('rubygems/apiv1/a1630ty_a1630ty.api.json')) as api: + with open(self.get_test_loc("rubygems/apiv1/a1630ty_a1630ty.api.json")) as api: apidata = json.load(api) - packages = miners.rubygems.build_rubygem_packages_from_api_data(apidata, 'a1630ty_a1630ty') + packages = miners.rubygems.build_rubygem_packages_from_api_data( + apidata, "a1630ty_a1630ty" + ) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( - 'rubygems/apiv1/a1630ty_a1630ty.api.package.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "rubygems/apiv1/a1630ty_a1630ty.api.package.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_build_rubygem_packages_from_api_data_with_deps(self): - with open(self.get_test_loc('rubygems/apiv1/action_tracker.api.json')) as api: + with open(self.get_test_loc("rubygems/apiv1/action_tracker.api.json")) as api: apidata = json.load(api) - packages = miners.rubygems.build_rubygem_packages_from_api_data(apidata, 'action_tracker') + packages = 
miners.rubygems.build_rubygem_packages_from_api_data(
+            apidata, "action_tracker"
+        )
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
-            'rubygems/apiv1/action_tracker.api.package.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+            "rubygems/apiv1/action_tracker.api.package.json"
+        )
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)
 
     def test_RubyGemsApiVersionsJsonMapper(self):
-        test_uri = 'https://rubygems.org/api/v1/versions/a1630ty_a1630ty.json'
+        test_uri = "https://rubygems.org/api/v1/versions/a1630ty_a1630ty.json"
         router = route.Router()
         router.append(test_uri, RubyGemsApiVersionsJsonMapper)
-        test_loc = self.get_test_loc('rubygems/apiv1/a1630ty_a1630ty.api.json')
-        with codecs.open(test_loc, encoding='utf-8') as ltest_file:
+        test_loc = self.get_test_loc("rubygems/apiv1/a1630ty_a1630ty.api.json")
+        with codecs.open(test_loc, encoding="utf-8") as ltest_file:
             test_data = ltest_file.read()
         test_res_uri = ResourceURI(uri=test_uri, data=test_data)
         packages = RubyGemsApiVersionsJsonMapper(test_uri, test_res_uri)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
-            'rubygems/apiv1/a1630ty_a1630ty.api.mapped.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
+            "rubygems/apiv1/a1630ty_a1630ty.api.mapped.json"
+        )
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)
 
 
 class RubyGemsArchiveMapperTest(JsonBasedTesting):
-    test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles')
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )
 
     def test_test_RubyGemsPackageArchiveMetadataMapper(self):
-        test_uri = 'https://rubygems.org/downloads/mysmallidea-address_standardization-0.4.1.gem'
+        test_uri = "https://rubygems.org/downloads/mysmallidea-address_standardization-0.4.1.gem"
         router = route.Router()
         router.append(test_uri, RubyGemsPackageArchiveMetadataMapper)
         test_loc = self.get_test_loc(
-            'rubygems/mysmallidea-address_standardization-0.4.1.gem.metadata')
-        with codecs.open(test_loc, encoding='utf-8') as test_file:
+            "rubygems/mysmallidea-address_standardization-0.4.1.gem.metadata"
+        )
+        with codecs.open(test_loc, encoding="utf-8") as test_file:
             test_data = test_file.read()
         test_res_uri = ResourceURI(uri=test_uri, data=test_data)
         packages = RubyGemsPackageArchiveMetadataMapper(test_uri, test_res_uri)
         packages = [p.to_dict() for p in packages]
         expected_loc = self.get_test_loc(
-            'rubygems/mysmallidea-address_standardization-0.4.1.gem.mapped.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
-
-    def check_mapped_packages(self, test_loc, expected_loc, extract=True, regen=FIXTURES_REGEN):
+            "rubygems/mysmallidea-address_standardization-0.4.1.gem.mapped.json"
+        )
+        self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN)
+
+    def check_mapped_packages(
+        self, test_loc, expected_loc, extract=True, regen=FIXTURES_REGEN
+    ):
         test_loc = self.get_test_loc(test_loc, copy=True)
 
         if extract:
@@ -202,14 +204,15 @@ def check_mapped_packages(self, test_loc, expected_loc, extract=True, regen=FIXT
 
         with open(test_loc) as tl:
             metadata = tl.read()
-        download_url = 'https://rubygems.org/downloads/{}'.format(
-            file_name(test_loc).replace('.metadata', ''))
+        download_url = "https://rubygems.org/downloads/{}".format(
+            file_name(test_loc).replace(".metadata", "")
+        )
 
         results = build_rubygem_packages_from_metadata(metadata, download_url)
         
results = [p.to_dict() for p in results] expected_loc = self.get_test_loc(expected_loc) if regen: - with codecs.open(expected_loc, 'wb', encoding='UTF-8') as ex: + with codecs.open(expected_loc, "wb", encoding="UTF-8") as ex: json.dump(results, ex, indent=2) with open(expected_loc) as ex: @@ -219,85 +222,96 @@ def check_mapped_packages(self, test_loc, expected_loc, extract=True, regen=FIXT def test_build_rubygem_packages_from_metadata_plain(self): self.check_mapped_packages( - 'rubygems/0mq-0.4.1.gem.metadata', - 'rubygems/0mq-0.4.1.gem.package.json', - extract=False) + "rubygems/0mq-0.4.1.gem.metadata", + "rubygems/0mq-0.4.1.gem.package.json", + extract=False, + ) def test_build_rubygem_packages_from_metadata_0(self): self.check_mapped_packages( - 'rubygems/a_okay-0.1.0.gem', - 'rubygems/a_okay-0.1.0.gem.package.json') + "rubygems/a_okay-0.1.0.gem", "rubygems/a_okay-0.1.0.gem.package.json" + ) def test_build_rubygem_packages_from_metadata_1(self): self.check_mapped_packages( - 'rubygems/archive-tar-minitar-0.5.2.gem', - 'rubygems/archive-tar-minitar-0.5.2.gem.package.json') + "rubygems/archive-tar-minitar-0.5.2.gem", + "rubygems/archive-tar-minitar-0.5.2.gem.package.json", + ) def test_build_rubygem_packages_from_metadata_2(self): self.check_mapped_packages( - 'rubygems/blankslate-3.1.3.gem', - 'rubygems/blankslate-3.1.3.gem.package.json') + "rubygems/blankslate-3.1.3.gem", + "rubygems/blankslate-3.1.3.gem.package.json", + ) def test_build_rubygem_packages_from_metadata_3(self): self.check_mapped_packages( - 'rubygems/m2r-2.1.0.gem', - 'rubygems/m2r-2.1.0.gem.package.json') + "rubygems/m2r-2.1.0.gem", "rubygems/m2r-2.1.0.gem.package.json" + ) def test_build_rubygem_packages_from_metadata_4(self): self.check_mapped_packages( - 'rubygems/mysmallidea-address_standardization-0.4.1.gem', - 'rubygems/mysmallidea-address_standardization-0.4.1.gem.package.json') + "rubygems/mysmallidea-address_standardization-0.4.1.gem", + "rubygems/mysmallidea-address_standardization-0.4.1.gem.package.json", + ) def test_build_rubygem_packages_from_metadata_5(self): self.check_mapped_packages( - 'rubygems/mysmallidea-mad_mimi_mailer-0.0.9.gem', - 'rubygems/mysmallidea-mad_mimi_mailer-0.0.9.gem.package.json') + "rubygems/mysmallidea-mad_mimi_mailer-0.0.9.gem", + "rubygems/mysmallidea-mad_mimi_mailer-0.0.9.gem.package.json", + ) def test_build_rubygem_packages_from_metadata_6(self): self.check_mapped_packages( - 'rubygems/ng-rails-csrf-0.1.0.gem', - 'rubygems/ng-rails-csrf-0.1.0.gem.package.json') + "rubygems/ng-rails-csrf-0.1.0.gem", + "rubygems/ng-rails-csrf-0.1.0.gem.package.json", + ) def test_build_rubygem_packages_from_metadata_7(self): self.check_mapped_packages( - 'rubygems/small_wonder-0.1.10.gem', - 'rubygems/small_wonder-0.1.10.gem.package.json') + "rubygems/small_wonder-0.1.10.gem", + "rubygems/small_wonder-0.1.10.gem.package.json", + ) def test_build_rubygem_packages_from_metadata_8(self): self.check_mapped_packages( - 'rubygems/small-0.2.gem', - 'rubygems/small-0.2.gem.package.json') + "rubygems/small-0.2.gem", "rubygems/small-0.2.gem.package.json" + ) def test_build_rubygem_packages_from_metadata_9(self): self.check_mapped_packages( - 'rubygems/sprockets-vendor_gems-0.1.3.gem', - 'rubygems/sprockets-vendor_gems-0.1.3.gem.package.json') + "rubygems/sprockets-vendor_gems-0.1.3.gem", + "rubygems/sprockets-vendor_gems-0.1.3.gem.package.json", + ) def test_build_rubygem_packages_from_metadata_with_deps(self): self.check_mapped_packages( - 'rubygems/action_tracker-1.0.2.gem', - 
'rubygems/action_tracker-1.0.2.gem.package.json') + "rubygems/action_tracker-1.0.2.gem", + "rubygems/action_tracker-1.0.2.gem.package.json", + ) class RubyEnd2EndTest(JsonBasedTesting, DjangoTestCase): - - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_visit_and_map_end2end(self): - from minecode.management.commands.run_visit import visit_uri - from minecode.management.commands.run_map import map_uri import packagedb + from minecode.management.commands.run_map import map_uri + from minecode.management.commands.run_visit import visit_uri - uri = 'https://rubygems.org/downloads/sprockets-vendor_gems-0.1.3.gem' + uri = "https://rubygems.org/downloads/sprockets-vendor_gems-0.1.3.gem" test_loc = self.get_test_loc( - 'rubygems/sprockets-vendor_gems-0.1.3.gem', copy=True) + "rubygems/sprockets-vendor_gems-0.1.3.gem", copy=True + ) before_uri = [p.id for p in ResourceURI.objects.all()] before_pkg = [p.id for p in packagedb.models.Package.objects.all()] resource_uri = ResourceURI.objects.insert(uri=uri) - with patch('requests.get') as mock_http_get: + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # visit test proper: this should process all the test uris visit_uri(resource_uri) @@ -308,20 +322,24 @@ def test_visit_and_map_end2end(self): else: visited = ResourceURI.objects.all() - uri_results = [model_to_dict(rec, exclude=['id']) for rec in visited] + uri_results = [model_to_dict(rec, exclude=["id"]) for rec in visited] expected_loc = self.get_test_loc( - 'rubygems/sprockets-vendor_gems-0.1.3.gem.visited.json') - self.check_expected_results( - uri_results, expected_loc, regen=FIXTURES_REGEN) + "rubygems/sprockets-vendor_gems-0.1.3.gem.visited.json" + ) + self.check_expected_results(uri_results, expected_loc, regen=FIXTURES_REGEN) if before_pkg: - mapped = packagedb.models.Package.objects.exclude( - id__in=before_pkg) + mapped = packagedb.models.Package.objects.exclude(id__in=before_pkg) else: mapped = packagedb.models.Package.objects.all() package_results = [pac.to_dict() for pac in mapped] expected_loc = self.get_test_loc( - 'rubygems/sprockets-vendor_gems-0.1.3.gem.mapped.json') - self.check_expected_results(package_results, expected_loc, fields_to_remove=[ - 'package_sets'], regen=FIXTURES_REGEN) + "rubygems/sprockets-vendor_gems-0.1.3.gem.mapped.json" + ) + self.check_expected_results( + package_results, + expected_loc, + fields_to_remove=["package_sets"], + regen=FIXTURES_REGEN, + ) diff --git a/minecode/tests/miners/test_sourceforge.py b/minecode/tests/miners/test_sourceforge.py index 25a53c83..8bf0ad24 100644 --- a/minecode/tests/miners/test_sourceforge.py +++ b/minecode/tests/miners/test_sourceforge.py @@ -9,108 +9,104 @@ import json import os - -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting +from unittest.mock import patch from minecode import miners from minecode.miners import sourceforge from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class SourceforgeVisitorsTest(JsonBasedTesting): - - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_visit_sf_sitemap_index_new(self): - uri 
= 'http://sourceforge.net/sitemap.xml' - test_loc = self.get_test_loc('sourceforge/sitemap.xml') - with patch('requests.get') as mock_http_get: + uri = "http://sourceforge.net/sitemap.xml" + test_loc = self.get_test_loc("sourceforge/sitemap.xml") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, error = sourceforge.SourceforgeSitemapIndexVisitor( - uri) + uris, _data, error = sourceforge.SourceforgeSitemapIndexVisitor(uri) - expected_loc = self.get_test_loc( - 'sourceforge/expected_sf_sitemap_new.json') + expected_loc = self.get_test_loc("sourceforge/expected_sf_sitemap_new.json") self.check_expected_uris(uris, expected_loc) self.assertIsNone(error) def test_visit_sf_sitemap_page_new(self): - uri = 'http://sourceforge.net/sitemap-1.xml' - test_loc = self.get_test_loc('sourceforge/sitemap-1.xml') - with patch('requests.get') as mock_http_get: + uri = "http://sourceforge.net/sitemap-1.xml" + test_loc = self.get_test_loc("sourceforge/sitemap-1.xml") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, error = sourceforge.SourceforgeSitemapPageVisitor(uri) expected_loc = self.get_test_loc( - 'sourceforge/expected_sf_sitemap_page_new.json') + "sourceforge/expected_sf_sitemap_page_new.json" + ) self.check_expected_uris(uris, expected_loc) self.assertIsNone(error) def test_visit_sf_sitemap_page6(self): - uri = 'https://sourceforge.net/sitemap-6.xml' - test_loc = self.get_test_loc('sourceforge/sitemap-6.xml') - with patch('requests.get') as mock_http_get: + uri = "https://sourceforge.net/sitemap-6.xml" + test_loc = self.get_test_loc("sourceforge/sitemap-6.xml") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, error = sourceforge.SourceforgeSitemapPageVisitor(uri) - expected_loc = self.get_test_loc('sourceforge/expected_sitemap-6.json') + expected_loc = self.get_test_loc("sourceforge/expected_sitemap-6.json") self.check_expected_uris(uris, expected_loc) self.assertIsNone(error) def test_visit_sf_project_json_api_new(self): - uri = 'https://sourceforge.net/api/project/name/netwiki/json' - test_loc = self.get_test_loc('sourceforge/netwiki.json') - with patch('requests.get') as mock_http_get: + uri = "https://sourceforge.net/api/project/name/netwiki/json" + test_loc = self.get_test_loc("sourceforge/netwiki.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, error = sourceforge.SourceforgeProjectJsonVisitor(uri) - expected_loc = self.get_test_loc('sourceforge/expected_netwiki.json') + expected_loc = self.get_test_loc("sourceforge/expected_netwiki.json") self.check_expected_results(data, expected_loc) self.assertIsNone(error) class SourceforgeMappersTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_build_packages(self): - with open(self.get_test_loc('sourceforge/odanur.json')) as sourceforge_metadata: + with open(self.get_test_loc("sourceforge/odanur.json")) as sourceforge_metadata: metadata = json.load(sourceforge_metadata) packages = miners.sourceforge.build_packages_from_metafile(metadata) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'sourceforge/mapper_odanur_expected.json') - self.check_expected_results( - packages, 
expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("sourceforge/mapper_odanur_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_build_packages2(self): - with open(self.get_test_loc('sourceforge/openstunts.json')) as sourceforge_metadata: + with open( + self.get_test_loc("sourceforge/openstunts.json") + ) as sourceforge_metadata: metadata = json.load(sourceforge_metadata) packages = miners.sourceforge.build_packages_from_metafile(metadata) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'sourceforge/mapper_openstunts_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("sourceforge/mapper_openstunts_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_build_packages3(self): - with open(self.get_test_loc('sourceforge/monoql.json')) as sourceforge_metadata: + with open(self.get_test_loc("sourceforge/monoql.json")) as sourceforge_metadata: metadata = json.load(sourceforge_metadata) packages = miners.sourceforge.build_packages_from_metafile(metadata) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'sourceforge/mapper_omonoql_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("sourceforge/mapper_omonoql_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_build_packages4(self): - with open(self.get_test_loc('sourceforge/niftyphp.json')) as sourceforge_metadata: + with open( + self.get_test_loc("sourceforge/niftyphp.json") + ) as sourceforge_metadata: metadata = json.load(sourceforge_metadata) packages = miners.sourceforge.build_packages_from_metafile(metadata) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'sourceforge/mapper_niftyphp_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("sourceforge/mapper_niftyphp_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_api.py b/minecode/tests/test_api.py index f3507a12..9eab2af7 100644 --- a/minecode/tests/test_api.py +++ b/minecode/tests/test_api.py @@ -10,43 +10,47 @@ import json import os -from django.contrib.auth.models import Group, User +from django.contrib.auth.models import Group +from django.contrib.auth.models import User from django.core import signing from django.test import TestCase + from rest_framework import status from rest_framework.test import APIClient from minecode.models import ScannableURI from minecode.utils import get_webhook_url from minecode.utils_test import JsonBasedTesting -from packagedb.models import Package, Resource +from packagedb.models import Package +from packagedb.models import Resource class ScannableURIAPITestCase(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): self.scan_queue_worker_user = User.objects.create_user( - username="username", - email="e@mail.com", - password="secret" + username="username", email="e@mail.com", password="secret" ) scan_queue_workers_group, _ = Group.objects.get_or_create( - name='scan_queue_workers') + name="scan_queue_workers" + ) 
scan_queue_workers_group.user_set.add(self.scan_queue_worker_user) - self.scan_queue_worker_auth = f"Token {self.scan_queue_worker_user.auth_token.key}" + self.scan_queue_worker_auth = ( + f"Token {self.scan_queue_worker_user.auth_token.key}" + ) self.scan_queue_worker_client = APIClient(enforce_csrf_checks=True) self.scan_queue_worker_client.credentials( - HTTP_AUTHORIZATION=self.scan_queue_worker_auth) - self.scan_queue_worker_user_id_str = str( - self.scan_queue_worker_user.id) + HTTP_AUTHORIZATION=self.scan_queue_worker_auth + ) + self.scan_queue_worker_user_id_str = str(self.scan_queue_worker_user.id) # create a staff user self.staff_user = User.objects.create_user( username="staff_username", email="staff_e@mail.com", password="secret", - is_staff=True + is_staff=True, ) self.staff_auth = f"Token {self.staff_user.auth_token.key}" self.staff_client = APIClient(enforce_csrf_checks=True) @@ -65,161 +69,159 @@ def setUp(self): self.anonymous_client = APIClient() self.package1 = Package.objects.create( - download_url='https://test-url.com/package1.tar.gz', - type='type1', - name='name1', - version='1.0', + download_url="https://test-url.com/package1.tar.gz", + type="type1", + name="name1", + version="1.0", ) self.scannable_uri1 = ScannableURI.objects.create( - uri='https://test-url.com/package1.tar.gz', - package=self.package1 + uri="https://test-url.com/package1.tar.gz", package=self.package1 ) self.package2 = Package.objects.create( - download_url='https://test-url.com/package2.tar.gz', - type='type2', - name='name2', - version='2.0', + download_url="https://test-url.com/package2.tar.gz", + type="type2", + name="name2", + version="2.0", ) self.scannable_uri2 = ScannableURI.objects.create( - uri='https://test-url.com/package2.tar.gz', - package=self.package2 + uri="https://test-url.com/package2.tar.gz", package=self.package2 ) self.package3 = Package.objects.create( - download_url='https://test-url.com/package3.tar.gz', - type='type3', - name='name3', - version='3.0', + download_url="https://test-url.com/package3.tar.gz", + type="type3", + name="name3", + version="3.0", ) self.scannable_uri3 = ScannableURI.objects.create( - uri='https://test-url.com/package3.tar.gz', - package=self.package3 + uri="https://test-url.com/package3.tar.gz", package=self.package3 ) def test_api_scannable_uri_permissions(self): - response = self.anonymous_client.get('/api/scan_queue/') + response = self.anonymous_client.get("/api/scan_queue/") self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) - response = self.anonymous_client.get( - '/api/scan_queue/get_next_download_url/') + response = self.anonymous_client.get("/api/scan_queue/get_next_download_url/") self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) - response = self.anonymous_client.post('/api/scan_queue/update_status/') + response = self.anonymous_client.post("/api/scan_queue/update_status/") self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) - response = self.regular_client.get('/api/scan_queue/') + response = self.regular_client.get("/api/scan_queue/") self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) - response = self.regular_client.get( - '/api/scan_queue/get_next_download_url/') + response = self.regular_client.get("/api/scan_queue/get_next_download_url/") self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) - response = self.regular_client.post('/api/scan_queue/update_status/') + response = self.regular_client.post("/api/scan_queue/update_status/") 
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) def test_api_scannable_uri_list_endpoint(self): - response = self.scan_queue_worker_client.get('/api/scan_queue/') + response = self.scan_queue_worker_client.get("/api/scan_queue/") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(3, response.data.get('count')) + self.assertEqual(3, response.data.get("count")) - response = self.staff_client.get('/api/scan_queue/') + response = self.staff_client.get("/api/scan_queue/") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(3, response.data.get('count')) + self.assertEqual(3, response.data.get("count")) def test_api_scannable_uri_get_next_download_url(self): def check_webhook_url(self, webhook_url): - webhook_url = response.data.get('webhook_url') - key = webhook_url.rstrip('/').split('/')[-1] - self.assertIn('/api/scan_queue/index_package_scan/', webhook_url) - self.assertEqual(signing.loads(key), str( - self.scan_queue_worker_user.id)) + webhook_url = response.data.get("webhook_url") + key = webhook_url.rstrip("/").split("/")[-1] + self.assertIn("/api/scan_queue/index_package_scan/", webhook_url) + self.assertEqual(signing.loads(key), str(self.scan_queue_worker_user.id)) response = self.scan_queue_worker_client.get( - '/api/scan_queue/get_next_download_url/') + "/api/scan_queue/get_next_download_url/" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data.get( - 'scannable_uri_uuid'), self.scannable_uri1.uuid) - self.assertEqual(response.data.get( - 'download_url'), self.scannable_uri1.uri) - check_webhook_url(self, response.data.get('webhook_url')) + self.assertEqual( + response.data.get("scannable_uri_uuid"), self.scannable_uri1.uuid + ) + self.assertEqual(response.data.get("download_url"), self.scannable_uri1.uri) + check_webhook_url(self, response.data.get("webhook_url")) response = self.scan_queue_worker_client.get( - '/api/scan_queue/get_next_download_url/') + "/api/scan_queue/get_next_download_url/" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data.get( - 'scannable_uri_uuid'), self.scannable_uri2.uuid) - self.assertEqual(response.data.get( - 'download_url'), self.scannable_uri2.uri) - check_webhook_url(self, response.data.get('webhook_url')) + self.assertEqual( + response.data.get("scannable_uri_uuid"), self.scannable_uri2.uuid + ) + self.assertEqual(response.data.get("download_url"), self.scannable_uri2.uri) + check_webhook_url(self, response.data.get("webhook_url")) response = self.scan_queue_worker_client.get( - '/api/scan_queue/get_next_download_url/') + "/api/scan_queue/get_next_download_url/" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data.get( - 'scannable_uri_uuid'), self.scannable_uri3.uuid) - self.assertEqual(response.data.get( - 'download_url'), self.scannable_uri3.uri) - check_webhook_url(self, response.data.get('webhook_url')) + self.assertEqual( + response.data.get("scannable_uri_uuid"), self.scannable_uri3.uuid + ) + self.assertEqual(response.data.get("download_url"), self.scannable_uri3.uri) + check_webhook_url(self, response.data.get("webhook_url")) response = self.scan_queue_worker_client.get( - '/api/scan_queue/get_next_download_url/') + "/api/scan_queue/get_next_download_url/" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data.get('scannable_uri_uuid'), '') - self.assertEqual(response.data.get('download_url'), '') - 
self.assertEqual(response.data.get('webhook_url'), '') + self.assertEqual(response.data.get("scannable_uri_uuid"), "") + self.assertEqual(response.data.get("download_url"), "") + self.assertEqual(response.data.get("webhook_url"), "") - response = self.staff_client.get( - '/api/scan_queue/get_next_download_url/') + response = self.staff_client.get("/api/scan_queue/get_next_download_url/") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data.get('scannable_uri_uuid'), '') - self.assertEqual(response.data.get('download_url'), '') - self.assertEqual(response.data.get('webhook_url'), '') + self.assertEqual(response.data.get("scannable_uri_uuid"), "") + self.assertEqual(response.data.get("download_url"), "") + self.assertEqual(response.data.get("webhook_url"), "") def test_api_scannable_uri_update_status(self): scannable_uri1_uuid = self.scannable_uri1.uuid scannable_uri2_uuid = self.scannable_uri2.uuid - scannable_uri1_update_status_url = f'/api/scan_queue/{scannable_uri1_uuid}/update_status/' - scannable_uri2_update_status_url = f'/api/scan_queue/{scannable_uri2_uuid}/update_status/' + scannable_uri1_update_status_url = ( + f"/api/scan_queue/{scannable_uri1_uuid}/update_status/" + ) + scannable_uri2_update_status_url = ( + f"/api/scan_queue/{scannable_uri2_uuid}/update_status/" + ) - self.assertEqual(ScannableURI.SCAN_NEW, - self.scannable_uri1.scan_status) + self.assertEqual(ScannableURI.SCAN_NEW, self.scannable_uri1.scan_status) data = { "scannable_uri_uuid": scannable_uri1_uuid, - "scan_status": 'failed', - 'scan_log': 'scan_log', + "scan_status": "failed", + "scan_log": "scan_log", } response = self.scan_queue_worker_client.post( - scannable_uri1_update_status_url, data=data) + scannable_uri1_update_status_url, data=data + ) self.assertEqual(response.status_code, status.HTTP_200_OK) self.scannable_uri1.refresh_from_db() - self.assertEqual(ScannableURI.SCAN_FAILED, - self.scannable_uri1.scan_status) - self.assertEqual('scan_log', self.scannable_uri1.scan_error) + self.assertEqual(ScannableURI.SCAN_FAILED, self.scannable_uri1.scan_status) + self.assertEqual("scan_log", self.scannable_uri1.scan_error) - data = { - 'scan_status': '' - } + data = {"scan_status": ""} response = self.scan_queue_worker_client.post( - scannable_uri2_update_status_url, data=data) - expected_response = {'error': 'missing scan_status'} + scannable_uri2_update_status_url, data=data + ) + expected_response = {"error": "missing scan_status"} self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) self.assertEqual(expected_response, response.data) - data = { - 'scan_status': 'invalid' - } + data = {"scan_status": "invalid"} response = self.scan_queue_worker_client.post( - scannable_uri2_update_status_url, data=data) - expected_response = {'error': 'invalid scan_status: invalid'} + scannable_uri2_update_status_url, data=data + ) + expected_response = {"error": "invalid scan_status: invalid"} self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) self.assertEqual(expected_response, response.data) data = {} response = self.scan_queue_worker_client.post( - '/api/scan_queue/asdf/', data=data) - self.assertEqual(response.status_code, - status.HTTP_405_METHOD_NOT_ALLOWED) + "/api/scan_queue/asdf/", data=data + ) + self.assertEqual(response.status_code, status.HTTP_405_METHOD_NOT_ALLOWED) def test_api_scannable_uri_update_status_update_finished_scannable_uri(self): scannable_uri_uuid = self.scannable_uri3.uuid @@ -231,16 +233,13 @@ def 
test_api_scannable_uri_update_status_update_finished_scannable_uri(self): ]: self.scannable_uri3.scan_status = scan_status self.scannable_uri3.save() - data = { - 'scannable_uri_uuid': scannable_uri_uuid, - 'scan_status': 'scanned' - } + data = {"scannable_uri_uuid": scannable_uri_uuid, "scan_status": "scanned"} response = self.scan_queue_worker_client.post( - f'/api/scan_queue/{scannable_uri_uuid}/update_status/', data=data + f"/api/scan_queue/{scannable_uri_uuid}/update_status/", data=data ) expected_response = { - 'error': 'cannot update status for scannable_uri ' - f'{self.scannable_uri3.uuid}: scannable_uri has finished ' + "error": "cannot update status for scannable_uri " + f"{self.scannable_uri3.uuid}: scannable_uri has finished " f'with status "{ScannableURI.SCAN_STATUSES_BY_CODE[scan_status]}"' } self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) @@ -255,49 +254,51 @@ def test_api_scannable_uri_index_package_scan(self): self.assertFalse(self.package2.declared_license_expression) self.assertFalse(self.package2.copyright) self.assertEqual(0, Resource.objects.all().count()) - scan_file_location = self.get_test_loc('scancodeio/get_scan_data.json') + scan_file_location = self.get_test_loc("scancodeio/get_scan_data.json") summary_file_location = self.get_test_loc( - 'scancodeio/scan_summary_response.json') + "scancodeio/scan_summary_response.json" + ) project_extra_data = { - 'scannable_uri_uuid': self.scannable_uri2.uuid, - 'md5': 'md5', - 'sha1': 'sha1', - 'sha256': 'sha256', - 'sha512': 'sha512', - 'size': 100, + "scannable_uri_uuid": self.scannable_uri2.uuid, + "md5": "md5", + "sha1": "sha1", + "sha256": "sha256", + "sha512": "sha512", + "size": 100, } with ( open(scan_file_location) as scan_file, - open(summary_file_location) as summary_file + open(summary_file_location) as summary_file, ): results = json.load(scan_file) summary = json.load(summary_file) data = { - 'project': { - 'extra_data': project_extra_data, + "project": { + "extra_data": project_extra_data, }, - 'results': results, - 'summary': summary, + "results": results, + "summary": summary, } webhook_url = get_webhook_url( - 'index_package_scan', self.scan_queue_worker_user.id) + "index_package_scan", self.scan_queue_worker_user.id + ) response = self.scan_queue_worker_client.post( - webhook_url, data=data, format='json') + webhook_url, data=data, format="json" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) self.scannable_uri2.refresh_from_db() - self.assertEqual(ScannableURI.SCAN_INDEXED, - self.scannable_uri2.scan_status) + self.assertEqual(ScannableURI.SCAN_INDEXED, self.scannable_uri2.scan_status) self.package2.refresh_from_db() - self.assertEqual('md5', self.package2.md5) - self.assertEqual('sha1', self.package2.sha1) - self.assertEqual('sha256', self.package2.sha256) - self.assertEqual('sha512', self.package2.sha512) + self.assertEqual("md5", self.package2.md5) + self.assertEqual("sha1", self.package2.sha1) + self.assertEqual("sha256", self.package2.sha256) + self.assertEqual("sha512", self.package2.sha512) self.assertEqual(100, self.package2.size) + self.assertEqual("apache-2.0", self.package2.declared_license_expression) self.assertEqual( - 'apache-2.0', self.package2.declared_license_expression) - self.assertEqual( - 'Copyright (c) Apache Software Foundation', self.package2.copyright) + "Copyright (c) Apache Software Foundation", self.package2.copyright + ) self.assertFalse(self.scannable_uri2.scan_error) self.assertEqual(64, Resource.objects.all().count()) diff --git 
a/minecode/tests/test_command.py b/minecode/tests/test_command.py index d620ac83..053e8133 100644 --- a/minecode/tests/test_command.py +++ b/minecode/tests/test_command.py @@ -10,17 +10,17 @@ import os -from minecode import command from minecode import ON_WINDOWS +from minecode import command from minecode.utils_test import MiningTestCase class CommandTest(MiningTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") def test_listing_command(self): - td = self.get_test_loc('command') - osc = 'ls' if not ON_WINDOWS else 'dir' + td = self.get_test_loc("command") + osc = "ls" if not ON_WINDOWS else "dir" c = '%(osc)s "%(td)s"' % locals() cmd = command.Command(c) out, err = cmd.execute() @@ -28,6 +28,6 @@ def test_listing_command(self): self.assertEqual([], err) out = [o for o in out] - self.assertTrue(any('foo' in i for i in out)) - self.assertTrue(any('bar' in i for i in out)) - self.assertTrue(all(i.endswith('\n') for i in out)) + self.assertTrue(any("foo" in i for i in out)) + self.assertTrue(any("bar" in i for i in out)) + self.assertTrue(all(i.endswith("\n") for i in out)) diff --git a/minecode/tests/test_filter.py b/minecode/tests/test_filter.py index 03cb7697..87b67903 100644 --- a/minecode/tests/test_filter.py +++ b/minecode/tests/test_filter.py @@ -9,19 +9,19 @@ import os -from minecode.utils_test import MiningTestCase from minecode.filter import sf_net +from minecode.utils_test import MiningTestCase class FilterTest(MiningTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") def test_filter(self): - inputf = self.get_test_loc('filter_sf/tst_sfnet.csv') - exf = self.get_test_loc('filter_sf/tst_sfnet2.csv') - expected = open(exf, 'rb').read() + inputf = self.get_test_loc("filter_sf/tst_sfnet.csv") + exf = self.get_test_loc("filter_sf/tst_sfnet2.csv") + expected = open(exf, "rb").read() tdir = self.get_temp_dir() - output = os.path.join(tdir, 'out.csv') + output = os.path.join(tdir, "out.csv") sf_net(inputf, output) - test = open(output, 'rb').read() + test = open(output, "rb").read() self.assertEqual(expected, test) diff --git a/minecode/tests/test_housekeeping.py b/minecode/tests/test_housekeeping.py index 6d9ec6d2..f0053a60 100644 --- a/minecode/tests/test_housekeeping.py +++ b/minecode/tests/test_housekeeping.py @@ -7,89 +7,85 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# -import codecs import json import os from io import StringIO - -from mock import patch +from unittest.mock import patch from django.core import management from django.test import TestCase as DjangoTestCase import packagedb - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - from minecode.management.commands.check_licenses import find_ambiguous_packages from minecode.management.commands.run_map import map_uri from minecode.management.commands.run_visit import visit_uri - from minecode.models import ResourceURI from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class PackageLicenseCheckTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def test_find_ambiguous_packages_declared_license(self): packagedb.models.Package.objects.create( - download_url='http://example.com', - name='Foo', - declared_license_expression='apache-2.0 and unknown', - type='maven' + download_url="http://example.com", + name="Foo", + declared_license_expression="apache-2.0 and unknown", + type="maven", ) packages = [p.to_dict() for p in find_ambiguous_packages()] expected_loc = self.get_test_loc( - 'housekeeping/declared_license_search_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "housekeeping/declared_license_search_expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_find_ambiguous_packages_license_expression(self): packagedb.models.Package.objects.create( - download_url='http://example.com', - name='Foo', - declared_license_expression='apache-2.0 and unknown', - type='maven' + download_url="http://example.com", + name="Foo", + declared_license_expression="apache-2.0 and unknown", + type="maven", ) packages = [p.to_dict() for p in find_ambiguous_packages()] expected_loc = self.get_test_loc( - 'housekeeping/license_expression_search_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "housekeeping/license_expression_search_expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_find_ambiguous_packages_license_expression_ignore_uppercase(self): packagedb.models.Package.objects.create( - download_url='http://example.com', - name='Foo', - declared_license_expression='Unknown', - type='maven' + download_url="http://example.com", + name="Foo", + declared_license_expression="Unknown", + type="maven", ) packages = [p.to_dict() for p in find_ambiguous_packages()] expected_loc = self.get_test_loc( - 'housekeeping/ignore_upper_case_search_expected.json') + "housekeeping/ignore_upper_case_search_expected.json" + ) - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_run_check_licenses_command(self): packagedb.models.Package.objects.create( - download_url='http://example.com', - name='Foo', - declared_license_expression='apache-2.0 and unknown', - type='maven' + download_url="http://example.com", + name="Foo", + declared_license_expression="apache-2.0 and unknown", + type="maven", ) results_loc = self.get_temp_file() - expected_loc = self.get_test_loc('housekeeping/example_expected.json') + expected_loc = 
self.get_test_loc("housekeeping/example_expected.json") output = StringIO() - management.call_command('check_licenses', '-o', - results_loc, stdout=output) + management.call_command("check_licenses", "-o", results_loc, stdout=output) self.assertTrue( - 'Visited 1 packages\nFound 1 possible packages\nFound packages dumped to:' in output.getvalue()) + "Visited 1 packages\nFound 1 possible packages\nFound packages dumped to:" + in output.getvalue() + ) with open(results_loc) as results: res = json.load(results) @@ -98,25 +94,24 @@ def test_run_check_licenses_command(self): def test_run_check_licenses_command_with_empty_package(self): output = StringIO() results_loc = self.get_temp_file() - management.call_command('check_licenses', '-o', - results_loc, stdout=output) + management.call_command("check_licenses", "-o", results_loc, stdout=output) self.assertTrue( - 'Visited 0 packages\nFound 0 possible packages' in output.getvalue()) + "Visited 0 packages\nFound 0 possible packages" in output.getvalue() + ) def test_visit_and_map_using_pom(self): - uri = 'http://repo1.maven.org/maven2/org/bytesoft/bytejta-supports/0.5.0-ALPHA4/bytejta-supports-0.5.0-ALPHA4.pom' - test_loc = self.get_test_loc( - 'housekeeping/bytejta-supports-0.5.0-ALPHA4.pom') + uri = "http://repo1.maven.org/maven2/org/bytesoft/bytejta-supports/0.5.0-ALPHA4/bytejta-supports-0.5.0-ALPHA4.pom" + test_loc = self.get_test_loc("housekeeping/bytejta-supports-0.5.0-ALPHA4.pom") resource_uri = ResourceURI.objects.insert(uri=uri) - with patch('requests.get') as mock_http_get: + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # visit test proper: this should insert all the test_uris visit_uri(resource_uri) map_uri(resource_uri) packages = [p.to_dict() for p in find_ambiguous_packages()] expected_loc = self.get_test_loc( - 'housekeeping/bytejta-supports-0.5.0-ALPHA4.pom_search_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "housekeeping/bytejta-supports-0.5.0-ALPHA4.pom_search_expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_indexing.py b/minecode/tests/test_indexing.py index d55c420d..ca6e905d 100644 --- a/minecode/tests/test_indexing.py +++ b/minecode/tests/test_indexing.py @@ -24,23 +24,23 @@ class IndexingTest(MiningTestCase, JsonBasedTesting): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): self.package1 = Package.objects.create( - download_url='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', - type='maven', - namespace='', - name='wagon-api', - version='20040705.181715' + download_url="https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar", + type="maven", + namespace="", + name="wagon-api", + version="20040705.181715", ) self.package2 = Package.objects.create( - download_url='https://github.com/nexB/elf-inspector/raw/4333e1601229da87fa88961389d7397af6e027c4/tests/data/dwarf_and_elf/analyze.so.debug', - type='generic', - namespace='', - name='debug', - version='1.23' + download_url="https://github.com/nexB/elf-inspector/raw/4333e1601229da87fa88961389d7397af6e027c4/tests/data/dwarf_and_elf/analyze.so.debug", + type="generic", + namespace="", + name="debug", + version="1.23", ) def test_indexing_index_package_files(self): @@ -53,17 +53,16 @@ def 
test_indexing_index_package_files(self): self.assertEqual(0, Resource.objects.count()) scan_data_loc = self.get_test_loc( - 'indexing/scancodeio_wagon-api-20040705.181715.json') - with open(scan_data_loc, 'rb') as f: + "indexing/scancodeio_wagon-api-20040705.181715.json" + ) + with open(scan_data_loc, "rb") as f: scan_data = json.loads(f.read()) - indexing_errors = indexing.index_package_files( - self.package1, scan_data) + indexing_errors = indexing.index_package_files(self.package1, scan_data) self.assertEqual(0, len(indexing_errors)) self.assertEqual(11, ApproximateDirectoryContentIndex.objects.count()) - self.assertEqual( - 11, ApproximateDirectoryStructureIndex.objects.count()) + self.assertEqual(11, ApproximateDirectoryStructureIndex.objects.count()) self.assertEqual(2, ApproximateResourceContentIndex.objects.count()) self.assertEqual(45, ExactFileIndex.objects.count()) @@ -71,34 +70,38 @@ def test_indexing_index_package_files(self): self.assertEqual(64, len(resources)) resource_data = [r.to_dict() for r in resources] expected_resources_loc = self.get_test_loc( - 'indexing/scancodeio_wagon-api-20040705.181715-expected.json') + "indexing/scancodeio_wagon-api-20040705.181715-expected.json" + ) self.check_expected_results( - resource_data, expected_resources_loc, regen=FIXTURES_REGEN) + resource_data, expected_resources_loc, regen=FIXTURES_REGEN + ) def test_indexing_index_package(self): scan_data_loc = self.get_test_loc( - 'indexing/scancodeio_wagon-api-20040705.181715.json') - with open(scan_data_loc, 'rb') as f: + "indexing/scancodeio_wagon-api-20040705.181715.json" + ) + with open(scan_data_loc, "rb") as f: scan_data = json.load(f) scan_summary_loc = self.get_test_loc( - 'indexing/scancodeio_wagon-api-20040705.181715-summary.json') - with open(scan_summary_loc, 'rb') as f: + "indexing/scancodeio_wagon-api-20040705.181715-summary.json" + ) + with open(scan_summary_loc, "rb") as f: scan_summary = json.load(f) project_extra_data = { - 'md5': 'md5', - 'sha1': 'sha1', - 'sha256': 'sha256', - 'sha512': 'sha512', - 'size': 100, + "md5": "md5", + "sha1": "sha1", + "sha256": "sha256", + "sha512": "sha512", + "size": 100, } # Set up ScannableURI scannable_uri = ScannableURI.objects.create( - uri='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', + uri="https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar", scan_status=ScannableURI.SCAN_COMPLETED, - package=self.package1 + package=self.package1, ) # Ensure that we do not have any Package data updated, Resources, and fingerprints @@ -125,14 +128,14 @@ def test_indexing_index_package(self): ) # Make sure that Package data is updated + self.assertEqual("apache-2.0", self.package1.declared_license_expression) self.assertEqual( - 'apache-2.0', self.package1.declared_license_expression) - self.assertEqual( - 'Copyright (c) Apache Software Foundation', self.package1.copyright) - self.assertEqual('md5', self.package1.md5) - self.assertEqual('sha1', self.package1.sha1) - self.assertEqual('sha256', self.package1.sha256) - self.assertEqual('sha512', self.package1.sha512) + "Copyright (c) Apache Software Foundation", self.package1.copyright + ) + self.assertEqual("md5", self.package1.md5) + self.assertEqual("sha1", self.package1.sha1) + self.assertEqual("sha256", self.package1.sha256) + self.assertEqual("sha512", self.package1.sha512) self.assertEqual(100, self.package1.size) for expected_count, model in [ @@ -142,27 +145,25 @@ def test_indexing_index_package(self): (45, 
ExactFileIndex), ]: self.assertEqual( - expected_count, - model.objects.filter(package=self.package1).count() + expected_count, model.objects.filter(package=self.package1).count() ) def test_indexing_index_package_dwarf(self): - scan_data_loc = self.get_test_loc('indexing/get_scan_data_dwarf.json') - with open(scan_data_loc, 'rb') as f: + scan_data_loc = self.get_test_loc("indexing/get_scan_data_dwarf.json") + with open(scan_data_loc, "rb") as f: scan_data = json.load(f) - scan_summary_loc = self.get_test_loc( - 'indexing/scan_summary_dwarf.json') - with open(scan_summary_loc, 'rb') as f: + scan_summary_loc = self.get_test_loc("indexing/scan_summary_dwarf.json") + with open(scan_summary_loc, "rb") as f: scan_summary = json.load(f) project_extra_data = {} # Set up ScannableURI scannable_uri = ScannableURI.objects.create( - uri='https://github.com/nexB/elf-inspector/raw/4333e1601229da87fa88961389d7397af6e027c4/tests/data/dwarf_and_elf/analyze.so.debug', + uri="https://github.com/nexB/elf-inspector/raw/4333e1601229da87fa88961389d7397af6e027c4/tests/data/dwarf_and_elf/analyze.so.debug", scan_status=ScannableURI.SCAN_COMPLETED, - package=self.package2 + package=self.package2, ) # Run test diff --git a/minecode/tests/test_ls.py b/minecode/tests/test_ls.py index f8c2c8cc..a98af247 100644 --- a/minecode/tests/test_ls.py +++ b/minecode/tests/test_ls.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # purldb is a trademark of nexB Inc. @@ -12,30 +11,31 @@ import os -from minecode.utils_test import JsonBasedTesting -from minecode.tests import FIXTURES_REGEN from minecode import ls +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting class ParseDirectoryListingTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') -# maxDiff = None + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") + # maxDiff = None def test_remove_inode_works_with_no_space_at_line_start(self): - test = '12190083 4 drwxrwxr-x 4 svnwc svnwc 4096 May 4 15:57 ./perl' - expected = u'drwxrwxr-x 4 svnwc svnwc 4096 May 4 15:57 ./perl' + test = "12190083 4 drwxrwxr-x 4 svnwc svnwc 4096 May 4 15:57 ./perl" + expected = "drwxrwxr-x 4 svnwc svnwc 4096 May 4 15:57 ./perl" self.assertEqual(expected, ls.remove_inode(test)) def test_remove_inode_works_even_with_space_at_line_start(self): - test = ' 12190083 4 drwxrwxr-x 4 svnwc svnwc 4096 May 4 15:57 ./perl' - expected = u'drwxrwxr-x 4 svnwc svnwc 4096 May 4 15:57 ./perl' + test = " 12190083 4 drwxrwxr-x 4 svnwc svnwc 4096 May 4 15:57 ./perl" + expected = "drwxrwxr-x 4 svnwc svnwc 4096 May 4 15:57 ./perl" self.assertEqual(expected, ls.remove_inode(test)) - def check_listing(self, test_file, expected_file, from_find=True, regen=FIXTURES_REGEN): + def check_listing( + self, test_file, expected_file, from_find=True, regen=FIXTURES_REGEN + ): test_file = self.get_test_loc(test_file) test_text = open(test_file).read() - results = list(ls.parse_directory_listing( - test_text, from_find=from_find)) + results = list(ls.parse_directory_listing(test_text, from_find=from_find)) for r in results: if r.date: # we remove the year in YYYY-MM-DD to avoid date-sensitive test @@ -47,26 +47,30 @@ def check_listing(self, test_file, expected_file, from_find=True, regen=FIXTURES self.check_expected_results(results, expected_file, regen=regen) def test_parse_listing_from_findls(self): - test_file = 'directories/find-ls' - expected_file = 
'directories/find-ls-expected.json' - self.check_listing(test_file, expected_file, - from_find=True, regen=FIXTURES_REGEN) + test_file = "directories/find-ls" + expected_file = "directories/find-ls-expected.json" + self.check_listing( + test_file, expected_file, from_find=True, regen=FIXTURES_REGEN + ) def test_parse_listing_from_findls_from_apache_does_not_fail_on_first_line(self): - test_file = 'directories/find-ls-apache-start' - expected_file = 'directories/find-ls-apache-start-expected.json' - self.check_listing(test_file, expected_file, - from_find=True, regen=FIXTURES_REGEN) + test_file = "directories/find-ls-apache-start" + expected_file = "directories/find-ls-apache-start-expected.json" + self.check_listing( + test_file, expected_file, from_find=True, regen=FIXTURES_REGEN + ) def test_parse_listing_from_lslr(self): - test_file = 'directories/ls-lr' - expected_file = 'directories/ls-lr-expected.json' - self.check_listing(test_file, expected_file, - from_find=False, regen=FIXTURES_REGEN) + test_file = "directories/ls-lr" + expected_file = "directories/ls-lr-expected.json" + self.check_listing( + test_file, expected_file, from_find=False, regen=FIXTURES_REGEN + ) def test_parse_listing_from_lslr_at_ubuntu(self): - test_file = 'directories/ls-lr-ubuntu' - expected_file = 'directories/ls-lr-ubuntu-expected.json' + test_file = "directories/ls-lr-ubuntu" + expected_file = "directories/ls-lr-ubuntu-expected.json" self.maxDiff = None - self.check_listing(test_file, expected_file, - from_find=False, regen=FIXTURES_REGEN) + self.check_listing( + test_file, expected_file, from_find=False, regen=FIXTURES_REGEN + ) diff --git a/minecode/tests/test_migrations.py b/minecode/tests/test_migrations.py index 31f04c86..83ec1311 100644 --- a/minecode/tests/test_migrations.py +++ b/minecode/tests/test_migrations.py @@ -53,36 +53,36 @@ def test_populate_has_error_fields(self): "map_error", "has_visit_error", "visit_error", - ).order_by('uri') + ).order_by("uri") ) expected = [ { - 'has_map_error': True, - 'has_visit_error': True, - 'map_error': 'error', - 'uri': 'http://example.com/1', - 'visit_error': 'error' + "has_map_error": True, + "has_visit_error": True, + "map_error": "error", + "uri": "http://example.com/1", + "visit_error": "error", }, { - 'has_map_error': False, - 'has_visit_error': True, - 'map_error': None, - 'uri': 'http://example.com/2', - 'visit_error': 'error' + "has_map_error": False, + "has_visit_error": True, + "map_error": None, + "uri": "http://example.com/2", + "visit_error": "error", }, { - 'has_map_error': True, - 'has_visit_error': False, - 'map_error': 'error', - 'uri': 'http://example.com/3', - 'visit_error': None + "has_map_error": True, + "has_visit_error": False, + "map_error": "error", + "uri": "http://example.com/3", + "visit_error": None, }, { - 'has_map_error': False, - 'has_visit_error': False, - 'map_error': None, - 'uri': 'http://example.com/4', - 'visit_error': None + "has_map_error": False, + "has_visit_error": False, + "map_error": None, + "uri": "http://example.com/4", + "visit_error": None, }, ] self.assertEqual(results, expected) @@ -122,16 +122,17 @@ def test_set_is_visitable_for_maven_index_uris(self): ) expected = [ { - 'is_visitable': False, - 'uri': 'maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.8.jar' + "is_visitable": False, + "uri": "maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.8.jar", }, { - 'is_visitable': False, - 'uri': 
'maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.7.jar' - } + "is_visitable": False, + "uri": "maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.7.jar", + }, ] self.assertEqual(results, expected) + class TestSetIsVisitableForMavenIndexURIs(TestMigrations): app_name = "minecode" migrate_from = "0025_populate_has_error_fields" @@ -166,13 +167,13 @@ def test_set_is_visitable_for_maven_index_uris(self): ) expected = [ { - 'is_visitable': False, - 'uri': 'maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.8.jar' + "is_visitable": False, + "uri": "maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.8.jar", }, { - 'is_visitable': False, - 'uri': 'maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.7.jar' - } + "is_visitable": False, + "uri": "maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.7.jar", + }, ] self.assertEqual(results, expected) @@ -208,10 +209,10 @@ def test_replace_http_with_https_in_maven_uris(self): ) expected = [ { - 'uri': 'https://repo1.maven.org/maven2/xyz/upperlevel/command/spigot/spigot-command-api/1.1.1/spigot-command-api-1.1.1.pom' + "uri": "https://repo1.maven.org/maven2/xyz/upperlevel/command/spigot/spigot-command-api/1.1.1/spigot-command-api-1.1.1.pom" }, { - 'uri': 'https://repo1.maven.org/maven2/xyz/upperlevel/command/spigot/spigot-command-api/1.1.1/spigot-command-api-1.1.1.pom' - } + "uri": "https://repo1.maven.org/maven2/xyz/upperlevel/command/spigot/spigot-command-api/1.1.1/spigot-command-api-1.1.1.pom" + }, ] self.assertEqual(results, expected) diff --git a/minecode/tests/test_model_utils.py b/minecode/tests/test_model_utils.py index bf64067b..34e72fa4 100644 --- a/minecode/tests/test_model_utils.py +++ b/minecode/tests/test_model_utils.py @@ -10,44 +10,43 @@ import os from django.test import TransactionTestCase + from packagedcode.maven import _parse from minecode.model_utils import merge_or_create_package from minecode.model_utils import update_or_create_resource +from minecode.tests import FIXTURES_REGEN from minecode.utils_test import JsonBasedTesting from minecode.utils_test import MiningTestCase -from minecode.tests import FIXTURES_REGEN from packagedb.models import Package from packagedb.models import Resource class ModelUtilsTestCase(MiningTestCase, JsonBasedTesting): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): - pom_loc = self.get_test_loc('maven/pom/pulsar-2.5.1.pom') - self.scanned_package = _parse( - 'maven_pom', 'maven', 'Java', location=pom_loc) - self.scanned_package.download_url = 'https://repo1.maven.org/maven2/org/apache/pulsar/pulsar/2.5.1/pulsar-2.5.1.jar' + pom_loc = self.get_test_loc("maven/pom/pulsar-2.5.1.pom") + self.scanned_package = _parse("maven_pom", "maven", "Java", location=pom_loc) + self.scanned_package.download_url = "https://repo1.maven.org/maven2/org/apache/pulsar/pulsar/2.5.1/pulsar-2.5.1.jar" def test_merge_or_create_package_create_package(self): self.assertEqual(0, Package.objects.all().count()) package, created, merged, map_error = merge_or_create_package( - self.scanned_package, - visit_level=50 + self.scanned_package, visit_level=50 ) self.assertEqual(1, Package.objects.all().count()) self.assertEqual(package, Package.objects.all().first()) self.assertTrue(created) self.assertFalse(merged) - self.assertEqual('', map_error) + 
self.assertEqual("", map_error) self.assertTrue(package.created_date) self.assertTrue(package.last_modified_date) - expected_loc = self.get_test_loc('model_utils/created_package.json') + expected_loc = self.get_test_loc("model_utils/created_package.json") self.check_expected_results( package.to_dict(), expected_loc, - fields_to_remove=['package_sets'], + fields_to_remove=["package_sets"], regen=FIXTURES_REGEN, ) @@ -55,53 +54,55 @@ def test_merge_or_create_package_merge_package(self): # ensure fields get updated # ensure history is properly updated package = Package.objects.create( - type='maven', - namespace='org.apache.pulsar', - name='pulsar', - version='2.5.1', - download_url='https://repo1.maven.org/maven2/org/apache/pulsar/pulsar/2.5.1/pulsar-2.5.1.jar', + type="maven", + namespace="org.apache.pulsar", + name="pulsar", + version="2.5.1", + download_url="https://repo1.maven.org/maven2/org/apache/pulsar/pulsar/2.5.1/pulsar-2.5.1.jar", ) - before_merge_loc = self.get_test_loc('model_utils/before_merge.json') + before_merge_loc = self.get_test_loc("model_utils/before_merge.json") self.check_expected_results( package.to_dict(), before_merge_loc, - fields_to_remove=['package_sets'], + fields_to_remove=["package_sets"], regen=FIXTURES_REGEN, ) package, created, merged, map_error = merge_or_create_package( - self.scanned_package, - visit_level=50 + self.scanned_package, visit_level=50 ) self.assertEqual(1, Package.objects.all().count()) self.assertEqual(package, Package.objects.all().first()) self.assertFalse(created) self.assertTrue(merged) - self.assertEqual('', map_error) - expected_loc = self.get_test_loc('model_utils/after_merge.json') + self.assertEqual("", map_error) + expected_loc = self.get_test_loc("model_utils/after_merge.json") self.check_expected_results( package.to_dict(), expected_loc, - fields_to_remove=['package_sets'], + fields_to_remove=["package_sets"], regen=FIXTURES_REGEN, ) history = package.get_history() self.assertEqual(1, len(history)) entry = history[0] - timestamp = entry['timestamp'] - message = entry['message'] + timestamp = entry["timestamp"] + message = entry["message"] self.assertEqual( - 'Package field values have been updated.', + "Package field values have been updated.", message, ) last_modified_date_formatted = package.last_modified_date.strftime( - "%Y-%m-%d-%H:%M:%S") + "%Y-%m-%d-%H:%M:%S" + ) self.assertEqual(timestamp, last_modified_date_formatted) - data = entry['data'] - updated_fields = data['updated_fields'] + data = entry["data"] + updated_fields = data["updated_fields"] expected_updated_fields_loc = self.get_test_loc( - 'model_utils/expected_updated_fields.json') + "model_utils/expected_updated_fields.json" + ) self.check_expected_results( - updated_fields, expected_updated_fields_loc, regen=FIXTURES_REGEN) + updated_fields, expected_updated_fields_loc, regen=FIXTURES_REGEN + ) class UpdateORCreateResourceTest(TransactionTestCase): diff --git a/minecode/tests/test_models.py b/minecode/tests/test_models.py index 7ef88672..4e528e5a 100644 --- a/minecode/tests/test_models.py +++ b/minecode/tests/test_models.py @@ -14,28 +14,30 @@ from django.utils import timezone from minecode.models import ResourceURI -from packagedb.models import Package -from minecode.models import get_canonical from minecode.models import ScannableURI +from minecode.models import get_canonical +from packagedb.models import Package class ResourceURIModelTestCase(TestCase): - def setUp(self): self.res = ResourceURI.objects.insert( - 
uri='http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom') + uri="http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom" + ) def test_get_canonical(self): data = ( - ('http://www.nexb.com', 'http://www.nexb.com/'), - ('http://www.nexb.com/', 'http://www.nexb.com/'), - ('http://www.nexb.com/a/b/../../c/', 'http://www.nexb.com/c/'), - ('http://www.nexb.com:80', 'http://www.nexb.com/'), - ('https://www.nexb.com:443', 'https://www.nexb.com/'), - ('http://www.nexb.com:443', 'http://www.nexb.com:443/'), - ('https://www.nexb.com:80', 'https://www.nexb.com:80/'), - ('http://www.nexb.com/A 0.0.1 Alpha/a_0_0_1.zip', - 'http://www.nexb.com/A%200.0.1%20Alpha/a_0_0_1.zip'), + ("http://www.nexb.com", "http://www.nexb.com/"), + ("http://www.nexb.com/", "http://www.nexb.com/"), + ("http://www.nexb.com/a/b/../../c/", "http://www.nexb.com/c/"), + ("http://www.nexb.com:80", "http://www.nexb.com/"), + ("https://www.nexb.com:443", "https://www.nexb.com/"), + ("http://www.nexb.com:443", "http://www.nexb.com:443/"), + ("https://www.nexb.com:80", "https://www.nexb.com:80/"), + ( + "http://www.nexb.com/A 0.0.1 Alpha/a_0_0_1.zip", + "http://www.nexb.com/A%200.0.1%20Alpha/a_0_0_1.zip", + ), ) for test, expected in data: self.assertEqual(expected, get_canonical(test)) @@ -43,23 +45,24 @@ def test_get_canonical(self): def test_is_routable_flags_are_not_overwritten_on_save(self): self.assertTrue(self.res.is_visitable) self.assertTrue(self.res.is_mappable) - self.res.sha1 = 'a' * 40 + self.res.sha1 = "a" * 40 self.res.save() res1 = ResourceURI.objects.get( - uri='http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom') + uri="http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom" + ) self.assertTrue(res1.is_visitable) self.assertTrue(res1.is_mappable) res1.save() res2 = ResourceURI.objects.get( - uri='http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom') + uri="http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom" + ) self.assertTrue(res2.is_visitable) self.assertTrue(res2.is_mappable) class ResourceURIManagerTestCase(TestCase): - def setUp(self): - self.uri = 'https://sourceforge.net/sitemap.xml' + self.uri = "https://sourceforge.net/sitemap.xml" self.resource = ResourceURI.objects.insert(uri=self.uri, priority=100) def test_insert(self): @@ -94,7 +97,7 @@ def test_successful(self): self.resource.last_visit_date = timezone.now() self.resource.save() self.assertTrue(ResourceURI.objects.successfully_visited()) - self.resource.visit_error = 'error' + self.resource.visit_error = "error" self.resource.save() self.assertFalse(ResourceURI.objects.successfully_visited()) @@ -103,7 +106,7 @@ def test_unsuccessful(self): self.resource.last_visit_date = timezone.now() self.resource.save() self.assertFalse(ResourceURI.objects.unsuccessfully_visited()) - self.resource.visit_error = 'error' + self.resource.visit_error = "error" self.resource.save() self.assertTrue(ResourceURI.objects.unsuccessfully_visited()) @@ -111,152 +114,121 @@ def test_needs_revisit_force_revisit_at_0_hours(self): self.resource.last_visit_date = timezone.now() self.resource.save() - self.assertTrue(ResourceURI.objects.needs_revisit( - uri=self.uri, hours=0)) + self.assertTrue(ResourceURI.objects.needs_revisit(uri=self.uri, hours=0)) def test_needs_revisit_very_old_visit(self): self.resource.last_visit_date = timezone.now() - timedelta(days=20) self.resource.save() - self.assertTrue(ResourceURI.objects.needs_revisit( - uri=self.uri, hours=240)) + 
self.assertTrue(ResourceURI.objects.needs_revisit(uri=self.uri, hours=240)) def test_needs_revisit_near_visit(self): self.resource.last_visit_date = timezone.now() - timedelta(hours=3) self.resource.save() - self.assertTrue(ResourceURI.objects.needs_revisit( - uri=self.uri, hours=2)) + self.assertTrue(ResourceURI.objects.needs_revisit(uri=self.uri, hours=2)) def test_needs_revisit_recent_visit(self): self.resource.last_visit_date = timezone.now() self.resource.save() - self.assertFalse( - ResourceURI.objects.needs_revisit(uri=self.uri, hours=2)) + self.assertFalse(ResourceURI.objects.needs_revisit(uri=self.uri, hours=2)) def test_needs_revisit_never_been_visited(self): - self.assertFalse(ResourceURI.objects.needs_revisit( - uri=self.uri, hours=200)) + self.assertFalse(ResourceURI.objects.needs_revisit(uri=self.uri, hours=200)) class ResourceURIManagerGetRevisitablesUnmappableURITestCase(TestCase): - def setUp(self): - self.uri = 'https://sourceforge.net/sitemap.xml' + self.uri = "https://sourceforge.net/sitemap.xml" self.resource = ResourceURI.objects.insert(uri=self.uri, priority=100) def test_get_revisitables_last_visit_date_now(self): self.resource.last_visit_date = timezone.now() self.resource.save() - self.assertEqual( - 1, ResourceURI.objects.get_revisitables(hours=0).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=1).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=240).count()) + self.assertEqual(1, ResourceURI.objects.get_revisitables(hours=0).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=1).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=240).count()) def test_get_revisitables_last_visit_date_10_days_ago(self): self.resource.last_visit_date = timezone.now() - timedelta(hours=240) self.resource.save() - self.assertEqual( - 1, ResourceURI.objects.get_revisitables(hours=0).count()) - self.assertEqual( - 1, ResourceURI.objects.get_revisitables(hours=1).count()) - self.assertEqual( - 1, ResourceURI.objects.get_revisitables(hours=240).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=241).count()) + self.assertEqual(1, ResourceURI.objects.get_revisitables(hours=0).count()) + self.assertEqual(1, ResourceURI.objects.get_revisitables(hours=1).count()) + self.assertEqual(1, ResourceURI.objects.get_revisitables(hours=240).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=241).count()) class ResourceURIManagerGetRevisitablesMappableURITestCase(TestCase): - def setUp(self): # this is a mappable ResourceURI - self.uri = 'http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom' + self.uri = ( + "http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom" + ) self.resource = ResourceURI.objects.insert(uri=self.uri, priority=100) def test_get_revisitables_unmapped_last_visit_date_now(self): self.resource.last_visit_date = timezone.now() self.resource.save() - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=0).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=1).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=240).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=0).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=1).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=240).count()) def test_get_revisitables_unmapped_last_visit_date_less_than_threshold(self): 
self.resource.last_visit_date = timezone.now() self.resource.save() - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=1).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=1).count()) def test_get_revisitables_unmapped_last_visit_date_10_days_ago(self): self.resource.last_visit_date = timezone.now() - timedelta(hours=240) self.resource.save() - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=0).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=1).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=240).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=241).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=0).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=1).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=240).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=241).count()) def test_get_revisitables_mapped_last_visit_date_now(self): self.resource.last_visit_date = timezone.now() self.resource.last_map_date = timezone.now() self.resource.save() - self.assertEqual( - 1, ResourceURI.objects.get_revisitables(hours=0).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=1).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=240).count()) + self.assertEqual(1, ResourceURI.objects.get_revisitables(hours=0).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=1).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=240).count()) def test_get_revisitables_mapped_last_visit_date_less_than_threshold(self): self.resource.last_visit_date = timezone.now() self.resource.last_map_date = timezone.now() self.resource.save() - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=1).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=1).count()) def test_get_revisitables_mapped_last_visit_date_10_days_ago(self): self.resource.last_visit_date = timezone.now() - timedelta(hours=240) self.resource.last_map_date = timezone.now() self.resource.save() - self.assertEqual( - 1, ResourceURI.objects.get_revisitables(hours=0).count()) - self.assertEqual( - 1, ResourceURI.objects.get_revisitables(hours=1).count()) - self.assertEqual( - 1, ResourceURI.objects.get_revisitables(hours=240).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=241).count()) + self.assertEqual(1, ResourceURI.objects.get_revisitables(hours=0).count()) + self.assertEqual(1, ResourceURI.objects.get_revisitables(hours=1).count()) + self.assertEqual(1, ResourceURI.objects.get_revisitables(hours=240).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=241).count()) class ResourceURIManagerGetNextVisitableUnmappableURITestCase(TestCase): - def setUp(self): - self.uri0 = 'https://sourceforge.net/sitemap.xml' - self.uri1 = 'https://sourceforge.net/sitemap-0.xml' + self.uri0 = "https://sourceforge.net/sitemap.xml" + self.uri1 = "https://sourceforge.net/sitemap-0.xml" self.resource0 = ResourceURI.objects.insert(uri=self.uri0, priority=1) self.resource1 = ResourceURI.objects.insert(uri=self.uri1, priority=2) def test_get_next_visitable_unvisited(self): - self.assertEqual( - self.resource1, ResourceURI.objects.get_next_visitable()) - self.assertEqual( - self.resource0, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource1, 
ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource0, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) def test_get_next_visitable_none_when_both_visited_less_than_10_days_ago(self): @@ -273,20 +245,19 @@ def test_get_next_visitable_when_both_visited_10_days_ago(self): self.resource0.save() self.resource1.save() - self.assertEqual( - self.resource1, ResourceURI.objects.get_next_visitable()) - self.assertEqual( - self.resource0, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource1, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource0, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) - def test_get_next_visitable_when_one_unvisited_and_one_visited_less_than_10_days_ago(self): + def test_get_next_visitable_when_one_unvisited_and_one_visited_less_than_10_days_ago( + self, + ): self.resource0.last_visit_date = None self.resource1.last_visit_date = timezone.now() - timedelta(hours=24) self.resource0.save() self.resource1.save() - self.assertEqual( - self.resource0, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource0, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) self.resource0.last_visit_date = timezone.now() - timedelta(hours=24) @@ -294,18 +265,18 @@ def test_get_next_visitable_when_one_unvisited_and_one_visited_less_than_10_days self.resource0.save() self.resource1.save() - self.assertEqual( - self.resource1, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource1, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) - def test_get_next_visitable_when_one_visited_more_and_one_visited_less_than_10_days_ago(self): + def test_get_next_visitable_when_one_visited_more_and_one_visited_less_than_10_days_ago( + self, + ): self.resource0.last_visit_date = timezone.now() - timedelta(hours=250) self.resource1.last_visit_date = timezone.now() - timedelta(hours=24) self.resource0.save() self.resource1.save() - self.assertEqual( - self.resource0, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource0, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) self.resource0.last_visit_date = timezone.now() - timedelta(hours=24) @@ -313,27 +284,25 @@ def test_get_next_visitable_when_one_visited_more_and_one_visited_less_than_10_d self.resource0.save() self.resource1.save() - self.assertEqual( - self.resource1, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource1, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) class ResourceURIManagerGetNextVisitableMappableURITestCase(TestCase): - def setUp(self): # this is a mappable ResourceURI - self.uri0 = 'http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom' - self.uri1 = 'http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.1/mav-all-1.1.pom' - self.resource0 = ResourceURI.objects.insert( - uri=self.uri0, priority=100) - self.resource1 = ResourceURI.objects.insert( - uri=self.uri1, priority=100) + self.uri0 = ( + "http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom" + ) + self.uri1 = ( + "http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.1/mav-all-1.1.pom" + ) + self.resource0 = ResourceURI.objects.insert(uri=self.uri0, priority=100) + self.resource1 = 
ResourceURI.objects.insert(uri=self.uri1, priority=100) def test_get_next_visitable_unvisited(self): - self.assertEqual( - self.resource1, ResourceURI.objects.get_next_visitable()) - self.assertEqual( - self.resource0, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource1, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource0, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) def test_get_next_visitable_visited_unmapped(self): @@ -352,10 +321,8 @@ def test_get_next_visitable_visited_10_days_ago_mapped(self): self.resource0.save() self.resource1.save() - self.assertEqual( - self.resource1, ResourceURI.objects.get_next_visitable()) - self.assertEqual( - self.resource0, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource1, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource0, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) def test_get_next_visitable_visited_10_days_ago_one_unmapped(self): @@ -365,8 +332,7 @@ def test_get_next_visitable_visited_10_days_ago_one_unmapped(self): self.resource0.save() self.resource1.save() - self.assertEqual( - self.resource0, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource0, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) self.resource0.last_map_date = None @@ -374,8 +340,7 @@ def test_get_next_visitable_visited_10_days_ago_one_unmapped(self): self.resource0.save() self.resource1.save() - self.assertEqual( - self.resource1, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource1, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) def test_get_next_visitable_recently_visited_mapped(self): @@ -390,14 +355,15 @@ def test_get_next_visitable_recently_visited_mapped(self): class ResourceURIManagerGetMappablesTestCase(TestCase): - def setUp(self): - self.uri1 = 'maven-index://repo1.maven.org/o/a/this.jar' - self.uri2 = 'maven-index://repo1.maven.org/o/a/thisother.jar' + self.uri1 = "maven-index://repo1.maven.org/o/a/this.jar" + self.uri2 = "maven-index://repo1.maven.org/o/a/thisother.jar" self.resource1 = ResourceURI.objects.create( - uri=self.uri1, priority=1, last_visit_date=timezone.now()) + uri=self.uri1, priority=1, last_visit_date=timezone.now() + ) self.resource2 = ResourceURI.objects.create( - uri=self.uri2, priority=2, last_visit_date=timezone.now()) + uri=self.uri2, priority=2, last_visit_date=timezone.now() + ) def test_get_mappables(self): assert self.resource1.is_mappable @@ -406,14 +372,13 @@ def test_get_mappables(self): self.resource1.last_map_date = timezone.now() self.resource1.save() resource1 = ResourceURI.objects.get(id=self.resource1.id) - self.assertEqual([self.resource2], list( - ResourceURI.objects.get_mappables())) + self.assertEqual([self.resource2], list(ResourceURI.objects.get_mappables())) def test_get_mappables__map_error_must_make_a_resourceuri_non_mappable(self): assert self.resource1.is_mappable self.assertEqual(2, ResourceURI.objects.get_mappables().count()) - self.resource1.map_error = 'Some error happened' - self.resource2.map_error = 'Some error happened' + self.resource1.map_error = "Some error happened" + self.resource2.map_error = "Some error happened" self.resource1.save() self.resource2.save() resource1 = ResourceURI.objects.get(id=self.resource1.id) @@ -422,26 +387,42 @@ def 
test_get_mappables__map_error_must_make_a_resourceuri_non_mappable(self): class ScannableURIManagerTestCase(TestCase): def setUp(self): - self.test_uri1 = 'http://example.com' + self.test_uri1 = "http://example.com" self.test_package1 = Package.objects.create( - download_url=self.test_uri1, name='Foo', version='12.34') - self.scannable_uri1 = ScannableURI.objects.create(uri=self.test_uri1, package=self.test_package1, - scan_status=ScannableURI.SCAN_NEW) - self.test_uri2 = 'http://elpmaxe.com' + download_url=self.test_uri1, name="Foo", version="12.34" + ) + self.scannable_uri1 = ScannableURI.objects.create( + uri=self.test_uri1, + package=self.test_package1, + scan_status=ScannableURI.SCAN_NEW, + ) + self.test_uri2 = "http://elpmaxe.com" self.test_package2 = Package.objects.create( - download_url=self.test_uri2, name='Bar', version='11.75') - self.scannable_uri2 = ScannableURI.objects.create(uri=self.test_uri2, package=self.test_package2, - scan_status=ScannableURI.SCAN_SUBMITTED) - self.test_uri3 = 'http://nexb.com' + download_url=self.test_uri2, name="Bar", version="11.75" + ) + self.scannable_uri2 = ScannableURI.objects.create( + uri=self.test_uri2, + package=self.test_package2, + scan_status=ScannableURI.SCAN_SUBMITTED, + ) + self.test_uri3 = "http://nexb.com" self.test_package3 = Package.objects.create( - download_url=self.test_uri3, name='Baz', version='5') - self.scannable_uri3 = ScannableURI.objects.create(uri=self.test_uri3, package=self.test_package3, - scan_status=ScannableURI.SCAN_IN_PROGRESS) - self.test_uri4 = 'http://realsite.com' + download_url=self.test_uri3, name="Baz", version="5" + ) + self.scannable_uri3 = ScannableURI.objects.create( + uri=self.test_uri3, + package=self.test_package3, + scan_status=ScannableURI.SCAN_IN_PROGRESS, + ) + self.test_uri4 = "http://realsite.com" self.test_package4 = Package.objects.create( - download_url=self.test_uri4, name='Qux', version='87') - self.scannable_uri4 = ScannableURI.objects.create(uri=self.test_uri4, package=self.test_package4, - scan_status=ScannableURI.SCAN_COMPLETED) + download_url=self.test_uri4, name="Qux", version="87" + ) + self.scannable_uri4 = ScannableURI.objects.create( + uri=self.test_uri4, + package=self.test_package4, + scan_status=ScannableURI.SCAN_COMPLETED, + ) def test_ScannableURIManager_get_scannables(self): result = ScannableURI.objects.get_scannables() @@ -470,21 +451,24 @@ def test_ScannableURI_get_next_processable(self): class ScannableURIModelTestCase(TestCase): def setUp(self): - self.test_uri = 'http://example.com' + self.test_uri = "http://example.com" self.test_package = Package.objects.create( - download_url=self.test_uri, name='Foo', version='12.34') + download_url=self.test_uri, name="Foo", version="12.34" + ) def test_ScannableURI_create_basic_record(self): scannable_uri = ScannableURI.objects.create( - uri=self.test_uri, package=self.test_package) + uri=self.test_uri, package=self.test_package + ) result = ScannableURI.objects.get(uri=self.test_uri) self.assertEqual(self.test_uri, result.uri) self.assertEqual(self.test_package, result.package) def test_ScannableURI_save(self): - test_error_message = 'error' + test_error_message = "error" scannable_uri = ScannableURI.objects.create( - uri=self.test_uri, package=self.test_package) + uri=self.test_uri, package=self.test_package + ) self.assertFalse(scannable_uri.scan_error) scannable_uri.scan_error = test_error_message scannable_uri.save() @@ -492,9 +476,8 @@ def test_ScannableURI_save(self): self.assertEqual(test_error_message, 
result.scan_error) def test_ScannableURI_save_set_canonical_uri(self): - scannable_uri = ScannableURI( - uri=self.test_uri, package=self.test_package) + scannable_uri = ScannableURI(uri=self.test_uri, package=self.test_package) self.assertFalse(scannable_uri.canonical) scannable_uri.save() result = ScannableURI.objects.get(uri=self.test_uri) - self.assertEqual('http://example.com/', result.canonical) + self.assertEqual("http://example.com/", result.canonical) diff --git a/minecode/tests/test_priority_queue.py b/minecode/tests/test_priority_queue.py index 2c94f02d..45cf7b72 100644 --- a/minecode/tests/test_priority_queue.py +++ b/minecode/tests/test_priority_queue.py @@ -9,9 +9,10 @@ from django.test import TestCase as DjangoTestCase -from minecode.utils_test import JsonBasedTesting -from minecode.models import PriorityResourceURI + from minecode.management.commands import priority_queue +from minecode.models import PriorityResourceURI +from minecode.utils_test import JsonBasedTesting from packagedb.models import Package @@ -20,10 +21,10 @@ def test_process_request(self): package_count = Package.objects.all().count() self.assertEqual(0, package_count) - purl_str = 'pkg:maven/org.apache.twill/twill-core@0.12.0' - download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar' - purl_sources_str = f'{purl_str}?classifier=sources' - sources_download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar' + purl_str = "pkg:maven/org.apache.twill/twill-core@0.12.0" + download_url = "https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar" + purl_sources_str = f"{purl_str}?classifier=sources" + sources_download_url = "https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar" p = PriorityResourceURI.objects.create(uri=purl_str) priority_queue.process_request(p) @@ -32,12 +33,7 @@ def test_process_request(self): self.assertEqual(2, package_count) purls = [ - (package.purl, package.download_url) - for package in Package.objects.all() + (package.purl, package.download_url) for package in Package.objects.all() ] - self.assertIn( - (purl_str, download_url), purls - ) - self.assertIn( - (purl_sources_str, sources_download_url), purls - ) + self.assertIn((purl_str, download_url), purls) + self.assertIn((purl_sources_str, sources_download_url), purls) diff --git a/minecode/tests/test_route.py b/minecode/tests/test_route.py index 2a20ce7c..e719932d 100644 --- a/minecode/tests/test_route.py +++ b/minecode/tests/test_route.py @@ -15,53 +15,56 @@ class RouteTest(TestCase): - def test_rule(self): - self.assertRaises(AssertionError, Rule, '', '') + self.assertRaises(AssertionError, Rule, "", "") - class non_callable(object): + class non_callable: pass - self.assertRaises(AssertionError, Rule, 'abc', non_callable) + self.assertRaises(AssertionError, Rule, "abc", non_callable) - class RoutableClass(object): - """ A callable class can be routed.""" + class RoutableClass: + """A callable class can be routed.""" def __call__(self): pass ca = RoutableClass() - Rule('asas', ca) - Rule('asas', RoutableClass) + Rule("asas", ca) + Rule("asas", RoutableClass) def func(): pass - Rule('asas', func) + Rule("asas", func) import re - invalid_regex = '(({wewew' + + invalid_regex = "(({wewew" self.assertRaises(re.error, Rule, invalid_regex, func) def test_class_routing(self): uris = route.Router() - @uris.route('this') - class CallableClass(object): - """ A callable class 
can be routed.""" + @uris.route("this") + class CallableClass: + """A callable class can be routed.""" def __call__(self, uri, *args, **kwargs): return uri - self.assertEqual('this', uris.process('this')) + self.assertEqual("this", uris.process("this")) - def test_that_each_processing_of_routed_class_is_done_with_a_new_instance_that_does_not_share_state(self): + def test_that_each_processing_of_routed_class_is_done_with_a_new_instance_that_does_not_share_state( + self, + ): import time + uris = route.Router() - @uris.route('this', 'that') - class CallableClass(object): - """ A callable class can be routed.""" + @uris.route("this", "that") + class CallableClass: + """A callable class can be routed.""" def __init__(self): # some more or less unique thing for a given instance @@ -72,25 +75,23 @@ def __call__(self, uri): return self.ts # ensure that two routes with the same object are same class - thi = uris.resolve('this') - thi2 = uris.resolve('this') + thi = uris.resolve("this") + thi2 = uris.resolve("this") self.assertTrue(thi is thi2) - tha = uris.resolve('that') + tha = uris.resolve("that") self.assertTrue(thi is tha) # ensure that processing of routes for the same registered class # is done by different objects with different state - p1 = uris.process('this') - p2 = uris.process('this') + p1 = uris.process("this") + p2 = uris.process("this") self.assertNotEqual(p1, p2) - p3 = uris.process('this') - p4 = uris.process('that') + p3 = uris.process("this") + p4 = uris.process("that") self.assertNotEqual(p3, p4) def test_that_subclasses_are_routed_correctly_with_append_to_route(self): - - class CallableParentClass(object): - + class CallableParentClass: def __call__(self, uri): return self.myfunc() @@ -98,31 +99,27 @@ def myfunc(self): pass class CallableSubClass1(CallableParentClass): - def myfunc(self): - return 'done1' + return "done1" class CallableSubClass2(CallableParentClass): - def myfunc(self): - return 'done2' + return "done2" uris = route.Router() - uris.append('base', CallableParentClass) - uris.append('this', CallableSubClass1) - uris.append('that', CallableSubClass2) + uris.append("base", CallableParentClass) + uris.append("this", CallableSubClass1) + uris.append("that", CallableSubClass2) - self.assertEqual(None, uris.process('base')) - self.assertEqual('done1', uris.process('this')) - self.assertEqual('done2', uris.process('that')) + self.assertEqual(None, uris.process("base")) + self.assertEqual("done1", uris.process("this")) + self.assertEqual("done2", uris.process("that")) def test_that_subclasses_are_routed_correctly_with_class_decorator(self): uris = route.Router() - class CallableParentClass(object): - """ - Note: The parent class CANNOT be decorated. Only subclasses can - """ + class CallableParentClass: + """Note: The parent class CANNOT be decorated. 
Only subclasses can""" def __call__(self, uri): return self.myfunc() @@ -130,139 +127,134 @@ def __call__(self, uri): def myfunc(self): raise NotImplementedError - @uris.route('this') + @uris.route("this") class CallableSubClass1(CallableParentClass): - def myfunc(self): - return 'done1' + return "done1" - @uris.route('that') + @uris.route("that") class CallableSubClass2(CallableParentClass): - def __call__(self, uri): - return 'done3' + return "done3" - self.assertEqual('done1', uris.process('this')) - self.assertEqual('done3', uris.process('that')) + self.assertEqual("done1", uris.process("this")) + self.assertEqual("done3", uris.process("that")) def test_rule_match(self): - def func(uri): pass - r = Rule('asas', func) - self.assertTrue(r.match('asas')) - self.assertFalse(r.match('bbb')) + r = Rule("asas", func) + self.assertTrue(r.match("asas")) + self.assertFalse(r.match("bbb")) - r = Rule('.*abc', func) - self.assertTrue(r.match('abc')) - self.assertTrue(r.match('123abc')) - self.assertFalse(r.match('bbb')) - self.assertFalse(r.match('abcXYZ')) + r = Rule(".*abc", func) + self.assertTrue(r.match("abc")) + self.assertTrue(r.match("123abc")) + self.assertFalse(r.match("bbb")) + self.assertFalse(r.match("abcXYZ")) - r = Rule('https*://', func) - self.assertTrue(r.match('http://')) - self.assertTrue(r.match('https://')) + r = Rule("https*://", func) + self.assertTrue(r.match("http://")) + self.assertTrue(r.match("https://")) def test_routing_resolving_and_exceptions(self): uris = route.Router() - @uris.route(r'http://nexb\.com') + @uris.route(r"http://nexb\.com") def myroute(uri): pass - @uris.route(r'http://nexb\.com.*') + @uris.route(r"http://nexb\.com.*") def myroute2(uri): pass - self.assertRaises(route.RouteAlreadyDefined, uris.append, - r'http://nexb\.com', myroute) - self.assertRaises(route.RouteAlreadyDefined, uris.append, - r'http://nexb\.com', myroute) + self.assertRaises( + route.RouteAlreadyDefined, uris.append, r"http://nexb\.com", myroute + ) + self.assertRaises( + route.RouteAlreadyDefined, uris.append, r"http://nexb\.com", myroute + ) - self.assertRaises(route.MultipleRoutesDefined, uris.resolve, - r'http://nexb.com') - self.assertRaises(route.NoRouteAvailable, uris.resolve, 'impossible') + self.assertRaises(route.MultipleRoutesDefined, uris.resolve, r"http://nexb.com") + self.assertRaises(route.NoRouteAvailable, uris.resolve, "impossible") def test_route_resolution_and_execution(self): uris = route.Router() - @uris.route(r'http://nexb\.com') + @uris.route(r"http://nexb\.com") def myroute(uri): - return 'r1' + return "r1" - u1 = 'http://nexb.com' - self.assertEqual('r1', myroute(u1)) + u1 = "http://nexb.com" + self.assertEqual("r1", myroute(u1)) - @uris.route(r'http://dejacode\.com') + @uris.route(r"http://dejacode\.com") def myroute2(uri): - return 'r2' + return "r2" - u1 = 'http://nexb.com' + u1 = "http://nexb.com" self.assertEqual(myroute.__name__, uris.resolve(u1).__name__) # these three calls are equivalent: the uri determines what is executed - self.assertEqual('r1', myroute(u1)) - self.assertEqual('r1', myroute2(u1)) - self.assertEqual('r1', uris.process(u1)) + self.assertEqual("r1", myroute(u1)) + self.assertEqual("r1", myroute2(u1)) + self.assertEqual("r1", uris.process(u1)) - u2 = 'http://dejacode.com' + u2 = "http://dejacode.com" self.assertEqual(myroute2.__name__, uris.resolve(u2).__name__) # these three calls are equivalent: the uri determines what is executed - self.assertEqual('r2', myroute2(u2)) - self.assertEqual('r2', myroute(u2)) - self.assertEqual('r2', 
uris.process(u2)) + self.assertEqual("r2", myroute2(u2)) + self.assertEqual("r2", myroute(u2)) + self.assertEqual("r2", uris.process(u2)) def test_that_multiple_patterns_can_be_used_in_a_route_decorator(self): uris = route.Router() - @uris.route(r'http://nexb\.com', - r'http://deja\.com') + @uris.route(r"http://nexb\.com", r"http://deja\.com") def myroute(uri): - return 'r1' + return "r1" - u1 = 'http://nexb.com' - self.assertEqual('r1', myroute(u1)) - u1 = 'http://deja.com' - self.assertEqual('r1', myroute(u1)) + u1 = "http://nexb.com" + self.assertEqual("r1", myroute(u1)) + u1 = "http://deja.com" + self.assertEqual("r1", myroute(u1)) def test_translate_globs_can_be_used_instead_of_regex_patterns(self): uris = route.Router() from fnmatch import translate - @uris.route(translate('http://nexb.com/')) + @uris.route(translate("http://nexb.com/")) def myroute(uri): - return 'r1' + return "r1" - u1 = 'http://nexb.com/' - self.assertEqual('r1', myroute(u1)) + u1 = "http://nexb.com/" + self.assertEqual("r1", myroute(u1)) - @uris.route(translate('http://nexb.com/*/*/')) + @uris.route(translate("http://nexb.com/*/*/")) def myroute2(uri): - return 'r2' + return "r2" - u1 = 'http://nexb.com/somepath/otherpath/' - self.assertEqual('r2', myroute(u1)) - u1 = 'http://nexb.com/somepath/yetanotherotherpath/' - self.assertEqual('r2', myroute(u1)) + u1 = "http://nexb.com/somepath/otherpath/" + self.assertEqual("r2", myroute(u1)) + u1 = "http://nexb.com/somepath/yetanotherotherpath/" + self.assertEqual("r2", myroute(u1)) def test_is_routable(self): uris = route.Router() - @uris.route(r'http://nexb\.com', - r'http://deja\.com') + @uris.route(r"http://nexb\.com", r"http://deja\.com") def myroute(uri): pass - @uris.route(r'http://nexc\.com', - r'http://dejb\.com') + @uris.route(r"http://nexc\.com", r"http://dejb\.com") def myroute2(uri): pass - self.assertTrue(uris.is_routable('http://nexb.com')) - self.assertTrue(uris.is_routable('http://deja.com')) - self.assertTrue(uris.is_routable('http://nexc.com')) - self.assertTrue(uris.is_routable('http://dejb.com')) - self.assertFalse(uris.is_routable('https://deja.com')) + self.assertTrue(uris.is_routable("http://nexb.com")) + self.assertTrue(uris.is_routable("http://deja.com")) + self.assertTrue(uris.is_routable("http://nexc.com")) + self.assertTrue(uris.is_routable("http://dejb.com")) + self.assertFalse(uris.is_routable("https://deja.com")) diff --git a/minecode/tests/test_rsync.py b/minecode/tests/test_rsync.py index 7a3901e4..9e768b98 100644 --- a/minecode/tests/test_rsync.py +++ b/minecode/tests/test_rsync.py @@ -8,47 +8,53 @@ # # -from unittest import skipIf import os +from unittest import skipIf -from minecode import rsync from minecode import ON_WINDOWS +from minecode import rsync from minecode.utils_test import MiningTestCase class RsyncTest(MiningTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") def test_modules(self): - inp = self.get_test_loc('rsync/rsync_modules') + inp = self.get_test_loc("rsync/rsync_modules") output = list(rsync.modules(inp)) - expected = '''apache CPAN CTAN eclipse flightgear gnualpha gnuftp - mozdev mozilla opencsw simgear sugar xemacs'''.split() + expected = """apache CPAN CTAN eclipse flightgear gnualpha gnuftp + mozdev mozilla opencsw simgear sugar xemacs""".split() self.assertEqual(expected, output) def test_entry_rsync_31(self): # $ rsync --no-motd --recursive rsync/rsync_dir/ lines = [ - 'drwxrwxr-x 4,096 2015/07/23 17:36:47 .', - 
'-rw-rw-r-- 0 2015/07/23 17:36:47 foo', - 'drwxrwxr-x 4,096 2015/07/23 17:36:47 bar', - '-rw-rw-r-- 0 2015/07/23 17:36:47 bar/this', - 'drwxrwxr-x 4,096 2015/07/23 17:36:47 bar/that', - '-rw-rw-r-- 0 2015/07/23 17:36:47 bar/that/baz', + "drwxrwxr-x 4,096 2015/07/23 17:36:47 .", + "-rw-rw-r-- 0 2015/07/23 17:36:47 foo", + "drwxrwxr-x 4,096 2015/07/23 17:36:47 bar", + "-rw-rw-r-- 0 2015/07/23 17:36:47 bar/this", + "drwxrwxr-x 4,096 2015/07/23 17:36:47 bar/that", + "-rw-rw-r-- 0 2015/07/23 17:36:47 bar/that/baz", ] expected = [ - rsync.Entry('d', 'rwxrwxr-x', 4096, - '2015-07-23T17:36:47+00:00', '.')._asdict(), - rsync.Entry('-', 'rw-rw-r--', 0, - '2015-07-23T17:36:47+00:00', 'foo')._asdict(), - rsync.Entry('d', 'rwxrwxr-x', 4096, - '2015-07-23T17:36:47+00:00', 'bar')._asdict(), - rsync.Entry('-', 'rw-rw-r--', 0, - '2015-07-23T17:36:47+00:00', 'bar/this')._asdict(), - rsync.Entry('d', 'rwxrwxr-x', 4096, - '2015-07-23T17:36:47+00:00', 'bar/that')._asdict(), - rsync.Entry('-', 'rw-rw-r--', 0, - '2015-07-23T17:36:47+00:00', 'bar/that/baz')._asdict(), + rsync.Entry( + "d", "rwxrwxr-x", 4096, "2015-07-23T17:36:47+00:00", "." + )._asdict(), + rsync.Entry( + "-", "rw-rw-r--", 0, "2015-07-23T17:36:47+00:00", "foo" + )._asdict(), + rsync.Entry( + "d", "rwxrwxr-x", 4096, "2015-07-23T17:36:47+00:00", "bar" + )._asdict(), + rsync.Entry( + "-", "rw-rw-r--", 0, "2015-07-23T17:36:47+00:00", "bar/this" + )._asdict(), + rsync.Entry( + "d", "rwxrwxr-x", 4096, "2015-07-23T17:36:47+00:00", "bar/that" + )._asdict(), + rsync.Entry( + "-", "rw-rw-r--", 0, "2015-07-23T17:36:47+00:00", "bar/that/baz" + )._asdict(), ] for test, exp in zip(lines, expected): @@ -57,20 +63,27 @@ def test_entry_rsync_31(self): def test_entry(self): lines = [ - '-rw-r--r-- 4399746 2008/11/23 16:03:57 zz/ZZUL P/ZUL.gz', - 'drwxrwxr-x 4096 2004/08/09 00:47:02 pub/sou/a/a7', - '-rwxrwxr-x 4096 2004/08/09 00:47:02 pub/#345sou/a/a7', - 'lrwxrwxrwx 19 2007/11/22 11:37:54 s/c/a/index.html', - 'crwxrwxrwx 19 2007/11/22 11:37:54 dev/pts1', + "-rw-r--r-- 4399746 2008/11/23 16:03:57 zz/ZZUL P/ZUL.gz", + "drwxrwxr-x 4096 2004/08/09 00:47:02 pub/sou/a/a7", + "-rwxrwxr-x 4096 2004/08/09 00:47:02 pub/#345sou/a/a7", + "lrwxrwxrwx 19 2007/11/22 11:37:54 s/c/a/index.html", + "crwxrwxrwx 19 2007/11/22 11:37:54 dev/pts1", ] expected = [ - rsync.Entry('-', 'rw-r--r--', 4399746, - '2008-11-23T16:03:57+00:00', 'zz/ZZUL P/ZUL.gz')._asdict(), - rsync.Entry('d', 'rwxrwxr-x', 4096, - '2004-08-09T00:47:02+00:00', 'pub/sou/a/a7')._asdict(), - rsync.Entry('-', 'rwxrwxr-x', 4096, - '2004-08-09T00:47:02+00:00', 'pub/\xe5sou/a/a7')._asdict(), + rsync.Entry( + "-", + "rw-r--r--", + 4399746, + "2008-11-23T16:03:57+00:00", + "zz/ZZUL P/ZUL.gz", + )._asdict(), + rsync.Entry( + "d", "rwxrwxr-x", 4096, "2004-08-09T00:47:02+00:00", "pub/sou/a/a7" + )._asdict(), + rsync.Entry( + "-", "rwxrwxr-x", 4096, "2004-08-09T00:47:02+00:00", "pub/\xe5sou/a/a7" + )._asdict(), None, None, ] @@ -80,105 +93,210 @@ def test_entry(self): self.assertEqual(exp, result) def test_directory(self): - test_dir = self.get_test_loc('rsync/rsync_wicket.dir') + test_dir = self.get_test_loc("rsync/rsync_wicket.dir") output = list(rsync.directory_entries(test_dir)) expected = [ - rsync.Entry(type='d', perm='rwxrwxr-x', size=4096, - date='2014-03-18T19:02:46+00:00', path='.'), - rsync.Entry(type='-', perm='rw-rw-r--', size=5, - date='2014-03-18T19:02:46+00:00', path='.revision'), - rsync.Entry(type='d', perm='rwxrwxr-x', size=4096, - date='2014-02-05T09:34:20+00:00', path='1.4.23'), - rsync.Entry(type='-', 
perm='rw-rw-r--', size=95314, - date='2014-02-05T09:23:44+00:00', path='1.4.23/CHANGELOG-1.4'), - rsync.Entry(type='-', perm='rw-rw-r--', size=3712820, - date='2014-02-05T09:23:44+00:00', path='1.4.23/apache-wicket-1.4.23-source.tgz'), - rsync.Entry(type='d', perm='rwxrwxr-x', size=4096, - date='2014-02-05T09:34:20+00:00', path='1.4.23/binaries'), - rsync.Entry(type='-', perm='rw-rw-r--', size=23622515, date='2014-02-05T09:23:44+00:00', - path='1.4.23/binaries/apache-wicket-1.4.23.tar.gz'), - rsync.Entry(type='-', perm='rw-rw-r--', size=32524295, - date='2014-02-05T09:23:44+00:00', path='1.4.23/binaries/apache-wicket-1.4.23.zip'), - rsync.Entry(type='d', perm='rwxrwxr-x', size=4096, - date='2014-01-27T09:09:40+00:00', path='1.5.11'), - rsync.Entry(type='-', perm='rw-rw-r--', size=115587, - date='2014-01-20T16:53:10+00:00', path='1.5.11/CHANGELOG-1.5'), - rsync.Entry(type='-', perm='rw-rw-r--', size=4116809, - date='2014-01-20T16:53:10+00:00', path='1.5.11/apache-wicket-1.5.11-source.tgz'), - rsync.Entry(type='d', perm='rwxrwxr-x', size=4096, - date='2014-01-27T09:09:39+00:00', path='1.5.11/binaries'), - rsync.Entry(type='-', perm='rw-rw-r--', size=26048500, date='2014-01-20T16:53:10+00:00', - path='1.5.11/binaries/apache-wicket-1.5.11.tar.gz'), - rsync.Entry(type='-', perm='rw-rw-r--', size=36156260, - date='2014-01-20T16:53:10+00:00', path='1.5.11/binaries/apache-wicket-1.5.11.zip'), - rsync.Entry(type='d', perm='rwxrwxr-x', size=4096, - date='2014-02-19T08:36:07+00:00', path='6.14.0'), - rsync.Entry(type='-', perm='rw-rw-r--', size=78058, - date='2014-02-14T15:51:23+00:00', path='6.14.0/CHANGELOG-6.x'), - rsync.Entry(type='-', perm='rw-rw-r--', size=4792619, - date='2014-02-14T15:51:23+00:00', path='6.14.0/apache-wicket-6.14.0.tar.gz'), - rsync.Entry(type='-', perm='rw-rw-r--', size=9038442, - date='2014-02-14T15:51:23+00:00', path='6.14.0/apache-wicket-6.14.0.zip'), - rsync.Entry(type='d', perm='rwxrwxr-x', size=4096, - date='2014-02-19T08:36:05+00:00', path='6.14.0/binaries'), - rsync.Entry(type='-', perm='rw-rw-r--', size=29851252, date='2014-02-14T15:51:23+00:00', - path='6.14.0/binaries/apache-wicket-6.14.0-bin.tar.gz'), - rsync.Entry(type='-', perm='rw-rw-r--', size=29890658, date='2014-02-14T15:51:23+00:00', - path='6.14.0/binaries/apache-wicket-6.14.0-bin.zip') + rsync.Entry( + type="d", + perm="rwxrwxr-x", + size=4096, + date="2014-03-18T19:02:46+00:00", + path=".", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=5, + date="2014-03-18T19:02:46+00:00", + path=".revision", + ), + rsync.Entry( + type="d", + perm="rwxrwxr-x", + size=4096, + date="2014-02-05T09:34:20+00:00", + path="1.4.23", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=95314, + date="2014-02-05T09:23:44+00:00", + path="1.4.23/CHANGELOG-1.4", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=3712820, + date="2014-02-05T09:23:44+00:00", + path="1.4.23/apache-wicket-1.4.23-source.tgz", + ), + rsync.Entry( + type="d", + perm="rwxrwxr-x", + size=4096, + date="2014-02-05T09:34:20+00:00", + path="1.4.23/binaries", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=23622515, + date="2014-02-05T09:23:44+00:00", + path="1.4.23/binaries/apache-wicket-1.4.23.tar.gz", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=32524295, + date="2014-02-05T09:23:44+00:00", + path="1.4.23/binaries/apache-wicket-1.4.23.zip", + ), + rsync.Entry( + type="d", + perm="rwxrwxr-x", + size=4096, + date="2014-01-27T09:09:40+00:00", + path="1.5.11", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", 
+ size=115587, + date="2014-01-20T16:53:10+00:00", + path="1.5.11/CHANGELOG-1.5", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=4116809, + date="2014-01-20T16:53:10+00:00", + path="1.5.11/apache-wicket-1.5.11-source.tgz", + ), + rsync.Entry( + type="d", + perm="rwxrwxr-x", + size=4096, + date="2014-01-27T09:09:39+00:00", + path="1.5.11/binaries", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=26048500, + date="2014-01-20T16:53:10+00:00", + path="1.5.11/binaries/apache-wicket-1.5.11.tar.gz", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=36156260, + date="2014-01-20T16:53:10+00:00", + path="1.5.11/binaries/apache-wicket-1.5.11.zip", + ), + rsync.Entry( + type="d", + perm="rwxrwxr-x", + size=4096, + date="2014-02-19T08:36:07+00:00", + path="6.14.0", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=78058, + date="2014-02-14T15:51:23+00:00", + path="6.14.0/CHANGELOG-6.x", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=4792619, + date="2014-02-14T15:51:23+00:00", + path="6.14.0/apache-wicket-6.14.0.tar.gz", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=9038442, + date="2014-02-14T15:51:23+00:00", + path="6.14.0/apache-wicket-6.14.0.zip", + ), + rsync.Entry( + type="d", + perm="rwxrwxr-x", + size=4096, + date="2014-02-19T08:36:05+00:00", + path="6.14.0/binaries", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=29851252, + date="2014-02-14T15:51:23+00:00", + path="6.14.0/binaries/apache-wicket-6.14.0-bin.tar.gz", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=29890658, + date="2014-02-14T15:51:23+00:00", + path="6.14.0/binaries/apache-wicket-6.14.0-bin.zip", + ), ] expected = [dict(x._asdict()) for x in expected] self.assertEqual(expected, output) def test_directory_weird_file_types_are_ignored(self): self.maxDiff = None - inp = self.get_test_loc('rsync/rsync_dev.dir') + inp = self.get_test_loc("rsync/rsync_dev.dir") output = rsync.directory_entries(inp) - results = [e['path'] for e in output if e['type'] == '-'] - expected = ['dev/.udev/rules.d/root.rules'] + results = [e["path"] for e in output if e["type"] == "-"] + expected = ["dev/.udev/rules.d/root.rules"] self.assertEqual(expected, results) - @skipIf(ON_WINDOWS, 'rsync is not available on windows') + @skipIf(ON_WINDOWS, "rsync is not available on windows") def test_fetch_directory(self): self.maxDiff = None - inp = self.get_test_loc('rsync/rsync_dir') + inp = self.get_test_loc("rsync/rsync_dir") output = rsync.fetch_directory(inp) - expected = 'foo bar bar/this bar/that bar/that/baz'.split() + expected = "foo bar bar/this bar/that bar/that/baz".split() with open(output) as f: results = f.read() self.assertTrue(all(e in results for e in expected)) - @skipIf(ON_WINDOWS, 'rsync is not available on windows') + @skipIf(ON_WINDOWS, "rsync is not available on windows") def test_fetch_directory_no_recurse(self): self.maxDiff = None - inp = self.get_test_loc('rsync/rsync_dir') + inp = self.get_test_loc("rsync/rsync_dir") output = rsync.fetch_directory(inp, recurse=False) - expected = ['foo', 'bar'] + expected = ["foo", "bar"] with open(output) as f: results = f.read() self.assertTrue(all(e in results for e in expected)) - self.assertTrue('bar/this' not in results) + self.assertTrue("bar/this" not in results) def get_dirs(self, input_path): - """ - Returns only the type and path from rsync entries. 
- """ - return [(e['type'], e['path']) - for e in rsync.directory_entries(input_path) - if '.svn' not in e['path']] - - @skipIf(ON_WINDOWS, 'rsync is not available on windows') + """Returns only the type and path from rsync entries.""" + return [ + (e["type"], e["path"]) + for e in rsync.directory_entries(input_path) + if ".svn" not in e["path"] + ] + + @skipIf(ON_WINDOWS, "rsync is not available on windows") def test_fetch_and_parse_directory_no_recurse(self): self.maxDiff = None - inp = self.get_test_loc('rsync/rsync_dir') + inp = self.get_test_loc("rsync/rsync_dir") output = rsync.fetch_directory(inp, recurse=False) results = self.get_dirs(output) - expected = [('d', '.'), ('-', 'foo'), ('d', 'bar')] + expected = [("d", "."), ("-", "foo"), ("d", "bar")] self.assertEqual(sorted(expected), sorted(results)) def test_directory_output_can_be_parsed_on_protocol_30_and_31(self): self.maxDiff = None - input_30 = self.get_test_loc('rsync/rsync_v3.0.9_protocol30.dir') - input_31 = self.get_test_loc('rsync/rsync_v3.1.0_protocol31.dir') + input_30 = self.get_test_loc("rsync/rsync_v3.0.9_protocol30.dir") + input_31 = self.get_test_loc("rsync/rsync_v3.1.0_protocol31.dir") self.assertEqual(self.get_dirs(input_30), self.get_dirs(input_31)) diff --git a/minecode/tests/test_run_map.py b/minecode/tests/test_run_map.py index 7bca14f1..f3ad262a 100644 --- a/minecode/tests/test_run_map.py +++ b/minecode/tests/test_run_map.py @@ -15,36 +15,38 @@ from packagedcode.models import Package as ScannedPackage +import packagedb from minecode.management.commands.run_map import map_uri from minecode.model_utils import merge_packages from minecode.models import ResourceURI from minecode.models import ScannableURI from minecode.route import Router +from minecode.tests import FIXTURES_REGEN from minecode.utils_test import JsonBasedTesting from minecode.utils_test import MiningTestCase -from minecode.tests import FIXTURES_REGEN -import packagedb class RunMapTest(JsonBasedTesting, MiningTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") maxDiff = None def test_map_uri(self): # setup # build a mock mapper and register it in a router - uri = 'http://testdomap.com' + uri = "http://testdomap.com" def mock_mapper(uri, resource_uri): - return [ScannedPackage( - type='maven', - namespace='org.apache.spark', - name='spark-streaming_2.10', - version='1.2.0', - qualifiers=dict(extension='pom'), - download_url='http://testdomap.com', - sha1='beef' - )] + return [ + ScannedPackage( + type="maven", + namespace="org.apache.spark", + name="spark-streaming_2.10", + version="1.2.0", + qualifiers=dict(extension="pom"), + download_url="http://testdomap.com", + sha1="beef", + ) + ] router = Router() router.append(uri, mock_mapper) @@ -56,33 +58,37 @@ def mock_mapper(uri, resource_uri): resource_uri = ResourceURI.objects.insert( uri=uri, last_visit_date=timezone.now(), - package_url='pkg:maven/org.apache.spark/spark-streaming_2.10@1.2.0?extension=pom') + package_url="pkg:maven/org.apache.spark/spark-streaming_2.10@1.2.0?extension=pom", + ) assert ResourceURI.objects.get(uri=uri) == resource_uri resource_uri.is_mappable = True resource_uri.save() # ensure that we are clear of Package before before = packagedb.models.Package.objects.filter( - download_url='http://testdomap.com') + download_url="http://testdomap.com" + ) self.assertEqual(0, before.count()) # test proper map_uri(resource_uri, _map_router=router) mapped = 
packagedb.models.Package.objects.filter( - download_url='http://testdomap.com') + download_url="http://testdomap.com" + ) self.assertEqual(1, mapped.count()) mapped_package = mapped.first() self.assertEqual( - 'pkg:maven/org.apache.spark/spark-streaming_2.10@1.2.0?extension=pom', mapped_package.package_url) + "pkg:maven/org.apache.spark/spark-streaming_2.10@1.2.0?extension=pom", + mapped_package.package_url, + ) # test history history = mapped_package.get_history() self.assertIsNotNone(history) self.assertEqual(1, len(history)) entry = history[0] - message = entry.get('message') - self.assertEqual( - 'New Package created from URI: {}'.format(uri), message) + message = entry.get("message") + self.assertEqual(f"New Package created from URI: {uri}", message) # check that the ResourceURI status has been updated correctly resource_uri = ResourceURI.objects.get(uri=uri) @@ -90,13 +96,13 @@ def mock_mapper(uri, resource_uri): self.assertFalse(resource_uri.last_map_date is None) # check that a ScannableURI has been created - scannable = ScannableURI.objects.filter(uri='http://testdomap.com') + scannable = ScannableURI.objects.filter(uri="http://testdomap.com") self.assertEqual(1, scannable.count()) def test_map_uri_continues_after_raised_exception(self): # setup # build a mock mapper and register it in a router - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" def mock_mapper(uri, resource_uri): raise Exception() @@ -109,20 +115,23 @@ def mock_mapper(uri, resource_uri): # seed ResourceURI with a uri resource_uri = ResourceURI.objects.insert( - uri=uri, last_visit_date=timezone.now()) + uri=uri, last_visit_date=timezone.now() + ) assert ResourceURI.objects.get(uri=uri) == resource_uri resource_uri.is_mappable = True resource_uri.save() # ensure that we are clear of Package before before = packagedb.models.Package.objects.filter( - download_url='http://testdomap.com') + download_url="http://testdomap.com" + ) self.assertEqual(0, before.count()) # test proper map_uri(resource_uri, _map_router=router) mapped = packagedb.models.Package.objects.filter( - download_url='http://testdomap.com') + download_url="http://testdomap.com" + ) self.assertEqual(0, mapped.count()) # check that the ResourceURI status has been updated correctly @@ -132,16 +141,16 @@ def mock_mapper(uri, resource_uri): self.assertTrue(resource_uri.map_error is not None) # check that a ScannableURI has not been created - scannable = ScannableURI.objects.filter(uri='http://testdomap.com') + scannable = ScannableURI.objects.filter(uri="http://testdomap.com") self.assertEqual(0, scannable.count()) def test_map_uri_continues_if_unknown_type_in_package_iterator(self): # setup # build a mock mapper and register it in a router - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" def mock_mapper(uri, resource_uri): - return ['some string'] + return ["some string"] router = Router() router.append(uri, mock_mapper) @@ -151,45 +160,45 @@ def mock_mapper(uri, resource_uri): # seed ResourceURI with a uri resource_uri = ResourceURI.objects.insert( - uri=uri, last_visit_date=timezone.now()) + uri=uri, last_visit_date=timezone.now() + ) assert ResourceURI.objects.get(uri=uri) == resource_uri resource_uri.is_mappable = True resource_uri.save() # ensure that we are clear of Package before before = packagedb.models.Package.objects.filter( - download_url='http://testdomap.com') + download_url="http://testdomap.com" + ) self.assertEqual(0, before.count()) # test proper map_uri(resource_uri, _map_router=router) mapped = 
packagedb.models.Package.objects.filter( - download_url='http://testdomap.com') + download_url="http://testdomap.com" + ) self.assertEqual(0, mapped.count()) # check that the ResourceURI status has been updated correctly resource_uri = ResourceURI.objects.get(uri=uri) self.assertEqual(None, resource_uri.wip_date) self.assertFalse(resource_uri.last_map_date is None) - self.assertTrue( - 'Not a ScanCode PackageData type' in resource_uri.map_error) + self.assertTrue("Not a ScanCode PackageData type" in resource_uri.map_error) # check that a ScannableURI has not been created - scannable = ScannableURI.objects.filter(uri='http://testdomap.com') + scannable = ScannableURI.objects.filter(uri="http://testdomap.com") self.assertEqual(0, scannable.count()) def test_map_uri_continues_if_no_download_url_in_package_iterator(self): # setup # build a mock mapper and register it in a router - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" class MP(ScannedPackage): pass def mock_mapper(uri, resource_uri): - return [ - MP(type='generic', name='foo', sha1='beef') - ] + return [MP(type="generic", name="foo", sha1="beef")] router = Router() router.append(uri, mock_mapper) @@ -199,51 +208,50 @@ def mock_mapper(uri, resource_uri): # seed ResourceURI with a uri resource_uri = ResourceURI.objects.insert( - uri=uri, last_visit_date=timezone.now()) + uri=uri, last_visit_date=timezone.now() + ) assert ResourceURI.objects.get(uri=uri) == resource_uri resource_uri.is_mappable = True resource_uri.save() # ensure that we are clear of Package before before = packagedb.models.Package.objects.filter( - download_url='http://testdomap.com') + download_url="http://testdomap.com" + ) self.assertEqual(0, before.count()) # test proper map_uri(resource_uri, _map_router=router) mapped = packagedb.models.Package.objects.filter( - download_url='http://testdomap.com') + download_url="http://testdomap.com" + ) self.assertEqual(0, mapped.count()) # check that the ResourceURI status has been updated correctly resource_uri = ResourceURI.objects.get(uri=uri) self.assertEqual(None, resource_uri.wip_date) self.assertFalse(resource_uri.last_map_date is None) - self.assertTrue( - 'No download_url for package' in resource_uri.map_error) + self.assertTrue("No download_url for package" in resource_uri.map_error) # check that a ScannableURI has not been created - scannable = ScannableURI.objects.filter(uri='http://testdomap.com') + scannable = ScannableURI.objects.filter(uri="http://testdomap.com") self.assertEqual(0, scannable.count()) def test_map_uri_continues_after_raised_exception_in_package_iterator(self): # setup # build a mock mapper and register it in a router - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" class MP(ScannedPackage): - def to_dict(self, **kwargs): - raise Exception('ScannedPackage issue') + raise Exception("ScannedPackage issue") def __getattribute__(self, item): - raise Exception('ScannedPackage issue') + raise Exception("ScannedPackage issue") return ScannedPackage.__getattribute__(self, item) def mock_mapper(uri, resource_uri): - return [ - MP(type='generic', name='foo', download_url=uri, sha1='beef') - ] + return [MP(type="generic", name="foo", download_url=uri, sha1="beef")] router = Router() router.append(uri, mock_mapper) @@ -253,77 +261,80 @@ def mock_mapper(uri, resource_uri): # seed ResourceURI with a uri resource_uri = ResourceURI.objects.insert( - uri=uri, last_visit_date=timezone.now()) + uri=uri, last_visit_date=timezone.now() + ) assert ResourceURI.objects.get(uri=uri) == 
resource_uri resource_uri.is_mappable = True resource_uri.save() # ensure that we are clear of Package before before = packagedb.models.Package.objects.filter( - download_url='http://testdomap.com') + download_url="http://testdomap.com" + ) self.assertEqual(0, before.count()) # test proper map_uri(resource_uri, _map_router=router) mapped = packagedb.models.Package.objects.filter( - download_url='http://testdomap.com') + download_url="http://testdomap.com" + ) self.assertEqual(0, mapped.count()) # check that the ResourceURI status has been updated correctly resource_uri = ResourceURI.objects.get(uri=uri) self.assertEqual(None, resource_uri.wip_date) self.assertFalse(resource_uri.last_map_date is None) - self.assertTrue('ScannedPackage issue' in resource_uri.map_error) - self.assertTrue('Failed to map while' in resource_uri.map_error) + self.assertTrue("ScannedPackage issue" in resource_uri.map_error) + self.assertTrue("Failed to map while" in resource_uri.map_error) # check that a ScannableURI has not been created - scannable = ScannableURI.objects.filter(uri='http://testdomap.com') + scannable = ScannableURI.objects.filter(uri="http://testdomap.com") self.assertEqual(0, scannable.count()) def test_map_uri_with_no_route_defined_does_not_map(self): # setup # build a mock mapper and register it in a router - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" def mock_mapper(uri, resource_uri): return [ ScannedPackage( - uri='http://test.com', - type='generic', - name='testpack', + uri="http://test.com", + type="generic", + name="testpack", ) ] router = Router() - router.append('http://nexb.com', mock_mapper) + router.append("http://nexb.com", mock_mapper) resource_uri = ResourceURI.objects.create(uri=uri) # test proper map_uri(resource_uri, _map_router=router) try: - ResourceURI.objects.get(uri='http://test.com') - self.fail('URI should not have been created') + ResourceURI.objects.get(uri="http://test.com") + self.fail("URI should not have been created") except ResourceURI.DoesNotExist: pass def test_run_map_command(self): output = StringIO() - management.call_command('run_map', exit_on_empty=True, stdout=output) - self.assertEqual('', output.getvalue()) + management.call_command("run_map", exit_on_empty=True, stdout=output) + self.assertEqual("", output.getvalue()) def test_map_uri_does_update_with_same_mining_level(self): # setup # build a mock mapper and register it in a router - download_url = 'http://testdomap2.com' + download_url = "http://testdomap2.com" new_p = ScannedPackage( - type='generic', - name='pack', - version='0.2', - description='Description Updated', - download_url=download_url + type="generic", + name="pack", + version="0.2", + description="Description Updated", + download_url=download_url, ) - uri = 'http://testdomap2.com' + uri = "http://testdomap2.com" def mock_mapper(uri, resource_uri): return [new_p] @@ -336,9 +347,7 @@ def mock_mapper(uri, resource_uri): # seed ResourceURI with a uri resource_uri = ResourceURI.objects.insert( - uri=uri, - last_visit_date=timezone.now(), - mining_level=0 + uri=uri, last_visit_date=timezone.now(), mining_level=0 ) assert ResourceURI.objects.get(uri=uri) == resource_uri resource_uri.is_mappable = True @@ -347,19 +356,18 @@ def mock_mapper(uri, resource_uri): # ensure that we have an existing Package before packagedb.models.Package.objects.insert( mining_level=0, - type='generic', - name='pack', - version='0.1', - description='Description Existing', + type="generic", + name="pack", + version="0.1", + description="Description 
Existing", download_url=download_url, - sha1='beef', + sha1="beef", ) # test proper map_uri(resource_uri, _map_router=router) - mapped = packagedb.models.Package.objects.filter( - download_url=download_url) + mapped = packagedb.models.Package.objects.filter(download_url=download_url) self.assertEqual(1, mapped.count()) mapped_package = mapped.first() @@ -368,15 +376,16 @@ def mock_mapper(uri, resource_uri): self.assertIsNotNone(history) self.assertEqual(1, len(history)) entry = history[0] - message = entry.get('message') - self.assertEqual('Package field values have been updated.', message) - data = entry.get('data') - updated_fields = data.get('updated_fields') + message = entry.get("message") + self.assertEqual("Package field values have been updated.", message) + data = entry.get("data") + updated_fields = data.get("updated_fields") expected_updated_fields_loc = self.get_test_loc( - 'run_map/test_map_uri_does_update_with_same_mining_level_expected_updated_fields.json' + "run_map/test_map_uri_does_update_with_same_mining_level_expected_updated_fields.json" ) self.check_expected_results( - updated_fields, expected_updated_fields_loc, regen=FIXTURES_REGEN) + updated_fields, expected_updated_fields_loc, regen=FIXTURES_REGEN + ) # check that the ResourceURI status has been updated correctly resource_uri = ResourceURI.objects.get(uri=uri) @@ -385,7 +394,8 @@ def mock_mapper(uri, resource_uri): # check that the Package has been updated correctly expected_loc = self.get_test_loc( - 'run_map/test_map_uri_does_update_with_same_mining_level-expected.json') + "run_map/test_map_uri_does_update_with_same_mining_level-expected.json" + ) result = mapped_package.to_dict() self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) @@ -398,17 +408,17 @@ def mock_mapper(uri, resource_uri): def test_map_uri_update_only_empties_with_lesser_new_mining_level(self): # setup # build a mock mapper and register it in a router - download_url = 'http://testdomap3.com' + download_url = "http://testdomap3.com" new_p = ScannedPackage( - type='generic', - name='pack', - version='0.2', - description='Description Updated', + type="generic", + name="pack", + version="0.2", + description="Description Updated", download_url=download_url, - sha1='feed' + sha1="feed", ) - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" def mock_mapper(uri, resource_uri): return [new_p] @@ -421,9 +431,7 @@ def mock_mapper(uri, resource_uri): # seed ResourceURI with a uri resource_uri = ResourceURI.objects.insert( - uri=uri, - last_visit_date=timezone.now(), - mining_level=0 + uri=uri, last_visit_date=timezone.now(), mining_level=0 ) assert ResourceURI.objects.get(uri=uri) == resource_uri resource_uri.is_mappable = True @@ -433,18 +441,17 @@ def mock_mapper(uri, resource_uri): packagedb.models.Package.objects.insert( # NOTE: existing is 10, new is 0 mining_level=10, - type='generic', - name='pack', - version='0.1', - description='', + type="generic", + name="pack", + version="0.1", + description="", download_url=download_url, - sha1='', + sha1="", ) # test proper map_uri(resource_uri, _map_router=router) - mapped = packagedb.models.Package.objects.filter( - download_url=download_url) + mapped = packagedb.models.Package.objects.filter(download_url=download_url) self.assertEqual(1, mapped.count()) mapped_package = mapped.first() @@ -453,21 +460,17 @@ def mock_mapper(uri, resource_uri): self.assertIsNotNone(history) self.assertEqual(1, len(history)) entry = history[0] - message = entry.get('message') - 
self.assertEqual('Package field values have been updated.', message) - data = entry.get('data') - updated_fields = data.get('updated_fields') + message = entry.get("message") + self.assertEqual("Package field values have been updated.", message) + data = entry.get("data") + updated_fields = data.get("updated_fields") expected_updated_fields = [ { - 'field': 'description', - 'new_value': 'Description Updated', - 'old_value': '' + "field": "description", + "new_value": "Description Updated", + "old_value": "", }, - { - 'field': 'sha1', - 'new_value': 'feed', - 'old_value': '' - } + {"field": "sha1", "new_value": "feed", "old_value": ""}, ] self.assertEqual(expected_updated_fields, updated_fields) @@ -478,7 +481,8 @@ def mock_mapper(uri, resource_uri): # check that the Package has been updated correctly expected_loc = self.get_test_loc( - 'run_map/test_map_uri_update_only_empties_with_lesser_new_mining_level-expected.json') + "run_map/test_map_uri_update_only_empties_with_lesser_new_mining_level-expected.json" + ) result = mapped[0].to_dict() self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) @@ -491,16 +495,16 @@ def mock_mapper(uri, resource_uri): def test_map_uri_replace_with_new_with_higher_new_mining_level(self): # setup # build a mock mapper and register it in a router - download_url = 'http://testdomap4.com' + download_url = "http://testdomap4.com" new_p = ScannedPackage( - type='generic', - name='pack2', - version='0.2', - description='Description Updated', - download_url=download_url + type="generic", + name="pack2", + version="0.2", + description="Description Updated", + download_url=download_url, ) - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" def mock_mapper(uri, resource_uri): return [new_p] @@ -513,9 +517,7 @@ def mock_mapper(uri, resource_uri): # seed ResourceURI with a uri resource_uri = ResourceURI.objects.insert( - uri=uri, - last_visit_date=timezone.now(), - mining_level=10 + uri=uri, last_visit_date=timezone.now(), mining_level=10 ) assert ResourceURI.objects.get(uri=uri) == resource_uri resource_uri.is_mappable = True @@ -525,18 +527,17 @@ def mock_mapper(uri, resource_uri): packagedb.models.Package.objects.insert( # NOTE: existing is 5, new is 10 mining_level=5, - name='pack', - version='0.1', - description='', + name="pack", + version="0.1", + description="", download_url=download_url, - type='generic', - sha1='beef', + type="generic", + sha1="beef", ) # test proper map_uri(resource_uri, _map_router=router) - mapped = packagedb.models.Package.objects.filter( - download_url=download_url) + mapped = packagedb.models.Package.objects.filter(download_url=download_url) self.assertEqual(1, mapped.count()) mapped_package = mapped.first() @@ -545,15 +546,16 @@ def mock_mapper(uri, resource_uri): self.assertIsNotNone(history) self.assertEqual(1, len(history)) entry = history[0] - message = entry.get('message') - self.assertEqual('Package field values have been updated.', message) - data = entry.get('data') - updated_fields = data.get('updated_fields') + message = entry.get("message") + self.assertEqual("Package field values have been updated.", message) + data = entry.get("data") + updated_fields = data.get("updated_fields") expected_updated_fields_loc = self.get_test_loc( - 'run_map/test_map_uri_replace_with_new_with_higher_new_mining_level_expected_updated_fields.json' + "run_map/test_map_uri_replace_with_new_with_higher_new_mining_level_expected_updated_fields.json" ) self.check_expected_results( - updated_fields, 
expected_updated_fields_loc, regen=FIXTURES_REGEN) + updated_fields, expected_updated_fields_loc, regen=FIXTURES_REGEN + ) # check that the ResourceURI status has been updated correctly resource_uri = ResourceURI.objects.get(uri=uri) @@ -562,7 +564,8 @@ def mock_mapper(uri, resource_uri): # check that the Package has been updated correctly expected_loc = self.get_test_loc( - 'run_map/test_map_uri_replace_with_new_with_higher_new_mining_level-expected.json') + "run_map/test_map_uri_replace_with_new_with_higher_new_mining_level-expected.json" + ) result = mapped[0].to_dict() self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) @@ -573,69 +576,71 @@ def mock_mapper(uri, resource_uri): self.assertEqual(0, scannable.count()) def test_merge_packages_no_replace(self): - download_url = 'http://testdomap3.com' + download_url = "http://testdomap3.com" existing_package, _created = packagedb.models.Package.objects.get_or_create( - type='generic', - name='pack', - version='0.1', - description='', + type="generic", + name="pack", + version="0.1", + description="", download_url=download_url, - sha1='beef', + sha1="beef", ) new_package_data = ScannedPackage( - type='generic', - name='pack', - version='0.2', - description='Description Updated', - download_url=download_url + type="generic", + name="pack", + version="0.2", + description="Description Updated", + download_url=download_url, ).to_dict() merge_packages(existing_package, new_package_data, replace=False) expected_loc = self.get_test_loc( - 'run_map/test_merge_packages_no_replace-expected.json') + "run_map/test_merge_packages_no_replace-expected.json" + ) result = existing_package.to_dict() self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_merge_packages_with_replace(self): - download_url = 'http://testdomap3.com' + download_url = "http://testdomap3.com" existing_package, _created = packagedb.models.Package.objects.get_or_create( - type='generic', - name='pack', - version='0.1', - description='', + type="generic", + name="pack", + version="0.1", + description="", download_url=download_url, - sha1='beef', + sha1="beef", ) new_package_data = ScannedPackage( - type='generic', - name='pack', - version='0.2', - description='Description Updated', + type="generic", + name="pack", + version="0.2", + description="Description Updated", download_url=download_url, ).to_dict() merge_packages(existing_package, new_package_data, replace=True) expected_loc = self.get_test_loc( - 'run_map/test_merge_packages_with_replace-expected.json') + "run_map/test_merge_packages_with_replace-expected.json" + ) result = existing_package.to_dict() self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_merge_packages_different_sha1(self): - download_url = 'http://testdomap3.com' + download_url = "http://testdomap3.com" existing_package, _created = packagedb.models.Package.objects.get_or_create( - type='generic', - name='pack', - version='0.1', - description='', + type="generic", + name="pack", + version="0.1", + description="", download_url=download_url, - sha1='beef', + sha1="beef", ) new_package_data = ScannedPackage( - type='generic', - name='pack', - version='0.2', - description='Description Updated', + type="generic", + name="pack", + version="0.2", + description="Description Updated", download_url=download_url, - sha1='feed' + sha1="feed", ).to_dict() with self.assertRaises(Exception) as e: merge_packages(existing_package, new_package_data) - self.assertTrue('Mismatched sha1' in e.exception) + 
self.assertTrue("Mismatched sha1" in e.exception) diff --git a/minecode/tests/test_run_visit.py b/minecode/tests/test_run_visit.py index b38102c6..ae9e6b0c 100644 --- a/minecode/tests/test_run_visit.py +++ b/minecode/tests/test_run_visit.py @@ -7,36 +7,55 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +from collections import Counter from io import StringIO -from collections import Counter from django.core import management -from minecode.utils_test import MiningTestCase from minecode.management.commands.run_visit import visit_uri +from minecode.miners import URI from minecode.models import ResourceURI from minecode.route import Router -from minecode.miners import URI +from minecode.utils_test import MiningTestCase class RunVisitWithCounterTest(MiningTestCase): - def test_visit_uri_with_counter_0_max_uris_3_multi_uri(self): # setup # build a test visitor and register it in a router - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" def mock_visitor(uri): - return [URI(uri='http://test-counter-0-max-uris-3-multi-uri1.com', package_url='pkg:npm/foobar@12.3.1'), - URI(uri='http://test-counter-0-max-uris-3-multi-uri2.com', - package_url='pkg:npm/foobar@12.3.2'), - URI(uri='http://test-counter-0-max-uris-3-multi-uri3.com', - package_url='pkg:npm/foobar@12.3.3'), - URI(uri='http://test-counter-0-max-uris-3-multi-uri4.com', - package_url='pkg:npm/foobar@12.3.4'), - URI(uri='http://test-counter-0-max-uris-3-multi-uri5.com', - package_url='pkg:npm/foobar@12.3.5'), - URI(uri='http://test-counter-0-max-uris-3-multi-uri6.com', package_url='pkg:npm/foobar@12.3.5')], None, None + return ( + [ + URI( + uri="http://test-counter-0-max-uris-3-multi-uri1.com", + package_url="pkg:npm/foobar@12.3.1", + ), + URI( + uri="http://test-counter-0-max-uris-3-multi-uri2.com", + package_url="pkg:npm/foobar@12.3.2", + ), + URI( + uri="http://test-counter-0-max-uris-3-multi-uri3.com", + package_url="pkg:npm/foobar@12.3.3", + ), + URI( + uri="http://test-counter-0-max-uris-3-multi-uri4.com", + package_url="pkg:npm/foobar@12.3.4", + ), + URI( + uri="http://test-counter-0-max-uris-3-multi-uri5.com", + package_url="pkg:npm/foobar@12.3.5", + ), + URI( + uri="http://test-counter-0-max-uris-3-multi-uri6.com", + package_url="pkg:npm/foobar@12.3.5", + ), + ], + None, + None, + ) router = Router() router.append(uri, mock_visitor) @@ -53,46 +72,62 @@ def mock_visitor(uri): key = visitor.__module__ + visitor.__name__ counter[key] += 0 - visit_uri(resource_uri, _visit_router=router, - max_uris=3, uri_counter_by_visitor=counter) + visit_uri( + resource_uri, + _visit_router=router, + max_uris=3, + uri_counter_by_visitor=counter, + ) visited = ResourceURI.objects.filter( - uri='http://test-counter-0-max-uris-3-multi-uri1.com') + uri="http://test-counter-0-max-uris-3-multi-uri1.com" + ) self.assertEqual(1, visited.count()) - self.assertEqual('pkg:npm/foobar@12.3.1', visited[0].package_url) + self.assertEqual("pkg:npm/foobar@12.3.1", visited[0].package_url) visited = ResourceURI.objects.filter( - uri='http://test-counter-0-max-uris-3-multi-uri2.com') + uri="http://test-counter-0-max-uris-3-multi-uri2.com" + ) self.assertEqual(1, visited.count()) - self.assertEqual('pkg:npm/foobar@12.3.2', visited[0].package_url) + self.assertEqual("pkg:npm/foobar@12.3.2", visited[0].package_url) visited = ResourceURI.objects.filter( - uri='http://test-counter-0-max-uris-3-multi-uri3.com') + uri="http://test-counter-0-max-uris-3-multi-uri3.com" + ) self.assertEqual(1, visited.count()) - 
self.assertEqual('pkg:npm/foobar@12.3.3', visited[0].package_url) + self.assertEqual("pkg:npm/foobar@12.3.3", visited[0].package_url) visited = ResourceURI.objects.filter( - uri='http://test-counter-0-max-uris-3-multi-uri4.com') + uri="http://test-counter-0-max-uris-3-multi-uri4.com" + ) self.assertEqual(1, visited.count()) - self.assertEqual('pkg:npm/foobar@12.3.4', visited[0].package_url) + self.assertEqual("pkg:npm/foobar@12.3.4", visited[0].package_url) visited = ResourceURI.objects.filter( - uri='http://test-counter-0-max-uris-3-multi-uri5.com') + uri="http://test-counter-0-max-uris-3-multi-uri5.com" + ) self.assertEqual(0, visited.count()) visited = ResourceURI.objects.filter( - uri='http://test-counter-0-max-uris-3-multi-uri6.com') + uri="http://test-counter-0-max-uris-3-multi-uri6.com" + ) self.assertEqual(0, visited.count()) def test_visit_uri_with_counter_0_max_uris_1_multi_uri(self): # setup # build a test visitor and register it in a router - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" def mock_visitor(uri): - return [URI(uri='http://test-counter-0-max-uris-1-multi-uri1.com'), - URI(uri='http://test-counter-0-max-uris-1-multi-uri2.com'), - URI(uri='http://test-counter-0-max-uris-1-multi-uri3.com')], None, None + return ( + [ + URI(uri="http://test-counter-0-max-uris-1-multi-uri1.com"), + URI(uri="http://test-counter-0-max-uris-1-multi-uri2.com"), + URI(uri="http://test-counter-0-max-uris-1-multi-uri3.com"), + ], + None, + None, + ) router = Router() router.append(uri, mock_visitor) @@ -109,40 +144,53 @@ def mock_visitor(uri): key = visitor.__module__ + visitor.__name__ counter[key] += 0 - visit_uri(resource_uri, _visit_router=router, - max_uris=1, uri_counter_by_visitor=counter) + visit_uri( + resource_uri, + _visit_router=router, + max_uris=1, + uri_counter_by_visitor=counter, + ) visited = ResourceURI.objects.filter( - uri='http://test-counter-0-max-uris-1-multi-uri1.com') + uri="http://test-counter-0-max-uris-1-multi-uri1.com" + ) self.assertEqual(1, visited.count()) # MAX_URIS=1 still gives us two URIs visited = ResourceURI.objects.filter( - uri='http://test-counter-0-max-uris-1-multi-uri2.com') + uri="http://test-counter-0-max-uris-1-multi-uri2.com" + ) self.assertEqual(1, visited.count()) # ... 
but not 3 visited = ResourceURI.objects.filter( - uri='http://test-counter-0-max-uris-1-multi-uri3.com') + uri="http://test-counter-0-max-uris-1-multi-uri3.com" + ) self.assertEqual(0, visited.count()) def test_visit_uri_with_counter_10_max_uris_10_multi_uri(self): # setup # build a test visitor and register it in a router - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" def mock_visitor(uri): - return [URI(uri='http://test-counter-10-max-uris-10-multi-uri1.com'), - URI(uri='http://test-counter-10-max-uris-10-multi-uri2.com'), - URI(uri='http://test-counter-10-max-uris-10-multi-uri3.com'), - URI(uri='http://test-counter-10-max-uris-10-multi-uri4.com'), - URI(uri='http://test-counter-10-max-uris-10-multi-uri5.com'), - URI(uri='http://test-counter-10-max-uris-10-multi-uri6.com'), - URI(uri='http://test-counter-10-max-uris-10-multi-uri7.com'), - URI(uri='http://test-counter-10-max-uris-10-multi-uri8.com'), - URI(uri='http://test-counter-10-max-uris-10-multi-uri9.com'), - URI(uri='http://test-counter-10-max-uris-10-multi-uri10.com'), - URI(uri='http://test-counter-10-max-uris-10-multi-uri11.com')], None, None + return ( + [ + URI(uri="http://test-counter-10-max-uris-10-multi-uri1.com"), + URI(uri="http://test-counter-10-max-uris-10-multi-uri2.com"), + URI(uri="http://test-counter-10-max-uris-10-multi-uri3.com"), + URI(uri="http://test-counter-10-max-uris-10-multi-uri4.com"), + URI(uri="http://test-counter-10-max-uris-10-multi-uri5.com"), + URI(uri="http://test-counter-10-max-uris-10-multi-uri6.com"), + URI(uri="http://test-counter-10-max-uris-10-multi-uri7.com"), + URI(uri="http://test-counter-10-max-uris-10-multi-uri8.com"), + URI(uri="http://test-counter-10-max-uris-10-multi-uri9.com"), + URI(uri="http://test-counter-10-max-uris-10-multi-uri10.com"), + URI(uri="http://test-counter-10-max-uris-10-multi-uri11.com"), + ], + None, + None, + ) router = Router() router.append(uri, mock_visitor) @@ -159,54 +207,75 @@ def mock_visitor(uri): key = visitor.__module__ + visitor.__name__ counter[key] += 1 - visit_uri(resource_uri, _visit_router=router, - max_uris=1, uri_counter_by_visitor=counter) + visit_uri( + resource_uri, + _visit_router=router, + max_uris=1, + uri_counter_by_visitor=counter, + ) visited = ResourceURI.objects.filter( - uri='http://test-counter-10-max-uris-10-multi-uri1.com') + uri="http://test-counter-10-max-uris-10-multi-uri1.com" + ) self.assertEqual(1, visited.count()) visited = ResourceURI.objects.filter( - uri='http://test-counter-10-max-uris-10-multi-uri2.com') + uri="http://test-counter-10-max-uris-10-multi-uri2.com" + ) self.assertEqual(0, visited.count()) visited = ResourceURI.objects.filter( - uri='http://test-counter-10-max-uris-10-multi-uri3.com') + uri="http://test-counter-10-max-uris-10-multi-uri3.com" + ) self.assertEqual(0, visited.count()) visited = ResourceURI.objects.filter( - uri='http://test-counter-10-max-uris-10-multi-uri4.com') + uri="http://test-counter-10-max-uris-10-multi-uri4.com" + ) self.assertEqual(0, visited.count()) visited = ResourceURI.objects.filter( - uri='http://test-counter-10-max-uris-10-multi-uri5.com') + uri="http://test-counter-10-max-uris-10-multi-uri5.com" + ) self.assertEqual(0, visited.count()) visited = ResourceURI.objects.filter( - uri='http://test-counter-10-max-uris-10-multi-uri6.com') + uri="http://test-counter-10-max-uris-10-multi-uri6.com" + ) self.assertEqual(0, visited.count()) visited = ResourceURI.objects.filter( - uri='http://test-counter-10-max-uris-10-multi-uri7.com') + 
uri="http://test-counter-10-max-uris-10-multi-uri7.com" + ) self.assertEqual(0, visited.count()) visited = ResourceURI.objects.filter( - uri='http://test-counter-10-max-uris-10-multi-uri8.com') + uri="http://test-counter-10-max-uris-10-multi-uri8.com" + ) self.assertEqual(0, visited.count()) visited = ResourceURI.objects.filter( - uri='http://test-counter-10-max-uris-10-multi-uri9.com') + uri="http://test-counter-10-max-uris-10-multi-uri9.com" + ) self.assertEqual(0, visited.count()) visited = ResourceURI.objects.filter( - uri='http://test-counter-10-max-uris-10-multi-uri10.com') + uri="http://test-counter-10-max-uris-10-multi-uri10.com" + ) self.assertEqual(0, visited.count()) visited = ResourceURI.objects.filter( - uri='http://test-counter-10-max-uris-10-multi-uri11.com') + uri="http://test-counter-10-max-uris-10-multi-uri11.com" + ) self.assertEqual(0, visited.count()) def test_visit_uri_with_counter_3_max_uris_3_multi_uri(self): # setup # build a test visitor and register it in a router - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" def mock_visitor(uri): - return [URI(uri='http://test-counter-3-max-uris-3-multi-uri1.com'), - URI(uri='http://test-counter-3-max-uris-3-multi-uri2.com'), - URI(uri='http://test-counter-3-max-uris-3-multi-uri3.com'), - URI(uri='http://test-counter-3-max-uris-3-multi-uri4.com')], None, None + return ( + [ + URI(uri="http://test-counter-3-max-uris-3-multi-uri1.com"), + URI(uri="http://test-counter-3-max-uris-3-multi-uri2.com"), + URI(uri="http://test-counter-3-max-uris-3-multi-uri3.com"), + URI(uri="http://test-counter-3-max-uris-3-multi-uri4.com"), + ], + None, + None, + ) router = Router() router.append(uri, mock_visitor) @@ -223,31 +292,45 @@ def mock_visitor(uri): key = visitor.__module__ + visitor.__name__ counter[key] += 1 - visit_uri(resource_uri, _visit_router=router, - max_uris=1, uri_counter_by_visitor=counter) + visit_uri( + resource_uri, + _visit_router=router, + max_uris=1, + uri_counter_by_visitor=counter, + ) visited = ResourceURI.objects.filter( - uri='http://test-counter-3-max-uris-3-multi-uri1.com') + uri="http://test-counter-3-max-uris-3-multi-uri1.com" + ) self.assertEqual(1, visited.count()) visited = ResourceURI.objects.filter( - uri='http://test-counter-3-max-uris-3-multi-uri2.com') + uri="http://test-counter-3-max-uris-3-multi-uri2.com" + ) self.assertEqual(0, visited.count()) visited = ResourceURI.objects.filter( - uri='http://test-counter-3-max-uris-3-multi-uri3.com') + uri="http://test-counter-3-max-uris-3-multi-uri3.com" + ) self.assertEqual(0, visited.count()) visited = ResourceURI.objects.filter( - uri='http://test-counter-3-max-uris-3-multi-uri3.com') + uri="http://test-counter-3-max-uris-3-multi-uri3.com" + ) self.assertEqual(0, visited.count()) def test_visit_uri_with_counter_1_max_uris_1_multi_uri(self): # setup # build a test visitor and register it in a router - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" def mock_visitor(uri): - return [URI(uri='http://test-counter-1-max-uris-1-multi-uri1.com'), - URI(uri='http://test-counter-1-max-uris-1-multi-uri2.com')], None, None + return ( + [ + URI(uri="http://test-counter-1-max-uris-1-multi-uri1.com"), + URI(uri="http://test-counter-1-max-uris-1-multi-uri2.com"), + ], + None, + None, + ) router = Router() router.append(uri, mock_visitor) @@ -264,24 +347,30 @@ def mock_visitor(uri): key = visitor.__module__ + visitor.__name__ counter[key] += 1 - visit_uri(resource_uri, _visit_router=router, - max_uris=1, uri_counter_by_visitor=counter) + visit_uri( 
+ resource_uri, + _visit_router=router, + max_uris=1, + uri_counter_by_visitor=counter, + ) visited = ResourceURI.objects.filter( - uri='http://test-counter-1-max-uris-1-multi-uri1.com') + uri="http://test-counter-1-max-uris-1-multi-uri1.com" + ) self.assertEqual(1, visited.count()) visited = ResourceURI.objects.filter( - uri='http://test-counter-1-max-uris-1-multi-uri2.com') + uri="http://test-counter-1-max-uris-1-multi-uri2.com" + ) self.assertEqual(0, visited.count()) def test_visit_uri_with_counter_10_max_uris_10(self): # setup # build a test visitor and register it in a router - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" def mock_visitor(uri): - return [URI(uri='http://test-counter-10-max-uris-10.com')], None, None + return [URI(uri="http://test-counter-10-max-uris-10.com")], None, None router = Router() router.append(uri, mock_visitor) @@ -298,20 +387,25 @@ def mock_visitor(uri): key = visitor.__module__ + visitor.__name__ counter[key] += 10 - visit_uri(resource_uri, _visit_router=router, - max_uris=10, uri_counter_by_visitor=counter) + visit_uri( + resource_uri, + _visit_router=router, + max_uris=10, + uri_counter_by_visitor=counter, + ) visited = ResourceURI.objects.filter( - uri='http://test-counter-10-max-uris-10.com') + uri="http://test-counter-10-max-uris-10.com" + ) self.assertEqual(1, visited.count()) def test_visit_uri_with_counter_3_max_uris_3(self): # setup # build a test visitor and register it in a router - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" def mock_visitor(uri): - return [URI(uri='http://test-counter-3-max-uris-3.com')], None, None + return [URI(uri="http://test-counter-3-max-uris-3.com")], None, None router = Router() router.append(uri, mock_visitor) @@ -328,20 +422,23 @@ def mock_visitor(uri): key = visitor.__module__ + visitor.__name__ counter[key] += 3 - visit_uri(resource_uri, _visit_router=router, - max_uris=3, uri_counter_by_visitor=counter) + visit_uri( + resource_uri, + _visit_router=router, + max_uris=3, + uri_counter_by_visitor=counter, + ) - visited = ResourceURI.objects.filter( - uri='http://test-counter-3-max-uris-3.com') + visited = ResourceURI.objects.filter(uri="http://test-counter-3-max-uris-3.com") self.assertEqual(1, visited.count()) def test_visit_uri_with_counter_1_max_uris_1(self): # setup # build a test visitor and register it in a router - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" def mock_visitor(uri): - return [URI(uri='http://test-counter-1-max-uris-1.com')], None, None + return [URI(uri="http://test-counter-1-max-uris-1.com")], None, None router = Router() router.append(uri, mock_visitor) @@ -358,20 +455,23 @@ def mock_visitor(uri): key = visitor.__module__ + visitor.__name__ counter[key] += 1 - visit_uri(resource_uri, _visit_router=router, - max_uris=1, uri_counter_by_visitor=counter) + visit_uri( + resource_uri, + _visit_router=router, + max_uris=1, + uri_counter_by_visitor=counter, + ) - visited = ResourceURI.objects.filter( - uri='http://test-counter-1-max-uris-1.com') + visited = ResourceURI.objects.filter(uri="http://test-counter-1-max-uris-1.com") self.assertEqual(1, visited.count()) def test_visit_uri_with_counter_2_max_uris_1(self): # setup # build a test visitor and register it in a router - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" def mock_visitor(uri): - return [URI(uri='http://test-counter-2-max-uris-1.com')], None, None + return [URI(uri="http://test-counter-2-max-uris-1.com")], None, None router = Router() router.append(uri, mock_visitor) 
@@ -388,20 +488,23 @@ def mock_visitor(uri): key = visitor.__module__ + visitor.__name__ counter[key] += 2 - visit_uri(resource_uri, _visit_router=router, - max_uris=1, uri_counter_by_visitor=counter) + visit_uri( + resource_uri, + _visit_router=router, + max_uris=1, + uri_counter_by_visitor=counter, + ) - visited = ResourceURI.objects.filter( - uri='http://test-counter-2-max-uris-1.com') + visited = ResourceURI.objects.filter(uri="http://test-counter-2-max-uris-1.com") self.assertEqual(0, visited.count()) def test_visit_uri_with_counter_1_no_max_uri(self): # setup # build a test visitor and register it in a router - uri = 'http://nexb_visit.com' + uri = "http://nexb_visit.com" def mock_visitor(uri): - return [URI(uri='http://test-counter-2-max-uris-1.com')], None, None + return [URI(uri="http://test-counter-2-max-uris-1.com")], None, None router = Router() router.append(uri, mock_visitor) @@ -418,28 +521,28 @@ def mock_visitor(uri): key = visitor.__module__ + visitor.__name__ counter[key] += 1 - visit_uri( - resource_uri, _visit_router=router, uri_counter_by_visitor=counter) + visit_uri(resource_uri, _visit_router=router, uri_counter_by_visitor=counter) - visited = ResourceURI.objects.filter( - uri='http://test-counter-2-max-uris-1.com') + visited = ResourceURI.objects.filter(uri="http://test-counter-2-max-uris-1.com") self.assertEqual(1, visited.count()) class RunVisitTest(MiningTestCase): - def setUp(self): - self.uri = 'http://nexb_visit.com' + self.uri = "http://nexb_visit.com" def mock_visitor(uri): - return [URI(uri='http://test.com')], None, None + return [URI(uri="http://test.com")], None, None def mock_visitor2(uri): - return [ - URI(uri='http://test.com', package_url='pkg:npm/foobar@12.3.1'), - URI(uri='http://test.com', visited=True, - data={'some': 'data'}), - ], None, None + return ( + [ + URI(uri="http://test.com", package_url="pkg:npm/foobar@12.3.1"), + URI(uri="http://test.com", visited=True, data={"some": "data"}), + ], + None, + None, + ) self.router = Router() self.router.append(self.uri, mock_visitor) @@ -456,36 +559,36 @@ def tearDown(self): def test_visit_uri(self): visit_uri(self.resource_uri, _visit_router=self.router) - visited = ResourceURI.objects.filter(uri='http://test.com') + visited = ResourceURI.objects.filter(uri="http://test.com") self.assertEqual(1, visited.count()) def test_visit_uri_with_no_route_defined_does_not_visit(self): - resource_uri = ResourceURI.objects.create( - uri='http://undefined-route.com') + resource_uri = ResourceURI.objects.create(uri="http://undefined-route.com") resource_uri.is_visitable = True resource_uri.save() visit_uri(resource_uri, _visit_router=self.router) try: - ResourceURI.objects.get(uri='http://test.com') - self.fail('URI should not have been created.') + ResourceURI.objects.get(uri="http://test.com") + self.fail("URI should not have been created.") except ResourceURI.DoesNotExist: pass def test_run_visit_command(self): output = StringIO() - management.call_command('run_visit', exit_on_empty=True, stdout=output) - expected = 'Visited 0 URIs\nInserted 0 new URIs\n' + management.call_command("run_visit", exit_on_empty=True, stdout=output) + expected = "Visited 0 URIs\nInserted 0 new URIs\n" self.assertEqual(expected, output.getvalue()) def test_visit_uri_always_inserts_new_uri(self): # test proper visit_uri(self.resource_uri, _visit_router=self.router2) - visited = ResourceURI.objects.filter( - uri='http://test.com').order_by('-package_url') + visited = ResourceURI.objects.filter(uri="http://test.com").order_by( + 
"-package_url" + ) expected = [ - URI(uri=u'http://test.com', data=u"{'some': 'data'}"), - URI(uri=u'http://test.com', package_url='pkg:npm/foobar@12.3.1'), + URI(uri="http://test.com", data="{'some': 'data'}"), + URI(uri="http://test.com", package_url="pkg:npm/foobar@12.3.1"), ] results = sorted(URI.from_db(ruri) for ruri in visited) @@ -493,15 +596,13 @@ def test_visit_uri_always_inserts_new_uri(self): def test_visit_uri_always_inserts_new_uri_unless_there_is_pending_for_visit(self): # create a uri that is already pending visit - resource_uri2 = ResourceURI.objects.insert(uri='http://test.com') + resource_uri2 = ResourceURI.objects.insert(uri="http://test.com") resource_uri2.is_visitable = True resource_uri2.save() # test proper visit_uri(self.resource_uri, _visit_router=self.router) - visited = ResourceURI.objects.filter(uri='http://test.com') - expected = [ - resource_uri2 - ] + visited = ResourceURI.objects.filter(uri="http://test.com") + expected = [resource_uri2] self.assertEqual(expected, list(visited)) diff --git a/minecode/tests/test_seed.py b/minecode/tests/test_seed.py index 61634ce7..a6d896fe 100644 --- a/minecode/tests/test_seed.py +++ b/minecode/tests/test_seed.py @@ -8,41 +8,41 @@ # -from datetime import timedelta import os +from datetime import timedelta from io import StringIO +from unittest.mock import patch from django.core import management from django.utils import timezone -from mock import patch +from minecode import seed from minecode.management.commands.seed import SEED_PRIORITY from minecode.management.commands.seed import insert_seed_uris from minecode.models import ResourceURI -from minecode import seed from minecode.utils_test import MiningTestCase class RevisitSeedTest(MiningTestCase): - def setUp(self): class SampleSeed0(seed.Seeder): def get_seeds(self): - yield 'https://pypi.python.org/pypi/foo/json' + yield "https://pypi.python.org/pypi/foo/json" class SampleSeed1(seed.Seeder): revisit_after = 1 # hours def get_seeds(self): - yield 'https://pypi.python.org/pypi/foo/json' + yield "https://pypi.python.org/pypi/foo/json" self.SampleSeed0 = SampleSeed0() self.SampleSeed1 = SampleSeed1() def test_insert_seed_uris_revisit_before_10_days_custom_revisit_after(self): # we consume generators to insert seed URI - list(insert_seed_uris(pattern='.*python.org/pypi/.*', - seeders=[self.SampleSeed1])) + list( + insert_seed_uris(pattern=".*python.org/pypi/.*", seeders=[self.SampleSeed1]) + ) seeded = ResourceURI.objects.all() self.assertEqual(1, len(seeded)) @@ -51,15 +51,17 @@ def test_insert_seed_uris_revisit_before_10_days_custom_revisit_after(self): s.last_visit_date = timezone.now() - timedelta(minutes=10) s.save() - list(insert_seed_uris(pattern='.*python.org/pypi/.*', - seeders=[self.SampleSeed1])) + list( + insert_seed_uris(pattern=".*python.org/pypi/.*", seeders=[self.SampleSeed1]) + ) seeded = ResourceURI.objects.all() self.assertEqual(1, len(seeded)) def test_insert_seed_uris_revisit_after_10_days_custom_revisit_after(self): # we consume generators to insert seed URI - list(insert_seed_uris(pattern='.*python.org/pypi/.*', - seeders=[self.SampleSeed1])) + list( + insert_seed_uris(pattern=".*python.org/pypi/.*", seeders=[self.SampleSeed1]) + ) seeded = ResourceURI.objects.all() self.assertEqual(1, len(seeded)) @@ -68,15 +70,17 @@ def test_insert_seed_uris_revisit_after_10_days_custom_revisit_after(self): s.last_visit_date = timezone.now() - timedelta(days=10) s.save() - list(insert_seed_uris(pattern='.*python.org/pypi/.*', - seeders=[self.SampleSeed1])) + list( + 
insert_seed_uris(pattern=".*python.org/pypi/.*", seeders=[self.SampleSeed1]) + ) seeded = ResourceURI.objects.all() self.assertEqual(2, len(seeded)) def test_insert_seed_uris_revisit_before_10_days_default_revisit_after(self): # we consume generators to insert seed URI - list(insert_seed_uris(pattern='.*python.org/pypi/.*', - seeders=[self.SampleSeed0])) + list( + insert_seed_uris(pattern=".*python.org/pypi/.*", seeders=[self.SampleSeed0]) + ) seeded = ResourceURI.objects.all() self.assertEqual(1, len(seeded)) @@ -85,15 +89,17 @@ def test_insert_seed_uris_revisit_before_10_days_default_revisit_after(self): s.last_visit_date = timezone.now() - timedelta(days=9) s.save() - list(insert_seed_uris(pattern='.*python.org/pypi/.*', - seeders=[self.SampleSeed0])) + list( + insert_seed_uris(pattern=".*python.org/pypi/.*", seeders=[self.SampleSeed0]) + ) seeded = ResourceURI.objects.all() self.assertEqual(1, len(seeded)) def test_insert_seed_uris_revisit_after_10_days_default_revisit_after(self): # we consume generators to insert seed URI - list(insert_seed_uris(pattern='.*python.org/pypi/.*', - seeders=[self.SampleSeed0])) + list( + insert_seed_uris(pattern=".*python.org/pypi/.*", seeders=[self.SampleSeed0]) + ) seeded = ResourceURI.objects.all() self.assertEqual(1, len(seeded)) @@ -102,42 +108,42 @@ def test_insert_seed_uris_revisit_after_10_days_default_revisit_after(self): s.last_visit_date = timezone.now() - timedelta(days=10) s.save() - list(insert_seed_uris(pattern='.*python.org/pypi/.*', - seeders=[self.SampleSeed0])) + list( + insert_seed_uris(pattern=".*python.org/pypi/.*", seeders=[self.SampleSeed0]) + ) seeded = ResourceURI.objects.all() self.assertEqual(2, len(seeded)) class SeedTest(MiningTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): - class SampleSeed0(seed.Seeder): def get_seeds(self): - yield 'https://pypi.python.org/pypi/thatbar/json' - yield 'https://pypi.python.org/pypi/that/json' - yield 'https://elsewehre.com' + yield "https://pypi.python.org/pypi/thatbar/json" + yield "https://pypi.python.org/pypi/that/json" + yield "https://elsewehre.com" class SampleSeed1(seed.Seeder): def get_seeds(self): - yield 'https://pypi.python.org/pypi/igloo/json' - yield 'https://pypi.python.org/pypi/someigloo/json' + yield "https://pypi.python.org/pypi/igloo/json" + yield "https://pypi.python.org/pypi/someigloo/json" class SampleSeed2(seed.Seeder): def get_seeds(self): - yield 'https://pypi.python.org/pypi/igloo2/json' - yield 'https://pypi.python.org/pypi/otherigloo/json' + yield "https://pypi.python.org/pypi/igloo2/json" + yield "https://pypi.python.org/pypi/otherigloo/json" class SampleSeed3(seed.Seeder): def get_seeds(self): - yield 'https://pypi.python.org/pypi/foo/json' - yield 'https://pypi.python.org/pypi/foobar/json' + yield "https://pypi.python.org/pypi/foo/json" + yield "https://pypi.python.org/pypi/foobar/json" class SampleSeed4(seed.Seeder): def get_seeds(self): - yield 'https://pypi.python.org/pypi/foo/json' - yield 'https://pypi.python.org/pypi/foobaz/json' + yield "https://pypi.python.org/pypi/foo/json" + yield "https://pypi.python.org/pypi/foobaz/json" self.SampleSeed0 = SampleSeed0() self.SampleSeed1 = SampleSeed1() @@ -145,14 +151,14 @@ def get_seeds(self): self.SampleSeed3 = SampleSeed3() self.SampleSeed4 = SampleSeed4() - @patch('minecode.seed.get_active_seeders') + @patch("minecode.seed.get_active_seeders") def test_seed_command(self, 
mock_get_active_seeders): output = StringIO() mock_get_active_seeders.return_value = [self.SampleSeed0] - before = list(ResourceURI.objects.all().values_list('id')) + before = list(ResourceURI.objects.all().values_list("id")) - management.call_command('seed', pattern=None, stdout=output) - expected = 'Inserted 3 seed URIs\n' + management.call_command("seed", pattern=None, stdout=output) + expected = "Inserted 3 seed URIs\n" self.assertEqual(expected, output.getvalue()) if before: @@ -160,32 +166,39 @@ def test_seed_command(self, mock_get_active_seeders): else: seeded = ResourceURI.objects.all() - expected = sorted([ - 'https://pypi.python.org/pypi/thatbar/json', - 'https://pypi.python.org/pypi/that/json', - 'https://elsewehre.com', - ]) + expected = sorted( + [ + "https://pypi.python.org/pypi/thatbar/json", + "https://pypi.python.org/pypi/that/json", + "https://elsewehre.com", + ] + ) self.assertEqual(expected, sorted([s.uri for s in seeded])) self.assertTrue(not all(s.is_visitable for s in seeded)) self.assertEqual(3, len([s.is_visitable for s in seeded])) self.assertTrue(all(s.priority == SEED_PRIORITY for s in seeded)) - @patch('minecode.seed.get_active_seeders') - def test_insert_seed_uris_inserts_uris_for_active_seeders_with_pattern(self, mock_get_active_seeders): + @patch("minecode.seed.get_active_seeders") + def test_insert_seed_uris_inserts_uris_for_active_seeders_with_pattern( + self, mock_get_active_seeders + ): mock_get_active_seeders.return_value = [self.SampleSeed1] - before = list(ResourceURI.objects.all().values_list('id')) + before = list(ResourceURI.objects.all().values_list("id")) seeders = seed.get_active_seeders() - results = sorted(insert_seed_uris( - pattern='.*python.*igloo.json', seeders=seeders)) + results = sorted( + insert_seed_uris(pattern=".*python.*igloo.json", seeders=seeders) + ) if before: seeded = ResourceURI.objects.exclude(uri__in=before) else: seeded = ResourceURI.objects.all() - expected = sorted([ - 'https://pypi.python.org/pypi/igloo/json', - 'https://pypi.python.org/pypi/someigloo/json', - ]) + expected = sorted( + [ + "https://pypi.python.org/pypi/igloo/json", + "https://pypi.python.org/pypi/someigloo/json", + ] + ) self.assertEqual(expected, sorted(results)) self.assertEqual(expected, sorted([s.uri for s in seeded])) @@ -193,7 +206,7 @@ def test_insert_seed_uris_inserts_uris_for_active_seeders_with_pattern(self, moc self.assertTrue(all(s.priority == SEED_PRIORITY for s in seeded)) def test_insert_seed_uris_inserts_uris_for_active_seeders_without_pattern(self): - before = list(ResourceURI.objects.all().values_list('id')) + before = list(ResourceURI.objects.all().values_list("id")) results = list(insert_seed_uris(seeders=[self.SampleSeed1])) @@ -202,10 +215,12 @@ def test_insert_seed_uris_inserts_uris_for_active_seeders_without_pattern(self): else: seeded = ResourceURI.objects.all() - expected = sorted([ - 'https://pypi.python.org/pypi/igloo/json', - 'https://pypi.python.org/pypi/someigloo/json', - ]) + expected = sorted( + [ + "https://pypi.python.org/pypi/igloo/json", + "https://pypi.python.org/pypi/someigloo/json", + ] + ) self.assertEqual(expected, sorted(results)) self.assertEqual(expected, sorted([s.uri for s in seeded])) @@ -214,7 +229,7 @@ def test_insert_seed_uris_inserts_uris_for_active_seeders_without_pattern(self): def test_insert_seed_uris_does_not_insert_duplicate(self): seeders = [self.SampleSeed3, self.SampleSeed4] - before = list(ResourceURI.objects.all().values_list('id')) + before = 
list(ResourceURI.objects.all().values_list("id")) # seed twice seed_results = sorted(insert_seed_uris(seeders=seeders)) no_seed_results = sorted(insert_seed_uris()) @@ -224,11 +239,13 @@ def test_insert_seed_uris_does_not_insert_duplicate(self): else: seeded = ResourceURI.objects.all() - expected = sorted([ - 'https://pypi.python.org/pypi/foo/json', - 'https://pypi.python.org/pypi/foobar/json', - 'https://pypi.python.org/pypi/foobaz/json', - ]) + expected = sorted( + [ + "https://pypi.python.org/pypi/foo/json", + "https://pypi.python.org/pypi/foobar/json", + "https://pypi.python.org/pypi/foobaz/json", + ] + ) self.assertEqual(expected, sorted(seed_results)) self.assertEqual([], no_seed_results) @@ -241,13 +258,13 @@ def test_get_active_seeders(self): # and needs to be updated each time we enable a new seed seeds = [c.__class__.__name__ for c in seed.get_active_seeders()] expected = [ - 'MavenSeed', + "MavenSeed", ] assert sorted(expected) == sorted(seeds) def test_get_configured_seeders(self): seeders = seed.get_configured_seeders() expected = [ - 'minecode.miners.maven.MavenSeed', + "minecode.miners.maven.MavenSeed", ] assert sorted(expected) == sorted(seeders) diff --git a/minecode/tests/test_tasks.py b/minecode/tests/test_tasks.py index c020d9ef..3ef43943 100644 --- a/minecode/tests/test_tasks.py +++ b/minecode/tests/test_tasks.py @@ -9,39 +9,38 @@ import json import os +from unittest import mock from django.test import TestCase -from unittest import mock +from minecode import tasks from minecode.models import ScannableURI -from packagedb.models import Package from minecode.utils_test import JsonBasedTesting -from minecode import tasks +from packagedb.models import Package class MinecodeTasksTestCase(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): self.package1 = Package.objects.create( - download_url='https://test-url.com/package1.tar.gz', - type='type1', - name='name1', - version='1.0', + download_url="https://test-url.com/package1.tar.gz", + type="type1", + name="name1", + version="1.0", ) self.scannable_uri1 = ScannableURI.objects.create( - uri='https://test-url.com/package1.tar.gz', - package=self.package1 + uri="https://test-url.com/package1.tar.gz", package=self.package1 ) self.project_extra_data1 = { - 'md5': 'md5', - 'sha1': 'sha1', - 'sha256': 'sha256', - 'sha512': 'sha512', - 'size': 100, + "md5": "md5", + "sha1": "sha1", + "sha256": "sha256", + "sha512": "sha512", + "size": 100, } - @mock.patch('os.remove') + @mock.patch("os.remove") def test_minecode_tasks_process_scan_results(self, mock_delete): mock_delete.side_effect = [None, None] @@ -53,9 +52,10 @@ def test_minecode_tasks_process_scan_results(self, mock_delete): self.assertFalse(self.package1.declared_license_expression) self.assertFalse(self.package1.copyright) self.assertEqual(0, self.package1.resources.count()) - scan_file_location = self.get_test_loc('scancodeio/get_scan_data.json') + scan_file_location = self.get_test_loc("scancodeio/get_scan_data.json") summary_file_location = self.get_test_loc( - 'scancodeio/scan_summary_response.json') + "scancodeio/scan_summary_response.json" + ) tasks.process_scan_results( self.scannable_uri1.uuid, scan_results_location=scan_file_location, @@ -63,23 +63,24 @@ def test_minecode_tasks_process_scan_results(self, mock_delete): project_extra_data=self.project_extra_data1, ) self.package1.refresh_from_db() - self.assertEqual('md5', 
self.package1.md5) - self.assertEqual('sha1', self.package1.sha1) - self.assertEqual('sha256', self.package1.sha256) - self.assertEqual('sha512', self.package1.sha512) + self.assertEqual("md5", self.package1.md5) + self.assertEqual("sha1", self.package1.sha1) + self.assertEqual("sha256", self.package1.sha256) + self.assertEqual("sha512", self.package1.sha512) self.assertEqual(100, self.package1.size) + self.assertEqual("apache-2.0", self.package1.declared_license_expression) self.assertEqual( - 'apache-2.0', self.package1.declared_license_expression) - self.assertEqual( - 'Copyright (c) Apache Software Foundation', self.package1.copyright) + "Copyright (c) Apache Software Foundation", self.package1.copyright + ) self.assertFalse(self.scannable_uri1.scan_error) self.assertEqual(64, self.package1.resources.count()) def test_minecode_tasks_process_scan_results_scannableuri_does_not_exist(self): - nonexisting_uuid = '420db78a-625f-4622-b1a0-93d1ea853194' - scan_file_location = self.get_test_loc('scancodeio/get_scan_data.json') + nonexisting_uuid = "420db78a-625f-4622-b1a0-93d1ea853194" + scan_file_location = self.get_test_loc("scancodeio/get_scan_data.json") summary_file_location = self.get_test_loc( - 'scancodeio/scan_summary_response.json') + "scancodeio/scan_summary_response.json" + ) project_extra_data = json.dumps(self.project_extra_data1) with self.assertRaises(Exception) as context: @@ -89,5 +90,5 @@ def test_minecode_tasks_process_scan_results_scannableuri_does_not_exist(self): scan_summary_location=summary_file_location, project_extra_data=project_extra_data, ) - expected_message = f'ScannableURI {nonexisting_uuid} does not exist!' + expected_message = f"ScannableURI {nonexisting_uuid} does not exist!" self.assertIn(expected_message, str(context.exception)) diff --git a/minecode/tests/test_utils.py b/minecode/tests/test_utils.py index 4f23d11f..912cd173 100644 --- a/minecode/tests/test_utils.py +++ b/minecode/tests/test_utils.py @@ -19,57 +19,55 @@ class UtilsTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def test_stringify_null_purl_fields_with_missing_purl_fields(self): - common_data = { - 'type': None - } + common_data = {"type": None} utils.stringify_null_purl_fields(common_data) self.assertEqual(1, len(common_data)) - self.assertEqual('', common_data['type']) + self.assertEqual("", common_data["type"]) def test_stringify_null_purl_fields(self): common_data = { - 'type': None, - 'namespace': None, - 'name': None, - 'version': None, - 'qualifiers': None, - 'subpath': None + "type": None, + "namespace": None, + "name": None, + "version": None, + "qualifiers": None, + "subpath": None, } utils.stringify_null_purl_fields(common_data) for d in common_data: self.assertIsNotNone(common_data[d]) - self.assertEqual('', common_data[d]) + self.assertEqual("", common_data[d]) def test_set_purl(self): common_data = dict( - type='generic', - name='openssl', - description='The OpenSSL Project is a collaborative effort.', + type="generic", + name="openssl", + description="The OpenSSL Project is a collaborative effort.", ) package = scan_models.Package(**common_data) - package.set_purl('pkg:generic/openssl@1.0.2o') + package.set_purl("pkg:generic/openssl@1.0.2o") self.assertEqual(None, package.namespace) - self.assertEqual('generic', package.type) - self.assertEqual('openssl', package.name) - self.assertEqual('1.0.2o', package.version) + self.assertEqual("generic", 
package.type) + self.assertEqual("openssl", package.name) + self.assertEqual("1.0.2o", package.version) self.assertEqual({}, package.qualifiers) self.assertEqual(None, package.subpath) def test_is_int(self): self.assertTrue(utils.is_int(0)) - self.assertFalse(utils.is_int('a')) + self.assertFalse(utils.is_int("a")) def test_validate_uuid(self): - invalid_uuid1 = 'invalid' - invalid_uuid2 = '123e4567-e89b-12d3-a456-42665544000G' - valid_uuid = 'c2cf7ef0-d3be-4011-bda7-8eb4a196eef2' + invalid_uuid1 = "invalid" + invalid_uuid2 = "123e4567-e89b-12d3-a456-42665544000G" + valid_uuid = "c2cf7ef0-d3be-4011-bda7-8eb4a196eef2" for uuid, expected_result in [ [invalid_uuid1, False], diff --git a/minecode/tests/test_version.py b/minecode/tests/test_version.py index 8f278e59..e3eaee63 100644 --- a/minecode/tests/test_version.py +++ b/minecode/tests/test_version.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # purldb is a trademark of nexB Inc. @@ -17,203 +16,202 @@ class VersionHintTestCase(unittest.TestCase): - def version_tester(self, versions, ignore_pre_releases=False): """Test versions mapping of (path, expected)""" for path in versions: self.assertEqual( versions[path], - version_hint(path, ignore_pre_releases=ignore_pre_releases) + version_hint(path, ignore_pre_releases=ignore_pre_releases), ) def test_version_hint_base(self): versions = { - '/xmlgraphics/fop/source/fop-1.0-src.zip': '1.0', - '/xml/xindice/xml-xindice-1.2m1-src.zip': '1.2m1', - '/xmlgraphics/fop/binaries/fop-0.94-bin-jdk1.3.tar.gz': '0.94', - '/xmlgraphics/batik/batik-src-1.7beta1.zip': '1.7beta1', - '/xmlgraphics/batik/batik-1.7-jre13.zip': '1.7', - '/xmlbeans/source/xmlbeans-2.3.0-src.tgz': '2.3.0', - '/xml/xindice/source/xml-xindice-1.2m1-src.tar.gz': '1.2m1', - '/xml/xerces-p/binaries/XML-Xerces-2.3.0-4-win32.zip': '2.3.0-4', - '/xml/xerces-p/source/XML-Xerces-2.3.0-3.tar.gz': '2.3.0-3', - '/xml/xalan-j/source/xalan-j_2_7_0-src-2jars.tar.gz': '2_7_0', - '/xml/security/java-library/xml-security-src-1_0_5D2.zip': '1_0_5D2', - '/xml/commons/binaries/xml-commons-external-1.4.01-bin.zip': '1.4.01', - '/xml/commons/xml-commons-1.0.b2.zip': '1.0.b2', - '/xml/cocoon/3.0/cocoon-all-3.0.0-alpha-1-dist.tar.gz': '3.0.0-alpha-1', - '/xerces/j/source/Xerces-J-tools.2.10.0-xml-schema-1.1-beta.tar.gz': '2.10.0', - '/xerces/c/3/binaries/xerces-c-3.1.1-x86_64-solaris-cc-5.10.tar.gz': '3.1.1', - '/xerces/c/3/binaries/xerces-c-3.1.1-x86_64-windows-vc-8.0.zip': '3.1.1', - '/xerces/c/2/binaries/xerces-c_2_8_0-x86-windows-vc_7_1.zip': '2_8_0', - '/ws/woden/1.0M8/apache-woden-src-1.0M8.tar.gz': '1.0M8', - '/ws/scout/0_7rc1/source/scout-0.7rc1-src.zip': '0.7rc1', - '/ws/juddi/3_0/juddi-portal-bundle-3.0.0.rc1.zip': '3.0.0.rc1', - '/ws/juddi/3_0/juddi-portal-bundle-3.0.0.beta.zip': '3.0.0.beta', - '/ws/juddi/2_0RC7/juddi-tomcat-2.0rc7.zip': '2.0rc7', - '/ws/axis2/tools/1_4_1/axis2-wsdl2code-maven-plugin-1.4.1.jar': '1.4.1', - '/ws/axis/1_4/axis-src-1_4.zip': '1_4', - '/tuscany/java/sca/2.0-M5/apache-tuscany-sca-all-2.0-M5-src.tar.gz': '2.0-M5', - '/ws/axis-c/source/win32/axis-c-1.6b-Win32-trace-src.zip': '1.6b', - '/turbine/turbine-2.3.3-rc1/source/turbine-2.3.3-RC1-src.zip': '2.3.3-RC1', - '/tomcat/tomcat-connectors/jk/binaries/win64/jk-1.2.30/ia64/symbols-1.2.30.zip': '1.2.30', - '/tomcat/tomcat-7/v7.0.0-beta/bin/apache-tomcat-7.0.0-windows-i64.zip': '7.0.0', - '/tomcat/tomcat-4/v4.1.40/bin/apache-tomcat-4.1.40-LE-jdk14.exe': '4.1.40', - 
'/tapestry/tapestry-src-5.1.0.5.tar.gz': '5.1.0.5', - '/spamassassin/source/Mail-SpamAssassin-rules-3.3.0.r901671.tgz': '3.3.0.r901671', - '/spamassassin/Mail-SpamAssassin-rules-3.3.1.r923257.tgz': '3.3.1.r923257', - '/shindig/1.1-BETA5-incubating/shindig-1.1-BETA5-incubating-source.zip': '1.1-BETA5', - '/servicemix/nmr/1.0.0-m3/apache-servicemix-nmr-1.0.0-m3-src.tar.gz': '1.0.0-m3', - '/qpid/0.6/qpid-dotnet-0-10-0.6.zip': '0.6', - '/openjpa/2.0.0-beta/apache-openjpa-2.0.0-beta-binary.zip': '2.0.0-beta', - '/myfaces/source/portlet-bridge-2.0.0-alpha-2-src-all.tar.gz': '2.0.0-alpha-2', - '/myfaces/source/myfaces-extval20-2.0.3-src.tar.gz': '2.0.3', - '/geronimo/eclipse/updates/plugins/org.apache.geronimo.st.v21.ui_2.1.1.jar': '2.1.1', - '/directory/studio/update/1.x/plugins/org.apache.directory.studio.aciitemeditor_1.5.2.v20091211.jar': '1.5.2.v20091211', - '/db/torque/torque-3.3/source/torque-gen-3.3-RC3-src.zip': '3.3-RC3', - '/cayenne/cayenne-3.0B1.tar.gz': '3.0B1', - '/cayenne/cayenne-3.0M4-macosx.dmg': '3.0M4', - '/xmlgraphics/batik/batik-docs-current.zip': 'current', - '/xmlgraphics/batik/batik-docs-previous.zip': 'previous', - '/poi/dev/bin/poi-bin-3.7-beta1-20100620.zip': '3.7-beta1-20100620', - '/excalibur/avalon-logkit/source/excalibur-logkit-2.0.dev-0-src.zip': '2.0.dev-0', - '/db/derby/db-derby-10.4.2.0/derby_core_plugin_10.4.2.zip': '10.4.2', - '/httpd/modpython/win/2.7.1/mp152dll.zip': '2.7.1', - '/perl/mod_perl-1.31/apaci/mod_perl.config.sh': '1.31', - '/xml/xerces-j/old_xerces2/Xerces-J-bin.2.0.0.alpha.zip': '2.0.0.alpha', - '/xml/xerces-p/archives/XML-Xerces-1.7.0_0.tar.gz': '1.7.0_0', - '/httpd/docs/tools-2004-05-04.zip': '2004-05-04', - '/ws/axis2/c/M0_5/axis2c-src-M0.5.tar.gz': 'M0.5', - '/jakarta/poi/dev/src/jakarta-poi-1.8.0-dev-src.zip': '1.8.0-dev', - '/tapestry/tapestry-4.0-beta-8.zip': '4.0-beta-8', - '/openejb/3.0-beta-1/openejb-3.0-beta-1.zip': '3.0-beta-1', - '/tapestry/tapestry-4.0-rc-1.zip': '4.0-rc-1', - '/jakarta/tapestry/source/3.0-rc-3/Tapestry-3.0-rc-3-src.zip': '3.0-rc-3', - '/jakarta/lucene/binaries/lucene-1.3-final.tar.gz': '1.3-final', - '/jakarta/tapestry/binaries/3.0-beta-1a/Tapestry-3.0-beta-1a-bin.zip': '3.0-beta-1a', - '/poi/release/bin/poi-bin-3.0-FINAL-20070503.tar.gz': '3.0-FINAL-20070503', - '/harmony/milestones/M4/apache-harmony-hdk-r603534-linux-x86-32-libstdc++v6-snapshot.tar.gz': 'r603534', - '/ant/antidote/antidote-20050330.tar.bz2': '20050330', - '/apr/not-released/apr_20020725223645.tar.gz': '20020725223645', - '/ibatis/source/ibatis.net/src-revision-709676.zip': 'revision-709676', - '/ws/axis-c/source/win32/axis-c-src-1-2-win32.zip': '1-2', - '/jakarta/slide/most-recent-2.0rc1-binaries/jakarta-slide 2.0rc1 jakarta-tomcat-4.1.30.zip': '2.0rc1', - '/httpd/modpython/win/3.0.1/python2.2.1-apache2.0.43.zip': '2.2.1', - '/ant/ivyde/updatesite/features/org.apache.ivy.feature_2.1.0.cr1_20090319213629.jar': '2.1.0.cr1_20090319213629', - '/jakarta/poi/dev/bin/poi-2.0-pre1-20030517.jar': '2.0-pre1-20030517', - '/jakarta/poi/release/bin/jakarta-poi-1.5.0-FINAL-bin.zip': '1.5.0-FINAL', - '/jakarta/poi/release/bin/poi-bin-2.0-final-20040126.zip': '2.0-final-20040126', - '/activemq/apache-activemq/5.0.0/apache-activemq-5.0.0-sources.jar': '5.0.0', - '/turbine/turbine-2.2/source/jakarta-turbine-2.2-B1.tar.gz': '2.2-B1', - '/ant/ivyde/updatesite/features/org.apache.ivy.feature_2.0.0.cr1.jar': '2.0.0.cr1', - '/ant/ivyde/updatesite/features/org.apache.ivy.feature_2.0.0.final_20090108225011.jar': '2.0.0.final_20090108225011', - 
'/ws/axis/1_2RC3/axis-src-1_2RC3.zip': '1_2RC3', - '/commons/lang/old/v1.0-b1.1/commons-lang-1.0-b1.1.zip': '1.0-b1.1', - '/commons/net/binaries/commons-net-1.2.0-release.tar.gz': '1.2.0-release', - '/ant/ivyde/2.0.0.final/apache-ivyde-2.0.0.final-200907011148-RELEASE.tgz': '2.0.0.final-200907011148-RELEASE', - '/geronimo/eclipse/updates/plugins/org.apache.geronimo.jetty.j2ee.server.v11_1.0.0.jar': 'v11_1.0.0', - '/jakarta/cactus/binaries/jakarta-cactus-13-1.7.1-fixed.zip': '1.7.1-fixed', - '/jakarta/jakarta-turbine-maven/maven/jars/maven-1.0-b5-dev.20020731.085427.jar': '1.0-b5-dev.20020731.085427', - '/xml/xalan-j/source/xalan-j_2_5_D1-src.tar.gz': '2_5_D1', - '/ws/woden/IBuilds/I20051002_1145/woden-I20051002_1145.tar.bz2': 'I20051002_1145', - '/commons/beanutils/source/commons-beanutils-1.8.0-BETA-src.tar.gz': '1.8.0-BETA', - '/cocoon/BINARIES/cocoon-2.0.3-vm14-bin.tar.gz': '2.0.3-vm14', - '/felix/xliff_filters_v1_2_7_unix.jar': 'v1_2_7', - '/excalibur/releases/200702/excalibur-javadoc-r508111-15022007.tar.gz': 'r508111-15022007', - '/geronimo/eclipse/updates/features/org.apache.geronimo.v20.feature_2.0.0.jar': 'v20.feature_2.0.0', - '/geronimo/2.1.6/axis2-jaxws-1.3-G20090406.jar': '1.3-G20090406', - '/cassandra/debian/pool/main/c/cassandra/cassandra_0.4.0~beta1-1.diff.gz': '0.4.0~beta1', - '/ha-api-3.1.6.jar': '3.1.6', - 'ha-api-3.1.6.jar': '3.1.6', - 'fryPOS_20070919.exe': '20070919', + "/xmlgraphics/fop/source/fop-1.0-src.zip": "1.0", + "/xml/xindice/xml-xindice-1.2m1-src.zip": "1.2m1", + "/xmlgraphics/fop/binaries/fop-0.94-bin-jdk1.3.tar.gz": "0.94", + "/xmlgraphics/batik/batik-src-1.7beta1.zip": "1.7beta1", + "/xmlgraphics/batik/batik-1.7-jre13.zip": "1.7", + "/xmlbeans/source/xmlbeans-2.3.0-src.tgz": "2.3.0", + "/xml/xindice/source/xml-xindice-1.2m1-src.tar.gz": "1.2m1", + "/xml/xerces-p/binaries/XML-Xerces-2.3.0-4-win32.zip": "2.3.0-4", + "/xml/xerces-p/source/XML-Xerces-2.3.0-3.tar.gz": "2.3.0-3", + "/xml/xalan-j/source/xalan-j_2_7_0-src-2jars.tar.gz": "2_7_0", + "/xml/security/java-library/xml-security-src-1_0_5D2.zip": "1_0_5D2", + "/xml/commons/binaries/xml-commons-external-1.4.01-bin.zip": "1.4.01", + "/xml/commons/xml-commons-1.0.b2.zip": "1.0.b2", + "/xml/cocoon/3.0/cocoon-all-3.0.0-alpha-1-dist.tar.gz": "3.0.0-alpha-1", + "/xerces/j/source/Xerces-J-tools.2.10.0-xml-schema-1.1-beta.tar.gz": "2.10.0", + "/xerces/c/3/binaries/xerces-c-3.1.1-x86_64-solaris-cc-5.10.tar.gz": "3.1.1", + "/xerces/c/3/binaries/xerces-c-3.1.1-x86_64-windows-vc-8.0.zip": "3.1.1", + "/xerces/c/2/binaries/xerces-c_2_8_0-x86-windows-vc_7_1.zip": "2_8_0", + "/ws/woden/1.0M8/apache-woden-src-1.0M8.tar.gz": "1.0M8", + "/ws/scout/0_7rc1/source/scout-0.7rc1-src.zip": "0.7rc1", + "/ws/juddi/3_0/juddi-portal-bundle-3.0.0.rc1.zip": "3.0.0.rc1", + "/ws/juddi/3_0/juddi-portal-bundle-3.0.0.beta.zip": "3.0.0.beta", + "/ws/juddi/2_0RC7/juddi-tomcat-2.0rc7.zip": "2.0rc7", + "/ws/axis2/tools/1_4_1/axis2-wsdl2code-maven-plugin-1.4.1.jar": "1.4.1", + "/ws/axis/1_4/axis-src-1_4.zip": "1_4", + "/tuscany/java/sca/2.0-M5/apache-tuscany-sca-all-2.0-M5-src.tar.gz": "2.0-M5", + "/ws/axis-c/source/win32/axis-c-1.6b-Win32-trace-src.zip": "1.6b", + "/turbine/turbine-2.3.3-rc1/source/turbine-2.3.3-RC1-src.zip": "2.3.3-RC1", + "/tomcat/tomcat-connectors/jk/binaries/win64/jk-1.2.30/ia64/symbols-1.2.30.zip": "1.2.30", + "/tomcat/tomcat-7/v7.0.0-beta/bin/apache-tomcat-7.0.0-windows-i64.zip": "7.0.0", + "/tomcat/tomcat-4/v4.1.40/bin/apache-tomcat-4.1.40-LE-jdk14.exe": "4.1.40", + "/tapestry/tapestry-src-5.1.0.5.tar.gz": "5.1.0.5", + 
"/spamassassin/source/Mail-SpamAssassin-rules-3.3.0.r901671.tgz": "3.3.0.r901671", + "/spamassassin/Mail-SpamAssassin-rules-3.3.1.r923257.tgz": "3.3.1.r923257", + "/shindig/1.1-BETA5-incubating/shindig-1.1-BETA5-incubating-source.zip": "1.1-BETA5", + "/servicemix/nmr/1.0.0-m3/apache-servicemix-nmr-1.0.0-m3-src.tar.gz": "1.0.0-m3", + "/qpid/0.6/qpid-dotnet-0-10-0.6.zip": "0.6", + "/openjpa/2.0.0-beta/apache-openjpa-2.0.0-beta-binary.zip": "2.0.0-beta", + "/myfaces/source/portlet-bridge-2.0.0-alpha-2-src-all.tar.gz": "2.0.0-alpha-2", + "/myfaces/source/myfaces-extval20-2.0.3-src.tar.gz": "2.0.3", + "/geronimo/eclipse/updates/plugins/org.apache.geronimo.st.v21.ui_2.1.1.jar": "2.1.1", + "/directory/studio/update/1.x/plugins/org.apache.directory.studio.aciitemeditor_1.5.2.v20091211.jar": "1.5.2.v20091211", + "/db/torque/torque-3.3/source/torque-gen-3.3-RC3-src.zip": "3.3-RC3", + "/cayenne/cayenne-3.0B1.tar.gz": "3.0B1", + "/cayenne/cayenne-3.0M4-macosx.dmg": "3.0M4", + "/xmlgraphics/batik/batik-docs-current.zip": "current", + "/xmlgraphics/batik/batik-docs-previous.zip": "previous", + "/poi/dev/bin/poi-bin-3.7-beta1-20100620.zip": "3.7-beta1-20100620", + "/excalibur/avalon-logkit/source/excalibur-logkit-2.0.dev-0-src.zip": "2.0.dev-0", + "/db/derby/db-derby-10.4.2.0/derby_core_plugin_10.4.2.zip": "10.4.2", + "/httpd/modpython/win/2.7.1/mp152dll.zip": "2.7.1", + "/perl/mod_perl-1.31/apaci/mod_perl.config.sh": "1.31", + "/xml/xerces-j/old_xerces2/Xerces-J-bin.2.0.0.alpha.zip": "2.0.0.alpha", + "/xml/xerces-p/archives/XML-Xerces-1.7.0_0.tar.gz": "1.7.0_0", + "/httpd/docs/tools-2004-05-04.zip": "2004-05-04", + "/ws/axis2/c/M0_5/axis2c-src-M0.5.tar.gz": "M0.5", + "/jakarta/poi/dev/src/jakarta-poi-1.8.0-dev-src.zip": "1.8.0-dev", + "/tapestry/tapestry-4.0-beta-8.zip": "4.0-beta-8", + "/openejb/3.0-beta-1/openejb-3.0-beta-1.zip": "3.0-beta-1", + "/tapestry/tapestry-4.0-rc-1.zip": "4.0-rc-1", + "/jakarta/tapestry/source/3.0-rc-3/Tapestry-3.0-rc-3-src.zip": "3.0-rc-3", + "/jakarta/lucene/binaries/lucene-1.3-final.tar.gz": "1.3-final", + "/jakarta/tapestry/binaries/3.0-beta-1a/Tapestry-3.0-beta-1a-bin.zip": "3.0-beta-1a", + "/poi/release/bin/poi-bin-3.0-FINAL-20070503.tar.gz": "3.0-FINAL-20070503", + "/harmony/milestones/M4/apache-harmony-hdk-r603534-linux-x86-32-libstdc++v6-snapshot.tar.gz": "r603534", + "/ant/antidote/antidote-20050330.tar.bz2": "20050330", + "/apr/not-released/apr_20020725223645.tar.gz": "20020725223645", + "/ibatis/source/ibatis.net/src-revision-709676.zip": "revision-709676", + "/ws/axis-c/source/win32/axis-c-src-1-2-win32.zip": "1-2", + "/jakarta/slide/most-recent-2.0rc1-binaries/jakarta-slide 2.0rc1 jakarta-tomcat-4.1.30.zip": "2.0rc1", + "/httpd/modpython/win/3.0.1/python2.2.1-apache2.0.43.zip": "2.2.1", + "/ant/ivyde/updatesite/features/org.apache.ivy.feature_2.1.0.cr1_20090319213629.jar": "2.1.0.cr1_20090319213629", + "/jakarta/poi/dev/bin/poi-2.0-pre1-20030517.jar": "2.0-pre1-20030517", + "/jakarta/poi/release/bin/jakarta-poi-1.5.0-FINAL-bin.zip": "1.5.0-FINAL", + "/jakarta/poi/release/bin/poi-bin-2.0-final-20040126.zip": "2.0-final-20040126", + "/activemq/apache-activemq/5.0.0/apache-activemq-5.0.0-sources.jar": "5.0.0", + "/turbine/turbine-2.2/source/jakarta-turbine-2.2-B1.tar.gz": "2.2-B1", + "/ant/ivyde/updatesite/features/org.apache.ivy.feature_2.0.0.cr1.jar": "2.0.0.cr1", + "/ant/ivyde/updatesite/features/org.apache.ivy.feature_2.0.0.final_20090108225011.jar": "2.0.0.final_20090108225011", + "/ws/axis/1_2RC3/axis-src-1_2RC3.zip": "1_2RC3", + 
"/commons/lang/old/v1.0-b1.1/commons-lang-1.0-b1.1.zip": "1.0-b1.1", + "/commons/net/binaries/commons-net-1.2.0-release.tar.gz": "1.2.0-release", + "/ant/ivyde/2.0.0.final/apache-ivyde-2.0.0.final-200907011148-RELEASE.tgz": "2.0.0.final-200907011148-RELEASE", + "/geronimo/eclipse/updates/plugins/org.apache.geronimo.jetty.j2ee.server.v11_1.0.0.jar": "v11_1.0.0", + "/jakarta/cactus/binaries/jakarta-cactus-13-1.7.1-fixed.zip": "1.7.1-fixed", + "/jakarta/jakarta-turbine-maven/maven/jars/maven-1.0-b5-dev.20020731.085427.jar": "1.0-b5-dev.20020731.085427", + "/xml/xalan-j/source/xalan-j_2_5_D1-src.tar.gz": "2_5_D1", + "/ws/woden/IBuilds/I20051002_1145/woden-I20051002_1145.tar.bz2": "I20051002_1145", + "/commons/beanutils/source/commons-beanutils-1.8.0-BETA-src.tar.gz": "1.8.0-BETA", + "/cocoon/BINARIES/cocoon-2.0.3-vm14-bin.tar.gz": "2.0.3-vm14", + "/felix/xliff_filters_v1_2_7_unix.jar": "v1_2_7", + "/excalibur/releases/200702/excalibur-javadoc-r508111-15022007.tar.gz": "r508111-15022007", + "/geronimo/eclipse/updates/features/org.apache.geronimo.v20.feature_2.0.0.jar": "v20.feature_2.0.0", + "/geronimo/2.1.6/axis2-jaxws-1.3-G20090406.jar": "1.3-G20090406", + "/cassandra/debian/pool/main/c/cassandra/cassandra_0.4.0~beta1-1.diff.gz": "0.4.0~beta1", + "/ha-api-3.1.6.jar": "3.1.6", + "ha-api-3.1.6.jar": "3.1.6", + "fryPOS_20070919.exe": "20070919", } self.version_tester(versions) def test_versions_with_7z_extensions(self): versions = { - 'http://heanet.dl.sourceforge.net/project/imadering/Imadering_500_211.7z': '500_211', - 'http://cznic.dl.sourceforge.net/project/lttty/LtTTY/LtTTY-0.6.0.2/lttty-src-0.602.7z': '0.602', - '/some/MPlayerGUI_0_6_79.7z': '0_6_79', - 'http://heanet.dl.sourceforge.net/project/qsubedit/0-2-1-23/QSubEdit-win32-0-2-1-23.7z': '0-2-1-23', - 'http://sourceforge.net/projects/vgmtoolbox/files/vgmtoolbox/VGMToolbox%20r930/vgmtoolbox_bin_r930.7z': 'r930', - 'blah/XMTunerSource-0-6-4.7z': '0-6-4', + "http://heanet.dl.sourceforge.net/project/imadering/Imadering_500_211.7z": "500_211", + "http://cznic.dl.sourceforge.net/project/lttty/LtTTY/LtTTY-0.6.0.2/lttty-src-0.602.7z": "0.602", + "/some/MPlayerGUI_0_6_79.7z": "0_6_79", + "http://heanet.dl.sourceforge.net/project/qsubedit/0-2-1-23/QSubEdit-win32-0-2-1-23.7z": "0-2-1-23", + "http://sourceforge.net/projects/vgmtoolbox/files/vgmtoolbox/VGMToolbox%20r930/vgmtoolbox_bin_r930.7z": "r930", + "blah/XMTunerSource-0-6-4.7z": "0-6-4", } self.version_tester(versions) def test_versions_of_debs_and_rpms(self): versions = { - 'bartlby-agent_1.2.3-1_i386.deb': '1.2.3', - 'milestones/6.0/debian/amd64/harmony-6.0-classlib_0.0r946981-1_amd64.deb': '6.0', - 'bartlby-extensions_1.2.3-12_amd64.deb': '1.2.3', - 'bashish-2.0.4.tar.gz': '2.0.4', - 'bashish_2.0.4-1_all.deb': '2.0.4', - 'bashish-2.0.4-1.bashish.generic.noarch.rpm': '2.0.4', - 'bbbike_3.18-1_i386.deb': '3.18', - 'bbbike_3.18-1_amd64.deb': '3.18', - 'blueproximity-1.2.4.tar.gz': '1.2.4', - 'blueproximity_1.2.4-0ubuntu1_feisty1_all.deb': '1.2.4', - 'blueproximity_1.2.4-0ubuntu1_all.deb': '1.2.4', - 'blueproximity-1.2.4-1.fc8.noarch.rpm': '1.2.4', - 'blueproximity-1.2.4-1.2_opensuse10_2.noarch.rpm': '1.2.4', - 'blueproximity-1.2.4-1.2_opensuse10_3.noarch.rpm': '1.2.4', - 'blueproximity-1.2.4-12.1_opensuse10_3.x86_64.rpm': '1.2.4', - 'blueproximity-1.2.4-12.1_opensuse10_3.i586.rpm': '1.2.4', - 'blueproximity-1.2.4-13.1_upensuse10_2.x86_64.rpm': '1.2.4', - 'blueproximity-1.2.4-13.1_opensuse10_2.i586.rpm': '1.2.4', - 'blueproximity-1.2.4-14.1_opensuse10_3.noarch.rpm': '1.2.4', - 
'blueproximity-1.2.4-14.1_opensuse10_2.noarch.rpm': '1.2.4', - 'blueproximity-1.2.4-2.fc8.noarch.rpm': '1.2.4', - 'bpmcalc4amarok_0.1.2-1_all.deb': '0.1.2', - 'bpmcalc4amarok_0.1.2-1.diff.gz': '0.1.2', + "bartlby-agent_1.2.3-1_i386.deb": "1.2.3", + "milestones/6.0/debian/amd64/harmony-6.0-classlib_0.0r946981-1_amd64.deb": "6.0", + "bartlby-extensions_1.2.3-12_amd64.deb": "1.2.3", + "bashish-2.0.4.tar.gz": "2.0.4", + "bashish_2.0.4-1_all.deb": "2.0.4", + "bashish-2.0.4-1.bashish.generic.noarch.rpm": "2.0.4", + "bbbike_3.18-1_i386.deb": "3.18", + "bbbike_3.18-1_amd64.deb": "3.18", + "blueproximity-1.2.4.tar.gz": "1.2.4", + "blueproximity_1.2.4-0ubuntu1_feisty1_all.deb": "1.2.4", + "blueproximity_1.2.4-0ubuntu1_all.deb": "1.2.4", + "blueproximity-1.2.4-1.fc8.noarch.rpm": "1.2.4", + "blueproximity-1.2.4-1.2_opensuse10_2.noarch.rpm": "1.2.4", + "blueproximity-1.2.4-1.2_opensuse10_3.noarch.rpm": "1.2.4", + "blueproximity-1.2.4-12.1_opensuse10_3.x86_64.rpm": "1.2.4", + "blueproximity-1.2.4-12.1_opensuse10_3.i586.rpm": "1.2.4", + "blueproximity-1.2.4-13.1_upensuse10_2.x86_64.rpm": "1.2.4", + "blueproximity-1.2.4-13.1_opensuse10_2.i586.rpm": "1.2.4", + "blueproximity-1.2.4-14.1_opensuse10_3.noarch.rpm": "1.2.4", + "blueproximity-1.2.4-14.1_opensuse10_2.noarch.rpm": "1.2.4", + "blueproximity-1.2.4-2.fc8.noarch.rpm": "1.2.4", + "bpmcalc4amarok_0.1.2-1_all.deb": "0.1.2", + "bpmcalc4amarok_0.1.2-1.diff.gz": "0.1.2", } self.version_tester(versions) def test_versions_without_rc_alpha_beta(self): versions = { - '/commons/beanutils/source/commons-beanutils-1.8.0-BETA-src.tar.gz': '1.8.0', - '/cassandra/debian/pool/main/c/cassandra/cassandra_0.4.0~beta1-1.diff.gz': '0.4.0', - '/xmlgraphics/batik/batik-src-1.7beta1.zip': '1.7', - '/xml/cocoon/3.0/cocoon-all-3.0.0-alpha-1-dist.tar.gz': '3.0.0', - '/ws/scout/0_7rc1/source/scout-0.7rc1-src.zip': '0.7', - '/ws/juddi/3_0/juddi-portal-bundle-3.0.0.rc1.zip': '3.0.0', - '/ws/juddi/3_0/juddi-portal-bundle-3.0.0.beta.zip': '3.0.0', - '/ws/juddi/2_0RC7/juddi-tomcat-2.0rc7.zip': '2.0', - '/turbine/turbine-2.3.3-rc1/source/turbine-2.3.3-RC1-src.zip': '2.3.3', - '/jakarta/slide/most-recent-2.0rc1-binaries/jakarta-slide 2.0rc1 jakarta-tomcat-4.1.30.zip': '2.0', - '/jakarta/poi/dev/bin/poi-2.0-pre1-20030517.jar': '2.0', - '/ws/axis/1_2RC3/axis-src-1_2RC3.zip': '1_2', - '/ws/axis-c/source/win32/axis-c-1.6b-Win32-trace-src.zip': '1.6b', - '/xml/commons/xml-commons-1.0.b2.zip': '1.0', - '/commons/lang/old/v1.0-b1.1/commons-lang-1.0-b1.1.zip': '1.0', - '/turbine/turbine-2.2/source/jakarta-turbine-2.2-B1.tar.gz': '2.2', + "/commons/beanutils/source/commons-beanutils-1.8.0-BETA-src.tar.gz": "1.8.0", + "/cassandra/debian/pool/main/c/cassandra/cassandra_0.4.0~beta1-1.diff.gz": "0.4.0", + "/xmlgraphics/batik/batik-src-1.7beta1.zip": "1.7", + "/xml/cocoon/3.0/cocoon-all-3.0.0-alpha-1-dist.tar.gz": "3.0.0", + "/ws/scout/0_7rc1/source/scout-0.7rc1-src.zip": "0.7", + "/ws/juddi/3_0/juddi-portal-bundle-3.0.0.rc1.zip": "3.0.0", + "/ws/juddi/3_0/juddi-portal-bundle-3.0.0.beta.zip": "3.0.0", + "/ws/juddi/2_0RC7/juddi-tomcat-2.0rc7.zip": "2.0", + "/turbine/turbine-2.3.3-rc1/source/turbine-2.3.3-RC1-src.zip": "2.3.3", + "/jakarta/slide/most-recent-2.0rc1-binaries/jakarta-slide 2.0rc1 jakarta-tomcat-4.1.30.zip": "2.0", + "/jakarta/poi/dev/bin/poi-2.0-pre1-20030517.jar": "2.0", + "/ws/axis/1_2RC3/axis-src-1_2RC3.zip": "1_2", + "/ws/axis-c/source/win32/axis-c-1.6b-Win32-trace-src.zip": "1.6b", + "/xml/commons/xml-commons-1.0.b2.zip": "1.0", + 
"/commons/lang/old/v1.0-b1.1/commons-lang-1.0-b1.1.zip": "1.0", + "/turbine/turbine-2.2/source/jakarta-turbine-2.2-B1.tar.gz": "2.2", } self.version_tester(versions, ignore_pre_releases=True) def test_versions_libpng(self): versions = { - 'libpng-1.0.16rc3-config.tar.gz': '1.0.16', - 'libpng-1.0.16rc4-config.tar.gz': '1.0.16', - 'libpng-1.0.16rc5-config.tar.gz': '1.0.16', - 'libpng-1.0.17rc1-config.tar.gz': '1.0.17', - 'libpng-1.0.18rc1-config.tar.gz': '1.0.18', - 'libpng-1.0.18rc1.tar.gz': '1.0.18', - 'libpng-1.2.17rc3-no-config.tar.gz': '1.2.17', - 'libpng-1.2.17rc4-no-config.tar.gz': '1.2.17', - 'libpng-1.2.19beta1-no-config.tar.gz': '1.2.19', - 'libpng-1.2.19beta12-no-config.tar.gz': '1.2.19', + "libpng-1.0.16rc3-config.tar.gz": "1.0.16", + "libpng-1.0.16rc4-config.tar.gz": "1.0.16", + "libpng-1.0.16rc5-config.tar.gz": "1.0.16", + "libpng-1.0.17rc1-config.tar.gz": "1.0.17", + "libpng-1.0.18rc1-config.tar.gz": "1.0.18", + "libpng-1.0.18rc1.tar.gz": "1.0.18", + "libpng-1.2.17rc3-no-config.tar.gz": "1.2.17", + "libpng-1.2.17rc4-no-config.tar.gz": "1.2.17", + "libpng-1.2.19beta1-no-config.tar.gz": "1.2.19", + "libpng-1.2.19beta12-no-config.tar.gz": "1.2.19", } self.version_tester(versions, ignore_pre_releases=True) def test_versions_corner_cases(self): versions = { - '/bar/zaiko_2013-03-14_192300.7z': '2013-03-14_192300', + "/bar/zaiko_2013-03-14_192300.7z": "2013-03-14_192300", } self.version_tester(versions) @expectedFailure def test_versions_corner_cases2(self): versions = { - 'foo/InstallXMTuner0-6-4.msi': '0-6-4', - '/harmony/milestones/6.0/debian/amd64/harmony-6.0-classlib_0.0r946981-1_amd64.deb': '0.0r946981-1', + "foo/InstallXMTuner0-6-4.msi": "0-6-4", + "/harmony/milestones/6.0/debian/amd64/harmony-6.0-classlib_0.0r946981-1_amd64.deb": "0.0r946981-1", } self.version_tester(versions) diff --git a/minecode/tests/testfiles/conan/zlib/manifest/conanfile.py b/minecode/tests/testfiles/conan/zlib/manifest/conanfile.py index 72f2e5fc..8bd397c4 100644 --- a/minecode/tests/testfiles/conan/zlib/manifest/conanfile.py +++ b/minecode/tests/testfiles/conan/zlib/manifest/conanfile.py @@ -1,6 +1,7 @@ class ConanFile: pass + required_conan_version = ">=1.53.0" @@ -10,8 +11,10 @@ class ZlibConan(ConanFile): url = "https://github.com/conan-io/conan-center-index" homepage = "https://zlib.net" license = "Zlib" - description = ("A Massively Spiffy Yet Delicately Unobtrusive Compression Library " - "(Also Free, Not to Mention Unencumbered by Patents)") + description = ( + "A Massively Spiffy Yet Delicately Unobtrusive Compression Library " + "(Also Free, Not to Mention Unencumbered by Patents)" + ) topics = ("zlib", "compression") settings = "os", "arch", "compiler", "build_type" @@ -45,8 +48,12 @@ def layout(self): cmake_layout(self, src_folder="src") def source(self): - get(self, **self.conan_data["sources"][self.version], - destination=self.source_folder, strip_root=True) + get( + self, + **self.conan_data["sources"][self.version], + destination=self.source_folder, + strip_root=True, + ) def generate(self): tc = CMakeToolchain(self) @@ -63,18 +70,27 @@ def generate(self): def _patch_sources(self): apply_conandata_patches(self) - is_apple_clang12 = self.settings.compiler == "apple-clang" and Version(self.settings.compiler.version) >= "12.0" + is_apple_clang12 = ( + self.settings.compiler == "apple-clang" + and Version(self.settings.compiler.version) >= "12.0" + ) if not is_apple_clang12: - for filename in ['zconf.h', 'zconf.h.cmakein', 'zconf.h.in']: + for filename in ["zconf.h", "zconf.h.cmakein", 
"zconf.h.in"]: filepath = os.path.join(self.source_folder, filename) - replace_in_file(self, filepath, - '#ifdef HAVE_UNISTD_H ' - '/* may be set to #if 1 by ./configure */', - '#if defined(HAVE_UNISTD_H) && (1-HAVE_UNISTD_H-1 != 0)') - replace_in_file(self, filepath, - '#ifdef HAVE_STDARG_H ' - '/* may be set to #if 1 by ./configure */', - '#if defined(HAVE_STDARG_H) && (1-HAVE_STDARG_H-1 != 0)') + replace_in_file( + self, + filepath, + "#ifdef HAVE_UNISTD_H " + "/* may be set to #if 1 by ./configure */", + "#if defined(HAVE_UNISTD_H) && (1-HAVE_UNISTD_H-1 != 0)", + ) + replace_in_file( + self, + filepath, + "#ifdef HAVE_STDARG_H " + "/* may be set to #if 1 by ./configure */", + "#if defined(HAVE_STDARG_H) && (1-HAVE_STDARG_H-1 != 0)", + ) def build(self): self._patch_sources() @@ -84,11 +100,15 @@ def build(self): def _extract_license(self): tmp = load(self, os.path.join(self.source_folder, "zlib.h")) - license_contents = tmp[2:tmp.find("*/", 1)] + license_contents = tmp[2 : tmp.find("*/", 1)] return license_contents def package(self): - save(self, os.path.join(self.package_folder, "licenses", "LICENSE"), self._extract_license()) + save( + self, + os.path.join(self.package_folder, "licenses", "LICENSE"), + self._extract_license(), + ) cmake = CMake(self) cmake.install() @@ -104,4 +124,4 @@ def package_info(self): self.cpp_info.libs = [libname] self.cpp_info.names["cmake_find_package"] = "ZLIB" - self.cpp_info.names["cmake_find_package_multi"] = "ZLIB" \ No newline at end of file + self.cpp_info.names["cmake_find_package_multi"] = "ZLIB" diff --git a/minecode/utils.py b/minecode/utils.py index cbcd4d3e..846ebe6b 100644 --- a/minecode/utils.py +++ b/minecode/utils.py @@ -20,15 +20,12 @@ from django.utils.encoding import force_str import arrow -from arrow.parser import ParserError import requests -from requests.exceptions import InvalidSchema -from requests.exceptions import ConnectionError - +from arrow.parser import ParserError from commoncode.fileutils import create_dir from extractcode.extract import extract - -from minecode.management.commands import get_settings +from requests.exceptions import ConnectionError +from requests.exceptions import InvalidSchema logger = logging.getLogger(__name__) # import sys @@ -41,35 +38,30 @@ def stringify_null_purl_fields(data): Modify `data` in place by ensuring `purl` fields are not None. This is useful for cleaning data before saving to db. """ - purl_fields = ('type', 'namespace', 'name', - 'version', 'qualifiers', 'subpath') + purl_fields = ("type", "namespace", "name", "version", "qualifiers", "subpath") for field in purl_fields: try: if not data[field]: - data[field] = '' + data[field] = "" except KeyError: continue def sha1(content): - """ - Returns the sha1 hash of the given content. - """ + """Returns the sha1 hash of the given content.""" h = hashlib.sha1() h.update(content) return h.hexdigest() def md5(content): - """ - Returns the md5 hash of the given content. - """ + """Returns the md5 hash of the given content.""" h = hashlib.md5() h.update(content) return h.hexdigest() -class DataObject(object): +class DataObject: """ A data object, using attributes for storage and a to_dict method to get a dict back. @@ -90,40 +82,35 @@ def __getitem__(self, item): return self.__dict__.get(item) def __eq__(self, other): - return ( - self.to_dict(other.to_dict()) - ) + return self.to_dict(other.to_dict()) def normalize_trailing_slash(uri): - """ - Appends a trailing slash if the URI is not ending with one already. 
- """ - if not uri.endswith('/'): - uri += '/' + """Appends a trailing slash if the URI is not ending with one already.""" + if not uri.endswith("/"): + uri += "/" return uri def is_ascii(s): - """ - Returns True is the string is ASCII. - """ + """Returns True is the string is ASCII.""" return all(ord(c) < 128 for c in s) def clean_html_entities(text): - """ - Reverse of django.utils.html.escape - """ - return text.replace('&', '&').replace('<', '<').replace('>', '>')\ - .replace('"', '"').replace(''', "'") + """Reverse of django.utils.html.escape""" + return ( + text.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace(""", '"') + .replace("'", "'") + ) def clean_description(text): - """ - Cleans the description text from HTML entities and from extra whitespaces. - """ - return ' '.join(clean_html_entities(text.strip()).split()) + """Cleans the description text from HTML entities and from extra whitespaces.""" + return " ".join(clean_html_entities(text.strip()).split()) def strip_nbsp(s): @@ -131,13 +118,13 @@ def strip_nbsp(s): Replace non breaking space HTML entities with regular space and strip the string. """ - return force_str(s).replace(' ', ' ').strip() + return force_str(s).replace(" ", " ").strip() -CR = '\r' -LF = '\n' +CR = "\r" +LF = "\n" CRLF = CR + LF -CRLF_NO_CR = ' ' + LF +CRLF_NO_CR = " " + LF def unixlinesep(text, preserve=False): @@ -167,7 +154,7 @@ def decode_fuzzy_date(s, _self=None): """ import dateutil - if hasattr(_self, 'testing'): + if hasattr(_self, "testing"): # fixed base date used only for testing for well defined date offsets base = arrow.get(2014, 2, 2) else: @@ -175,39 +162,36 @@ def decode_fuzzy_date(s, _self=None): base = arrow.utcnow() fuzzy = { - 'Last 30 days': -30, - 'Last 7 days': -7, - 'Today': 0, - 'Yesterday': -1, + "Last 30 days": -30, + "Last 7 days": -7, + "Today": 0, + "Yesterday": -1, } formats = [ - 'YYYY-MM-DD HH:mm:ss', - - 'MMM DD, YYYY', - 'MMM D, YYYY', - - 'ddd MMM D HH:mm:ss YYYY', - 'ddd MMM D H:mm:ss YYYY', - 'ddd MMM DD HH:mm:ss YYYY', - 'ddd MMM DD H:mm:ss YYYY', - 'dddd MMM D HH:mm:ss YYYY', - 'dddd MMM D H:mm:ss YYYY', - 'dddd MMM DD HH:mm:ss YYYY', - 'dddd MMM DD H:mm:ss YYYY', - - 'MM/DD/YYYY', + "YYYY-MM-DD HH:mm:ss", + "MMM DD, YYYY", + "MMM D, YYYY", + "ddd MMM D HH:mm:ss YYYY", + "ddd MMM D H:mm:ss YYYY", + "ddd MMM DD HH:mm:ss YYYY", + "ddd MMM DD H:mm:ss YYYY", + "dddd MMM D HH:mm:ss YYYY", + "dddd MMM D H:mm:ss YYYY", + "dddd MMM DD HH:mm:ss YYYY", + "dddd MMM DD H:mm:ss YYYY", + "MM/DD/YYYY", ] # normalize spaces - s = ' '.join(s.split()) - if s == 'Earlier this year': - ar = base.floor('year') + s = " ".join(s.split()) + if s == "Earlier this year": + ar = base.floor("year") elif s in fuzzy: ar = base.replace(days=fuzzy[s]) else: ar = arrow.get(s, formats) - ar = ar.replace(tzinfo=dateutil.tz.tzutc()).to('utc') # NOQA + ar = ar.replace(tzinfo=dateutil.tz.tzutc()).to("utc") # NOQA return ar.isoformat() @@ -224,24 +208,24 @@ def get_http_response(uri, timeout=10): Fetch and return the response object from an HTTP uri. `timeout` is a timeout with precedence over REQUESTS_ARGS settings. 
""" - requests_args = getattr(settings, 'REQUESTS_ARGS', {}) - requests_args['timeout'] = timeout + requests_args = getattr(settings, "REQUESTS_ARGS", {}) + requests_args["timeout"] = timeout - if not uri.lower().startswith('http'): - raise Exception( - 'get_http_response: Not an HTTP URI: %(uri)r' % locals()) + if not uri.lower().startswith("http"): + raise Exception("get_http_response: Not an HTTP URI: %(uri)r" % locals()) try: response = requests.get(uri, **requests_args) - except (ConnectionError, InvalidSchema) as e: - logger.error( - 'get_http_response: Download failed for %(uri)r' % locals()) + except (ConnectionError, InvalidSchema): + logger.error("get_http_response: Download failed for %(uri)r" % locals()) raise status = response.status_code if status != 200: - raise Exception('get_http_response: Download failed for %(uri)r ' - 'with %(status)r' % locals()) + raise Exception( + "get_http_response: Download failed for %(uri)r " + "with %(status)r" % locals() + ) return response @@ -258,7 +242,7 @@ def get_package_sha1(package, field="repository_download_url"): # Download archive from URL and calculate sha1 response = requests.get(download_url) if response: - sha1_hash = hashlib.new('sha1', response.content) + sha1_hash = hashlib.new("sha1", response.content) sha1 = sha1_hash.hexdigest() return sha1 @@ -275,9 +259,8 @@ def fetch_and_write_file_from_url(url): metadata_content = response.text filename = url.split("/")[-1] file_name, _, extension = filename.rpartition(".") - temp_metadata_file = get_temp_file( - file_name=file_name, extension=extension) - with open(temp_metadata_file, 'a') as metadata_file: + temp_metadata_file = get_temp_file(file_name=file_name, extension=extension) + with open(temp_metadata_file, "a") as metadata_file: metadata_file.write(metadata_content) return temp_metadata_file @@ -290,24 +273,20 @@ def validate_sha1(sha1): Return `sha1` if it is valid, None otherwise. """ if sha1 and len(sha1) != 40: - logger.warning( - f'Invalid SHA1 length ({len(sha1)}): "{sha1}": SHA1 ignored!' - ) + logger.warning(f'Invalid SHA1 length ({len(sha1)}): "{sha1}": SHA1 ignored!') sha1 = None return sha1 -def system_temp_dir(temp_dir=os.getenv('MINECODE_TMP')): - """ - Return the global temp directory.. - """ +def system_temp_dir(temp_dir=os.getenv("MINECODE_TMP")): + """Return the global temp directory..""" if not temp_dir: - temp_dir = os.path.join(tempfile.gettempdir(), 'minecode') + temp_dir = os.path.join(tempfile.gettempdir(), "minecode") create_dir(temp_dir) return temp_dir -def get_temp_dir(base_dir='', prefix=''): +def get_temp_dir(base_dir="", prefix=""): """ Return the path to base a new unique temporary directory, created under the system-wide `system_temp_dir` temp directory and as a subdir of the @@ -321,14 +300,14 @@ def get_temp_dir(base_dir='', prefix=''): return tempfile.mkdtemp(prefix=prefix, dir=base_dir) -def get_temp_file(file_name='data', extension='.file', dir_name=''): +def get_temp_file(file_name="data", extension=".file", dir_name=""): """ Return a file path string to a new, unique and non-existing temporary file that can safely be created without a risk of name collision. """ - if extension and not extension.startswith('.'): - extension = '.' + extension + if extension and not extension.startswith("."): + extension = "." 
+ extension
     file_name = file_name + extension
 
     # create a new temp dir each time
@@ -338,9 +317,7 @@ def get_temp_file(file_name="data", extension=".file", dir_name=""):
 
 
 def extract_file(location):
-    """
-    Extract file at location returning the extracted location.
-    """
+    """Extract file at location returning the extracted location."""
     target = None
     try:
         for event in extract(location):
@@ -350,18 +327,16 @@ def extract_file(location):
             target = event.target
             break
     except Exception as e:
-        logger.error('extract_file: failed for %(location)r' % locals())
+        logger.error("extract_file: failed for %(location)r" % locals())
         raise e
     return target
 
 
 def parse_date(s):
-    """
-    Return date string in YYYY-MM-DD format from a datetime string
-    """
+    """Return date string in YYYY-MM-DD format from a datetime string"""
     if s:
         try:
-            return arrow.get(s).format('YYYY-MM-DD')
+            return arrow.get(s).format("YYYY-MM-DD")
         except ParserError:
             # If we can't parse a date, it's not a big deal as `release_date`
             # is not an important field for us
@@ -369,8 +344,7 @@ def parse_date(s):
 
 
 def is_int(s):
-    """To test if the input para is a int
-    """
+    """Test whether the input parameter is an int."""
    try:
         int(s)
         return True
@@ -384,12 +358,11 @@ def form_vcs_url(vcs_tool, vcs_url, revision_tag_or_branch=None, sub_path=None):
     # <vcs_tool>+<scheme>://<host>[/<path>][@<revision_tag_or_branch>][#<sub_path>]
     if vcs_url:
         if vcs_tool:
-            vcs_url = '+'.join(str(v) for v in [vcs_tool, vcs_url])
+            vcs_url = "+".join(str(v) for v in [vcs_tool, vcs_url])
         if revision_tag_or_branch:
-            vcs_url = '@'.join(str(v)
-                               for v in [vcs_url, revision_tag_or_branch])
+            vcs_url = "@".join(str(v) for v in [vcs_url, revision_tag_or_branch])
         if sub_path:
-            vcs_url = '#'.join(str(v) for v in [vcs_url, sub_path])
+            vcs_url = "#".join(str(v) for v in [vcs_url, sub_path])
     return vcs_url
 
 
@@ -402,7 +375,7 @@ def validate_uuid(uuid_string):
 
 
 # This is from https://stackoverflow.com/questions/4856882/limiting-memory-use-in-a-large-django-queryset/5188179#5188179
-class MemorySavingQuerysetIterator(object):
+class MemorySavingQuerysetIterator:
     def __init__(self, queryset, max_obj_num=1000):
         self._base_queryset = queryset
         self._generator = self._setup()
@@ -414,8 +387,9 @@ def _setup(self):
             # the objects we ensure that there are only `max_obj_num` objects in
             # memory at any given time
             smaller_queryset = copy.deepcopy(self._base_queryset)[
-                i:i+self.max_obj_num]
-            logger.debug('Grabbing next %s objects from DB' % self.max_obj_num)
+                i : i + self.max_obj_num
+            ]
+            logger.debug("Grabbing next %s objects from DB" % self.max_obj_num)
             for obj in smaller_queryset.iterator():
                 yield obj
diff --git a/minecode/utils_test.py b/minecode/utils_test.py
index ca2ea0bd..5c114c33 100644
--- a/minecode/utils_test.py
+++ b/minecode/utils_test.py
@@ -7,10 +7,6 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
# -from collections import OrderedDict - -from itertools import chain -from unittest import TestCase import codecs import json import ntpath @@ -19,20 +15,22 @@ import shutil import stat import tarfile +from collections import OrderedDict +from itertools import chain +from unittest import TestCase from django.apps import apps from django.db import connection from django.db.migrations.executor import MigrationExecutor from django.test import TestCase as DjangoTestCase -from rest_framework.utils.serializer_helpers import ReturnDict -from rest_framework.utils.serializer_helpers import ReturnList from commoncode.testcase import FileBasedTesting +from rest_framework.utils.serializer_helpers import ReturnDict +from rest_framework.utils.serializer_helpers import ReturnList from scancode.cli_test_utils import purl_with_fake_uuid -from minecode.utils import get_temp_dir from minecode.tests import FIXTURES_REGEN - +from minecode.utils import get_temp_dir """ The conventions used for the tests are: @@ -47,14 +45,14 @@ class BaseMiningTestCase(TestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): - if not hasattr(self, 'to_delete'): + if not hasattr(self, "to_delete"): self.to_delete = [] def tearDown(self): - for pth in getattr(self, 'to_delete', []): + for pth in getattr(self, "to_delete", []): self.make_rwe(pth) shutil.rmtree(pth, ignore_errors=True) @@ -87,7 +85,7 @@ def extract_archive(self, location, delete=True): Return the temporary dir where the trace was extracted. The temporary dir is deleted once tests are completed. """ - with open(location, 'rb') as input_tar: + with open(location, "rb") as input_tar: tar = tarfile.open(fileobj=input_tar) extract_dir = self.get_temp_dir(delete) tar.extractall(extract_dir) @@ -96,12 +94,13 @@ def extract_archive(self, location, delete=True): def build_archive(self, real_location, tar_path, outarch): from contextlib import closing - with closing(tarfile.open(outarch, mode='w:bz2')) as out: + + with closing(tarfile.open(outarch, mode="w:bz2")) as out: out.add(real_location, arcname=tar_path) def get_temp_dir(self, delete=True): - assert dir and dir != '' - tmp_dir = get_temp_dir(base_dir='', prefix='minecode-tests-') + assert dir and dir != "" + tmp_dir = get_temp_dir(base_dir="", prefix="minecode-tests-") if delete: self.to_delete.append(tmp_dir) return tmp_dir @@ -148,19 +147,15 @@ class MiningTestCase(BaseMiningTestCase, DjangoTestCase): def remove_vcs(location): - """ - Remove well known version control directories. - """ + """Remove well known version control directories.""" for root, dirs, _files in os.walk(location): - for vcs_dir in 'CVS', '.svn', '.git', '.hg': + for vcs_dir in "CVS", ".svn", ".git", ".hg": if vcs_dir in dirs: shutil.rmtree(os.path.join(root, vcs_dir), False) def to_os_native_path(path): - """ - Normalize a path to use the native OS path separator. - """ + """Normalize a path to use the native OS path separator.""" path = path.replace(posixpath.sep, os.path.sep) path = path.replace(ntpath.sep, os.path.sep) path = path.rstrip(os.path.sep) @@ -168,7 +163,6 @@ def to_os_native_path(path): class MockResponse: - def __init__(self, content, status_code): self.content = content self.status_code = status_code @@ -179,7 +173,7 @@ def mocked_requests_get(url, location): Return a MockResponse object by parsing the content of the file at `location` in a response to request to a single `url`. 
""" - with open(location, 'rb') as loc: + with open(location, "rb") as loc: return MockResponse(loc.read(), 200) @@ -190,15 +184,13 @@ def mocked_requests_get_for_uris(url_to_location, *args, **kwargs): mapping of url->location. """ location = url_to_location[args[0]] - with open(location, 'rb') as loc: + with open(location, "rb") as loc: return MockResponse(loc.read(), 200) def response_403(url, request): - """ - Returns a HTTP response with status 403. - """ - return {'status_code': 403, 'content': ''} + """Returns a HTTP response with status 403.""" + return {"status_code": 403, "content": ""} class JsonBasedTestingMixin(TestCase): @@ -222,8 +214,7 @@ def _normalize_results(self, data, fields_to_remove=[]): ): value = purl_with_fake_uuid(value) if key == "for_packages": - value = [purl_with_fake_uuid(package_uid) - for package_uid in value] + value = [purl_with_fake_uuid(package_uid) for package_uid in value] if key in fields_to_remove: continue normalized_data[key] = value @@ -233,17 +224,22 @@ def _normalize_results(self, data, fields_to_remove=[]): def _remove_fields_from_results(self, data, fields_to_remove): if type(data) in (list, ReturnList): - return [self._remove_fields_from_results(entry, fields_to_remove) for entry in data] + return [ + self._remove_fields_from_results(entry, fields_to_remove) + for entry in data + ] if type(data) in (dict, OrderedDict, ReturnDict): normalized_data = {} # Remove fields from results and normalize Package UIDs for field in fields_to_remove: - if not field in data: + if field not in data: continue data.pop(field) - def check_expected_results(self, results, expected_loc, fields_to_remove=[], regen=FIXTURES_REGEN): + def check_expected_results( + self, results, expected_loc, fields_to_remove=[], regen=FIXTURES_REGEN + ): """ Check `results` are equal to expected data stored in a JSON file at `expected_loc`. 
@@ -257,10 +253,10 @@ def check_expected_results(self, results, expected_loc, fields_to_remove=[], reg results = self._normalize_results(results, fields_to_remove) if regen: - with codecs.open(expected_loc, mode='wb', encoding='utf-8') as expect: - json.dump(results, expect, indent=2, separators=(',', ':')) + with codecs.open(expected_loc, mode="wb", encoding="utf-8") as expect: + json.dump(results, expect, indent=2, separators=(",", ":")) - with codecs.open(expected_loc, mode='rb', encoding='utf-8') as expect: + with codecs.open(expected_loc, mode="rb", encoding="utf-8") as expect: expected = json.load(expect) results = json.loads(json.dumps(results)) @@ -288,8 +284,7 @@ def _normalize_results(self, data, fields_to_remove=[]): ): value = purl_with_fake_uuid(value) if key == "for_packages": - value = [purl_with_fake_uuid(package_uid) - for package_uid in value] + value = [purl_with_fake_uuid(package_uid) for package_uid in value] if key in fields_to_remove: continue normalized_data[key] = value @@ -299,17 +294,22 @@ def _normalize_results(self, data, fields_to_remove=[]): def _remove_fields_from_results(self, data, fields_to_remove): if type(data) in (list, ReturnList): - return [self._remove_fields_from_results(entry, fields_to_remove) for entry in data] + return [ + self._remove_fields_from_results(entry, fields_to_remove) + for entry in data + ] if type(data) in (dict, OrderedDict, ReturnDict): normalized_data = {} # Remove fields from results and normalize Package UIDs for field in fields_to_remove: - if not field in data: + if field not in data: continue data.pop(field) - def check_expected_results(self, results, expected_loc, fields_to_remove=[], regen=FIXTURES_REGEN): + def check_expected_results( + self, results, expected_loc, fields_to_remove=[], regen=FIXTURES_REGEN + ): """ Check `results` are equal to expected data stored in a JSON file at `expected_loc`. @@ -323,16 +323,18 @@ def check_expected_results(self, results, expected_loc, fields_to_remove=[], reg results = self._normalize_results(results, fields_to_remove) if regen: - with codecs.open(expected_loc, mode='wb', encoding='utf-8') as expect: - json.dump(results, expect, indent=2, separators=(',', ':')) + with codecs.open(expected_loc, mode="wb", encoding="utf-8") as expect: + json.dump(results, expect, indent=2, separators=(",", ":")) - with codecs.open(expected_loc, mode='rb', encoding='utf-8') as expect: + with codecs.open(expected_loc, mode="rb", encoding="utf-8") as expect: expected = json.load(expect) results = json.loads(json.dumps(results)) self.assertEqual(expected, results) - def check_expected_uris(self, uris, expected_loc, data_is_json=False, regen=FIXTURES_REGEN): + def check_expected_uris( + self, uris, expected_loc, data_is_json=False, regen=FIXTURES_REGEN + ): """ Check a `uris` iterable of URIs matches the data stored in the JSON file at `expected_loc`. @@ -340,15 +342,16 @@ def check_expected_uris(self, uris, expected_loc, data_is_json=False, regen=FIXT results = [] for uri in uris: uri_dict = uri.to_dict(data_is_json=data_is_json) - if uri_dict.get('date'): + if uri_dict.get("date"): # Parse date since date will be used as Date field in # ResourceURI object, to make it as string format is just for # test comparation. # FIXME: we should ONLY have strings there!!! 
- uri_dict['date'] = str(uri_dict.get('date')) + uri_dict["date"] = str(uri_dict.get("date")) results.append(uri_dict) self.check_expected_results( - results=results, expected_loc=expected_loc, regen=regen) + results=results, expected_loc=expected_loc, regen=regen + ) def model_to_dict(instance, fields=None, exclude=None): @@ -372,14 +375,14 @@ def model_to_dict(instance, fields=None, exclude=None): opts = instance._meta data = dict() for f in chain(opts.concrete_fields, opts.private_fields, opts.many_to_many): - if not getattr(f, 'editable', False): + if not getattr(f, "editable", False): continue if fields and f.name not in fields: continue if exclude and f.name in exclude: continue value = f.value_from_object(instance) - if 'date' in f.name: + if "date" in f.name: value = bool(value) data[f.name] = value return data @@ -397,9 +400,7 @@ def app(self): def setUp(self): assert ( self.migrate_from and self.migrate_to - ), "TestCase '{}' must define migrate_from and migrate_to properties".format( - type(self).__name__ - ) + ), f"TestCase '{type(self).__name__}' must define migrate_from and migrate_to properties" self.migrate_from = [(self.app, self.migrate_from)] self.migrate_to = [(self.app, self.migrate_to)] executor = MigrationExecutor(connection) diff --git a/minecode/version.py b/minecode/version.py index 220bc087..9322a0de 100644 --- a/minecode/version.py +++ b/minecode/version.py @@ -10,40 +10,39 @@ import re - -VERSION_PATTERNS_REGEX = [re.compile(x, re.IGNORECASE) for x in [ - # v123413.feature_111.22.11.121 - 'v\d+\.feature\_(\d+\.){1,3}\d+', - - # YYYY-MM-DD_12345 - '\d{4}-\d{2}-\d{2}_\d+', - - # FIXME: this a single regex that should be split - '(M?(v\d+(\-|\_))?\d+\.){1,3}\d+[A-Za-z0-9]*' - '((\.|\-|_|~)(b|B|rc|r|v|RC|alpha|beta|m|pre|vm|G)?\d+((\-|\.)\d+)?)?' - '(' - '(\.|\-)' - '(' - '(' - '(alpha|dev|beta|rc|final|pre)' - '(\-|\_)\d+[A-Za-z]?(\-RELEASE)?' - ')' - '|alpha' - '|dev(\.\d+\.\d+)?' - '|beta|final|release|fixed' - '|(cr\d(\_\d*)?)' - ')' - ')?', - - '[A-Za-z]?(\d+\_){1,3}\d+\_?[A-Za-z]{0,2}\d+', - '(b|rc|r|v|RC|alpha|beta|m|pre|revision-)\d+(\-\d+)?', - 'current|previous|latest|alpha|beta', - '\d+-\d+-\d+-\d+', - '\d{4}-\d{2}-\d{2}', - '\d+-\d+-\d+', - '(\d(\-|\_)){1,2}\d', - '\d{5,14}', -]] +VERSION_PATTERNS_REGEX = [ + re.compile(x, re.IGNORECASE) + for x in [ + # v123413.feature_111.22.11.121 + r"v\d+\.feature\_(\d+\.){1,3}\d+", + # YYYY-MM-DD_12345 + r"\d{4}-\d{2}-\d{2}_\d+", + # FIXME: this a single regex that should be split + r"(M?(v\d+(\-|\_))?\d+\.){1,3}\d+[A-Za-z0-9]*" + r"((\.|\-|_|~)(b|B|rc|r|v|RC|alpha|beta|m|pre|vm|G)?\d+((\-|\.)\d+)?)?" + "(" + r"(\.|\-)" + "(" + "(" + "(alpha|dev|beta|rc|final|pre)" + r"(\-|\_)\d+[A-Za-z]?(\-RELEASE)?" + ")" + "|alpha" + r"|dev(\.\d+\.\d+)?" 
+ "|beta|final|release|fixed" + r"|(cr\d(\_\d*)?)" + ")" + ")?", + r"[A-Za-z]?(\d+\_){1,3}\d+\_?[A-Za-z]{0,2}\d+", + r"(b|rc|r|v|RC|alpha|beta|m|pre|revision-)\d+(\-\d+)?", + "current|previous|latest|alpha|beta", + r"\d+-\d+-\d+-\d+", + r"\d{4}-\d{2}-\d{2}", + r"\d+-\d+-\d+", + r"(\d(\-|\_)){1,2}\d", + r"\d{5,14}", + ] +] def version_hint(path, ignore_pre_releases=False, remove_v_prefix=False): @@ -58,7 +57,7 @@ def version_hint(path, ignore_pre_releases=False, remove_v_prefix=False): if not stripped: return for pattern in VERSION_PATTERNS_REGEX: - segments = stripped.split('/') + segments = stripped.split("/") # skip the first path segment unless there's only one segment first_segment = 1 if len(segments) > 1 else 0 interesting_segments = segments[first_segment:] @@ -70,47 +69,87 @@ def version_hint(path, ignore_pre_releases=False, remove_v_prefix=False): fixed = fix_packages_version(path, vs) if ignore_pre_releases: fixed = strip_pre_releases(fixed) - if remove_v_prefix and fixed.startswith('v'): + if remove_v_prefix and fixed.startswith("v"): fixed = fixed[1:] return fixed -NON_VERSION_TAGS = ('win32', 'am64', 'x86_64', 'i386', 'i586', 'i586', 'x86', - 'macosx',) +NON_VERSION_TAGS = ( + "win32", + "am64", + "x86_64", + "i386", + "i586", + "i586", + "x86", + "macosx", +) -NON_VT_RES = [re.compile(re.escape(t), re.IGNORECASE) - for t in NON_VERSION_TAGS] +NON_VT_RES = [re.compile(re.escape(t), re.IGNORECASE) for t in NON_VERSION_TAGS] def strip_version_tags(path): """Remove well known tags that are not part of the version.""" for ret in NON_VT_RES: - path = ret.sub('', path) + path = ret.sub("", path) return path ARCHIVE_FILE_EXTENSIONS = ( - '.7z', '.7zip', '.tar.gz', '.tar.bz2', '.tar.xz', '.tgz', '.tbz', - '.tbz2', '.tz', '.txz', '.zip', '.rar', '.tar', '.gz', '.bz2', '.jar', - '.tar.lzma', '.war', '.lib', '.a', '.ear', '.sar', '.tlz', - '.xz', '.lzma', '.exe', '.rpm', '.deb', '.msi', '.z', '.pkg', + ".7z", + ".7zip", + ".tar.gz", + ".tar.bz2", + ".tar.xz", + ".tgz", + ".tbz", + ".tbz2", + ".tz", + ".txz", + ".zip", + ".rar", + ".tar", + ".gz", + ".bz2", + ".jar", + ".tar.lzma", + ".war", + ".lib", + ".a", + ".ear", + ".sar", + ".tlz", + ".xz", + ".lzma", + ".exe", + ".rpm", + ".deb", + ".msi", + ".z", + ".pkg", ) -ARCHIVE_FILE_EXT_RES = [re.compile(re.escape(e) + '$', re.IGNORECASE) - for e in ARCHIVE_FILE_EXTENSIONS] +ARCHIVE_FILE_EXT_RES = [ + re.compile(re.escape(e) + "$", re.IGNORECASE) for e in ARCHIVE_FILE_EXTENSIONS +] def strip_extensions(path): - """"Remove well known archive extensions from end of path.""" + """ "Remove well known archive extensions from end of path.""" for rext in ARCHIVE_FILE_EXT_RES: - path = rext.sub('', path) + path = rext.sub("", path) return path # these extensions are used for common RPMs and Deb packages -PACKAGE_EXTENSIONS = ('.deb', '.rpm', '.srpm', '.diff.gz',) +PACKAGE_EXTENSIONS = ( + ".deb", + ".rpm", + ".srpm", + ".diff.gz", +) def fix_packages_version(path, version_string): @@ -120,8 +159,8 @@ def fix_packages_version(path, version_string): becomes 1.2.4 instead of 1.2.4-1 """ if path.endswith(PACKAGE_EXTENSIONS): - if version_string.count('-') == 1: - left, _right = version_string.split('-') + if version_string.count("-") == 1: + left, _right = version_string.split("-") return left # return as-is in all other cases return version_string @@ -130,9 +169,14 @@ def fix_packages_version(path, version_string): PRE_RELEASE_TAGS = [] -for pt in ('pre', 'rc', 'alpha', 'beta', 'b1', 'b2', 'b3', 'b4', 'b5'): +for pt in ("pre", "rc", "alpha", "beta", 
"b1", "b2", "b3", "b4", "b5"): # common punctuation prefixes before the tag - for pp in ('_', '-', '.', '~',): + for pp in ( + "_", + "-", + ".", + "~", + ): # variants with prefix before the bare variant PRE_RELEASE_TAGS.append(pp + pt.upper()) PRE_RELEASE_TAGS.append(pp + pt) @@ -142,9 +186,7 @@ def fix_packages_version(path, version_string): def strip_pre_releases(version_string): - """ - Return a version string stripped from alpha, beta, rc and pre parts. - """ + """Return a version string stripped from alpha, beta, rc and pre parts.""" if not any(t in version_string for t in PRE_RELEASE_TAGS): return version_string for tag in PRE_RELEASE_TAGS: diff --git a/packagedb/api.py b/packagedb/api.py index 2503da7b..daacf6ed 100644 --- a/packagedb/api.py +++ b/packagedb/api.py @@ -9,18 +9,18 @@ import logging -import django_filters from django.core.exceptions import ValidationError from django.db.models import OuterRef from django.db.models import Q from django.db.models import Subquery from django.forms import widgets from django.forms.fields import MultipleChoiceField + +import django_filters from django_filters.filters import Filter from django_filters.filters import MultipleChoiceFilter from django_filters.filters import OrderingFilter from django_filters.rest_framework import FilterSet - from drf_spectacular.plumbing import build_array_type from drf_spectacular.plumbing import build_basic_type from drf_spectacular.types import OpenApiTypes @@ -39,10 +39,11 @@ from univers.version_range import VersionRange from univers.versions import InvalidVersion +from minecode import collectors # NOQA + # UnusedImport here! # But importing the collectors module triggers routes registration from minecode import priority_router -from minecode import collectors # NOQA from minecode.models import PriorityResourceURI from minecode.route import NoRouteAvailable from packagedb.filters import PackageSearchFilter @@ -55,7 +56,6 @@ from packagedb.package_managers import get_api_package_name from packagedb.package_managers import get_version_fetcher from packagedb.serializers import CollectPackageSerializer -from packagedb.serializers import is_supported_addon_pipeline from packagedb.serializers import DependentPackageSerializer from packagedb.serializers import IndexPackagesResponseSerializer from packagedb.serializers import IndexPackagesSerializer @@ -63,13 +63,14 @@ from packagedb.serializers import PackageSetAPISerializer from packagedb.serializers import PackageWatchAPISerializer from packagedb.serializers import PackageWatchCreateSerializer -from packagedb.serializers import UpdatePackagesSerializer from packagedb.serializers import PackageWatchUpdateSerializer from packagedb.serializers import PartySerializer -from packagedb.serializers import PurlValidateResponseSerializer from packagedb.serializers import PurlUpdateResponseSerializer +from packagedb.serializers import PurlValidateResponseSerializer from packagedb.serializers import PurlValidateSerializer from packagedb.serializers import ResourceAPISerializer +from packagedb.serializers import UpdatePackagesSerializer +from packagedb.serializers import is_supported_addon_pipeline from packagedb.throttling import StaffUserRateThrottle from purl2vcs.find_source_repo import get_source_package_and_add_to_package_set @@ -84,22 +85,19 @@ class CharMultipleWidget(widgets.TextInput): def value_from_datadict(self, data, files, name): value = widgets.SelectMultiple().value_from_datadict(data, files, name) - if not value or value == ['']: - return '' + if not 
value or value == [""]: + return "" return value def format_value(self, value): - """ - Return a value as it should appear when rendered in a template. - """ - return ', '.join(value) + """Return a value as it should appear when rendered in a template.""" + return ", ".join(value) class MultipleCharField(MultipleChoiceField): - """ - Overrides `MultipleChoiceField` to fit in `MultipleCharFilter`. - """ + """Overrides `MultipleChoiceField` to fit in `MultipleCharFilter`.""" + widget = CharMultipleWidget def valid_value(self, value): @@ -107,16 +105,13 @@ def valid_value(self, value): class MultipleCharFilter(MultipleChoiceFilter): - """ - Filters on multiple values for a CharField type using `?field=a&field=b` URL syntax. - """ + """Filters on multiple values for a CharField type using `?field=a&field=b` URL syntax.""" + field_class = MultipleCharField class MultipleCharInFilter(MultipleCharFilter): - """ - Does a __in = [value] filter instead of field=value filter - """ + """Does a __in = [value] filter instead of field=value filter""" def filter(self, qs, value): if not value: @@ -128,7 +123,7 @@ def filter(self, qs, value): predicate = self.get_filter_predicate(value) old_field_name = next(iter(predicate)) - new_field_name = f'{old_field_name}__in' + new_field_name = f"{old_field_name}__in" predicate[new_field_name] = predicate[old_field_name] predicate.pop(old_field_name) @@ -150,11 +145,11 @@ class CreateListRetrieveUpdateViewSetMixin( To use it, override the class and set the `.queryset` and `.serializer_class` attributes. """ + pass class PackageResourcePurlFilter(Filter): - def filter(self, qs, value): if not value: return qs @@ -172,7 +167,6 @@ def filter(self, qs, value): class PackageResourceUUIDFilter(Filter): - def filter(self, qs, value): if not value: return qs @@ -186,24 +180,24 @@ def filter(self, qs, value): class ResourceFilterSet(FilterSet): - package = PackageResourceUUIDFilter(label='Package UUID') - purl = PackageResourcePurlFilter(label='Package pURL') + package = PackageResourceUUIDFilter(label="Package UUID") + purl = PackageResourcePurlFilter(label="Package pURL") md5 = MultipleCharInFilter( - help_text='Exact MD5. Multi-value supported.', + help_text="Exact MD5. Multi-value supported.", ) sha1 = MultipleCharInFilter( - help_text='Exact SHA1. Multi-value supported.', + help_text="Exact SHA1. Multi-value supported.", ) class ResourceViewSet(viewsets.ReadOnlyModelViewSet): - queryset = Resource.objects.select_related('package') + queryset = Resource.objects.select_related("package") serializer_class = ResourceAPISerializer filterset_class = ResourceFilterSet throttle_classes = [StaffUserRateThrottle, AnonRateThrottle] - lookup_field = 'sha1' + lookup_field = "sha1" - @action(detail=False, methods=['post']) + @action(detail=False, methods=["post"]) def filter_by_checksums(self, request, *args, **kwargs): """ Take a mapping, where the keys are the names of the checksum algorthm @@ -216,7 +210,7 @@ def filter_by_checksums(self, request, *args, **kwargs): - sha1 Example: - + ------- { "sha1": [ "b55fd82f80cc1bd0bdabf9c6e3153788d35d7911", @@ -237,24 +231,23 @@ def filter_by_checksums(self, request, *args, **kwargs): } This will return Resources whose sha1 or md5 matches those values. 
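
An illustrative client call for the action documented above, not itself part of the diff: the public purldb host is an assumption, and the route follows DRF's default action routing. It posts a checksum list and iterates over the matching Resources; the SHA1 value is taken from the docstring example.

    import requests

    payload = {"sha1": ["b55fd82f80cc1bd0bdabf9c6e3153788d35d7911"]}
    response = requests.post(
        "https://public.purldb.io/api/resources/filter_by_checksums/",  # host assumed
        json=payload,
        timeout=30,
    )
    response.raise_for_status()
    for resource in response.json().get("results", []):
        print(resource.get("sha1"), resource.get("name"))
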
+ """ data = dict(request.data) unsupported_fields = [] for field, value in data.items(): - if field not in ('md5', 'sha1'): + if field not in ("md5", "sha1"): unsupported_fields.append(field) if unsupported_fields: - unsupported_fields_str = ', '.join(unsupported_fields) + unsupported_fields_str = ", ".join(unsupported_fields) response_data = { - 'status': f'Unsupported field(s) given: {unsupported_fields_str}' + "status": f"Unsupported field(s) given: {unsupported_fields_str}" } return Response(response_data, status=status.HTTP_400_BAD_REQUEST) if not data: - response_data = { - 'status': 'No values provided' - } + response_data = {"status": "No values provided"} return Response(response_data, status=status.HTTP_400_BAD_REQUEST) lookups = Q() @@ -262,18 +255,18 @@ def filter_by_checksums(self, request, *args, **kwargs): value = value or [] # We create this intermediate dictionary so we can modify the field # name to have __in at the end - d = {f'{field}__in': value} + d = {f"{field}__in": value} lookups |= Q(**d) qs = Resource.objects.filter(lookups) paginated_qs = self.paginate_queryset(qs) serializer = ResourceAPISerializer( - paginated_qs, many=True, context={'request': request}) + paginated_qs, many=True, context={"request": request} + ) return self.get_paginated_response(serializer.data) class MultiplePackageURLFilter(MultipleCharFilter): - def filter(self, qs, value): if not value: # Even though not a noop, no point filtering if empty. @@ -282,7 +275,7 @@ def filter(self, qs, value): if self.is_noop(qs, value): return qs - if all(v == '' for v in value): + if all(v == "" for v in value): return qs q = Q() @@ -301,48 +294,48 @@ def filter(self, qs, value): PACKAGE_FILTER_SORT_FIELDS = [ - 'type', - 'namespace', - 'name', - 'version', - 'qualifiers', - 'subpath', - 'download_url', - 'filename', - 'size', - 'release_date', + "type", + "namespace", + "name", + "version", + "qualifiers", + "subpath", + "download_url", + "filename", + "size", + "release_date", ] class PackageFilterSet(FilterSet): type = django_filters.CharFilter( - lookup_expr='iexact', - help_text='Exact type. (case-insensitive)', + lookup_expr="iexact", + help_text="Exact type. (case-insensitive)", ) namespace = django_filters.CharFilter( - lookup_expr='iexact', - help_text='Exact namespace. (case-insensitive)', + lookup_expr="iexact", + help_text="Exact namespace. (case-insensitive)", ) name = MultipleCharFilter( - lookup_expr='iexact', - help_text='Exact name. Multi-value supported. (case-insensitive)', + lookup_expr="iexact", + help_text="Exact name. Multi-value supported. (case-insensitive)", ) version = MultipleCharFilter( - help_text='Exact version. Multi-value supported.', + help_text="Exact version. Multi-value supported.", ) md5 = MultipleCharInFilter( - help_text='Exact MD5. Multi-value supported.', + help_text="Exact MD5. Multi-value supported.", ) sha1 = MultipleCharInFilter( - help_text='Exact SHA1. Multi-value supported.', + help_text="Exact SHA1. 
Multi-value supported.", ) purl = MultiplePackageURLFilter( - label='Package URL', + label="Package URL", ) search = PackageSearchFilter( - label='Search', - field_name='name', - lookup_expr='icontains', + label="Search", + field_name="name", + lookup_expr="icontains", ) sort = OrderingFilter(fields=PACKAGE_FILTER_SORT_FIELDS) @@ -350,31 +343,31 @@ class PackageFilterSet(FilterSet): class Meta: model = Package fields = ( - 'search', - 'type', - 'namespace', - 'name', - 'version', - 'qualifiers', - 'subpath', - 'download_url', - 'filename', - 'sha1', - 'sha256', - 'md5', - 'size', - 'release_date', + "search", + "type", + "namespace", + "name", + "version", + "qualifiers", + "subpath", + "download_url", + "filename", + "sha1", + "sha256", + "md5", + "size", + "release_date", ) class PackagePublicViewSet(viewsets.ReadOnlyModelViewSet): - queryset = Package.objects.prefetch_related('dependencies', 'parties') + queryset = Package.objects.prefetch_related("dependencies", "parties") serializer_class = PackageAPISerializer - lookup_field = 'uuid' + lookup_field = "uuid" filterset_class = PackageFilterSet throttle_classes = [StaffUserRateThrottle, AnonRateThrottle] - @action(detail=True, methods=['get']) + @action(detail=True, methods=["get"]) def latest_version(self, request, *args, **kwargs): """ Return the latest version of the current Package, @@ -385,44 +378,38 @@ def latest_version(self, request, *args, **kwargs): latest_version = package.get_latest_version() if latest_version: return Response( - PackageAPISerializer(latest_version, context={ - 'request': request}).data + PackageAPISerializer(latest_version, context={"request": request}).data ) return Response({}) - @action(detail=True, methods=['get']) + @action(detail=True, methods=["get"]) def history(self, request, *args, **kwargs): - """ - Return the History field associated with the current Package. - """ + """Return the History field associated with the current Package.""" package = self.get_object() return Response({"history": package.history}) - @action(detail=True, methods=['get']) + @action(detail=True, methods=["get"]) def resources(self, request, *args, **kwargs): - """ - Return the Resources associated with the current Package. - """ + """Return the Resources associated with the current Package.""" package = self.get_object() qs = Resource.objects.filter(package=package) paginated_qs = self.paginate_queryset(qs) serializer = ResourceAPISerializer( - paginated_qs, many=True, context={'request': request}) + paginated_qs, many=True, context={"request": request} + ) return self.get_paginated_response(serializer.data) @action(detail=True) def get_enhanced_package_data(self, request, *args, **kwargs): - """ - Return a mapping of enhanced Package data for a given Package - """ + """Return a mapping of enhanced Package data for a given Package""" package = self.get_object() package_data = get_enhanced_package(package) return Response(package_data) - @action(detail=False, methods=['post']) + @action(detail=False, methods=["post"]) def filter_by_checksums(self, request, *args, **kwargs): """ Take a mapping, where the keys are the names of the checksum algorthm @@ -437,7 +424,7 @@ def filter_by_checksums(self, request, *args, **kwargs): - sha512 Example: - + ------- { "sha1": [ "b55fd82f80cc1bd0bdabf9c6e3153788d35d7911", @@ -458,42 +445,42 @@ def filter_by_checksums(self, request, *args, **kwargs): } This will return Packages whose sha1 or md5 matches those values. 
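
The Package flavor of the same action additionally accepts sha256 and sha512 lists plus an enhance_package_data flag, which merges in data from peer packages in the same package set. A request sketch under the same host and route assumptions:

    import requests

    payload = {
        "sha1": ["b55fd82f80cc1bd0bdabf9c6e3153788d35d7911"],
        # when true, each match is returned as enhanced package data
        "enhance_package_data": True,
    }
    response = requests.post(
        "https://public.purldb.io/api/packages/filter_by_checksums/",  # host assumed
        json=payload,
        timeout=30,
    )
    response.raise_for_status()
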
+ """ data = dict(request.data) unsupported_fields = [] - supported_fields = ['md5', 'sha1', 'sha256', - 'sha512', 'enhance_package_data'] + supported_fields = ["md5", "sha1", "sha256", "sha512", "enhance_package_data"] for field, value in data.items(): if field not in supported_fields: unsupported_fields.append(field) if unsupported_fields: - unsupported_fields_str = ', '.join(unsupported_fields) + unsupported_fields_str = ", ".join(unsupported_fields) response_data = { - 'status': f'Unsupported field(s) given: {unsupported_fields_str}' + "status": f"Unsupported field(s) given: {unsupported_fields_str}" } return Response(response_data, status=status.HTTP_400_BAD_REQUEST) - enhance_package_data = data.pop('enhance_package_data', False) + enhance_package_data = data.pop("enhance_package_data", False) if not data: - response_data = { - 'status': 'No values provided' - } + response_data = {"status": "No values provided"} return Response(response_data, status=status.HTTP_400_BAD_REQUEST) lookups = Q() for field, value in data.items(): # Subquery to get the ids of the Packages with the earliest release_date for each `field` - earliest_release_dates = Package.objects.filter( - **{field: OuterRef(field)} - ).order_by('release_date').values('id')[:1] + earliest_release_dates = ( + Package.objects.filter(**{field: OuterRef(field)}) + .order_by("release_date") + .values("id")[:1] + ) value = value or [] lookups |= Q( **{ - f'{field}__in': value, - 'id__in': Subquery(earliest_release_dates), + f"{field}__in": value, + "id__in": Subquery(earliest_release_dates), } ) @@ -501,32 +488,28 @@ def filter_by_checksums(self, request, *args, **kwargs): qs = Package.objects.filter(lookups) paginated_qs = self.paginate_queryset(qs) if enhance_package_data: - serialized_package_data = [get_enhanced_package( - package=package) for package in paginated_qs] + serialized_package_data = [ + get_enhanced_package(package=package) for package in paginated_qs + ] else: serializer = PackageAPISerializer( - paginated_qs, many=True, context={'request': request}) + paginated_qs, many=True, context={"request": request} + ) serialized_package_data = serializer.data return self.get_paginated_response(serialized_package_data) class PackageViewSet(PackagePublicViewSet): - @action(detail=True) def reindex_package(self, request, *args, **kwargs): - """ - Reindex this package instance - """ + """Reindex this package instance""" package = self.get_object() package.reindex() - data = { - 'status': f'{package.package_url} has been queued for reindexing' - } + data = {"status": f"{package.package_url} has been queued for reindexing"} return Response(data) class PackageUpdateSet(viewsets.ViewSet): - """ Take a list of `purls` (where each item is a dictionary containing PURL and content_type). 
@@ -547,17 +530,18 @@ class PackageUpdateSet(viewsets.ViewSet): """ def create(self, request): - res = [] serializer = UpdatePackagesSerializer(data=request.data) if not serializer.is_valid(): - return Response({'errors': serializer.errors}, status=status.HTTP_400_BAD_REQUEST) + return Response( + {"errors": serializer.errors}, status=status.HTTP_400_BAD_REQUEST + ) validated_data = serializer.validated_data - packages = validated_data.get('purls', []) - uuid = validated_data.get('uuid', None) + packages = validated_data.get("purls", []) + uuid = validated_data.get("uuid", None) package_set = None if uuid: @@ -568,34 +552,31 @@ def create(self, request): package_set = package_set except: - message = { - 'update_status': f'No Package Set found for {uuid}' - } + message = {"update_status": f"No Package Set found for {uuid}"} return Response(message, status=status.HTTP_400_BAD_REQUEST) for items in packages or []: - res_data = {} - purl = items.get('purl') + purl = items.get("purl") - res_data['purl'] = purl - content_type = items.get('content_type') + res_data["purl"] = purl + content_type = items.get("content_type") content_type_val = PackageContentType.__getitem__(content_type) lookups = purl_to_lookups(purl) filtered_packages = Package.objects.filter(**lookups) - res_data['update_status'] = "Already Exists" + res_data["update_status"] = "Already Exists" if not filtered_packages: if package_set is None: package_set = PackageSet.objects.create() - lookups['package_content'] = content_type_val - lookups['download_url'] = " " + lookups["package_content"] = content_type_val + lookups["download_url"] = " " cr = Package.objects.create(**lookups) package_set.add_to_package_set(cr) - res_data['update_status'] = "Updated" + res_data["update_status"] = "Updated" res.append(res_data) @@ -605,53 +586,51 @@ def create(self, request): UPDATEABLE_FIELDS = [ - 'primary_language', - 'copyright', - - 'declared_license_expression', - 'declared_license_expression_spdx', - 'license_detections', - 'other_license_expression', - 'other_license_expression_spdx', - 'other_license_detections', + "primary_language", + "copyright", + "declared_license_expression", + "declared_license_expression_spdx", + "license_detections", + "other_license_expression", + "other_license_expression_spdx", + "other_license_detections", # TODO: update extracted license statement and other fields together # all license fields are based off of `extracted_license_statement` and should be treated as a unit # hold off for now - 'extracted_license_statement', - - 'notice_text', - 'api_data_url', - 'bug_tracking_url', - 'code_view_url', - 'vcs_url', - 'source_packages', - 'repository_homepage_url', - 'dependencies', - 'parties', - 'homepage_url', - 'description', + "extracted_license_statement", + "notice_text", + "api_data_url", + "bug_tracking_url", + "code_view_url", + "vcs_url", + "source_packages", + "repository_homepage_url", + "dependencies", + "parties", + "homepage_url", + "description", ] NONUPDATEABLE_FIELDS = [ - 'type', - 'namespace', - 'name', - 'version', - 'qualifiers', - 'subpath', - 'purl', - 'datasource_id', - 'download_url', - 'size', - 'md5', - 'sha1', - 'sha256', - 'sha512', - 'package_uid', - 'repository_download_url', - 'file_references', - 'history', - 'last_modified_date', + "type", + "namespace", + "name", + "version", + "qualifiers", + "subpath", + "purl", + "datasource_id", + "download_url", + "size", + "md5", + "sha1", + "sha256", + "sha512", + "package_uid", + "repository_download_url", + "file_references", + 
"history", + "last_modified_date", ] @@ -674,7 +653,10 @@ def get_enhanced_package(package): # and we can't enhance a package that is not in a package set. return package.to_dict() - elif package_content in [PackageContentType.BINARY, PackageContentType.SOURCE_ARCHIVE]: + elif package_content in [ + PackageContentType.BINARY, + PackageContentType.SOURCE_ARCHIVE, + ]: # Binary packages can only be part of one set # TODO: Can source_archive packages be part of multiple sets? first_package_in_set = package.package_sets.first() @@ -701,14 +683,12 @@ def _get_enhanced_package(package, packages): # always default to PackageContentType.BINARY as we can have None/NULL in the model for now # Reference: https://github.com/aboutcode-org/purldb/issues/490 - package_content = ( - package and package.package_content) or PackageContentType.BINARY + package_content = (package and package.package_content) or PackageContentType.BINARY for peer in packages: # always default to PackageContentType.BINARY as we can have None/NULL in the model for now # Reference: https://github.com/aboutcode-org/purldb/issues/490 - peer_content = ( - peer and peer.package_content) or PackageContentType.BINARY + peer_content = (peer and peer.package_content) or PackageContentType.BINARY if peer_content >= package_content: # We do not want to mix data with peers of the same package content @@ -719,25 +699,24 @@ def _get_enhanced_package(package, packages): package_value = package_data.get(field) peer_value = getattr(peer, field) if not package_value and peer_value: - if field == 'parties': + if field == "parties": peer_value = PartySerializer(peer_value, many=True).data - if field == 'dependencies': - peer_value = DependentPackageSerializer( - peer_value, many=True).data + if field == "dependencies": + peer_value = DependentPackageSerializer(peer_value, many=True).data package_data[field] = peer_value enhanced = True if enhanced: - extra_data = package_data.get('extra_data', {}) - enhanced_by = extra_data.get('enhanced_by', []) + extra_data = package_data.get("extra_data", {}) + enhanced_by = extra_data.get("enhanced_by", []) enhanced_by.append(peer.purl) - extra_data['enhanced_by'] = enhanced_by - package_data['extra_data'] = extra_data + extra_data["enhanced_by"] = enhanced_by + package_data["extra_data"] = extra_data return package_data class PackageSetViewSet(viewsets.ReadOnlyModelViewSet): - queryset = PackageSet.objects.prefetch_related('packages') + queryset = PackageSet.objects.prefetch_related("packages") serializer_class = PackageSetAPISerializer @@ -747,16 +726,17 @@ class PackageWatchViewSet(CreateListRetrieveUpdateViewSetMixin): Add the new package version to the scan queue. Default watch interval is 7 days. 
""" - queryset = PackageWatch.objects.get_queryset().order_by('-id') + + queryset = PackageWatch.objects.get_queryset().order_by("-id") serializer_class = PackageWatchAPISerializer - lookup_field = 'package_url' - lookup_value_regex = r'pkg:[a-zA-Z0-9_]+\/[a-zA-Z0-9_.-]+(?:\/[a-zA-Z0-9_.-]+)*' - http_method_names = ['get', 'post', 'patch'] + lookup_field = "package_url" + lookup_value_regex = r"pkg:[a-zA-Z0-9_]+\/[a-zA-Z0-9_.-]+(?:\/[a-zA-Z0-9_.-]+)*" + http_method_names = ["get", "post", "patch"] def get_serializer_class(self): - if self.action == 'create': + if self.action == "create": return PackageWatchCreateSerializer - elif self.request.method == 'PATCH': + elif self.request.method == "PATCH": return PackageWatchUpdateSerializer return super().get_serializer_class() @@ -800,20 +780,23 @@ class CollectViewSet(viewsets.ViewSet): **Note:** See `Index packages` for bulk indexing/reindexing of packages. """ + serializer_class = CollectPackageSerializer @extend_schema( parameters=[ - OpenApiParameter('purl', str, 'query', - description='PackageURL', required=True), - OpenApiParameter('source_purl', str, 'query', - description='Source PackageURL'), - + OpenApiParameter( + "purl", str, "query", description="PackageURL", required=True + ), + OpenApiParameter( + "source_purl", str, "query", description="Source PackageURL" + ), # There is no OpenApiTypes.LIST https://github.com/tfranzel/drf-spectacular/issues/341 OpenApiParameter( - 'addon_pipelines', + "addon_pipelines", build_array_type(build_basic_type(OpenApiTypes.STR)), - 'query', description='Addon pipelines', + "query", + description="Addon pipelines", ), ], responses={200: PackageAPISerializer()}, @@ -822,23 +805,25 @@ def list(self, request, format=None): serializer = self.serializer_class(data=request.query_params) if not serializer.is_valid(): return Response( - {'errors': serializer.errors}, + {"errors": serializer.errors}, status=status.HTTP_400_BAD_REQUEST, ) validated_data = serializer.validated_data - purl = validated_data.get('purl') - sort = validated_data.get('sort') or ['-version',] + purl = validated_data.get("purl") + sort = validated_data.get("sort") or [ + "-version", + ] kwargs = dict() # We want this request to have high priority since the user knows the # exact package they want - kwargs['priority'] = 100 + kwargs["priority"] = 100 - if source_purl := validated_data.get('source_purl', None): + if source_purl := validated_data.get("source_purl", None): kwargs["source_purl"] = source_purl - if addon_pipelines := validated_data.get('addon_pipelines', []): + if addon_pipelines := validated_data.get("addon_pipelines", []): kwargs["addon_pipelines"] = addon_pipelines lookups = purl_to_lookups(purl) @@ -848,7 +833,7 @@ def list(self, request, format=None): errors = priority_router.process(purl, **kwargs) except NoRouteAvailable: message = { - 'status': f'cannot fetch Package data for {purl}: no available handler' + "status": f"cannot fetch Package data for {purl}: no available handler" } return Response(message, status=status.HTTP_400_BAD_REQUEST) @@ -858,7 +843,7 @@ def list(self, request, format=None): message = {} if errors: message = { - 'status': f'error(s) occurred when fetching metadata for {purl}: {errors}' + "status": f"error(s) occurred when fetching metadata for {purl}: {errors}" } return Response(message, status=status.HTTP_400_BAD_REQUEST) @@ -866,7 +851,8 @@ def list(self, request, format=None): get_source_package_and_add_to_package_set(package) serializer = PackageAPISerializer( - packages, many=True, 
context={'request': request}) + packages, many=True, context={"request": request} + ) return Response(serializer.data) @extend_schema( @@ -875,7 +861,7 @@ def list(self, request, format=None): 200: IndexPackagesResponseSerializer(), }, ) - @action(detail=False, methods=['post'], serializer_class=IndexPackagesSerializer) + @action(detail=False, methods=["post"], serializer_class=IndexPackagesSerializer) def index_packages(self, request, *args, **kwargs): """ Collect and index a JSON array of `packages` objects with PURLs to process. @@ -975,12 +961,14 @@ def _reindex_package(package, reindexed_packages, **kwargs): serializer = self.serializer_class(data=request.data) if not serializer.is_valid(): - return Response({'errors': serializer.errors}, status=status.HTTP_400_BAD_REQUEST) + return Response( + {"errors": serializer.errors}, status=status.HTTP_400_BAD_REQUEST + ) validated_data = serializer.validated_data - packages = validated_data.get('packages', []) - reindex = validated_data.get('reindex', False) - reindex_set = validated_data.get('reindex_set', False) + packages = validated_data.get("packages", []) + reindex = validated_data.get("reindex", False) + reindex_set = validated_data.get("reindex_set", False) queued_packages = [] unqueued_packages = [] @@ -989,19 +977,31 @@ def _reindex_package(package, reindexed_packages, **kwargs): reindexed_packages = [] requeued_packages = [] - supported_ecosystems = ['maven', 'npm', 'deb', - 'generic', 'gnu', 'openssl', 'github', 'conan'] + supported_ecosystems = [ + "maven", + "npm", + "deb", + "generic", + "gnu", + "openssl", + "github", + "conan", + ] unique_packages, unsupported_packages, unsupported_vers = get_resolved_packages( - packages, supported_ecosystems) + packages, supported_ecosystems + ) if reindex: for package in unique_packages: - purl = package['purl'] + purl = package["purl"] kwargs = dict() - if addon_pipelines := package.get('addon_pipelines'): + if addon_pipelines := package.get("addon_pipelines"): kwargs["addon_pipelines"] = [ - pipe for pipe in addon_pipelines if is_supported_addon_pipeline(pipe)] + pipe + for pipe in addon_pipelines + if is_supported_addon_pipeline(pipe) + ] lookups = purl_to_lookups(purl) packages = Package.objects.filter(**lookups) if packages.count() > 0: @@ -1011,62 +1011,68 @@ def _reindex_package(package, reindexed_packages, **kwargs): if reindex_set: for package_set in package.package_sets.all(): for p in package_set.packages.all(): - _reindex_package( - p, reindexed_packages, **kwargs) + _reindex_package(p, reindexed_packages, **kwargs) else: nonexistent_packages.append(package) - requeued_packages.extend( - [p.package_url for p in reindexed_packages]) + requeued_packages.extend([p.package_url for p in reindexed_packages]) if not reindex or nonexistent_packages: - interesting_packages = nonexistent_packages if nonexistent_packages else unique_packages + interesting_packages = ( + nonexistent_packages if nonexistent_packages else unique_packages + ) for package in interesting_packages: - purl = package['purl'] + purl = package["purl"] is_routable_purl = priority_router.is_routable(purl) if not is_routable_purl: unsupported_packages.append(purl) else: # add to queue extra_fields = dict() - if source_purl := package.get('source_purl'): - extra_fields['source_uri'] = source_purl - if addon_pipelines := package.get('addon_pipelines'): - extra_fields['addon_pipelines'] = [ - pipe for pipe in addon_pipelines if is_supported_addon_pipeline(pipe)] - if priority := package.get('priority'): - 
extra_fields['priority'] = priority + if source_purl := package.get("source_purl"): + extra_fields["source_uri"] = source_purl + if addon_pipelines := package.get("addon_pipelines"): + extra_fields["addon_pipelines"] = [ + pipe + for pipe in addon_pipelines + if is_supported_addon_pipeline(pipe) + ] + if priority := package.get("priority"): + extra_fields["priority"] = priority priority_resource_uri = PriorityResourceURI.objects.insert( - purl, **extra_fields) + purl, **extra_fields + ) if priority_resource_uri: queued_packages.append(purl) else: unqueued_packages.append(purl) response_data = { - 'queued_packages_count': len(queued_packages), - 'queued_packages': queued_packages, - 'requeued_packages_count': len(requeued_packages), - 'requeued_packages': requeued_packages, - 'unqueued_packages_count': len(unqueued_packages), - 'unqueued_packages': unqueued_packages, - 'unsupported_packages_count': len(unsupported_packages), - 'unsupported_packages': unsupported_packages, - 'unsupported_vers_count': len(unsupported_vers), - 'unsupported_vers': unsupported_vers, + "queued_packages_count": len(queued_packages), + "queued_packages": queued_packages, + "requeued_packages_count": len(requeued_packages), + "requeued_packages": requeued_packages, + "unqueued_packages_count": len(unqueued_packages), + "unqueued_packages": unqueued_packages, + "unsupported_packages_count": len(unsupported_packages), + "unsupported_packages": unsupported_packages, + "unsupported_vers_count": len(unsupported_vers), + "unsupported_vers": unsupported_vers, } serializer = IndexPackagesResponseSerializer( - response_data, context={'request': request}) + response_data, context={"request": request} + ) return Response(serializer.data) @extend_schema( parameters=[ - OpenApiParameter('purl', str, 'query', - description='PackageURL', required=True), + OpenApiParameter( + "purl", str, "query", description="PackageURL", required=True + ), ], responses={200: PackageAPISerializer()}, ) - @action(detail=False, methods=['get'], serializer_class=CollectPackageSerializer) + @action(detail=False, methods=["get"], serializer_class=CollectPackageSerializer) def reindex_metadata(self, request, *args, **kwargs): """ Collect or recollect the package metadata of a ``PURL`` string. 
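
A request sketch for the reindex_metadata action defined below, not part of the patch: it is a GET taking a purl query parameter; the host and the /api/collect/ route prefix are assumptions, and the PURL is hypothetical.

    import requests

    response = requests.get(
        "https://public.purldb.io/api/collect/reindex_metadata/",  # host assumed
        params={"purl": "pkg:pypi/requests@2.31.0"},  # hypothetical PURL
        timeout=60,
    )
    # a 400 with a status message is returned when the purl is unknown or has
    # no handler; otherwise the refreshed package data is returned
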
@@ -1087,18 +1093,18 @@ def reindex_metadata(self, request, *args, **kwargs): serializer = self.serializer_class(data=request.query_params) if not serializer.is_valid(): return Response( - {'errors': serializer.errors}, + {"errors": serializer.errors}, status=status.HTTP_400_BAD_REQUEST, ) validated_data = serializer.validated_data - purl = validated_data.get('purl') + purl = validated_data.get("purl") lookups = purl_to_lookups(purl) packages = Package.objects.filter(**lookups) if packages.count() == 0: return Response( - {'status': f'Not recollecting: Package does not exist for {purl}'}, + {"status": f"Not recollecting: Package does not exist for {purl}"}, status=status.HTTP_400_BAD_REQUEST, ) @@ -1109,7 +1115,7 @@ def reindex_metadata(self, request, *args, **kwargs): errors = priority_router.process(purl, **kwargs) except NoRouteAvailable: message = { - 'status': f'cannot fetch Package data for {purl}: no available handler' + "status": f"cannot fetch Package data for {purl}: no available handler" } return Response(message, status=status.HTTP_400_BAD_REQUEST) @@ -1119,12 +1125,13 @@ def reindex_metadata(self, request, *args, **kwargs): message = {} if errors: message = { - 'status': f'error(s) occurred when fetching metadata for {purl}: {errors}' + "status": f"error(s) occurred when fetching metadata for {purl}: {errors}" } return Response(message, status=status.HTTP_400_BAD_REQUEST) serializer = PackageAPISerializer( - packages, many=True, context={'request': request}) + packages, many=True, context={"request": request} + ) return Response(serializer.data) @@ -1148,16 +1155,22 @@ class PurlValidateViewSet(viewsets.ViewSet): - exists - True, if input PURL exists in real world and `check_existence` flag is enabled. """ + serializer_class = PurlValidateSerializer def get_view_name(self): - return 'Validate PURL' + return "Validate PURL" @extend_schema( parameters=[ - OpenApiParameter('purl', str, 'query', description='PackageURL'), - OpenApiParameter('check_existence', bool, 'query', - description='Check existence', default=False), + OpenApiParameter("purl", str, "query", description="PackageURL"), + OpenApiParameter( + "check_existence", + bool, + "query", + description="Check existence", + default=False, + ), ], responses={200: PurlValidateResponseSerializer()}, ) @@ -1165,47 +1178,46 @@ def list(self, request): serializer = self.serializer_class(data=request.query_params) if not serializer.is_valid(): - return Response({'errors': serializer.errors}, status=status.HTTP_400_BAD_REQUEST) + return Response( + {"errors": serializer.errors}, status=status.HTTP_400_BAD_REQUEST + ) validated_data = serializer.validated_data - purl = validated_data.get('purl') - check_existence = validated_data.get('check_existence', False) + purl = validated_data.get("purl") + check_existence = validated_data.get("check_existence", False) message_valid = "The provided PackageURL is valid." message_not_valid = "The provided PackageURL is not valid." - message_valid_and_exists = ( - "The provided Package URL is valid, and the package exists in the upstream repo." - ) + message_valid_and_exists = "The provided Package URL is valid, and the package exists in the upstream repo." message_valid_but_does_not_exist = ( "The provided PackageURL is valid, but does not exist in the upstream repo." ) - message_valid_but_package_type_not_supported = ( - "The provided PackageURL is valid, but `check_existence` is not supported for this package type." 
- ) + message_valid_but_package_type_not_supported = "The provided PackageURL is valid, but `check_existence` is not supported for this package type." response = {} - response['exists'] = None - response['purl'] = purl - response['valid'] = False - response['message'] = message_not_valid + response["exists"] = None + response["purl"] = purl + response["valid"] = False + response["message"] = message_not_valid # validate purl try: package_url = PackageURL.from_string(purl) except ValueError: serializer = PurlValidateResponseSerializer( - response, context={'request': request}) + response, context={"request": request} + ) return Response(serializer.data, status=status.HTTP_400_BAD_REQUEST) - response['valid'] = True + response["valid"] = True response["message"] = message_valid unsupported_ecosystem = False if check_existence: - response['exists'] = False + response["exists"] = False lookups = purl_to_lookups(purl) packages = Package.objects.filter(**lookups) if packages.exists(): - response['exists'] = True + response["exists"] = True else: versionless_purl = PackageURL( type=package_url.type, @@ -1217,25 +1229,26 @@ def list(self, request): and package_url.type in VERSION_CLASS_BY_PACKAGE_TYPE ): all_versions = get_all_versions_plain(versionless_purl) - if all_versions and (not package_url.version or ( - package_url.version in all_versions) + if all_versions and ( + not package_url.version or (package_url.version in all_versions) ): # True, if requested purl has no version and any version of package exists upstream. # True, if requested purl.version exists upstream. - response['exists'] = True + response["exists"] = True else: unsupported_ecosystem = True - if response['exists']: + if response["exists"]: response["message"] = message_valid_and_exists elif unsupported_ecosystem: - response['exists'] = None + response["exists"] = None response["message"] = message_valid_but_package_type_not_supported else: response["message"] = message_valid_but_does_not_exist serializer = PurlValidateResponseSerializer( - response, context={'request': request}) + response, context={"request": request} + ) return Response(serializer.data) @@ -1250,8 +1263,8 @@ def get_resolved_packages(packages, supported_ecosystems): unsupported_vers = set() for package in packages or []: - purl = package.get('purl') - vers = package.get('vers') + purl = package.get("purl") + vers = package.get("vers") if not purl: continue @@ -1268,7 +1281,7 @@ def get_resolved_packages(packages, supported_ecosystems): if parsed_purl.version: # We prioritize Package requests that have explicit versions - package['priority'] = 100 + package["priority"] = 100 resolved_packages_by_purl[purl] = package continue @@ -1276,24 +1289,26 @@ def get_resolved_packages(packages, supported_ecosystems): if not vers and not parsed_purl.version: if resolved_purls := resolve_all_versions(parsed_purl): for res_purl in resolved_purls: - resolved_packages_by_purl[res_purl] = {'purl': res_purl} + resolved_packages_by_purl[res_purl] = {"purl": res_purl} continue if resolved_purls := resolve_versions(parsed_purl, vers): for res_purl in resolved_purls: - resolved_packages_by_purl[res_purl] = {'purl': res_purl} + resolved_packages_by_purl[res_purl] = {"purl": res_purl} else: unsupported_vers.add(vers) unique_resolved_packages = resolved_packages_by_purl.values() - return list(unique_resolved_packages), list(unsupported_purls), list(unsupported_vers) + return ( + list(unique_resolved_packages), + list(unsupported_purls), + list(unsupported_vers), + ) def 
resolve_all_versions(parsed_purl): - """ - Take versionless and return a list of PURLs for all the released versions. - """ + """Take versionless and return a list of PURLs for all the released versions.""" all_versions = get_all_versions(parsed_purl) or [] return [ @@ -1340,16 +1355,15 @@ def resolve_versions(parsed_purl, vers): result.append(str(package_url)) except InvalidConstraintsError: logger.warning( - f"Invalid constraints sequence in '{vers}' for '{parsed_purl}'") + f"Invalid constraints sequence in '{vers}' for '{parsed_purl}'" + ) return return result def get_all_versions_plain(purl: PackageURL): - """ - Return all the versions available for the given purls. - """ + """Return all the versions available for the given purls.""" if ( purl.type not in VERSION_API_CLASSES_BY_PACKAGE_TYPE or purl.type not in VERSION_CLASS_BY_PACKAGE_TYPE @@ -1386,4 +1400,6 @@ def get_all_versions(purl): VERSION_CLASS_BY_PACKAGE_TYPE = { - pkg_type: range_class.version_class for pkg_type, range_class in RANGE_CLASS_BY_SCHEMES.items()} + pkg_type: range_class.version_class + for pkg_type, range_class in RANGE_CLASS_BY_SCHEMES.items() +} diff --git a/packagedb/api_custom.py b/packagedb/api_custom.py index e793fe8a..6a068788 100644 --- a/packagedb/api_custom.py +++ b/packagedb/api_custom.py @@ -17,6 +17,7 @@ class PageSizePagination(PageNumberPagination): For example: http://api.example.org/accounts/?page=4&page_size=20 """ + page_size = 20 max_page_size = 20 - page_size_query_param = 'page_size' + page_size_query_param = "page_size" diff --git a/packagedb/filters.py b/packagedb/filters.py index f6974764..11a4ff2c 100644 --- a/packagedb/filters.py +++ b/packagedb/filters.py @@ -9,10 +9,11 @@ import shlex -import django_filters from django.core.exceptions import FieldError from django.db.models import Q +import django_filters + # The function and Classes in this file are from https://github.com/aboutcode-org/scancode.io/blob/main/scanpipe/filters.py @@ -49,8 +50,7 @@ def parse_query_string_to_lookups(query_string, default_lookup_expr, default_fie search_value = term field_name = default_field - lookups &= Q( - **{f"{field_name}__{lookup_expr}": search_value}, _negated=negated) + lookups &= Q(**{f"{field_name}__{lookup_expr}": search_value}, _negated=negated) return lookups @@ -85,8 +85,7 @@ def filter(self, qs, value): if "://" not in value and ":" in value: return super().filter(qs, value) - search_fields = ["type", "namespace", - "name", "version", "download_url"] + search_fields = ["type", "namespace", "name", "version", "download_url"] lookups = Q() for field_names in search_fields: lookups |= Q(**{f"{field_names}__{self.lookup_expr}": value}) diff --git a/packagedb/from_purl.py b/packagedb/from_purl.py index b6102a9c..e0adc42a 100644 --- a/packagedb/from_purl.py +++ b/packagedb/from_purl.py @@ -14,23 +14,20 @@ from rest_framework import viewsets from rest_framework.response import Response -from purl2vcs.find_source_repo import get_package_object_from_purl -from purl2vcs.find_source_repo import get_source_repo from packagedb.serializers import PurltoGitRepoResponseSerializer from packagedb.serializers import PurltoGitRepoSerializer +from purl2vcs.find_source_repo import get_package_object_from_purl +from purl2vcs.find_source_repo import get_source_repo @extend_schema( parameters=[ - OpenApiParameter("package_url", str, "query", - description="package url"), + OpenApiParameter("package_url", str, "query", description="package url"), ], responses={200: PurltoGitRepoResponseSerializer()}, ) class 
FromPurlToGitRepoViewSet(viewsets.ViewSet): - """ - Return a ``git_repo`` from a standard PackageURL. - """ + """Return a ``git_repo`` from a standard PackageURL.""" serializer_class = PurltoGitRepoSerializer diff --git a/packagedb/management/commands/create_source_repo_packages.py b/packagedb/management/commands/create_source_repo_packages.py index 1d7d461c..0fa0817c 100644 --- a/packagedb/management/commands/create_source_repo_packages.py +++ b/packagedb/management/commands/create_source_repo_packages.py @@ -11,14 +11,13 @@ import sys import openpyxl -from packageurl.contrib.django.utils import purl_to_lookups from minecode.management.commands import VerboseCommand from minecode.model_utils import add_package_to_scan_queue -from purl2vcs.find_source_repo import add_source_package_to_package_set -from purl2vcs.find_source_repo import get_package_object_from_purl from packagedb.models import Package from packagedb.models import PackageContentType +from purl2vcs.find_source_repo import add_source_package_to_package_set +from purl2vcs.find_source_repo import get_package_object_from_purl TRACE = False diff --git a/packagedb/management/commands/fix_purl_values.py b/packagedb/management/commands/fix_purl_values.py index be34ff80..da0866be 100644 --- a/packagedb/management/commands/fix_purl_values.py +++ b/packagedb/management/commands/fix_purl_values.py @@ -7,22 +7,22 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from dateutil.parser import parse as dateutil_parse -import copy import logging import sys -from urllib3.util import Retry +import requests +from dateutil.parser import parse as dateutil_parse +from packagedcode.maven import build_filename +from packagedcode.maven import get_urls from packageurl import PackageURL -from packagedcode.maven import get_urls, build_filename from requests import Session from requests.adapters import HTTPAdapter -import requests +from urllib3.util import Retry -from minecode.management.commands import VerboseCommand -from minecode.utils import MemorySavingQuerysetIterator from minecode.collectors.maven import collect_links_from_text from minecode.collectors.maven import filter_for_artifacts +from minecode.management.commands import VerboseCommand +from minecode.utils import MemorySavingQuerysetIterator from packagedb.models import Package DEFAULT_TIMEOUT = 30 @@ -34,9 +34,9 @@ logger.setLevel(logging.INFO) session = Session() -session.mount('https://', HTTPAdapter(max_retries=Retry(10))) +session.mount("https://", HTTPAdapter(max_retries=Retry(10))) headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36', + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36", } @@ -45,16 +45,18 @@ def get_timestamps_by_links(package_version_page_url): response = requests.get(package_version_page_url) if response: timestamps_by_links = collect_links_from_text( - response.text, filter=filter_for_artifacts) + response.text, filter=filter_for_artifacts + ) timestamps_by_links = { - link: dateutil_parse(timestamp) for link, timestamp in timestamps_by_links.items() + link: dateutil_parse(timestamp) + for link, timestamp in timestamps_by_links.items() } return timestamps_by_links -class MavenArtifact(object): - def __init__(self, namespace, name, version, qualifiers='', ec=[]): - type = 'maven' +class MavenArtifact: + def __init__(self, namespace, name, version, qualifiers="", ec=[]): + 
type = "maven" self.type = type self.namespace = namespace self.name = name @@ -65,7 +67,7 @@ def __init__(self, namespace, name, version, qualifiers='', ec=[]): namespace=namespace, name=name, version=version, - qualifiers=qualifiers + qualifiers=qualifiers, ) urls = get_urls( namespace=namespace, @@ -73,19 +75,18 @@ def __init__(self, namespace, name, version, qualifiers='', ec=[]): version=version, qualifiers=self.package_url.qualifiers, ) - self.download_url = urls['repository_download_url'] - self.repository_homepage_url = urls['repository_homepage_url'] - self.api_data_url = urls['api_data_url'] + self.download_url = urls["repository_download_url"] + self.repository_homepage_url = urls["repository_homepage_url"] + self.api_data_url = urls["api_data_url"] qualifiers_mapping = self.package_url.qualifiers filename = build_filename( artifact_id=name, version=version, - extension=qualifiers_mapping.get('type') or 'jar', - classifier=qualifiers_mapping.get('classifier'), + extension=qualifiers_mapping.get("type") or "jar", + classifier=qualifiers_mapping.get("classifier"), ) - timestamps_by_links = get_timestamps_by_links( - self.repository_homepage_url) + timestamps_by_links = get_timestamps_by_links(self.repository_homepage_url) self.release_date = timestamps_by_links.get(filename) self.related_artifacts = list( self._populate_related_artifacts( @@ -98,14 +99,14 @@ def __init__(self, namespace, name, version, qualifiers='', ec=[]): @classmethod def _populate_related_artifacts(cls, namespace, name, version, ec): - filtered_ec = [entry for entry in ec if not entry.startswith('.')] + filtered_ec = [entry for entry in ec if not entry.startswith(".")] for entry in filtered_ec: - _, ending = entry.split('-') - split_ending = ending.split('.') + _, ending = entry.split("-") + split_ending = ending.split(".") classifier = None if len(split_ending) > 0: classifier = split_ending[0] - qualifiers = f'classifier={classifier}' + qualifiers = f"classifier={classifier}" yield cls( namespace=namespace, name=name, @@ -115,7 +116,7 @@ def _populate_related_artifacts(cls, namespace, name, version, ec): def query_sha1_on_maven(sha1, timeout=DEFAULT_TIMEOUT): - maven_api_search_url = f'https://search.maven.org/solrsearch/select?q=1:{sha1}' + maven_api_search_url = f"https://search.maven.org/solrsearch/select?q=1:{sha1}" try: response = session.get(maven_api_search_url, timeout=timeout) response.raise_for_status() @@ -125,14 +126,14 @@ def query_sha1_on_maven(sha1, timeout=DEFAULT_TIMEOUT): if not response.ok: return f"API query failed for: {maven_api_search_url}" contents = response.json() - resp = contents.get('response', {}) + resp = contents.get("response", {}) matched_artifacts = [] - if resp.get('numFound', 0) > 0: - for matched_artifact in resp.get('docs', []): - namespace = matched_artifact.get('g', '') - name = matched_artifact.get('a', '') - version = matched_artifact.get('v', '') - ec = matched_artifact.get('ec', []) + if resp.get("numFound", 0) > 0: + for matched_artifact in resp.get("docs", []): + namespace = matched_artifact.get("g", "") + name = matched_artifact.get("a", "") + version = matched_artifact.get("v", "") + ec = matched_artifact.get("ec", []) if not namespace and name and version: continue matched_artifacts.append( @@ -147,14 +148,14 @@ def query_sha1_on_maven(sha1, timeout=DEFAULT_TIMEOUT): class Command(VerboseCommand): - help = 'Update maven Package download_url values' + help = "Update maven Package download_url values" def handle(self, *args, **options): - maven_packages = 
Package.objects.filter( - type='maven', sha1__is_null=False) + maven_packages = Package.objects.filter(type="maven", sha1__is_null=False) maven_packages_count = maven_packages.count() logger.info( - f'Checking {maven_packages_count:,} Maven Package PackageURL values') + f"Checking {maven_packages_count:,} Maven Package PackageURL values" + ) packages_to_delete = [] for package in MemorySavingQuerysetIterator(maven_packages): @@ -197,8 +198,12 @@ def handle(self, *args, **options): package_different_case.qualifiers = artifact_qualifiers package_different_case.download_url = artifact.download_url package_different_case.release_date = artifact.release_date - package_different_case.repository_homepage_url = artifact.repository_homepage_url - package_different_case.repository_download_url = artifact.repository_download_url + package_different_case.repository_homepage_url = ( + artifact.repository_homepage_url + ) + package_different_case.repository_download_url = ( + artifact.repository_download_url + ) package_different_case.api_data_url = artifact.api_data_url package_different_case.sha1 = package.sha1 package_different_case.save() diff --git a/packagedb/management/commands/run_scheduler.py b/packagedb/management/commands/run_scheduler.py index b70f6e94..e09fc561 100644 --- a/packagedb/management/commands/run_scheduler.py +++ b/packagedb/management/commands/run_scheduler.py @@ -8,6 +8,7 @@ # from django_rq.management.commands import rqscheduler + from packagedb.models import PackageWatch from packagedb.schedules import clear_zombie_watch_schedules from packagedb.schedules import scheduled_job_exists diff --git a/packagedb/management/commands/watch_packages.py b/packagedb/management/commands/watch_packages.py index f6297f4e..4c379598 100644 --- a/packagedb/management/commands/watch_packages.py +++ b/packagedb/management/commands/watch_packages.py @@ -7,8 +7,9 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# -from commoncode import cliutils from django.core.management.base import BaseCommand + +from commoncode import cliutils from fetchcode.package_versions import SUPPORTED_ECOSYSTEMS from packageurl import PackageURL from univers.version_range import RANGE_CLASS_BY_SCHEMES @@ -36,8 +37,7 @@ def handle(self, *args, **options): purl_value = options.get("purl") packages_qs = ( - Package.objects.filter( - type__in=PRIORITY_QUEUE_SUPPORTED_ECOSYSTEMS) + Package.objects.filter(type__in=PRIORITY_QUEUE_SUPPORTED_ECOSYSTEMS) .filter(type__in=SUPPORTED_ECOSYSTEMS) .distinct("type", "namespace", "name") .order_by("type", "namespace", "name") diff --git a/packagedb/models.py b/packagedb/models.py index 4ed64760..0a1fcb09 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -13,8 +13,6 @@ import uuid from collections import OrderedDict -import natsort -from dateutil.parser import parse as dateutil_parse from django.conf import settings from django.contrib.auth.models import UserManager from django.contrib.postgres.fields import ArrayField @@ -27,14 +25,17 @@ from django.dispatch import receiver from django.utils import timezone from django.utils.translation import gettext_lazy as _ -from rest_framework.authtoken.models import Token +import natsort +from dateutil.parser import parse as dateutil_parse from licensedcode.cache import build_spdx_license_expression -from packagedb import schedules from packagedcode.models import normalize_qualifiers from packageurl import PackageURL from packageurl.contrib.django.models import PackageURLMixin from packageurl.contrib.django.models import PackageURLQuerySetMixin +from rest_framework.authtoken.models import Token + +from packagedb import schedules TRACE = False @@ -44,10 +45,8 @@ def sort_version(packages): - """ - Return the packages sorted by version. - """ - return natsort.natsorted(packages, key=lambda p: p.version.replace('.', '~')+'z') + """Return the packages sorted by version.""" + return natsort.natsorted(packages, key=lambda p: p.version.replace(".", "~") + "z") class PackageQuerySet(PackageURLQuerySetMixin, models.QuerySet): @@ -57,14 +56,13 @@ def insert(self, download_url, **extra_fields): Return None if the insertion failed when an identical entry already exist. """ package, created = self.get_or_create( - download_url=download_url, defaults=extra_fields) + download_url=download_url, defaults=extra_fields + ) if created: return package def get_or_none(self, *args, **kwargs): - """ - Return the object matching the given lookup parameters, or None if no match exists. - """ + """Return the object matching the given lookup parameters, or None if no match exists.""" try: return self.get(*args, **kwargs) except Package.DoesNotExist: @@ -86,11 +84,11 @@ def paginated(self, per_page=5000): VCS_CHOICES = [ - ('git', 'git'), - ('svn', 'subversion'), - ('hg', 'mercurial'), - ('bzr', 'bazaar'), - ('cvs', 'cvs'), + ("git", "git"), + ("svn", "subversion"), + ("hg", "mercurial"), + ("bzr", "bazaar"), + ("cvs", "cvs"), ] @@ -108,12 +106,13 @@ class HistoryMixin(models.Model): is a list containing mappings representing the history for this object. Each mapping contains the field "timestamp" and "message". """ + history = models.JSONField( default=list, blank=True, editable=False, help_text=_( - 'A list of mappings representing the history for this object. ' + "A list of mappings representing the history for this object. " 'Each mapping contains the fields "timestamp" and "message".' 
        ),
    )

@@ -121,22 +120,20 @@ class HistoryMixin(models.Model):
         null=True,
         blank=True,
         db_index=True,
-        help_text=_('Timestamp set when a Package is created'),
+        help_text=_("Timestamp set when a Package is created"),
     )
     last_modified_date = models.DateTimeField(
         null=True,
         blank=True,
         db_index=True,
-        help_text=_('Timestamp set when a Package is created or modified'),
+        help_text=_("Timestamp set when a Package is created or modified"),
     )

     class Meta:
         abstract = True

     def append_to_history(self, message, data={}, save=False):
-        """
-        Append the ``message`` string to the history of this object.
-        """
+        """Append the ``message`` string to the history of this object."""
         time = timezone.now()
         timestamp = time.strftime("%Y-%m-%d-%H:%M:%S")
         entry = {
@@ -220,9 +217,7 @@ class ExtraDataFieldMixin(models.Model):
     )

     def update_extra_data(self, data):
-        """
-        Updates the `extra_data` field with the provided `data` dict.
-        """
+        """Updates the `extra_data` field with the provided `data` dict."""
         if type(data) != dict:
             raise ValueError("Argument `data` value must be a dict()")

@@ -354,8 +349,7 @@ class AbstractPackage(models.Model):
     copyright = models.TextField(
         blank=True,
         null=True,
-        help_text=_(
-            "Copyright statements for this package. Typically one per line."),
+        help_text=_("Copyright statements for this package. Typically one per line."),
     )
     holder = models.TextField(
         blank=True,
@@ -456,13 +450,13 @@ class PackageContentType(models.IntegerChoices):

     # TODO: curation is a special case, based on how the curation identity
     # fields matches with the current package
-    CURATION = 1, 'curation'
-    PATCH = 2, 'patch'
-    SOURCE_REPO = 3, 'source_repo'
-    SOURCE_ARCHIVE = 4, 'source_archive'
-    BINARY = 5, 'binary'
-    TEST = 6, 'test'
-    DOC = 7, 'doc'
+    CURATION = 1, "curation"
+    PATCH = 2, "patch"
+    SOURCE_REPO = 3, "source_repo"
+    SOURCE_ARCHIVE = 4, "source_archive"
+    BINARY = 5, "binary"
+    TEST = 6, "test"
+    DOC = 7, "doc"


 def get_class_name(obj):
@@ -483,9 +477,11 @@ class Package(
     )
     mining_level = models.PositiveIntegerField(
         default=0,
-        help_text=_('A numeric indication of the highest depth and breadth '
-                    'of package data collected through previous visits. '
-                    'Higher means more and deeper collection.'),
+        help_text=_(
+            "A numeric indication of the highest depth and breadth "
+            "of package data collected through previous visits. "
+            "Higher means more and deeper collection."
+        ),
     )
     keywords = ArrayField(
         base_field=models.TextField(
@@ -495,15 +491,17 @@ class Package(
         default=list,
         blank=True,
         null=True,
-        help_text=_('A list of keywords.'),
+        help_text=_("A list of keywords."),
     )
     root_path = models.CharField(
         max_length=1024,
         blank=True,
         null=True,
-        help_text=_('The path to the root of the package documented in this manifest '
-                    'if any, such as a Maven .pom or a npm package.json parent '
-                    'directory.')
+        help_text=_(
+            "The path to the root of the package documented in this manifest "
+            "if any, such as a Maven .pom or a npm package.json parent "
+            "directory."
+        ),
     )
     source_packages = ArrayField(
         base_field=models.TextField(
@@ -513,31 +511,33 @@ class Package(
         default=list,
         blank=True,
         null=True,
-        help_text=_('A list of source package URLs (aka. "purl") for this package. '
-                    'For instance an SRPM is the "source package" for a binary RPM.'),
+        help_text=_(
+            'A list of source package URLs (aka. "purl") for this package. '
+            'For instance an SRPM is the "source package" for a binary RPM.'
+ ), ) last_indexed_date = models.DateTimeField( null=True, blank=True, - help_text='Timestamp set to the date of the last indexing. Used to track indexing status.' + help_text="Timestamp set to the date of the last indexing. Used to track indexing status.", ) index_error = models.TextField( null=True, blank=True, - help_text='Indexing errors messages. When present this means the indexing has failed.', + help_text="Indexing errors messages. When present this means the indexing has failed.", ) package_sets = models.ManyToManyField( - 'PackageSet', - related_name='packages', - help_text=_( - 'A set representing the Package sets this Package is a member of.'), + "PackageSet", + related_name="packages", + help_text=_("A set representing the Package sets this Package is a member of."), ) package_content = models.IntegerField( null=True, choices=PackageContentType.choices, help_text=_( - 'Content of this Package as one of: {}'.format( - ', '.join(PackageContentType.labels)) + "Content of this Package as one of: {}".format( + ", ".join(PackageContentType.labels) + ) ), ) summary = models.JSONField( @@ -545,7 +545,7 @@ class Package( blank=True, null=True, help_text=_( - 'A mapping containing a summary and license clarity score for this Package' + "A mapping containing a summary and license clarity score for this Package" ), ) @@ -553,37 +553,37 @@ class Package( # TODO: Think about ordering, unique together, indexes, etc. class Meta: - ordering = ['id'] + ordering = ["id"] unique_together = [ ( - 'download_url', - 'type', - 'namespace', - 'name', - 'version', - 'qualifiers', - 'subpath' + "download_url", + "type", + "namespace", + "name", + "version", + "qualifiers", + "subpath", ) ] indexes = [ # multicolumn index for search on a whole `purl` - models.Index(fields=[ - 'type', 'namespace', 'name', 'version', 'qualifiers', 'subpath' - ]), - models.Index(fields=['type']), - models.Index(fields=['namespace']), - models.Index(fields=['name']), - models.Index(fields=['version']), - models.Index(fields=['qualifiers']), - models.Index(fields=['subpath']), - models.Index(fields=['download_url']), - models.Index(fields=['filename']), - models.Index(fields=['size']), - models.Index(fields=['release_date']), - models.Index(fields=['md5']), - models.Index(fields=['sha1']), - models.Index(fields=['sha256']), - models.Index(fields=['sha512']), + models.Index( + fields=["type", "namespace", "name", "version", "qualifiers", "subpath"] + ), + models.Index(fields=["type"]), + models.Index(fields=["namespace"]), + models.Index(fields=["name"]), + models.Index(fields=["version"]), + models.Index(fields=["qualifiers"]), + models.Index(fields=["subpath"]), + models.Index(fields=["download_url"]), + models.Index(fields=["filename"]), + models.Index(fields=["size"]), + models.Index(fields=["release_date"]), + models.Index(fields=["md5"]), + models.Index(fields=["sha1"]), + models.Index(fields=["sha256"]), + models.Index(fields=["sha512"]), ] def __str__(self): @@ -596,18 +596,17 @@ def purl(self): @property def package_uid(self): purl = PackageURL.from_string(self.package_url) - purl.qualifiers['uuid'] = str(self.uuid) + purl.qualifiers["uuid"] = str(self.uuid) return str(purl) def to_dict(self): from packagedb.serializers import PackageMetadataSerializer + package_metadata = PackageMetadataSerializer(self).data return package_metadata def get_all_versions(self): - """ - Return a list of all the versions of this Package. 
- """ + """Return a list of all the versions of this Package.""" manager = self.__class__.objects queryset = manager.filter( name=self.name, @@ -617,9 +616,7 @@ def get_all_versions(self): return queryset def get_latest_version(self): - """ - Return the latest version of this Package. - """ + """Return the latest version of this Package.""" sorted_versions = sort_version(self.get_all_versions()) if sorted_versions: return sorted_versions[-1] @@ -630,14 +627,15 @@ def reindex(self, **kwargs): created for this Package. The fingerprints and Resources associated with this Package are deleted and recreated from the updated scan data. """ - from minecode.model_utils import add_package_to_scan_queue from minecode.model_utils import DEFAULT_PIPELINES + from minecode.model_utils import add_package_to_scan_queue - addon_pipelines = kwargs.get('addon_pipelines', []) + addon_pipelines = kwargs.get("addon_pipelines", []) pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) add_package_to_scan_queue( - self, pipelines=pipelines, reindex_uri=True, priority=100) + self, pipelines=pipelines, reindex_uri=True, priority=100 + ) def update_fields(self, save=False, **values_by_fields): """ @@ -661,28 +659,28 @@ def update_fields(self, save=False, **values_by_fields): if not hasattr(self, field): # Raise exception when we we are given a keyword argument that # doesn't correspond to a Package field - raise AttributeError( - f"'{class_name}' has no attribute '{field}'") + raise AttributeError(f"'{class_name}' has no attribute '{field}'") related_model_fields = [ - 'dependencies', - 'parties', - 'resources', + "dependencies", + "parties", + "resources", ] if field in related_model_fields: unsaved_models = [] - if field == 'dependencies': + if field == "dependencies": for dep_data in value: if isinstance(dep_data, (dict, OrderedDict)): dep = DependentPackage( package=self, - purl=dep_data.get('purl'), + purl=dep_data.get("purl"), extracted_requirement=dep_data.get( - 'extracted_requirement'), - scope=dep_data.get('scope'), - is_runtime=dep_data.get('is_runtime'), - is_optional=dep_data.get('is_optional'), - is_resolved=dep_data.get('is_resolved'), + "extracted_requirement" + ), + scope=dep_data.get("scope"), + is_runtime=dep_data.get("is_runtime"), + is_optional=dep_data.get("is_optional"), + is_resolved=dep_data.get("is_resolved"), ) elif isinstance(dep_data, DependentPackage): dep = dep_data @@ -692,16 +690,16 @@ def update_fields(self, save=False, **values_by_fields): ) unsaved_models.append(dep) - if field == 'parties': + if field == "parties": for party_data in value: if isinstance(party_data, (dict, OrderedDict)): party = Party( package=self, - type=party_data.get('type'), - role=party_data.get('role'), - name=party_data.get('name'), - email=party_data.get('email'), - url=party_data.get('url'), + type=party_data.get("type"), + role=party_data.get("role"), + name=party_data.get("name"), + email=party_data.get("email"), + url=party_data.get("url"), ) elif isinstance(party_data, Party): party = party_data @@ -711,28 +709,29 @@ def update_fields(self, save=False, **values_by_fields): ) unsaved_models.append(party) - if field == 'resources': + if field == "resources": for resource_data in value: if isinstance(resource_data, (dict, OrderedDict)): resource = Resource( package=self, - path=resource_data.get('path'), - is_file=resource_data.get('type') == 'file', - name=resource_data.get('name'), - extension=resource_data.get('extension'), - size=resource_data.get('size'), - md5=resource_data.get('md5'), - 
sha1=resource_data.get('sha1'), - sha256=resource_data.get('sha256'), - mime_type=resource_data.get('mime_type'), - file_type=resource_data.get('file_type'), + path=resource_data.get("path"), + is_file=resource_data.get("type") == "file", + name=resource_data.get("name"), + extension=resource_data.get("extension"), + size=resource_data.get("size"), + md5=resource_data.get("md5"), + sha1=resource_data.get("sha1"), + sha256=resource_data.get("sha256"), + mime_type=resource_data.get("mime_type"), + file_type=resource_data.get("file_type"), programming_language=resource_data.get( - 'programming_language'), - is_binary=resource_data.get('is_binary'), - is_text=resource_data.get('is_text'), - is_archive=resource_data.get('is_archive'), - is_media=resource_data.get('is_media'), - is_key_file=resource_data.get('is_key_file'), + "programming_language" + ), + is_binary=resource_data.get("is_binary"), + is_text=resource_data.get("is_text"), + is_archive=resource_data.get("is_archive"), + is_media=resource_data.get("is_media"), + is_key_file=resource_data.get("is_key_file"), ) resource.set_scan_results(resource_data) elif isinstance(resource_data, Resource): @@ -746,20 +745,19 @@ def update_fields(self, save=False, **values_by_fields): if unsaved_models: created_models_count = len(unsaved_models) model_count = 0 - if field == 'dependencies': + if field == "dependencies": model_count = self.dependencies.all().count() with transaction.atomic(): self.dependencies.all().delete() - DependentPackage.objects.bulk_create( - unsaved_models) + DependentPackage.objects.bulk_create(unsaved_models) - if field == 'parties': + if field == "parties": model_count = self.parties.all().count() with transaction.atomic(): self.parties.all().delete() Party.objects.bulk_create(unsaved_models) - if field == 'resources': + if field == "resources": model_count = self.resources.all().count() with transaction.atomic(): self.resources.all().delete() @@ -767,17 +765,17 @@ def update_fields(self, save=False, **values_by_fields): msg = f"Replaced {model_count} existing entries of field '{field}' with {created_models_count} new entries." 
self.append_to_history(msg) - replaced_fields.extend([field, 'history']) + replaced_fields.extend([field, "history"]) else: # Ensure the incoming value is of the correct type - if field == 'qualifiers' and isinstance(value, dict): + if field == "qualifiers" and isinstance(value, dict): value = normalize_qualifiers(value, encode=True) date_fields = [ - 'created_date', - 'last_indexed_date', - 'last_modified_date', - 'release_date', + "created_date", + "last_indexed_date", + "last_modified_date", + "release_date", ] if field in date_fields and isinstance(value, str): value = dateutil_parse(value) @@ -802,14 +800,14 @@ def update_fields(self, save=False, **values_by_fields): if updated_fields and history_entries: data = { - 'updated_fields': history_entries, + "updated_fields": history_entries, } self.append_to_history( - 'Package field values have been updated.', + "Package field values have been updated.", data=data, save=save, ) - updated_fields.append('history') + updated_fields.append("history") if replaced_fields: updated_fields.extend(replaced_fields) @@ -821,11 +819,11 @@ def update_fields(self, save=False, **values_by_fields): return self, updated_fields -party_person = 'person' +party_person = "person" # often loosely defined -party_project = 'project' +party_project = "project" # more formally defined -party_org = 'organization' +party_org = "organization" PARTY_TYPES = ( (party_person, party_person), (party_project, party_project), @@ -834,14 +832,13 @@ def update_fields(self, save=False, **values_by_fields): class Party(models.Model): - """ - A party is a person, project or organization related to a package. - """ + """A party is a person, project or organization related to a package.""" + package = models.ForeignKey( Package, - related_name='parties', + related_name="parties", on_delete=models.CASCADE, - help_text=_('The Package that this party is related to') + help_text=_("The Package that this party is related to"), ) type = models.CharField( @@ -849,118 +846,114 @@ class Party(models.Model): blank=True, null=True, choices=PARTY_TYPES, - help_text=_('the type of this party') + help_text=_("the type of this party"), ) role = models.CharField( max_length=32, blank=True, null=True, - help_text=_('A role for this party. Something such as author, ' - 'maintainer, contributor, owner, packager, distributor, ' - 'vendor, developer, owner, etc.') + help_text=_( + "A role for this party. Something such as author, " + "maintainer, contributor, owner, packager, distributor, " + "vendor, developer, owner, etc." + ), ) name = models.CharField( - max_length=70, - blank=True, - null=True, - help_text=_('Name of this party.') + max_length=70, blank=True, null=True, help_text=_("Name of this party.") ) email = models.CharField( - max_length=255, - blank=True, - null=True, - help_text=_('Email for this party.') + max_length=255, blank=True, null=True, help_text=_("Email for this party.") ) url = models.CharField( max_length=1024, blank=True, null=True, - help_text=_('URL to a primary web page for this party.') + help_text=_("URL to a primary web page for this party."), ) def to_dict(self): from packagedb.serializers import PartySerializer + party_data = PartySerializer(self).data return party_data class DependentPackage(models.Model): - """ - An identifiable dependent package package object. 
- """ + """An identifiable dependent package package object.""" + package = models.ForeignKey( Package, - related_name='dependencies', + related_name="dependencies", on_delete=models.CASCADE, - help_text=_('The Package that this dependent package is related to') + help_text=_("The Package that this dependent package is related to"), ) purl = models.CharField( max_length=2048, blank=True, null=True, - help_text=_('A compact purl package URL') + help_text=_("A compact purl package URL"), ) extracted_requirement = models.CharField( max_length=200, blank=True, null=True, - help_text=_( - 'A string defining version(s)requirements. Package-type specific.') + help_text=_("A string defining version(s)requirements. Package-type specific."), ) scope = models.CharField( max_length=100, blank=True, null=True, - help_text=_('The scope of this dependency, such as runtime, install, etc. ' - 'This is package-type specific and is the original scope string.') + help_text=_( + "The scope of this dependency, such as runtime, install, etc. " + "This is package-type specific and is the original scope string." + ), ) is_runtime = models.BooleanField( - default=True, - help_text=_('True if this dependency is a runtime dependency.') + default=True, help_text=_("True if this dependency is a runtime dependency.") ) is_optional = models.BooleanField( - default=False, - help_text=_('True if this dependency is an optional dependency') + default=False, help_text=_("True if this dependency is an optional dependency") ) is_resolved = models.BooleanField( default=False, - help_text=_('True if this dependency version requirement has ' - 'been resolved and this dependency url points to an ' - 'exact version.') + help_text=_( + "True if this dependency version requirement has " + "been resolved and this dependency url points to an " + "exact version." + ), ) def to_dict(self): from packagedb.serializers import DependentPackageSerializer + depedent_package_data = DependentPackageSerializer(self).data return depedent_package_data class AbstractResource(models.Model): - """ - These model fields should be kept in line with scancode.resource.Resource - """ + """These model fields should be kept in line with scancode.resource.Resource""" path = models.CharField( max_length=2000, help_text=_( - 'The full path value of a resource (file or directory) in the archive it is from.'), + "The full path value of a resource (file or directory) in the archive it is from." 
+ ), ) name = models.CharField( max_length=255, blank=True, - help_text=_( - "File or directory name of this resource with its extension."), + help_text=_("File or directory name of this resource with its extension."), ) extension = models.CharField( @@ -974,7 +967,7 @@ class AbstractResource(models.Model): size = models.BigIntegerField( blank=True, null=True, - help_text=_('Size in bytes.'), + help_text=_("Size in bytes."), ) mime_type = models.CharField( @@ -998,8 +991,7 @@ class AbstractResource(models.Model): max_length=50, blank=True, null=True, - help_text=_( - "Programming language of this resource if this is a code file."), + help_text=_("Programming language of this resource if this is a code file."), ) is_binary = models.BooleanField(default=False) @@ -1010,17 +1002,16 @@ class AbstractResource(models.Model): is_file = models.BooleanField( default=False, - help_text=_( - 'True if this Resource is a file, False if it is a Directory') + help_text=_("True if this Resource is a file, False if it is a Directory"), ) @property def type(self): - return 'file' if self.is_file else 'directory' + return "file" if self.is_file else "directory" @type.setter def type(self, value): - if value == 'file': + if value == "file": self.is_file = True else: self.is_file = False @@ -1079,8 +1070,7 @@ class ScanFieldsModelMixin(models.Model): authors = models.JSONField( blank=True, default=list, - help_text=_( - "List of detected authors (and related detection details)."), + help_text=_("List of detected authors (and related detection details)."), ) package_data = models.JSONField( default=list, @@ -1137,47 +1127,41 @@ def copy_scan_results(self, from_instance, save=False): class Resource( - ExtraDataFieldMixin, - HashFieldsMixin, - ScanFieldsModelMixin, - AbstractResource + ExtraDataFieldMixin, HashFieldsMixin, ScanFieldsModelMixin, AbstractResource ): package = models.ForeignKey( Package, - related_name='resources', + related_name="resources", on_delete=models.CASCADE, - help_text=_('The Package that this Resource is from') + help_text=_("The Package that this Resource is from"), ) git_sha1 = models.CharField( max_length=40, blank=True, null=True, - help_text=_('git SHA1 checksum hex-encoded'), + help_text=_("git SHA1 checksum hex-encoded"), ) class Meta: - unique_together = ( - ('package', 'path'), - ) - ordering = ('id',) + unique_together = (("package", "path"),) + ordering = ("id",) indexes = [ - models.Index(fields=['md5']), - models.Index(fields=['sha1']), - models.Index(fields=['sha256']), - models.Index(fields=['sha512']), - models.Index(fields=['git_sha1']), + models.Index(fields=["md5"]), + models.Index(fields=["sha1"]), + models.Index(fields=["sha256"]), + models.Index(fields=["sha512"]), + models.Index(fields=["git_sha1"]), ] @property def for_packages(self): """Return the list of all Packages associated to this resource.""" - return [ - self.package.package_uid or str(self.package) - ] + return [self.package.package_uid or str(self.package)] def to_dict(self): from packagedb.serializers import ResourceMetadataSerializer + resource_metadata = ResourceMetadataSerializer(self).data return resource_metadata @@ -1213,9 +1197,9 @@ class Relationship(models.TextChoices): relationship = models.CharField( max_length=30, choices=Relationship.choices, - help_text='Relationship between the from and to package ' - 'URLs such as "source_package" when a package ' - 'is the source code package for another package.' 
+ help_text="Relationship between the from and to package " + 'URLs such as "source_package" when a package ' + "is the source code package for another package.", ) def __str__(self): @@ -1226,11 +1210,11 @@ def __str__(self): def make_relationship( - from_package, to_package, relationship, + from_package, + to_package, + relationship, ): - """ - Create and return the from/to package relathionship if it does exists. - """ + """Create and return the from/to package relathionship if it does exists.""" pkg, _created = PackageRelation.objects.get_or_create( from_package=from_package, to_package=to_package, @@ -1240,9 +1224,8 @@ def make_relationship( class PackageWatch(models.Model): - """ - Model representing a watch on a package to monitor for new versions. - """ + """Model representing a watch on a package to monitor for new versions.""" + DEPTH_CHOICES = ( (1, "Version"), (2, "Metadata"), @@ -1302,14 +1285,14 @@ class PackageWatch(models.Model): choices=DEPTH_CHOICES, default=3, help_text=_( - "Depth of data collection from listing versions up to a full scan."), + "Depth of data collection from listing versions up to a full scan." + ), ) watch_interval = models.PositiveSmallIntegerField( validators=[ MinValueValidator(1, message="Interval must be at least 1 day."), - MaxValueValidator( - 365, message="Interval must be at most 365 days."), + MaxValueValidator(365, message="Interval must be at most 365 days."), ], default=7, help_text=_("Number of days to wait between watches of this package."), @@ -1417,16 +1400,13 @@ def create_new_job(self): class PackageSet(models.Model): - """ - A group of related Packages - """ + """A group of related Packages""" + uuid = models.UUIDField( verbose_name=_("UUID"), default=uuid.uuid4, unique=True, - help_text=_( - 'The identifier of the Package set' - ) + help_text=_("The identifier of the Package set"), ) def add_to_package_set(self, package): @@ -1435,15 +1415,13 @@ def add_to_package_set(self, package): def get_package_set_members(self): """Return related Packages""" return self.packages.order_by( - 'package_content', + "package_content", ) class ApiUserManager(UserManager): def create_api_user(self, username, first_name="", last_name="", **extra_fields): - """ - Create and return an API-only user. Raise ValidationError. - """ + """Create and return an API-only user. Raise ValidationError.""" username = self.normalize_email(username) email = username self._validate_username(email) @@ -1469,16 +1447,15 @@ def create_api_user(self, username, first_name="", last_name="", **extra_fields) return user def _validate_username(self, email): - """ - Validate username. If invalid, raise a ValidationError - """ + """Validate username. 
If invalid, raise a ValidationError""" try: self.get_by_natural_key(email) except models.ObjectDoesNotExist: pass else: raise exceptions.ValidationError( - f"Error: This email already exists: {email}") + f"Error: This email already exists: {email}" + ) @receiver(models.signals.post_save, sender=settings.AUTH_USER_MODEL) diff --git a/packagedb/package_managers.py b/packagedb/package_managers.py index 14916614..b25e101d 100644 --- a/packagedb/package_managers.py +++ b/packagedb/package_managers.py @@ -8,20 +8,17 @@ # import dataclasses -import json import logging import traceback import xml.etree.ElementTree as ET +from collections.abc import Iterable from datetime import datetime -from typing import Iterable -from typing import List -from typing import Optional -from typing import Set from urllib.parse import urlparse +from django.utils.dateparse import parse_datetime + import requests from dateutil import parser as dateparser -from django.utils.dateparse import parse_datetime from packageurl import PackageURL logger = logging.getLogger(__name__) @@ -38,7 +35,7 @@ @dataclasses.dataclass(frozen=True) class PackageVersion: value: str - release_date: Optional[datetime] = None + release_date: datetime | None = None def to_dict(self): release_date = self.release_date @@ -48,8 +45,8 @@ def to_dict(self): @dataclasses.dataclass class VersionResponse: - valid_versions: Set[str] = dataclasses.field(default_factory=set) - newer_versions: Set[str] = dataclasses.field(default_factory=set) + valid_versions: set[str] = dataclasses.field(default_factory=set) + newer_versions: set[str] = dataclasses.field(default_factory=set) def get_response(url, content_type="json", headers=None): @@ -124,7 +121,9 @@ def get_until(self, package_name, until=None) -> VersionResponse: else: valid_versions.add(version.value) - return VersionResponse(valid_versions=valid_versions, newer_versions=new_versions) + return VersionResponse( + valid_versions=valid_versions, newer_versions=new_versions + ) def fetch(self, pkg: str) -> Iterable[PackageVersion]: """ @@ -135,16 +134,12 @@ def fetch(self, pkg: str) -> Iterable[PackageVersion]: def remove_debian_default_epoch(version): - """ - Remove the default epoch from a Debian ``version`` string. - """ + """Remove the default epoch from a Debian ``version`` string.""" return version and version.replace("0:", "") class LaunchpadVersionAPI(VersionAPI): - """ - Fetch versions of Ubuntu debian packages from Launchpad - """ + """Fetch versions of Ubuntu debian packages from Launchpad""" package_type = "deb" @@ -165,7 +160,9 @@ def fetch(self, pkg: str) -> Iterable[PackageVersion]: for release in entries: source_package_version = release.get("source_package_version") - source_package_version = remove_debian_default_epoch(version=source_package_version) + source_package_version = remove_debian_default_epoch( + version=source_package_version + ) date_published = release.get("date_published") release_date = None if date_published and type(date_published) is str: @@ -182,9 +179,7 @@ def fetch(self, pkg: str) -> Iterable[PackageVersion]: class PypiVersionAPI(VersionAPI): - """ - Fetch versions of Python pypi packages from the PyPI API. - """ + """Fetch versions of Python pypi packages from the PyPI API.""" package_type = "pypi" @@ -238,9 +233,7 @@ def get_latest_date(self, downloads): class CratesVersionAPI(VersionAPI): - """ - Fetch versions of Rust cargo packages from the crates.io API. 
- """ + """Fetch versions of Rust cargo packages from the crates.io API.""" package_type = "cargo" @@ -255,9 +248,7 @@ def fetch(self, pkg): class RubyVersionAPI(VersionAPI): - """ - Fetch versions of Rubygems packages from the rubygems API. - """ + """Fetch versions of Rubygems packages from the rubygems API.""" package_type = "gem" @@ -280,9 +271,7 @@ def fetch(self, pkg): class NpmVersionAPI(VersionAPI): - """ - Fetch versions of npm packages from the npm registry API. - """ + """Fetch versions of npm packages from the npm registry API.""" package_type = "npm" @@ -300,9 +289,7 @@ def fetch(self, pkg): class DebianVersionAPI(VersionAPI): - """ - Fetch versions of Debian debian packages from the sources.debian.org API - """ + """Fetch versions of Debian debian packages from the sources.debian.org API""" package_type = "deb" @@ -324,9 +311,7 @@ def fetch(self, pkg): class MavenVersionAPI(VersionAPI): - """ - Fetch versions of Maven packages from Maven Central maven-metadata.xml data - """ + """Fetch versions of Maven packages from Maven Central maven-metadata.xml data""" package_type = "maven" @@ -339,7 +324,7 @@ def fetch(self, pkg: str) -> Iterable[PackageVersion]: yield from self.extract_versions(xml_resp) @staticmethod - def artifact_url(artifact_comps: List[str]) -> str: + def artifact_url(artifact_comps: list[str]) -> str: try: group_id, artifact_id = artifact_comps except ValueError: @@ -365,9 +350,7 @@ def extract_versions(xml_response: ET.ElementTree) -> Iterable[PackageVersion]: class NugetVersionAPI(VersionAPI): - """ - Fetch versions of NuGet packages from the nuget.org API - """ + """Fetch versions of NuGet packages from the nuget.org API""" package_type = "nuget" @@ -396,16 +379,12 @@ def extract_versions(response: dict) -> Iterable[PackageVersion]: def cleaned_version(version): - """ - Return a ``version`` string stripped from leading "v" prefix. 
- """ + """Return a ``version`` string stripped from leading "v" prefix.""" return version.lstrip("vV") class ComposerVersionAPI(VersionAPI): - """ - Fetch versions of PHP Composer packages from the packagist.org API - """ + """Fetch versions of PHP Composer packages from the packagist.org API""" package_type = "composer" @@ -431,9 +410,7 @@ def extract_versions(resp: dict, pkg: str) -> Iterable[PackageVersion]: class HexVersionAPI(VersionAPI): - """ - Fetch versions of Erlang packages from the hex API - """ + """Fetch versions of Erlang packages from the hex API""" package_type = "hex" @@ -451,9 +428,7 @@ def fetch(self, pkg: str) -> Iterable[PackageVersion]: class GoproxyVersionAPI(VersionAPI): - """ - Fetch versions of Go "golang" packages from the Go proxy API - """ + """Fetch versions of Go "golang" packages from the Go proxy API""" package_type = "golang" @@ -461,7 +436,7 @@ def __init__(self): self.module_name_by_package_name = {} @staticmethod - def trim_go_url_path(url_path: str) -> Optional[str]: + def trim_go_url_path(url_path: str) -> str | None: """ Return a trimmed Go `url_path` removing trailing package references and keeping only the module @@ -512,7 +487,9 @@ def escape_path(path: str) -> str: return escaped_path @staticmethod - def fetch_version_info(version_info: str, escaped_pkg: str) -> Optional[PackageVersion]: + def fetch_version_info( + version_info: str, escaped_pkg: str + ) -> PackageVersion | None: v = version_info.split() if not v: return None @@ -534,12 +511,13 @@ def fetch_version_info(version_info: str, escaped_pkg: str) -> Optional[PackageV f"Error while fetching version info for {escaped_pkg}/{escaped_ver} " f"from goproxy:\n{traceback.format_exc()}" ) - release_date = parse_datetime(response.get("Time", "")) if response else None + release_date = ( + parse_datetime(response.get("Time", "")) if response else None + ) return PackageVersion(value=value, release_date=release_date) def fetch(self, pkg: str) -> Iterable[PackageVersion]: - # escape uppercase in module path escaped_pkg = self.escape_path(pkg) trimmed_pkg = pkg @@ -584,7 +562,9 @@ def fetch(self, pkg: str) -> Iterable[PackageVersion]: } -VERSION_API_CLASSES_BY_PACKAGE_TYPE = {cls.package_type: cls for cls in VERSION_API_CLASSES} +VERSION_API_CLASSES_BY_PACKAGE_TYPE = { + cls.package_type: cls for cls in VERSION_API_CLASSES +} VERSION_API_CLASS_BY_PACKAGE_NAMESPACE = { @@ -617,7 +597,11 @@ def get_api_package_name(purl: PackageURL) -> str: def get_version_fetcher(package_url): if package_url.type == "deb": - versions_fetcher: VersionAPI = VERSION_API_CLASS_BY_PACKAGE_NAMESPACE[package_url.namespace] + versions_fetcher: VersionAPI = VERSION_API_CLASS_BY_PACKAGE_NAMESPACE[ + package_url.namespace + ] else: - versions_fetcher: VersionAPI = VERSION_API_CLASSES_BY_PACKAGE_TYPE[package_url.type] + versions_fetcher: VersionAPI = VERSION_API_CLASSES_BY_PACKAGE_TYPE[ + package_url.type + ] return versions_fetcher diff --git a/packagedb/schedules.py b/packagedb/schedules.py index 65b44eb8..f171aa62 100644 --- a/packagedb/schedules.py +++ b/packagedb/schedules.py @@ -20,13 +20,10 @@ def get_next_execution(watch_interval_days, last_watch_date): - """ - Calculate the next execution time based on the watch_interval_days and last_watch_date. 
- """ + """Calculate the next execution time based on the watch_interval_days and last_watch_date.""" current_date_time = datetime.datetime.now(tz=datetime.timezone.utc) if last_watch_date: - next_execution = last_watch_date + \ - datetime.timedelta(days=watch_interval_days) + next_execution = last_watch_date + datetime.timedelta(days=watch_interval_days) if next_execution > current_date_time: return next_execution @@ -64,17 +61,14 @@ def clear_job(job): def scheduled_job_exists(job_id): - """ - Check if a scheduled job with the given job ID exists. - """ + """Check if a scheduled job with the given job ID exists.""" return job_id and (job_id in scheduler) def clear_zombie_watch_schedules(logger=log): - """ - Clear scheduled jobs not associated with any PackageWatch object. - """ + """Clear scheduled jobs not associated with any PackageWatch object.""" from packagedb.models import PackageWatch + schedule_ids = PackageWatch.objects.all().values_list("schedule_work_id", flat=True) for job in scheduler.get_jobs(): @@ -84,9 +78,7 @@ def clear_zombie_watch_schedules(logger=log): def is_redis_running(logger=log): - """ - Check the status of the Redis server. - """ + """Check the status of the Redis server.""" try: connection = django_rq.get_connection() return connection.ping() diff --git a/packagedb/serializers.py b/packagedb/serializers.py index 0bb22b03..251d3f79 100644 --- a/packagedb/serializers.py +++ b/packagedb/serializers.py @@ -33,44 +33,45 @@ class ResourceAPISerializer(HyperlinkedModelSerializer): package = HyperlinkedRelatedField( - view_name='api:package-detail', lookup_field='uuid', read_only=True) - purl = CharField(source='package.package_url') + view_name="api:package-detail", lookup_field="uuid", read_only=True + ) + purl = CharField(source="package.package_url") class Meta: model = Resource fields = ( - 'package', - 'purl', - 'path', - 'type', - 'name', - 'extension', - 'size', - 'md5', - 'sha1', - 'sha256', - 'sha512', - 'git_sha1', - 'mime_type', - 'file_type', - 'programming_language', - 'is_binary', - 'is_text', - 'is_archive', - 'is_media', - 'is_key_file', - 'detected_license_expression', - 'detected_license_expression_spdx', - 'license_detections', - 'license_clues', - 'percentage_of_license_text', - 'copyrights', - 'holders', - 'authors', - 'package_data', - 'emails', - 'urls', - 'extra_data', + "package", + "purl", + "path", + "type", + "name", + "extension", + "size", + "md5", + "sha1", + "sha256", + "sha512", + "git_sha1", + "mime_type", + "file_type", + "programming_language", + "is_binary", + "is_text", + "is_archive", + "is_media", + "is_key_file", + "detected_license_expression", + "detected_license_expression_spdx", + "license_detections", + "license_clues", + "percentage_of_license_text", + "copyrights", + "holders", + "authors", + "package_data", + "emails", + "urls", + "extra_data", ) read_only_fields = fields @@ -81,37 +82,37 @@ class ResourceMetadataSerializer(HyperlinkedModelSerializer): class Meta: model = Resource fields = ( - 'path', - 'type', - 'name', - 'extension', - 'size', - 'md5', - 'sha1', - 'sha256', - 'sha512', - 'git_sha1', - 'mime_type', - 'file_type', - 'programming_language', - 'is_binary', - 'is_text', - 'is_archive', - 'is_media', - 'is_key_file', - 'detected_license_expression', - 'detected_license_expression_spdx', - 'license_detections', - 'license_clues', - 'percentage_of_license_text', - 'copyrights', - 'holders', - 'authors', - 'package_data', - 'for_packages', - 'emails', - 'urls', - 'extra_data', + "path", + "type", + "name", + 
"extension", + "size", + "md5", + "sha1", + "sha256", + "sha512", + "git_sha1", + "mime_type", + "file_type", + "programming_language", + "is_binary", + "is_text", + "is_archive", + "is_media", + "is_key_file", + "detected_license_expression", + "detected_license_expression_spdx", + "license_detections", + "license_clues", + "percentage_of_license_text", + "copyrights", + "holders", + "authors", + "package_data", + "for_packages", + "emails", + "urls", + "extra_data", ) @@ -119,11 +120,11 @@ class PartySerializer(ModelSerializer): class Meta: model = Party fields = ( - 'type', - 'role', - 'name', - 'email', - 'url', + "type", + "role", + "name", + "email", + "url", ) @@ -131,12 +132,12 @@ class DependentPackageSerializer(ModelSerializer): class Meta: model = DependentPackage fields = ( - 'purl', - 'extracted_requirement', - 'scope', - 'is_runtime', - 'is_optional', - 'is_resolved', + "purl", + "extracted_requirement", + "scope", + "is_runtime", + "is_optional", + "is_resolved", ) @@ -145,16 +146,14 @@ class PackageInPackageSetAPISerializer(ModelSerializer): This serializes Package instances within a PackageSet that is within a Package in the PackageAPISerializer """ + class Meta: model = Package - fields = ( - 'uuid', - ) + fields = ("uuid",) def to_representation(self, instance): - reverse_uri = reverse_lazy( - 'api:package-detail', kwargs={'uuid': instance.uuid}) - request = self.context['request'] + reverse_uri = reverse_lazy("api:package-detail", kwargs={"uuid": instance.uuid}) + request = self.context["request"] return request.build_absolute_uri(reverse_uri) @@ -164,8 +163,8 @@ class PackageSetAPISerializer(ModelSerializer): class Meta: model = PackageSet fields = ( - 'uuid', - 'packages', + "uuid", + "packages", ) @@ -173,11 +172,12 @@ class PackageAPISerializer(HyperlinkedModelSerializer): dependencies = DependentPackageSerializer(many=True) parties = PartySerializer(many=True) resources = HyperlinkedIdentityField( - view_name='api:package-resources', lookup_field='uuid') + view_name="api:package-resources", lookup_field="uuid" + ) history = HyperlinkedIdentityField( - view_name='api:package-history', lookup_field='uuid') - url = HyperlinkedIdentityField( - view_name='api:package-detail', lookup_field='uuid') + view_name="api:package-history", lookup_field="uuid" + ) + url = HyperlinkedIdentityField(view_name="api:package-detail", lookup_field="uuid") package_sets = PackageSetAPISerializer(many=True) package_content = SerializerMethodField() declared_license_expression_spdx = CharField() @@ -186,54 +186,54 @@ class PackageAPISerializer(HyperlinkedModelSerializer): class Meta: model = Package fields = ( - 'url', - 'uuid', - 'filename', - 'package_sets', - 'package_content', - 'purl', - 'type', - 'namespace', - 'name', - 'version', - 'qualifiers', - 'subpath', - 'primary_language', - 'description', - 'release_date', - 'parties', - 'keywords', - 'homepage_url', - 'download_url', - 'bug_tracking_url', - 'code_view_url', - 'vcs_url', - 'repository_homepage_url', - 'repository_download_url', - 'api_data_url', - 'size', - 'md5', - 'sha1', - 'sha256', - 'sha512', - 'copyright', - 'holder', - 'declared_license_expression', - 'declared_license_expression_spdx', - 'license_detections', - 'other_license_expression', - 'other_license_expression_spdx', - 'other_license_detections', - 'extracted_license_statement', - 'notice_text', - 'source_packages', - 'extra_data', - 'package_uid', - 'datasource_id', - 'file_references', - 'dependencies', - 'resources', - 'history', + "url", + "uuid", + 
"filename", + "package_sets", + "package_content", + "purl", + "type", + "namespace", + "name", + "version", + "qualifiers", + "subpath", + "primary_language", + "description", + "release_date", + "parties", + "keywords", + "homepage_url", + "download_url", + "bug_tracking_url", + "code_view_url", + "vcs_url", + "repository_homepage_url", + "repository_download_url", + "api_data_url", + "size", + "md5", + "sha1", + "sha256", + "sha512", + "copyright", + "holder", + "declared_license_expression", + "declared_license_expression_spdx", + "license_detections", + "other_license_expression", + "other_license_expression_spdx", + "other_license_detections", + "extracted_license_statement", + "notice_text", + "source_packages", + "extra_data", + "package_uid", + "datasource_id", + "file_references", + "dependencies", + "resources", + "history", ) read_only_fields = fields @@ -246,11 +246,10 @@ class PackageInPackageSetMetadataSerializer(ModelSerializer): This serializes Package instances within a PackageSet that is within a Package in the PackageMetadataSerializer """ + class Meta: model = Package - fields = ( - 'uuid', - ) + fields = ("uuid",) def to_representation(self, instance): return instance.package_uid @@ -262,8 +261,8 @@ class PackageSetMetadataSerializer(ModelSerializer): class Meta: model = PackageSet fields = ( - 'uuid', - 'packages', + "uuid", + "packages", ) @@ -275,6 +274,7 @@ class PackageMetadataSerializer(ModelSerializer): This differs from PackageSerializer used for the API by the addition of the `package_url` field and the exclusion of the `uuid`, and `filename` fields. """ + dependencies = DependentPackageSerializer(many=True) parties = PartySerializer(many=True) package_sets = PackageSetMetadataSerializer(many=True) @@ -285,49 +285,49 @@ class PackageMetadataSerializer(ModelSerializer): class Meta: model = Package fields = ( - 'type', - 'namespace', - 'name', - 'version', - 'qualifiers', - 'subpath', - 'package_sets', - 'package_content', - 'primary_language', - 'description', - 'release_date', - 'parties', - 'keywords', - 'homepage_url', - 'download_url', - 'size', - 'md5', - 'sha1', - 'sha256', - 'sha512', - 'bug_tracking_url', - 'code_view_url', - 'vcs_url', - 'copyright', - 'holder', - 'declared_license_expression', - 'declared_license_expression_spdx', - 'license_detections', - 'other_license_expression', - 'other_license_expression_spdx', - 'other_license_detections', - 'extracted_license_statement', - 'notice_text', - 'source_packages', - 'extra_data', - 'dependencies', - 'package_uid', - 'datasource_id', - 'purl', - 'repository_homepage_url', - 'repository_download_url', - 'api_data_url', - 'file_references', + "type", + "namespace", + "name", + "version", + "qualifiers", + "subpath", + "package_sets", + "package_content", + "primary_language", + "description", + "release_date", + "parties", + "keywords", + "homepage_url", + "download_url", + "size", + "md5", + "sha1", + "sha256", + "sha512", + "bug_tracking_url", + "code_view_url", + "vcs_url", + "copyright", + "holder", + "declared_license_expression", + "declared_license_expression_spdx", + "license_detections", + "other_license_expression", + "other_license_expression_spdx", + "other_license_detections", + "extracted_license_statement", + "notice_text", + "source_packages", + "extra_data", + "dependencies", + "package_uid", + "datasource_id", + "purl", + "repository_homepage_url", + "repository_download_url", + "api_data_url", + "file_references", ) def get_package_content(self, obj): @@ -340,29 +340,28 @@ class 
PackageSetAPISerializer(ModelSerializer): class Meta: model = PackageSet fields = [ - 'uuid', - 'packages', + "uuid", + "packages", ] class PackageWatchAPISerializer(HyperlinkedModelSerializer): url = HyperlinkedIdentityField( - view_name='api:packagewatch-detail', - lookup_field='package_url' + view_name="api:packagewatch-detail", lookup_field="package_url" ) class Meta: model = PackageWatch fields = [ - 'url', - 'package_url', - 'is_active', - 'depth', - 'watch_interval', - 'creation_date', - 'last_watch_date', - 'watch_error', - 'schedule_work_id', + "url", + "package_url", + "is_active", + "depth", + "watch_interval", + "creation_date", + "last_watch_date", + "watch_error", + "schedule_work_id", ] @@ -371,8 +370,7 @@ class Meta: model = PackageWatch fields = ["package_url", "depth", "watch_interval", "is_active"] extra_kwargs = { - field: {"initial": PackageWatch._meta.get_field( - field).get_default()} + field: {"initial": PackageWatch._meta.get_field(field).get_default()} for field in ["depth", "watch_interval", "is_active"] } @@ -380,7 +378,7 @@ class Meta: class PackageWatchUpdateSerializer(ModelSerializer): class Meta: model = PackageWatch - fields = ['depth', 'watch_interval', 'is_active'] + fields = ["depth", "watch_interval", "is_active"] class CommaListField(ListField): @@ -390,7 +388,7 @@ def to_internal_value(self, data): if isinstance(data, str): split_data = [] for datum in data: - split_data.extend(datum.split(',')) + split_data.extend(datum.split(",")) data = split_data return super().to_internal_value(data) @@ -416,7 +414,7 @@ def validate_purl(self, value): try: PackageURL.from_string(value) except ValueError as e: - raise ValidationError(f'purl validation error: {e}') + raise ValidationError(f"purl validation error: {e}") return value def validate_source_purl(self, value): @@ -424,17 +422,25 @@ def validate_source_purl(self, value): try: PackageURL.from_string(value) except ValueError as e: - raise ValidationError(f'purl validation error: {e}') + raise ValidationError(f"purl validation error: {e}") return value def validate_addon_pipelines(self, value): - if invalid_pipelines := [pipe for pipe in value if not is_supported_addon_pipeline(pipe)]: - raise ValidationError(f'Error unsupported addon pipelines: {",".join(invalid_pipelines)}') + if invalid_pipelines := [ + pipe for pipe in value if not is_supported_addon_pipeline(pipe) + ]: + raise ValidationError( + f'Error unsupported addon pipelines: {",".join(invalid_pipelines)}' + ) return value def validate_sort(self, value): - if invalid_sort_fields := [field for field in value if not is_supported_sort_field(field)]: - raise ValidationError(f'Error unsupported sort fields: {",".join(invalid_sort_fields)}') + if invalid_sort_fields := [ + field for field in value if not is_supported_sort_field(field) + ]: + raise ValidationError( + f'Error unsupported sort fields: {",".join(invalid_sort_fields)}' + ) return value @@ -473,34 +479,39 @@ class PurlUpdateResponseSerializer(Serializer): class IndexPackagesResponseSerializer(Serializer): queued_packages_count = IntegerField( - help_text="Number of package urls placed on the index queue.") + help_text="Number of package urls placed on the index queue." + ) queued_packages = ListField( child=CharField(), - help_text="List of package urls that were placed on the index queue." 
+ help_text="List of package urls that were placed on the index queue.", ) requeued_packages_count = IntegerField( - help_text="Number of existing package urls placed on the rescan queue.") + help_text="Number of existing package urls placed on the rescan queue." + ) requeued_packages = ListField( child=CharField(), - help_text="List of existing package urls that were placed on the rescan queue." + help_text="List of existing package urls that were placed on the rescan queue.", ) unqueued_packages_count = IntegerField( - help_text="Number of package urls not placed on the index queue.") + help_text="Number of package urls not placed on the index queue." + ) unqueued_packages = ListField( child=CharField(), - help_text="List of package urls that were not placed on the index queue." + help_text="List of package urls that were not placed on the index queue.", ) unsupported_packages_count = IntegerField( - help_text="Number of package urls that are not processable by the index queue.") + help_text="Number of package urls that are not processable by the index queue." + ) unsupported_packages = ListField( child=CharField(), - help_text="List of package urls that are not processable by the index queue." + help_text="List of package urls that are not processable by the index queue.", ) unsupported_vers_count = IntegerField( - help_text="Number of vers range that are not supported by the univers or package_manager.") + help_text="Number of vers range that are not supported by the univers or package_manager." + ) unsupported_vers = ListField( child=CharField(), - help_text="List of vers range that are not supported by the univers or package_manager." + help_text="List of vers range that are not supported by the univers or package_manager.", ) @@ -534,10 +545,12 @@ class PurltoGitRepoResponseSerializer(Serializer): def is_supported_addon_pipeline(addon_pipeline): from minecode.model_utils import SUPPORTED_ADDON_PIPELINES + return addon_pipeline in SUPPORTED_ADDON_PIPELINES def is_supported_sort_field(field): from packagedb.api import PACKAGE_FILTER_SORT_FIELDS + # A field could have a leading `-` - return field.lstrip('-') in PACKAGE_FILTER_SORT_FIELDS + return field.lstrip("-") in PACKAGE_FILTER_SORT_FIELDS diff --git a/packagedb/tasks.py b/packagedb/tasks.py index 2c390f61..decbb376 100644 --- a/packagedb/tasks.py +++ b/packagedb/tasks.py @@ -67,8 +67,7 @@ def get_and_index_new_purls(package_url): try: local_versions = [version_class(version) for version in local_versions] - all_versions = [version_class(version.value) - for version in all_versions] + all_versions = [version_class(version.value) for version in all_versions] except InvalidVersion as e: return f"InvalidVersion exception: {e}" @@ -101,8 +100,7 @@ def is_supported_watch_ecosystem(watch): watch.watch_error = ( f"`{watch.type}` ecosystem is not supported by {error_message}" ) - watch.last_watch_date = datetime.datetime.now( - tz=datetime.timezone.utc) + watch.last_watch_date = datetime.datetime.now(tz=datetime.timezone.utc) watch.save(update_fields=["last_watch_date"]) return False diff --git a/packagedb/tests/test_api.py b/packagedb/tests/test_api.py index d1620333..9076e905 100644 --- a/packagedb/tests/test_api.py +++ b/packagedb/tests/test_api.py @@ -15,14 +15,15 @@ from django.test import TestCase from django.urls import reverse from django.utils import timezone + from rest_framework import status from rest_framework.test import APIClient from univers.versions import MavenVersion from minecode.models import PriorityResourceURI from 
minecode.models import ScannableURI -from minecode.utils_test import JsonBasedTesting from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting from packagedb.models import Package from packagedb.models import PackageContentType from packagedb.models import PackageSet @@ -31,354 +32,350 @@ class ResourceAPITestCase(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): self.package1 = Package.objects.create( - download_url='https://test-url.com/package1.tar.gz', - type='type1', - name='name1', + download_url="https://test-url.com/package1.tar.gz", + type="type1", + name="name1", ) self.package2 = Package.objects.create( - download_url='https://test-url.com/package2.tar.gz', - type='type2', - name='name2', + download_url="https://test-url.com/package2.tar.gz", + type="type2", + name="name2", ) self.resource1 = Resource.objects.create( package=self.package1, - path='package1/contents1.txt', + path="package1/contents1.txt", size=101, - sha1='testsha11', - md5='testmd51', - sha256='testsha2561', - sha512='testsha5121', - git_sha1='testgit_sha11', + sha1="testsha11", + md5="testmd51", + sha256="testsha2561", + sha512="testsha5121", + git_sha1="testgit_sha11", is_file=True, - extra_data=json.dumps({'test1': 'data1'}) + extra_data=json.dumps({"test1": "data1"}), ) self.resource2 = Resource.objects.create( package=self.package2, - path='package2/contents2.txt', + path="package2/contents2.txt", size=102, - sha1='testsha12', - md5='testmd52', - sha256='testsha2562', - sha512='testsha5122', - git_sha1='testgit_sha12', + sha1="testsha12", + md5="testmd52", + sha256="testsha2562", + sha512="testsha5122", + git_sha1="testgit_sha12", is_file=True, - extra_data=json.dumps({'test2': 'data2'}) + extra_data=json.dumps({"test2": "data2"}), ) - self.test_url = 'http://testserver/api/packages/{}/' + self.test_url = "http://testserver/api/packages/{}/" self.client = APIClient() def test_api_resource_list_endpoint(self): - response = self.client.get('/api/resources/') + response = self.client.get("/api/resources/") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(2, response.data.get('count')) + self.assertEqual(2, response.data.get("count")) def test_api_resource_retrieve_endpoint(self): - response = self.client.get( - '/api/resources/{}/'.format(self.resource1.sha1)) + response = self.client.get(f"/api/resources/{self.resource1.sha1}/") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data.get('package'), - self.test_url.format(str(self.package1.uuid))) - self.assertEqual(response.data.get('purl'), self.package1.package_url) - self.assertEqual(response.data.get('path'), self.resource1.path) - self.assertEqual(response.data.get('size'), self.resource1.size) - self.assertEqual(response.data.get('sha1'), self.resource1.sha1) - self.assertEqual(response.data.get('md5'), self.resource1.md5) - self.assertEqual(response.data.get('sha256'), self.resource1.sha256) - self.assertEqual(response.data.get('sha512'), self.resource1.sha512) - self.assertEqual(response.data.get( - 'git_sha1'), self.resource1.git_sha1) - self.assertEqual(response.data.get('extra_data'), - self.resource1.extra_data) - self.assertEqual(response.data.get('type'), self.resource1.type) - - def test_api_resource_list_endpoint_returns_none_when_filtering_by_non_uuid_value(self): - response = self.client.get( - 
'/api/resources/?package={}'.format('not-a-uuid')) + self.assertEqual( + response.data.get("package"), self.test_url.format(str(self.package1.uuid)) + ) + self.assertEqual(response.data.get("purl"), self.package1.package_url) + self.assertEqual(response.data.get("path"), self.resource1.path) + self.assertEqual(response.data.get("size"), self.resource1.size) + self.assertEqual(response.data.get("sha1"), self.resource1.sha1) + self.assertEqual(response.data.get("md5"), self.resource1.md5) + self.assertEqual(response.data.get("sha256"), self.resource1.sha256) + self.assertEqual(response.data.get("sha512"), self.resource1.sha512) + self.assertEqual(response.data.get("git_sha1"), self.resource1.git_sha1) + self.assertEqual(response.data.get("extra_data"), self.resource1.extra_data) + self.assertEqual(response.data.get("type"), self.resource1.type) + + def test_api_resource_list_endpoint_returns_none_when_filtering_by_non_uuid_value( + self, + ): + response = self.client.get("/api/resources/?package={}".format("not-a-uuid")) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(0, response.data.get('count')) + self.assertEqual(0, response.data.get("count")) def test_api_resource_list_endpoint_returns_none_when_filtering_by_wrong_uuid(self): response = self.client.get( - '/api/resources/?package={}'.format('4eb22e66-3e1c-4818-9b5e-858008a7c2b5')) + "/api/resources/?package={}".format("4eb22e66-3e1c-4818-9b5e-858008a7c2b5") + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(0, response.data.get('count')) + self.assertEqual(0, response.data.get("count")) def test_api_resource_list_endpoint_returns_none_when_filtering_by_blank_uuid(self): - response = self.client.get('/api/resources/?package={}'.format('')) + response = self.client.get("/api/resources/?package={}".format("")) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(2, response.data.get('count')) + self.assertEqual(2, response.data.get("count")) def test_api_resource_list_endpoint_filters_by_package1_uuid(self): response = self.client.get( - '/api/resources/?package={}'.format(self.package1.uuid)) + f"/api/resources/?package={self.package1.uuid}" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) - - test_resource = response.data.get('results')[0] - self.assertEqual(test_resource.get('package'), - self.test_url.format(str(self.package1.uuid))) - self.assertEqual(test_resource.get('purl'), self.package1.package_url) - self.assertEqual(test_resource.get('path'), self.resource1.path) - self.assertEqual(test_resource.get('size'), self.resource1.size) - self.assertEqual(test_resource.get('sha1'), self.resource1.sha1) - self.assertEqual(test_resource.get('md5'), self.resource1.md5) - self.assertEqual(test_resource.get('sha256'), self.resource1.sha256) - self.assertEqual(test_resource.get('sha512'), self.resource1.sha512) - self.assertEqual(test_resource.get( - 'git_sha1'), self.resource1.git_sha1) - self.assertEqual(test_resource.get('extra_data'), - self.resource1.extra_data) - self.assertEqual(test_resource.get('type'), self.resource1.type) + self.assertEqual(1, response.data.get("count")) + + test_resource = response.data.get("results")[0] + self.assertEqual( + test_resource.get("package"), self.test_url.format(str(self.package1.uuid)) + ) + self.assertEqual(test_resource.get("purl"), self.package1.package_url) + self.assertEqual(test_resource.get("path"), self.resource1.path) + 
self.assertEqual(test_resource.get("size"), self.resource1.size) + self.assertEqual(test_resource.get("sha1"), self.resource1.sha1) + self.assertEqual(test_resource.get("md5"), self.resource1.md5) + self.assertEqual(test_resource.get("sha256"), self.resource1.sha256) + self.assertEqual(test_resource.get("sha512"), self.resource1.sha512) + self.assertEqual(test_resource.get("git_sha1"), self.resource1.git_sha1) + self.assertEqual(test_resource.get("extra_data"), self.resource1.extra_data) + self.assertEqual(test_resource.get("type"), self.resource1.type) def test_api_resource_list_endpoint_filters_by_package2_uuid(self): response = self.client.get( - '/api/resources/?package={}'.format(self.package2.uuid)) + f"/api/resources/?package={self.package2.uuid}" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) - - test_resource = response.data.get('results')[0] - self.assertEqual(test_resource.get('package'), - self.test_url.format(str(self.package2.uuid))) - self.assertEqual(test_resource.get('purl'), self.package2.package_url) - self.assertEqual(test_resource.get('path'), self.resource2.path) - self.assertEqual(test_resource.get('size'), self.resource2.size) - self.assertEqual(test_resource.get('sha1'), self.resource2.sha1) - self.assertEqual(test_resource.get('md5'), self.resource2.md5) - self.assertEqual(test_resource.get('sha256'), self.resource2.sha256) - self.assertEqual(test_resource.get('sha512'), self.resource2.sha512) - self.assertEqual(test_resource.get( - 'git_sha1'), self.resource2.git_sha1) - self.assertEqual(test_resource.get('extra_data'), - self.resource2.extra_data) - self.assertEqual(test_resource.get('type'), self.resource2.type) + self.assertEqual(1, response.data.get("count")) + + test_resource = response.data.get("results")[0] + self.assertEqual( + test_resource.get("package"), self.test_url.format(str(self.package2.uuid)) + ) + self.assertEqual(test_resource.get("purl"), self.package2.package_url) + self.assertEqual(test_resource.get("path"), self.resource2.path) + self.assertEqual(test_resource.get("size"), self.resource2.size) + self.assertEqual(test_resource.get("sha1"), self.resource2.sha1) + self.assertEqual(test_resource.get("md5"), self.resource2.md5) + self.assertEqual(test_resource.get("sha256"), self.resource2.sha256) + self.assertEqual(test_resource.get("sha512"), self.resource2.sha512) + self.assertEqual(test_resource.get("git_sha1"), self.resource2.git_sha1) + self.assertEqual(test_resource.get("extra_data"), self.resource2.extra_data) + self.assertEqual(test_resource.get("type"), self.resource2.type) def test_api_resource_list_endpoint_returns_none_when_filtering_by_wrong_purl(self): response = self.client.get( - '/api/resources/?purl={}'.format('pkg:npm/test@1.0.0')) + "/api/resources/?purl={}".format("pkg:npm/test@1.0.0") + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(0, response.data.get('count')) + self.assertEqual(0, response.data.get("count")) def test_api_resource_list_endpoint_returns_none_when_filtering_by_blank_uuid(self): - response = self.client.get('/api/resources/?purl={}'.format('')) + response = self.client.get("/api/resources/?purl={}".format("")) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(2, response.data.get('count')) + self.assertEqual(2, response.data.get("count")) def test_api_resource_list_endpoint_filters_by_package1_purl(self): response = self.client.get( - 
'/api/resources/?purl={}'.format(self.package1.package_url)) + f"/api/resources/?purl={self.package1.package_url}" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) - - test_resource = response.data.get('results')[0] - self.assertEqual(test_resource.get('package'), - self.test_url.format(str(self.package1.uuid))) - self.assertEqual(test_resource.get('purl'), self.package1.package_url) - self.assertEqual(test_resource.get('path'), self.resource1.path) - self.assertEqual(test_resource.get('size'), self.resource1.size) - self.assertEqual(test_resource.get('sha1'), self.resource1.sha1) - self.assertEqual(test_resource.get('md5'), self.resource1.md5) - self.assertEqual(test_resource.get('sha256'), self.resource1.sha256) - self.assertEqual(test_resource.get('sha512'), self.resource1.sha512) - self.assertEqual(test_resource.get( - 'git_sha1'), self.resource1.git_sha1) - self.assertEqual(test_resource.get('extra_data'), - self.resource1.extra_data) - self.assertEqual(test_resource.get('type'), self.resource1.type) + self.assertEqual(1, response.data.get("count")) + + test_resource = response.data.get("results")[0] + self.assertEqual( + test_resource.get("package"), self.test_url.format(str(self.package1.uuid)) + ) + self.assertEqual(test_resource.get("purl"), self.package1.package_url) + self.assertEqual(test_resource.get("path"), self.resource1.path) + self.assertEqual(test_resource.get("size"), self.resource1.size) + self.assertEqual(test_resource.get("sha1"), self.resource1.sha1) + self.assertEqual(test_resource.get("md5"), self.resource1.md5) + self.assertEqual(test_resource.get("sha256"), self.resource1.sha256) + self.assertEqual(test_resource.get("sha512"), self.resource1.sha512) + self.assertEqual(test_resource.get("git_sha1"), self.resource1.git_sha1) + self.assertEqual(test_resource.get("extra_data"), self.resource1.extra_data) + self.assertEqual(test_resource.get("type"), self.resource1.type) def test_api_resource_list_endpoint_filters_by_package2_purl(self): response = self.client.get( - '/api/resources/?purl={}'.format(self.package2.package_url)) + f"/api/resources/?purl={self.package2.package_url}" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) - - test_resource = response.data.get('results')[0] - self.assertEqual(test_resource.get('package'), - self.test_url.format(str(self.package2.uuid))) - self.assertEqual(test_resource.get('purl'), self.package2.package_url) - self.assertEqual(test_resource.get('path'), self.resource2.path) - self.assertEqual(test_resource.get('size'), self.resource2.size) - self.assertEqual(test_resource.get('sha1'), self.resource2.sha1) - self.assertEqual(test_resource.get('md5'), self.resource2.md5) - self.assertEqual(test_resource.get('sha256'), self.resource2.sha256) - self.assertEqual(test_resource.get('sha512'), self.resource2.sha512) - self.assertEqual(test_resource.get( - 'git_sha1'), self.resource2.git_sha1) - self.assertEqual(test_resource.get('extra_data'), - self.resource2.extra_data) - self.assertEqual(test_resource.get('type'), self.resource2.type) + self.assertEqual(1, response.data.get("count")) + + test_resource = response.data.get("results")[0] + self.assertEqual( + test_resource.get("package"), self.test_url.format(str(self.package2.uuid)) + ) + self.assertEqual(test_resource.get("purl"), self.package2.package_url) + self.assertEqual(test_resource.get("path"), self.resource2.path) + 
self.assertEqual(test_resource.get("size"), self.resource2.size) + self.assertEqual(test_resource.get("sha1"), self.resource2.sha1) + self.assertEqual(test_resource.get("md5"), self.resource2.md5) + self.assertEqual(test_resource.get("sha256"), self.resource2.sha256) + self.assertEqual(test_resource.get("sha512"), self.resource2.sha512) + self.assertEqual(test_resource.get("git_sha1"), self.resource2.git_sha1) + self.assertEqual(test_resource.get("extra_data"), self.resource2.extra_data) + self.assertEqual(test_resource.get("type"), self.resource2.type) def test_api_resource_filter_by_checksums(self): sha1s = [ - 'testsha11', - 'testsha12', + "testsha11", + "testsha12", ] - data = { - 'sha1': sha1s - } - response = self.client.post( - '/api/resources/filter_by_checksums/', data=data) - self.assertEqual(2, response.data['count']) - expected = self.get_test_loc( - 'api/resource-filter_by_checksums-expected.json') - self.check_expected_results(response.data['results'], expected, fields_to_remove=[ - "url", "uuid", "package"], regen=FIXTURES_REGEN) + data = {"sha1": sha1s} + response = self.client.post("/api/resources/filter_by_checksums/", data=data) + self.assertEqual(2, response.data["count"]) + expected = self.get_test_loc("api/resource-filter_by_checksums-expected.json") + self.check_expected_results( + response.data["results"], + expected, + fields_to_remove=["url", "uuid", "package"], + regen=FIXTURES_REGEN, + ) - data = { - 'does-not-exist': 'dne' - } - response = self.client.post( - '/api/resources/filter_by_checksums/', data=data) + data = {"does-not-exist": "dne"} + response = self.client.post("/api/resources/filter_by_checksums/", data=data) self.assertEqual(status.HTTP_400_BAD_REQUEST, response.status_code) - expected_status = 'Unsupported field(s) given: does-not-exist' - self.assertEqual(expected_status, response.data['status']) + expected_status = "Unsupported field(s) given: does-not-exist" + self.assertEqual(expected_status, response.data["status"]) data = {} - response = self.client.post( - '/api/resources/filter_by_checksums/', data=data) + response = self.client.post("/api/resources/filter_by_checksums/", data=data) self.assertEqual(status.HTTP_400_BAD_REQUEST, response.status_code) - expected_status = 'No values provided' - self.assertEqual(expected_status, response.data['status']) + expected_status = "No values provided" + self.assertEqual(expected_status, response.data["status"]) class PackageApiTestCase(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): self.package_data = { - 'type': 'generic', - 'namespace': 'generic', - 'name': 'Foo', - 'version': '12.34', - 'qualifiers': 'test_qual=qual', - 'subpath': 'test_subpath', - 'download_url': 'http://example.com', - 'filename': 'Foo.zip', - 'sha1': 'testsha1', - 'md5': 'testmd5', - 'size': 101, + "type": "generic", + "namespace": "generic", + "name": "Foo", + "version": "12.34", + "qualifiers": "test_qual=qual", + "subpath": "test_subpath", + "download_url": "http://example.com", + "filename": "Foo.zip", + "sha1": "testsha1", + "md5": "testmd5", + "size": 101, } self.package = Package.objects.create(**self.package_data) self.package.refresh_from_db() - self.package.append_to_history('test-message') + self.package.append_to_history("test-message") self.package.save() self.package_data2 = { - 'type': 'npm', - 'namespace': 'example', - 'name': 'Bar', - 'version': '56.78', - 'qualifiers': 
'', - 'subpath': '', - 'download_url': 'http://somethingelse.org', - 'filename': 'Bar.zip', - 'sha1': 'testsha1-2', - 'md5': 'testmd5-2', - 'size': 100, + "type": "npm", + "namespace": "example", + "name": "Bar", + "version": "56.78", + "qualifiers": "", + "subpath": "", + "download_url": "http://somethingelse.org", + "filename": "Bar.zip", + "sha1": "testsha1-2", + "md5": "testmd5-2", + "size": 100, } self.package2 = Package.objects.create(**self.package_data2) self.package2.refresh_from_db() self.package_data3 = { - 'type': 'jar', - 'namespace': 'sample', - 'name': 'Baz', - 'version': '90.12', - 'qualifiers': '', - 'subpath': '', - 'download_url': 'http://anotherexample.com', - 'filename': 'Baz.zip', - 'sha1': 'testsha1-3', - 'md5': 'testmd5-3', - 'size': 100, + "type": "jar", + "namespace": "sample", + "name": "Baz", + "version": "90.12", + "qualifiers": "", + "subpath": "", + "download_url": "http://anotherexample.com", + "filename": "Baz.zip", + "sha1": "testsha1-3", + "md5": "testmd5-3", + "size": 100, } self.package3 = Package.objects.create(**self.package_data3) self.package3.refresh_from_db() self.package_data4 = { - 'type': 'jar', - 'namespace': 'sample', - 'name': 'Baz', - 'version': '90.123', - 'qualifiers': '', - 'subpath': '', - 'download_url': 'http://anothersample.com', - 'filename': 'Baz.zip', - 'sha1': 'testsha1-4', - 'md5': 'testmd5-3', - 'size': 100, - 'package_content': PackageContentType.BINARY, + "type": "jar", + "namespace": "sample", + "name": "Baz", + "version": "90.123", + "qualifiers": "", + "subpath": "", + "download_url": "http://anothersample.com", + "filename": "Baz.zip", + "sha1": "testsha1-4", + "md5": "testmd5-3", + "size": 100, + "package_content": PackageContentType.BINARY, } self.package4 = Package.objects.create(**self.package_data4) self.package4.refresh_from_db() self.package_data5 = { - 'type': 'maven', - 'namespace': 'foot', - 'name': 'baz', - 'version': '90.123', - 'qualifiers': 'classifier=source', - 'subpath': '', - 'download_url': 'http://test-maven.com', - 'filename': 'Baz.zip', - 'sha1': 'testsha1-5', - 'md5': 'testmd5-11', - 'size': 100, - 'package_content': PackageContentType.SOURCE_ARCHIVE, - 'declared_license_expression': 'MIT', + "type": "maven", + "namespace": "foot", + "name": "baz", + "version": "90.123", + "qualifiers": "classifier=source", + "subpath": "", + "download_url": "http://test-maven.com", + "filename": "Baz.zip", + "sha1": "testsha1-5", + "md5": "testmd5-11", + "size": 100, + "package_content": PackageContentType.SOURCE_ARCHIVE, + "declared_license_expression": "MIT", } self.package5 = Package.objects.create(**self.package_data5) self.package5.refresh_from_db() self.package_data6 = { - 'type': 'maven', - 'namespace': 'fooo', - 'name': 'baz', - 'version': '90.123', - 'qualifiers': '', - 'subpath': '', - 'download_url': 'http://test-maven-11.com', - 'filename': 'Baz.zip', - 'sha1': 'testsha1-6', - 'md5': 'testmd5-11', - 'size': 100, - 'package_content': PackageContentType.BINARY, + "type": "maven", + "namespace": "fooo", + "name": "baz", + "version": "90.123", + "qualifiers": "", + "subpath": "", + "download_url": "http://test-maven-11.com", + "filename": "Baz.zip", + "sha1": "testsha1-6", + "md5": "testmd5-11", + "size": 100, + "package_content": PackageContentType.BINARY, } self.package6 = Package.objects.create(**self.package_data6) self.package6.refresh_from_db() self.package_data7 = { - 'type': 'github', - 'namespace': 'glue', - 'name': 'cat', - 'version': '90.123', - 'qualifiers': '', - 'subpath': '', - 
'download_url': 'http://test-maven-111.com', - 'filename': 'Baz.zip', - 'sha1': 'testsha1-7', - 'md5': 'testmd5-11', - 'size': 100, - 'copyright': 'BACC', - 'package_content': PackageContentType.SOURCE_REPO, + "type": "github", + "namespace": "glue", + "name": "cat", + "version": "90.123", + "qualifiers": "", + "subpath": "", + "download_url": "http://test-maven-111.com", + "filename": "Baz.zip", + "sha1": "testsha1-7", + "md5": "testmd5-11", + "size": 100, + "copyright": "BACC", + "package_content": PackageContentType.SOURCE_REPO, } self.package7 = Package.objects.create(**self.package_data7) @@ -389,79 +386,78 @@ def setUp(self): self.packageset_1.packages.add(self.package5) self.packageset_1.packages.add(self.package7) - self.test_url = 'http://testserver/api/packages/{}/' + self.test_url = "http://testserver/api/packages/{}/" self.client = APIClient() def test_package_api_list_endpoint(self): - response = self.client.get('/api/packages/') + response = self.client.get("/api/packages/") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(7, response.data.get('count')) + self.assertEqual(7, response.data.get("count")) def test_package_api_list_endpoint_filter(self): for key, value in self.package_data.items(): - response = self.client.get( - '/api/packages/?{}={}'.format(key, value)) + response = self.client.get(f"/api/packages/?{key}={value}") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) + self.assertEqual(1, response.data.get("count")) def test_package_api_list_endpoint_filter_by_purl_fields_ignores_case(self): for key, value in self.package_data.items(): # Skip non-purl fields - if key not in ['type', 'namespace', 'name']: + if key not in ["type", "namespace", "name"]: continue response = self.client.get( - '/api/packages/?{}={}'.format(key, value.lower())) + f"/api/packages/?{key}={value.lower()}" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) + self.assertEqual(1, response.data.get("count")) response = self.client.get( - '/api/packages/?{}={}'.format(key, value.upper())) + f"/api/packages/?{key}={value.upper()}" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) + self.assertEqual(1, response.data.get("count")) def test_package_api_list_endpoint_search(self): # Create a dummy package to verify search filter works. 
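# (Illustrative aside, not part of this patch.) A minimal sketch, assuming a
# DRF viewset wired roughly as below, of where the search behavior asserted in
# this test comes from; the actual configuration in packagedb/api.py may
# differ:
#
#   from rest_framework import filters, viewsets
#
#   class PackageViewSet(viewsets.ReadOnlyModelViewSet):
#       filter_backends = [filters.SearchFilter]
#       # hypothetical field list, for illustration only
#       search_fields = ["type", "namespace", "name", "version", "download_url"]
#
# SearchFilter matches each ?search= term case-insensitively (icontains by
# default) against every field in search_fields, which is why the "dummy" and
# "DUMMY" queries below each return a single hit.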
Package.objects.create( - type='generic', - namespace='dummy-namespace', - name='dummy-name', - version='12.35', - download_url='https://dummy.com/dummy' + type="generic", + namespace="dummy-namespace", + name="dummy-name", + version="12.35", + download_url="https://dummy.com/dummy", ) + response = self.client.get("/api/packages/?search={}".format("generic")) + assert response.data.get("count") == 2 + response = self.client.get("/api/packages/?search={}".format("dummy")) + assert response.data.get("count") == 1 + response = self.client.get("/api/packages/?search={}".format("DUMMY")) + assert response.data.get("count") == 1 + response = self.client.get("/api/packages/?search={}".format("12.35")) + assert response.data.get("count") == 1 response = self.client.get( - '/api/packages/?search={}'.format('generic')) - assert response.data.get('count') == 2 - response = self.client.get('/api/packages/?search={}'.format('dummy')) - assert response.data.get('count') == 1 - response = self.client.get('/api/packages/?search={}'.format('DUMMY')) - assert response.data.get('count') == 1 - response = self.client.get('/api/packages/?search={}'.format('12.35')) - assert response.data.get('count') == 1 - response = self.client.get( - '/api/packages/?search={}'.format('https://dummy.com/dummy')) - assert response.data.get('count') == 1 + "/api/packages/?search={}".format("https://dummy.com/dummy") + ) + assert response.data.get("count") == 1 def test_package_api_retrieve_endpoint(self): - response = self.client.get( - '/api/packages/{}/'.format(self.package.uuid)) + response = self.client.get(f"/api/packages/{self.package.uuid}/") self.assertEqual(response.status_code, status.HTTP_200_OK) for key, value in response.data.items(): # Handle the API-only `url` key - if key == 'url': - self.assertEqual(value, self.test_url.format( - str(self.package.uuid))) + if key == "url": + self.assertEqual(value, self.test_url.format(str(self.package.uuid))) continue - if key in ['type', 'namespace', 'name', 'version', 'qualifiers', 'subpath']: + if key in ["type", "namespace", "name", "version", "qualifiers", "subpath"]: self.assertEqual(value, getattr(self.package, key)) continue - if key == 'history': - url = reverse('api:package-history', args=[self.package.uuid]) + if key == "history": + url = reverse("api:package-history", args=[self.package.uuid]) self.assertIn(url, value) self.assertTrue(hasattr(self.package, key)) @@ -470,117 +466,127 @@ def test_package_api_retrieve_endpoint(self): def test_api_package_latest_version_action(self): p1 = Package.objects.create( - download_url='http://a.a', type='generic', name='name', version='1.0') + download_url="http://a.a", type="generic", name="name", version="1.0" + ) p2 = Package.objects.create( - download_url='http://b.b', type='generic', name='name', version='2.0') + download_url="http://b.b", type="generic", name="name", version="2.0" + ) p3 = Package.objects.create( - download_url='http://c.c', type='generic', name='name', version='3.0') + download_url="http://c.c", type="generic", name="name", version="3.0" + ) response = self.client.get( - reverse('api:package-latest-version', args=[p1.uuid])) - self.assertEqual('3.0', response.data['version']) + reverse("api:package-latest-version", args=[p1.uuid]) + ) + self.assertEqual("3.0", response.data["version"]) response = self.client.get( - reverse('api:package-latest-version', args=[p2.uuid])) - self.assertEqual('3.0', response.data['version']) + reverse("api:package-latest-version", args=[p2.uuid]) + ) + 
self.assertEqual("3.0", response.data["version"]) response = self.client.get( - reverse('api:package-latest-version', args=[p3.uuid])) - self.assertEqual('3.0', response.data['version']) + reverse("api:package-latest-version", args=[p3.uuid]) + ) + self.assertEqual("3.0", response.data["version"]) def test_api_package_resources_action(self): # create 10 resources for i in range(0, 10): - Resource.objects.create( - package=self.package, path='path{}/'.format(i)) + Resource.objects.create(package=self.package, path=f"path{i}/") response = self.client.get( - reverse('api:package-resources', args=[self.package.uuid])) + reverse("api:package-resources", args=[self.package.uuid]) + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(10, response.data['count']) + self.assertEqual(10, response.data["count"]) - for result, i in zip(response.data['results'], range(0, 10)): - self.assertEqual(result.get('path'), 'path{}/'.format(i)) + for result, i in zip(response.data["results"], range(0, 10)): + self.assertEqual(result.get("path"), f"path{i}/") def test_api_package_list_endpoint_multiple_char_filters(self): - filters = f'?md5={self.package.md5}&md5={self.package2.md5}' - response = self.client.get(f'/api/packages/{filters}') - self.assertEqual(2, response.data['count']) - purls = [result.get('purl') for result in response.data['results']] + filters = f"?md5={self.package.md5}&md5={self.package2.md5}" + response = self.client.get(f"/api/packages/{filters}") + self.assertEqual(2, response.data["count"]) + purls = [result.get("purl") for result in response.data["results"]] self.assertIn(self.package.purl, purls) self.assertIn(self.package2.purl, purls) self.assertNotIn(self.package3.purl, purls) - filters = f'?sha1={self.package2.sha1}&sha1={self.package3.sha1}' - response = self.client.get(f'/api/packages/{filters}') + filters = f"?sha1={self.package2.sha1}&sha1={self.package3.sha1}" + response = self.client.get(f"/api/packages/{filters}") self.assertEqual(2, response.data["count"]) - purls = [result.get('purl') for result in response.data['results']] + purls = [result.get("purl") for result in response.data["results"]] self.assertIn(self.package2.purl, purls) self.assertIn(self.package3.purl, purls) self.assertNotIn(self.package.purl, purls) def test_package_api_filter_by_checksums(self): sha1s = [ - 'testsha1', - 'testsha1-2', - 'testsha1-3', - 'testsha1-4', - 'testsha1-6', + "testsha1", + "testsha1-2", + "testsha1-3", + "testsha1-4", + "testsha1-6", ] data = { - 'sha1': sha1s, + "sha1": sha1s, } - response = self.client.post( - '/api/packages/filter_by_checksums/', data=data) - self.assertEqual(5, response.data['count']) - expected = self.get_test_loc( - 'api/package-filter_by_checksums-expected.json') - self.check_expected_results(response.data['results'], expected, fields_to_remove=[ - "url", "uuid", "resources", "package_sets", "history"], regen=FIXTURES_REGEN) + response = self.client.post("/api/packages/filter_by_checksums/", data=data) + self.assertEqual(5, response.data["count"]) + expected = self.get_test_loc("api/package-filter_by_checksums-expected.json") + self.check_expected_results( + response.data["results"], + expected, + fields_to_remove=["url", "uuid", "resources", "package_sets", "history"], + regen=FIXTURES_REGEN, + ) data["enhance_package_data"] = True enhanced_response = self.client.post( - '/api/packages/filter_by_checksums/', data=data) - self.assertEqual(5, len(enhanced_response.data['results'])) + "/api/packages/filter_by_checksums/", data=data 
+ ) + self.assertEqual(5, len(enhanced_response.data["results"])) expected = self.get_test_loc( - 'api/package-filter_by_checksums-enhanced-package-data-expected.json') - self.check_expected_results(enhanced_response.data['results'], expected, fields_to_remove=[ - "url", "uuid", "resources", "package_sets", "history"], regen=FIXTURES_REGEN) + "api/package-filter_by_checksums-enhanced-package-data-expected.json" + ) + self.check_expected_results( + enhanced_response.data["results"], + expected, + fields_to_remove=["url", "uuid", "resources", "package_sets", "history"], + regen=FIXTURES_REGEN, + ) - data = { - 'does-not-exist': 'dne' - } - response = self.client.post( - '/api/packages/filter_by_checksums/', data=data) + data = {"does-not-exist": "dne"} + response = self.client.post("/api/packages/filter_by_checksums/", data=data) self.assertEqual(status.HTTP_400_BAD_REQUEST, response.status_code) - expected_status = 'Unsupported field(s) given: does-not-exist' - self.assertEqual(expected_status, response.data['status']) + expected_status = "Unsupported field(s) given: does-not-exist" + self.assertEqual(expected_status, response.data["status"]) data = {} - response = self.client.post( - '/api/packages/filter_by_checksums/', data=data) + response = self.client.post("/api/packages/filter_by_checksums/", data=data) self.assertEqual(status.HTTP_400_BAD_REQUEST, response.status_code) - expected_status = 'No values provided' - self.assertEqual(expected_status, response.data['status']) + expected_status = "No values provided" + self.assertEqual(expected_status, response.data["status"]) class PackageApiReindexingTestCase(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): - package_download_url = 'http://anotherexample.com' + package_download_url = "http://anotherexample.com" self.package_data = { - 'type': 'maven', - 'namespace': 'sample', - 'name': 'Baz', - 'version': '90.12', - 'qualifiers': '', - 'subpath': '', - 'download_url': package_download_url, - 'filename': 'Baz.zip', - 'sha1': 'testsha1-3', - 'md5': 'testmd5-3', - 'size': 100, + "type": "maven", + "namespace": "sample", + "name": "Baz", + "version": "90.12", + "qualifiers": "", + "subpath": "", + "download_url": package_download_url, + "filename": "Baz.zip", + "sha1": "testsha1-3", + "md5": "testmd5-3", + "size": 100, } self.package = Package.objects.create(**self.package_data) self.package.refresh_from_db() @@ -591,20 +597,24 @@ def setUp(self): self.scannableuri.scan_status = ScannableURI.SCAN_INDEXED self.scan_uuid = uuid4() self.scannableuri.scan_uuid = self.scan_uuid - self.scannableuri.scan_error = 'error' - self.scannableuri.index_error = 'error' + self.scannableuri.scan_error = "error" + self.scannableuri.index_error = "error" self.scan_date = timezone.now() self.scannableuri.scan_date = self.scan_date def test_reindex_package(self): self.assertEqual(1, ScannableURI.objects.all().count()) response = self.client.get( - f'/api/packages/{self.package.uuid}/reindex_package/') + f"/api/packages/{self.package.uuid}/reindex_package/" + ) self.assertEqual( - 'pkg:maven/sample/Baz@90.12 has been queued for reindexing', response.data['status']) + "pkg:maven/sample/Baz@90.12 has been queued for reindexing", + response.data["status"], + ) self.assertEqual(2, ScannableURI.objects.all().count()) new_scannable_uri = ScannableURI.objects.exclude( - pk=self.scannableuri.pk).first() + pk=self.scannableuri.pk + 
).first() self.assertEqual(self.package, new_scannable_uri.package) self.assertEqual(True, new_scannable_uri.reindex_uri) self.assertEqual(100, new_scannable_uri.priority) @@ -615,54 +625,54 @@ def test_reindex_package(self): # Ensure previous ScannableURI was not modified self.assertEqual(False, self.scannableuri.reindex_uri) self.assertEqual(0, self.scannableuri.priority) - self.assertEqual('error', self.scannableuri.scan_error) - self.assertEqual('error', self.scannableuri.index_error) + self.assertEqual("error", self.scannableuri.scan_error) + self.assertEqual("error", self.scannableuri.index_error) self.assertEqual(self.scan_date, self.scannableuri.scan_date) class PackageApiPurlFilterTestCase(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): self.package_data1 = { - 'type': 'maven', - 'namespace': 'org.apache.commons', - 'name': 'io', - 'version': '1.3.4', - 'download_url': 'http://example1.com', - 'extra_data': json.dumps({'test2': 'data2'}) + "type": "maven", + "namespace": "org.apache.commons", + "name": "io", + "version": "1.3.4", + "download_url": "http://example1.com", + "extra_data": json.dumps({"test2": "data2"}), } self.package_data2 = { - 'type': 'maven', - 'namespace': 'org.apache.commons', - 'name': 'io', - 'version': '2.3.4', - 'download_url': 'http://example2.com', - 'extra_data': json.dumps({'test2': 'data2'}) + "type": "maven", + "namespace": "org.apache.commons", + "name": "io", + "version": "2.3.4", + "download_url": "http://example2.com", + "extra_data": json.dumps({"test2": "data2"}), } self.package_data3 = { - 'type': 'maven', - 'namespace': '', - 'name': 'test', - 'version': '1.0.0', - 'qualifiers': '', - 'package_content': PackageContentType.BINARY, - 'download_url': 'https://example.com/test-1.0.0.jar', + "type": "maven", + "namespace": "", + "name": "test", + "version": "1.0.0", + "qualifiers": "", + "package_content": PackageContentType.BINARY, + "download_url": "https://example.com/test-1.0.0.jar", } self.package_data4 = { - 'type': 'maven', - 'namespace': '', - 'name': 'test', - 'version': '1.0.0', - 'qualifiers': 'classifier=sources', - 'declared_license_expression': 'apache-2.0', - 'copyright': 'Copyright (c) example corp.', - 'holder': 'example corp.', - 'package_content': PackageContentType.SOURCE_ARCHIVE, - 'download_url': 'https://example.com/test-1.0.0-sources.jar', + "type": "maven", + "namespace": "", + "name": "test", + "version": "1.0.0", + "qualifiers": "classifier=sources", + "declared_license_expression": "apache-2.0", + "copyright": "Copyright (c) example corp.", + "holder": "example corp.", + "package_content": PackageContentType.SOURCE_ARCHIVE, + "download_url": "https://example.com/test-1.0.0-sources.jar", } self.package1 = Package.objects.create(**self.package_data1) @@ -673,7 +683,7 @@ def setUp(self): self.purl1 = self.package1.package_url self.purl2 = self.package2.package_url - self.missing_purl = 'pkg:PYPI/Django_package@1.11.1.dev1' + self.missing_purl = "pkg:PYPI/Django_package@1.11.1.dev1" self.package_set1 = PackageSet.objects.create() self.package_set1.add_to_package_set(self.package1) @@ -689,192 +699,196 @@ def tearDown(self): Package.objects.all().delete() def test_package_api_purl_filter_by_query_param_invalid_purl(self): - response = self.client.get('/api/packages/?purl={}'.format('11111')) + response = self.client.get("/api/packages/?purl={}".format("11111")) 
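# (Illustrative aside, not part of this patch.) "11111" is not a valid purl,
# so the filter matches nothing and the endpoint still answers HTTP 200 with a
# count of 0 instead of erroring. A quick sketch with the packageurl library
# already used by validate_purl() above:
#
#   from packageurl import PackageURL
#
#   try:
#       PackageURL.from_string("11111")
#   except ValueError:
#       pass  # rejected: the string lacks the required "pkg:" scheme
#
#   purl = PackageURL.from_string("pkg:maven/org.apache.commons/io@1.3.4")
#   # purl.type == "maven", purl.namespace == "org.apache.commons",
#   # purl.name == "io", purl.version == "1.3.4"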
self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(0, response.data.get('count')) + self.assertEqual(0, response.data.get("count")) def test_package_api_purl_filter_by_query_param_no_value(self): - response = self.client.get('/api/packages/?purl={}'.format('')) + response = self.client.get("/api/packages/?purl={}".format("")) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(4, response.data.get('count')) + self.assertEqual(4, response.data.get("count")) def test_package_api_purl_filter_by_query_param_non_existant_purl(self): - response = self.client.get( - '/api/packages/?purl={}'.format(self.missing_purl)) + response = self.client.get(f"/api/packages/?purl={self.missing_purl}") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(0, response.data.get('count')) + self.assertEqual(0, response.data.get("count")) def test_package_api_purl_filter_by_query_param_no_version(self): response = self.client.get( - '/api/packages/?purl={}'.format('pkg:maven/org.apache.commons/io')) + "/api/packages/?purl={}".format("pkg:maven/org.apache.commons/io") + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(2, response.data.get('count')) + self.assertEqual(2, response.data.get("count")) def test_package_api_purl_filter_by_query_param1(self): - response = self.client.get('/api/packages/?purl={}'.format(self.purl1)) + response = self.client.get(f"/api/packages/?purl={self.purl1}") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) - - test_package = response.data.get('results')[0] - self.assertEqual(test_package.get('type'), - self.package_data1.get('type')) - self.assertEqual(test_package.get('namespace'), - self.package_data1.get('namespace')) - self.assertEqual(test_package.get('name'), - self.package_data1.get('name')) - self.assertEqual(test_package.get('version'), - self.package_data1.get('version')) - self.assertEqual(test_package.get('download_url'), - self.package_data1.get('download_url')) - self.assertEqual(test_package.get('extra_data'), - self.package_data1.get('extra_data')) + self.assertEqual(1, response.data.get("count")) + + test_package = response.data.get("results")[0] + self.assertEqual(test_package.get("type"), self.package_data1.get("type")) + self.assertEqual( + test_package.get("namespace"), self.package_data1.get("namespace") + ) + self.assertEqual(test_package.get("name"), self.package_data1.get("name")) + self.assertEqual(test_package.get("version"), self.package_data1.get("version")) + self.assertEqual( + test_package.get("download_url"), self.package_data1.get("download_url") + ) + self.assertEqual( + test_package.get("extra_data"), self.package_data1.get("extra_data") + ) def test_package_api_purl_filter_by_query_param2(self): - response = self.client.get('/api/packages/?purl={}'.format(self.purl2)) + response = self.client.get(f"/api/packages/?purl={self.purl2}") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) - - test_package = response.data.get('results')[0] - self.assertEqual(test_package.get('type'), - self.package_data2.get('type')) - self.assertEqual(test_package.get('namespace'), - self.package_data2.get('namespace')) - self.assertEqual(test_package.get('name'), - self.package_data2.get('name')) - self.assertEqual(test_package.get('version'), - self.package_data2.get('version')) - self.assertEqual(test_package.get('download_url'), - 
self.package_data2.get('download_url')) - self.assertEqual(test_package.get('extra_data'), - self.package_data2.get('extra_data')) + self.assertEqual(1, response.data.get("count")) + + test_package = response.data.get("results")[0] + self.assertEqual(test_package.get("type"), self.package_data2.get("type")) + self.assertEqual( + test_package.get("namespace"), self.package_data2.get("namespace") + ) + self.assertEqual(test_package.get("name"), self.package_data2.get("name")) + self.assertEqual(test_package.get("version"), self.package_data2.get("version")) + self.assertEqual( + test_package.get("download_url"), self.package_data2.get("download_url") + ) + self.assertEqual( + test_package.get("extra_data"), self.package_data2.get("extra_data") + ) def test_package_api_purl_filter_by_both_query_params(self): response = self.client.get( - '/api/packages/?purl={}&purl={}'.format(self.purl1, self.purl2)) + f"/api/packages/?purl={self.purl1}&purl={self.purl2}" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(2, response.data.get('count')) - - test_package = response.data.get('results')[0] - self.assertEqual(test_package.get('type'), - self.package_data1.get('type')) - self.assertEqual(test_package.get('namespace'), - self.package_data1.get('namespace')) - self.assertEqual(test_package.get('name'), - self.package_data1.get('name')) - self.assertEqual(test_package.get('version'), - self.package_data1.get('version')) - self.assertEqual(test_package.get('download_url'), - self.package_data1.get('download_url')) - self.assertEqual(test_package.get('extra_data'), - self.package_data1.get('extra_data')) - - test_package = response.data.get('results')[1] - self.assertEqual(test_package.get('type'), - self.package_data2.get('type')) - self.assertEqual(test_package.get('namespace'), - self.package_data2.get('namespace')) - self.assertEqual(test_package.get('name'), - self.package_data2.get('name')) - self.assertEqual(test_package.get('version'), - self.package_data2.get('version')) - self.assertEqual(test_package.get('download_url'), - self.package_data2.get('download_url')) - self.assertEqual(test_package.get('extra_data'), - self.package_data2.get('extra_data')) + self.assertEqual(2, response.data.get("count")) + + test_package = response.data.get("results")[0] + self.assertEqual(test_package.get("type"), self.package_data1.get("type")) + self.assertEqual( + test_package.get("namespace"), self.package_data1.get("namespace") + ) + self.assertEqual(test_package.get("name"), self.package_data1.get("name")) + self.assertEqual(test_package.get("version"), self.package_data1.get("version")) + self.assertEqual( + test_package.get("download_url"), self.package_data1.get("download_url") + ) + self.assertEqual( + test_package.get("extra_data"), self.package_data1.get("extra_data") + ) + + test_package = response.data.get("results")[1] + self.assertEqual(test_package.get("type"), self.package_data2.get("type")) + self.assertEqual( + test_package.get("namespace"), self.package_data2.get("namespace") + ) + self.assertEqual(test_package.get("name"), self.package_data2.get("name")) + self.assertEqual(test_package.get("version"), self.package_data2.get("version")) + self.assertEqual( + test_package.get("download_url"), self.package_data2.get("download_url") + ) + self.assertEqual( + test_package.get("extra_data"), self.package_data2.get("extra_data") + ) def test_package_api_purl_filter_by_two_purl_values_on_multiple_packages(self): extra_test_package = Package.objects.create( - 
download_url='https://extra-pkg.com/download', - type='generic', - name='extra-name', - version='2.2.2' + download_url="https://extra-pkg.com/download", + type="generic", + name="extra-name", + version="2.2.2", ) response = self.client.get( - '/api/packages/?purl={}&purl={}'.format(self.purl1, self.purl2)) + f"/api/packages/?purl={self.purl1}&purl={self.purl2}" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(2, response.data.get('count')) - - test_package = response.data.get('results')[0] - self.assertEqual(test_package.get('type'), - self.package_data1.get('type')) - self.assertEqual(test_package.get('namespace'), - self.package_data1.get('namespace')) - self.assertEqual(test_package.get('name'), - self.package_data1.get('name')) - self.assertEqual(test_package.get('version'), - self.package_data1.get('version')) - self.assertEqual(test_package.get('download_url'), - self.package_data1.get('download_url')) - self.assertEqual(test_package.get('extra_data'), - self.package_data1.get('extra_data')) - - test_package = response.data.get('results')[1] - self.assertEqual(test_package.get('type'), - self.package_data2.get('type')) - self.assertEqual(test_package.get('namespace'), - self.package_data2.get('namespace')) - self.assertEqual(test_package.get('name'), - self.package_data2.get('name')) - self.assertEqual(test_package.get('version'), - self.package_data2.get('version')) - self.assertEqual(test_package.get('download_url'), - self.package_data2.get('download_url')) - self.assertEqual(test_package.get('extra_data'), - self.package_data2.get('extra_data')) + self.assertEqual(2, response.data.get("count")) + + test_package = response.data.get("results")[0] + self.assertEqual(test_package.get("type"), self.package_data1.get("type")) + self.assertEqual( + test_package.get("namespace"), self.package_data1.get("namespace") + ) + self.assertEqual(test_package.get("name"), self.package_data1.get("name")) + self.assertEqual(test_package.get("version"), self.package_data1.get("version")) + self.assertEqual( + test_package.get("download_url"), self.package_data1.get("download_url") + ) + self.assertEqual( + test_package.get("extra_data"), self.package_data1.get("extra_data") + ) + + test_package = response.data.get("results")[1] + self.assertEqual(test_package.get("type"), self.package_data2.get("type")) + self.assertEqual( + test_package.get("namespace"), self.package_data2.get("namespace") + ) + self.assertEqual(test_package.get("name"), self.package_data2.get("name")) + self.assertEqual(test_package.get("version"), self.package_data2.get("version")) + self.assertEqual( + test_package.get("download_url"), self.package_data2.get("download_url") + ) + self.assertEqual( + test_package.get("extra_data"), self.package_data2.get("extra_data") + ) def test_package_api_purl_filter_by_one_purl_multiple_params(self): response = self.client.get( - '/api/packages/?purl={}&purl={}'.format(self.purl1, self.missing_purl)) + f"/api/packages/?purl={self.purl1}&purl={self.missing_purl}" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) - - test_package = response.data.get('results')[0] - self.assertEqual(test_package.get('type'), - self.package_data1.get('type')) - self.assertEqual(test_package.get('namespace'), - self.package_data1.get('namespace')) - self.assertEqual(test_package.get('name'), - self.package_data1.get('name')) - self.assertEqual(test_package.get('version'), - self.package_data1.get('version')) - 
self.assertEqual(test_package.get('download_url'), - self.package_data1.get('download_url')) - self.assertEqual(test_package.get('extra_data'), - self.package_data1.get('extra_data')) + self.assertEqual(1, response.data.get("count")) + + test_package = response.data.get("results")[0] + self.assertEqual(test_package.get("type"), self.package_data1.get("type")) + self.assertEqual( + test_package.get("namespace"), self.package_data1.get("namespace") + ) + self.assertEqual(test_package.get("name"), self.package_data1.get("name")) + self.assertEqual(test_package.get("version"), self.package_data1.get("version")) + self.assertEqual( + test_package.get("download_url"), self.package_data1.get("download_url") + ) + self.assertEqual( + test_package.get("extra_data"), self.package_data1.get("extra_data") + ) def test_package_api_purl_filter_by_multiple_blank_purl(self): - response = self.client.get( - '/api/packages/?purl={}&purl={}'.format('', '')) + response = self.client.get("/api/packages/?purl={}&purl={}".format("", "")) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(4, response.data.get('count')) + self.assertEqual(4, response.data.get("count")) def test_package_api_get_enhanced_package(self): response = self.client.get( - reverse('api:package-get-enhanced-package-data', args=[self.package3.uuid])) + reverse("api:package-get-enhanced-package-data", args=[self.package3.uuid]) + ) result = response.data - expected = self.get_test_loc('api/enhanced_package.json') - self.check_expected_results(result, expected, fields_to_remove=[ - 'package_sets'], regen=FIXTURES_REGEN) + expected = self.get_test_loc("api/enhanced_package.json") + self.check_expected_results( + result, expected, fields_to_remove=["package_sets"], regen=FIXTURES_REGEN + ) class CollectApiTestCase(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): - self.package_download_url = 'http://anotherexample.com' + self.package_download_url = "http://anotherexample.com" self.package_data = { - 'type': 'maven', - 'namespace': 'sample', - 'name': 'Baz', - 'version': '90.12', - 'qualifiers': '', - 'subpath': '', - 'download_url': self.package_download_url, - 'filename': 'Baz.zip', - 'sha1': 'testsha1-3', - 'md5': 'testmd5-3', - 'size': 100, + "type": "maven", + "namespace": "sample", + "name": "Baz", + "version": "90.12", + "qualifiers": "", + "subpath": "", + "download_url": self.package_download_url, + "filename": "Baz.zip", + "sha1": "testsha1-3", + "md5": "testmd5-3", + "size": 100, } self.package = Package.objects.create(**self.package_data) self.scannableuri = ScannableURI.objects.create( @@ -884,24 +898,24 @@ def setUp(self): self.scannableuri.scan_status = ScannableURI.SCAN_INDEX_FAILED self.scan_uuid = uuid4() self.scannableuri.scan_uuid = self.scan_uuid - self.scannableuri.scan_error = 'error' - self.scannableuri.index_error = 'error' + self.scannableuri.scan_error = "error" + self.scannableuri.index_error = "error" self.scan_request_date = timezone.now() self.scannableuri.scan_request_date = self.scan_request_date - self.package_download_url2 = 'http://somethingelse.org' + self.package_download_url2 = "http://somethingelse.org" self.package_data2 = { - 'type': 'npm', - 'namespace': 'example', - 'name': 'bar', - 'version': '56.78', - 'qualifiers': '', - 'subpath': '', - 'download_url': self.package_download_url2, - 'filename': 'Bar.zip', - 'sha1': 'testsha1-2', - 'md5': 
'testmd5-2', - 'size': 100, + "type": "npm", + "namespace": "example", + "name": "bar", + "version": "56.78", + "qualifiers": "", + "subpath": "", + "download_url": self.package_download_url2, + "filename": "Bar.zip", + "sha1": "testsha1-2", + "md5": "testmd5-2", + "size": 100, } self.package2 = Package.objects.create(**self.package_data2) self.scannableuri2 = ScannableURI.objects.create( @@ -911,217 +925,206 @@ def setUp(self): self.scannableuri2.scan_status = ScannableURI.SCAN_INDEX_FAILED self.scan_uuid2 = uuid4() self.scannableuri2.scan_uuid = self.scan_uuid2 - self.scannableuri2.scan_error = 'error' - self.scannableuri2.index_error = 'error' + self.scannableuri2.scan_error = "error" + self.scannableuri2.index_error = "error" self.scan_request_date2 = timezone.now() self.scannableuri2.scan_request_date = self.scan_request_date2 - self.package_download_url3 = 'http://clone.org/clone1.zip' + self.package_download_url3 = "http://clone.org/clone1.zip" self.package_data3 = { - 'type': 'pypi', - 'namespace': '', - 'name': 'clone', - 'version': '1', - 'qualifiers': '', - 'subpath': '', - 'download_url': self.package_download_url3, - 'filename': 'clone1.zip', - 'sha1': 'clone1', - 'md5': '', - 'size': 100, + "type": "pypi", + "namespace": "", + "name": "clone", + "version": "1", + "qualifiers": "", + "subpath": "", + "download_url": self.package_download_url3, + "filename": "clone1.zip", + "sha1": "clone1", + "md5": "", + "size": 100, } self.package3 = Package.objects.create(**self.package_data3) - self.package_download_url4 = 'http://clone.org/clone1-src.zip' + self.package_download_url4 = "http://clone.org/clone1-src.zip" self.package_data4 = { - 'type': 'pypi', - 'namespace': '', - 'name': 'clone', - 'version': '1', - 'qualifiers': 'package=src', - 'subpath': '', - 'download_url': self.package_download_url4, - 'filename': 'clone1-src.zip', - 'sha1': 'clone1-src', - 'md5': '', - 'size': 50, + "type": "pypi", + "namespace": "", + "name": "clone", + "version": "1", + "qualifiers": "package=src", + "subpath": "", + "download_url": self.package_download_url4, + "filename": "clone1-src.zip", + "sha1": "clone1-src", + "md5": "", + "size": 50, } self.package4 = Package.objects.create(**self.package_data4) - self.package_download_url5 = 'http://clone.org/clone1-all.zip' + self.package_download_url5 = "http://clone.org/clone1-all.zip" self.package_data5 = { - 'type': 'pypi', - 'namespace': '', - 'name': 'clone', - 'version': '1', - 'qualifiers': 'package=all', - 'subpath': '', - 'download_url': self.package_download_url5, - 'filename': 'clone1-all.zip', - 'sha1': 'clone1-all', - 'md5': '', - 'size': 25, + "type": "pypi", + "namespace": "", + "name": "clone", + "version": "1", + "qualifiers": "package=all", + "subpath": "", + "download_url": self.package_download_url5, + "filename": "clone1-all.zip", + "sha1": "clone1-all", + "md5": "", + "size": 25, } self.package5 = Package.objects.create(**self.package_data5) def test_package_live(self): - purl_str = 'pkg:maven/org.apache.twill/twill-core@0.12.0' - download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar' - purl_sources_str = f'{purl_str}?classifier=sources' - sources_download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar' - - self.assertEqual(0, Package.objects.filter( - download_url=download_url).count()) - self.assertEqual(0, Package.objects.filter( - download_url=sources_download_url).count()) - response = 
self.client.get(f'/api/collect/?purl={purl_str}') - self.assertEqual(1, Package.objects.filter( - download_url=download_url).count()) - self.assertEqual(1, Package.objects.filter( - download_url=sources_download_url).count()) - expected = self.get_test_loc('api/twill-core-0.12.0.json') + purl_str = "pkg:maven/org.apache.twill/twill-core@0.12.0" + download_url = "https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar" + purl_sources_str = f"{purl_str}?classifier=sources" + sources_download_url = "https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar" + + self.assertEqual(0, Package.objects.filter(download_url=download_url).count()) + self.assertEqual( + 0, Package.objects.filter(download_url=sources_download_url).count() + ) + response = self.client.get(f"/api/collect/?purl={purl_str}") + self.assertEqual(1, Package.objects.filter(download_url=download_url).count()) + self.assertEqual( + 1, Package.objects.filter(download_url=sources_download_url).count() + ) + expected = self.get_test_loc("api/twill-core-0.12.0.json") self.assertEqual(2, len(response.data)) result = response.data[0] # remove fields - result.pop('url') - fields_to_remove = [ - 'uuid', - 'resources', - 'package_sets', - 'history' - ] + result.pop("url") + fields_to_remove = ["uuid", "resources", "package_sets", "history"] self.check_expected_results( - result, expected, fields_to_remove=fields_to_remove, regen=FIXTURES_REGEN) + result, expected, fields_to_remove=fields_to_remove, regen=FIXTURES_REGEN + ) # Ensure that the created ScannableURI objects have a priority of 100 package = Package.objects.get(download_url=download_url) source_package = Package.objects.get(download_url=sources_download_url) package_scannable_uri = ScannableURI.objects.get(package=package) - source_package_scannable_uri = ScannableURI.objects.get( - package=source_package) + source_package_scannable_uri = ScannableURI.objects.get(package=source_package) self.assertEqual(100, package_scannable_uri.priority) self.assertEqual(100, source_package_scannable_uri.priority) def test_package_live_works_with_purl2vcs(self): purl = "pkg:maven/org.elasticsearch.plugin/elasticsearch-scripting-painless-spi@6.8.15" - download_url = 'https://repo1.maven.org/maven2/org/elasticsearch/plugin/elasticsearch-scripting-painless-spi/6.8.15/elasticsearch-scripting-painless-spi-6.8.15.jar' - purl_sources_str = f'{purl}?classifier=sources' - sources_download_url = 'https://repo1.maven.org/maven2/org/elasticsearch/plugin/elasticsearch-scripting-painless-spi/6.8.15/elasticsearch-scripting-painless-spi-6.8.15-sources.jar' - - self.assertEqual(0, Package.objects.filter( - download_url=download_url).count()) - self.assertEqual(0, Package.objects.filter( - download_url=sources_download_url).count()) - response = self.client.get(f'/api/collect/?purl={purl}') - self.assertEqual(1, Package.objects.filter( - download_url=download_url).count()) - self.assertEqual(1, Package.objects.filter( - download_url=sources_download_url).count()) + download_url = "https://repo1.maven.org/maven2/org/elasticsearch/plugin/elasticsearch-scripting-painless-spi/6.8.15/elasticsearch-scripting-painless-spi-6.8.15.jar" + purl_sources_str = f"{purl}?classifier=sources" + sources_download_url = "https://repo1.maven.org/maven2/org/elasticsearch/plugin/elasticsearch-scripting-painless-spi/6.8.15/elasticsearch-scripting-painless-spi-6.8.15-sources.jar" + + self.assertEqual(0, Package.objects.filter(download_url=download_url).count()) + 
self.assertEqual( + 0, Package.objects.filter(download_url=sources_download_url).count() + ) + response = self.client.get(f"/api/collect/?purl={purl}") + self.assertEqual(1, Package.objects.filter(download_url=download_url).count()) + self.assertEqual( + 1, Package.objects.filter(download_url=sources_download_url).count() + ) expected = self.get_test_loc( - 'api/elasticsearch-scripting-painless-spi-6.8.15.json') + "api/elasticsearch-scripting-painless-spi-6.8.15.json" + ) self.assertEqual(2, len(response.data)) result = response.data[0] # remove fields - result.pop('url') - fields_to_remove = [ - 'uuid', - 'resources', - 'package_sets', - 'history' - ] + result.pop("url") + fields_to_remove = ["uuid", "resources", "package_sets", "history"] self.check_expected_results( - result, expected, fields_to_remove=fields_to_remove, regen=FIXTURES_REGEN) + result, expected, fields_to_remove=fields_to_remove, regen=FIXTURES_REGEN + ) def test_collect_sort(self): - purl_str = 'pkg:pypi/clone@1' - response = self.client.get(f'/api/collect/?purl={purl_str}&sort=size') + purl_str = "pkg:pypi/clone@1" + response = self.client.get(f"/api/collect/?purl={purl_str}&sort=size") for i, package_data in enumerate(response.data[1:], start=1): - prev_package_data = response.data[i-1] - self.assertTrue(prev_package_data['size'] < package_data['size']) + prev_package_data = response.data[i - 1] + self.assertTrue(prev_package_data["size"] < package_data["size"]) - response = self.client.get(f'/api/collect/?purl={purl_str}&sort=-size') + response = self.client.get(f"/api/collect/?purl={purl_str}&sort=-size") for i, package_data in enumerate(response.data[1:], start=1): - prev_package_data = response.data[i-1] - self.assertTrue(prev_package_data['size'] > package_data['size']) + prev_package_data = response.data[i - 1] + self.assertTrue(prev_package_data["size"] > package_data["size"]) def test_package_api_index_packages_endpoint(self): priority_resource_uris_count = PriorityResourceURI.objects.all().count() self.assertEqual(0, priority_resource_uris_count) packages = [ - {'purl': 'pkg:maven/ch.qos.reload4j/reload4j@1.2.24'}, - {'purl': 'pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0'}, - {'purl': 'pkg:bitbucket/example/example@1.0.0'}, + {"purl": "pkg:maven/ch.qos.reload4j/reload4j@1.2.24"}, + {"purl": "pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0"}, + {"purl": "pkg:bitbucket/example/example@1.0.0"}, ] - data = { - 'packages': packages - } + data = {"packages": packages} response = self.client.post( - '/api/collect/index_packages/', data=data, content_type="application/json") - self.assertEqual(2, response.data['queued_packages_count']) + "/api/collect/index_packages/", data=data, content_type="application/json" + ) + self.assertEqual(2, response.data["queued_packages_count"]) expected_queued_packages = [ - 'pkg:maven/ch.qos.reload4j/reload4j@1.2.24', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0', + "pkg:maven/ch.qos.reload4j/reload4j@1.2.24", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0", ] self.assertEqual( - sorted(expected_queued_packages), - sorted(response.data['queued_packages']) - ) - self.assertEqual(0, response.data['unqueued_packages_count']) - self.assertEqual([], response.data['unqueued_packages']) - self.assertEqual(1, response.data['unsupported_packages_count']) - expected_unsupported_packages = [ - 'pkg:bitbucket/example/example@1.0.0' - ] - self.assertEqual(expected_unsupported_packages, - response.data['unsupported_packages']) + sorted(expected_queued_packages), 
sorted(response.data["queued_packages"]) + ) + self.assertEqual(0, response.data["unqueued_packages_count"]) + self.assertEqual([], response.data["unqueued_packages"]) + self.assertEqual(1, response.data["unsupported_packages_count"]) + expected_unsupported_packages = ["pkg:bitbucket/example/example@1.0.0"] + self.assertEqual( + expected_unsupported_packages, response.data["unsupported_packages"] + ) priority_resource_uris_count = PriorityResourceURI.objects.all().count() self.assertEqual(2, priority_resource_uris_count) # Ensure that we don't add the same packages to the queue if they have # not yet been processed purls = [ - {'purl': 'pkg:maven/ch.qos.reload4j/reload4j@1.2.24'}, - {'purl': 'pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0'}, - {'purl': 'pkg:bitbucket/example/example@1.0.0'}, + {"purl": "pkg:maven/ch.qos.reload4j/reload4j@1.2.24"}, + {"purl": "pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0"}, + {"purl": "pkg:bitbucket/example/example@1.0.0"}, ] - data = { - 'packages': purls - } + data = {"packages": purls} response = self.client.post( - '/api/collect/index_packages/', data=data, content_type="application/json") - self.assertEqual(0, response.data['queued_packages_count']) - self.assertEqual([], response.data['queued_packages']) - self.assertEqual(0, response.data['requeued_packages_count']) - self.assertEqual([], response.data['requeued_packages']) - self.assertEqual(2, response.data['unqueued_packages_count']) + "/api/collect/index_packages/", data=data, content_type="application/json" + ) + self.assertEqual(0, response.data["queued_packages_count"]) + self.assertEqual([], response.data["queued_packages"]) + self.assertEqual(0, response.data["requeued_packages_count"]) + self.assertEqual([], response.data["requeued_packages"]) + self.assertEqual(2, response.data["unqueued_packages_count"]) expected_unqueued_packages = [ - 'pkg:maven/ch.qos.reload4j/reload4j@1.2.24', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0', + "pkg:maven/ch.qos.reload4j/reload4j@1.2.24", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0", ] self.assertEqual( sorted(expected_unqueued_packages), - sorted(response.data['unqueued_packages']) + sorted(response.data["unqueued_packages"]), + ) + self.assertEqual(1, response.data["unsupported_packages_count"]) + expected_unsupported_packages = ["pkg:bitbucket/example/example@1.0.0"] + self.assertEqual( + expected_unsupported_packages, response.data["unsupported_packages"] ) - self.assertEqual(1, response.data['unsupported_packages_count']) - expected_unsupported_packages = [ - 'pkg:bitbucket/example/example@1.0.0' - ] - self.assertEqual(expected_unsupported_packages, - response.data['unsupported_packages']) - bad_data = {'does-not-exist': 'dne'} + bad_data = {"does-not-exist": "dne"} response = self.client.post( - '/api/collect/index_packages/', data=bad_data, content_type="application/json") - expected_errors = {'packages': ['This field is required.']} + "/api/collect/index_packages/", + data=bad_data, + content_type="application/json", + ) + expected_errors = {"packages": ["This field is required."]} self.assertEqual(status.HTTP_400_BAD_REQUEST, response.status_code) - self.assertEqual(expected_errors, response.data['errors']) + self.assertEqual(expected_errors, response.data["errors"]) @mock.patch("packagedb.api.get_all_versions") def test_package_api_index_packages_endpoint_with_vers(self, mock_get_all_versions): @@ -1168,11 +1171,10 @@ def test_package_api_index_packages_endpoint_with_vers(self, mock_get_all_versio 
"pkg:maven/ch.qos.reload4j/reload4j@1.2.23", ] self.assertEqual( - sorted(expected_queued_packages), sorted( - response.data["queued_packages"]) + sorted(expected_queued_packages), sorted(response.data["queued_packages"]) ) - self.assertEqual(0, response.data['requeued_packages_count']) - self.assertEqual([], response.data['requeued_packages']) + self.assertEqual(0, response.data["requeued_packages_count"]) + self.assertEqual([], response.data["requeued_packages"]) self.assertEqual(0, response.data["unqueued_packages_count"]) self.assertEqual([], response.data["unqueued_packages"]) self.assertEqual(0, response.data["unsupported_packages_count"]) @@ -1180,7 +1182,9 @@ def test_package_api_index_packages_endpoint_with_vers(self, mock_get_all_versio self.assertEqual(9, priority_resource_uris_count) @mock.patch("packagedb.api.get_all_versions") - def test_package_api_index_packages_endpoint_all_version_index(self, mock_get_all_versions): + def test_package_api_index_packages_endpoint_all_version_index( + self, mock_get_all_versions + ): priority_resource_uris_count = PriorityResourceURI.objects.all().count() self.assertEqual(0, priority_resource_uris_count) packages = [ @@ -1227,11 +1231,10 @@ def test_package_api_index_packages_endpoint_all_version_index(self, mock_get_al "pkg:maven/ch.qos.reload4j/reload4j@1.2.25", ] self.assertEqual( - sorted(expected_queued_packages), sorted( - response.data["queued_packages"]) + sorted(expected_queued_packages), sorted(response.data["queued_packages"]) ) - self.assertEqual(0, response.data['requeued_packages_count']) - self.assertEqual([], response.data['requeued_packages']) + self.assertEqual(0, response.data["requeued_packages_count"]) + self.assertEqual([], response.data["requeued_packages"]) self.assertEqual(0, response.data["unqueued_packages_count"]) self.assertEqual([], response.data["unqueued_packages"]) self.assertEqual(0, response.data["unsupported_packages_count"]) @@ -1245,66 +1248,64 @@ def test_reindex_packages_bulk(self): self.assertEqual(False, self.scannableuri.reindex_uri) self.assertEqual(0, self.scannableuri.priority) self.assertEqual(self.scan_uuid, self.scannableuri.scan_uuid) - self.assertEqual('error', self.scannableuri.scan_error) - self.assertEqual('error', self.scannableuri.index_error) - self.assertEqual(self.scan_request_date, - self.scannableuri.scan_request_date) - self.assertEqual(ScannableURI.SCAN_INDEX_FAILED, - self.scannableuri.scan_status) + self.assertEqual("error", self.scannableuri.scan_error) + self.assertEqual("error", self.scannableuri.index_error) + self.assertEqual(self.scan_request_date, self.scannableuri.scan_request_date) + self.assertEqual(ScannableURI.SCAN_INDEX_FAILED, self.scannableuri.scan_status) self.assertEqual(False, self.scannableuri2.reindex_uri) self.assertEqual(0, self.scannableuri2.priority) self.assertEqual(self.scan_uuid2, self.scannableuri2.scan_uuid) - self.assertEqual('error', self.scannableuri2.scan_error) - self.assertEqual('error', self.scannableuri2.index_error) - self.assertEqual(self.scan_request_date2, - self.scannableuri2.scan_request_date) - self.assertEqual(ScannableURI.SCAN_INDEX_FAILED, - self.scannableuri2.scan_status) + self.assertEqual("error", self.scannableuri2.scan_error) + self.assertEqual("error", self.scannableuri2.index_error) + self.assertEqual(self.scan_request_date2, self.scannableuri2.scan_request_date) + self.assertEqual(ScannableURI.SCAN_INDEX_FAILED, self.scannableuri2.scan_status) packages = [ # Existing package { - "purl": 'pkg:maven/sample/Baz@90.12', + 
"purl": "pkg:maven/sample/Baz@90.12", }, { - "purl": 'pkg:npm/example/bar@56.78', + "purl": "pkg:npm/example/bar@56.78", }, # NOt in DB and unsupported { - "purl": 'pkg:pypi/does/not-exist@1', + "purl": "pkg:pypi/does/not-exist@1", }, ] data = {"packages": packages, "reindex": True} existing_purls = [ - 'pkg:maven/sample/Baz@90.12', - 'pkg:npm/example/bar@56.78', + "pkg:maven/sample/Baz@90.12", + "pkg:npm/example/bar@56.78", ] unsupported_purls = [ - 'pkg:pypi/does/not-exist@1', + "pkg:pypi/does/not-exist@1", ] response = self.client.post( - f'/api/collect/index_packages/', data=data, content_type="application/json") - - self.assertEqual(2, response.data['requeued_packages_count']) - self.assertListEqual(sorted(existing_purls), sorted( - response.data['requeued_packages'])) + "/api/collect/index_packages/", data=data, content_type="application/json" + ) - self.assertEqual(1, response.data['unsupported_packages_count']) + self.assertEqual(2, response.data["requeued_packages_count"]) self.assertListEqual( - unsupported_purls, response.data['unsupported_packages']) + sorted(existing_purls), sorted(response.data["requeued_packages"]) + ) + + self.assertEqual(1, response.data["unsupported_packages_count"]) + self.assertListEqual(unsupported_purls, response.data["unsupported_packages"]) - self.assertEqual(0, response.data['queued_packages_count']) - self.assertEqual([], response.data['queued_packages']) + self.assertEqual(0, response.data["queued_packages_count"]) + self.assertEqual([], response.data["queued_packages"]) self.assertEqual(0, response.data["unqueued_packages_count"]) self.assertEqual([], response.data["unqueued_packages"]) self.assertEqual(4, ScannableURI.objects.all().count()) new_scannable_uris = ScannableURI.objects.exclude( - pk__in=[self.scannableuri.pk, self.scannableuri2.pk]) + pk__in=[self.scannableuri.pk, self.scannableuri2.pk] + ) self.assertEqual(2, new_scannable_uris.count()) for scannable_uri in new_scannable_uris: @@ -1319,147 +1320,143 @@ def test_package_api_index_packages_priority(self): priority_resource_uris_count = PriorityResourceURI.objects.all().count() self.assertEqual(0, priority_resource_uris_count) packages = [ - {'purl': 'pkg:maven/ch.qos.reload4j/reload4j@1.2.24'}, - {'purl': 'pkg:maven/com.esotericsoftware.kryo/kryo'}, + {"purl": "pkg:maven/ch.qos.reload4j/reload4j@1.2.24"}, + {"purl": "pkg:maven/com.esotericsoftware.kryo/kryo"}, ] - data = { - 'packages': packages - } + data = {"packages": packages} response = self.client.post( - '/api/collect/index_packages/', data=data, content_type="application/json") - self.assertEqual(14, response.data['queued_packages_count']) + "/api/collect/index_packages/", data=data, content_type="application/json" + ) + self.assertEqual(14, response.data["queued_packages_count"]) expected_kryo_packages = [ - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.10', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.12', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.14', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.16', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.17', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.19', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.20', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.21', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.21.1', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.22', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.23.0', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.23.1', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0', + "pkg:maven/com.esotericsoftware.kryo/kryo@2.10", + 
"pkg:maven/com.esotericsoftware.kryo/kryo@2.12", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.14", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.16", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.17", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.19", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.20", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.21", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.21.1", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.22", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.23.0", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.23.1", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0", + ] + expected_queued_packages = expected_kryo_packages + [ + "pkg:maven/ch.qos.reload4j/reload4j@1.2.24" ] - expected_queued_packages = expected_kryo_packages + \ - ['pkg:maven/ch.qos.reload4j/reload4j@1.2.24'] self.assertEqual( - sorted(expected_queued_packages), - sorted(response.data['queued_packages']) + sorted(expected_queued_packages), sorted(response.data["queued_packages"]) ) priority_resource_uri = PriorityResourceURI.objects.get( - package_url='pkg:maven/ch.qos.reload4j/reload4j@1.2.24') + package_url="pkg:maven/ch.qos.reload4j/reload4j@1.2.24" + ) self.assertEqual(100, priority_resource_uri.priority) for purl in expected_kryo_packages: - priority_resource_uri = PriorityResourceURI.objects.get( - package_url=purl) + priority_resource_uri = PriorityResourceURI.objects.get(package_url=purl) self.assertEqual(0, priority_resource_uri.priority) def test_collect_errors(self): - invalid_purl = 'pkg:asdf1' - response = self.client.get(f'/api/collect/?purl={invalid_purl}') + invalid_purl = "pkg:asdf1" + response = self.client.get(f"/api/collect/?purl={invalid_purl}") self.assertEqual(status.HTTP_400_BAD_REQUEST, response.status_code) - expected_status = {'purl': [ - "purl validation error: purl is missing the required type component: 'pkg:asdf1'."]} - self.assertEqual(expected_status, response.data['errors']) + expected_status = { + "purl": [ + "purl validation error: purl is missing the required type component: 'pkg:asdf1'." 
+ ] + } + self.assertEqual(expected_status, response.data["errors"]) - unhandled_purl = 'pkg:does-not-exist/does-not-exist@1.0' - response = self.client.get(f'/api/collect/?purl={unhandled_purl}') + unhandled_purl = "pkg:does-not-exist/does-not-exist@1.0" + response = self.client.get(f"/api/collect/?purl={unhandled_purl}") self.assertEqual(status.HTTP_400_BAD_REQUEST, response.status_code) - expected_status = f'cannot fetch Package data for {unhandled_purl}: no available handler' - self.assertEqual(expected_status, response.data['status']) + expected_status = ( + f"cannot fetch Package data for {unhandled_purl}: no available handler" + ) + self.assertEqual(expected_status, response.data["status"]) - purl_str = 'pkg:maven/does-not-exist@1.0' - response = self.client.get(f'/api/collect/?purl={purl_str}') + purl_str = "pkg:maven/does-not-exist@1.0" + response = self.client.get(f"/api/collect/?purl={purl_str}") self.assertEqual(status.HTTP_400_BAD_REQUEST, response.status_code) expected_status = ( - 'error(s) occurred when fetching metadata for pkg:maven/does-not-exist@1.0: ' - 'Package does not exist on maven: pkg:maven/does-not-exist@1.0\n' - 'Package does not exist on maven: pkg:maven/does-not-exist@1.0?classifier=sources\n' + "error(s) occurred when fetching metadata for pkg:maven/does-not-exist@1.0: " + "Package does not exist on maven: pkg:maven/does-not-exist@1.0\n" + "Package does not exist on maven: pkg:maven/does-not-exist@1.0?classifier=sources\n" ) - self.assertEqual(expected_status, response.data['status']) + self.assertEqual(expected_status, response.data["status"]) class ResourceApiTestCase(TestCase): - def setUp(self): self.package_data = { - 'type': 'generic', - 'namespace': 'generic', - 'name': 'Foo', - 'version': '12.34', - 'qualifiers': 'test_qual=qual', - 'subpath': 'test_subpath', - 'download_url': 'http://example.com', - 'filename': 'Foo.zip', - 'sha1': 'testsha1', - 'md5': 'testmd5', - 'size': 101, + "type": "generic", + "namespace": "generic", + "name": "Foo", + "version": "12.34", + "qualifiers": "test_qual=qual", + "subpath": "test_subpath", + "download_url": "http://example.com", + "filename": "Foo.zip", + "sha1": "testsha1", + "md5": "testmd5", + "size": 101, } self.package = Package.objects.create(**self.package_data) self.package.refresh_from_db() self.resource1 = Resource.objects.create( - path='foo', - name='foo', - sha1='sha1-1', - md5='md5-1', - package=self.package + path="foo", name="foo", sha1="sha1-1", md5="md5-1", package=self.package ) self.resource1.refresh_from_db() self.resource2 = Resource.objects.create( - path='foo/bar', - name='bar', - sha1='sha1-2', - md5='md5-2', - package=self.package + path="foo/bar", name="bar", sha1="sha1-2", md5="md5-2", package=self.package ) self.resource2.refresh_from_db() def test_api_resource_checksum_filter(self): - filters = f'?md5={self.resource1.md5}&md5={self.resource2.md5}' - response = self.client.get(f'/api/resources/{filters}') - self.assertEqual(2, response.data['count']) - names = sorted([result.get('name') - for result in response.data['results']]) - expected_names = sorted([ - self.resource1.name, - self.resource2.name, - ]) + filters = f"?md5={self.resource1.md5}&md5={self.resource2.md5}" + response = self.client.get(f"/api/resources/{filters}") + self.assertEqual(2, response.data["count"]) + names = sorted([result.get("name") for result in response.data["results"]]) + expected_names = sorted( + [ + self.resource1.name, + self.resource2.name, + ] + ) self.assertEqual(expected_names, names) - filters = 
f'?sha1={self.resource1.sha1}&sha1={self.resource2.sha1}' - response = self.client.get(f'/api/resources/{filters}') + filters = f"?sha1={self.resource1.sha1}&sha1={self.resource2.sha1}" + response = self.client.get(f"/api/resources/{filters}") self.assertEqual(2, response.data["count"]) - names = sorted([result.get('name') - for result in response.data['results']]) - expected_names = sorted([ - self.resource1.name, - self.resource2.name, - ]) + names = sorted([result.get("name") for result in response.data["results"]]) + expected_names = sorted( + [ + self.resource1.name, + self.resource2.name, + ] + ) self.assertEqual(expected_names, names) class PackageUpdateSetTestCase(TestCase): - def setUp(self): self.package_data = { - 'type': 'npm', - 'namespace': '', - 'name': 'foobar', - 'version': '1.1.0', - 'qualifiers': '', - 'subpath': '', - 'download_url': '', - 'filename': 'Foo.zip', - 'sha1': 'testsha1', - 'md5': 'testmd5', - 'size': 101, - 'package_content': 1 + "type": "npm", + "namespace": "", + "name": "foobar", + "version": "1.1.0", + "qualifiers": "", + "subpath": "", + "download_url": "", + "filename": "Foo.zip", + "sha1": "testsha1", + "md5": "testmd5", + "size": 101, + "package_content": 1, } self.package = Package.objects.create(**self.package_data) self.package.refresh_from_db() @@ -1468,97 +1465,84 @@ def setUp(self): def test_api_purl_updation(self): data = { - "purls": [ - {"purl": "pkg:npm/hologram@1.1.0", "content_type": "CURATION"}], - "uuid": str(self.new_package_set_uuid) + "purls": [{"purl": "pkg:npm/hologram@1.1.0", "content_type": "CURATION"}], + "uuid": str(self.new_package_set_uuid), } response = self.client.post( - f"/api/update_packages/", data=data, content_type="application/json") + "/api/update_packages/", data=data, content_type="application/json" + ) - expected = [{"purl": "pkg:npm/hologram@1.1.0", - "update_status": "Updated"}] + expected = [{"purl": "pkg:npm/hologram@1.1.0", "update_status": "Updated"}] self.assertEqual(expected, response.data) def test_api_purl_updation_existing_package(self): data = { - "purls": [ - {"purl": "pkg:npm/foobar@1.1.0", "content_type": "PATCH"} - ], - "uuid": str(self.new_package_set_uuid) + "purls": [{"purl": "pkg:npm/foobar@1.1.0", "content_type": "PATCH"}], + "uuid": str(self.new_package_set_uuid), } - expected = [{"purl": "pkg:npm/foobar@1.1.0", - "update_status": "Already Exists"}] + expected = [{"purl": "pkg:npm/foobar@1.1.0", "update_status": "Already Exists"}] response = self.client.post( - f"/api/update_packages/", data=data, content_type="application/json") + "/api/update_packages/", data=data, content_type="application/json" + ) self.assertEqual(expected, response.data) def test_api_purl_updation_non_existing_uuid(self): data = { - "purls": [ - {"purl": "pkg:npm/foobar@1.1.0", "content_type": "SOURCE_REPO"} - ], - "uuid": "ac9c36f4-a1ed-4824-8448-c6ed8f1da71d" + "purls": [{"purl": "pkg:npm/foobar@1.1.0", "content_type": "SOURCE_REPO"}], + "uuid": "ac9c36f4-a1ed-4824-8448-c6ed8f1da71d", } expected = { - "update_status": "No Package Set found for ac9c36f4-a1ed-4824-8448-c6ed8f1da71d"} + "update_status": "No Package Set found for ac9c36f4-a1ed-4824-8448-c6ed8f1da71d" + } response = self.client.post( - f"/api/update_packages/", data=data, content_type="application/json") + "/api/update_packages/", data=data, content_type="application/json" + ) self.assertEqual(expected, response.data) def test_api_purl_updation_without_uuid(self): - data = { - "purls": [ - {"purl": "pkg:npm/jammy@1.1.9", "content_type": "BINARY"} - ] - } 
+ data = {"purls": [{"purl": "pkg:npm/jammy@1.1.9", "content_type": "BINARY"}]} - expected = [{"purl": "pkg:npm/jammy@1.1.9", - "update_status": "Updated"}] + expected = [{"purl": "pkg:npm/jammy@1.1.9", "update_status": "Updated"}] response = self.client.post( - f"/api/update_packages/", data=data, content_type="application/json") + "/api/update_packages/", data=data, content_type="application/json" + ) self.assertEqual(expected, response.data) def test_api_purl_validation_empty_request(self): data = {} response = self.client.post( - f"/api/update_packages/", data=data, content_type="application/json") + "/api/update_packages/", data=data, content_type="application/json" + ) - expected = { - "errors": { - "purls": [ - "This field is required." - ] - } - } + expected = {"errors": {"purls": ["This field is required."]}} self.assertAlmostEqual(expected, response.data) class PurlValidateApiTestCase(TestCase): - def setUp(self): self.package_data = { - 'type': 'npm', - 'namespace': '', - 'name': 'foobar', - 'version': '1,1.0', - 'qualifiers': '', - 'subpath': '', - 'download_url': '', - 'filename': 'Foo.zip', - 'sha1': 'testsha1', - 'md5': 'testmd5', - 'size': 101, + "type": "npm", + "namespace": "", + "name": "foobar", + "version": "1,1.0", + "qualifiers": "", + "subpath": "", + "download_url": "", + "filename": "Foo.zip", + "sha1": "testsha1", + "md5": "testmd5", + "size": 101, } self.package = Package.objects.create(**self.package_data) self.package.refresh_from_db() @@ -1568,13 +1552,13 @@ def test_api_purl_validation(self): "purl": "pkg:npm/foobar@1.1.0", "check_existence": True, } - response1 = self.client.get(f"/api/validate/", data=data1) + response1 = self.client.get("/api/validate/", data=data1) data2 = { "purl": "pkg:npm/?foobar@1.1.0", "check_existence": True, } - response2 = self.client.get(f"/api/validate/", data=data2) + response2 = self.client.get("/api/validate/", data=data2) self.assertEqual(True, response1.data["valid"]) self.assertEqual(True, response1.data["exists"]) @@ -1595,31 +1579,25 @@ def test_api_purl_validation_unsupported_package_type(self): "purl": "pkg:random/foobar@1.1.0", "check_existence": True, } - response1 = self.client.get(f"/api/validate/", data=data1) + response1 = self.client.get("/api/validate/", data=data1) self.assertEqual(True, response1.data["valid"]) self.assertEqual( - "The provided PackageURL is valid, but `check_existence` is not supported for this package type.", response1.data[ - "message"] + "The provided PackageURL is valid, but `check_existence` is not supported for this package type.", + response1.data["message"], ) self.assertEqual(None, response1.data["exists"]) def test_api_purl_validation_empty_request(self): data1 = {} - response1 = self.client.get(f"/api/validate/", data=data1) + response1 = self.client.get("/api/validate/", data=data1) data2 = { "does-not-exist": "dne", } - response2 = self.client.get(f"/api/validate/", data=data2) + response2 = self.client.get("/api/validate/", data=data2) - expected = { - "errors": { - "purl": [ - "This field is required." 
- ] - } - } + expected = {"errors": {"purl": ["This field is required."]}} self.assertAlmostEqual(expected, response1.data) self.assertEqual(status.HTTP_400_BAD_REQUEST, response1.status_code) @@ -1629,7 +1607,6 @@ def test_api_purl_validation_empty_request(self): class PackageWatchTestCase(TestCase): - @mock.patch("packagedb.models.PackageWatch.create_new_job") def setUp(self, mock_create_new_job): mock_create_new_job.return_value = None @@ -1723,12 +1700,10 @@ def test_api_package_watch_put_not_allowed(self): "/api/watch/pkg:npm/foobar/", data=data, content_type="application/json" ) - self.assertEqual(status.HTTP_405_METHOD_NOT_ALLOWED, - response1.status_code) + self.assertEqual(status.HTTP_405_METHOD_NOT_ALLOWED, response1.status_code) class ToGolangPurlTestCase(TestCase): - def test_to_golang_purl(self): response = self.client.get( "/api/to_purl/go", diff --git a/packagedb/tests/test_filters.py b/packagedb/tests/test_filters.py index c4d5fb65..baf910f1 100644 --- a/packagedb/tests/test_filters.py +++ b/packagedb/tests/test_filters.py @@ -8,27 +8,27 @@ # from django.test import TestCase + from packagedb.api import PackageFilterSet from packagedb.filters import parse_query_string_to_lookups from packagedb.models import Package class PackageDBFilterTest(TestCase): - def test_scanpipe_filters_package_filterset_search(self): p1 = Package.objects.create( - type='maven', - namespace='org.example', - name='foo', - version='1.0.0', - download_url='https://example.com/foo-1.0.0.jar', + type="maven", + namespace="org.example", + name="foo", + version="1.0.0", + download_url="https://example.com/foo-1.0.0.jar", ) p2 = Package.objects.create( - type='maven', - namespace='org.somethingelse', - name='foo', - version='0.35.7', - download_url='https://somethingelse.net/foo-0.35.7.jar', + type="maven", + namespace="org.somethingelse", + name="foo", + version="0.35.7", + download_url="https://somethingelse.net/foo-0.35.7.jar", ) filterset = PackageFilterSet(data={}) @@ -80,6 +80,5 @@ def test_packagedb_filters_parse_query_string_to_lookups(self): } for query_string, expected in inputs.items(): - lookups = parse_query_string_to_lookups( - query_string, "icontains", "name") + lookups = parse_query_string_to_lookups(query_string, "icontains", "name") self.assertEqual(expected, str(lookups)) diff --git a/packagedb/tests/test_migrations.py b/packagedb/tests/test_migrations.py index cd07a379..4015bacb 100644 --- a/packagedb/tests/test_migrations.py +++ b/packagedb/tests/test_migrations.py @@ -96,21 +96,27 @@ def test_package_set_creation(self): self.package4, self.package5, ] - self.assertTrue(all(package.package_sets for package in packages_in_package_sets)) + self.assertTrue( + all(package.package_sets for package in packages_in_package_sets) + ) package_set1 = PackageSet.objects.get(uuid=self.package_set1) self.assertTrue(package_set1) - self.assertRaises(PackageSet.DoesNotExist, PackageSet.objects.get, uuid=self.package_set2) + self.assertRaises( + PackageSet.DoesNotExist, PackageSet.objects.get, uuid=self.package_set2 + ) self.assertEqual(1, self.package1.package_sets.count()) self.assertEqual(1, self.package2.package_sets.count()) self.assertEqual(package_set1.uuid, self.package1.package_sets.first().uuid) self.assertEqual(package_set1.uuid, self.package2.package_sets.first().uuid) - self.assertEqual(0, self.package3.package_sets.count()) + self.assertEqual(0, self.package3.package_sets.count()) - self.assertEqual(1, self.package4.package_sets.count()) - self.assertEqual(1, 
self.package5.package_sets.count()) - self.assertEqual(self.package4.package_sets.first(), self.package5.package_sets.first()) + self.assertEqual(1, self.package4.package_sets.count()) + self.assertEqual(1, self.package5.package_sets.count()) + self.assertEqual( + self.package4.package_sets.first(), self.package5.package_sets.first() + ) package_set_for_package4_and_package5 = self.package4.package_sets.first() self.assertEqual(2, package_set_for_package4_and_package5.packages.count()) diff --git a/packagedb/tests/test_models.py b/packagedb/tests/test_models.py index aa9f3334..d4620060 100644 --- a/packagedb/tests/test_models.py +++ b/packagedb/tests/test_models.py @@ -7,35 +7,32 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from dateutil.parser import parse as dateutil_parse +from unittest.mock import patch from django.db import IntegrityError from django.test import TransactionTestCase from django.utils import timezone -from packagedb.models import DependentPackage, PackageWatch +from dateutil.parser import parse as dateutil_parse + +from packagedb.models import DependentPackage from packagedb.models import Package +from packagedb.models import PackageWatch from packagedb.models import Party from packagedb.models import Resource -from unittest.mock import patch - class ResourceModelTestCase(TransactionTestCase): def setUp(self): - self.package = Package.objects.create(download_url='test-pkg.com') - self.resource_paths = [ - 'root/', - 'root/test.json' - ] + self.package = Package.objects.create(download_url="test-pkg.com") + self.resource_paths = ["root/", "root/test.json"] def tearDown(self): Package.objects.all().delete() Resource.objects.all().delete() def test_resource_is_created_on_a_package(self): - Resource.objects.create(package=self.package, - path=self.resource_paths[0]) + Resource.objects.create(package=self.package, path=self.resource_paths[0]) self.assertEqual(1, Resource.objects.all().count()) @@ -50,7 +47,8 @@ def test_duplicate_resources_are_not_created(self): Resource.objects.create(package=self.package, path=path) for path in self.resource_paths: self.assertRaises( - IntegrityError, Resource.objects.create, package=self.package, path=path) + IntegrityError, Resource.objects.create, package=self.package, path=path + ) self.assertEqual(2, Resource.objects.all().count()) @@ -58,87 +56,97 @@ def test_duplicate_resources_are_not_created(self): class PackageModelHistoryFieldTestCase(TransactionTestCase): def setUp(self): self.test_package = Package.objects.create( - download_url='https://test.com', + download_url="https://test.com", ) - self.message0 = 'test-message0' - self.message1 = 'test-message1' - self.message2 = 'test-message2' + self.message0 = "test-message0" + self.message1 = "test-message1" + self.message2 = "test-message2" def test_history_field_append_and_get_one_item(self): self.test_package.append_to_history(self.message0) - expected_date = timezone.now().strftime('%Y-%m-%d') + expected_date = timezone.now().strftime("%Y-%m-%d") expected_message = self.message0 history = self.test_package.get_history()[0] - self.assertIn(expected_date, history.get('timestamp')) - self.assertEqual(expected_message, history.get('message')) + self.assertIn(expected_date, history.get("timestamp")) + self.assertEqual(expected_message, history.get("message")) def test_history_field_append_and_get_multiple_items(self): self.test_package.append_to_history(self.message0) self.test_package.append_to_history(self.message1) 
self.test_package.append_to_history(self.message2) - expected_date = timezone.now().strftime('%Y-%m-%d') + expected_date = timezone.now().strftime("%Y-%m-%d") expected_messages = [ self.message0, self.message1, self.message2, ] - for expected_message, entry in zip(expected_messages, self.test_package.get_history()): - self.assertIn(expected_date, entry.get('timestamp')) - self.assertEqual(expected_message, entry.get('message')) + for expected_message, entry in zip( + expected_messages, self.test_package.get_history() + ): + self.assertIn(expected_date, entry.get("timestamp")) + self.assertEqual(expected_message, entry.get("message")) class PackageModelTestCase(TransactionTestCase): def setUp(self): - self.created_package_download_url = 'https://created-example.com' - self.inserted_package_download_url = 'https://inserted-example.com' + self.created_package_download_url = "https://created-example.com" + self.inserted_package_download_url = "https://inserted-example.com" self.created_package_data = { - 'download_url': self.created_package_download_url, - 'type': 'generic', - 'namespace': 'generic', - 'name': 'Foo', - 'version': '12.34', + "download_url": self.created_package_download_url, + "type": "generic", + "namespace": "generic", + "name": "Foo", + "version": "12.34", } self.inserted_package_data = { - 'download_url': self.inserted_package_download_url, - 'type': 'generic', - 'namespace': 'generic', - 'name': 'Bar', - 'version': '12.34', + "download_url": self.inserted_package_download_url, + "type": "generic", + "namespace": "generic", + "name": "Bar", + "version": "12.34", } - self.created_package = Package.objects.create( - **self.created_package_data) - self.inserted_package = Package.objects.insert( - **self.inserted_package_data) + self.created_package = Package.objects.create(**self.created_package_data) + self.inserted_package = Package.objects.insert(**self.inserted_package_data) def test_package_download_url_is_unique(self): - self.assertIsNone(Package.objects.insert( - download_url=self.created_package_download_url)) - self.assertIsNone(Package.objects.insert( - download_url=self.inserted_package_download_url)) + self.assertIsNone( + Package.objects.insert(download_url=self.created_package_download_url) + ) + self.assertIsNone( + Package.objects.insert(download_url=self.inserted_package_download_url) + ) def test_packagedb_package_model_history_field(self): - self.created_package.append_to_history('test-message') + self.created_package.append_to_history("test-message") for entry in self.created_package.get_history(): - self.assertEqual('test-message', entry.get('message')) + self.assertEqual("test-message", entry.get("message")) def test_packagedb_package_model_get_all_versions(self): p1 = Package.objects.create( - download_url='http://a.a', type='generic', name='name', version='1.0') + download_url="http://a.a", type="generic", name="name", version="1.0" + ) p2 = Package.objects.create( - download_url='http://b.b', type='generic', name='name', version='2.0') + download_url="http://b.b", type="generic", name="name", version="2.0" + ) p3 = Package.objects.create( - download_url='http://c.c', type='generic', name='name', version='3.0') - p4 = Package.objects.create(download_url='http://d.d', type='generic', namespace='space', name='name', - version='4.0') + download_url="http://c.c", type="generic", name="name", version="3.0" + ) + p4 = Package.objects.create( + download_url="http://d.d", + type="generic", + namespace="space", + name="name", + version="4.0", + ) 
self.assertEqual([p1, p2, p3], list(p1.get_all_versions())) self.assertEqual([p1, p2, p3], list(p2.get_all_versions())) @@ -147,13 +155,17 @@ def test_packagedb_package_model_get_all_versions(self): def test_packagedb_package_model_get_latest_version(self): p1 = Package.objects.create( - download_url='http://a.a', name='name', version='1.0') + download_url="http://a.a", name="name", version="1.0" + ) p2 = Package.objects.create( - download_url='http://b.b', name='name', version='2.0') + download_url="http://b.b", name="name", version="2.0" + ) p3 = Package.objects.create( - download_url='http://c.c', name='name', version='3.0') - p4 = Package.objects.create(download_url='http://d.d', namespace='space', name='name', - version='4.0') + download_url="http://c.c", name="name", version="3.0" + ) + p4 = Package.objects.create( + download_url="http://d.d", namespace="space", name="name", version="4.0" + ) self.assertEqual(p3, p1.get_latest_version()) self.assertEqual(p3, p2.get_latest_version()) @@ -162,54 +174,51 @@ def test_packagedb_package_model_get_latest_version(self): def test_packagedb_package_model_update_fields(self): p1 = Package.objects.create( - download_url='http://a.a', name='name', version='1.0') + download_url="http://a.a", name="name", version="1.0" + ) self.assertFalse(p1.history) - self.assertEqual('', p1.namespace) + self.assertEqual("", p1.namespace) self.assertEqual(None, p1.homepage_url) package, updated_fields = p1.update_fields( - namespace='test', homepage_url='https://example.com') + namespace="test", homepage_url="https://example.com" + ) self.assertEqual( - sorted(updated_fields), - sorted(['homepage_url', 'history', 'namespace']) + sorted(updated_fields), sorted(["homepage_url", "history", "namespace"]) ) - self.assertEqual('test', p1.namespace) - self.assertEqual('https://example.com', p1.homepage_url) + self.assertEqual("test", p1.namespace) + self.assertEqual("https://example.com", p1.homepage_url) self.assertEqual(1, len(p1.history)) expected_history_entry = { - 'message': 'Package field values have been updated.', - 'data': { - 'updated_fields': - [ + "message": "Package field values have been updated.", + "data": { + "updated_fields": [ + {"field": "namespace", "old_value": "", "new_value": "test"}, { - 'field': 'namespace', - 'old_value': '', - 'new_value': 'test' + "field": "homepage_url", + "old_value": None, + "new_value": "https://example.com", }, - { - 'field': 'homepage_url', - 'old_value': None, - 'new_value': 'https://example.com' - } ] - } + }, } history_entry = p1.history[0] - history_entry.pop('timestamp') + history_entry.pop("timestamp") self.assertEqual(expected_history_entry, history_entry) def test_packagedb_package_model_update_fields_special_cases(self): p1 = Package.objects.create( - download_url='http://a.a', name='name', version='1.0') + download_url="http://a.a", name="name", version="1.0" + ) # Test dates date_fields = [ - 'created_date', - 'last_indexed_date', - 'release_date', + "created_date", + "last_indexed_date", + "release_date", ] for field in date_fields: value = getattr(p1, field) self.assertEqual(None, value) - timestamp_str = '2017-03-25T14:39:00+00:00' + timestamp_str = "2017-03-25T14:39:00+00:00" package, updated_fields = p1.update_fields( **{field: timestamp_str for field in date_fields} ) @@ -217,32 +226,25 @@ def test_packagedb_package_model_update_fields_special_cases(self): for field in date_fields: value = getattr(package, field) self.assertEqual(timestamp, value) - self.assertEqual( - sorted(updated_fields), - 
sorted(date_fields + ['history']) - ) + self.assertEqual(sorted(updated_fields), sorted(date_fields + ["history"])) # Test qualifiers - self.assertEqual('', p1.qualifiers) + self.assertEqual("", p1.qualifiers) dict_qualifiers1 = { - 'classifier': 'sources', - 'type': 'war', + "classifier": "sources", + "type": "war", } - string_qualifiers1 = 'classifier=sources&type=war' + string_qualifiers1 = "classifier=sources&type=war" package, updated_fields = p1.update_fields(qualifiers=dict_qualifiers1) self.assertEqual( - sorted(['qualifiers', 'history']), + sorted(["qualifiers", "history"]), sorted(updated_fields), ) + self.assertEqual(string_qualifiers1, p1.qualifiers) + string_qualifiers2 = "classifier=somethingelse" + package, updated_fields = p1.update_fields(qualifiers=string_qualifiers2) self.assertEqual( - string_qualifiers1, - p1.qualifiers - ) - string_qualifiers2 = 'classifier=somethingelse' - package, updated_fields = p1.update_fields( - qualifiers=string_qualifiers2) - self.assertEqual( - sorted(['qualifiers', 'history']), + sorted(["qualifiers", "history"]), sorted(updated_fields), ) self.assertEqual( @@ -251,74 +253,77 @@ def test_packagedb_package_model_update_fields_special_cases(self): ) expected_history = [ { - 'message': 'Package field values have been updated.', - 'data': { - 'updated_fields': [ + "message": "Package field values have been updated.", + "data": { + "updated_fields": [ { - 'field': 'created_date', - 'old_value': 'None', - 'new_value': '2017-03-25 14:39:00+00:00' - }, { - 'field': 'last_indexed_date', - 'old_value': 'None', - 'new_value': '2017-03-25 14:39:00+00:00' - }, { - 'field': 'release_date', - 'old_value': 'None', - 'new_value': '2017-03-25 14:39:00+00:00' - } + "field": "created_date", + "old_value": "None", + "new_value": "2017-03-25 14:39:00+00:00", + }, + { + "field": "last_indexed_date", + "old_value": "None", + "new_value": "2017-03-25 14:39:00+00:00", + }, + { + "field": "release_date", + "old_value": "None", + "new_value": "2017-03-25 14:39:00+00:00", + }, ] - } + }, }, { - 'message': 'Package field values have been updated.', - 'data': { - 'updated_fields': [ + "message": "Package field values have been updated.", + "data": { + "updated_fields": [ { - 'field': 'qualifiers', - 'old_value': '', - 'new_value': 'classifier=sources&type=war' + "field": "qualifiers", + "old_value": "", + "new_value": "classifier=sources&type=war", } ] - } + }, }, { - 'message': 'Package field values have been updated.', - 'data': { - 'updated_fields': [ + "message": "Package field values have been updated.", + "data": { + "updated_fields": [ { - 'field': 'qualifiers', - 'old_value': 'classifier=sources&type=war', - 'new_value': 'classifier=somethingelse' + "field": "qualifiers", + "old_value": "classifier=sources&type=war", + "new_value": "classifier=somethingelse", } ] - } - } + }, + }, ] # remove timestamp before comparison history = [] for entry in p1.history: - entry.pop('timestamp') + entry.pop("timestamp") history.append(entry) self.assertEqual(expected_history, history) def test_packagedb_package_model_update_fields_related_models(self): p1 = Package.objects.create( - download_url='http://a.a', name='name', version='1.0') - path = 'asdf' + download_url="http://a.a", name="name", version="1.0" + ) + path = "asdf" resources = [Resource(package=p1, path=path)] _, updated_fields = p1.update_fields(resources=resources) - self.assertEqual( - sorted(['resources', 'history']), - sorted(updated_fields) + self.assertEqual(sorted(["resources", "history"]), 
sorted(updated_fields)) + expected_message = ( + "Replaced 0 existing entries of field 'resources' with 1 new entries." ) - expected_message = "Replaced 0 existing entries of field 'resources' with 1 new entries." self.assertEqual(1, len(p1.history)) - history_message = p1.history[0]['message'] + history_message = p1.history[0]["message"] self.assertEqual(expected_message, history_message) p2 = Package.objects.create( - download_url='http://b.b', name='example', version='1.0') + download_url="http://b.b", name="example", version="1.0" + ) resources = [ { "path": "example.jar", @@ -350,115 +355,113 @@ def test_packagedb_package_model_update_fields_related_models(self): "holders": [], "authors": [], "package_data": [], - "for_packages": [ - - ], + "for_packages": [], "emails": [], "urls": [], - "extra_data": {} + "extra_data": {}, } ] _, updated_fields = p2.update_fields(resources=resources) - self.assertEqual( - sorted(['resources', 'history']), - sorted(updated_fields) + self.assertEqual(sorted(["resources", "history"]), sorted(updated_fields)) + expected_message = ( + "Replaced 0 existing entries of field 'resources' with 1 new entries." ) - expected_message = "Replaced 0 existing entries of field 'resources' with 1 new entries." self.assertEqual(1, len(p2.history)) - history_message = p2.history[0]['message'] + history_message = p2.history[0]["message"] self.assertEqual(expected_message, history_message) p3 = Package.objects.create( - download_url='http://foo', name='foo', version='1.0') + download_url="http://foo", name="foo", version="1.0" + ) parties = [ dict( - type='admin', - role='admin', - name='foo', - email='foo@foo.com', - url='foo.com', + type="admin", + role="admin", + name="foo", + email="foo@foo.com", + url="foo.com", ) ] _, updated_fields = p3.update_fields(parties=parties) - self.assertEqual( - sorted(['parties', 'history']), - sorted(updated_fields) + self.assertEqual(sorted(["parties", "history"]), sorted(updated_fields)) + expected_message = ( + "Replaced 0 existing entries of field 'parties' with 1 new entries." ) - expected_message = "Replaced 0 existing entries of field 'parties' with 1 new entries." self.assertEqual(1, len(p3.history)) - history_message = p3.history[0]['message'] + history_message = p3.history[0]["message"] self.assertEqual(expected_message, history_message) p4 = Package.objects.create( - download_url='http://bar', name='bar', version='1.0') + download_url="http://bar", name="bar", version="1.0" + ) parties = [ Party( package=p4, - type='admin', - role='admin', - name='bar', - email='bar@bar.com', - url='foo.com', + type="admin", + role="admin", + name="bar", + email="bar@bar.com", + url="foo.com", ) ] _, updated_fields = p4.update_fields(parties=parties) - self.assertEqual( - sorted(['parties', 'history']), - sorted(updated_fields) + self.assertEqual(sorted(["parties", "history"]), sorted(updated_fields)) + expected_message = ( + "Replaced 0 existing entries of field 'parties' with 1 new entries." ) - expected_message = "Replaced 0 existing entries of field 'parties' with 1 new entries." 
self.assertEqual(1, len(p4.history)) - history_message = p4.history[0]['message'] + history_message = p4.history[0]["message"] self.assertEqual(expected_message, history_message) p5 = Package.objects.create( - download_url='http://baz', name='baz', version='1.0') + download_url="http://baz", name="baz", version="1.0" + ) dependencies = [ dict( - purl='pkg:baz_dep@1.0', - extracted_requirement='>1', - scope='runtime', + purl="pkg:baz_dep@1.0", + extracted_requirement=">1", + scope="runtime", is_runtime=True, is_optional=False, is_resolved=True, ) ] _, updated_fields = p5.update_fields(dependencies=dependencies) - self.assertEqual( - sorted(['dependencies', 'history']), - sorted(updated_fields) + self.assertEqual(sorted(["dependencies", "history"]), sorted(updated_fields)) + expected_message = ( + "Replaced 0 existing entries of field 'dependencies' with 1 new entries." ) - expected_message = "Replaced 0 existing entries of field 'dependencies' with 1 new entries." self.assertEqual(1, len(p5.history)) - history_message = p5.history[0]['message'] + history_message = p5.history[0]["message"] self.assertEqual(expected_message, history_message) p6 = Package.objects.create( - download_url='http://qux', name='qux', version='1.0') + download_url="http://qux", name="qux", version="1.0" + ) dependencies = [ DependentPackage( package=p6, - purl='pkg:qux_dep@1.0', - extracted_requirement='>1', - scope='runtime', + purl="pkg:qux_dep@1.0", + extracted_requirement=">1", + scope="runtime", is_runtime=True, is_optional=False, is_resolved=True, ) ] _, updated_fields = p6.update_fields(dependencies=dependencies) - self.assertEqual( - sorted(['dependencies', 'history']), - sorted(updated_fields) + self.assertEqual(sorted(["dependencies", "history"]), sorted(updated_fields)) + expected_message = ( + "Replaced 0 existing entries of field 'dependencies' with 1 new entries." ) - expected_message = "Replaced 0 existing entries of field 'dependencies' with 1 new entries." 
self.assertEqual(1, len(p6.history)) - history_message = p6.history[0]['message'] + history_message = p6.history[0]["message"] self.assertEqual(expected_message, history_message) def test_packagedb_package_model_update_fields_exceptions(self): p1 = Package.objects.create( - download_url='http://a.a', name='name', version='1.0') + download_url="http://a.a", name="name", version="1.0" + ) with self.assertRaises(AttributeError): p1.update_fields(asdf=123) @@ -524,20 +527,21 @@ def test_package_watch_reschedule_on_modification(self, mock_create_new_job): self.package_watch1.watch_interval = 1 self.package_watch1.save() - self.assertEqual("reschedule_id_new_interval", - self.package_watch1.schedule_work_id) + self.assertEqual( + "reschedule_id_new_interval", self.package_watch1.schedule_work_id + ) self.package_watch1.is_active = False self.package_watch1.save() self.assertEqual(None, self.package_watch1.schedule_work_id) def test_get_or_none(self): - Package.objects.create(download_url='http://a.ab', - name='name', version='1.0', type="foo") - package = Package.objects.filter( - download_url="http://a.ab" - ).get_or_none() + Package.objects.create( + download_url="http://a.ab", name="name", version="1.0", type="foo" + ) + package = Package.objects.filter(download_url="http://a.ab").get_or_none() assert package - assert Package.objects.filter( - download_url="http://a.ab-foobar" - ).get_or_none() == None + assert ( + Package.objects.filter(download_url="http://a.ab-foobar").get_or_none() + == None + ) diff --git a/packagedb/tests/test_package_managers.py b/packagedb/tests/test_package_managers.py index 90e99d23..5c672177 100644 --- a/packagedb/tests/test_package_managers.py +++ b/packagedb/tests/test_package_managers.py @@ -12,9 +12,9 @@ from datetime import datetime from functools import partial from unittest import mock + from django.test import TestCase -import pytest from dateutil.tz import tzlocal from packageurl import PackageURL @@ -34,84 +34,144 @@ dt_local = partial(datetime, tzinfo=tzlocal()) -class TestPackageManagers(TestCase): +class TestPackageManagers(TestCase): def test_trim_go_url_path(self): - assert GoproxyVersionAPI.trim_go_url_path("https://pkg.go.dev/https://github.com/xx/a/b") == "github.com/xx/a" - assert GoproxyVersionAPI.trim_go_url_path("https://github.com/xx/a/b") == "github.com/xx/a" - + assert ( + GoproxyVersionAPI.trim_go_url_path( + "https://pkg.go.dev/https://github.com/xx/a/b" + ) + == "github.com/xx/a" + ) + assert ( + GoproxyVersionAPI.trim_go_url_path("https://github.com/xx/a/b") + == "github.com/xx/a" + ) def test_nuget_extract_version(self): - with open(os.path.join(TEST_DATA, "nuget-data.json"), "r") as f: + with open(os.path.join(TEST_DATA, "nuget-data.json")) as f: response = json.load(f) results = list(NugetVersionAPI().extract_versions(response)) expected = [ - PackageVersion(value="2.1.0", release_date=dt_local(2011, 1, 22, 13, 34, 8, 550000)), - PackageVersion(value="3.0.0", release_date=dt_local(2011, 11, 24, 0, 26, 2, 527000)), - PackageVersion(value="3.0.3", release_date=dt_local(2011, 11, 27, 13, 50, 2, 63000)), - PackageVersion(value="3.0.4", release_date=dt_local(2011, 12, 12, 10, 18, 33, 380000)), - PackageVersion(value="3.0.5", release_date=dt_local(2011, 12, 12, 12, 0, 25, 947000)), - PackageVersion(value="3.0.6", release_date=dt_local(2012, 1, 2, 21, 10, 43, 403000)), - PackageVersion(value="3.4.0", release_date=dt_local(2013, 10, 20, 13, 32, 30, 837000)), - PackageVersion(value="3.4.1", release_date=dt_local(2014, 1, 17, 9, 17, 43, 
680000)), - PackageVersion(value="3.5.0-beta2", release_date=dt_local(2015, 1, 1, 14, 9, 28, 710000)), - PackageVersion(value="3.5.0-beta3", release_date=dt_local(2015, 1, 6, 17, 39, 25, 147000)), - PackageVersion(value="3.5.0", release_date=dt_local(2015, 1, 14, 2, 1, 58, 853000)), - PackageVersion(value="3.5.1", release_date=dt_local(2015, 1, 23, 1, 5, 44, 447000)), + PackageVersion( + value="2.1.0", release_date=dt_local(2011, 1, 22, 13, 34, 8, 550000) + ), + PackageVersion( + value="3.0.0", release_date=dt_local(2011, 11, 24, 0, 26, 2, 527000) + ), + PackageVersion( + value="3.0.3", release_date=dt_local(2011, 11, 27, 13, 50, 2, 63000) + ), + PackageVersion( + value="3.0.4", release_date=dt_local(2011, 12, 12, 10, 18, 33, 380000) + ), + PackageVersion( + value="3.0.5", release_date=dt_local(2011, 12, 12, 12, 0, 25, 947000) + ), + PackageVersion( + value="3.0.6", release_date=dt_local(2012, 1, 2, 21, 10, 43, 403000) + ), + PackageVersion( + value="3.4.0", release_date=dt_local(2013, 10, 20, 13, 32, 30, 837000) + ), + PackageVersion( + value="3.4.1", release_date=dt_local(2014, 1, 17, 9, 17, 43, 680000) + ), + PackageVersion( + value="3.5.0-beta2", + release_date=dt_local(2015, 1, 1, 14, 9, 28, 710000), + ), + PackageVersion( + value="3.5.0-beta3", + release_date=dt_local(2015, 1, 6, 17, 39, 25, 147000), + ), + PackageVersion( + value="3.5.0", release_date=dt_local(2015, 1, 14, 2, 1, 58, 853000) + ), + PackageVersion( + value="3.5.1", release_date=dt_local(2015, 1, 23, 1, 5, 44, 447000) + ), ] assert results == expected - def test_nuget_extract_version_with_illformed_data(self): test_data = {"items": [{"items": [{"catalogEntry": {}}]}]} results = list(NugetVersionAPI.extract_versions(test_data)) assert results == [] - @mock.patch("packagedb.package_managers.get_response") def test_pypi_fetch_data(self, mock_response): pypi_api = PypiVersionAPI() - with open(os.path.join(TEST_DATA, "pypi.json"), "r") as f: + with open(os.path.join(TEST_DATA, "pypi.json")) as f: mock_response.return_value = json.load(f) results = list(pypi_api.fetch("django")) expected = [ - PackageVersion(value="1.1.3", release_date=dt_local(2010, 12, 23, 5, 14, 23, 509436)), - PackageVersion(value="1.1.4", release_date=dt_local(2011, 2, 9, 4, 13, 7, 75)), - PackageVersion(value="1.10", release_date=dt_local(2016, 8, 1, 18, 32, 16, 280614)), - PackageVersion(value="1.10.1", release_date=dt_local(2016, 9, 1, 23, 18, 18, 672706)), - PackageVersion(value="1.10.2", release_date=dt_local(2016, 10, 1, 20, 5, 31, 330942)), - PackageVersion(value="1.10.3", release_date=dt_local(2016, 11, 1, 13, 57, 16, 55061)), - PackageVersion(value="1.10.4", release_date=dt_local(2016, 12, 1, 23, 46, 50, 215935)), - PackageVersion(value="1.10.5", release_date=dt_local(2017, 1, 4, 19, 23, 0, 596664)), - PackageVersion(value="1.10.6", release_date=dt_local(2017, 3, 1, 13, 37, 40, 243134)), - PackageVersion(value="1.10.7", release_date=dt_local(2017, 4, 4, 14, 27, 54, 235551)), - PackageVersion(value="1.10.8", release_date=dt_local(2017, 9, 5, 15, 31, 58, 221021)), - PackageVersion(value="1.10a1", release_date=dt_local(2016, 5, 20, 12, 24, 59, 952686)), - PackageVersion(value="1.10b1", release_date=dt_local(2016, 6, 22, 1, 15, 17, 267637)), - PackageVersion(value="1.10rc1", release_date=dt_local(2016, 7, 18, 18, 5, 5, 503584)), + PackageVersion( + value="1.1.3", release_date=dt_local(2010, 12, 23, 5, 14, 23, 509436) + ), + PackageVersion( + value="1.1.4", release_date=dt_local(2011, 2, 9, 4, 13, 7, 75) + ), + PackageVersion( + value="1.10", 
release_date=dt_local(2016, 8, 1, 18, 32, 16, 280614) + ), + PackageVersion( + value="1.10.1", release_date=dt_local(2016, 9, 1, 23, 18, 18, 672706) + ), + PackageVersion( + value="1.10.2", release_date=dt_local(2016, 10, 1, 20, 5, 31, 330942) + ), + PackageVersion( + value="1.10.3", release_date=dt_local(2016, 11, 1, 13, 57, 16, 55061) + ), + PackageVersion( + value="1.10.4", release_date=dt_local(2016, 12, 1, 23, 46, 50, 215935) + ), + PackageVersion( + value="1.10.5", release_date=dt_local(2017, 1, 4, 19, 23, 0, 596664) + ), + PackageVersion( + value="1.10.6", release_date=dt_local(2017, 3, 1, 13, 37, 40, 243134) + ), + PackageVersion( + value="1.10.7", release_date=dt_local(2017, 4, 4, 14, 27, 54, 235551) + ), + PackageVersion( + value="1.10.8", release_date=dt_local(2017, 9, 5, 15, 31, 58, 221021) + ), + PackageVersion( + value="1.10a1", release_date=dt_local(2016, 5, 20, 12, 24, 59, 952686) + ), + PackageVersion( + value="1.10b1", release_date=dt_local(2016, 6, 22, 1, 15, 17, 267637) + ), + PackageVersion( + value="1.10rc1", release_date=dt_local(2016, 7, 18, 18, 5, 5, 503584) + ), ] assert results == expected - @mock.patch("packagedb.package_managers.get_response") def test_pypi_fetch_with_no_release(self, mock_response): mock_response.return_value = {"info": {}} results = list(PypiVersionAPI().fetch("django")) assert results == [] - @mock.patch("packagedb.package_managers.get_response") - def test_ruby_fetch_with_no_release(self,mock_response): - + def test_ruby_fetch_with_no_release(self, mock_response): with open(os.path.join(TEST_DATA, "gem.json")) as f: mock_response.return_value = json.load(f) results = list(RubyVersionAPI().fetch("rails")) expected = [ - PackageVersion(value="7.0.2.3", release_date=dt_local(2022, 3, 8, 17, 50, 52, 496000)), - PackageVersion(value="7.0.2.2", release_date=dt_local(2022, 2, 11, 19, 44, 19, 17000)), + PackageVersion( + value="7.0.2.3", release_date=dt_local(2022, 3, 8, 17, 50, 52, 496000) + ), + PackageVersion( + value="7.0.2.2", release_date=dt_local(2022, 2, 11, 19, 44, 19, 17000) + ), ] assert results == expected @@ -124,7 +184,6 @@ def test_get_version_fetcher(self): class TestComposerVersionAPI(TestCase): - expected_versions = [ PackageVersion(value="10.0.0", release_date=dt_local(2019, 7, 23, 7, 6, 3)), PackageVersion(value="10.1.0", release_date=dt_local(2019, 10, 1, 8, 18, 18)), @@ -198,7 +257,9 @@ def test_extract_versions(self): with open(os.path.join(TEST_DATA, "composer.json")) as f: mock_response = json.load(f) - results = list(ComposerVersionAPI().extract_versions(mock_response, "typo3/cms-core")) + results = list( + ComposerVersionAPI().extract_versions(mock_response, "typo3/cms-core") + ) assert results == self.expected_versions @mock.patch("packagedb.package_managers.get_response") @@ -218,7 +279,11 @@ def test_extract_versions(self): mock_response = ET.parse(f) results = list(MavenVersionAPI().extract_versions(mock_response)) - expected = [PackageVersion("1.2.2"), PackageVersion("1.2.3"), PackageVersion("1.3.0")] + expected = [ + PackageVersion("1.2.2"), + PackageVersion("1.2.3"), + PackageVersion("1.3.0"), + ] assert results == expected def test_artifact_url(self): @@ -228,7 +293,9 @@ def test_artifact_url(self): url1 = MavenVersionAPI.artifact_url(eg_comps1) url2 = MavenVersionAPI.artifact_url(eg_comps2) - assert url1 == "https://repo1.maven.org/maven2/org/apache/kafka/maven-metadata.xml" + assert ( + url1 == "https://repo1.maven.org/maven2/org/apache/kafka/maven-metadata.xml" + ) assert ( url2 == 
"https://repo1.maven.org/maven2/apple/msft/windows/mac/oss/exfat-ntfs/maven-metadata.xml" @@ -259,15 +326,20 @@ def test_fetch(self, mock_response): class TestGoproxyVersionAPI(TestCase): def test_trim_go_url_path(self): - url1 = "https://pkg.go.dev/github.com/containous/traefik/v2" - assert GoproxyVersionAPI.trim_go_url_path(url1) == "github.com/containous/traefik" + assert ( + GoproxyVersionAPI.trim_go_url_path(url1) == "github.com/containous/traefik" + ) url2 = "github.com/FerretDB/FerretDB/cmd/ferretdb" - assert GoproxyVersionAPI.trim_go_url_path(url2) == "github.com/FerretDB/FerretDB" + assert ( + GoproxyVersionAPI.trim_go_url_path(url2) == "github.com/FerretDB/FerretDB" + ) url3 = GoproxyVersionAPI.trim_go_url_path(url2) - assert GoproxyVersionAPI.trim_go_url_path(url3) == "github.com/FerretDB/FerretDB" + assert ( + GoproxyVersionAPI.trim_go_url_path(url3) == "github.com/FerretDB/FerretDB" + ) def test_escape_path(self): path = "github.com/FerretDB/FerretDB" @@ -276,7 +348,10 @@ def test_escape_path(self): @mock.patch("packagedb.package_managers.get_response") def test_fetch_version_info(self, mock_response): - mock_response.return_value = {"Version": "v0.0.5", "Time": "2022-01-04T13:54:01Z"} + mock_response.return_value = { + "Version": "v0.0.5", + "Time": "2022-01-04T13:54:01Z", + } result = GoproxyVersionAPI.fetch_version_info( "v0.0.5", "github.com/!ferret!d!b/!ferret!d!b", @@ -303,11 +378,21 @@ def test_fetch(self, mock_fetcher): results = list(GoproxyVersionAPI().fetch("github.com/FerretDB/FerretDB")) expected = [ - PackageVersion(value="v0.0.1", release_date=dt_local(2021, 11, 2, 6, 56, 38)), - PackageVersion(value="v0.0.5", release_date=dt_local(2021, 11, 13, 21, 36, 37)), - PackageVersion(value="v0.0.3", release_date=dt_local(2021, 11, 19, 20, 31, 22)), - PackageVersion(value="v0.0.4", release_date=dt_local(2021, 12, 1, 19, 2, 44)), - PackageVersion(value="v0.0.2", release_date=dt_local(2022, 1, 4, 13, 54, 1)), + PackageVersion( + value="v0.0.1", release_date=dt_local(2021, 11, 2, 6, 56, 38) + ), + PackageVersion( + value="v0.0.5", release_date=dt_local(2021, 11, 13, 21, 36, 37) + ), + PackageVersion( + value="v0.0.3", release_date=dt_local(2021, 11, 19, 20, 31, 22) + ), + PackageVersion( + value="v0.0.4", release_date=dt_local(2021, 12, 1, 19, 2, 44) + ), + PackageVersion( + value="v0.0.2", release_date=dt_local(2022, 1, 4, 13, 54, 1) + ), ] assert results == expected @@ -323,22 +408,49 @@ def test_fetch_with_responses_are_none(self, mock_fetcher): class TestNugetVersionAPI(TestCase): expected_versions = [ - PackageVersion(value="0.23.0", release_date=dt_local(2018, 1, 17, 9, 32, 59, 283000)), - PackageVersion(value="0.24.0", release_date=dt_local(2018, 3, 30, 7, 25, 18, 393000)), - PackageVersion(value="1.0.0", release_date=dt_local(2018, 9, 13, 8, 16, 0, 420000)), - PackageVersion(value="1.0.1", release_date=dt_local(2020, 1, 17, 15, 31, 41, 857000)), - PackageVersion(value="1.0.2", release_date=dt_local(2020, 4, 21, 12, 24, 53, 877000)), PackageVersion( - value="2.0.0-preview01", release_date=dt_local(2018, 1, 9, 17, 12, 20, 440000) + value="0.23.0", release_date=dt_local(2018, 1, 17, 9, 32, 59, 283000) + ), + PackageVersion( + value="0.24.0", release_date=dt_local(2018, 3, 30, 7, 25, 18, 393000) + ), + PackageVersion( + value="1.0.0", release_date=dt_local(2018, 9, 13, 8, 16, 0, 420000) + ), + PackageVersion( + value="1.0.1", release_date=dt_local(2020, 1, 17, 15, 31, 41, 857000) + ), + PackageVersion( + value="1.0.2", release_date=dt_local(2020, 4, 21, 12, 24, 53, 
877000) + ), + PackageVersion( + value="2.0.0-preview01", + release_date=dt_local(2018, 1, 9, 17, 12, 20, 440000), + ), + PackageVersion( + value="2.0.0", release_date=dt_local(2018, 9, 27, 13, 33, 15, 370000) + ), + PackageVersion( + value="2.1.0", release_date=dt_local(2018, 10, 16, 6, 59, 44, 680000) + ), + PackageVersion( + value="2.2.0", release_date=dt_local(2018, 11, 23, 8, 13, 8, 3000) + ), + PackageVersion( + value="2.3.0", release_date=dt_local(2019, 6, 27, 14, 27, 31, 613000) + ), + PackageVersion( + value="2.4.0", release_date=dt_local(2020, 1, 17, 15, 11, 5, 810000) + ), + PackageVersion( + value="2.5.0", release_date=dt_local(2020, 3, 24, 14, 22, 39, 960000) + ), + PackageVersion( + value="2.6.0", release_date=dt_local(2020, 3, 27, 11, 6, 27, 500000) + ), + PackageVersion( + value="2.7.0", release_date=dt_local(2020, 4, 21, 12, 27, 36, 427000) ), - PackageVersion(value="2.0.0", release_date=dt_local(2018, 9, 27, 13, 33, 15, 370000)), - PackageVersion(value="2.1.0", release_date=dt_local(2018, 10, 16, 6, 59, 44, 680000)), - PackageVersion(value="2.2.0", release_date=dt_local(2018, 11, 23, 8, 13, 8, 3000)), - PackageVersion(value="2.3.0", release_date=dt_local(2019, 6, 27, 14, 27, 31, 613000)), - PackageVersion(value="2.4.0", release_date=dt_local(2020, 1, 17, 15, 11, 5, 810000)), - PackageVersion(value="2.5.0", release_date=dt_local(2020, 3, 24, 14, 22, 39, 960000)), - PackageVersion(value="2.6.0", release_date=dt_local(2020, 3, 27, 11, 6, 27, 500000)), - PackageVersion(value="2.7.0", release_date=dt_local(2020, 4, 21, 12, 27, 36, 427000)), ] def test_extract_versions(self): diff --git a/packagedb/tests/test_schedules.py b/packagedb/tests/test_schedules.py index c2c5ae02..2965fa60 100644 --- a/packagedb/tests/test_schedules.py +++ b/packagedb/tests/test_schedules.py @@ -39,7 +39,5 @@ def test_get_next_execution(): with patch("datetime.datetime", wraps=datetime.datetime) as dt: dt.now.return_value = time_now - assert expected1 == get_next_execution( - watch_interval_days1, last_watch_date1) - assert expected2 == get_next_execution( - watch_interval_days2, last_watch_date2) + assert expected1 == get_next_execution(watch_interval_days1, last_watch_date1) + assert expected2 == get_next_execution(watch_interval_days2, last_watch_date2) diff --git a/packagedb/tests/test_tasks.py b/packagedb/tests/test_tasks.py index 851a36e6..6677e2f1 100644 --- a/packagedb/tests/test_tasks.py +++ b/packagedb/tests/test_tasks.py @@ -10,6 +10,7 @@ from unittest.mock import patch from django.test import TestCase + from fetchcode.package_versions import PackageVersion from minecode.models import PriorityResourceURI diff --git a/packagedb/tests/test_throttling.py b/packagedb/tests/test_throttling.py index f21a26d8..d8dcc5eb 100644 --- a/packagedb/tests/test_throttling.py +++ b/packagedb/tests/test_throttling.py @@ -7,22 +7,21 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# -from rest_framework.test import APIClient -from rest_framework.test import APITestCase from unittest.mock import patch from django.contrib.auth.models import User +from rest_framework.test import APIClient +from rest_framework.test import APITestCase + -@patch('rest_framework.throttling.UserRateThrottle.get_rate', lambda x: '20/day') -@patch('rest_framework.throttling.AnonRateThrottle.get_rate', lambda x: '10/day') +@patch("rest_framework.throttling.UserRateThrottle.get_rate", lambda x: "20/day") +@patch("rest_framework.throttling.AnonRateThrottle.get_rate", lambda x: "10/day") class ThrottleApiTests(APITestCase): def setUp(self): # create a basic user self.user = User.objects.create_user( - username="username", - email="e@mail.com", - password="secret" + username="username", email="e@mail.com", password="secret" ) self.auth = f"Token {self.user.auth_token.key}" self.csrf_client = APIClient(enforce_csrf_checks=True) @@ -33,7 +32,7 @@ def setUp(self): username="staff_username", email="staff_e@mail.com", password="secret", - is_staff=True + is_staff=True, ) self.staff_auth = f"Token {self.staff_user.auth_token.key}" self.staff_csrf_client = APIClient(enforce_csrf_checks=True) @@ -43,36 +42,36 @@ def setUp(self): def test_package_endpoint_throttling(self): for i in range(0, 20): - response = self.csrf_client.get('/api/packages/') + response = self.csrf_client.get("/api/packages/") self.assertEqual(response.status_code, 200) - response = self.staff_csrf_client.get('/api/packages/') + response = self.staff_csrf_client.get("/api/packages/") self.assertEqual(response.status_code, 200) - response = self.csrf_client.get('/api/packages/') + response = self.csrf_client.get("/api/packages/") # 429 - too many requests for basic user self.assertEqual(response.status_code, 429) - response = self.staff_csrf_client.get('/api/packages/', format='json') + response = self.staff_csrf_client.get("/api/packages/", format="json") # 200 - staff user can access API unlimited times self.assertEqual(response.status_code, 200) # An anonymous user can only access /packages endpoint 10 times a day for i in range(0, 10): - response = self.csrf_client_anon.get('/api/packages/') + response = self.csrf_client_anon.get("/api/packages/") self.assertEqual(response.status_code, 200) - response = self.csrf_client_anon.get('/api/packages/') + response = self.csrf_client_anon.get("/api/packages/") # 429 - too many requests for anon user self.assertEqual(response.status_code, 429) self.assertEqual( - response.data.get('message'), - 'Your request has been throttled. Please contact support@nexb.com', + response.data.get("message"), + "Your request has been throttled. Please contact support@nexb.com", ) - response = self.csrf_client_anon.get('/api/resources/') + response = self.csrf_client_anon.get("/api/resources/") # 429 - too many requests for anon user self.assertEqual(response.status_code, 429) self.assertEqual( - response.data.get('message'), - 'Your request has been throttled. 
Please contact support@nexb.com", ) diff --git a/packagedb/tests/test_views.py b/packagedb/tests/test_views.py index 227e3885..2a0ae9b7 100644 --- a/packagedb/tests/test_views.py +++ b/packagedb/tests/test_views.py @@ -12,10 +12,10 @@ class TestViews(TestCase): def test_robots_txt(self): - response = self.client.get('/robots.txt') + response = self.client.get("/robots.txt") assert response.status_code == 200 - assert response['content-type'] == 'text/plain' - assert response.content == b'User-agent: *\nDisallow: *\n' + assert response["content-type"] == "text/plain" + assert response.content == b"User-agent: *\nDisallow: *\n" response = self.client.post("/robots.txt") assert response.status_code == 405 diff --git a/packagedb/throttling.py b/packagedb/throttling.py index 6f310043..ac1dee95 100644 --- a/packagedb/throttling.py +++ b/packagedb/throttling.py @@ -13,9 +13,7 @@ class StaffUserRateThrottle(UserRateThrottle): def allow_request(self, request, view): - """ - Do not apply throttling for superusers and admins. - """ + """Do not apply throttling for superusers and admins.""" if request.user.is_superuser or request.user.is_staff: return True @@ -23,10 +21,7 @@ def allow_request(self, request, view): def throttled_exception_handler(exception, context): - """ - Return this response whenever a request has been throttled - """ - + """Return this response whenever a request has been throttled""" response = exception_handler(exception, context) if isinstance(exception, Throttled): diff --git a/packagedb/to_purl.py b/packagedb/to_purl.py index 0fd03afa..9eac48b1 100644 --- a/packagedb/to_purl.py +++ b/packagedb/to_purl.py @@ -21,8 +21,7 @@ @extend_schema( parameters=[ - OpenApiParameter("go_package", str, "query", - description="go import package"), + OpenApiParameter("go_package", str, "query", description="go import package"), ], responses={200: GoLangPurlResponseSerializer()}, ) diff --git a/purldb_project/__init__.py b/purldb_project/__init__.py index fd15ad83..8e16890b 100644 --- a/purldb_project/__init__.py +++ b/purldb_project/__init__.py @@ -14,8 +14,8 @@ def command_line(): - '''Command line entry point.''' + """Command line entry point.""" from django.core.management import execute_from_command_line - os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'purldb_project.settings') + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "purldb_project.settings") execute_from_command_line(sys.argv) diff --git a/purldb_project/settings.py b/purldb_project/settings.py index 28c89180..aacea674 100644 --- a/purldb_project/settings.py +++ b/purldb_project/settings.py @@ -11,6 +11,7 @@ from pathlib import Path import environ + from purldb_project import __version__ PURLDB_VERSION = __version__ @@ -32,15 +33,12 @@ SECRET_KEY = env.str("SECRET_KEY") -ALLOWED_HOSTS = env.list("ALLOWED_HOSTS", default=[ - ".localhost", "127.0.0.1", "[::1]"]) +ALLOWED_HOSTS = env.list("ALLOWED_HOSTS", default=[".localhost", "127.0.0.1", "[::1]"]) # SECURITY WARNING: do not run with debug turned on in production DEBUG = env.bool("PURLDB_DEBUG", default=False) -PURLDB_REQUIRE_AUTHENTICATION = env.bool( - "PURLDB_REQUIRE_AUTHENTICATION", default=False -) +PURLDB_REQUIRE_AUTHENTICATION = env.bool("PURLDB_REQUIRE_AUTHENTICATION", default=False) # SECURITY WARNING: do not run with debug turned on in production DEBUG_TOOLBAR = env.bool("PURLDB_DEBUG_TOOLBAR", default=False) @@ -62,38 +60,38 @@ INSTALLED_APPS = ( # Local apps # Must come before Third-party apps for proper templates override - 'clearcode', - 'clearindex', - 
'minecode', - 'matchcode', - 'packagedb', + "clearcode", + "clearindex", + "minecode", + "matchcode", + "packagedb", # Django built-in "django.contrib.auth", - 'django.contrib.contenttypes', - 'django.contrib.sessions', - 'django.contrib.messages', - 'django.contrib.staticfiles', - 'django.contrib.admin', + "django.contrib.contenttypes", + "django.contrib.sessions", + "django.contrib.messages", + "django.contrib.staticfiles", + "django.contrib.admin", "django.contrib.humanize", # Third-party apps - 'django_filters', - 'rest_framework', - 'drf_spectacular', - 'rest_framework.authtoken', - 'django_rq', + "django_filters", + "rest_framework", + "drf_spectacular", + "rest_framework.authtoken", + "django_rq", ) MIDDLEWARE = ( "django.middleware.security.SecurityMiddleware", - 'django.contrib.sessions.middleware.SessionMiddleware', - 'django.middleware.common.CommonMiddleware', - 'django.middleware.csrf.CsrfViewMiddleware', - 'django.contrib.auth.middleware.AuthenticationMiddleware', - 'django.contrib.messages.middleware.MessageMiddleware', - 'django.middleware.clickjacking.XFrameOptionsMiddleware', + "django.contrib.sessions.middleware.SessionMiddleware", + "django.middleware.common.CommonMiddleware", + "django.middleware.csrf.CsrfViewMiddleware", + "django.contrib.auth.middleware.AuthenticationMiddleware", + "django.contrib.messages.middleware.MessageMiddleware", + "django.middleware.clickjacking.XFrameOptionsMiddleware", ) -ROOT_URLCONF = 'purldb_project.urls' +ROOT_URLCONF = "purldb_project.urls" WSGI_APPLICATION = "purldb_project.wsgi.application" @@ -103,20 +101,18 @@ # API -DATA_UPLOAD_MAX_NUMBER_FIELDS = env.int( - "DATA_UPLOAD_MAX_NUMBER_FIELDS", default=2048 -) +DATA_UPLOAD_MAX_NUMBER_FIELDS = env.int("DATA_UPLOAD_MAX_NUMBER_FIELDS", default=2048) # Database DATABASES = { - 'default': { - 'ENGINE': env.str('PACKAGEDB_DB_ENGINE', 'django.db.backends.postgresql'), - 'HOST': env.str('PACKAGEDB_DB_HOST', 'localhost'), - 'NAME': env.str('PACKAGEDB_DB_NAME', 'packagedb'), - 'USER': env.str('PACKAGEDB_DB_USER', 'packagedb'), - 'PASSWORD': env.str('PACKAGEDB_DB_PASSWORD', 'packagedb'), - 'PORT': env.str('PACKAGEDB_DB_PORT', '5432'), - 'ATOMIC_REQUESTS': True, + "default": { + "ENGINE": env.str("PACKAGEDB_DB_ENGINE", "django.db.backends.postgresql"), + "HOST": env.str("PACKAGEDB_DB_HOST", "localhost"), + "NAME": env.str("PACKAGEDB_DB_NAME", "packagedb"), + "USER": env.str("PACKAGEDB_DB_USER", "packagedb"), + "PASSWORD": env.str("PACKAGEDB_DB_PASSWORD", "packagedb"), + "PORT": env.str("PACKAGEDB_DB_PORT", "5432"), + "ATOMIC_REQUESTS": True, } } @@ -126,15 +122,15 @@ TEMPLATES = [ { - 'BACKEND': 'django.template.backends.django.DjangoTemplates', + "BACKEND": "django.template.backends.django.DjangoTemplates", "DIRS": [str(PROJECT_DIR.joinpath("templates"))], "APP_DIRS": True, - 'OPTIONS': { + "OPTIONS": { "debug": DEBUG, - 'context_processors': [ - 'django.contrib.auth.context_processors.auth', - 'django.contrib.messages.context_processors.messages', - 'django.template.context_processors.request', + "context_processors": [ + "django.contrib.auth.context_processors.auth", + "django.contrib.messages.context_processors.messages", + "django.template.context_processors.request", "django.template.context_processors.static", ], }, @@ -174,8 +170,8 @@ # Cache CACHES = { - 'default': { - 'BACKEND': 'django.core.cache.backends.locmem.LocMemCache', + "default": { + "BACKEND": "django.core.cache.backends.locmem.LocMemCache", "LOCATION": "default", } } @@ -229,42 +225,43 @@ # Static files (CSS, JavaScript, 
Images) -STATIC_URL = '/static/' +STATIC_URL = "/static/" -STATIC_ROOT = '/var/purldb/static/' +STATIC_ROOT = "/var/purldb/static/" STATICFILES_DIRS = [ - PROJECT_DIR / 'static', + PROJECT_DIR / "static", ] # Third-party apps # Django restframework -REST_FRAMEWORK_DEFAULT_THROTTLE_RATES = { - 'anon': '3600/hour', 'user': '10800/hour'} +REST_FRAMEWORK_DEFAULT_THROTTLE_RATES = {"anon": "3600/hour", "user": "10800/hour"} REST_FRAMEWORK = { - 'DEFAULT_AUTHENTICATION_CLASSES': ('rest_framework.authentication.TokenAuthentication',), - 'DEFAULT_PERMISSION_CLASSES': ('rest_framework.permissions.IsAuthenticated',), - 'DEFAULT_RENDERER_CLASSES': ( - 'rest_framework.renderers.JSONRenderer', - 'rest_framework.renderers.BrowsableAPIRenderer', - 'rest_framework.renderers.AdminRenderer', + "DEFAULT_AUTHENTICATION_CLASSES": ( + "rest_framework.authentication.TokenAuthentication", + ), + "DEFAULT_PERMISSION_CLASSES": ("rest_framework.permissions.IsAuthenticated",), + "DEFAULT_RENDERER_CLASSES": ( + "rest_framework.renderers.JSONRenderer", + "rest_framework.renderers.BrowsableAPIRenderer", + "rest_framework.renderers.AdminRenderer", ), - 'DEFAULT_FILTER_BACKENDS': ( - 'django_filters.rest_framework.DjangoFilterBackend', - 'rest_framework.filters.SearchFilter', + "DEFAULT_FILTER_BACKENDS": ( + "django_filters.rest_framework.DjangoFilterBackend", + "rest_framework.filters.SearchFilter", ), - 'DEFAULT_THROTTLE_CLASSES': [ - 'packagedb.throttling.StaffUserRateThrottle', - 'rest_framework.throttling.AnonRateThrottle', - 'rest_framework.throttling.UserRateThrottle', + "DEFAULT_THROTTLE_CLASSES": [ + "packagedb.throttling.StaffUserRateThrottle", + "rest_framework.throttling.AnonRateThrottle", + "rest_framework.throttling.UserRateThrottle", ], - 'DEFAULT_THROTTLE_RATES': REST_FRAMEWORK_DEFAULT_THROTTLE_RATES, - 'EXCEPTION_HANDLER': 'packagedb.throttling.throttled_exception_handler', - 'DEFAULT_PAGINATION_CLASS': 'packagedb.api_custom.PageSizePagination', - 'DEFAULT_SCHEMA_CLASS': 'drf_spectacular.openapi.AutoSchema', + "DEFAULT_THROTTLE_RATES": REST_FRAMEWORK_DEFAULT_THROTTLE_RATES, + "EXCEPTION_HANDLER": "packagedb.throttling.throttled_exception_handler", + "DEFAULT_PAGINATION_CLASS": "packagedb.api_custom.PageSizePagination", + "DEFAULT_SCHEMA_CLASS": "drf_spectacular.openapi.AutoSchema", # Limit the load on the Database returning a small number of records by default. 
https://github.com/aboutcode-org/vulnerablecode/issues/819 "PAGE_SIZE": 20, } @@ -302,18 +299,18 @@ # Active seeders: each active seeder class needs to be added explicitly here ACTIVE_SEEDERS = [ - 'minecode.miners.maven.MavenSeed', + "minecode.miners.maven.MavenSeed", ] SPECTACULAR_SETTINGS = { - 'TITLE': 'PurlDB API', - 'DESCRIPTION': 'Tools to create and expose a database of purls (Package URLs)', - 'VERSION': PURLDB_VERSION, - 'SERVE_INCLUDE_SCHEMA': False, + "TITLE": "PurlDB API", + "DESCRIPTION": "Tools to create and expose a database of purls (Package URLs)", + "VERSION": PURLDB_VERSION, + "SERVE_INCLUDE_SCHEMA": False, } RQ_QUEUES = { - 'default': { + "default": { "HOST": env.str("PURLDB_REDIS_HOST", default="localhost"), "PORT": env.str("PURLDB_REDIS_PORT", default="6379"), "PASSWORD": env.str("PURLDB_REDIS_PASSWORD", default=""), diff --git a/purldb_project/urls.py b/purldb_project/urls.py index 36e17371..c3e8fab5 100644 --- a/purldb_project/urls.py +++ b/purldb_project/urls.py @@ -11,6 +11,7 @@ from django.urls import path from django.views.generic import RedirectView from django.views.generic.base import TemplateView + from drf_spectacular.views import SpectacularAPIView from drf_spectacular.views import SpectacularSwaggerView from rest_framework import routers @@ -30,34 +31,40 @@ from packagedb.to_purl import api_to_purl_router api_router = routers.DefaultRouter() -api_router.register('packages', PackageViewSet) -api_router.register('update_packages', PackageUpdateSet, 'update_packages') -api_router.register('package_sets', PackageSetViewSet) -api_router.register('resources', ResourceViewSet) -api_router.register('validate', PurlValidateViewSet, 'validate') -api_router.register('collect', CollectViewSet, 'collect') -api_router.register('watch', PackageWatchViewSet) -api_router.register('scan_queue', ScannableURIViewSet) -api_router.register('approximate_directory_content_index', - ApproximateDirectoryContentIndexViewSet) -api_router.register('approximate_directory_structure_index', - ApproximateDirectoryStructureIndexViewSet) +api_router.register("packages", PackageViewSet) +api_router.register("update_packages", PackageUpdateSet, "update_packages") +api_router.register("package_sets", PackageSetViewSet) +api_router.register("resources", ResourceViewSet) +api_router.register("validate", PurlValidateViewSet, "validate") +api_router.register("collect", CollectViewSet, "collect") +api_router.register("watch", PackageWatchViewSet) +api_router.register("scan_queue", ScannableURIViewSet) +api_router.register( + "approximate_directory_content_index", ApproximateDirectoryContentIndexViewSet ) +api_router.register( + "approximate_directory_structure_index", ApproximateDirectoryStructureIndexViewSet ) urlpatterns = [ path( - 'robots.txt', - TemplateView.as_view(template_name='robots.txt', - content_type='text/plain'), + "robots.txt", + TemplateView.as_view(template_name="robots.txt", content_type="text/plain"), ), - path('api/', include((api_router.urls, 'api'))), - path('api/to_purl/', include((api_to_purl_router.urls, 'api_to'))), - path('api/from_purl/', include((api_from_purl_router.urls, 'api_from'))), + path("api/", include((api_router.urls, "api"))), + path("api/to_purl/", include((api_to_purl_router.urls, "api_to"))), + path("api/from_purl/", include((api_from_purl_router.urls, "api_from"))), path("", RedirectView.as_view(url="api/")), - path('api/schema/', SpectacularAPIView.as_view(), name='schema'), - path('api/docs/', SpectacularSwaggerView.as_view(url_name='schema'), - 
name='swagger-ui'), + path("api/schema/", SpectacularAPIView.as_view(), name="schema"), + path( + "api/docs/", + SpectacularSwaggerView.as_view(url_name="schema"), + name="swagger-ui", + ), path( - 'api/scan_queue/index_package_scan/<key>/', index_package_scan, name='index_package_scan' + "api/scan_queue/index_package_scan/<key>/", + index_package_scan, + name="index_package_scan", ), ] diff --git a/purldb_project/wsgi.py b/purldb_project/wsgi.py index 485a66e1..bfb452a3 100644 --- a/purldb_project/wsgi.py +++ b/purldb_project/wsgi.py @@ -8,8 +8,8 @@ # import os -from django.core.wsgi import get_wsgi_application +from django.core.wsgi import get_wsgi_application """ WSGI config for purldb. @@ -18,6 +18,6 @@ """ -os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'purldb_project.settings') +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "purldb_project.settings") application = get_wsgi_application() diff --git a/purldb_public_project/__init__.py b/purldb_public_project/__init__.py index adb81bf9..a77c6343 100644 --- a/purldb_public_project/__init__.py +++ b/purldb_public_project/__init__.py @@ -12,9 +12,8 @@ def command_line(): - '''Command line entry point.''' + """Command line entry point.""" from django.core.management import execute_from_command_line - os.environ.setdefault('DJANGO_SETTINGS_MODULE', - 'purldb_public_project.settings') + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "purldb_public_project.settings") execute_from_command_line(sys.argv) diff --git a/purldb_public_project/settings.py b/purldb_public_project/settings.py index 5e6d5bee..f990759e 100644 --- a/purldb_public_project/settings.py +++ b/purldb_public_project/settings.py @@ -9,7 +9,6 @@ from purldb_project.settings import * -ROOT_URLCONF = 'purldb_public_project.urls' - -WSGI_APPLICATION = 'purldb_public_project.wsgi.application' +ROOT_URLCONF = "purldb_public_project.urls" +WSGI_APPLICATION = "purldb_public_project.wsgi.application" diff --git a/purldb_public_project/urls.py b/purldb_public_project/urls.py index 14e5546a..fddb594b 100644 --- a/purldb_public_project/urls.py +++ b/purldb_public_project/urls.py @@ -12,30 +12,31 @@ from django.views.generic import RedirectView from django.views.generic.base import TemplateView +from drf_spectacular.views import SpectacularAPIView +from drf_spectacular.views import SpectacularSwaggerView from rest_framework import routers from packagedb.api import PackagePublicViewSet from packagedb.api import PurlValidateViewSet from packagedb.api import ResourceViewSet -from drf_spectacular.views import SpectacularAPIView -from drf_spectacular.views import SpectacularSwaggerView - api_router = routers.DefaultRouter() -api_router.register('packages', PackagePublicViewSet) -api_router.register('resources', ResourceViewSet) -api_router.register('validate', PurlValidateViewSet, 'validate') +api_router.register("packages", PackagePublicViewSet) +api_router.register("resources", ResourceViewSet) +api_router.register("validate", PurlValidateViewSet, "validate") urlpatterns = [ path( - 'robots.txt', - TemplateView.as_view(template_name='robots.txt', - content_type='text/plain'), + "robots.txt", + TemplateView.as_view(template_name="robots.txt", content_type="text/plain"), + ), + path("api/", include((api_router.urls, "api"))), + path("", RedirectView.as_view(url="api/")), + path("api/schema/", SpectacularAPIView.as_view(), name="schema"), + path( + "api/docs/", + SpectacularSwaggerView.as_view(url_name="schema"), + name="swagger-ui", ), - path('api/', include((api_router.urls, 'api'))), - path('', 
RedirectView.as_view(url='api/')), - path('api/schema/', SpectacularAPIView.as_view(), name='schema'), - path('api/docs/', SpectacularSwaggerView.as_view(url_name='schema'), - name='swagger-ui'), ] diff --git a/purldb_public_project/wsgi.py b/purldb_public_project/wsgi.py index f1448671..242c50a8 100644 --- a/purldb_public_project/wsgi.py +++ b/purldb_public_project/wsgi.py @@ -8,8 +8,8 @@ # import os -from django.core.wsgi import get_wsgi_application +from django.core.wsgi import get_wsgi_application """ WSGI config for purldb-public. @@ -18,7 +18,6 @@ """ -os.environ.setdefault('DJANGO_SETTINGS_MODULE', - 'purldb_public_project.settings') +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "purldb_public_project.settings") application = get_wsgi_application() From f1d05f89c272c4c9fd76bdce5c312ea5f0e58f26 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 13 Aug 2024 13:55:42 -0700 Subject: [PATCH 06/12] Address style and format issues #512 #515 Signed-off-by: Jono Yang --- clearcode/load.py | 4 +- clearcode/store_scans.py | 28 ++++++------ clearcode/sync.py | 2 +- clearcode/tests/test_models.py | 21 +++++---- clearindex/harvest.py | 8 +--- etc/scripts/clearcode-api-backup.py | 4 -- etc/scripts/utils_thirdparty.py | 6 +-- matchcode/tests/test_match.py | 6 +-- matchcode/tests/test_models.py | 2 +- matchcode_pipeline/tests/test_api.py | 10 +---- minecode/api.py | 2 +- minecode/collectors/debian.py | 4 +- minecode/collectors/maven.py | 6 +-- minecode/filter.py | 32 ++++++------- minecode/indexing.py | 4 +- .../management/commands/check_licenses.py | 3 +- minecode/management/commands/import_queue.py | 4 +- .../management/commands/make_scannableuris.py | 4 +- .../management/commands/priority_queue.py | 3 -- minecode/management/commands/run_map.py | 12 ++--- minecode/management/commands/run_visit.py | 10 ++--- .../commands/update_maven_package_data.py | 4 +- minecode/miners/__init__.py | 6 +-- minecode/miners/bitbucket.py | 9 ++-- minecode/miners/bower.py | 2 + minecode/miners/cpan.py | 10 +++-- minecode/miners/cran.py | 4 +- minecode/miners/debian.py | 11 ++--- minecode/miners/dockerhub.py | 2 +- minecode/miners/eclipse.py | 2 +- minecode/miners/fdroid.py | 2 +- minecode/miners/github.py | 2 +- minecode/miners/java_stream.py | 1 - minecode/miners/maven.py | 12 ++--- minecode/miners/pypi.py | 2 +- minecode/miners/repodata.py | 10 ++--- minecode/miners/repomd.py | 6 +-- minecode/miners/rubygems.py | 4 +- minecode/miners/sourceforge.py | 7 +-- minecode/miners/ubuntu.py | 1 - minecode/model_utils.py | 8 ++-- minecode/models.py | 13 +++--- minecode/route.py | 5 ++- minecode/rsync.py | 4 +- minecode/saneyaml.py | 6 +-- minecode/seed.py | 2 +- minecode/tests/collectors/test_conan.py | 2 +- minecode/tests/collectors/test_gnu.py | 2 +- minecode/tests/collectors/test_maven.py | 2 +- minecode/tests/collectors/test_npm.py | 2 +- minecode/tests/miners/test_debian.py | 3 +- minecode/tests/miners/test_maven.py | 1 - minecode/tests/miners/test_rubygems.py | 4 +- minecode/tests/test_api.py | 8 ++-- minecode/tests/test_command.py | 2 +- minecode/tests/test_migrations.py | 45 ------------------- minecode/tests/test_models.py | 7 +-- minecode/tests/test_rsync.py | 2 +- minecode/utils.py | 28 ++++++------ minecode/utils_test.py | 24 +++++----- minecode/version.py | 2 +- packagedb/api.py | 2 +- .../commands/create_source_repo_packages.py | 1 - .../management/commands/run_scheduler.py | 2 +- packagedb/models.py | 17 ++++--- packagedb/package_managers.py | 2 +- packagedb/schedules.py | 4 +- packagedb/serializers.py | 8 
++-- packagedb/tests/test_api.py | 34 ++++---------- packagedb/tests/test_filters.py | 2 +- packagedb/tests/test_models.py | 2 +- packagedb/tests/test_throttling.py | 6 ++- packagedb/to_purl.py | 2 +- purl2vcs/src/purl2vcs/find_source_repo.py | 2 +- 74 files changed, 219 insertions(+), 309 deletions(-) diff --git a/clearcode/load.py b/clearcode/load.py index e4e3b0ab..c90a4b48 100644 --- a/clearcode/load.py +++ b/clearcode/load.py @@ -71,9 +71,7 @@ def walk_and_load_from_filesystem(input_dir, cd_root_dir): # Save to DB try: - cditem = models.CDitem.objects.create( - path=cditem_rel_path, content=content - ) + models.CDitem.objects.create(path=cditem_rel_path, content=content) except IntegrityError: # skip if we already have it in the DB continue diff --git a/clearcode/store_scans.py b/clearcode/store_scans.py index 0304f193..f2dbef2a 100644 --- a/clearcode/store_scans.py +++ b/clearcode/store_scans.py @@ -33,18 +33,18 @@ from clearcode.models import CDitem """ -The input is a bunch of scans from ClearlyDefined and -the output is a bunch of git repositories with committed and -pushed scans such that we balance the scans roughly evenly across +The input is a bunch of scans from ClearlyDefined and +the output is a bunch of git repositories with committed and +pushed scans such that we balance the scans roughly evenly across different repositories. -The primary reason for multiple repositories is the size of a single -repo. There is a size limit of 5 GB at GitHub and it's difficult +The primary reason for multiple repositories is the size of a single +repo. There is a size limit of 5 GB at GitHub and it's difficult to work with repositories with millions of files. -Therefore the approach is to use hashing as a way to name git -repositories and directories. We compute a hash on the purl of the scanned -package and use the first few layers of this hash for the repo and +Therefore the approach is to use hashing as a way to name git +repositories and directories. We compute a hash on the purl of the scanned +package and use the first few layers of this hash for the repo and directory names. Initial processing steps are: @@ -54,15 +54,15 @@ - Then we store the scan using the purl hash and purl as path. - Finally commit and push! : ) -Because it's not practical to process many repos at once, we organize the -processing one repo at a time. For this, we iterate over a bunch of records, get or compute +Because it's not practical to process many repos at once, we organize the -processing one repo at a time. For this, we iterate over a bunch of records, get or compute the purl hash and process the records that share the same hash. -We are using a short hash that is three characters long using hexadecimal encoding. -Therefore we can have 16*16*16 = 4096 repositories where each repo would contain about +We are using a short hash that is three characters long using hexadecimal encoding. +Therefore we can have 16*16*16 = 4096 repositories where each repo would contain about 25k scan files, if we were to store 100 million scans (which is a high mark). -For reference, one scan should use less than 100k on average when compressed -with gzip or git, based on looking at 15 million scans. Each repo should be roughly +For reference, one scan should use less than 100k on average when compressed +with gzip or git, based on looking at 15 million scans. Each repo should be roughly a couple hundred megabytes big, based on 15 million scans. 
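For illustration, a minimal sketch of this purl-to-repo sharding, assuming a SHA-1 digest (the helper name and the choice of hash function here are illustrative assumptions, not necessarily what this module implements):

    import hashlib

    def purl_repo_prefix(purl):
        # Keep the first three hex characters of the digest:
        # 16 * 16 * 16 = 4096 possible repository names.
        return hashlib.sha1(purl.encode("utf-8")).hexdigest()[:3]

    # Scans whose purls share the same three-character prefix are
    # committed and pushed to the same git repository.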
""" diff --git a/clearcode/sync.py b/clearcode/sync.py index 4b1d8098..9d3dd34f 100644 --- a/clearcode/sync.py +++ b/clearcode/sync.py @@ -350,7 +350,7 @@ def is_unchanged_remotely(self, url, session=session): remote_etag = response.headers.get("etag") if remote_etag and self.etags_cache.get(url) == remote_etag: return True - except: + except Exception: return False def is_fetched(self, checksum, url): diff --git a/clearcode/tests/test_models.py b/clearcode/tests/test_models.py index ddaf6524..33fa4a27 100644 --- a/clearcode/tests/test_models.py +++ b/clearcode/tests/test_models.py @@ -39,7 +39,7 @@ def test_known_package_types(self): # This path starts with npm, which is known cditem_1 = CDitem.objects.create(path="npm/name/version") # asdf is not a proper type - cditem_2 = CDitem.objects.create(path="asdf/name/version") + CDitem.objects.create(path="asdf/name/version") cditems = list(CDitem.objects.known_package_types()) self.assertEqual(1, len(cditems)) cditem = cditems[0] @@ -50,7 +50,7 @@ def test_definitions(self): path="composer/packagist/yoast/wordpress-seo/revision/9.5-RC3.json" ) # harvest should not be in cditems - harvest = CDitem.objects.create( + CDitem.objects.create( path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json" ) cditems = list(CDitem.objects.definitions()) @@ -63,7 +63,7 @@ def test_scancode_harvests(self): path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json" ) # unexpected_harvest should not be in cditems - unexpected_harvest = CDitem.objects.create( + CDitem.objects.create( path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/licensee/9.13.0.json" ) harvests = list(CDitem.objects.scancode_harvests()) @@ -75,7 +75,8 @@ def test_mappable(self): definition_1 = CDitem.objects.create( path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json" ) - definition_2 = CDitem.objects.create( + # This should not be mappable + CDitem.objects.create( path="sourcearchive/mavencentral/io.quarkus/quarkus-jsonb/revision/0.26.1.json", last_map_date=timezone.now(), map_error="error", @@ -92,12 +93,14 @@ def test_mappable_definitions(self): definition_1 = CDitem.objects.create( path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json" ) - definition_2 = CDitem.objects.create( + # This should not be mappable + CDitem.objects.create( path="sourcearchive/mavencentral/io.quarkus/quarkus-jsonb/revision/0.26.1.json", last_map_date=timezone.now(), map_error="error", ) - harvest = CDitem.objects.create( + # This should not be mappable + CDitem.objects.create( path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json" ) mappables = list(CDitem.objects.mappable_definitions()) @@ -109,12 +112,14 @@ def test_mappable_scancode_harvests(self): harvest_1 = CDitem.objects.create( path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json" ) - harvest_2 = CDitem.objects.create( + # This should not be mappable + CDitem.objects.create( path="sourcearchive/mavencentral/io.cucumber/cucumber-core/revision/5.0.0-RC1/tool/scancode/3.2.2.json", last_map_date=timezone.now(), map_error="error", ) - definition_1 = CDitem.objects.create( + # This should not be mappable + CDitem.objects.create( path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json" ) mappables = list(CDitem.objects.mappable_scancode_harvests()) diff --git a/clearindex/harvest.py b/clearindex/harvest.py index a152c7db..31105093 100644 --- a/clearindex/harvest.py +++ 
b/clearindex/harvest.py @@ -94,9 +94,7 @@ def create_from_harvest(package_scan={}, files_data=[], cditem_path=""): download_url = package_data.get("download_url") if not download_url: - logger.error( - f"Null `download_url` value for `package_data`: {package_data}" - ) + logger.error(f"Null `download_url` value for `package_data`: {package_data}") return # This ugly block is needed until https://github.com/nexB/packagedb/issues/14 @@ -115,9 +113,7 @@ def create_from_harvest(package_scan={}, files_data=[], cditem_path=""): merge_packages( existing_package=package, new_package_data=package_data, replace=False ) - package.append_to_history( - f"Updated package from CDitem harvest: {cditem_path}" - ) + package.append_to_history(f"Updated package from CDitem harvest: {cditem_path}") logger.info(f"Merged package data from scancode harvest: {package}") diff --git a/etc/scripts/clearcode-api-backup.py b/etc/scripts/clearcode-api-backup.py index d11c4d34..88171041 100644 --- a/etc/scripts/clearcode-api-backup.py +++ b/etc/scripts/clearcode-api-backup.py @@ -151,10 +151,6 @@ def run_api_backup(api_root_url, extra_payload=None): objects = get_all_objects_from_endpoint(endpoint_url, extra_payload=extra_payload) print('{} {} collected.'.format(len(objects), endpoint_name)) - collect_extra_conditions = [ - extra_payload.get('last_modified_date'), - ] - results[endpoint_name] += objects timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') diff --git a/etc/scripts/utils_thirdparty.py b/etc/scripts/utils_thirdparty.py index addf8e5e..7f222abc 100644 --- a/etc/scripts/utils_thirdparty.py +++ b/etc/scripts/utils_thirdparty.py @@ -845,7 +845,7 @@ def fetch_license_files(self, dest_dir=THIRDPARTY_DIR, use_cached_index=False): if TRACE: print(f"Fetched license from remote: {lic_url}") - except: + except Exception: try: # try licensedb second lic_url = f"{LICENSEDB_API_URL}/{filename}" @@ -858,7 +858,7 @@ def fetch_license_files(self, dest_dir=THIRDPARTY_DIR, use_cached_index=False): if TRACE: print(f"Fetched license from licensedb: {lic_url}") - except: + except Exception: msg = f'No text for license {filename} in expression "{self.license_expression}" from {self}' print(msg) errors.append(msg) @@ -1290,7 +1290,7 @@ def is_pure(self): def is_pure_wheel(filename): try: return Wheel.from_filename(filename).is_pure() - except: + except Exception: return False diff --git a/matchcode/tests/test_match.py b/matchcode/tests/test_match.py index 07faada7..a09fab4d 100644 --- a/matchcode/tests/test_match.py +++ b/matchcode/tests/test_match.py @@ -34,7 +34,7 @@ class MatchPackagesTestCase(MatchcodeTestCase): def setUp(self): # Execute the superclass' setUp method before creating our own # DB objects - super(MatchPackagesTestCase, self).setUp() + super().setUp() self.test_package1, _ = Package.objects.get_or_create( filename="abbot-0.12.3.jar", @@ -158,7 +158,7 @@ class MatchNestedPackagesTestCase(MatchcodeTestCase): def setUp(self): # Execute the superclass' setUp method before creating our own # DB objects - super(MatchNestedPackagesTestCase, self).setUp() + super().setUp() self.test_package1, _ = Package.objects.get_or_create( filename="plugin-request-2.4.1.tgz", @@ -219,7 +219,7 @@ class DirectoryMatchingTestCase(MatchcodeTestCase): maxDiff = None def setUp(self): - super(DirectoryMatchingTestCase, self).setUp() + super().setUp() self.test_package1, _ = Package.objects.get_or_create( filename="abbrev-1.0.3.tgz", diff --git a/matchcode/tests/test_models.py b/matchcode/tests/test_models.py index a74d6aa2..d0eef2e9 
100644 --- a/matchcode/tests/test_models.py +++ b/matchcode/tests/test_models.py @@ -41,7 +41,7 @@ class BaseModelTest(MatchcodeTestCase): maxDiff = None def setUp(self): - super(BaseModelTest, self).setUp() + super().setUp() self.test_package1, _ = Package.objects.get_or_create( filename="abbot-0.12.3.jar", diff --git a/matchcode_pipeline/tests/test_api.py b/matchcode_pipeline/tests/test_api.py index 7933298f..d2bdbe9e 100644 --- a/matchcode_pipeline/tests/test_api.py +++ b/matchcode_pipeline/tests/test_api.py @@ -141,11 +141,8 @@ def test_matching_pipeline_api_matching_create(self, mock_execute_pipeline_task) self.assertEqual("matching", response.data["runs"][0]["pipeline_name"]) mock_execute_pipeline_task.assert_called_once() - created_matching_project_detail_url = response.data["url"] - matching_project_uuid = response.data["uuid"] - results_url = reverse("matching-results", args=[matching_project_uuid]) - # Check that the file was uploaded + created_matching_project_detail_url = response.data["url"] response = self.csrf_client.get(created_matching_project_detail_url) self.assertEqual("test-out.json", response.data["input_sources"][0]["filename"]) @@ -165,11 +162,8 @@ def test_matching_pipeline_api_matching_create_multiple_input_urls( self.assertEqual("matching", response.data["runs"][0]["pipeline_name"]) mock_execute_pipeline_task.assert_called_once() - created_matching_project_detail_url = response.data["url"] - matching_project_uuid = response.data["uuid"] - results_url = reverse("matching-results", args=[matching_project_uuid]) - # Check that the file was uploaded + created_matching_project_detail_url = response.data["url"] response = self.csrf_client.get(created_matching_project_detail_url) input_sources = response.data["input_sources"] self.assertEqual(2, len(input_sources)) diff --git a/minecode/api.py b/minecode/api.py index a479f01a..c47a38cb 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -201,7 +201,7 @@ def index_package_scan(request, key): user_id = signing.loads(key) User = get_user_model() - user = get_object_or_404(User, id=user_id) + get_object_or_404(User, id=user_id) results = json_data.get("results") summary = json_data.get("summary") diff --git a/minecode/collectors/debian.py b/minecode/collectors/debian.py index 72216078..c7bf3d42 100644 --- a/minecode/collectors/debian.py +++ b/minecode/collectors/debian.py @@ -521,9 +521,7 @@ def get_vcs_repo(description): repos.append((vcs_tool, vcs_repo)) if len(repos) > 1: - raise TypeError( - "Debian description with more than one Vcs repos: %(repos)r" % locals() - ) + raise TypeError(f"Debian description with more than one Vcs repos: {repos}") if repos: vcs_tool, vcs_repo = repos[0] diff --git a/minecode/collectors/maven.py b/minecode/collectors/maven.py index 0aed49ef..7aa4bd35 100644 --- a/minecode/collectors/maven.py +++ b/minecode/collectors/maven.py @@ -498,17 +498,17 @@ def process_request(purl_str, **kwargs): def check_if_file_name_is_linked_on_page(file_name, links, **kwargs): """Return True if `file_name` is in `links`.""" - return any(l.endswith(file_name) for l in links) + return any(link.endswith(file_name) for link in links) def check_if_page_has_pom_files(links, **kwargs): """Return True if any entry in `links` ends with .pom.""" - return any(l.endswith(".pom") for l in links) + return any(link.endswith(".pom") for link in links) def check_if_page_has_directories(links, **kwargs): """Return True if any entry, excluding "../", ends with /.""" - return any(l.endswith("/") for l in links if l != "../") + 
return any(link.endswith("/") for link in links if link != "../") def check_if_package_version_page(links, **kwargs): diff --git a/minecode/filter.py b/minecode/filter.py index 371e9415..018c6bc9 100644 --- a/minecode/filter.py +++ b/minecode/filter.py @@ -39,28 +39,28 @@ def sf_net(input_file, output): writer = csv.writer(fo, quoting=csv.QUOTE_ALL) with open(input_file) as fi: reader = csv.reader(fi) - for i, l in enumerate(reader): + for i, row in enumerate(reader): if i == 0: # add headers on first row - l.extend(new_headers) - if not l: + row.extend(new_headers) + if not row: continue - project_id = l[0] - name = l[1] - version_column = l[2] + project_id = row[0] + name = row[1] + version_column = row[2] sep = ": released on " if sep not in version_column: # write as is if we do not have a file release date # separator - writer.writerow(l) + writer.writerow(row) continue filename, release_date_ts = version_column.split(sep, 1) found_version = version.version_hint(filename) - l.append(found_version or "") - l.append(release_date_ts or "") - l.append(download_url_template % locals()) - l.append("") # reviewed - l.append("") # curated name + row.append(found_version or "") + row.append(release_date_ts or "") + row.append(download_url_template % locals()) + row.append("") # reviewed + row.append("") # curated name excluded_reason = "" if "." in project_id: excluded_reason = "mirror or special project" @@ -70,10 +70,10 @@ def sf_net(input_file, output): excluded_reason = "special chars in name" elif not good_filename(project_id, filename, name): excluded_reason = "multi component possible" - l.append(excluded_reason) - l.append("") # curated_owner - l.append("") # owner_type - writer.writerow(l) + row.append(excluded_reason) + row.append("") # curated_owner + row.append("") # owner_type + writer.writerow(row) def good_name(s): diff --git a/minecode/indexing.py b/minecode/indexing.py index 797f6d15..c94d14e4 100644 --- a/minecode/indexing.py +++ b/minecode/indexing.py @@ -100,7 +100,9 @@ def index_package( declared_license_expression = summary_data.get("declared_license_expression") other_license_expressions = summary_data.get("other_license_expressions", []) other_license_expressions = [ - l["value"] for l in other_license_expressions if l["value"] + license_expression["value"] + for license_expression in other_license_expressions + if license_expression["value"] ] other_license_expression = combine_expressions(other_license_expressions) diff --git a/minecode/management/commands/check_licenses.py b/minecode/management/commands/check_licenses.py index 62eb32cc..fbab6dcc 100644 --- a/minecode/management/commands/check_licenses.py +++ b/minecode/management/commands/check_licenses.py @@ -99,8 +99,7 @@ def find_ambiguous_packages( ) license_filter = reduce(operator.or_, filter_expression) - for package in Package.objects.filter(type__in=types).filter(license_filter): - yield package + yield from Package.objects.filter(type__in=types).filter(license_filter) def dump(packages, json_location): diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py index 5b53c28a..0be7a9f2 100644 --- a/minecode/management/commands/import_queue.py +++ b/minecode/management/commands/import_queue.py @@ -92,7 +92,9 @@ def handle(self, *args, **options): try: errors = process_request(importable_uri) except Exception as e: - errors = f"Error: Failed to process ImportableURI: {repr(importable_uri)}\n" + errors = ( + f"Error: Failed to process ImportableURI: 
{repr(importable_uri)}\n" + ) errors += get_error_message(e) finally: if errors: diff --git a/minecode/management/commands/make_scannableuris.py b/minecode/management/commands/make_scannableuris.py index ae06a7a9..4384c287 100644 --- a/minecode/management/commands/make_scannableuris.py +++ b/minecode/management/commands/make_scannableuris.py @@ -30,9 +30,7 @@ def handle(self, *args, **options): uri=package_uri, package=package ) if created: - self.stdout.write( - f"ScannableURI created for: {package_uri}" - ) + self.stdout.write(f"ScannableURI created for: {package_uri}") except Exception as e: msg = f"Error creating ScannableURI for: {package_uri}" msg += get_error_message(e) diff --git a/minecode/management/commands/priority_queue.py b/minecode/management/commands/priority_queue.py index 3c88e849..510febbb 100644 --- a/minecode/management/commands/priority_queue.py +++ b/minecode/management/commands/priority_queue.py @@ -116,9 +116,6 @@ def process_request(priority_resource_uri, _priority_router=priority_router): if priority: kwargs["priority"] = priority errors = _priority_router.process(purl_to_visit, **kwargs) - if TRACE: - new_uris_to_visit = list(new_uris_to_visit or []) - logger.debug(f"visit_uri: new_uris_to_visit: {new_uris_to_visit}") return errors diff --git a/minecode/management/commands/run_map.py b/minecode/management/commands/run_map.py index d8ff545f..819b25e8 100644 --- a/minecode/management/commands/run_map.py +++ b/minecode/management/commands/run_map.py @@ -130,7 +130,9 @@ def map_uri(resource_uri, _map_router=map_router): return except Exception as e: - msg = f"Error: Failed to map while processing ResourceURI: {repr(resource_uri)}\n" + msg = ( + f"Error: Failed to map while processing ResourceURI: {repr(resource_uri)}\n" + ) msg += get_error_message(e) logger.error(msg) # we had an error, so mapped_scanned_packages is an error string @@ -164,12 +166,12 @@ def map_uri(resource_uri, _map_router=map_router): package=package, ) if scannable_uri_created: - logger.debug( - f" + Inserted ScannableURI\t: {package_uri}" - ) + logger.debug(f" + Inserted ScannableURI\t: {package_uri}") except Exception as e: - msg = f"Error: Failed to map while processing ResourceURI: {repr(resource_uri)}\n" + msg = ( + f"Error: Failed to map while processing ResourceURI: {repr(resource_uri)}\n" + ) msg += f"While processing scanned_package: {repr(scanned_package)}\n" msg += get_error_message(e) logger.error(msg) diff --git a/minecode/management/commands/run_visit.py b/minecode/management/commands/run_visit.py index d428e95a..1f923d2f 100644 --- a/minecode/management/commands/run_visit.py +++ b/minecode/management/commands/run_visit.py @@ -223,9 +223,7 @@ def visit_uris( ) if max_loops and int(visited_counter) > int(max_loops): - logger.info( - f"Stopping visits after max_loops: {max_loops} visit loops." - ) + logger.info(f"Stopping visits after max_loops: {max_loops} visit loops.") break return visited_counter, inserted_counter @@ -296,7 +294,7 @@ def visit_uri( if visit_error: msg = f"Visit error for URI: {uri_to_visit}" msg += "\n".format() - msg += get_error_message(e) + msg += get_error_message(e) # NOQA visit_errors.append(msg) logger.error(msg) @@ -366,9 +364,7 @@ def visit_uri( break if max_uris and int(uri_counter_by_visitor[visitor_key]) > int(max_uris): - logger.info( - f" ! Breaking after processing max-uris: {max_uris} URIs." - ) + logger.info(f" ! 
Breaking after processing max-uris: {max_uris} URIs.") break except Exception as e: diff --git a/minecode/management/commands/update_maven_package_data.py b/minecode/management/commands/update_maven_package_data.py index f8607cf1..2ac6c0c7 100644 --- a/minecode/management/commands/update_maven_package_data.py +++ b/minecode/management/commands/update_maven_package_data.py @@ -209,7 +209,7 @@ def update_maven_packages( namespace = namespace.lower() name = name.lower() version = version.lower() - normalize_qualifiers = normalize_qualifiers.lower() + normalized_qualifiers = normalized_qualifiers.lower() existing_packages = Package.objects.filter( type="maven", @@ -218,7 +218,7 @@ def update_maven_packages( version=version, qualifiers=normalized_qualifiers or "", ) - if existing_package.exists(): + if existing_packages.exists(): duplicate_packages = [] for existing_package in existing_packages: if existing_package.download_url != maven_package.download_url: diff --git a/minecode/miners/__init__.py b/minecode/miners/__init__.py index 5ff27551..70314368 100644 --- a/minecode/miners/__init__.py +++ b/minecode/miners/__init__.py @@ -227,7 +227,7 @@ def fetch(self, uri, timeout=10): `timeout` is a default timeout. """ - content = super(NonPersistentHttpVisitor, self).fetch(uri, timeout=timeout) + content = super().fetch(uri, timeout=timeout) temp_file = get_temp_file("NonPersistentHttpVisitor") with open(temp_file, "wb") as tmp: tmp.write(content) @@ -266,8 +266,8 @@ def __call__(self, uri, resource_uri): def get_packages(self, uri, resource_uri): """ - This method must yield ScannedPackage objects (or return a list) built - from a resource_uri ResourceURI object. + Yield ScannedPackage objects (or return a list) built from a + resource_uri ResourceURI object. """ raise NotImplementedError diff --git a/minecode/miners/bitbucket.py b/minecode/miners/bitbucket.py index e7b95356..91b3b4fc 100644 --- a/minecode/miners/bitbucket.py +++ b/minecode/miners/bitbucket.py @@ -207,10 +207,9 @@ def get_packages(self, uri, resource_uri): """Yield Package built from resource_uri record for a single package version.""" downloads_data = json.loads(resource_uri.data) for download_data in downloads_data.get("values", []): - for package in build_bitbucket_download_packages( + yield from build_bitbucket_download_packages( download_data, resource_uri.package_url - ): - yield package + ) def build_bitbucket_download_packages(download_data, purl): @@ -223,8 +222,8 @@ def build_bitbucket_download_packages(download_data, purl): name = purl.name # FIXME: add these ? - filename = download_data.get("name") - download_counts = download_data.get("downloads", 0) + # filename = download_data.get("name") + # download_counts = download_data.get("downloads", 0) download_url = download_data.get("links", {}).get("self", {}).get("href") size = download_data.get("size") diff --git a/minecode/miners/bower.py b/minecode/miners/bower.py index 7195294d..55aa3d81 100644 --- a/minecode/miners/bower.py +++ b/minecode/miners/bower.py @@ -32,6 +32,8 @@ class BowerTopJsonVisitor(HttpJsonVisitor): def get_uris(self, content): """ + Yield URIs given `content` from Bower. + The json content is a list with name and url, like the following format: ... 
{ diff --git a/minecode/miners/cpan.py b/minecode/miners/cpan.py index ec4f941c..d2eebcc1 100644 --- a/minecode/miners/cpan.py +++ b/minecode/miners/cpan.py @@ -192,7 +192,7 @@ class CpanReadmeVisitors(HttpVisitor): def dumps(self, content): """Return the json by parsing the readme content""" # Handle bytes properly in python3 - if type(content) == bytes: + if type(content) is bytes: content = content.decode("utf-8") lines = content.splitlines() @@ -240,7 +240,7 @@ def build_packages_from_release_json(metadata, uri=None): continue extracted_license_statement = [ - l for l in release.get("license", []) if l and l.strip() + lic for lic in release.get("license", []) if lic and lic.strip() ] common_data = dict( @@ -278,7 +278,9 @@ def build_packages_from_release_json(metadata, uri=None): # like perl_5. The license here under resources section is the # url of license for example: http://dev.perl.org/licenses/ So # it's useful to collect both information... - license_url = [l for l in resources.get("license", []) if l and l.strip()] + license_url = [ + lic for lic in resources.get("license", []) if lic and lic.strip() + ] if license_url: common_data["extracted_license_statement"].extend(license_url) @@ -355,7 +357,7 @@ def build_packages_from_metafile(metadata, uri=None, purl=None): licenses_content = content.get("license") extracted_license_statement = [] if licenses_content: - if isinstance(licenses_content, (list,)): + if isinstance(licenses_content, list): for lic in licenses_content: extracted_license_statement.append(lic) else: diff --git a/minecode/miners/cran.py b/minecode/miners/cran.py index 5465c37d..c524c36c 100644 --- a/minecode/miners/cran.py +++ b/minecode/miners/cran.py @@ -131,7 +131,7 @@ def build_packages_from_html(metadata, uri=None, purl=None): if key == "Version:": common_data["version"] = value elif key == "URL:": - if type(value) == list and len(value) > 0: + if type(value) is list and len(value) > 0: homepages = [] for home_page in value: homepages.append(home_page) @@ -165,7 +165,7 @@ def build_packages_from_html(metadata, uri=None, purl=None): ) common_data["parties"].append(party.to_dict()) elif "source" in key or "binaries" in key: - if type(value) == list: + if type(value) is list: for url in value: download_urls.append(get_download_url(url)) elif key == "Published:": diff --git a/minecode/miners/debian.py b/minecode/miners/debian.py index 0eeed102..13702168 100644 --- a/minecode/miners/debian.py +++ b/minecode/miners/debian.py @@ -20,6 +20,7 @@ from packagedcode import models as scan_models from packageurl import PackageURL +import minecode.collectors.debian as debian_collector from minecode import debutils from minecode import ls from minecode import map_router @@ -352,12 +353,12 @@ def parse_description(metadata, purl=None, base_download_url=None): if metadata.get("Label"): common_data["keywords"] = [metadata.get("Label")] - vcs_tool, vcs_repo = get_vcs_repo(metadata) + vcs_tool, vcs_repo = debian_collector.get_vcs_repo(metadata) if vcs_tool and vcs_repo: vcs_repo = form_vcs_url(vcs_tool, vcs_repo) common_data["vcs_url"] = vcs_repo - dependencies = get_dependencies(metadata) + dependencies = debian_collector.get_dependencies(metadata) if dependencies: common_data["dependencies"] = dependencies @@ -452,7 +453,7 @@ def build_source_file_packages(metadata, purl=None): ) parties.append(party) - dependencies = get_dependencies(source, ["Build-Depends"]) + dependencies = debian_collector.get_dependencies(source, ["Build-Depends"]) keywords = set() 
keywords.update(debutils.comma_separated(source.get("Binary", ""))) @@ -478,7 +479,7 @@ def build_source_file_packages(metadata, purl=None): package["download_url"] = download_url - vcs_tool, vcs_repo = get_vcs_repo(source) + vcs_tool, vcs_repo = debian_collector.get_vcs_repo(source) if vcs_tool and vcs_repo: vcs_repo = form_vcs_url(vcs_tool, vcs_repo) package["vcs_url"] = vcs_repo @@ -549,7 +550,7 @@ def parse_packages(metadata, purl=None): party = scan_models.Party(name=name, role="maintainer", email=email) data["parties"].append(party) - dependencies = get_dependencies(pack) + dependencies = debian_collector.get_dependencies(pack) if dependencies: data["dependencies"] = dependencies diff --git a/minecode/miners/dockerhub.py b/minecode/miners/dockerhub.py index 3824294e..3e9dd285 100644 --- a/minecode/miners/dockerhub.py +++ b/minecode/miners/dockerhub.py @@ -155,7 +155,7 @@ def get_uris(self, content): for result in results: name = result.get("name") # TODO: This will be used when new Package definition is merged. - star_count = result.get("star_count") + # star_count = result.get("star_count") if name: package_url = PackageURL(type="docker", name=name).to_string() yield URI( diff --git a/minecode/miners/eclipse.py b/minecode/miners/eclipse.py index 60c0e525..4e30aceb 100644 --- a/minecode/miners/eclipse.py +++ b/minecode/miners/eclipse.py @@ -218,7 +218,7 @@ def build_packages_with_json(metadata, purl=None, uri=None): if project_metadata.get("licenses"): common_data["extracted_license_statement"] = [ - l.get("name") for l in project_metadata.get("licenses", []) + lic.get("name") for lic in project_metadata.get("licenses", []) ] common_data["license_detections"] = [] diff --git a/minecode/miners/fdroid.py b/minecode/miners/fdroid.py index 4a10a162..86bf5f8e 100644 --- a/minecode/miners/fdroid.py +++ b/minecode/miners/fdroid.py @@ -226,7 +226,7 @@ def build_packages(purl, data): size = file["size"] download_url = f"https://f-droid.org/repo/{filename}" - package_mappping = dict( + package_mapping = dict( version=version_code, download_url=download_url, repository_download_url=download_url, diff --git a/minecode/miners/github.py b/minecode/miners/github.py index 8d0c9758..2345e8aa 100644 --- a/minecode/miners/github.py +++ b/minecode/miners/github.py @@ -189,7 +189,7 @@ def fetch(self, uri, timeout=None): def json_serial_date_obj(obj): """JSON serializer for date object""" - if obj and isinstance(obj, (datetime, date)): + if obj and isinstance(obj, datetime | date): return obj.isoformat() diff --git a/minecode/miners/java_stream.py b/minecode/miners/java_stream.py index 52480264..63e32287 100644 --- a/minecode/miners/java_stream.py +++ b/minecode/miners/java_stream.py @@ -1,4 +1,3 @@ - # The MIT License (MIT) # # Copyright (c) 2014 Gustav ArngÃ¥rden diff --git a/minecode/miners/maven.py b/minecode/miners/maven.py index fc74662e..6b21b4d7 100644 --- a/minecode/miners/maven.py +++ b/minecode/miners/maven.py @@ -743,13 +743,9 @@ def get_entries(location, fields=frozenset(ENTRY_FIELDS)): except EOFError: if TRACE_DEEP: print( - "Index version: %(_index_version)r last_modified: %(_last_modified)r" - % locals() - ) - print( - "Processed %(entries_count)d docs. Last entry: %(entry)r" - % locals() + f"Index version: {_index_version} last_modified: {_last_modified}" ) + print(f"Processed {entries_count} docs. 
Last entry: {entry}") print("Unique keys:") for k in sorted(keys): print(k) @@ -1052,9 +1048,7 @@ def get_packages(self, uri, resource_uri): ) package = get_package(resource_uri.data, resource_uri.package_url) if package: - logger.debug( - f"MavenPomMapper.get_packages: uri: {uri}, package: {package}" - ) + logger.debug(f"MavenPomMapper.get_packages: uri: {uri}, package: {package}") yield package diff --git a/minecode/miners/pypi.py b/minecode/miners/pypi.py index 98876a21..9669ff19 100644 --- a/minecode/miners/pypi.py +++ b/minecode/miners/pypi.py @@ -61,7 +61,7 @@ def fetch(self, uri, timeout=None): return temp_file def dumps(self, content): - """The content is huge json and should not be dumped.""" + """Return None as the content is huge json and should not be dumped.""" return None def get_uris(self, content): diff --git a/minecode/miners/repodata.py b/minecode/miners/repodata.py index e40eb788..4584222d 100644 --- a/minecode/miners/repodata.py +++ b/minecode/miners/repodata.py @@ -16,7 +16,7 @@ def remove_list_repetitions(input_list): - """Removes the repeated items in a list and returns a list with unique values""" + """Remove the repeated items in a list and return a list with unique values""" output = [] for item in input_list: if item not in output: @@ -43,7 +43,7 @@ def combine_dicts_using_pkgid(all_dicts): def combine_list_of_dicts(input_dicts): """ - Combines a list of dictionaries and returns a single dictionary with all the + Combine a list of dictionaries and return a single dictionary with all the keys and values from all the dictionaries in the list. """ all_dict_items = [] @@ -132,7 +132,7 @@ def get_value_from_tuple_pairs(tuples, key): def filelistsxml_parser(location): """ - Parses filelists.xml file and yields the data needed to generate RPM objects. + Parse filelists.xml file and yield the data needed to generate RPM objects. @@ -177,7 +177,7 @@ def filelistsxml_parser(location): def primaryxml_parser(location): """ - Parses primary.xml file and yields the data needed to generate RPM objects. + Parse primary.xml file and yield the data needed to generate RPM objects. 36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5 @@ -252,7 +252,7 @@ def primaryxml_parser(location): def otherxml_parser(location): """ - Parses other.xml file and yields the data needed to generate RPM objects. + Parse other.xml file and yield the data needed to generate RPM objects. diff --git a/minecode/miners/repomd.py b/minecode/miners/repomd.py index 73c23fce..a39217aa 100644 --- a/minecode/miners/repomd.py +++ b/minecode/miners/repomd.py @@ -83,8 +83,8 @@ def generate_rpm_objects(package_infos, base_url): def fetch_repomd_subfile(base_url, repomd_xml, subfile): """ - Downloads and extract a subfile('filelists.xml.gz', 'primary.xml.gz', - 'other.xml.gz') of any repodata and returns the subfile location. + Download and extract a subfile('filelists.xml.gz', 'primary.xml.gz', + 'other.xml.gz') of any repodata and return the subfile location. 
""" url = base_url + repodata.get_url_for_tag(repomd_xml, subfile) target_location = extract_file(download(url)) @@ -113,7 +113,7 @@ def collect_rpm_packages_from_repomd(uri): @map_router.route(".+/repomd.xml") def map_repomd_data(uris, resource_uri): - """Returns a list of RpmPackage objects collected from visitors.""" + """Return a list of RpmPackage objects collected from visitors.""" if not resource_uri.data: return packages = [] diff --git a/minecode/miners/rubygems.py b/minecode/miners/rubygems.py index 3a9e94d2..fb6ab2e8 100644 --- a/minecode/miners/rubygems.py +++ b/minecode/miners/rubygems.py @@ -105,9 +105,9 @@ def get_uris(self, content): self.uri.index("/versions/") + len("/versions/") : -len(".json") ] version = version_details.get("number") - gem_name = "%(name)s-%(version)s" % locals() + gem_name = f"{name}-{version}" package_url = PackageURL(type="gem", name=name, version=version).to_string() - download_url = "https://rubygems.org/downloads/%(gem_name)s.gem" % locals() + download_url = f"https://rubygems.org/downloads/{gem_name}.gem" yield URI( uri=download_url, source_uri=self.uri, diff --git a/minecode/miners/sourceforge.py b/minecode/miners/sourceforge.py index 9746c333..dd62397d 100644 --- a/minecode/miners/sourceforge.py +++ b/minecode/miners/sourceforge.py @@ -7,6 +7,7 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +import json import logging import re @@ -156,11 +157,11 @@ def build_packages_from_metafile(metadata, purl=None, uri=None): extracted_license_statement = [] licenses = categories.get("license") or [] - for l in licenses: - license_name = l.get("fullname") + for lic in licenses: + license_name = lic.get("fullname") # full name is first priority than shortname since shortname is like gpl, it doesn't show detailed gpl version etc. if license_name: - extracted_license_statement.append(l.get("shortname")) + extracted_license_statement.append(lic.get("shortname")) if license_name: extracted_license_statement.append(license_name) if extracted_license_statement: diff --git a/minecode/miners/ubuntu.py b/minecode/miners/ubuntu.py index 9106db0b..3736b0ac 100644 --- a/minecode/miners/ubuntu.py +++ b/minecode/miners/ubuntu.py @@ -3,6 +3,5 @@ # - # http://askubuntu.com/questions/139032/how-to-programmatically-fetch-a-list-of-applications-from-the-software-center # http://askubuntu.com/questions/112004/is-there-any-web-api-for-software-center-available diff --git a/minecode/model_utils.py b/minecode/model_utils.py index 218e5717..16f79baf 100644 --- a/minecode/model_utils.py +++ b/minecode/model_utils.py @@ -153,9 +153,7 @@ def merge_packages(existing_package, new_package_data, replace=False): logger.debug(f" No existing value: set to new: {new_value}") if TRACE and replace: - logger.debug( - f" Existing value and replace: set to new: {new_value}" - ) + logger.debug(f" Existing value and replace: set to new: {new_value}") if existing_field == "parties": # If `existing_field` is `parties`, then we update the `Party` table @@ -460,8 +458,8 @@ def merge_or_create_package(scanned_package, visit_level, override=False): def update_or_create_resource(package, resource_data): """ - Using Resource data from `resource_data`, create or update the - corresponding purldb Resource from `package`. + Create or update the corresponding purldb Resource from `package` using + Resource data from `resource_data`. 
Return a 3-tuple of the corresponding purldb Resource of `resource_data`,
     `resource`, as well as booleans representing whether the Resource was

diff --git a/minecode/models.py b/minecode/models.py
index 40263cf1..009c5bb6 100644
--- a/minecode/models.py
+++ b/minecode/models.py
@@ -21,9 +21,10 @@ from minecode import map_router

 # UnusedImport here!
-# But importing the miners module triggers routes registration
-from minecode import miners # NOQA
+# But importing the mappers and visitors modules triggers routes registration
+from minecode import mappers # NOQA
 from minecode import visit_router
+from minecode import visitors # NOQA
 from packagedb.models import Package

 logger = logging.getLogger(__name__)

@@ -491,7 +492,7 @@ def save(self, *args, **kwargs):
         self.normalize_fields()
         self.has_map_error = True if self.map_error else False
         self.has_visit_error = True if self.visit_error else False
-        super(ResourceURI, self).save(*args, **kwargs)
+        super().save(*args, **kwargs)

 class ScannableURIManager(models.Manager):

@@ -789,7 +790,7 @@ def save(self, *args, **kwargs):
         if not self.canonical:
             self.canonical = get_canonical(self.uri)
         self.normalize_fields()
-        super(ScannableURI, self).save(*args, **kwargs)
+        super().save(*args, **kwargs)

     def process_scan_results(
         self, scan_results_location, scan_summary_location, project_extra_data

@@ -970,7 +971,7 @@ class Meta:
     def save(self, *args, **kwargs):
         """Save, adding defaults for computed fields and validating fields."""
         self.normalize_fields()
-        super(PriorityResourceURI, self).save(*args, **kwargs)
+        super().save(*args, **kwargs)

 # TODO: Use the QuerySet.as_manager() for more flexibility and chaining.

@@ -1086,7 +1087,7 @@ class Meta:
     def save(self, *args, **kwargs):
         """Save, adding defaults for computed fields and validating fields."""
         self.normalize_fields()
-        super(ImportableURI, self).save(*args, **kwargs)
+        super().save(*args, **kwargs)

 class ProcessingError(BaseURI):

diff --git a/minecode/route.py b/minecode/route.py
index f034bb81..0b0445c7 100644
--- a/minecode/route.py
+++ b/minecode/route.py
@@ -119,7 +119,8 @@ def append(self, pattern, endpoint):
     def route(self, *patterns):
         """
-        Decorator to make a callable 'endpoint' routed to one or more patterns.
+        Return a decorator to make a callable 'endpoint' routed to one or more
+        patterns.

         Example:
         -------
@@ -178,7 +179,7 @@ def resolve(self, string):
             # this can happen when multiple patterns match the same string
             # we raise an exception with enough debugging information
             pats = repr([r.pattern for r in candidates])
-            msg = "%(string)r matches multiple patterns %(pats)r" % locals()
+            msg = f"{string!r} matches multiple patterns {pats}"
             raise MultipleRoutesDefined(msg)
         return candidates[0].endpoint

diff --git a/minecode/rsync.py b/minecode/rsync.py
index bb765669..c67bd9ea 100644
--- a/minecode/rsync.py
+++ b/minecode/rsync.py
@@ -137,7 +137,7 @@ def fetch_directory(uri, recurse=True):
         file_name = tmp.name
         ends = not uri.endswith("/") and "/" or ""
         recursive = recurse and "--recursive" or "--no-recursive"
-        cmd = 'rsync --no-motd %(recursive)s -d "%(uri)s%(ends)s"' % locals()
+        cmd = f'rsync --no-motd {recursive} -d "{uri}{ends}"'
         rsync = command.Command(cmd)
         out, err = rsync.execute()
@@ -147,6 +147,6 @@ def fetch_directory(uri, recurse=True):
     err = "\n".join([e for e in err])
     rc = rsync.returncode
     if err or rc:
-        raise Exception("%(cmd) failed. rc:%(tc)d err: %(err)s" % locals())
+        raise Exception(f"{cmd} failed. rc:{rc} err: {err}")
rc:{rc} err: {err}") else: return file_name diff --git a/minecode/saneyaml.py b/minecode/saneyaml.py index e55fc638..ad2176d0 100644 --- a/minecode/saneyaml.py +++ b/minecode/saneyaml.py @@ -121,7 +121,7 @@ def ordered_loader(loader, node): class SaneDumper(SafeDumper): def increase_indent(self, flow=False, indentless=False): """Ensure that lists items are always indented.""" - return super(SaneDumper, self).increase_indent(flow, indentless=False) + return super().increase_indent(flow, indentless=False) def ignore_aliases(self, data): """Avoid having aliases created from re-used Python objects.""" @@ -137,7 +137,7 @@ def ordered_dumper(dumper, data): def null_dumper(dumper, value): - """Always dump nulls as empty string.""" + """Dump nulls as an empty string.""" return dumper.represent_scalar("tag:yaml.org,2002:null", "") @@ -149,7 +149,7 @@ def string_dumper(dumper, value, _tag="tag:yaml.org,2002:str"): Ensure that all scalars are dumped as UTF-8 unicode, folded and quoted in the sanest and most readable way. """ - if not isinstance(value, basestring): + if not isinstance(value, str): value = repr(value) if isinstance(value, str): diff --git a/minecode/seed.py b/minecode/seed.py index 9a805960..abba367a 100644 --- a/minecode/seed.py +++ b/minecode/seed.py @@ -45,7 +45,7 @@ def get_active_seeders(seeders=()): if not seeders: seeders = get_configured_seeders() for seeder in seeders: - if isinstance(seeder, (bytes, unicode)): + if isinstance(seeder, bytes | unicode): module_name, _, class_name = seeder.rpartition(".") module = importlib.import_module(module_name) yield getattr(module, class_name)() diff --git a/minecode/tests/collectors/test_conan.py b/minecode/tests/collectors/test_conan.py index 1c20e33f..6f2556da 100644 --- a/minecode/tests/collectors/test_conan.py +++ b/minecode/tests/collectors/test_conan.py @@ -27,7 +27,7 @@ class ConanPriorityQueueTests(JsonBasedTesting, TestCase): ) def setUp(self): - super(ConanPriorityQueueTests, self).setUp() + super().setUp() self.package_url1 = PackageURL.from_string("pkg:conan/zlib@1.3.1") zlib_conanfile_loc = self.get_test_loc("conan/zlib/manifest/conanfile.py") zlib_conandata_loc = self.get_test_loc("conan/zlib/manifest/conandata.yml") diff --git a/minecode/tests/collectors/test_gnu.py b/minecode/tests/collectors/test_gnu.py index ef2fc331..3dec9b7b 100644 --- a/minecode/tests/collectors/test_gnu.py +++ b/minecode/tests/collectors/test_gnu.py @@ -24,7 +24,7 @@ class GnuPriorityQueueTests(JsonBasedTesting, TestCase): ) def setUp(self): - super(GnuPriorityQueueTests, self).setUp() + super().setUp() glibc_data_loc = self.get_test_loc("gnu/glibc/index.html") with open(glibc_data_loc) as f: diff --git a/minecode/tests/collectors/test_maven.py b/minecode/tests/collectors/test_maven.py index 322beed6..1cbd2f71 100644 --- a/minecode/tests/collectors/test_maven.py +++ b/minecode/tests/collectors/test_maven.py @@ -19,7 +19,7 @@ class MavenPriorityQueueTests(JsonBasedTesting, DjangoTestCase): ) def setUp(self): - super(MavenPriorityQueueTests, self).setUp() + super().setUp() self.expected_pom_loc = self.get_test_loc("maven/pom/classworlds-1.1.pom") with open(self.expected_pom_loc) as f: diff --git a/minecode/tests/collectors/test_npm.py b/minecode/tests/collectors/test_npm.py index 5a378be8..d517e278 100644 --- a/minecode/tests/collectors/test_npm.py +++ b/minecode/tests/collectors/test_npm.py @@ -27,7 +27,7 @@ class NpmPriorityQueueTests(JsonBasedTesting, DjangoTestCase): ) def setUp(self): - super(NpmPriorityQueueTests, self).setUp() + super().setUp() 
self.expected_json_loc = self.get_test_loc("npm/lodash_package-expected.json") with open(self.expected_json_loc) as f: self.expected_json_contents = json.load(f) diff --git a/minecode/tests/miners/test_debian.py b/minecode/tests/miners/test_debian.py index 80755719..54b08db0 100644 --- a/minecode/tests/miners/test_debian.py +++ b/minecode/tests/miners/test_debian.py @@ -38,7 +38,6 @@ def check_objects_expected(self, results, expected_loc, regen=FIXTURES_REGEN): result = "" for item in results: if isinstance(item, str): - item = unicode(item, "utf-8") result += item.encode("utf-8") else: if isinstance(item, debcon.Debian822): @@ -67,7 +66,7 @@ def check_expected_deb822(self, deb_object, expected_loc, regen=FIXTURES_REGEN): assert expected == result def get_tmp_gz_file(self, loc): - """Creates a .gz file at a temporary location, and returns that location.""" + """Create a .gz file at a temporary location, and return that location.""" temp_gz_location = self.get_temp_file(extension=".gz") with open(loc, "rb") as f: file_content = f.read() diff --git a/minecode/tests/miners/test_maven.py b/minecode/tests/miners/test_maven.py index e3aaa597..b122ec7c 100644 --- a/minecode/tests/miners/test_maven.py +++ b/minecode/tests/miners/test_maven.py @@ -541,7 +541,6 @@ def test_visit_and_map_with_index(self): ) before_uri = [p.id for p in ResourceURI.objects.all()] - before_pkg = [p.id for p in packagedb.models.Package.objects.all()] resource_uri = ResourceURI.objects.insert(uri=uri) diff --git a/minecode/tests/miners/test_rubygems.py b/minecode/tests/miners/test_rubygems.py index 5d5b3b99..0b3b0740 100644 --- a/minecode/tests/miners/test_rubygems.py +++ b/minecode/tests/miners/test_rubygems.py @@ -52,8 +52,8 @@ def test_check_gem_file_visitor_routes(self): "https://rubygems.org/downloads/O365RubyEasy-0.0.1.gem", # upper ] - for route in routes: - self.assertTrue(visit_router.resolve(route)) + for gem_file_visitor_route in routes: + self.assertTrue(visit_router.resolve(gem_file_visitor_route)) def test_RubyGemsIndexVisitor_latest(self): uri = "http://rubygems.org/specs.4.8.gz" diff --git a/minecode/tests/test_api.py b/minecode/tests/test_api.py index 9eab2af7..5996a6d1 100644 --- a/minecode/tests/test_api.py +++ b/minecode/tests/test_api.py @@ -30,7 +30,9 @@ class ScannableURIAPITestCase(JsonBasedTesting, TestCase): def setUp(self): self.scan_queue_worker_user = User.objects.create_user( - username="username", email="e@mail.com", password="secret" + username="username", + email="e@mail.com", + password="secret", # NOQA ) scan_queue_workers_group, _ = Group.objects.get_or_create( name="scan_queue_workers" @@ -49,7 +51,7 @@ def setUp(self): self.staff_user = User.objects.create_user( username="staff_username", email="staff_e@mail.com", - password="secret", + password="secret", # NOQA is_staff=True, ) self.staff_auth = f"Token {self.staff_user.auth_token.key}" @@ -60,7 +62,7 @@ def setUp(self): self.regular_user = User.objects.create_user( username="regular_username", email="regular_e@mail.com", - password="secret", + password="secret", # NOQA ) self.regular_auth = f"Token {self.regular_user.auth_token.key}" self.regular_client = APIClient(enforce_csrf_checks=True) diff --git a/minecode/tests/test_command.py b/minecode/tests/test_command.py index 053e8133..ed8ae7eb 100644 --- a/minecode/tests/test_command.py +++ b/minecode/tests/test_command.py @@ -21,7 +21,7 @@ class CommandTest(MiningTestCase): def test_listing_command(self): td = self.get_test_loc("command") osc = "ls" if not ON_WINDOWS else "dir" - c = 
'%(osc)s "%(td)s"' % locals() + c = f'{osc} "{td}"' cmd = command.Command(c) out, err = cmd.execute() err = [e for e in err] diff --git a/minecode/tests/test_migrations.py b/minecode/tests/test_migrations.py index 83ec1311..45dd1243 100644 --- a/minecode/tests/test_migrations.py +++ b/minecode/tests/test_migrations.py @@ -133,51 +133,6 @@ def test_set_is_visitable_for_maven_index_uris(self): self.assertEqual(results, expected) -class TestSetIsVisitableForMavenIndexURIs(TestMigrations): - app_name = "minecode" - migrate_from = "0025_populate_has_error_fields" - migrate_to = "0026_set_is_visitable_for_maven_index_uris" - - def setUpBeforeMigration(self, apps): - # using get_model to avoid circular import - ResourceURI = apps.get_model("minecode", "ResourceURI") - - self.resource_uris = [ - ResourceURI.objects.create( - uri="maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.7.jar", - is_visitable=True, - ), - ResourceURI.objects.create( - uri="maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.8.jar", - is_visitable=False, - ), - ] - - for resource_uri in self.resource_uris: - resource_uri.save() - - def test_set_is_visitable_for_maven_index_uris(self): - # using get_model to avoid circular import - ResourceURI = apps.get_model("minecode", "ResourceURI") - results = list( - ResourceURI.objects.values( - "uri", - "is_visitable", - ).all() - ) - expected = [ - { - "is_visitable": False, - "uri": "maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.8.jar", - }, - { - "is_visitable": False, - "uri": "maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.7.jar", - }, - ] - self.assertEqual(results, expected) - - class TestReplaceHttpWithHttpsInMavenURIs(TestMigrations): app_name = "minecode" migrate_from = "0026_set_is_visitable_for_maven_index_uris" diff --git a/minecode/tests/test_models.py b/minecode/tests/test_models.py index 4e528e5a..7c180808 100644 --- a/minecode/tests/test_models.py +++ b/minecode/tests/test_models.py @@ -371,7 +371,7 @@ def test_get_mappables(self): self.assertEqual(2, ResourceURI.objects.get_mappables().count()) self.resource1.last_map_date = timezone.now() self.resource1.save() - resource1 = ResourceURI.objects.get(id=self.resource1.id) + # resource2 should only be mappable self.assertEqual([self.resource2], list(ResourceURI.objects.get_mappables())) def test_get_mappables__map_error_must_make_a_resourceuri_non_mappable(self): @@ -381,7 +381,6 @@ def test_get_mappables__map_error_must_make_a_resourceuri_non_mappable(self): self.resource2.map_error = "Some error happened" self.resource1.save() self.resource2.save() - resource1 = ResourceURI.objects.get(id=self.resource1.id) self.assertEqual([], list(ResourceURI.objects.get_mappables())) @@ -457,9 +456,7 @@ def setUp(self): ) def test_ScannableURI_create_basic_record(self): - scannable_uri = ScannableURI.objects.create( - uri=self.test_uri, package=self.test_package - ) + ScannableURI.objects.create(uri=self.test_uri, package=self.test_package) result = ScannableURI.objects.get(uri=self.test_uri) self.assertEqual(self.test_uri, result.uri) self.assertEqual(self.test_package, result.package) diff --git a/minecode/tests/test_rsync.py b/minecode/tests/test_rsync.py index 9e768b98..35b8e50c 100644 --- a/minecode/tests/test_rsync.py +++ b/minecode/tests/test_rsync.py @@ -279,7 +279,7 @@ def test_fetch_directory_no_recurse(self): self.assertTrue("bar/this" not in results) def get_dirs(self, input_path): - 
"""Returns only the type and path from rsync entries.""" + """Return only the type and path from rsync entries.""" return [ (e["type"], e["path"]) for e in rsync.directory_entries(input_path) diff --git a/minecode/utils.py b/minecode/utils.py index 846ebe6b..64d38b2a 100644 --- a/minecode/utils.py +++ b/minecode/utils.py @@ -48,14 +48,14 @@ def stringify_null_purl_fields(data): def sha1(content): - """Returns the sha1 hash of the given content.""" + """Return the sha1 hash of the given content.""" h = hashlib.sha1() h.update(content) return h.hexdigest() def md5(content): - """Returns the md5 hash of the given content.""" + """Return the md5 hash of the given content.""" h = hashlib.md5() h.update(content) return h.hexdigest() @@ -86,14 +86,14 @@ def __eq__(self, other): def normalize_trailing_slash(uri): - """Appends a trailing slash if the URI is not ending with one already.""" + """Append a trailing slash if the URI is not ending with one already.""" if not uri.endswith("/"): uri += "/" return uri def is_ascii(s): - """Returns True is the string is ASCII.""" + """Return True is the string is ASCII.""" return all(ord(c) < 128 for c in s) @@ -109,7 +109,7 @@ def clean_html_entities(text): def clean_description(text): - """Cleans the description text from HTML entities and from extra whitespaces.""" + """Clean the description text from HTML entities and from extra whitespaces.""" return " ".join(clean_html_entities(text.strip()).split()) @@ -212,19 +212,18 @@ def get_http_response(uri, timeout=10): requests_args["timeout"] = timeout if not uri.lower().startswith("http"): - raise Exception("get_http_response: Not an HTTP URI: %(uri)r" % locals()) + raise Exception(f"get_http_response: Not an HTTP URI: {uri}") try: response = requests.get(uri, **requests_args) except (ConnectionError, InvalidSchema): - logger.error("get_http_response: Download failed for %(uri)r" % locals()) + logger.error(f"get_http_response: Download failed for {uri}") raise status = response.status_code if status != 200: raise Exception( - "get_http_response: Download failed for %(uri)r " - "with %(status)r" % locals() + f"get_http_response: Download failed for {uri} " f"with {status}" ) return response @@ -249,8 +248,8 @@ def get_package_sha1(package, field="repository_download_url"): def fetch_and_write_file_from_url(url): """ - Fetches a file from the `url` and returns the location for the - temporary file. Return None if the url is not reachable. + Fetch a file from the `url` and return the location for the temporary file. + Return None if the url is not reachable. 
""" response = requests.get(url) if not response.ok: @@ -327,7 +326,7 @@ def extract_file(location): target = event.target break except Exception as e: - logger.error("extract_file: failed for %(location)r" % locals()) + logger.error(f"extract_file: failed for {location}") raise e return target @@ -389,9 +388,8 @@ def _setup(self): smaller_queryset = copy.deepcopy(self._base_queryset)[ i : i + self.max_obj_num ] - logger.debug("Grabbing next %s objects from DB" % self.max_obj_num) - for obj in smaller_queryset.iterator(): - yield obj + logger.debug(f"Grabbing next {self.max_obj_num} objects from DB") + yield from smaller_queryset.iterator() def __iter__(self): return self._generator diff --git a/minecode/utils_test.py b/minecode/utils_test.py index 5c114c33..31a7c393 100644 --- a/minecode/utils_test.py +++ b/minecode/utils_test.py @@ -189,16 +189,16 @@ def mocked_requests_get_for_uris(url_to_location, *args, **kwargs): def response_403(url, request): - """Returns a HTTP response with status 403.""" + """Return a HTTP response with status 403.""" return {"status_code": 403, "content": ""} class JsonBasedTestingMixin(TestCase): def _normalize_results(self, data, fields_to_remove=[]): """ - Returns the `data`, where any `package_uid` value has been normalized - with `purl_with_fake_uuid()` and fields from `fields_to_remove` have - been removed from `data`. + Return `data`, where any `package_uid` value has been normalized with + `purl_with_fake_uuid()` and fields from `fields_to_remove` have been + removed from it. """ if type(data) in (list, ReturnList): return [self._normalize_results(entry, fields_to_remove) for entry in data] @@ -230,7 +230,6 @@ def _remove_fields_from_results(self, data, fields_to_remove): ] if type(data) in (dict, OrderedDict, ReturnDict): - normalized_data = {} # Remove fields from results and normalize Package UIDs for field in fields_to_remove: if field not in data: @@ -266,9 +265,9 @@ def check_expected_results( class JsonBasedTesting(JsonBasedTestingMixin, FileBasedTesting): def _normalize_results(self, data, fields_to_remove=[]): """ - Returns the `data`, where any `package_uid` value has been normalized - with `purl_with_fake_uuid()` and fields from `fields_to_remove` have - been removed from `data`. + Return the `data`, where any `package_uid` value has been normalized + with `purl_with_fake_uuid()` and fields from `fields_to_remove` that + have been removed from `data`. """ if type(data) in (list, ReturnList): return [self._normalize_results(entry, fields_to_remove) for entry in data] @@ -300,7 +299,6 @@ def _remove_fields_from_results(self, data, fields_to_remove): ] if type(data) in (dict, OrderedDict, ReturnDict): - normalized_data = {} # Remove fields from results and normalize Package UIDs for field in fields_to_remove: if field not in data: @@ -356,10 +354,6 @@ def check_expected_uris( def model_to_dict(instance, fields=None, exclude=None): """ - Copied from django.forms.models. model_to_dict - license: bsd-new - see ABOUT file for details - Return a mapping containing the data in ``instance``. ``fields`` is an optional list of field names. If provided, only the @@ -371,6 +365,10 @@ def model_to_dict(instance, fields=None, exclude=None): Note that all field with the word "date" in their name is converted to a boolean value to abstract test results from dates. + + Copied from django.forms.models. 
model_to_dict + license: bsd-new + see ABOUT file for details """ opts = instance._meta data = dict() diff --git a/minecode/version.py b/minecode/version.py index 9322a0de..5d25ea9a 100644 --- a/minecode/version.py +++ b/minecode/version.py @@ -136,7 +136,7 @@ def strip_version_tags(path): def strip_extensions(path): - """ "Remove well known archive extensions from end of path.""" + """Remove well known archive extensions from end of path.""" for rext in ARCHIVE_FILE_EXT_RES: path = rext.sub("", path) return path diff --git a/packagedb/api.py b/packagedb/api.py index daacf6ed..a6097b2e 100644 --- a/packagedb/api.py +++ b/packagedb/api.py @@ -551,7 +551,7 @@ def create(self, request): if package_set: package_set = package_set - except: + except Exception: message = {"update_status": f"No Package Set found for {uuid}"} return Response(message, status=status.HTTP_400_BAD_REQUEST) diff --git a/packagedb/management/commands/create_source_repo_packages.py b/packagedb/management/commands/create_source_repo_packages.py index 0fa0817c..2d32653c 100644 --- a/packagedb/management/commands/create_source_repo_packages.py +++ b/packagedb/management/commands/create_source_repo_packages.py @@ -74,7 +74,6 @@ def handle(self, *args, **options): for row in rows: # Look up the package the row is for by using the purl to query the db. purl = row["purl"] - source_purl = row["source_purl"] print(f"Processing packages for: {purl}") package = get_package_object_from_purl(package_url=purl) if not package: diff --git a/packagedb/management/commands/run_scheduler.py b/packagedb/management/commands/run_scheduler.py index e09fc561..e8f065f1 100644 --- a/packagedb/management/commands/run_scheduler.py +++ b/packagedb/management/commands/run_scheduler.py @@ -29,4 +29,4 @@ class Command(rqscheduler.Command): def handle(self, *args, **kwargs): clear_zombie_watch_schedules() init_watch_scheduled() - super(Command, self).handle(*args, **kwargs) + super().handle(*args, **kwargs) diff --git a/packagedb/models.py b/packagedb/models.py index 0a1fcb09..9b35266e 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -79,8 +79,7 @@ def paginated(self, per_page=5000): paginator = Paginator(self, per_page=per_page) for page_number in paginator.page_range: page = paginator.page(page_number) - for object in page.object_list: - yield object + yield from page.object_list VCS_CHOICES = [ @@ -94,7 +93,7 @@ def paginated(self, per_page=5000): class LowerCaseField(models.CharField): def __init__(self, *args, **kwargs): - super(LowerCaseField, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) def to_python(self, value): return str(value).lower() @@ -217,8 +216,8 @@ class ExtraDataFieldMixin(models.Model): ) def update_extra_data(self, data): - """Updates the `extra_data` field with the provided `data` dict.""" - if type(data) != dict: + """Update `extra_data` field with the provided `data` dict.""" + if type(data) is not dict: raise ValueError("Argument `data` value must be a dict()") self.extra_data.update(data) @@ -670,7 +669,7 @@ def update_fields(self, save=False, **values_by_fields): unsaved_models = [] if field == "dependencies": for dep_data in value: - if isinstance(dep_data, (dict, OrderedDict)): + if isinstance(dep_data, dict | OrderedDict): dep = DependentPackage( package=self, purl=dep_data.get("purl"), @@ -692,7 +691,7 @@ def update_fields(self, save=False, **values_by_fields): if field == "parties": for party_data in value: - if isinstance(party_data, (dict, OrderedDict)): + if isinstance(party_data, dict 
| OrderedDict): party = Party( package=self, type=party_data.get("type"), @@ -711,7 +710,7 @@ def update_fields(self, save=False, **values_by_fields): if field == "resources": for resource_data in value: - if isinstance(resource_data, (dict, OrderedDict)): + if isinstance(resource_data, dict | OrderedDict): resource = Resource( package=self, path=resource_data.get("path"), @@ -1377,7 +1376,7 @@ def save(self, *args, **kwargs): if schedule: self.schedule_work_id = self.create_new_job() - super(PackageWatch, self).save(*args, **kwargs) + super().save(*args, **kwargs) def delete(self, *args, **kwargs): """Clear associated watch schedule.""" diff --git a/packagedb/package_managers.py b/packagedb/package_managers.py index b25e101d..898dc4da 100644 --- a/packagedb/package_managers.py +++ b/packagedb/package_managers.py @@ -58,7 +58,7 @@ def get_response(url, content_type="json", headers=None): try: resp = requests.get(url=url, headers=headers) - except: + except Exception: logger.error(traceback.format_exc()) return if not resp.status_code == 200: diff --git a/packagedb/schedules.py b/packagedb/schedules.py index f171aa62..916490ef 100644 --- a/packagedb/schedules.py +++ b/packagedb/schedules.py @@ -32,8 +32,8 @@ def get_next_execution(watch_interval_days, last_watch_date): def schedule_watch(watch): """ - Takes a `PackageWatch` object as input and schedule a - recurring job using `rq_scheduler` to watch the package. + Schedule a recurring job with a `PackageWatch` object using `rq_scheduler` + to watch the package. """ watch_interval = watch.watch_interval last_watch_date = watch.last_watch_date diff --git a/packagedb/serializers.py b/packagedb/serializers.py index 251d3f79..1433f248 100644 --- a/packagedb/serializers.py +++ b/packagedb/serializers.py @@ -143,8 +143,8 @@ class Meta: class PackageInPackageSetAPISerializer(ModelSerializer): """ - This serializes Package instances within a PackageSet that is within a - Package in the PackageAPISerializer + Serialize Package instances within a PackageSet that is within a Package in + the PackageAPISerializer """ class Meta: @@ -243,8 +243,8 @@ def get_package_content(self, obj): class PackageInPackageSetMetadataSerializer(ModelSerializer): """ - This serializes Package instances within a PackageSet that is within a - Package in the PackageMetadataSerializer + Serialize Package instances within a PackageSet that is within a Package in + the PackageMetadataSerializer """ class Meta: diff --git a/packagedb/tests/test_api.py b/packagedb/tests/test_api.py index 9076e905..118ae6cb 100644 --- a/packagedb/tests/test_api.py +++ b/packagedb/tests/test_api.py @@ -120,9 +120,7 @@ def test_api_resource_list_endpoint_returns_none_when_filtering_by_blank_uuid(se self.assertEqual(2, response.data.get("count")) def test_api_resource_list_endpoint_filters_by_package1_uuid(self): - response = self.client.get( - f"/api/resources/?package={self.package1.uuid}" - ) + response = self.client.get(f"/api/resources/?package={self.package1.uuid}") self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(1, response.data.get("count")) @@ -143,9 +141,7 @@ def test_api_resource_list_endpoint_filters_by_package1_uuid(self): self.assertEqual(test_resource.get("type"), self.resource1.type) def test_api_resource_list_endpoint_filters_by_package2_uuid(self): - response = self.client.get( - f"/api/resources/?package={self.package2.uuid}" - ) + response = self.client.get(f"/api/resources/?package={self.package2.uuid}") self.assertEqual(response.status_code, 
status.HTTP_200_OK) self.assertEqual(1, response.data.get("count")) @@ -172,15 +168,8 @@ def test_api_resource_list_endpoint_returns_none_when_filtering_by_wrong_purl(se self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(0, response.data.get("count")) - def test_api_resource_list_endpoint_returns_none_when_filtering_by_blank_uuid(self): - response = self.client.get("/api/resources/?purl={}".format("")) - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(2, response.data.get("count")) - def test_api_resource_list_endpoint_filters_by_package1_purl(self): - response = self.client.get( - f"/api/resources/?purl={self.package1.package_url}" - ) + response = self.client.get(f"/api/resources/?purl={self.package1.package_url}") self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(1, response.data.get("count")) @@ -201,9 +190,7 @@ def test_api_resource_list_endpoint_filters_by_package1_purl(self): self.assertEqual(test_resource.get("type"), self.resource1.type) def test_api_resource_list_endpoint_filters_by_package2_purl(self): - response = self.client.get( - f"/api/resources/?purl={self.package2.package_url}" - ) + response = self.client.get(f"/api/resources/?purl={self.package2.package_url}") self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(1, response.data.get("count")) @@ -407,15 +394,11 @@ def test_package_api_list_endpoint_filter_by_purl_fields_ignores_case(self): if key not in ["type", "namespace", "name"]: continue - response = self.client.get( - f"/api/packages/?{key}={value.lower()}" - ) + response = self.client.get(f"/api/packages/?{key}={value.lower()}") self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(1, response.data.get("count")) - response = self.client.get( - f"/api/packages/?{key}={value.upper()}" - ) + response = self.client.get(f"/api/packages/?{key}={value.upper()}") self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(1, response.data.get("count")) @@ -794,7 +777,8 @@ def test_package_api_purl_filter_by_both_query_params(self): ) def test_package_api_purl_filter_by_two_purl_values_on_multiple_packages(self): - extra_test_package = Package.objects.create( + # extra test package + Package.objects.create( download_url="https://extra-pkg.com/download", type="generic", name="extra-name", @@ -981,7 +965,6 @@ def setUp(self): def test_package_live(self): purl_str = "pkg:maven/org.apache.twill/twill-core@0.12.0" download_url = "https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar" - purl_sources_str = f"{purl_str}?classifier=sources" sources_download_url = "https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar" self.assertEqual(0, Package.objects.filter(download_url=download_url).count()) @@ -1017,7 +1000,6 @@ def test_package_live(self): def test_package_live_works_with_purl2vcs(self): purl = "pkg:maven/org.elasticsearch.plugin/elasticsearch-scripting-painless-spi@6.8.15" download_url = "https://repo1.maven.org/maven2/org/elasticsearch/plugin/elasticsearch-scripting-painless-spi/6.8.15/elasticsearch-scripting-painless-spi-6.8.15.jar" - purl_sources_str = f"{purl}?classifier=sources" sources_download_url = "https://repo1.maven.org/maven2/org/elasticsearch/plugin/elasticsearch-scripting-painless-spi/6.8.15/elasticsearch-scripting-painless-spi-6.8.15-sources.jar" self.assertEqual(0, Package.objects.filter(download_url=download_url).count()) diff --git 
a/packagedb/tests/test_filters.py b/packagedb/tests/test_filters.py index baf910f1..8d859b2a 100644 --- a/packagedb/tests/test_filters.py +++ b/packagedb/tests/test_filters.py @@ -23,7 +23,7 @@ def test_scanpipe_filters_package_filterset_search(self): version="1.0.0", download_url="https://example.com/foo-1.0.0.jar", ) - p2 = Package.objects.create( + Package.objects.create( type="maven", namespace="org.somethingelse", name="foo", diff --git a/packagedb/tests/test_models.py b/packagedb/tests/test_models.py index d4620060..4babf5ee 100644 --- a/packagedb/tests/test_models.py +++ b/packagedb/tests/test_models.py @@ -543,5 +543,5 @@ def test_get_or_none(self): assert package assert ( Package.objects.filter(download_url="http://a.ab-foobar").get_or_none() - == None + is None ) diff --git a/packagedb/tests/test_throttling.py b/packagedb/tests/test_throttling.py index d8dcc5eb..d17557fe 100644 --- a/packagedb/tests/test_throttling.py +++ b/packagedb/tests/test_throttling.py @@ -21,7 +21,9 @@ class ThrottleApiTests(APITestCase): def setUp(self): # create a basic user self.user = User.objects.create_user( - username="username", email="e@mail.com", password="secret" + username="username", + email="e@mail.com", + password="secret", # NOQA ) self.auth = f"Token {self.user.auth_token.key}" self.csrf_client = APIClient(enforce_csrf_checks=True) @@ -31,7 +33,7 @@ def setUp(self): self.staff_user = User.objects.create_user( username="staff_username", email="staff_e@mail.com", - password="secret", + password="secret", # NOQA is_staff=True, ) self.staff_auth = f"Token {self.staff_user.auth_token.key}" diff --git a/packagedb/to_purl.py b/packagedb/to_purl.py index 9eac48b1..5066b23a 100644 --- a/packagedb/to_purl.py +++ b/packagedb/to_purl.py @@ -58,7 +58,7 @@ def list(self, request): go_import = validated_data.get("go_package") try: purl = get_golang_purl(go_import) - except: + except Exception: return Response( {"errors": "`@` is not supported either in import or go.mod string"}, status=status.HTTP_400_BAD_REQUEST, diff --git a/purl2vcs/src/purl2vcs/find_source_repo.py b/purl2vcs/src/purl2vcs/find_source_repo.py index 6fe917a7..e3608d3f 100644 --- a/purl2vcs/src/purl2vcs/find_source_repo.py +++ b/purl2vcs/src/purl2vcs/find_source_repo.py @@ -147,7 +147,7 @@ def get_source_package_and_add_to_package_set(package): download_url = get_download_url(str(source_purl)) if not download_url: return - except: + except Exception: logger.error(f"Error getting download_url for {source_purl}") return From 75dd5e623aeea80bb1a0d82a7abcb8fa6397a406 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 13 Aug 2024 13:56:38 -0700 Subject: [PATCH 07/12] Check code style and format in CI #512 #515 Signed-off-by: Jono Yang --- etc/ci/azure-posix.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/etc/ci/azure-posix.yml b/etc/ci/azure-posix.yml index b139a66c..2fe1633c 100644 --- a/etc/ci/azure-posix.yml +++ b/etc/ci/azure-posix.yml @@ -41,5 +41,8 @@ jobs: sudo cp .env /etc/scancodeio displayName: '${{ pyver }} - Configure' + - script: make check + displayName: '${{ pyver }} - Check code style and format' + - script: $(test_suite) displayName: '${{ pyver }} - $(test_suite_label) on ${{ parameters.job_name }}' From 73454da297ce4905c972b05b277adb56c4428bb7 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 13 Aug 2024 14:18:00 -0700 Subject: [PATCH 08/12] Update fdroid test results #512 #515 Signed-off-by: Jono Yang --- minecode/miners/fdroid.py | 1 + minecode/models.py | 5 +- .../index-v2-visited-expected-mapped.json | 996 
++++++++++-------- pyproject.toml | 2 +- 4 files changed, 585 insertions(+), 419 deletions(-) diff --git a/minecode/miners/fdroid.py b/minecode/miners/fdroid.py index 86bf5f8e..127f34f2 100644 --- a/minecode/miners/fdroid.py +++ b/minecode/miners/fdroid.py @@ -222,6 +222,7 @@ def build_packages(purl, data): # these must exists or there is no F-Droid package file = version_data["file"] filename = file["name"] + filename = filename.lstrip("/") sha256 = file["sha256"] size = file["size"] download_url = f"https://f-droid.org/repo/{filename}" diff --git a/minecode/models.py b/minecode/models.py index 009c5bb6..a728b1af 100644 --- a/minecode/models.py +++ b/minecode/models.py @@ -21,10 +21,9 @@ from minecode import map_router # UnusedImport here! -# But importing the mappers and visitors module triggers routes registration -from minecode import mappers # NOQA +# But importing the miners module triggers routes registration +from minecode import miners # NOQA from minecode import visit_router -from minecode import visitors # NOQA from packagedb.models import Package logger = logging.getLogger(__name__) diff --git a/minecode/tests/testfiles/fdroid/index-v2-visited-expected-mapped.json b/minecode/tests/testfiles/fdroid/index-v2-visited-expected-mapped.json index ad733c73..3abbe968 100644 --- a/minecode/tests/testfiles/fdroid/index-v2-visited-expected-mapped.json +++ b/minecode/tests/testfiles/fdroid/index-v2-visited-expected-mapped.json @@ -83,11 +83,11 @@ "Time" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/An.stop_10_src.tar.gz", - "size":558337, + "download_url":"https://f-droid.org/repo/An.stop_10.apk", + "size":66218, "sha1":null, "md5":null, - "sha256":"d489eee14c4693a4aa742c490f2566d2d17170a3977cc04993d96ba4588384c8", + "sha256":"78ec7805f5a49b156fbd5f6af174c1cd8ae9900c9c7af2b2df021aca8cd5eae9", "sha512":null, "bug_tracking_url":"https://github.com/jdmonin/anstop/issues", "code_view_url":null, @@ -125,14 +125,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-2.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/An.stop@10?download_url=https://f-droid.org/repo/An.stop_10_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/An.stop", - "repository_download_url":"https://f-droid.org/repo/An.stop_10_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/An.stop_10.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/An.stop@10" @@ -221,11 +223,11 @@ "Time" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/An.stop_9_src.tar.gz", - "size":63674, + "download_url":"https://f-droid.org/repo/An.stop_9.apk", + "size":49763, "sha1":null, "md5":null, - "sha256":"af6baad5820f1b86e8aeeec00bd3a46ad929dbae28dd3615e9ef94a555bd309f", + "sha256":"79f5253bab33cf4030b01fec457fd6ffa4fd54b631ee0bc4c1549fbb69ca6680", "sha512":null, "bug_tracking_url":"https://github.com/jdmonin/anstop/issues", "code_view_url":null, @@ -263,14 +265,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-2.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/An.stop@9?download_url=https://f-droid.org/repo/An.stop_9_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/An.stop", - 
"repository_download_url":"https://f-droid.org/repo/An.stop_9_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/An.stop_9.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/An.stop@9" @@ -359,11 +363,11 @@ "Navigation" ], "homepage_url":"https://sourceforge.net/projects/androidspeedo", - "download_url":"https://f-droid.org/repo/SpeedoMeterApp.main_1_src.tar.gz", - "size":3962, + "download_url":"https://f-droid.org/repo/SpeedoMeterApp.main_1.apk", + "size":6618, "sha1":null, "md5":null, - "sha256":"079d03842cbc4730495a916298e7e5b6874a583c504529d194ba0785d38153e4", + "sha256":"c838e3b53794fa4958b913a1e540167aa2e52c904f1d462352d86d4124982664", "sha512":null, "bug_tracking_url":null, "code_view_url":null, @@ -401,14 +405,16 @@ "other_license_detections":[], "extracted_license_statement":"PublicDomain", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/SpeedoMeterApp.main@1?download_url=https://f-droid.org/repo/SpeedoMeterApp.main_1_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/SpeedoMeterApp.main", - "repository_download_url":"https://f-droid.org/repo/SpeedoMeterApp.main_1_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/SpeedoMeterApp.main_1.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/SpeedoMeterApp.main@1" @@ -497,11 +503,11 @@ "Multimedia" ], "homepage_url":"http://jimroal.com/slist.htm", - "download_url":"https://f-droid.org/repo/a2dp.Vol_169_src.tar.gz", - "size":1361887, + "download_url":"https://f-droid.org/repo/a2dp.Vol_169.apk", + "size":2748737, "sha1":null, "md5":null, - "sha256":"83ce527b19a42424eef5f6f3dc837a1c8d9a5d6c3e646c67b845f3bf7cb43b4e", + "sha256":"f67ef52502faf8dbcef310c122d9efe7871effdd8f9fe0ca93b9925513152d37", "sha512":null, "bug_tracking_url":"https://github.com/jroal/a2dpvolume/issues", "code_view_url":null, @@ -539,14 +545,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/a2dp.Vol@169?download_url=https://f-droid.org/repo/a2dp.Vol_169_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/a2dp.Vol", - "repository_download_url":"https://f-droid.org/repo/a2dp.Vol_169_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/a2dp.Vol_169.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/a2dp.Vol@169" @@ -635,11 +643,11 @@ "Multimedia" ], "homepage_url":"http://jimroal.com/slist.htm", - "download_url":"https://f-droid.org/repo/a2dp.Vol_137_src.tar.gz", - "size":566140, + "download_url":"https://f-droid.org/repo/a2dp.Vol_137.apk", + "size":826576, "sha1":null, "md5":null, - "sha256":"86fb52d03061de322f07a2dd9d4ee20946f19181c85fba6672009c9d369600be", + "sha256":"fb913cccb0957c5b52caea48c3ef7a3ce1d616219b47eed65482097920fe8cc5", "sha512":null, "bug_tracking_url":"https://github.com/jroal/a2dpvolume/issues", "code_view_url":null, @@ -677,14 +685,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/a2dp.Vol@137?download_url=https://f-droid.org/repo/a2dp.Vol_137_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], 
"repository_homepage_url":"https://f-droid.org/en/packages/a2dp.Vol", - "repository_download_url":"https://f-droid.org/repo/a2dp.Vol_137_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/a2dp.Vol_137.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/a2dp.Vol@137" @@ -773,11 +783,11 @@ "Multimedia" ], "homepage_url":"http://jimroal.com/slist.htm", - "download_url":"https://f-droid.org/repo/a2dp.Vol_135_src.tar.gz", - "size":558871, + "download_url":"https://f-droid.org/repo/a2dp.Vol_135.apk", + "size":769268, "sha1":null, "md5":null, - "sha256":"4d3054e02935ec461a3ab070fb1f9101a5f7daaddbdeec4ea191a028d00f28af", + "sha256":"970e93aea1888c80056c46513a16ef214b3f8df0f9105720fd3b1479440327d1", "sha512":null, "bug_tracking_url":"https://github.com/jroal/a2dpvolume/issues", "code_view_url":null, @@ -815,14 +825,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/a2dp.Vol@135?download_url=https://f-droid.org/repo/a2dp.Vol_135_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/a2dp.Vol", - "repository_download_url":"https://f-droid.org/repo/a2dp.Vol_135_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/a2dp.Vol_135.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/a2dp.Vol@135" @@ -913,11 +925,11 @@ "Reading" ], "homepage_url":"http://aarddict.org", - "download_url":"https://f-droid.org/repo/aarddict.android_26_src.tar.gz", - "size":3818485, + "download_url":"https://f-droid.org/repo/aarddict.android_26.apk", + "size":1904989, "sha1":null, "md5":null, - "sha256":"8032cf918f0495204e8e00254fcd04ca4fb9c514323f42442100b37283ac0f59", + "sha256":"b72981914c91641d92508ef801fdd99aebe919590b4f981876f306e37c69af91", "sha512":null, "bug_tracking_url":"https://github.com/aarddict/android/issues", "code_view_url":null, @@ -955,14 +967,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/aarddict.android@26?download_url=https://f-droid.org/repo/aarddict.android_26_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/aarddict.android", - "repository_download_url":"https://f-droid.org/repo/aarddict.android_26_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/aarddict.android_26.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/aarddict.android@26" @@ -1053,11 +1067,11 @@ "Reading" ], "homepage_url":"http://aarddict.org", - "download_url":"https://f-droid.org/repo/aarddict.android_25_src.tar.gz", - "size":3818492, + "download_url":"https://f-droid.org/repo/aarddict.android_25.apk", + "size":1904813, "sha1":null, "md5":null, - "sha256":"f7f158ec6cf3506a0012ec1eac18a6fb2907a63fcafd9edbea4142ffcc189b0e", + "sha256":"bd0737ffd7d25bf23f6bd31f3e3b2aa195c5fe523631b44b2e9d975c69898231", "sha512":null, "bug_tracking_url":"https://github.com/aarddict/android/issues", "code_view_url":null, @@ -1095,14 +1109,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/aarddict.android@25?download_url=https://f-droid.org/repo/aarddict.android_25_src.tar.gz" + ], 
"file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/aarddict.android", - "repository_download_url":"https://f-droid.org/repo/aarddict.android_25_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/aarddict.android_25.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/aarddict.android@25" @@ -1193,11 +1209,11 @@ "Reading" ], "homepage_url":"http://aarddict.org", - "download_url":"https://f-droid.org/repo/aarddict.android_24_src.tar.gz", - "size":3818377, + "download_url":"https://f-droid.org/repo/aarddict.android_24.apk", + "size":1904614, "sha1":null, "md5":null, - "sha256":"84b4e90d306c72f78555a7e127623e550493987357db0709f644be29b8bceea3", + "sha256":"a0320f5360b05c6d05b7ba4ffccf6e9b563a2369c68e4da0f5f407cd9ff6479e", "sha512":null, "bug_tracking_url":"https://github.com/aarddict/android/issues", "code_view_url":null, @@ -1235,14 +1251,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/aarddict.android@24?download_url=https://f-droid.org/repo/aarddict.android_24_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/aarddict.android", - "repository_download_url":"https://f-droid.org/repo/aarddict.android_24_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/aarddict.android_24.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/aarddict.android@24" @@ -1347,11 +1365,11 @@ "Multimedia" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_51_src.tar.gz", - "size":3140102, + "download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_51.apk", + "size":3302945, "sha1":null, "md5":null, - "sha256":"f718641268c7863e0094f055123ca14b0e16c6501914c71ce985eadcd1965bb7", + "sha256":"1d6dae4beae98f1004519dc2338e98592585ce0ca0aabf5c38f6f214e5253361", "sha512":null, "bug_tracking_url":"https://github.com/communitymedia/mediaphone/issues", "code_view_url":null, @@ -1389,14 +1407,16 @@ "other_license_detections":[], "extracted_license_statement":"LGPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ac.robinson.mediaphone@51?download_url=https://f-droid.org/repo/ac.robinson.mediaphone_51_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ac.robinson.mediaphone", - "repository_download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_51_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_51.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ac.robinson.mediaphone@51" @@ -1501,11 +1521,11 @@ "Multimedia" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_50_src.tar.gz", - "size":3131516, + "download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_50.apk", + "size":3274124, "sha1":null, "md5":null, - "sha256":"33f4b8590c6d488f709fcc1a9d89a85ae8cfe94e7ac6437d7e4b2fc67225c908", + "sha256":"fd1b70d9a1e24a8471a1bf947dab5bd5735e8cae6c10018b7868c88aa7198f7d", "sha512":null, "bug_tracking_url":"https://github.com/communitymedia/mediaphone/issues", "code_view_url":null, @@ -1543,14 +1563,16 @@ 
"other_license_detections":[], "extracted_license_statement":"LGPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ac.robinson.mediaphone@50?download_url=https://f-droid.org/repo/ac.robinson.mediaphone_50_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ac.robinson.mediaphone", - "repository_download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_50_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_50.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ac.robinson.mediaphone@50" @@ -1655,11 +1677,11 @@ "Multimedia" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_48_src.tar.gz", - "size":3128518, + "download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_48.apk", + "size":3086625, "sha1":null, "md5":null, - "sha256":"185bf70ff585dc81db325fff92c68e126c34b9ae49e85de3e89091f5fb5cf055", + "sha256":"6198e1e1ff295743980112b190e762d1c642ef1344b40a21803a564fdb6661d6", "sha512":null, "bug_tracking_url":"https://github.com/communitymedia/mediaphone/issues", "code_view_url":null, @@ -1697,14 +1719,16 @@ "other_license_detections":[], "extracted_license_statement":"LGPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ac.robinson.mediaphone@48?download_url=https://f-droid.org/repo/ac.robinson.mediaphone_48_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ac.robinson.mediaphone", - "repository_download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_48_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_48.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ac.robinson.mediaphone@48" @@ -1809,11 +1833,11 @@ "Internet" ], "homepage_url":"http://acrdevelopment.org", - "download_url":"https://f-droid.org/repo/acr.browser.lightning_101_src.tar.gz", - "size":1020192, + "download_url":"https://f-droid.org/repo/acr.browser.lightning_101.apk", + "size":3424126, "sha1":null, "md5":null, - "sha256":"c1ac724cab6f12be29c83ead8ab3df04804383f220e42e374b80f9c4a44b3f60", + "sha256":"820f4f9977a20b060b4091db2b35cff8cd360e060f94aa742255c845747a2d7f", "sha512":null, "bug_tracking_url":"https://github.com/anthonycr/Lightning-Browser/issues", "code_view_url":null, @@ -1851,14 +1875,16 @@ "other_license_detections":[], "extracted_license_statement":"MPL-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/acr.browser.lightning@101?download_url=https://f-droid.org/repo/acr.browser.lightning_101_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/acr.browser.lightning", - "repository_download_url":"https://f-droid.org/repo/acr.browser.lightning_101_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/acr.browser.lightning_101.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/acr.browser.lightning@101" @@ -1963,11 +1989,11 @@ "Internet" ], "homepage_url":"http://acrdevelopment.org", - "download_url":"https://f-droid.org/repo/acr.browser.lightning_100_src.tar.gz", - "size":1811930, + 
"download_url":"https://f-droid.org/repo/acr.browser.lightning_100.apk", + "size":3050894, "sha1":null, "md5":null, - "sha256":"8995f5e677b3d2585e43624df5fd357bfa40da9c703b61daf57fa3dbd9d7123e", + "sha256":"db8d5bfc217eda28485f69ece19cb12e4c2f4502a7b925b18db79980b31b72af", "sha512":null, "bug_tracking_url":"https://github.com/anthonycr/Lightning-Browser/issues", "code_view_url":null, @@ -2005,14 +2031,16 @@ "other_license_detections":[], "extracted_license_statement":"MPL-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/acr.browser.lightning@100?download_url=https://f-droid.org/repo/acr.browser.lightning_100_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/acr.browser.lightning", - "repository_download_url":"https://f-droid.org/repo/acr.browser.lightning_100_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/acr.browser.lightning_100.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/acr.browser.lightning@100" @@ -2117,11 +2145,11 @@ "Internet" ], "homepage_url":"http://acrdevelopment.org", - "download_url":"https://f-droid.org/repo/acr.browser.lightning_96_src.tar.gz", - "size":914859, + "download_url":"https://f-droid.org/repo/acr.browser.lightning_96.apk", + "size":2687399, "sha1":null, "md5":null, - "sha256":"e9cc630379a8478dfa6716bcd7c5f75fadf03519b78d6316f909d1c7055b15e2", + "sha256":"a225314a83ea7518e0f8105d602171985fbc884d606dbd9669a63a2928856147", "sha512":null, "bug_tracking_url":"https://github.com/anthonycr/Lightning-Browser/issues", "code_view_url":null, @@ -2159,14 +2187,16 @@ "other_license_detections":[], "extracted_license_statement":"MPL-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/acr.browser.lightning@96?download_url=https://f-droid.org/repo/acr.browser.lightning_96_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/acr.browser.lightning", - "repository_download_url":"https://f-droid.org/repo/acr.browser.lightning_96_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/acr.browser.lightning_96.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/acr.browser.lightning@96" @@ -2271,11 +2301,11 @@ "Money" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/ademar.bitac_6_src.tar.gz", - "size":4996182, + "download_url":"https://f-droid.org/repo/ademar.bitac_6.apk", + "size":3077855, "sha1":null, "md5":null, - "sha256":"96606dd18416352a408585cfb0946e0b49bbc701366e5365f6d3b5d9084b81d6", + "sha256":"660a7730cfa9f11b488395fd657b1f054881e6b0736b6d7f96a61229c43dc77a", "sha512":null, "bug_tracking_url":"https://gitlab.com/ademar111190/BitAC/-/issues", "code_view_url":null, @@ -2313,14 +2343,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ademar.bitac@6?download_url=https://f-droid.org/repo/ademar.bitac_6_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ademar.bitac", - "repository_download_url":"https://f-droid.org/repo/ademar.bitac_6_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ademar.bitac_6.apk", "api_data_url":null, 
"datasource_id":null, "purl":"pkg:fdroid/ademar.bitac@6" @@ -2425,11 +2457,11 @@ "Money" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/ademar.bitac_5_src.tar.gz", - "size":4995815, + "download_url":"https://f-droid.org/repo/ademar.bitac_5.apk", + "size":2919369, "sha1":null, "md5":null, - "sha256":"fe0a4dbf0fcc76df2f85e38758abcc1808fc7fa355ffdf8237c41452670f647e", + "sha256":"efd2052b37ea6cb44836f400926b79d974cf3528e6e6e2c8a72569c63cfdbdd6", "sha512":null, "bug_tracking_url":"https://gitlab.com/ademar111190/BitAC/-/issues", "code_view_url":null, @@ -2467,14 +2499,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ademar.bitac@5?download_url=https://f-droid.org/repo/ademar.bitac_5_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ademar.bitac", - "repository_download_url":"https://f-droid.org/repo/ademar.bitac_5_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ademar.bitac_5.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ademar.bitac@5" @@ -2579,11 +2613,11 @@ "Theming" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/ademar.textlauncher_8_src.tar.gz", - "size":981806, + "download_url":"https://f-droid.org/repo/ademar.textlauncher_8.apk", + "size":25418, "sha1":null, "md5":null, - "sha256":"d81d49a13ee829998b63114457fc0a086273e634b196bb9dc1dd26426ba0c8a8", + "sha256":"b2a7a09d0e0dafa341912eb4563fe0a8fdc02536e438862d8a83b8ba55c282ec", "sha512":null, "bug_tracking_url":"https://gitlab.com/ademar111190/textlauncher/-/issues", "code_view_url":null, @@ -2621,14 +2655,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ademar.textlauncher@8?download_url=https://f-droid.org/repo/ademar.textlauncher_8_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ademar.textlauncher", - "repository_download_url":"https://f-droid.org/repo/ademar.textlauncher_8_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ademar.textlauncher_8.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ademar.textlauncher@8" @@ -2733,11 +2769,11 @@ "Theming" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/ademar.textlauncher_7_src.tar.gz", - "size":981742, + "download_url":"https://f-droid.org/repo/ademar.textlauncher_7.apk", + "size":14591, "sha1":null, "md5":null, - "sha256":"e0f03ad0f1d1e547b5cec5fe665fd796e0e48db865d595f7d802393b875b74d5", + "sha256":"cfb8c97c5f524d88d854e604d60a83b199b27b124743bbe8e415d027ea3a32be", "sha512":null, "bug_tracking_url":"https://gitlab.com/ademar111190/textlauncher/-/issues", "code_view_url":null, @@ -2775,14 +2811,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ademar.textlauncher@7?download_url=https://f-droid.org/repo/ademar.textlauncher_7_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ademar.textlauncher", - "repository_download_url":"https://f-droid.org/repo/ademar.textlauncher_7_src.tar.gz", 
+ "repository_download_url":"https://f-droid.org/repo/ademar.textlauncher_7.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ademar.textlauncher@7" @@ -2887,11 +2925,11 @@ "Multimedia" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/agersant.polaris_415293114_src.tar.gz", - "size":25450214, + "download_url":"https://f-droid.org/repo/agersant.polaris_415293114.apk", + "size":4952438, "sha1":null, "md5":null, - "sha256":"4284e29988d213bfcfaedf85b37fe9df15d8af30d6f262e935251df83b1b314a", + "sha256":"0b88ef7adfcc601a45db988b9cfc5296cdc4c7c095b07edc43aada63c4e1211d", "sha512":null, "bug_tracking_url":"https://github.com/agersant/polaris-android/issues", "code_view_url":null, @@ -2929,14 +2967,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/agersant.polaris@415293114?download_url=https://f-droid.org/repo/agersant.polaris_415293114_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/agersant.polaris", - "repository_download_url":"https://f-droid.org/repo/agersant.polaris_415293114_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/agersant.polaris_415293114.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/agersant.polaris@415293114" @@ -3041,11 +3081,11 @@ "Multimedia" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/agersant.polaris_415293112_src.tar.gz", - "size":25450029, + "download_url":"https://f-droid.org/repo/agersant.polaris_415293112.apk", + "size":4831957, "sha1":null, "md5":null, - "sha256":"8e67cf488d40d88dc4d0f95a8d6e712c1e63940b21a21a6d6ca1f65f2e5dfc8b", + "sha256":"b82a59f1cc21d014c6947d174abb7f533d780ffdfc3794cc193e8268ba93efe3", "sha512":null, "bug_tracking_url":"https://github.com/agersant/polaris-android/issues", "code_view_url":null, @@ -3083,14 +3123,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/agersant.polaris@415293112?download_url=https://f-droid.org/repo/agersant.polaris_415293112_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/agersant.polaris", - "repository_download_url":"https://f-droid.org/repo/agersant.polaris_415293112_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/agersant.polaris_415293112.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/agersant.polaris@415293112" @@ -3195,11 +3237,11 @@ "Multimedia" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/agersant.polaris_48_src.tar.gz", - "size":25386695, + "download_url":"https://f-droid.org/repo/agersant.polaris_48.apk", + "size":3831332, "sha1":null, "md5":null, - "sha256":"7b85afd1d1a30ba36b2ad0dce1151d246efaaab708cca580280b50776527f107", + "sha256":"134d18e2d9dbd10a974e2806e27149951c9e4ec1c53e1ba2179ec859ae6b928a", "sha512":null, "bug_tracking_url":"https://github.com/agersant/polaris-android/issues", "code_view_url":null, @@ -3237,14 +3279,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/agersant.polaris@48?download_url=https://f-droid.org/repo/agersant.polaris_48_src.tar.gz" + ], "file_references":[], 
"is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/agersant.polaris", - "repository_download_url":"https://f-droid.org/repo/agersant.polaris_48_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/agersant.polaris_48.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/agersant.polaris@48" @@ -3333,11 +3377,11 @@ "Multimedia" ], "homepage_url":"https://codeberg.org/agrigolo/chubby-click", - "download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_22_src.tar.gz", - "size":1684337, + "download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_22.apk", + "size":2274962, "sha1":null, "md5":null, - "sha256":"b90b73ade10a3478842e279a6d23456b8f7b3affae886f1c451d9d90f0d9c412", + "sha256":"cb357c3f12258e40955d1d16ad45a0996b482e07c6bce13587aa6f722c164030", "sha512":null, "bug_tracking_url":"https://codeberg.org/agrigolo/chubby-click/issues", "code_view_url":null, @@ -3375,14 +3419,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/agrigolo.chubbyclick@22?download_url=https://f-droid.org/repo/agrigolo.chubbyclick_22_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/agrigolo.chubbyclick", - "repository_download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_22_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_22.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/agrigolo.chubbyclick@22" @@ -3471,11 +3517,11 @@ "Multimedia" ], "homepage_url":"https://codeberg.org/agrigolo/chubby-click", - "download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_21_src.tar.gz", - "size":1681634, + "download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_21.apk", + "size":2270785, "sha1":null, "md5":null, - "sha256":"bf6c2be1fcb8308c6a0f6e870c3466fff65cc970758ead7a1274985d626dc7ae", + "sha256":"ddeb3fadfac02ccf53c2cf6ac4fc8ecc6cf58dd2411a888c7a7d03a2035cf78e", "sha512":null, "bug_tracking_url":"https://codeberg.org/agrigolo/chubby-click/issues", "code_view_url":null, @@ -3513,14 +3559,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/agrigolo.chubbyclick@21?download_url=https://f-droid.org/repo/agrigolo.chubbyclick_21_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/agrigolo.chubbyclick", - "repository_download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_21_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_21.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/agrigolo.chubbyclick@21" @@ -3609,11 +3657,11 @@ "Multimedia" ], "homepage_url":"https://codeberg.org/agrigolo/chubby-click", - "download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_20_src.tar.gz", - "size":1681501, + "download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_20.apk", + "size":2270785, "sha1":null, "md5":null, - "sha256":"37dc91804c33ea5ecfb33e37f3fc68fb8d377d550ab522b5c1726644d501b4a0", + "sha256":"e2bd8970b208b36d9b7b37f44f29d664531d9fe36a4a0e9fa697a6d6589c4166", "sha512":null, 
"bug_tracking_url":"https://codeberg.org/agrigolo/chubby-click/issues", "code_view_url":null, @@ -3651,14 +3699,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/agrigolo.chubbyclick@20?download_url=https://f-droid.org/repo/agrigolo.chubbyclick_20_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/agrigolo.chubbyclick", - "repository_download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_20_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_20.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/agrigolo.chubbyclick@20" @@ -3747,11 +3797,11 @@ "Internet" ], "homepage_url":"https://susi.ai/", - "download_url":"https://f-droid.org/repo/ai.susi_16_src.tar.gz", - "size":3964656, + "download_url":"https://f-droid.org/repo/ai.susi_16.apk", + "size":14344225, "sha1":null, "md5":null, - "sha256":"22cdb59a9d33eb02e0f44aa5206dcc3194a21b020cd2b3b063e6a718d6878cae", + "sha256":"6f851010809953054e7bb8fdd7e1f86a80e00cef6d91450518a408c4b0b59195", "sha512":null, "bug_tracking_url":"https://github.com/fossasia/susi_android/issues", "code_view_url":null, @@ -3789,14 +3839,16 @@ "other_license_detections":[], "extracted_license_statement":"Apache-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ai.susi@16?download_url=https://f-droid.org/repo/ai.susi_16_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ai.susi", - "repository_download_url":"https://f-droid.org/repo/ai.susi_16_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ai.susi_16.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ai.susi@16" @@ -3885,11 +3937,11 @@ "Internet" ], "homepage_url":"https://susi.ai/", - "download_url":"https://f-droid.org/repo/ai.susi_15_src.tar.gz", - "size":3868859, + "download_url":"https://f-droid.org/repo/ai.susi_15.apk", + "size":11555015, "sha1":null, "md5":null, - "sha256":"c74134a1e1986579d92f381cbd3109c3b77fcd3c52c59e7083069e49e604bbd0", + "sha256":"4f25e1679ced9ea42ceb32677bb0d0310fdf818e6cda9a9f7e8b81e73e73c8e9", "sha512":null, "bug_tracking_url":"https://github.com/fossasia/susi_android/issues", "code_view_url":null, @@ -3927,14 +3979,16 @@ "other_license_detections":[], "extracted_license_statement":"Apache-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ai.susi@15?download_url=https://f-droid.org/repo/ai.susi_15_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ai.susi", - "repository_download_url":"https://f-droid.org/repo/ai.susi_15_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ai.susi_15.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ai.susi@15" @@ -4023,11 +4077,11 @@ "Internet" ], "homepage_url":"https://susi.ai/", - "download_url":"https://f-droid.org/repo/ai.susi_14_src.tar.gz", - "size":3863221, + "download_url":"https://f-droid.org/repo/ai.susi_14.apk", + "size":10331217, "sha1":null, "md5":null, - "sha256":"ef878413b874564939fb0c37e1971a780488df402c46505a72886d2e7b18b0c4", + 
"sha256":"d0c0443f153b75fcb155aa062d8b8af6109d779c0b8d02946a00c074b5f9c305", "sha512":null, "bug_tracking_url":"https://github.com/fossasia/susi_android/issues", "code_view_url":null, @@ -4065,14 +4119,16 @@ "other_license_detections":[], "extracted_license_statement":"Apache-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ai.susi@14?download_url=https://f-droid.org/repo/ai.susi_14_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ai.susi", - "repository_download_url":"https://f-droid.org/repo/ai.susi_14_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ai.susi_14.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ai.susi@14" @@ -4163,11 +4219,11 @@ "Time" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/akk.astro.droid.moonphase_2_src.tar.gz", - "size":183772, + "download_url":"https://f-droid.org/repo/akk.astro.droid.moonphase_2.apk", + "size":185892, "sha1":null, "md5":null, - "sha256":"c26b0ed5cbd07d9e839e7f31f7757479e142e0194264c3afb0622b14c3e8f571", + "sha256":"20eea522f8d41dbbe9f8fa7204f076918dd9420c562468e5fd72059e6e66615e", "sha512":null, "bug_tracking_url":"https://github.com/andviane/moon/issues", "code_view_url":null, @@ -4205,14 +4261,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/akk.astro.droid.moonphase@2?download_url=https://f-droid.org/repo/akk.astro.droid.moonphase_2_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/akk.astro.droid.moonphase", - "repository_download_url":"https://f-droid.org/repo/akk.astro.droid.moonphase_2_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/akk.astro.droid.moonphase_2.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/akk.astro.droid.moonphase@2" @@ -4303,11 +4361,11 @@ "Time" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/akk.astro.droid.moonphase_1_src.tar.gz", - "size":183725, + "download_url":"https://f-droid.org/repo/akk.astro.droid.moonphase_1.apk", + "size":190393, "sha1":null, "md5":null, - "sha256":"084f4b57a30ad16b6479c54b4e5d778e38aa9f26712e998718243c0156a756db", + "sha256":"7e053f07f595f78863ddfc73ea55e5b19f1f504c2b8f06f61fc772521488e03f", "sha512":null, "bug_tracking_url":"https://github.com/andviane/moon/issues", "code_view_url":null, @@ -4345,14 +4403,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/akk.astro.droid.moonphase@1?download_url=https://f-droid.org/repo/akk.astro.droid.moonphase_1_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/akk.astro.droid.moonphase", - "repository_download_url":"https://f-droid.org/repo/akk.astro.droid.moonphase_1_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/akk.astro.droid.moonphase_1.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/akk.astro.droid.moonphase@1" @@ -4441,11 +4501,11 @@ "System" ], "homepage_url":"http://ed.am/dev/android/export-contacts", - 
"download_url":"https://f-droid.org/repo/am.ed.exportcontacts_10003_src.tar.gz", - "size":51527, + "download_url":"https://f-droid.org/repo/am.ed.exportcontacts_10003.apk", + "size":58051, "sha1":null, "md5":null, - "sha256":"0a719754dd28701e7782bd63f05e2ddfeb195bcba462efa31e8c28eb79f1b39f", + "sha256":"dcbed1c712db0b614d53e9517a287541349a33ac00d1a391806f839643cce5e9", "sha512":null, "bug_tracking_url":"http://dev.ed.am/export-contacts", "code_view_url":null, @@ -4483,14 +4543,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.ed.exportcontacts@10003?download_url=https://f-droid.org/repo/am.ed.exportcontacts_10003_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.ed.exportcontacts", - "repository_download_url":"https://f-droid.org/repo/am.ed.exportcontacts_10003_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.ed.exportcontacts_10003.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.ed.exportcontacts@10003" @@ -4579,11 +4641,11 @@ "System" ], "homepage_url":"http://ed.am/dev/android/import-contacts", - "download_url":"https://f-droid.org/repo/am.ed.importcontacts_10304_src.tar.gz", - "size":68741, + "download_url":"https://f-droid.org/repo/am.ed.importcontacts_10304.apk", + "size":80326, "sha1":null, "md5":null, - "sha256":"921396b7c6858188406578358db80472c9e0053335662b96999e2fda259eba09", + "sha256":"8680fbf57af95a5ab69d91502b6337549e770d23db318a9ceaed957c88a92845", "sha512":null, "bug_tracking_url":"http://dev.ed.am/import-contacts", "code_view_url":null, @@ -4621,14 +4683,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.ed.importcontacts@10304?download_url=https://f-droid.org/repo/am.ed.importcontacts_10304_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.ed.importcontacts", - "repository_download_url":"https://f-droid.org/repo/am.ed.importcontacts_10304_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.ed.importcontacts_10304.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.ed.importcontacts@10304" @@ -4717,11 +4781,11 @@ "System" ], "homepage_url":"http://ed.am/dev/android/import-contacts", - "download_url":"https://f-droid.org/repo/am.ed.importcontacts_10303_src.tar.gz", - "size":76762, + "download_url":"https://f-droid.org/repo/am.ed.importcontacts_10303.apk", + "size":85429, "sha1":null, "md5":null, - "sha256":"605a93496bb1ecbdb30b6af075b9fbd3ee23e61a971360f28dcddda2e567ca24", + "sha256":"f12e21058329841874e08db08eae230dd03a1437466a8f31c9485658bee53e63", "sha512":null, "bug_tracking_url":"http://dev.ed.am/import-contacts", "code_view_url":null, @@ -4759,14 +4823,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.ed.importcontacts@10303?download_url=https://f-droid.org/repo/am.ed.importcontacts_10303_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.ed.importcontacts", - 
"repository_download_url":"https://f-droid.org/repo/am.ed.importcontacts_10303_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.ed.importcontacts_10303.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.ed.importcontacts@10303" @@ -4855,11 +4921,11 @@ "System" ], "homepage_url":"http://ed.am/dev/android/import-contacts", - "download_url":"https://f-droid.org/repo/am.ed.importcontacts_10301_src.tar.gz", - "size":75346, + "download_url":"https://f-droid.org/repo/am.ed.importcontacts_10301.apk", + "size":84309, "sha1":null, "md5":null, - "sha256":"22a6b3892c8bc4ce5b087639589f74418b4eaf5984a295a15e0d3b0cc577fd21", + "sha256":"c150a1a6e420e1e1ea535c9d26666d76f2d30bc1038a0cb2e871b359327aebcb", "sha512":null, "bug_tracking_url":"http://dev.ed.am/import-contacts", "code_view_url":null, @@ -4897,14 +4963,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.ed.importcontacts@10301?download_url=https://f-droid.org/repo/am.ed.importcontacts_10301_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.ed.importcontacts", - "repository_download_url":"https://f-droid.org/repo/am.ed.importcontacts_10301_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.ed.importcontacts_10301.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.ed.importcontacts@10301" @@ -4993,11 +5061,11 @@ "Internet" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/am.zoom.mbrowser_12_src.tar.gz", - "size":615615, + "download_url":"https://f-droid.org/repo/am.zoom.mbrowser_12.apk", + "size":255897, "sha1":null, "md5":null, - "sha256":"5a52dc5903a482738986a2e6e17e2a9f3ef3841d3e941f743972a3bde37ba63a", + "sha256":"8e13ebb31680b56c802428b09cb7ac630505dd13989ff5de81067d90754d58d2", "sha512":null, "bug_tracking_url":"https://github.com/chelovek84/mBrowser/issues", "code_view_url":null, @@ -5035,14 +5103,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.zoom.mbrowser@12?download_url=https://f-droid.org/repo/am.zoom.mbrowser_12_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.zoom.mbrowser", - "repository_download_url":"https://f-droid.org/repo/am.zoom.mbrowser_12_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.zoom.mbrowser_12.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.zoom.mbrowser@12" @@ -5131,11 +5201,11 @@ "Internet" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/am.zoom.mbrowser_11_src.tar.gz", - "size":611638, + "download_url":"https://f-droid.org/repo/am.zoom.mbrowser_11.apk", + "size":255472, "sha1":null, "md5":null, - "sha256":"5eba16c41e9af0c4a38fe43421be2dd7e855c7b1e302d4df84f136c8c4d33f8a", + "sha256":"30325a66e6d15229c8c29ad5b164c0e130841c67588a03870632cec1de1390ab", "sha512":null, "bug_tracking_url":"https://github.com/chelovek84/mBrowser/issues", "code_view_url":null, @@ -5173,14 +5243,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + 
"pkg:fdroid/am.zoom.mbrowser@11?download_url=https://f-droid.org/repo/am.zoom.mbrowser_11_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.zoom.mbrowser", - "repository_download_url":"https://f-droid.org/repo/am.zoom.mbrowser_11_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.zoom.mbrowser_11.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.zoom.mbrowser@11" @@ -5269,11 +5341,11 @@ "Internet" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/am.zoom.mbrowser_10_src.tar.gz", - "size":611065, + "download_url":"https://f-droid.org/repo/am.zoom.mbrowser_10.apk", + "size":254344, "sha1":null, "md5":null, - "sha256":"69b49b63e495d03c430f5e917e01b87dcd593607382adc279be73beb478c9974", + "sha256":"b17d82c7f67d6cc1f1c6f495355fb1ffd8f96eca23fc9d4a0951d208a3171855", "sha512":null, "bug_tracking_url":"https://github.com/chelovek84/mBrowser/issues", "code_view_url":null, @@ -5311,14 +5383,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.zoom.mbrowser@10?download_url=https://f-droid.org/repo/am.zoom.mbrowser_10_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.zoom.mbrowser", - "repository_download_url":"https://f-droid.org/repo/am.zoom.mbrowser_10_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.zoom.mbrowser_10.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.zoom.mbrowser@10" @@ -5407,11 +5481,11 @@ "System" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/am.zoom.mlauncher_7_src.tar.gz", - "size":729123, + "download_url":"https://f-droid.org/repo/am.zoom.mlauncher_7.apk", + "size":251861, "sha1":null, "md5":null, - "sha256":"2493ef2a2bb1216990d494012abdb62be872b1bf1460a2ecc55e1ba838202638", + "sha256":"d135c514f0d59cc8eed8ddd72ace781610cf01222dc2985caeecfa002a840cae", "sha512":null, "bug_tracking_url":"https://github.com/chelovek84/mLauncher/issues", "code_view_url":null, @@ -5449,14 +5523,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.zoom.mlauncher@7?download_url=https://f-droid.org/repo/am.zoom.mlauncher_7_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.zoom.mlauncher", - "repository_download_url":"https://f-droid.org/repo/am.zoom.mlauncher_7_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.zoom.mlauncher_7.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.zoom.mlauncher@7" @@ -5545,11 +5621,11 @@ "System" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/am.zoom.mlauncher_6_src.tar.gz", - "size":729127, + "download_url":"https://f-droid.org/repo/am.zoom.mlauncher_6.apk", + "size":251855, "sha1":null, "md5":null, - "sha256":"1a4d77d9a38fb16a7a01b18efafddaaf8f5b23d0bc09c0a26db336521d4e5ff1", + "sha256":"f37d48959ab15f75a140f55a2cd0d29dab27be3fb6cced615c03c50f12f62d36", "sha512":null, "bug_tracking_url":"https://github.com/chelovek84/mLauncher/issues", "code_view_url":null, @@ -5587,14 +5663,16 @@ "other_license_detections":[], 
"extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.zoom.mlauncher@6?download_url=https://f-droid.org/repo/am.zoom.mlauncher_6_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.zoom.mlauncher", - "repository_download_url":"https://f-droid.org/repo/am.zoom.mlauncher_6_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.zoom.mlauncher_6.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.zoom.mlauncher@6" @@ -5683,11 +5761,11 @@ "Multimedia" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/amirz.dngprocessor_5_src.tar.gz", - "size":201270, + "download_url":"https://f-droid.org/repo/amirz.dngprocessor_5.apk", + "size":249134, "sha1":null, "md5":null, - "sha256":"680ea228d34737d4ff1d98d81b7da439f22c8d987c7f0bfdecdfcbd22136e7c7", + "sha256":"8a8e803d239717837e709c84de06bd2d10306836d6443eb1ec98f588786fc631", "sha512":null, "bug_tracking_url":"https://github.com/amirzaidi/DNGProcessor/issues", "code_view_url":null, @@ -5725,14 +5803,16 @@ "other_license_detections":[], "extracted_license_statement":"LGPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/amirz.dngprocessor@5?download_url=https://f-droid.org/repo/amirz.dngprocessor_5_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/amirz.dngprocessor", - "repository_download_url":"https://f-droid.org/repo/amirz.dngprocessor_5_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/amirz.dngprocessor_5.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/amirz.dngprocessor@5" @@ -5839,11 +5919,11 @@ "Theming" ], "homepage_url":"http://www.reddit.com/u/AmirZ", - "download_url":"https://f-droid.org/repo/amirz.rootless.nexuslauncher_30911_src.tar.gz", - "size":1225005, + "download_url":"https://f-droid.org/repo/amirz.rootless.nexuslauncher_30911.apk", + "size":1693305, "sha1":null, "md5":null, - "sha256":"5e6413aa4a619c842bd430a5002abb44cc5be0ade9bab105516c634666eed15d", + "sha256":"7fa44d560dc4577374d45176220de2c0b00a71e09d6d148cdea4e0a52d38404a", "sha512":null, "bug_tracking_url":"https://github.com/amirzaidi/Launcher3/issues", "code_view_url":null, @@ -5881,14 +5961,16 @@ "other_license_detections":[], "extracted_license_statement":"Apache-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/amirz.rootless.nexuslauncher@30911?download_url=https://f-droid.org/repo/amirz.rootless.nexuslauncher_30911_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/amirz.rootless.nexuslauncher", - "repository_download_url":"https://f-droid.org/repo/amirz.rootless.nexuslauncher_30911_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/amirz.rootless.nexuslauncher_30911.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/amirz.rootless.nexuslauncher@30911" @@ -5993,11 +6075,11 @@ "Internet" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_16_src.tar.gz", - "size":111974, + "download_url":"https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_16.apk", + "size":1971112, "sha1":null, 
"md5":null, - "sha256":"968c79b32cfc86df5c0b638f8d7bfc5baeb5c6a6f730a0977a0471a5d9c779f0", + "sha256":"db1ee1a0fcddad2b8b99a6f53834718050c54adefa50a08cf64cb00efa50af27", "sha512":null, "bug_tracking_url":"https://github.com/gryphius/androdns/issues", "code_view_url":null, @@ -6035,14 +6117,16 @@ "other_license_detections":[], "extracted_license_statement":"Apache-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/androdns.android.leetdreams.ch.androdns@16?download_url=https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_16_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/androdns.android.leetdreams.ch.androdns", - "repository_download_url":"https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_16_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_16.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/androdns.android.leetdreams.ch.androdns@16" @@ -6147,11 +6231,11 @@ "Internet" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_15_src.tar.gz", - "size":111953, + "download_url":"https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_15.apk", + "size":1971108, "sha1":null, "md5":null, - "sha256":"df576b5e63d85b4d1801eb4814e53d9c2d89a836306dc7e9b639b44ef623786a", + "sha256":"cb30ddf5e59b91b938271983d415656de6e370f23cbdf633786036831ca3d3db", "sha512":null, "bug_tracking_url":"https://github.com/gryphius/androdns/issues", "code_view_url":null, @@ -6189,14 +6273,16 @@ "other_license_detections":[], "extracted_license_statement":"Apache-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/androdns.android.leetdreams.ch.androdns@15?download_url=https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_15_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/androdns.android.leetdreams.ch.androdns", - "repository_download_url":"https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_15_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_15.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/androdns.android.leetdreams.ch.androdns@15" @@ -6301,11 +6387,11 @@ "System" ], "homepage_url":"https://github.com/antlersoft/android-vnc-viewer/wiki/Documentation", - "download_url":"https://f-droid.org/repo/android.androidVNC_13_src.tar.gz", - "size":400409, + "download_url":"https://f-droid.org/repo/android.androidVNC_13.apk", + "size":243294, "sha1":null, "md5":null, - "sha256":"6a27130023302f7aa0974ceac9c9c9b2439b644906269d73042210ba4fbb63ac", + "sha256":"eb2682f9ab9ccc5926d1bd504995af31ce15c14b7810efd3f88ab9c3acefa4f9", "sha512":null, "bug_tracking_url":"https://github.com/antlersoft/android-vnc-viewer/issues", "code_view_url":null, @@ -6343,14 +6429,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-2.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/android.androidVNC@13?download_url=https://f-droid.org/repo/android.androidVNC_13_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], 
"repository_homepage_url":"https://f-droid.org/en/packages/android.androidVNC", - "repository_download_url":"https://f-droid.org/repo/android.androidVNC_13_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/android.androidVNC_13.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/android.androidVNC@13" @@ -6439,11 +6527,11 @@ "Games" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/android.game.prboom_31_src.tar.gz", - "size":6292542, + "download_url":"https://f-droid.org/repo/android.game.prboom_31.apk", + "size":883589, "sha1":null, "md5":null, - "sha256":"eb7058ec653b4884b18c9c9c918764820fb1d11b28d86c3f72cc2bfb18bb4ffd", + "sha256":"0a88b31c5cc465d83fc77703adddf3f4769af32a9c6505b636ae062a7d351bc0", "sha512":null, "bug_tracking_url":null, "code_view_url":null, @@ -6481,14 +6569,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/android.game.prboom@31?download_url=https://f-droid.org/repo/android.game.prboom_31_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/android.game.prboom", - "repository_download_url":"https://f-droid.org/repo/android.game.prboom_31_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/android.game.prboom_31.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/android.game.prboom@31" @@ -6593,11 +6683,11 @@ "System" ], "homepage_url":"https://fakestandby.jonasbernard.de/", - "download_url":"https://f-droid.org/repo/android.jonas.fakestandby_11_src.tar.gz", - "size":10393451, + "download_url":"https://f-droid.org/repo/android.jonas.fakestandby_11.apk", + "size":1695060, "sha1":null, "md5":null, - "sha256":"9c3ebdd1a733ac2d77106ce273f725549e87dd09dbf9d7d7b45828ad6ec91225", + "sha256":"1621370f48a2ad0a41a3ef528896c3c8c6fa169be9c7deba49d9ca642fbcb887", "sha512":null, "bug_tracking_url":"https://github.com/JonasBernard/FakeStandby/issues", "code_view_url":null, @@ -6635,14 +6725,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/android.jonas.fakestandby@11?download_url=https://f-droid.org/repo/android.jonas.fakestandby_11_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/android.jonas.fakestandby", - "repository_download_url":"https://f-droid.org/repo/android.jonas.fakestandby_11_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/android.jonas.fakestandby_11.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/android.jonas.fakestandby@11" @@ -6747,11 +6839,11 @@ "System" ], "homepage_url":"https://fakestandby.jonasbernard.de/", - "download_url":"https://f-droid.org/repo/android.jonas.fakestandby_10_src.tar.gz", - "size":10390397, + "download_url":"https://f-droid.org/repo/android.jonas.fakestandby_10.apk", + "size":1703252, "sha1":null, "md5":null, - "sha256":"7b5a6d6a6611e2012ea42de0faccac07bf8d177729aa746c5e91c2d19d663fd2", + "sha256":"68782ca0f0fcd686dd3854879bbee3a2072dac798352263956e6079f46129b0a", "sha512":null, "bug_tracking_url":"https://github.com/JonasBernard/FakeStandby/issues", "code_view_url":null, @@ -6789,14 +6881,16 @@ "other_license_detections":[], 
"extracted_license_statement":"GPL-3.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/android.jonas.fakestandby@10?download_url=https://f-droid.org/repo/android.jonas.fakestandby_10_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/android.jonas.fakestandby", - "repository_download_url":"https://f-droid.org/repo/android.jonas.fakestandby_10_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/android.jonas.fakestandby_10.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/android.jonas.fakestandby@10" @@ -6901,11 +6995,11 @@ "System" ], "homepage_url":"https://fakestandby.jonasbernard.de/", - "download_url":"https://f-droid.org/repo/android.jonas.fakestandby_9_src.tar.gz", - "size":10388620, + "download_url":"https://f-droid.org/repo/android.jonas.fakestandby_9.apk", + "size":3218772, "sha1":null, "md5":null, - "sha256":"bc61d8b67d2dcf1c50c47a69b7a262e6161baec4767fb4293bb40f973e86a63b", + "sha256":"26ca0a5bc1fc7abf92640c8ab2b92a7bd8ab5b6ecc8d3586aaf3222c110d4201", "sha512":null, "bug_tracking_url":"https://github.com/JonasBernard/FakeStandby/issues", "code_view_url":null, @@ -6943,14 +7037,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/android.jonas.fakestandby@9?download_url=https://f-droid.org/repo/android.jonas.fakestandby_9_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/android.jonas.fakestandby", - "repository_download_url":"https://f-droid.org/repo/android.jonas.fakestandby_9_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/android.jonas.fakestandby_9.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/android.jonas.fakestandby@9" @@ -7041,11 +7137,11 @@ "Reading" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/android.nachiketa.ebookdownloader_5_src.tar.gz", - "size":79269, + "download_url":"https://f-droid.org/repo/android.nachiketa.ebookdownloader_5.apk", + "size":8730238, "sha1":null, "md5":null, - "sha256":"d8980a562d74247599e4e0ebfc6d790efc21a701fc23a687c0fc9d6af6a1f11c", + "sha256":"209b6119126a78aa2b529a0b4a340bb61ab841e4de67015388bf5b852f59d2cc", "sha512":null, "bug_tracking_url":"https://github.com/NachiketaVadera/EBookDownloader/issues", "code_view_url":null, @@ -7083,14 +7179,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/android.nachiketa.ebookdownloader@5?download_url=https://f-droid.org/repo/android.nachiketa.ebookdownloader_5_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/android.nachiketa.ebookdownloader", - "repository_download_url":"https://f-droid.org/repo/android.nachiketa.ebookdownloader_5_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/android.nachiketa.ebookdownloader_5.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/android.nachiketa.ebookdownloader@5" @@ -7181,11 +7279,11 @@ "Reading" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/android.nachiketa.ebookdownloader_4_src.tar.gz", - 
"size":79036, + "download_url":"https://f-droid.org/repo/android.nachiketa.ebookdownloader_4.apk", + "size":1669940, "sha1":null, "md5":null, - "sha256":"af48fce72d27dd111025d0e55eb4a1bafbefb709a86ce7afb22f351b1c12e31d", + "sha256":"ae5b380411a2c4333ec7865113d8b19fb2431800741eb0416784281e7113ba1a", "sha512":null, "bug_tracking_url":"https://github.com/NachiketaVadera/EBookDownloader/issues", "code_view_url":null, @@ -7223,14 +7321,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/android.nachiketa.ebookdownloader@4?download_url=https://f-droid.org/repo/android.nachiketa.ebookdownloader_4_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/android.nachiketa.ebookdownloader", - "repository_download_url":"https://f-droid.org/repo/android.nachiketa.ebookdownloader_4_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/android.nachiketa.ebookdownloader_4.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/android.nachiketa.ebookdownloader@4" @@ -7319,11 +7419,11 @@ "Graphics" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/anupam.acrylic_19_src.tar.gz", - "size":717863, + "download_url":"https://f-droid.org/repo/anupam.acrylic_19.apk", + "size":542178, "sha1":null, "md5":null, - "sha256":"8d7c23bab765ed1c765feeff5c6f357c06775224fa40a1d7da58b3be2668eca5", + "sha256":"df01309e3641fac77cd9bd356558e122e31f1317f988dfb4144ebad949e0ac84", "sha512":null, "bug_tracking_url":"https://github.com/valerio-bozzolan/AcrylicPaint/issues", "code_view_url":null, @@ -7361,14 +7461,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/anupam.acrylic@19?download_url=https://f-droid.org/repo/anupam.acrylic_19_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/anupam.acrylic", - "repository_download_url":"https://f-droid.org/repo/anupam.acrylic_19_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/anupam.acrylic_19.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/anupam.acrylic@19" @@ -7457,11 +7559,11 @@ "Graphics" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/anupam.acrylic_18_src.tar.gz", - "size":717113, + "download_url":"https://f-droid.org/repo/anupam.acrylic_18.apk", + "size":540073, "sha1":null, "md5":null, - "sha256":"5ea3a0c4a0ec2a1f771debbd3dcc5287274205afca6ba7c5f4dfcd42f39250a1", + "sha256":"b014d0a5febd73ee883b69f1054c015d222003559332a7b489d0fd1e49eca408", "sha512":null, "bug_tracking_url":"https://github.com/valerio-bozzolan/AcrylicPaint/issues", "code_view_url":null, @@ -7499,14 +7601,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/anupam.acrylic@18?download_url=https://f-droid.org/repo/anupam.acrylic_18_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/anupam.acrylic", - "repository_download_url":"https://f-droid.org/repo/anupam.acrylic_18_src.tar.gz", + 
"repository_download_url":"https://f-droid.org/repo/anupam.acrylic_18.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/anupam.acrylic@18" @@ -7595,11 +7699,11 @@ "Graphics" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/anupam.acrylic_17_src.tar.gz", - "size":460693, + "download_url":"https://f-droid.org/repo/anupam.acrylic_17.apk", + "size":461248, "sha1":null, "md5":null, - "sha256":"b561e4348ab05b5f8094181f31d99d664496759680b2afd393ad5a74d511eeab", + "sha256":"b06660fd279f443c74aa35b2fc79bd1689f04fb8a23a892d4cf73340de8a261e", "sha512":null, "bug_tracking_url":"https://github.com/valerio-bozzolan/AcrylicPaint/issues", "code_view_url":null, @@ -7637,14 +7741,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/anupam.acrylic@17?download_url=https://f-droid.org/repo/anupam.acrylic_17_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/anupam.acrylic", - "repository_download_url":"https://f-droid.org/repo/anupam.acrylic_17_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/anupam.acrylic_17.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/anupam.acrylic@17" @@ -7751,11 +7857,11 @@ "Multimedia" ], "homepage_url":"https://github.com/alextran1502/immich", - "download_url":"https://f-droid.org/repo/app.alextran.immich_54_src.tar.gz", - "size":47553968, + "download_url":"https://f-droid.org/repo/app.alextran.immich_54.apk", + "size":57002798, "sha1":null, "md5":null, - "sha256":"ca85cdb973ac46acc20b375226d7227d6010f9c8dfaafd962903181833d4cd96", + "sha256":"b6d1717613cbeb60cbf342b5543f009f9a04008fc9999ad34dc3ba87909d24be", "sha512":null, "bug_tracking_url":"https://github.com/alextran1502/immich/issues", "code_view_url":null, @@ -7793,14 +7899,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.alextran.immich@54?download_url=https://f-droid.org/repo/app.alextran.immich_54_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.alextran.immich", - "repository_download_url":"https://f-droid.org/repo/app.alextran.immich_54_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.alextran.immich_54.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.alextran.immich@54" @@ -7907,11 +8015,11 @@ "Multimedia" ], "homepage_url":"https://github.com/alextran1502/immich", - "download_url":"https://f-droid.org/repo/app.alextran.immich_53_src.tar.gz", - "size":47547734, + "download_url":"https://f-droid.org/repo/app.alextran.immich_53.apk", + "size":56920878, "sha1":null, "md5":null, - "sha256":"5e348ab1d8d101ae1b844d8a58b8b6da89ad41933eaf9a5110ab8ff78d765d61", + "sha256":"bd22db2ef4364a84d03b90f0d1770015fd9d500728311e2caa7adb1c8fed58a2", "sha512":null, "bug_tracking_url":"https://github.com/alextran1502/immich/issues", "code_view_url":null, @@ -7949,14 +8057,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.alextran.immich@53?download_url=https://f-droid.org/repo/app.alextran.immich_53_src.tar.gz" + ], "file_references":[], 
"is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.alextran.immich", - "repository_download_url":"https://f-droid.org/repo/app.alextran.immich_53_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.alextran.immich_53.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.alextran.immich@53" @@ -8063,11 +8173,11 @@ "Multimedia" ], "homepage_url":"https://github.com/alextran1502/immich", - "download_url":"https://f-droid.org/repo/app.alextran.immich_52_src.tar.gz", - "size":45165676, + "download_url":"https://f-droid.org/repo/app.alextran.immich_52.apk", + "size":56757038, "sha1":null, "md5":null, - "sha256":"173a3e7accf415037e20179f0ceb6053f12080eef63e9ab02daca277c7404b9e", + "sha256":"410b2fa7495cb7c8e2b6ba6b296f629e2ea2ca00524e47811339924711ac4ffc", "sha512":null, "bug_tracking_url":"https://github.com/alextran1502/immich/issues", "code_view_url":null, @@ -8105,14 +8215,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.alextran.immich@52?download_url=https://f-droid.org/repo/app.alextran.immich_52_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.alextran.immich", - "repository_download_url":"https://f-droid.org/repo/app.alextran.immich_52_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.alextran.immich_52.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.alextran.immich@52" @@ -8201,11 +8313,11 @@ "Money" ], "homepage_url":"https://crescent.cash/", - "download_url":"https://f-droid.org/repo/app.crescentcash.src_120_src.tar.gz", - "size":12692630, + "download_url":"https://f-droid.org/repo/app.crescentcash.src_120.apk", + "size":9839433, "sha1":null, "md5":null, - "sha256":"e9e21882e7f29dab34dfb53697f0d43866819c37b2305fbdd6ac0568e2fd09b4", + "sha256":"a02b64c18d0ec43adfe87686551416a4c68f36db3a25122fc474797eb4880aee", "sha512":null, "bug_tracking_url":"https://gitlab.com/pokkst/crescentcash/issues", "code_view_url":null, @@ -8243,14 +8355,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.crescentcash.src@120?download_url=https://f-droid.org/repo/app.crescentcash.src_120_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.crescentcash.src", - "repository_download_url":"https://f-droid.org/repo/app.crescentcash.src_120_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.crescentcash.src_120.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.crescentcash.src@120" @@ -8339,11 +8453,11 @@ "Money" ], "homepage_url":"https://crescent.cash/", - "download_url":"https://f-droid.org/repo/app.crescentcash.src_118_src.tar.gz", - "size":12652634, + "download_url":"https://f-droid.org/repo/app.crescentcash.src_118.apk", + "size":9837377, "sha1":null, "md5":null, - "sha256":"bd4d9664749a285a16fa6436311640f45745443d6c1aa9acd8d5f256e62cd822", + "sha256":"04b4db2625a90d7f413e963d4509342bbc870479700764b2a53cd87ad875b3f8", "sha512":null, "bug_tracking_url":"https://gitlab.com/pokkst/crescentcash/issues", "code_view_url":null, @@ -8381,14 
+8495,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.crescentcash.src@118?download_url=https://f-droid.org/repo/app.crescentcash.src_118_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.crescentcash.src", - "repository_download_url":"https://f-droid.org/repo/app.crescentcash.src_118_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.crescentcash.src_118.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.crescentcash.src@118" @@ -8477,11 +8593,11 @@ "Money" ], "homepage_url":"https://crescent.cash/", - "download_url":"https://f-droid.org/repo/app.crescentcash.src_117_src.tar.gz", - "size":12675494, + "download_url":"https://f-droid.org/repo/app.crescentcash.src_117.apk", + "size":9802865, "sha1":null, "md5":null, - "sha256":"bdb15b54aaef02ac84ab51508936297c2825f9a2735c59db39a945b71e2bb83b", + "sha256":"b376c41a96ac242da44944db44ada51f99b6818191b7f50ae0d44c97d8bae3af", "sha512":null, "bug_tracking_url":"https://gitlab.com/pokkst/crescentcash/issues", "code_view_url":null, @@ -8519,14 +8635,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.crescentcash.src@117?download_url=https://f-droid.org/repo/app.crescentcash.src_117_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.crescentcash.src", - "repository_download_url":"https://f-droid.org/repo/app.crescentcash.src_117_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.crescentcash.src_117.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.crescentcash.src@117" @@ -8615,11 +8733,11 @@ "Games" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3200000_src.tar.gz", - "size":1912837, + "download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3200000.apk", + "size":5435991, "sha1":null, "md5":null, - "sha256":"4ab9f2bfbd621ef624b11f22e23ad531d57a21c383bd1331f451b0bf5a517952", + "sha256":"2ae2f52f2487751b4519fa1670b282c974fa9c8544a621793c94f712106e849e", "sha512":null, "bug_tracking_url":"https://github.com/yourealwaysbe/forkyz/issues", "code_view_url":null, @@ -8657,14 +8775,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.crossword.yourealwaysbe.forkyz@3200000?download_url=https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3200000_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.crossword.yourealwaysbe.forkyz", - "repository_download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3200000_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3200000.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.crossword.yourealwaysbe.forkyz@3200000" @@ -8753,11 +8873,11 @@ "Games" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3100000_src.tar.gz", - 
"size":1480726, + "download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3100000.apk", + "size":5427462, "sha1":null, "md5":null, - "sha256":"2f73a7f95bf303c8000fec515d69a67f7fcfe4bf73ab745db551dcfd29f9fe0a", + "sha256":"a815d9aa7c680402d76b9c8fe291fc74f564f3e03bb170dc68c6275d3c3c7500", "sha512":null, "bug_tracking_url":"https://github.com/yourealwaysbe/forkyz/issues", "code_view_url":null, @@ -8795,14 +8915,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.crossword.yourealwaysbe.forkyz@3100000?download_url=https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3100000_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.crossword.yourealwaysbe.forkyz", - "repository_download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3100000_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3100000.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.crossword.yourealwaysbe.forkyz@3100000" @@ -8891,11 +9013,11 @@ "Games" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3000000_src.tar.gz", - "size":1480460, + "download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3000000.apk", + "size":5427462, "sha1":null, "md5":null, - "sha256":"31cd5689b9e2fd0aab497226b946e3ee799a10c995f13f3a5e2977891655baa1", + "sha256":"079de513de4ac49eee95bc21f03a15e732f4822dbb054d3a958bdd5269e93680", "sha512":null, "bug_tracking_url":"https://github.com/yourealwaysbe/forkyz/issues", "code_view_url":null, @@ -8933,14 +9055,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.crossword.yourealwaysbe.forkyz@3000000?download_url=https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3000000_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.crossword.yourealwaysbe.forkyz", - "repository_download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3000000_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3000000.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.crossword.yourealwaysbe.forkyz@3000000" @@ -9029,11 +9153,11 @@ "Security" ], "homepage_url":"http://forum.xda-developers.com/android/apps-games/app-easy-token-source-securid-token-t2805507", - "download_url":"https://f-droid.org/repo/app.easytoken_919_src.tar.gz", - "size":10755431, + "download_url":"https://f-droid.org/repo/app.easytoken_919.apk", + "size":1024784, "sha1":null, "md5":null, - "sha256":"c251369d27760a347b5282ff92cb5d6c40a1e2d40e3cda464ef2e30f3e9c2478", + "sha256":"4ea7fe623d6e3920f920335191cd23adaf2533d5bd10cef10a7a5a80473da79e", "sha512":null, "bug_tracking_url":"https://github.com/cernekee/EasyToken/issues", "code_view_url":null, @@ -9071,14 +9195,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-2.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.easytoken@919?download_url=https://f-droid.org/repo/app.easytoken_919_src.tar.gz" + ], 
"file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.easytoken", - "repository_download_url":"https://f-droid.org/repo/app.easytoken_919_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.easytoken_919.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.easytoken@919" @@ -9167,11 +9293,11 @@ "Security" ], "homepage_url":"http://forum.xda-developers.com/android/apps-games/app-easy-token-source-securid-token-t2805507", - "download_url":"https://f-droid.org/repo/app.easytoken_909_src.tar.gz", - "size":8025551, + "download_url":"https://f-droid.org/repo/app.easytoken_909.apk", + "size":809099, "sha1":null, "md5":null, - "sha256":"12c9383926b5f9f0d1e4df2efacce6cfdf1515dfdb07e32b902802ddd938c53f", + "sha256":"4d688551eec4e75e6ac3469ab220f4c3d3e40e2254c9916f31fecdb1f04cb27e", "sha512":null, "bug_tracking_url":"https://github.com/cernekee/EasyToken/issues", "code_view_url":null, @@ -9209,14 +9335,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-2.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.easytoken@909?download_url=https://f-droid.org/repo/app.easytoken_909_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.easytoken", - "repository_download_url":"https://f-droid.org/repo/app.easytoken_909_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.easytoken_909.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.easytoken@909" @@ -9305,11 +9433,11 @@ "Security" ], "homepage_url":"http://forum.xda-developers.com/android/apps-games/app-easy-token-source-securid-token-t2805507", - "download_url":"https://f-droid.org/repo/app.easytoken_819_src.tar.gz", - "size":8018460, + "download_url":"https://f-droid.org/repo/app.easytoken_819.apk", + "size":804978, "sha1":null, "md5":null, - "sha256":"48910c8fe3e98d6899b11eeafcde96ad30be61af8dda366d0bd1fb41f35b6854", + "sha256":"f0594a80c5dc64f63cea7009683453ddcd65485dbbbbef8d84e4281e44f0e2e2", "sha512":null, "bug_tracking_url":"https://github.com/cernekee/EasyToken/issues", "code_view_url":null, @@ -9347,14 +9475,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-2.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.easytoken@819?download_url=https://f-droid.org/repo/app.easytoken_819_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.easytoken", - "repository_download_url":"https://f-droid.org/repo/app.easytoken_819_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.easytoken_819.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.easytoken@819" @@ -9459,11 +9589,11 @@ "Internet" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/app.fedilab.fedilabtube_45_src.tar.gz", - "size":6582330, + "download_url":"https://f-droid.org/repo/app.fedilab.fedilabtube_45.apk", + "size":12287923, "sha1":null, "md5":null, - "sha256":"ca2fe5033881b074537c01a06ccb31c9dd319010c035d62d4f710fd927ae1452", + "sha256":"bd3f683f838f1fab45192f14aec98379fb3d36960b917b9f2cb7a7e81c6481c8", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/fedilab-tube/issues", 
"code_view_url":null, @@ -9501,14 +9631,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.fedilabtube@45?download_url=https://f-droid.org/repo/app.fedilab.fedilabtube_45_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.fedilabtube", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.fedilabtube_45_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.fedilabtube_45.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.fedilabtube@45" @@ -9613,11 +9745,11 @@ "Internet" ], "homepage_url":"https://framagit.org/tom79/mobilizon-android-app", - "download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_3_src.tar.gz", - "size":1012014, + "download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_3.apk", + "size":3037446, "sha1":null, "md5":null, - "sha256":"62518770423befc370c46d68a9ada76b10cd8cf023f122e638eda2f0e3d69100", + "sha256":"9b7a3d5efee6a925becc2dfbe3a2b2a0e7c1841609c0cbab2481d9557ee3ed4e", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/mobilizon-android-app/issues", "code_view_url":null, @@ -9655,14 +9787,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.mobilizon@3?download_url=https://f-droid.org/repo/app.fedilab.mobilizon_3_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.mobilizon", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_3_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_3.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.mobilizon@3" @@ -9767,11 +9901,11 @@ "Internet" ], "homepage_url":"https://framagit.org/tom79/mobilizon-android-app", - "download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_2_src.tar.gz", - "size":999588, + "download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_2.apk", + "size":3021062, "sha1":null, "md5":null, - "sha256":"1e52ad2425d1d375d33909e9000088c654ac0e9636e3f52212a2cfaf7907d72f", + "sha256":"defe55d3423cd24a0830090fdfa1ca1f6edfa02bea79a123b7f9635bff92b726", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/mobilizon-android-app/issues", "code_view_url":null, @@ -9809,14 +9943,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.mobilizon@2?download_url=https://f-droid.org/repo/app.fedilab.mobilizon_2_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.mobilizon", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_2_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_2.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.mobilizon@2" @@ -9921,11 +10057,11 @@ "Internet" ], "homepage_url":"https://framagit.org/tom79/mobilizon-android-app", - 
"download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_1_src.tar.gz", - "size":997708, + "download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_1.apk", + "size":3021062, "sha1":null, "md5":null, - "sha256":"f1413b554480c0fb9088944a57c01fdeb9b8cd23b05271e7c28fba734f40358c", + "sha256":"84e17b1b523153a91808e16011d0a6d6d9c6061bdb398756d4d56c9ce339b8e1", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/mobilizon-android-app/issues", "code_view_url":null, @@ -9963,14 +10099,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.mobilizon@1?download_url=https://f-droid.org/repo/app.fedilab.mobilizon_1_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.mobilizon", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_1_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_1.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.mobilizon@1" @@ -10059,11 +10197,11 @@ "Internet" ], "homepage_url":"https://fedilab.app/wiki/nitterizeme/", - "download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_33_src.tar.gz", - "size":1096069, + "download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_33.apk", + "size":3714928, "sha1":null, "md5":null, - "sha256":"7f151a6aecae7fe1e4de472425b658a2de18d19c1dc44fe784e18cb80d39fe17", + "sha256":"bcf251559ee4777f26a9e26b403b1606365893ef96df5edcfcca47a71c377361", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/nitterizeme/issues", "code_view_url":null, @@ -10101,14 +10239,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.nitterizeme@33?download_url=https://f-droid.org/repo/app.fedilab.nitterizeme_33_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.nitterizeme", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_33_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_33.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.nitterizeme@33" @@ -10197,11 +10337,11 @@ "Internet" ], "homepage_url":"https://fedilab.app/wiki/nitterizeme/", - "download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_32_src.tar.gz", - "size":1398878, + "download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_32.apk", + "size":3732264, "sha1":null, "md5":null, - "sha256":"217889ed1119e7fc862711a2c5b80f9767176866ac507560cd771f750b3cb131", + "sha256":"80608edebc87f3967dbcc14263b0ddbbac31d389970602049f81ea38e247aa82", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/nitterizeme/issues", "code_view_url":null, @@ -10239,14 +10379,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.nitterizeme@32?download_url=https://f-droid.org/repo/app.fedilab.nitterizeme_32_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], 
"repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.nitterizeme", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_32_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_32.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.nitterizeme@32" @@ -10335,11 +10477,11 @@ "Internet" ], "homepage_url":"https://fedilab.app/wiki/nitterizeme/", - "download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_31_src.tar.gz", - "size":3236208, + "download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_31.apk", + "size":3678975, "sha1":null, "md5":null, - "sha256":"cfe56f1fa80ceca3058468ef400be55ef75ca76195c4b42d94949a0d3d958c44", + "sha256":"cd7dc93738eb5f7c7c0fa76c616e016ea2cec5bf9572b1f4fb3f24afed345cb6", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/nitterizeme/issues", "code_view_url":null, @@ -10377,14 +10519,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.nitterizeme@31?download_url=https://f-droid.org/repo/app.fedilab.nitterizeme_31_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.nitterizeme", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_31_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_31.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.nitterizeme@31" @@ -10473,11 +10617,11 @@ "Internet" ], "homepage_url":"https://fedilab.app/wiki/nitterizeme/", - "download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_33_src.tar.gz", - "size":1096611, + "download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_33.apk", + "size":3714928, "sha1":null, "md5":null, - "sha256":"43a03cbdc9c012f2e28717855f0ee18afcd80d618ae3f37e6ac5f12efa906711", + "sha256":"30ae40611f5fbbce0772e3c26d4103d98d62ed22b7dd83736219ace083b1bc46", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/nitterizeme/issues", "code_view_url":null, @@ -10515,14 +10659,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.nitterizemelite@33?download_url=https://f-droid.org/repo/app.fedilab.nitterizemelite_33_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.nitterizemelite", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_33_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_33.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.nitterizemelite@33" @@ -10611,11 +10757,11 @@ "Internet" ], "homepage_url":"https://fedilab.app/wiki/nitterizeme/", - "download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_32_src.tar.gz", - "size":1399379, + "download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_32.apk", + "size":3732264, "sha1":null, "md5":null, - "sha256":"e199a466c8d1f3ed771b1803123397d54863bb3261e7ef89a45dd3d6c19c06d5", + "sha256":"7c8135e64e6d4e3558dc870f459cdf159c31166799f17d85809b528f1d55feda", "sha512":null, 
"bug_tracking_url":"https://framagit.org/tom79/nitterizeme/issues", "code_view_url":null, @@ -10653,14 +10799,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.nitterizemelite@32?download_url=https://f-droid.org/repo/app.fedilab.nitterizemelite_32_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.nitterizemelite", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_32_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_32.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.nitterizemelite@32" @@ -10749,11 +10897,11 @@ "Internet" ], "homepage_url":"https://fedilab.app/wiki/nitterizeme/", - "download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_31_src.tar.gz", - "size":3236897, + "download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_31.apk", + "size":3674879, "sha1":null, "md5":null, - "sha256":"eb79f7871c4bf8ee0c0c6a97b6d86065e0de19bb1d0519f6bff6874ed1e226f6", + "sha256":"3564df4741d8774abffa119d8daea6f6f6f1eef0cd3900cecf85274970105f24", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/nitterizeme/issues", "code_view_url":null, @@ -10791,14 +10939,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.nitterizemelite@31?download_url=https://f-droid.org/repo/app.fedilab.nitterizemelite_31_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.nitterizemelite", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_31_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_31.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.nitterizemelite@31" @@ -10887,11 +11037,11 @@ "Navigation" ], "homepage_url":"https://framagit.org/tom79/openmaps", - "download_url":"https://f-droid.org/repo/app.fedilab.openmaps_13_src.tar.gz", - "size":3417207, + "download_url":"https://f-droid.org/repo/app.fedilab.openmaps_13.apk", + "size":3828282, "sha1":null, "md5":null, - "sha256":"89bd8da46c986b9633c8de48bb61f468021cb4036286dd192ae0eeabc933750b", + "sha256":"f8f74eb4efd87d25113e960af3f1469839042fbe4da0eea0075290944cc5fb53", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/openmaps/issues", "code_view_url":null, @@ -10929,14 +11079,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.openmaps@13?download_url=https://f-droid.org/repo/app.fedilab.openmaps_13_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.openmaps", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.openmaps_13_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.openmaps_13.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.openmaps@13" @@ -11025,11 +11177,11 @@ "Navigation" 
], "homepage_url":"https://framagit.org/tom79/openmaps", - "download_url":"https://f-droid.org/repo/app.fedilab.openmaps_12_src.tar.gz", - "size":3416732, + "download_url":"https://f-droid.org/repo/app.fedilab.openmaps_12.apk", + "size":3717905, "sha1":null, "md5":null, - "sha256":"f0eb93ca63d5c4f23e7a403f1d79906ff578451c11d11e084ab680eb604b76a2", + "sha256":"f0e53ba598a761609dcd5dc2a2a2b733b8a8a51611f8a9d1dbc0a777ae339904", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/openmaps/issues", "code_view_url":null, @@ -11067,14 +11219,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.openmaps@12?download_url=https://f-droid.org/repo/app.fedilab.openmaps_12_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.openmaps", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.openmaps_12_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.openmaps_12.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.openmaps@12" @@ -11163,11 +11317,11 @@ "Navigation" ], "homepage_url":"https://framagit.org/tom79/openmaps", - "download_url":"https://f-droid.org/repo/app.fedilab.openmaps_11_src.tar.gz", - "size":3407906, + "download_url":"https://f-droid.org/repo/app.fedilab.openmaps_11.apk", + "size":3295949, "sha1":null, "md5":null, - "sha256":"0f5d6fb9f8eeae672fdb73b1c3af613542e56f3a936e8942aa7fe7ebebd53441", + "sha256":"ae76fbd1ee331c9d7c2a734a5b2f88fe38c8850ce4351ef7ae70550f3caa05ee", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/openmaps/issues", "code_view_url":null, @@ -11205,14 +11359,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.openmaps@11?download_url=https://f-droid.org/repo/app.fedilab.openmaps_11_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.openmaps", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.openmaps_11_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.openmaps_11.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.openmaps@11" @@ -11317,11 +11473,11 @@ "Internet" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/app.fedilab.tubelab_45_src.tar.gz", - "size":6581087, + "download_url":"https://f-droid.org/repo/app.fedilab.tubelab_45.apk", + "size":12043946, "sha1":null, "md5":null, - "sha256":"a114d34c5f6066ed3045e4b6096119ab715760cc371e9ace8f532196aec07fe7", + "sha256":"74987196d05f1003732a4cb99d1237d0b1a4831de5f828988d621b0d9a7a6d81", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/fedilab-tube/issues", "code_view_url":null, @@ -11359,14 +11515,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.tubelab@45?download_url=https://f-droid.org/repo/app.fedilab.tubelab_45_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], 
"repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.tubelab", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.tubelab_45_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.tubelab_45.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.tubelab@45" @@ -11471,11 +11629,11 @@ "Internet" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/app.intra_64_src.tar.gz", - "size":1329599, + "download_url":"https://f-droid.org/repo/app.intra_64.apk", + "size":12590857, "sha1":null, "md5":null, - "sha256":"8449121f840d7208363c1bb39daea26afeea7317cd708b85c3b006e7db76fe83", + "sha256":"eb19b25591e54c6e3718e9810d38d42f2883df3a834ca511c05532bdf45dbbf3", "sha512":null, "bug_tracking_url":"https://github.com/Jigsaw-Code/Intra/issues", "code_view_url":null, @@ -11513,14 +11671,16 @@ "other_license_detections":[], "extracted_license_statement":"Apache-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.intra@64?download_url=https://f-droid.org/repo/app.intra_64_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.intra", - "repository_download_url":"https://f-droid.org/repo/app.intra_64_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.intra_64.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.intra@64" @@ -11609,11 +11769,11 @@ "Reading" ], "homepage_url":"https://librenews.io/", - "download_url":"https://f-droid.org/repo/app.librenews.io.librenews_5_src.tar.gz", - "size":314767, + "download_url":"https://f-droid.org/repo/app.librenews.io.librenews_5.apk", + "size":2240436, "sha1":null, "md5":null, - "sha256":"f01e5322c0161259310186612b98612955b0045b4fc07b9d512fb3bc76451281", + "sha256":"cad1fef2b79f7df832a27dbf991e70c83b1639eaa678c177a250e32e74fc52db", "sha512":null, "bug_tracking_url":"https://github.com/milesmcc/LibreNews-Android/issues", "code_view_url":null, @@ -11651,14 +11811,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.librenews.io.librenews@5?download_url=https://f-droid.org/repo/app.librenews.io.librenews_5_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.librenews.io.librenews", - "repository_download_url":"https://f-droid.org/repo/app.librenews.io.librenews_5_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.librenews.io.librenews_5.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.librenews.io.librenews@5" @@ -11747,11 +11909,11 @@ "Reading" ], "homepage_url":"https://librenews.io/", - "download_url":"https://f-droid.org/repo/app.librenews.io.librenews_4_src.tar.gz", - "size":314245, + "download_url":"https://f-droid.org/repo/app.librenews.io.librenews_4.apk", + "size":2239563, "sha1":null, "md5":null, - "sha256":"21e79d2f2aad88aa10d0620dcdb1ee0eb9ffd9830fd7e9d5120e2a1943f5cab2", + "sha256":"856fa473c9f9aed97d9f4122a853288ceca3cc3ccceb33dd29cb0f21c706729b", "sha512":null, "bug_tracking_url":"https://github.com/milesmcc/LibreNews-Android/issues", "code_view_url":null, @@ -11789,14 +11951,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - 
"source_packages":[], + "source_packages":[ + "pkg:fdroid/app.librenews.io.librenews@4?download_url=https://f-droid.org/repo/app.librenews.io.librenews_4_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.librenews.io.librenews", - "repository_download_url":"https://f-droid.org/repo/app.librenews.io.librenews_4_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.librenews.io.librenews_4.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.librenews.io.librenews@4" @@ -11885,11 +12049,11 @@ "Reading" ], "homepage_url":"https://librenews.io/", - "download_url":"https://f-droid.org/repo/app.librenews.io.librenews_3_src.tar.gz", - "size":310961, + "download_url":"https://f-droid.org/repo/app.librenews.io.librenews_3.apk", + "size":1993445, "sha1":null, "md5":null, - "sha256":"8aae95fde276065e50626f5a1d89beffffbe93d6c34502c0ae83106839a03b24", + "sha256":"506b3003a4de2d5eb956a0c995b22efee313c983fdffc5ca3bef940d8428caa6", "sha512":null, "bug_tracking_url":"https://github.com/milesmcc/LibreNews-Android/issues", "code_view_url":null, @@ -11927,14 +12091,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.librenews.io.librenews@3?download_url=https://f-droid.org/repo/app.librenews.io.librenews_3_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.librenews.io.librenews", - "repository_download_url":"https://f-droid.org/repo/app.librenews.io.librenews_3_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.librenews.io.librenews_3.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.librenews.io.librenews@3" diff --git a/pyproject.toml b/pyproject.toml index 65f49137..81017f08 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,10 +75,10 @@ ignore = [ "D212", # multi-line-summary-first-line "D400", # ends-in-period "D415", # ends-in-punctuation - "E501", # line-too-long # TODO: we want to address these issues in the codebase, then get rid of # the following ignores "C901", # complex-structure + "E501", # line-too-long "S101", # assert "S103", # bad-file-permissions "S113", # request-without-timeout From 0275c704b555902f9be678b143b90923c7074846 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 13 Aug 2024 14:25:02 -0700 Subject: [PATCH 09/12] Have code style check as its own job #512 #515 Signed-off-by: Jono Yang --- azure-pipelines.yml | 8 ++++++++ etc/ci/azure-posix.yml | 3 --- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3a108dbd..9e157f2e 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -40,3 +40,11 @@ jobs: python_versions: ['3.10'] test_suites: all: make check_docs + + - template: etc/ci/azure-posix.yml + parameters: + job_name: ci_code_style + image_name: ubuntu-22.04 + python_versions: ['3.10'] + test_suites: + all: make check diff --git a/etc/ci/azure-posix.yml b/etc/ci/azure-posix.yml index 2fe1633c..b139a66c 100644 --- a/etc/ci/azure-posix.yml +++ b/etc/ci/azure-posix.yml @@ -41,8 +41,5 @@ jobs: sudo cp .env /etc/scancodeio displayName: '${{ pyver }} - Configure' - - script: make check - displayName: '${{ pyver }} - Check code style and format' - - script: $(test_suite) 
        displayName: '${{ pyver }} - $(test_suite_label) on ${{ parameters.job_name }}'

From ab4b9d709a5e373581d5ef4cf56ea59db98ecf30 Mon Sep 17 00:00:00 2001
From: Jono Yang
Date: Tue, 13 Aug 2024 14:57:15 -0700
Subject: [PATCH 10/12] Sort collect api test results before comparison #512 #515

Signed-off-by: Jono Yang
---
 minecode/utils_test.py      | 25 -------------------------
 packagedb/tests/test_api.py |  4 +++-
 2 files changed, 3 insertions(+), 26 deletions(-)

diff --git a/minecode/utils_test.py b/minecode/utils_test.py
index 31a7c393..923d9943 100644
--- a/minecode/utils_test.py
+++ b/minecode/utils_test.py
@@ -305,31 +305,6 @@ def _remove_fields_from_results(self, data, fields_to_remove):
                continue
            data.pop(field)

-    def check_expected_results(
-        self, results, expected_loc, fields_to_remove=[], regen=FIXTURES_REGEN
-    ):
-        """
-        Check `results` are equal to expected data stored in a JSON
-        file at `expected_loc`.
-        `results` can be a JSON string or a regular Python structure.
-
-        Regen the expected JSON if `regen` is True.
-        """
-        if isinstance(results, str):
-            results = json.loads(results)
-
-        results = self._normalize_results(results, fields_to_remove)
-
-        if regen:
-            with codecs.open(expected_loc, mode="wb", encoding="utf-8") as expect:
-                json.dump(results, expect, indent=2, separators=(",", ":"))
-
-        with codecs.open(expected_loc, mode="rb", encoding="utf-8") as expect:
-            expected = json.load(expect)
-
-        results = json.loads(json.dumps(results))
-        self.assertEqual(expected, results)
-
    def check_expected_uris(
        self, uris, expected_loc, data_is_json=False, regen=FIXTURES_REGEN
    ):
diff --git a/packagedb/tests/test_api.py b/packagedb/tests/test_api.py
index 118ae6cb..46d93162 100644
--- a/packagedb/tests/test_api.py
+++ b/packagedb/tests/test_api.py
@@ -979,7 +979,9 @@ def test_package_live(self):
        expected = self.get_test_loc("api/twill-core-0.12.0.json")

        self.assertEqual(2, len(response.data))
-        result = response.data[0]
+        # Sort results by name
+        results = sorted(response.data, key=lambda x: x["name"])
+        result = results[0]

        # remove fields
        result.pop("url")

From 59e9e0e4ec1e1005321cc35157d02ae069eb50bf Mon Sep 17 00:00:00 2001
From: Jono Yang
Date: Tue, 13 Aug 2024 15:11:24 -0700
Subject: [PATCH 11/12] Update expected test results #512 #515

Signed-off-by: Jono Yang
---
 packagedb/tests/test_api.py                   |  14 +-
 .../testfiles/api/twill-core-0.12.0.json      | 554 ++++++++++++------
 2 files changed, 376 insertions(+), 192 deletions(-)

diff --git a/packagedb/tests/test_api.py b/packagedb/tests/test_api.py
index 46d93162..35e5ca63 100644
--- a/packagedb/tests/test_api.py
+++ b/packagedb/tests/test_api.py
@@ -977,18 +977,16 @@ def test_package_live(self):
            1, Package.objects.filter(download_url=sources_download_url).count()
        )
        expected = self.get_test_loc("api/twill-core-0.12.0.json")
-
-        self.assertEqual(2, len(response.data))
-        # Sort results by name
-        results = sorted(response.data, key=lambda x: x["name"])
-        result = results[0]
+        results = response.data
+        self.assertEqual(2, len(results))

        # remove fields
-        result.pop("url")
+        for result in results:
+            result.pop("url")

-        fields_to_remove = ["uuid", "resources", "package_sets", "history"]
+        fields_to_remove = ["uuid", "resources", "package_sets", "history"]
        self.check_expected_results(
-            result, expected, fields_to_remove=fields_to_remove, regen=FIXTURES_REGEN
+            results, expected, fields_to_remove=fields_to_remove, regen=FIXTURES_REGEN
        )

        # Ensure that the created ScannableURI objects have a priority of 100
diff --git a/packagedb/tests/testfiles/api/twill-core-0.12.0.json b/packagedb/tests/testfiles/api/twill-core-0.12.0.json
index 0c1c60fa..114217e8 100644
--- a/packagedb/tests/testfiles/api/twill-core-0.12.0.json
+++ b/packagedb/tests/testfiles/api/twill-core-0.12.0.json
@@ -1,184 +1,370 @@
-{
-  "filename":"twill-core-0.12.0.jar",
-  "package_content":"binary",
-  "purl":"pkg:maven/org.apache.twill/twill-core@0.12.0",
-  "type":"maven",
-  "namespace":"org.apache.twill",
-  "name":"twill-core",
-  "version":"0.12.0",
-  "qualifiers":"",
-  "subpath":"",
-  "primary_language":"Java",
-  "description":"Apache Twill core library",
-  "release_date":null,
-  "parties":[
-    {
-      "type":"organization",
-      "role":"owner",
-      "name":"The Apache Software Foundation",
-      "email":null,
-      "url":"http://www.apache.org/"
-    }
-  ],
-  "keywords":[],
-  "homepage_url":"http://www.apache.org/",
-  "download_url":"https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar",
-  "bug_tracking_url":null,
-  "code_view_url":null,
-  "vcs_url":null,
-  "repository_homepage_url":null,
-  "repository_download_url":null,
-  "api_data_url":null,
-  "size":null,
-  "md5":null,
-  "sha1":"252cc5e60690d611a9981d1b3fabeb0d3a7e8a28",
-  "sha256":null,
-  "sha512":null,
-  "copyright":null,
-  "holder":null,
-  "declared_license_expression":"apache-2.0",
-  "declared_license_expression_spdx":"Apache-2.0",
-  "license_detections":[],
-  "other_license_expression":null,
-  "other_license_expression_spdx":null,
-  "other_license_detections":[],
-  "extracted_license_statement":null,
-  "notice_text":null,
-  "source_packages":[
-    "pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources"
-  ],
-  "extra_data":{},
-  "package_uid":"pkg:maven/org.apache.twill/twill-core@0.12.0?uuid=fixed-uid-done-for-testing-5642512d1758",
-  "datasource_id":null,
-  "file_references":[],
-  "dependencies":[
-    {
-      "purl":"pkg:maven/org.apache.twill/twill-api@0.12.0",
-      "extracted_requirement":"0.12.0",
-      "scope":"compile",
-      "is_runtime":false,
-      "is_optional":true,
-      "is_resolved":true
-    },
-    {
-      "purl":"pkg:maven/org.apache.twill/twill-zookeeper@0.12.0",
-      "extracted_requirement":"0.12.0",
-      "scope":"compile",
-      "is_runtime":false,
-      "is_optional":true,
-      "is_resolved":true
-    },
-    {
-      "purl":"pkg:maven/org.apache.twill/twill-discovery-core@0.12.0",
-      "extracted_requirement":"0.12.0",
-      "scope":"compile",
-      "is_runtime":false,
-      "is_optional":true,
-      "is_resolved":true
-    },
-    {
-      "purl":"pkg:maven/com.google.guava/guava",
-      "extracted_requirement":null,
-      "scope":"compile",
-      "is_runtime":false,
-      "is_optional":true,
-      "is_resolved":false
-    },
-    {
-      "purl":"pkg:maven/com.google.code.gson/gson",
-      "extracted_requirement":null,
-      "scope":"compile",
-      "is_runtime":false,
-      "is_optional":true,
-      "is_resolved":false
-    },
-    {
-      "purl":"pkg:maven/io.netty/netty",
-      "extracted_requirement":null,
-      "scope":"compile",
-      "is_runtime":false,
-      "is_optional":true,
-      "is_resolved":false
-    },
-    {
-      "purl":"pkg:maven/org.xerial.snappy/snappy-java",
-      "extracted_requirement":null,
-      "scope":"compile",
-      "is_runtime":false,
-      "is_optional":true,
-      "is_resolved":false
-    },
-    {
-      "purl":"pkg:maven/org.ow2.asm/asm-all",
-      "extracted_requirement":null,
-      "scope":"compile",
-      "is_runtime":false,
-      "is_optional":true,
-      "is_resolved":false
-    },
-    {
-      "purl":"pkg:maven/org.slf4j/slf4j-api",
-      "extracted_requirement":null,
-      "scope":"compile",
-      "is_runtime":false,
-      "is_optional":true,
-      "is_resolved":false
-    },
-    {
-      "purl":"pkg:maven/ch.qos.logback/logback-core",
a/packagedb/tests/testfiles/api/twill-core-0.12.0.json b/packagedb/tests/testfiles/api/twill-core-0.12.0.json index 0c1c60fa..114217e8 100644 --- a/packagedb/tests/testfiles/api/twill-core-0.12.0.json +++ b/packagedb/tests/testfiles/api/twill-core-0.12.0.json @@ -1,184 +1,370 @@ -{ - "filename":"twill-core-0.12.0.jar", - "package_content":"binary", - "purl":"pkg:maven/org.apache.twill/twill-core@0.12.0", - "type":"maven", - "namespace":"org.apache.twill", - "name":"twill-core", - "version":"0.12.0", - "qualifiers":"", - "subpath":"", - "primary_language":"Java", - "description":"Apache Twill core library", - "release_date":null, - "parties":[ - { - "type":"organization", - "role":"owner", - "name":"The Apache Software Foundation", - "email":null, - "url":"http://www.apache.org/" - } - ], - "keywords":[], - "homepage_url":"http://www.apache.org/", - "download_url":"https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar", - "bug_tracking_url":null, - "code_view_url":null, - "vcs_url":null, - "repository_homepage_url":null, - "repository_download_url":null, - "api_data_url":null, - "size":null, - "md5":null, - "sha1":"252cc5e60690d611a9981d1b3fabeb0d3a7e8a28", - "sha256":null, - "sha512":null, - "copyright":null, - "holder":null, - "declared_license_expression":"apache-2.0", - "declared_license_expression_spdx":"Apache-2.0", - "license_detections":[], - "other_license_expression":null, - "other_license_expression_spdx":null, - "other_license_detections":[], - "extracted_license_statement":null, - "notice_text":null, - "source_packages":[ - "pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources" - ], - "extra_data":{}, - "package_uid":"pkg:maven/org.apache.twill/twill-core@0.12.0?uuid=fixed-uid-done-for-testing-5642512d1758", - "datasource_id":null, - "file_references":[], - "dependencies":[ - { - "purl":"pkg:maven/org.apache.twill/twill-api@0.12.0", - "extracted_requirement":"0.12.0", - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":true - }, - { - "purl":"pkg:maven/org.apache.twill/twill-zookeeper@0.12.0", - "extracted_requirement":"0.12.0", - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":true - }, - { - "purl":"pkg:maven/org.apache.twill/twill-discovery-core@0.12.0", - "extracted_requirement":"0.12.0", - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":true - }, - { - "purl":"pkg:maven/com.google.guava/guava", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/com.google.code.gson/gson", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/io.netty/netty", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/org.xerial.snappy/snappy-java", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/org.ow2.asm/asm-all", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/org.slf4j/slf4j-api", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/ch.qos.logback/logback-core", - 
"extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/ch.qos.logback/logback-classic", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/org.apache.kafka/kafka_2.10", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/net.sf.jopt-simple/jopt-simple", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/junit/junit", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/org.unitils/unitils-core", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/org.apache.commons/commons-compress", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - } - ] -} \ No newline at end of file +[ + { + "filename":"twill-core-0.12.0.jar", + "package_content":"binary", + "purl":"pkg:maven/org.apache.twill/twill-core@0.12.0", + "type":"maven", + "namespace":"org.apache.twill", + "name":"twill-core", + "version":"0.12.0", + "qualifiers":"", + "subpath":"", + "primary_language":"Java", + "description":"Apache Twill core library", + "release_date":null, + "parties":[ + { + "type":"organization", + "role":"owner", + "name":"The Apache Software Foundation", + "email":null, + "url":"http://www.apache.org/" + } + ], + "keywords":[], + "homepage_url":"http://www.apache.org/", + "download_url":"https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar", + "bug_tracking_url":null, + "code_view_url":null, + "vcs_url":null, + "repository_homepage_url":null, + "repository_download_url":null, + "api_data_url":null, + "size":null, + "md5":null, + "sha1":"252cc5e60690d611a9981d1b3fabeb0d3a7e8a28", + "sha256":null, + "sha512":null, + "copyright":null, + "holder":null, + "declared_license_expression":"apache-2.0", + "declared_license_expression_spdx":"Apache-2.0", + "license_detections":[], + "other_license_expression":null, + "other_license_expression_spdx":null, + "other_license_detections":[], + "extracted_license_statement":null, + "notice_text":null, + "source_packages":[ + "pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources" + ], + "extra_data":{}, + "package_uid":"pkg:maven/org.apache.twill/twill-core@0.12.0?uuid=fixed-uid-done-for-testing-5642512d1758", + "datasource_id":null, + "file_references":[], + "dependencies":[ + { + "purl":"pkg:maven/org.apache.twill/twill-api@0.12.0", + "extracted_requirement":"0.12.0", + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":true + }, + { + "purl":"pkg:maven/org.apache.twill/twill-zookeeper@0.12.0", + "extracted_requirement":"0.12.0", + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":true + }, + { + "purl":"pkg:maven/org.apache.twill/twill-discovery-core@0.12.0", + "extracted_requirement":"0.12.0", + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":true + }, + { + "purl":"pkg:maven/com.google.guava/guava", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + 
}, + { + "purl":"pkg:maven/com.google.code.gson/gson", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/io.netty/netty", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.xerial.snappy/snappy-java", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.ow2.asm/asm-all", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.slf4j/slf4j-api", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/ch.qos.logback/logback-core", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/ch.qos.logback/logback-classic", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.apache.kafka/kafka_2.10", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/net.sf.jopt-simple/jopt-simple", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/junit/junit", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.unitils/unitils-core", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.apache.commons/commons-compress", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + } + ] + }, + { + "filename":"twill-core-0.12.0-sources.jar", + "package_content":"source_archive", + "purl":"pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources", + "type":"maven", + "namespace":"org.apache.twill", + "name":"twill-core", + "version":"0.12.0", + "qualifiers":"classifier=sources", + "subpath":"", + "primary_language":"Java", + "description":"Apache Twill core library", + "release_date":null, + "parties":[ + { + "type":"organization", + "role":"owner", + "name":"The Apache Software Foundation", + "email":null, + "url":"http://www.apache.org/" + } + ], + "keywords":[], + "homepage_url":"http://www.apache.org/", + "download_url":"https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar", + "bug_tracking_url":null, + "code_view_url":null, + "vcs_url":null, + "repository_homepage_url":null, + "repository_download_url":null, + "api_data_url":null, + "size":null, + "md5":null, + "sha1":"dfbe61539b44213f389ff7d9a7745173d114b6df", + "sha256":null, + "sha512":null, + "copyright":null, + "holder":null, + "declared_license_expression":"apache-2.0", + "declared_license_expression_spdx":"Apache-2.0", + "license_detections":[], + "other_license_expression":null, + "other_license_expression_spdx":null, + "other_license_detections":[], + "extracted_license_statement":null, + "notice_text":null, + "source_packages":[ + 
"pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources" + ], + "extra_data":{}, + "package_uid":"pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources&uuid=fixed-uid-done-for-testing-5642512d1758", + "datasource_id":null, + "file_references":[], + "dependencies":[ + { + "purl":"pkg:maven/org.apache.twill/twill-api@0.12.0", + "extracted_requirement":"0.12.0", + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":true + }, + { + "purl":"pkg:maven/org.apache.twill/twill-zookeeper@0.12.0", + "extracted_requirement":"0.12.0", + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":true + }, + { + "purl":"pkg:maven/org.apache.twill/twill-discovery-core@0.12.0", + "extracted_requirement":"0.12.0", + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":true + }, + { + "purl":"pkg:maven/com.google.guava/guava", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/com.google.code.gson/gson", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/io.netty/netty", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.xerial.snappy/snappy-java", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.ow2.asm/asm-all", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.slf4j/slf4j-api", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/ch.qos.logback/logback-core", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/ch.qos.logback/logback-classic", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.apache.kafka/kafka_2.10", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/net.sf.jopt-simple/jopt-simple", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/junit/junit", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.unitils/unitils-core", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.apache.commons/commons-compress", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + } + ] + } +] \ No newline at end of file From a3eaf8b4e7dd5ebab9bc1a8bef81557a3c3c7724 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 13 Aug 2024 15:23:45 -0700 Subject: [PATCH 12/12] Add code to collector init #512 #515 Signed-off-by: Jono Yang --- minecode/collectors/__init__.py | 10 +++++++ packagedb/tests/test_api.py | 3 ++ .../testfiles/api/twill-core-0.12.0.json | 28 +++++++++---------- 3 files changed, 
diff --git a/minecode/collectors/__init__.py b/minecode/collectors/__init__.py
index e1521118..a916ea5b 100644
--- a/minecode/collectors/__init__.py
+++ b/minecode/collectors/__init__.py
@@ -6,3 +6,13 @@
 # See https://github.com/aboutcode-org/purldb for support or download.
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
+
+import pkgutil
+
+"""
+Minimal way to recursively import all submodules dynamically. If this module is
+imported, all submodules will be imported: this triggers the actual registration
+of miners. This should stay as the last import in this init module.
+"""
+for _, name, _ in pkgutil.walk_packages(__path__, prefix=__name__ + "."):
+    __import__(name)
diff --git a/packagedb/tests/test_api.py b/packagedb/tests/test_api.py
index 35e5ca63..d7c150b8 100644
--- a/packagedb/tests/test_api.py
+++ b/packagedb/tests/test_api.py
@@ -984,6 +984,9 @@ def test_package_live(self):
        for result in results:
            result.pop("url")

+        # sort by filename
+        results = sorted(results, key=lambda x: x["filename"])
+
        fields_to_remove = ["uuid", "resources", "package_sets", "history"]
        self.check_expected_results(
diff --git a/packagedb/tests/testfiles/api/twill-core-0.12.0.json b/packagedb/tests/testfiles/api/twill-core-0.12.0.json
index 114217e8..39b13fc5 100644
--- a/packagedb/tests/testfiles/api/twill-core-0.12.0.json
+++ b/packagedb/tests/testfiles/api/twill-core-0.12.0.json
@@ -1,13 +1,13 @@
 [
   {
-    "filename":"twill-core-0.12.0.jar",
-    "package_content":"binary",
-    "purl":"pkg:maven/org.apache.twill/twill-core@0.12.0",
+    "filename":"twill-core-0.12.0-sources.jar",
+    "package_content":"source_archive",
+    "purl":"pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources",
     "type":"maven",
     "namespace":"org.apache.twill",
     "name":"twill-core",
     "version":"0.12.0",
-    "qualifiers":"",
+    "qualifiers":"classifier=sources",
     "subpath":"",
     "primary_language":"Java",
     "description":"Apache Twill core library",
@@ -23,7 +23,7 @@
     ],
     "keywords":[],
     "homepage_url":"http://www.apache.org/",
-    "download_url":"https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar",
+    "download_url":"https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar",
     "bug_tracking_url":null,
     "code_view_url":null,
     "vcs_url":null,
@@ -32,7 +32,7 @@
     "api_data_url":null,
     "size":null,
     "md5":null,
-    "sha1":"252cc5e60690d611a9981d1b3fabeb0d3a7e8a28",
+    "sha1":"dfbe61539b44213f389ff7d9a7745173d114b6df",
     "sha256":null,
     "sha512":null,
     "copyright":null,
@@ -49,7 +49,7 @@
       "pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources"
     ],
     "extra_data":{},
-    "package_uid":"pkg:maven/org.apache.twill/twill-core@0.12.0?uuid=fixed-uid-done-for-testing-5642512d1758",
+    "package_uid":"pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources&uuid=fixed-uid-done-for-testing-5642512d1758",
     "datasource_id":null,
     "file_references":[],
     "dependencies":[
@@ -184,14 +184,14 @@
     ]
   },
   {
-    "filename":"twill-core-0.12.0-sources.jar",
-    "package_content":"source_archive",
-    "purl":"pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources",
+    "filename":"twill-core-0.12.0.jar",
+    "package_content":"binary",
+    "purl":"pkg:maven/org.apache.twill/twill-core@0.12.0",
     "type":"maven",
     "namespace":"org.apache.twill",
     "name":"twill-core",
     "version":"0.12.0",
-    "qualifiers":"classifier=sources",
+    "qualifiers":"",
     "subpath":"",
     "primary_language":"Java",
     "description":"Apache Twill core library",
@@ -207,7 +207,7 @@
     ],
     "keywords":[],
     "homepage_url":"http://www.apache.org/",
-    "download_url":"https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar",
+    "download_url":"https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar",
     "bug_tracking_url":null,
     "code_view_url":null,
     "vcs_url":null,
@@ -216,7 +216,7 @@
     "api_data_url":null,
     "size":null,
     "md5":null,
-    "sha1":"dfbe61539b44213f389ff7d9a7745173d114b6df",
+    "sha1":"252cc5e60690d611a9981d1b3fabeb0d3a7e8a28",
     "sha256":null,
     "sha512":null,
     "copyright":null,
@@ -233,7 +233,7 @@
       "pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources"
     ],
     "extra_data":{},
-    "package_uid":"pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources&uuid=fixed-uid-done-for-testing-5642512d1758",
+    "package_uid":"pkg:maven/org.apache.twill/twill-core@0.12.0?uuid=fixed-uid-done-for-testing-5642512d1758",
     "datasource_id":null,
     "file_references":[],
     "dependencies":[