From ec7ab38cf4ae0078d4687b35ab5c84dffada8f5c Mon Sep 17 00:00:00 2001 From: Noah Dove Date: Fri, 17 Mar 2023 20:32:26 -0700 Subject: [PATCH] [a r] Index supplementary files for AnVIL (#5000) --- src/azul/plugins/metadata/anvil/__init__.py | 2 + .../metadata/anvil/indexer/transform.py | 2 + .../metadata/anvil/service/response.py | 1 + .../plugins/repository/tdr_anvil/__init__.py | 43 +++++++++++ ...274-affe-aabc-eb3db63ad068.result.tdr.json | 2 + ...2-e274-affe-aabc-eb3db63ad068.results.json | 36 +++++++++ test/integration_test.py | 6 +- test/service/test_response_anvil.py | 76 +++++++++++++++++++ 8 files changed, 166 insertions(+), 2 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py index 0a98e3ed9d..b3815f2f5a 100644 --- a/src/azul/plugins/metadata/anvil/__init__.py +++ b/src/azul/plugins/metadata/anvil/__init__.py @@ -180,6 +180,7 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping: 'file_md5sum', 'reference_assembly', 'file_name', + 'is_supplementary', # Not in schema 'crc32', 'sha256', @@ -223,6 +224,7 @@ def facets(self) -> Sequence[str]: 'files.data_modality', 'files.file_format', 'files.reference_assembly', + 'files.is_supplementary', ] @property diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py index 62b2a40eda..41fb9f28d1 100644 --- a/src/azul/plugins/metadata/anvil/indexer/transform.py +++ b/src/azul/plugins/metadata/anvil/indexer/transform.py @@ -42,6 +42,7 @@ EntityReference, EntityType, FieldTypes, + null_bool, null_datetime, null_int, null_str, @@ -275,6 +276,7 @@ def _file_types(cls) -> FieldTypes: 'file_md5sum': null_str, 'reference_assembly': [null_str], 'file_name': null_str, + 'is_supplementary': null_bool, # Not in schema 'version': null_str, 'uuid': null_str, diff --git a/src/azul/plugins/metadata/anvil/service/response.py b/src/azul/plugins/metadata/anvil/service/response.py index 5279a3c378..c6f3f4fd33 100644 --- a/src/azul/plugins/metadata/anvil/service/response.py +++ b/src/azul/plugins/metadata/anvil/service/response.py @@ -230,6 +230,7 @@ def _non_pivotal_fields_by_entity_type(self) -> dict[str, set[str]]: 'count', 'data_modality', 'file_format', + 'is_supplementary', 'reference_assembly' } } diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index d402533a90..f1fd4280a6 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -116,6 +116,7 @@ def merge(cls, links: Iterable['Link']) -> 'Link': class BundleEntityType(Enum): primary: EntityType = 'biosample' + supplementary: EntityType = 'file' @attr.s(auto_attribs=True, frozen=True, kw_only=True) @@ -224,10 +225,15 @@ def _list_bundles(self, partition_prefix = spec.prefix.common + prefix validate_uuid_prefix(partition_prefix) primary = BundleEntityType.primary.value + supplementary = BundleEntityType.supplementary.value rows = self._run_sql(f''' SELECT datarepo_row_id, {primary!r} AS entity_type FROM {backtick(self._full_table_name(spec, primary))} WHERE STARTS_WITH(datarepo_row_id, '{partition_prefix}') + UNION ALL + SELECT datarepo_row_id, {supplementary!r} AS entity_type + FROM {backtick(self._full_table_name(spec, supplementary))} AS supp + WHERE supp.is_supplementary AND STARTS_WITH(datarepo_row_id, '{partition_prefix}') ''') return [ AnvilBundleFQID(source=source, @@ -282,6 +288,9 @@ def _emulate_bundle(self, bundle_fqid: AnvilBundleFQID) -> TDRAnvilBundle: if bundle_fqid.entity_type is BundleEntityType.primary: log.info('Bundle %r is a primary bundle', bundle_fqid.uuid) return self._primary_bundle(bundle_fqid) + elif bundle_fqid.entity_type is BundleEntityType.supplementary: + log.info('Bundle %r is a supplementary bundle', bundle_fqid.uuid) + return self._supplementary_bundle(bundle_fqid) else: assert False, bundle_fqid.entity_type @@ -326,6 +335,39 @@ def _primary_bundle(self, bundle_fqid: AnvilBundleFQID) -> TDRAnvilBundle: return result + def _supplementary_bundle(self, bundle_fqid: AnvilBundleFQID) -> TDRAnvilBundle: + entity_id = uuids.change_version(bundle_fqid.uuid, + self.bundle_uuid_version, + self.datarepo_row_uuid_version) + source = bundle_fqid.source.spec + bundle_entity_type = bundle_fqid.entity_type.value + result = TDRAnvilBundle(fqid=bundle_fqid, manifest=[], metadata_files={}) + columns = self._columns(bundle_entity_type) + bundle_entity = dict(one(self._run_sql(f''' + SELECT {', '.join(sorted(columns))} + FROM {backtick(self._full_table_name(source, bundle_entity_type))} + WHERE datarepo_row_id = '{entity_id}' + '''))) + linked_entity_type = 'dataset' + columns = self._columns(linked_entity_type) + linked_entity = dict(one(self._run_sql(f''' + SELECT {', '.join(sorted(columns))} + FROM {backtick(self._full_table_name(source, linked_entity_type))} + '''))) + entities_by_key = {} + link_args = {} + for entity_type, row, arg in [ + (bundle_entity_type, bundle_entity, 'outputs'), + (linked_entity_type, linked_entity, 'inputs') + ]: + entity_ref = EntityReference(entity_type=entity_type, entity_id=row['datarepo_row_id']) + key_ref = KeyReference(key=row[entity_type + '_id'], entity_type=entity_type) + entities_by_key[key_ref] = entity_ref + result.add_entity(entity_ref, self._version, row) + link_args[arg] = key_ref + result.add_links(bundle_fqid, {Link.create(**link_args)}, entities_by_key) + return result + def _bundle_entity(self, bundle_fqid: AnvilBundleFQID) -> KeyReference: source = bundle_fqid.source bundle_uuid = bundle_fqid.uuid @@ -714,6 +756,7 @@ def _convert_column(self, value): 'reference_assembly', 'file_name', 'file_ref', + 'is_supplementary', }, 'activity': { 'activity_id', diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.result.tdr.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.result.tdr.json index df35d94fb3..732ebe2be9 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.result.tdr.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.result.tdr.json @@ -181,6 +181,7 @@ "file_ref": "drs://data.terra.bio/v1_5b77f311-6e67-4a1b-b32a-baa9f5742442_1e269f04-4347-4188-b060-1dcc69e71d67", "file_size": 213021639, "reference_assembly": [], + "is_supplementary": false, "source_datarepo_row_ids": [ "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" ] @@ -195,6 +196,7 @@ "file_ref": "drs://data.terra.bio/v1_5b77f311-6e67-4a1b-b32a-baa9f5742442_8b722e88-8103-49c1-b351-e64fa7c6ab37", "file_size": 3306845592, "reference_assembly": [], + "is_supplementary": false, "source_datarepo_row_ids": [ "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" ] diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 1fc62605df..de931a713b 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -196,6 +196,9 @@ "file_name": [ "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" ], + "is_supplementary": [ + 0 + ], "version": [ "2022-06-01T00:00:00.000000Z" ], @@ -405,6 +408,7 @@ "~null" ], "file_name": "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz", + "is_supplementary": 0, "version": "2022-06-01T00:00:00.000000Z", "uuid": "6a85e0ab-2386-4f7e-8503-d72d90b4bc47", "size": 213021639, @@ -630,6 +634,9 @@ "file_name": [ "307500.merged.matefixed.sorted.markeddups.recal.bam" ], + "is_supplementary": [ + 0 + ], "version": [ "2022-06-01T00:00:00.000000Z" ], @@ -687,6 +694,9 @@ "file_name": [ "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" ], + "is_supplementary": [ + 0 + ], "version": [ "2022-06-01T00:00:00.000000Z" ], @@ -915,6 +925,7 @@ "~null" ], "file_name": "307500.merged.matefixed.sorted.markeddups.recal.bam", + "is_supplementary": 0, "version": "2022-06-01T00:00:00.000000Z", "uuid": "677f207e-2d12-4eca-8f7a-039325af91ad", "size": 3306845592, @@ -941,6 +952,7 @@ "~null" ], "file_name": "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz", + "is_supplementary": 0, "version": "2022-06-01T00:00:00.000000Z", "uuid": "6a85e0ab-2386-4f7e-8503-d72d90b4bc47", "size": 213021639, @@ -1164,6 +1176,9 @@ "file_name": [ "307500.merged.matefixed.sorted.markeddups.recal.bam" ], + "is_supplementary": [ + 0 + ], "version": [ "2022-06-01T00:00:00.000000Z" ], @@ -1221,6 +1236,9 @@ "file_name": [ "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" ], + "is_supplementary": [ + 0 + ], "version": [ "2022-06-01T00:00:00.000000Z" ], @@ -1449,6 +1467,7 @@ "~null" ], "file_name": "307500.merged.matefixed.sorted.markeddups.recal.bam", + "is_supplementary": 0, "version": "2022-06-01T00:00:00.000000Z", "uuid": "677f207e-2d12-4eca-8f7a-039325af91ad", "size": 3306845592, @@ -1475,6 +1494,7 @@ "~null" ], "file_name": "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz", + "is_supplementary": 0, "version": "2022-06-01T00:00:00.000000Z", "uuid": "6a85e0ab-2386-4f7e-8503-d72d90b4bc47", "size": 213021639, @@ -1689,6 +1709,7 @@ "~null" ], "file_name": "307500.merged.matefixed.sorted.markeddups.recal.bam", + "is_supplementary": 0, "version": "2022-06-01T00:00:00.000000Z", "uuid": "677f207e-2d12-4eca-8f7a-039325af91ad", "size": 3306845592, @@ -1881,6 +1902,7 @@ "~null" ], "file_name": "307500.merged.matefixed.sorted.markeddups.recal.bam", + "is_supplementary": 0, "version": "2022-06-01T00:00:00.000000Z", "uuid": "677f207e-2d12-4eca-8f7a-039325af91ad", "size": 3306845592, @@ -2095,6 +2117,7 @@ "~null" ], "file_name": "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz", + "is_supplementary": 0, "version": "2022-06-01T00:00:00.000000Z", "uuid": "6a85e0ab-2386-4f7e-8503-d72d90b4bc47", "size": 213021639, @@ -2287,6 +2310,7 @@ "~null" ], "file_name": "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz", + "is_supplementary": 0, "version": "2022-06-01T00:00:00.000000Z", "uuid": "6a85e0ab-2386-4f7e-8503-d72d90b4bc47", "size": 213021639, @@ -2505,6 +2529,9 @@ "file_name": [ "307500.merged.matefixed.sorted.markeddups.recal.bam" ], + "is_supplementary": [ + 0 + ], "version": [ "2022-06-01T00:00:00.000000Z" ], @@ -2714,6 +2741,7 @@ "~null" ], "file_name": "307500.merged.matefixed.sorted.markeddups.recal.bam", + "is_supplementary": 0, "version": "2022-06-01T00:00:00.000000Z", "uuid": "677f207e-2d12-4eca-8f7a-039325af91ad", "size": 3306845592, @@ -2931,6 +2959,9 @@ "file_name": [ "307500.merged.matefixed.sorted.markeddups.recal.bam" ], + "is_supplementary": [ + 0 + ], "version": [ "2022-06-01T00:00:00.000000Z" ], @@ -2988,6 +3019,9 @@ "file_name": [ "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" ], + "is_supplementary": [ + 0 + ], "version": [ "2022-06-01T00:00:00.000000Z" ], @@ -3216,6 +3250,7 @@ "~null" ], "file_name": "307500.merged.matefixed.sorted.markeddups.recal.bam", + "is_supplementary": 0, "version": "2022-06-01T00:00:00.000000Z", "uuid": "677f207e-2d12-4eca-8f7a-039325af91ad", "size": 3306845592, @@ -3242,6 +3277,7 @@ "~null" ], "file_name": "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz", + "is_supplementary": 0, "version": "2022-06-01T00:00:00.000000Z", "uuid": "6a85e0ab-2386-4f7e-8503-d72d90b4bc47", "size": 213021639, diff --git a/test/integration_test.py b/test/integration_test.py index 61c184e639..b60488327d 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -450,8 +450,10 @@ def _wait_for_indexer(): elif config.is_anvil_enabled(catalog.name): # While the files index does exist for AnVIL, it's possible # for a bundle entity not to contain any files and - # thus be absent from the files response - entity_type = pluralize(BundleEntityType.primary.value) + # thus be absent from the files response. The only entity + # type that is linked to both primary and supplementary + # bundles is datasets. + entity_type = 'datasets' else: assert False, catalog self._assert_catalog_complete(catalog=catalog.name, diff --git a/test/service/test_response_anvil.py b/test/service/test_response_anvil.py index ac5263211c..44fbedac38 100644 --- a/test/service/test_response_anvil.py +++ b/test/service/test_response_anvil.py @@ -155,6 +155,9 @@ def test_entity_indices(self): 'file_format': [ '.vcf.gz' ], + 'is_supplementary': [ + False + ], 'reference_assembly': [ None ], @@ -274,6 +277,9 @@ def test_entity_indices(self): 'file_format': [ '.bam' ], + 'is_supplementary': [ + False + ], 'reference_assembly': [ None ], @@ -481,6 +487,16 @@ def test_entity_indices(self): 'total': 2, 'type': 'terms' }, + 'files.is_supplementary': { + 'terms': [ + { + 'count': 2, + 'term': 'false' + } + ], + 'total': 2, + 'type': 'terms' + }, 'files.reference_assembly': { 'terms': [ { @@ -597,6 +613,9 @@ def test_entity_indices(self): 'file_format': [ '.bam' ], + 'is_supplementary': [ + False + ], 'reference_assembly': [ None ], @@ -609,6 +628,9 @@ def test_entity_indices(self): 'file_format': [ '.vcf.gz' ], + 'is_supplementary': [ + False + ], 'reference_assembly': [ None ], @@ -816,6 +838,16 @@ def test_entity_indices(self): 'total': 1, 'type': 'terms' }, + 'files.is_supplementary': { + 'terms': [ + { + 'count': 1, + 'term': 'false' + } + ], + 'total': 1, + 'type': 'terms' + }, 'files.reference_assembly': { 'terms': [ { @@ -952,6 +984,9 @@ def test_entity_indices(self): 'file_format': [ '.bam' ], + 'is_supplementary': [ + False + ], 'reference_assembly': [ None ], @@ -964,6 +999,9 @@ def test_entity_indices(self): 'file_format': [ '.vcf.gz' ], + 'is_supplementary': [ + False + ], 'reference_assembly': [ None ], @@ -1171,6 +1209,16 @@ def test_entity_indices(self): 'total': 1, 'type': 'terms' }, + 'files.is_supplementary': { + 'terms': [ + { + 'count': 1, + 'term': 'false' + } + ], + 'total': 1, + 'type': 'terms' + }, 'files.reference_assembly': { 'terms': [ { @@ -1302,6 +1350,9 @@ def test_entity_indices(self): 'file_format': [ '.bam' ], + 'is_supplementary': [ + False + ], 'reference_assembly': [ None ], @@ -1314,6 +1365,9 @@ def test_entity_indices(self): 'file_format': [ '.vcf.gz' ], + 'is_supplementary': [ + False + ], 'reference_assembly': [ None ], @@ -1521,6 +1575,16 @@ def test_entity_indices(self): 'total': 1, 'type': 'terms' }, + 'files.is_supplementary': { + 'terms': [ + { + 'count': 1, + 'term': 'false' + } + ], + 'total': 1, + 'type': 'terms' + }, 'files.reference_assembly': { 'terms': [ { @@ -1650,6 +1714,7 @@ def test_entity_indices(self): 'version': '2022-06-01T00:00:00.000000Z', 'uuid': '6a85e0ab-2386-4f7e-8503-d72d90b4bc47', 'size': 213021639, + 'is_supplementary': False, 'name': 'file_1e269f04-4347-4188-b060-1dcc69e71d67', 'crc32': '', 'sha256': '', @@ -1779,6 +1844,7 @@ def test_entity_indices(self): 'version': '2022-06-01T00:00:00.000000Z', 'uuid': '677f207e-2d12-4eca-8f7a-039325af91ad', 'size': 3306845592, + 'is_supplementary': False, 'name': 'file_8b722e88-8103-49c1-b351-e64fa7c6ab37', 'crc32': '', 'sha256': '', @@ -1993,6 +2059,16 @@ def test_entity_indices(self): 'total': 2, 'type': 'terms' }, + 'files.is_supplementary': { + 'terms': [ + { + 'count': 2, + 'term': 'false' + } + ], + 'total': 2, + 'type': 'terms' + }, 'files.reference_assembly': { 'terms': [ {