Skip to content

Commit

Permalink
[a r] Index supplementary files for AnVIL (#5000)
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Mar 18, 2023
1 parent 2deef77 commit 0002e32
Show file tree
Hide file tree
Showing 8 changed files with 166 additions and 2 deletions.
2 changes: 2 additions & 0 deletions src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping:
'file_md5sum',
'reference_assembly',
'file_name',
'is_supplementary',
# Not in schema
'crc32',
'sha256',
Expand Down Expand Up @@ -223,6 +224,7 @@ def facets(self) -> Sequence[str]:
'files.data_modality',
'files.file_format',
'files.reference_assembly',
'files.is_supplementary',
]

@property
Expand Down
2 changes: 2 additions & 0 deletions src/azul/plugins/metadata/anvil/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
EntityReference,
EntityType,
FieldTypes,
null_bool,
null_datetime,
null_int,
null_str,
Expand Down Expand Up @@ -275,6 +276,7 @@ def _file_types(cls) -> FieldTypes:
'file_md5sum': null_str,
'reference_assembly': [null_str],
'file_name': null_str,
'is_supplementary': null_bool,
# Not in schema
'version': null_str,
'uuid': null_str,
Expand Down
1 change: 1 addition & 0 deletions src/azul/plugins/metadata/anvil/service/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ def _non_pivotal_fields_by_entity_type(self) -> dict[str, set[str]]:
'count',
'data_modality',
'file_format',
'is_supplementary',
'reference_assembly'
}
}
43 changes: 43 additions & 0 deletions src/azul/plugins/repository/tdr_anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ def merge(cls, links: Iterable['Link']) -> 'Link':

# The kinds of entity a bundle can be built around. The value of each member
# is the entity type name that bundle listing passes to `_full_table_name`
# when selecting the rows that become bundles of that kind.
class BundleEntityType(Enum):
# Bundles listed from rows of the 'biosample' table.
primary: EntityType = 'biosample'
# Bundles listed from rows of the 'file' table; the listing query only
# selects file rows whose `is_supplementary` column is true.
supplementary: EntityType = 'file'


@attr.s(auto_attribs=True, frozen=True, kw_only=True)
Expand Down Expand Up @@ -224,10 +225,15 @@ def _list_bundles(self,
partition_prefix = spec.prefix.common + prefix
validate_uuid_prefix(partition_prefix)
primary = BundleEntityType.primary.value
supplementary = BundleEntityType.supplementary.value
rows = self._run_sql(f'''
SELECT datarepo_row_id, {primary!r} AS entity_type
FROM {backtick(self._full_table_name(spec, primary))}
WHERE STARTS_WITH(datarepo_row_id, '{partition_prefix}')
UNION ALL
SELECT datarepo_row_id, {supplementary!r} AS entity_type
FROM {backtick(self._full_table_name(spec, supplementary))} AS supp
WHERE STARTS_WITH(datarepo_row_id, '{partition_prefix}') AND supp.is_supplementary
''')
return [
AnvilBundleFQID(source=source,
Expand Down Expand Up @@ -263,6 +269,8 @@ def list_partitions(self,
def _emulate_bundle(self, bundle_fqid: AnvilBundleFQID) -> TDRAnvilBundle:
    """
    Build the emulated bundle identified by `bundle_fqid`.

    The work is delegated to the builder that matches the kind of entity
    the bundle is based on: `_primary_bundle` for primary bundles and
    `_supplementary_bundle` for supplementary ones.

    :param bundle_fqid: the FQID of the bundle to emulate; its
                        `entity_type` selects the builder

    :return: whatever the selected builder returns
    """
    kind = bundle_fqid.entity_type
    if kind is BundleEntityType.primary:
        return self._primary_bundle(bundle_fqid)
    if kind is BundleEntityType.supplementary:
        return self._supplementary_bundle(bundle_fqid)
    # Every member of BundleEntityType is handled above, so reaching this
    # point means the FQID carries an unexpected entity type.
    assert False, kind

Expand Down Expand Up @@ -307,6 +315,40 @@ def _primary_bundle(self, bundle_fqid: AnvilBundleFQID) -> TDRAnvilBundle:

return result

# Emulate a bundle around a single entity of the type named by the bundle
# FQID (`_emulate_bundle` routes only supplementary bundles here). The result
# holds two entities, the bundle entity's own row and the one row of the
# 'dataset' table, joined by a single link that has the dataset as its input
# and the bundle entity as its output.
def _supplementary_bundle(self, bundle_fqid: AnvilBundleFQID) -> TDRAnvilBundle:
# Derive the row ID of the bundle entity from the bundle UUID.
# NOTE(review): presumably this reverses the UUID version change applied
# when the bundle was listed -- confirm against `uuids.change_version`.
entity_id = uuids.change_version(bundle_fqid.uuid,
self.bundle_uuid_version,
self.datarepo_row_uuid_version)
source = bundle_fqid.source.spec
bundle_entity_type = bundle_fqid.entity_type.value
# Start from an empty bundle; the entities and the link are added below.
result = TDRAnvilBundle(fqid=bundle_fqid, manifest=[], metadata_files={})
columns = self._columns(bundle_entity_type)
# Fetch the row of the bundle entity itself, selected by its row ID.
# NOTE(review): `one` presumably fails unless the query yields exactly one
# row -- confirm.
bundle_entity = one(self._run_sql(f'''
SELECT {', '.join(sorted(columns))}
FROM {backtick(self._full_table_name(source, bundle_entity_type))}
WHERE datarepo_row_id = '{entity_id}'
'''))
linked_entity_type = 'dataset'
columns = self._columns(linked_entity_type)
# Fetch the dataset row without any filter. NOTE(review): the LIMIT 2
# presumably caps the result while still letting `one` detect an unexpected
# second dataset row -- confirm.
linked_entity = one(self._run_sql(f'''
SELECT {', '.join(sorted(columns))}
FROM {backtick(self._full_table_name(source, linked_entity_type))}
LIMIT 2
'''))
entities_by_key = {}
link_args = {}
# Register both rows with the bundle. The bundle entity becomes the link's
# 'outputs' and the dataset its 'inputs'.
for entity_type, row, arg in [
(bundle_entity_type, bundle_entity, 'outputs'),
(linked_entity_type, linked_entity, 'inputs')
]:
# Entities are referenced by row ID, whereas links refer to them by the
# value of their `<entity_type>_id` column.
entity_ref = EntityReference(entity_type=entity_type, entity_id=row['datarepo_row_id'])
key_ref = KeyReference(key=row[entity_type + '_id'], entity_type=entity_type)
entities_by_key[key_ref] = entity_ref
result.add_entity(entity_ref, self._version, row)
link_args[arg] = key_ref
# Add the single link, passing along the key-to-entity mapping built above.
result.add_links(bundle_fqid, {Link.create(**link_args)}, entities_by_key)
return result

def _bundle_entity(self, bundle_fqid: AnvilBundleFQID) -> KeyReference:
source = bundle_fqid.source
bundle_uuid = bundle_fqid.uuid
Expand Down Expand Up @@ -695,6 +737,7 @@ def _convert_column(self, value):
'reference_assembly',
'file_name',
'file_ref',
'is_supplementary',
},
'activity': {
'activity_id',
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 4 additions & 2 deletions test/integration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,8 +450,10 @@ def _wait_for_indexer():
elif config.is_anvil_enabled(catalog.name):
# While the files index does exist for AnVIL, it's possible
# for a bundle entity not to contain any files and
# thus be absent from the files response
entity_type = pluralize(BundleEntityType.primary.value)
# thus be absent from the files response. The only entity
# type that is linked to both primary and supplementary
# bundles is datasets.
entity_type = 'datasets'
else:
assert False, catalog
self._assert_catalog_complete(catalog=catalog.name,
Expand Down
Loading

0 comments on commit 0002e32

Please sign in to comment.