Skip to content

Commit

Permalink
[a r] Index supplementary files for AnVIL (#5000)
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Mar 18, 2023
1 parent de9ffdc commit 47617da
Show file tree
Hide file tree
Showing 7 changed files with 162 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping:
'file_md5sum',
'reference_assembly',
'file_name',
'is_supplementary',
# Not in schema
'crc32',
'sha256',
Expand Down Expand Up @@ -223,6 +224,7 @@ def facets(self) -> Sequence[str]:
'files.data_modality',
'files.file_format',
'files.reference_assembly',
'files.is_supplementary',
]

@property
Expand Down
2 changes: 2 additions & 0 deletions src/azul/plugins/metadata/anvil/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
EntityReference,
EntityType,
FieldTypes,
null_bool,
null_datetime,
null_int,
null_str,
Expand Down Expand Up @@ -275,6 +276,7 @@ def _file_types(cls) -> FieldTypes:
'file_md5sum': null_str,
'reference_assembly': [null_str],
'file_name': null_str,
'is_supplementary': null_bool,
# Not in schema
'version': null_str,
'uuid': null_str,
Expand Down
1 change: 1 addition & 0 deletions src/azul/plugins/metadata/anvil/service/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ def _non_pivotal_fields_by_entity_type(self) -> dict[str, set[str]]:
'count',
'data_modality',
'file_format',
'is_supplementary',
'reference_assembly'
}
}
43 changes: 43 additions & 0 deletions src/azul/plugins/repository/tdr_anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ def merge(cls, links: Iterable['Link']) -> 'Link':

class BundleEntityType(Enum):
    """
    The kinds of entity a bundle can be anchored to.

    The value of each member is the name of an entity type. Bundle listing
    uses that value both as the name of the table to select bundle entities
    from and as the literal reported in the `entity_type` column of the
    result.
    """
    # Members are intentionally left unannotated. The typing specification
    # asks that enum members carry no explicit type annotation so that type
    # checkers can infer a literal type for each of them; an annotation such
    # as `primary: EntityType = ...` is reported as an error by checkers
    # while having no effect at runtime.
    primary = 'biosample'
    supplementary = 'file'


@attr.s(auto_attribs=True, frozen=True, kw_only=True)
Expand Down Expand Up @@ -225,10 +226,15 @@ def _list_bundles(self,
partition_prefix = spec.prefix.common + prefix
validate_uuid_prefix(partition_prefix)
primary = BundleEntityType.primary.value
supplementary = BundleEntityType.supplementary.value
rows = self._run_sql(f'''
SELECT datarepo_row_id, {primary!r} AS entity_type
FROM {backtick(self._full_table_name(spec, primary))}
WHERE STARTS_WITH(datarepo_row_id, '{partition_prefix}')
UNION ALL
SELECT datarepo_row_id, {supplementary!r} AS entity_type
FROM {backtick(self._full_table_name(spec, supplementary))} AS supp
WHERE STARTS_WITH(datarepo_row_id, '{partition_prefix}') AND supp.is_supplementary
''')
return [
AnvilBundleFQID(source=source,
Expand Down Expand Up @@ -264,6 +270,8 @@ def list_partitions(self,
def _emulate_bundle(self, bundle_fqid: AnvilBundleFQID) -> TDRAnvilBundle:
    """
    Build the bundle identified by the given FQID, dispatching on the type
    of entity the bundle is anchored to.

    :param bundle_fqid: identifies the bundle to emulate; its `entity_type`
                        attribute selects the emulation strategy

    :return: the bundle produced by `_primary_bundle` for primary bundles
             or by `_supplementary_bundle` for supplementary ones

    :raises AssertionError: if `bundle_fqid.entity_type` is not one of the
                            handled `BundleEntityType` members
    """
    if bundle_fqid.entity_type is BundleEntityType.primary:
        return self._primary_bundle(bundle_fqid)
    elif bundle_fqid.entity_type is BundleEntityType.supplementary:
        return self._supplementary_bundle(bundle_fqid)
    else:
        # Raise explicitly instead of using `assert False`: assert
        # statements are removed when Python runs with -O, in which case
        # this branch would fall through and silently return None. The
        # exception type and argument are the same as before.
        raise AssertionError(bundle_fqid.entity_type)

Expand Down Expand Up @@ -308,6 +316,40 @@ def _primary_bundle(self, bundle_fqid: AnvilBundleFQID) -> TDRAnvilBundle:

return result

def _supplementary_bundle(self, bundle_fqid: AnvilBundleFQID) -> TDRAnvilBundle:
    """
    Emulate a bundle that is anchored to a single row of the table named by
    the bundle's entity type and that links that row to the dataset.

    Two queries are issued: one for the row whose `datarepo_row_id` equals
    the bundle UUID converted from the bundle UUID version to the row UUID
    version, and one against the `dataset` table. Both rows are added to the
    bundle as entities, along with one link that has the dataset as its
    input and the bundle entity as its output.

    :param bundle_fqid: identifies the bundle to emulate

    :return: a bundle containing the two entities and the link between them
    """
    row_id = uuids.change_version(bundle_fqid.uuid,
                                  self.bundle_uuid_version,
                                  self.datarepo_row_uuid_version)
    spec = bundle_fqid.source.spec
    anchor_type = bundle_fqid.entity_type.value
    bundle = TDRAnvilBundle(fqid=bundle_fqid, manifest=[], metadata_files={})

    anchor_columns = ', '.join(sorted(self._columns(anchor_type)))
    anchor_table = backtick(self._full_table_name(spec, anchor_type))
    anchor_row = one(self._run_sql(f'''
SELECT {anchor_columns}
FROM {anchor_table}
WHERE datarepo_row_id = '{row_id}'
'''))

    dataset_type = 'dataset'
    dataset_columns = ', '.join(sorted(self._columns(dataset_type)))
    dataset_table = backtick(self._full_table_name(spec, dataset_type))
    # NOTE(review): `one` is presumably more_itertools.one, which rejects
    # anything but exactly one item. If so, LIMIT 2 is enough for it to
    # notice a second dataset row without fetching the entire table.
    dataset_row = one(self._run_sql(f'''
SELECT {dataset_columns}
FROM {dataset_table}
LIMIT 2
'''))

    refs_by_key = {}
    keys_by_role = {}
    # The bundle entity is registered before the dataset; the role is the
    # name of the keyword argument under which the key is passed to
    # `Link.create`.
    members = (
        ('outputs', anchor_type, anchor_row),
        ('inputs', dataset_type, dataset_row),
    )
    for role, member_type, member_row in members:
        ref = EntityReference(entity_type=member_type,
                              entity_id=member_row['datarepo_row_id'])
        key = KeyReference(key=member_row[member_type + '_id'],
                           entity_type=member_type)
        refs_by_key[key] = ref
        bundle.add_entity(ref, self._version, member_row)
        keys_by_role[role] = key

    link = Link.create(**keys_by_role)
    bundle.add_links(bundle_fqid, {link}, refs_by_key)
    return bundle

def _bundle_entity(self, bundle_fqid: AnvilBundleFQID) -> KeyReference:
source = bundle_fqid.source
bundle_uuid = bundle_fqid.uuid
Expand Down Expand Up @@ -696,6 +738,7 @@ def _convert_column(self, value):
'reference_assembly',
'file_name',
'file_ref',
'is_supplementary',
},
'activity': {
'activity_id',
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 47617da

Please sign in to comment.