Skip to content

Commit

Permalink
Include entity type in AnVIL bundle FQIDs
Browse files: browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Mar 18, 2023
1 parent 3eb1587 commit 2deef77
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 20 deletions.
64 changes: 47 additions & 17 deletions src/azul/plugins/repository/tdr_anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
defaultdict,
)
import datetime
from enum import (
Enum,
)
import logging
from operator import (
itemgetter,
Expand Down Expand Up @@ -111,8 +114,28 @@ def merge(cls, links: Iterable['Link']) -> 'Link':
Links = set[Link]


class BundleEntityType(Enum):
    """
    The kinds of entity that an AnVIL bundle can be built around.

    Each member's value is an entity type name. The plugin uses that value
    to derive the table to query and the name of the primary key column
    (``<value>_id``).
    """
    # Currently the only member; its value is the string 'biosample'
    primary: EntityType = 'biosample'


@attr.s(auto_attribs=True, frozen=True, kw_only=True)
class AnvilBundleFQID(TDRBundleFQID):
    """
    A TDR bundle FQID that additionally records the type of entity the
    bundle is built around.
    """
    # The kind of entity this bundle is based on
    entity_type: BundleEntityType

    def fqid_json(self) -> MutableJSON:
        """
        Return the JSON form produced by the parent class, extended with an
        ``entity_type`` entry holding the value of the enum member.
        """
        fqid = super().fqid_json()
        fqid['entity_type'] = self.entity_type.value
        return fqid

    @classmethod
    def from_json(cls, source: TDRSourceRef, json: JSON) -> 'AnvilBundleFQID':
        """
        Build an instance for the given source from a JSON object with the
        keys ``bundle_uuid``, ``bundle_version`` and ``entity_type``. The
        latter must be the value of a ``BundleEntityType`` member.
        """
        # The dict literal evaluates the lookups in the same order as the
        # keyword arguments would be evaluated in a direct call.
        fields = {
            'source': source,
            'uuid': json['bundle_uuid'],
            'version': json['bundle_version'],
            'entity_type': BundleEntityType(json['entity_type']),
        }
        return cls(**fields)


class TDRAnvilBundle(TDRBundle):
entity_type: EntityType = 'biosample'

def add_entity(self,
entity: EntityReference,
Expand Down Expand Up @@ -180,7 +203,7 @@ def _parse_drs_uri(self, file_ref: Optional[str]) -> Optional[str]:
return self._parse_drs_path(file_ref)


class Plugin(TDRPlugin[TDRSourceSpec, TDRSourceRef, TDRBundleFQID]):
class Plugin(TDRPlugin[TDRSourceSpec, TDRSourceRef, AnvilBundleFQID]):

@cached_property
def _version(self):
Expand All @@ -196,24 +219,25 @@ def _version(self):
def _list_bundles(self,
source: TDRSourceRef,
prefix: str
) -> list[TDRBundleFQID]:
) -> list[AnvilBundleFQID]:
spec = source.spec
partition_prefix = spec.prefix.common + prefix
validate_uuid_prefix(partition_prefix)
entity_type = TDRAnvilBundle.entity_type
primary = BundleEntityType.primary.value
rows = self._run_sql(f'''
SELECT datarepo_row_id
FROM {backtick(self._full_table_name(spec, entity_type))}
SELECT datarepo_row_id, {primary!r} AS entity_type
FROM {backtick(self._full_table_name(spec, primary))}
WHERE STARTS_WITH(datarepo_row_id, '{partition_prefix}')
''')
return [
TDRBundleFQID(source=source,
# Reversibly tweak the entity UUID to prevent
# collisions between entity IDs and bundle IDs
uuid=uuids.change_version(row['datarepo_row_id'],
self.datarepo_row_uuid_version,
self.bundle_uuid_version),
version=self._version)
AnvilBundleFQID(source=source,
# Reversibly tweak the entity UUID to prevent
# collisions between entity IDs and bundle IDs
uuid=uuids.change_version(row['datarepo_row_id'],
self.datarepo_row_uuid_version,
self.bundle_uuid_version),
version=self._version,
entity_type=BundleEntityType(row['entity_type']))
for row in rows
]

Expand All @@ -226,7 +250,7 @@ def list_partitions(self,
for partition_prefix in prefix.partition_prefixes()
]
assert prefixes, prefix
entity_type = TDRAnvilBundle.entity_type
entity_type = BundleEntityType.primary.value
pk_column = entity_type + '_id'
rows = self._run_sql(f'''
SELECT prefix, COUNT({pk_column}) AS subgraph_count
Expand All @@ -236,7 +260,13 @@ def list_partitions(self,
''')
return {row['prefix']: row['subgraph_count'] for row in rows}

def _emulate_bundle(self, bundle_fqid: TDRBundleFQID) -> TDRAnvilBundle:
def _emulate_bundle(self, bundle_fqid: AnvilBundleFQID) -> TDRAnvilBundle:
if bundle_fqid.entity_type is BundleEntityType.primary:
return self._primary_bundle(bundle_fqid)
else:
assert False, bundle_fqid.entity_type

def _primary_bundle(self, bundle_fqid: AnvilBundleFQID) -> TDRAnvilBundle:
source = bundle_fqid.source
bundle_entity = self._bundle_entity(bundle_fqid)

Expand Down Expand Up @@ -277,13 +307,13 @@ def _emulate_bundle(self, bundle_fqid: TDRBundleFQID) -> TDRAnvilBundle:

return result

def _bundle_entity(self, bundle_fqid: TDRBundleFQID) -> KeyReference:
def _bundle_entity(self, bundle_fqid: AnvilBundleFQID) -> KeyReference:
source = bundle_fqid.source
bundle_uuid = bundle_fqid.uuid
entity_id = uuids.change_version(bundle_uuid,
self.bundle_uuid_version,
self.datarepo_row_uuid_version)
entity_type = TDRAnvilBundle.entity_type
entity_type = bundle_fqid.entity_type.value
pk_column = entity_type + '_id'
bundle_entity = one(self._run_sql(f'''
SELECT {pk_column}
Expand Down
6 changes: 3 additions & 3 deletions test/integration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@
TDRSourceRef,
)
from azul.plugins.repository.tdr_anvil import (
TDRAnvilBundle,
BundleEntityType,
)
from azul.portal_service import (
PortalService,
Expand Down Expand Up @@ -451,7 +451,7 @@ def _wait_for_indexer():
# While the files index does exist for AnVIL, it's possible
# for a bundle entity not to contain any files and
# thus be absent from the files response
entity_type = pluralize(TDRAnvilBundle.entity_type)
entity_type = pluralize(BundleEntityType.primary.value)
else:
assert False, catalog
self._assert_catalog_complete(catalog=catalog.name,
Expand Down Expand Up @@ -494,7 +494,7 @@ def _test_other_endpoints(self):
if config.is_hca_enabled(catalog):
bundle_index, project_index = 'bundles', 'projects'
elif config.is_anvil_enabled(catalog):
bundle_index, project_index = pluralize(TDRAnvilBundle.entity_type), 'datasets'
bundle_index, project_index = pluralize(BundleEntityType.primary.value), 'datasets'
else:
assert False, catalog
service_paths = {
Expand Down

0 comments on commit 2deef77

Please sign in to comment.