dataladify_studyvisit_from_meta -> deposit_visit_dataset #33

Merged
mih merged 3 commits from dataladgen into main 2023-05-31 15:45:05 +00:00
3 changed files with 73 additions and 46 deletions

View file

@ -12,7 +12,6 @@ import sys
import tempfile import tempfile
from uuid import uuid4 from uuid import uuid4
import datalad.api as dl
from datalad_catalog.catalog import Catalog from datalad_catalog.catalog import Catalog
from datalad_catalog.webcatalog import WebCatalog from datalad_catalog.webcatalog import WebCatalog
@ -247,7 +246,6 @@ def format_bytes(bytes, decimals=2):
return f"{round(bytes / math.pow(k, i), dm)} {sizes[i]}" return f"{round(bytes / math.pow(k, i), dm)} {sizes[i]}"
if __name__ == '__main__': if __name__ == '__main__':
import argparse import argparse
p = argparse.ArgumentParser(description=__doc__) p = argparse.ArgumentParser(description=__doc__)

View file

@ -1,20 +1,27 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
This command reads the metadata deposited by `deposit_visit_metadata` for a
visit in a study (both given by their respective identifiers) from the data store,
and generates a DataLad dataset from it. This DataLad dataset provides
versioned access to the visit's DICOM data, up to single-image granularity.
Moreover, all DICOM files are annotated with basic DICOM tags that enable
on-demand dataset views for particular applications (e.g., DICOMs sorted
by image series and protocol name). The DataLad dataset is deposited as
two files in the study directory:
- `{visit_id}_XDLRA--refs`
- `{visit_id}_XDLRA--repo-export`
where the former enables `datalad/git clone` operations, and the latter
represents the actual dataset as a compressed archive.
""" """
import json import json
import os import os
from pathlib import Path from pathlib import Path
import sys
import tempfile import tempfile
import datalad.api as dl import datalad.api as dl
# this points to the top of the ICF data store.
# internally it will be amended with the missing components
# for study and visit deposit locations
icfstore_baseurl = 'https://data.inm-icf.de'
# which DICOM tags to extract from DICOM files and store as # which DICOM tags to extract from DICOM files and store as
# git-annex metadata (e.g., to enable metadata-driven views # git-annex metadata (e.g., to enable metadata-driven views
# of visit datasets) # of visit datasets)
@ -28,9 +35,12 @@ dicom_metadata_keys = [
] ]
def main(store_dir: str, def main(
study_id: str, store_dir: str,
visit_id: str): store_url: str,
study_id: str,
visit_id: str,
):
store_base_dir = Path(store_dir) store_base_dir = Path(store_dir)
# where to deposit the final datalad dataset # where to deposit the final datalad dataset
repo_base_path = store_base_dir / study_id / f'{visit_id}_' repo_base_path = store_base_dir / study_id / f'{visit_id}_'
@ -48,20 +58,27 @@ def main(store_dir: str,
f'{visit_id}_metadata_dicoms.json' f'{visit_id}_metadata_dicoms.json'
with tempfile.TemporaryDirectory(prefix='dataladify_visit_') as wdir: with tempfile.TemporaryDirectory(prefix='dataladify_visit_') as wdir:
runshit( deposit_dataset(
# workdir # workdir
wdir, wdir,
# path to deposited dataset metadata # path to deposited dataset metadata
dataset_metadata_path.absolute(), dataset_metadata_path.absolute(),
# path to deposited file metadata # path to deposited file metadata
file_metadata_path.absolute(), file_metadata_path.absolute(),
# base URL of the store to complete access URLs
store_url,
# path to deposit the repo at # path to deposit the repo at
repo_base_path.absolute(), repo_base_path.absolute(),
) )
def runshit(wdir, metapath_dataset, metapath_file, repobasepath): def deposit_dataset(
wdir: Path,
metapath_dataset: Path,
metapath_files: Path,
store_url: str,
repobasepath: Path,
):
# read tar metadata dict # read tar metadata dict
tar_metadata = read_json_file(metapath_dataset) tar_metadata = read_json_file(metapath_dataset)
expected_keys = ('size', 'md5', 'dspath', 'storepath') expected_keys = ('size', 'md5', 'dspath', 'storepath')
@ -88,7 +105,7 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
uncurl_uuid = repo.call_annex_records(['info', 'uncurl'])[0]['uuid'] uncurl_uuid = repo.call_annex_records(['info', 'uncurl'])[0]['uuid']
assert uncurl_uuid assert uncurl_uuid
# register the URL of the tarball # register the URL of the tarball
tar_metadata['url'] = f"{icfstore_baseurl}/{tar_metadata['storepath']}" tar_metadata['url'] = f"{store_url}/{tar_metadata['storepath']}"
res = ds.addurls( res = ds.addurls(
[tar_metadata], [tar_metadata],
'{url}', '{url}',
@ -98,9 +115,11 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
# fish out annex key of tarball. # fish out annex key of tarball.
# we could also construct that, but let's not duplicate the setup above # we could also construct that, but let's not duplicate the setup above
tarpath = Path(tar_metadata.get('dspath')) tarpath = Path(tar_metadata.get('dspath'))
tarkey = [r.get('annexkey') for r in res tarkey = [
if r.get('action') == 'fromkey' r.get('annexkey') for r in res
and r.get('path', '').endswith(tarpath.name)] if r.get('action') == 'fromkey'
and r.get('path', '').endswith(tarpath.name)
]
assert len(tarkey) == 1 assert len(tarkey) == 1
tarkey = tarkey[0] tarkey = tarkey[0]
assert tarkey assert tarkey
@ -123,7 +142,7 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
assert archivist_uuid assert archivist_uuid
# load dicom metadata # load dicom metadata
dicoms = read_json_file(metapath_file) dicoms = read_json_file(metapath_files)
# add to dataset # add to dataset
dicom_recs = ds.addurls( dicom_recs = ds.addurls(
dicoms, dicoms,
@ -146,7 +165,10 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
repo.call_annex(['setpresentkey', dicomkey, archivist_uuid, '1']) repo.call_annex(['setpresentkey', dicomkey, archivist_uuid, '1'])
repo.call_git([ repo.call_git([
'remote', 'add', 'icfstore', 'remote', 'add',
# the remote name is arbitrary, it will not end up in the resulting
# deposit
'store',
# this is a little twisted: # this is a little twisted:
# the first line is an f-string, because we need to get the base URL # the first line is an f-string, because we need to get the base URL
# pointing to the study directory into the remote URL # pointing to the study directory into the remote URL
@ -163,7 +185,7 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
# to be able to actually push everything # to be able to actually push everything
repo.call_annex(['whereis', '--key', dicomkeys[0]]) repo.call_annex(['whereis', '--key', dicomkeys[0]])
ds.push( ds.push(
to='icfstore', to='store',
# under no circumstances do we want to push annexed content. # under no circumstances do we want to push annexed content.
# and there also should be none # and there also should be none
data='nothing', data='nothing',
@ -174,31 +196,36 @@ def read_json_file(file_path):
""" """
Load content from catalog metadata file for current node Load content from catalog metadata file for current node
""" """
try: with open(file_path) as f:
with open(file_path) as f: return json.load(f)
return json.load(f)
except OSError as err:
raise("OS error: {0}".format(err))
except:
raise("Unexpected error:", sys.exc_info()[0])
if __name__ == '__main__': if __name__ == '__main__':
import argparse import argparse
p = argparse.ArgumentParser(description=__doc__) p = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
p.add_argument(
'--id', nargs=2, metavar=('STUDY-ID', 'VISIT-ID'), required=True,
help="study and visit identifiers, used to "
"locate the visit archive in the storage organization. "
)
p.add_argument( p.add_argument(
"-o", "--store-dir", metavar='PATH', default=os.getcwd(), "-o", "--store-dir", metavar='PATH', default=os.getcwd(),
help="Root directory of the ICF data store. " help="root directory of the data store. "
"Visit data will be read from it, and the DataLad dataset will be " "Visit data will be read from it, and the DataLad dataset will be "
"deposited into it." "deposited into it."
) )
p.add_argument( p.add_argument(
'--id', nargs=2, metavar=('STUDY-ID', 'VISIT-ID'), required=True, '--store-url', metavar='URL', default='https://data.inm-icf.de',
help="The study and visit identifiers, used to " help="base URL of the DICOM data store. This URL is used to "
"locate the visit archive in the storage organization. " "register TAR archive download URLs in the generated DataLad "
"dataset."
) )
args = p.parse_args() args = p.parse_args()
main(store_dir=args.store_dir, main(store_dir=args.store_dir,
store_url=args.store_url,
study_id=args.id[0], study_id=args.id[0],
visit_id=args.id[1], visit_id=args.id[1],
) )

View file

@ -28,6 +28,7 @@ def run_script(name: str,
working_directory: str | Path, working_directory: str | Path,
study_id: str, study_id: str,
visit_id: str, visit_id: str,
*args
): ):
script_path = Path(*(Path(__file__).parts[:-3] + ('bin',))) / name script_path = Path(*(Path(__file__).parts[:-3] + ('bin',))) / name
@ -37,14 +38,16 @@ def run_script(name: str,
str(script_path), str(script_path),
'--id', '--id',
study_id, study_id,
visit_id visit_id,
*args
] ]
) )
def process_visits(studies_dir: Path, def process_visits(studies_dir: Path,
studies: list[str], studies: list[str],
visits: list[str] visits: list[str],
baseurl: str,
): ):
for study in studies: for study in studies:
for visit in visits: for visit in visits:
@ -56,9 +59,10 @@ def process_visits(studies_dir: Path,
) )
# run dataladification script # run dataladification script
run_script( run_script(
'dataladify_studyvisit_from_meta', 'deposit_visit_dataset',
studies_dir, studies_dir,
study, visit study, visit,
'--store-url', baseurl,
) )
# run catalogification script # run catalogification script
run_script( run_script(
@ -140,6 +144,7 @@ def test_pipeline(tmp_path: Path,
Path(test_studies_dir), Path(test_studies_dir),
test_study_names, test_study_names,
existing_visits, existing_visits,
data_webserver,
) )
# 1. Test metadata generation # 1. Test metadata generation
@ -166,12 +171,9 @@ def test_pipeline(tmp_path: Path,
dataaccess_credential, dataaccess_credential,
credman, credman,
) )
# TODO reenable once the server setup is actually compatible # pull all individual DICOM files, this will internally
# TODO swap the order of gets, or actually drop the tar get # access/download the archive at the store
# completely. Pulling individual files will do all that internally dataset.get(f'{study}_{visit}')
# Try to get the tar file and the DICOMs
#dataset.get(f'icf/{visit}_dicom.tar')
#dataset.get(f'{study}_{visit}')
# 3. Test catalog generation # 3. Test catalog generation
# - assert that study catalogs have been created using webcatalog method # - assert that study catalogs have been created using webcatalog method