dataladify_studyvisit_from_meta -> deposit_visit_dataset #33
3 changed files with 73 additions and 46 deletions
|
|
@ -12,7 +12,6 @@ import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
|
|
||||||
import datalad.api as dl
|
|
||||||
from datalad_catalog.catalog import Catalog
|
from datalad_catalog.catalog import Catalog
|
||||||
from datalad_catalog.webcatalog import WebCatalog
|
from datalad_catalog.webcatalog import WebCatalog
|
||||||
|
|
||||||
|
|
@ -95,7 +94,7 @@ def get_catalog(study_id, catalog_path):
|
||||||
# 3. set catalog home page
|
# 3. set catalog home page
|
||||||
ctlg.main_id = study_entry.get('dataset_id')
|
ctlg.main_id = study_entry.get('dataset_id')
|
||||||
ctlg.main_version = study_entry.get('dataset_version')
|
ctlg.main_version = study_entry.get('dataset_version')
|
||||||
ctlg.set_main_dataset()
|
ctlg.set_main_dataset()
|
||||||
return ctlg
|
return ctlg
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -109,7 +108,7 @@ def generate_study_entry(study_id):
|
||||||
ds_version='latest',
|
ds_version='latest',
|
||||||
ds_name=study_id,
|
ds_name=study_id,
|
||||||
ds_description=desc)
|
ds_description=desc)
|
||||||
|
|
||||||
|
|
||||||
def update_entry(ds_id, ds_version, ds_name, key, value, study_catalog_path):
|
def update_entry(ds_id, ds_version, ds_name, key, value, study_catalog_path):
|
||||||
meta_item = {
|
meta_item = {
|
||||||
|
|
@ -247,7 +246,6 @@ def format_bytes(bytes, decimals=2):
|
||||||
return f"{round(bytes / math.pow(k, i), dm)} {sizes[i]}"
|
return f"{round(bytes / math.pow(k, i), dm)} {sizes[i]}"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import argparse
|
import argparse
|
||||||
p = argparse.ArgumentParser(description=__doc__)
|
p = argparse.ArgumentParser(description=__doc__)
|
||||||
|
|
|
||||||
|
|
@ -1,20 +1,27 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
|
This command reads the metadata deposit from `deposit_visit_metadata` for a
|
||||||
|
visit in a study (given by their respective identifiers) from the data store,
|
||||||
|
and generates a DataLad dataset from it. This DataLad dataset provides
|
||||||
|
versioned access to the visit's DICOM data, up to single-image granularity.
|
||||||
|
Moreover, all DICOM files are annotated with basic DICOM tags that enable
|
||||||
|
on-demand dataset views for particular applications (e.g., DICOMs sorted
|
||||||
|
by image series and protocol name). The DataLad dataset is deposited in
|
||||||
|
two files in the study directory:
|
||||||
|
|
||||||
|
- `{visit_id}_XDLRA--refs`
|
||||||
|
- `{visit_id}_XDLRA--repo-export`
|
||||||
|
|
||||||
|
where the former enables `datalad/git clone` operations, and the latter
|
||||||
|
represents the actual dataset as a compressed archive.
|
||||||
"""
|
"""
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys
|
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
import datalad.api as dl
|
import datalad.api as dl
|
||||||
|
|
||||||
# this points to the top of the ICF data store.
|
|
||||||
# internally it will be amended with the missing components
|
|
||||||
# for study and visit deposit locations
|
|
||||||
icfstore_baseurl = 'https://data.inm-icf.de'
|
|
||||||
|
|
||||||
# which DICOM tags to extract from DICOM files and store as
|
# which DICOM tags to extract from DICOM files and store as
|
||||||
# git-annex metadata (e.g., to enable metadata-driven views
|
# git-annex metadata (e.g., to enable metadata-driven views
|
||||||
# of visit datasets)
|
# of visit datasets)
|
||||||
|
|
@ -28,9 +35,12 @@ dicom_metadata_keys = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def main(store_dir: str,
|
def main(
|
||||||
study_id: str,
|
store_dir: str,
|
||||||
visit_id: str):
|
store_url: str,
|
||||||
|
study_id: str,
|
||||||
|
visit_id: str,
|
||||||
|
):
|
||||||
store_base_dir = Path(store_dir)
|
store_base_dir = Path(store_dir)
|
||||||
# where to deposit the final datalad dataset
|
# where to deposit the final datalad dataset
|
||||||
repo_base_path = store_base_dir / study_id / f'{visit_id}_'
|
repo_base_path = store_base_dir / study_id / f'{visit_id}_'
|
||||||
|
|
@ -48,20 +58,27 @@ def main(store_dir: str,
|
||||||
f'{visit_id}_metadata_dicoms.json'
|
f'{visit_id}_metadata_dicoms.json'
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory(prefix='dataladify_visit_') as wdir:
|
with tempfile.TemporaryDirectory(prefix='dataladify_visit_') as wdir:
|
||||||
runshit(
|
deposit_dataset(
|
||||||
# workdir
|
# workdir
|
||||||
wdir,
|
wdir,
|
||||||
# path to deposited dataset metadata
|
# path to deposited dataset metadata
|
||||||
dataset_metadata_path.absolute(),
|
dataset_metadata_path.absolute(),
|
||||||
# path to deposited file metadata
|
# path to deposited file metadata
|
||||||
file_metadata_path.absolute(),
|
file_metadata_path.absolute(),
|
||||||
|
# base URL of the store to complete access URLs
|
||||||
|
store_url,
|
||||||
# path to deposit the repo at
|
# path to deposit the repo at
|
||||||
repo_base_path.absolute(),
|
repo_base_path.absolute(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
|
def deposit_dataset(
|
||||||
|
wdir: Path,
|
||||||
|
metapath_dataset: Path,
|
||||||
|
metapath_files: Path,
|
||||||
|
store_url: str,
|
||||||
|
repobasepath: Path,
|
||||||
|
):
|
||||||
# read tar metadata dict
|
# read tar metadata dict
|
||||||
tar_metadata = read_json_file(metapath_dataset)
|
tar_metadata = read_json_file(metapath_dataset)
|
||||||
expected_keys = ('size', 'md5', 'dspath', 'storepath')
|
expected_keys = ('size', 'md5', 'dspath', 'storepath')
|
||||||
|
|
@ -88,7 +105,7 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
|
||||||
uncurl_uuid = repo.call_annex_records(['info', 'uncurl'])[0]['uuid']
|
uncurl_uuid = repo.call_annex_records(['info', 'uncurl'])[0]['uuid']
|
||||||
assert uncurl_uuid
|
assert uncurl_uuid
|
||||||
# register the URL of the tarball
|
# register the URL of the tarball
|
||||||
tar_metadata['url'] = f"{icfstore_baseurl}/{tar_metadata['storepath']}"
|
tar_metadata['url'] = f"{store_url}/{tar_metadata['storepath']}"
|
||||||
res = ds.addurls(
|
res = ds.addurls(
|
||||||
[tar_metadata],
|
[tar_metadata],
|
||||||
'{url}',
|
'{url}',
|
||||||
|
|
@ -98,9 +115,11 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
|
||||||
# fish out annex key of tarball.
|
# fish out annex key of tarball.
|
||||||
# we could also construct that, but let's not duplicate the setup above
|
# we could also construct that, but let's not duplicate the setup above
|
||||||
tarpath = Path(tar_metadata.get('dspath'))
|
tarpath = Path(tar_metadata.get('dspath'))
|
||||||
tarkey = [r.get('annexkey') for r in res
|
tarkey = [
|
||||||
if r.get('action') == 'fromkey'
|
r.get('annexkey') for r in res
|
||||||
and r.get('path', '').endswith(tarpath.name)]
|
if r.get('action') == 'fromkey'
|
||||||
|
and r.get('path', '').endswith(tarpath.name)
|
||||||
|
]
|
||||||
assert len(tarkey) == 1
|
assert len(tarkey) == 1
|
||||||
tarkey = tarkey[0]
|
tarkey = tarkey[0]
|
||||||
assert tarkey
|
assert tarkey
|
||||||
|
|
@ -123,7 +142,7 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
|
||||||
assert archivist_uuid
|
assert archivist_uuid
|
||||||
|
|
||||||
# load dicom metadata
|
# load dicom metadata
|
||||||
dicoms = read_json_file(metapath_file)
|
dicoms = read_json_file(metapath_files)
|
||||||
# add to dataset
|
# add to dataset
|
||||||
dicom_recs = ds.addurls(
|
dicom_recs = ds.addurls(
|
||||||
dicoms,
|
dicoms,
|
||||||
|
|
@ -146,7 +165,10 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
|
||||||
repo.call_annex(['setpresentkey', dicomkey, archivist_uuid, '1'])
|
repo.call_annex(['setpresentkey', dicomkey, archivist_uuid, '1'])
|
||||||
|
|
||||||
repo.call_git([
|
repo.call_git([
|
||||||
'remote', 'add', 'icfstore',
|
'remote', 'add',
|
||||||
|
# the remote name is arbitrary; it will not end up in the resulting
|
||||||
|
# deposit
|
||||||
|
'store',
|
||||||
# this is a little twisted:
|
# this is a little twisted:
|
||||||
# the first line is an f-string, because we need to get the base URL
|
# the first line is an f-string, because we need to get the base URL
|
||||||
# pointing to the study directory into the remote URL
|
# pointing to the study directory into the remote URL
|
||||||
|
|
@ -163,7 +185,7 @@ def runshit(wdir, metapath_dataset, metapath_file, repobasepath):
|
||||||
# to be able to actually push everything
|
# to be able to actually push everything
|
||||||
repo.call_annex(['whereis', '--key', dicomkeys[0]])
|
repo.call_annex(['whereis', '--key', dicomkeys[0]])
|
||||||
ds.push(
|
ds.push(
|
||||||
to='icfstore',
|
to='store',
|
||||||
# under no circumstances do we want to push annexed content.
|
# under no circumstances do we want to push annexed content.
|
||||||
# and there also should be none
|
# and there also should be none
|
||||||
data='nothing',
|
data='nothing',
|
||||||
|
|
@ -174,31 +196,36 @@ def read_json_file(file_path):
|
||||||
"""
|
"""
|
||||||
Load content from catalog metadata file for current node
|
Load and return the parsed content of the JSON file at `file_path`
|
||||||
"""
|
"""
|
||||||
try:
|
with open(file_path) as f:
|
||||||
with open(file_path) as f:
|
return json.load(f)
|
||||||
return json.load(f)
|
|
||||||
except OSError as err:
|
|
||||||
raise("OS error: {0}".format(err))
|
|
||||||
except:
|
|
||||||
raise("Unexpected error:", sys.exc_info()[0])
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import argparse
|
import argparse
|
||||||
p = argparse.ArgumentParser(description=__doc__)
|
p = argparse.ArgumentParser(
|
||||||
|
description=__doc__,
|
||||||
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
'--id', nargs=2, metavar=('STUDY-ID', 'VISIT-ID'), required=True,
|
||||||
|
help="study and visit identifiers, used to "
|
||||||
|
"locate the visit archive in the storage organization. "
|
||||||
|
)
|
||||||
p.add_argument(
|
p.add_argument(
|
||||||
"-o", "--store-dir", metavar='PATH', default=os.getcwd(),
|
"-o", "--store-dir", metavar='PATH', default=os.getcwd(),
|
||||||
help="Root directory of the ICF data store. "
|
help="root directory of the data store. "
|
||||||
"Visit data will be read from it, and the DataLad dataset will be "
|
"Visit data will be read from it, and the DataLad dataset will be "
|
||||||
"deposited into it."
|
"deposited into it."
|
||||||
)
|
)
|
||||||
p.add_argument(
|
p.add_argument(
|
||||||
'--id', nargs=2, metavar=('STUDY-ID', 'VISIT-ID'), required=True,
|
'--store-url', metavar='URL', default='https://data.inm-icf.de',
|
||||||
help="The study and visit identifiers, used to "
|
help="base URL of the DICOM data store. This URL is used to "
|
||||||
"locate the visit archive in the storage organization. "
|
"register TAR archive download URLs in the generated DataLad "
|
||||||
|
"dataset."
|
||||||
)
|
)
|
||||||
args = p.parse_args()
|
args = p.parse_args()
|
||||||
main(store_dir=args.store_dir,
|
main(store_dir=args.store_dir,
|
||||||
|
store_url=args.store_url,
|
||||||
study_id=args.id[0],
|
study_id=args.id[0],
|
||||||
visit_id=args.id[1],
|
visit_id=args.id[1],
|
||||||
)
|
)
|
||||||
|
|
@ -28,6 +28,7 @@ def run_script(name: str,
|
||||||
working_directory: str | Path,
|
working_directory: str | Path,
|
||||||
study_id: str,
|
study_id: str,
|
||||||
visit_id: str,
|
visit_id: str,
|
||||||
|
*args
|
||||||
):
|
):
|
||||||
|
|
||||||
script_path = Path(*(Path(__file__).parts[:-3] + ('bin',))) / name
|
script_path = Path(*(Path(__file__).parts[:-3] + ('bin',))) / name
|
||||||
|
|
@ -37,14 +38,16 @@ def run_script(name: str,
|
||||||
str(script_path),
|
str(script_path),
|
||||||
'--id',
|
'--id',
|
||||||
study_id,
|
study_id,
|
||||||
visit_id
|
visit_id,
|
||||||
|
*args
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def process_visits(studies_dir: Path,
|
def process_visits(studies_dir: Path,
|
||||||
studies: list[str],
|
studies: list[str],
|
||||||
visits: list[str]
|
visits: list[str],
|
||||||
|
baseurl: str,
|
||||||
):
|
):
|
||||||
for study in studies:
|
for study in studies:
|
||||||
for visit in visits:
|
for visit in visits:
|
||||||
|
|
@ -56,9 +59,10 @@ def process_visits(studies_dir: Path,
|
||||||
)
|
)
|
||||||
# run dataladification script
|
# run dataladification script
|
||||||
run_script(
|
run_script(
|
||||||
'dataladify_studyvisit_from_meta',
|
'deposit_visit_dataset',
|
||||||
studies_dir,
|
studies_dir,
|
||||||
study, visit
|
study, visit,
|
||||||
|
'--store-url', baseurl,
|
||||||
)
|
)
|
||||||
# run catalogification script
|
# run catalogification script
|
||||||
run_script(
|
run_script(
|
||||||
|
|
@ -140,6 +144,7 @@ def test_pipeline(tmp_path: Path,
|
||||||
Path(test_studies_dir),
|
Path(test_studies_dir),
|
||||||
test_study_names,
|
test_study_names,
|
||||||
existing_visits,
|
existing_visits,
|
||||||
|
data_webserver,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 1. Test metadata generation
|
# 1. Test metadata generation
|
||||||
|
|
@ -166,12 +171,9 @@ def test_pipeline(tmp_path: Path,
|
||||||
dataaccess_credential,
|
dataaccess_credential,
|
||||||
credman,
|
credman,
|
||||||
)
|
)
|
||||||
# TODO reenable once the server setup is actually compatible
|
# pull all individual DICOM files; this will internally
|
||||||
# TODO swap the order of gets, or actually drop the tar get
|
# access/download the archive at the store
|
||||||
# completely. Pulling individual files will do all that internally
|
dataset.get(f'{study}_{visit}')
|
||||||
# Try to get the tar file and the DICOMs
|
|
||||||
#dataset.get(f'icf/{visit}_dicom.tar')
|
|
||||||
#dataset.get(f'{study}_{visit}')
|
|
||||||
|
|
||||||
# 3. Test catalog generation
|
# 3. Test catalog generation
|
||||||
# - assert that study catalogs have been created using webcatalog method
|
# - assert that study catalogs have been created using webcatalog method
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue