#!/usr/bin/env python3
"""
This command reads the metadata deposited by `deposit_visit_metadata` for a
visit in a study (given by their respective identifiers) from the data store,
and generates a DataLad dataset from it. This DataLad dataset provides
versioned access to the visit's DICOM data, up to single-image granularity.
Moreover, all DICOM files are annotated with basic DICOM tags that enable
on-demand dataset views for particular applications (e.g., DICOMs sorted
by image series and protocol name). The DataLad dataset is deposited in
two files in the study directory:
- `{visit_id}_XDLRA--refs`
- `{visit_id}_XDLRA--repo-export`
where the former enables `datalad/git clone` operations, and the latter
represents the actual dataset as a compressed archive.
"""
import json
import os
from pathlib import Path
import tempfile

import datalad.api as dl

# which DICOM tags to extract from DICOM files and store as
# git-annex metadata (e.g., to enable metadata-driven views
# of visit datasets)
dicom_metadata_keys = [
    "SeriesDescription",
    "SeriesNumber",
    "Modality",
    "MRAcquisitionType",
    "ProtocolName",
    "PulseSequenceName",
]
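
# For illustration: given these tags, the addurls() call below attaches
# git-annex metadata with lowercased field names to each DICOM file, e.g.
# (hypothetical values)
#   seriesdescription=anat-T1w
#   modality=MR
# which metadata-driven dataset views can then query.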


def main(
    store_dir: str,
    store_url: str,
    study_id: str,
    visit_id: str,
):
    store_base_dir = Path(store_dir)
    # where to deposit the final datalad dataset
    repo_base_path = store_base_dir / study_id / f'{visit_id}_'
    deposit_conflicts = [str(p) for p in repo_base_path.parent.glob(
        f'{repo_base_path.name}XDLRA*')]
    if deposit_conflicts:
        # be safe
        raise ValueError(
            f'existing dataset deposit {deposit_conflicts}, '
            'refusing to overwrite')
    # locate metadata files
    dataset_metadata_path = store_base_dir / study_id / \
        f'{visit_id}_metadata_tarball.json'
    file_metadata_path = store_base_dir / study_id / \
        f'{visit_id}_metadata_dicoms.json'
    with tempfile.TemporaryDirectory(prefix='dataladify_visit_') as wdir:
        deposit_dataset(
            # workdir
            wdir,
            # path to deposited dataset metadata
            dataset_metadata_path.absolute(),
            # path to deposited file metadata
            file_metadata_path.absolute(),
            # base URL of the store to complete access URLs
            store_url,
            # path to deposit the repo at
            repo_base_path.absolute(),
        )


def deposit_dataset(
    wdir: Path,
    metapath_dataset: Path,
    metapath_files: Path,
    store_url: str,
    repobasepath: Path,
):
    # read tar metadata dict
    tar_metadata = read_json_file(metapath_dataset)
    expected_keys = ('size', 'md5', 'dspath', 'storepath')
    if not all(k in tar_metadata for k in expected_keys):
        raise ValueError(f'incomplete tarball metadata at {metapath_dataset}')
    # create visit dataset
    ds = dl.create(wdir)
    # alias for speed, `.repo` is really expensive
    repo = ds.repo
    # enable the uncurl special remote to have the tarball URL be claimed
    # by it, and to future-proof access (via its reconfiguration
    # capabilities) without having to touch the annex records
    repo.call_annex([
        'initremote',
        'uncurl',
        'type=external',
        'externaltype=uncurl',
        'encryption=none',
        # auto-enabling is cheap (makes no connection attempts), and convenient
        'autoenable=true',
    ])
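    # note: initremote performs no connection attempt either; presence of
    # keys at this remote is declared explicitly via setpresentkey below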
    # we need its UUID later
    uncurl_uuid = repo.call_annex_records(['info', 'uncurl'])[0]['uuid']
    assert uncurl_uuid
    # register the URL of the tarball
    tar_metadata['url'] = f"{store_url}/{tar_metadata['storepath']}"
    res = ds.addurls(
        [tar_metadata],
        '{url}',
        '{dspath}',
        key='et:MD5-s{size}--{md5}',
    )
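    # for illustration, the key template above expands to an annex key like
    #   MD5-s12345--0123456789abcdef0123456789abcdef
    # (hypothetical size and checksum); this registers the tarball from its
    # known key alone, without downloading any content (see the 'fromkey'
    # action in the results below)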
    # fish out the annex key of the tarball.
    # we could also construct it, but let's not duplicate the setup above
    tarpath = Path(tar_metadata.get('dspath'))
    tarkey = [
        r.get('annexkey') for r in res
        if r.get('action') == 'fromkey'
        and r.get('path', '').endswith(tarpath.name)
    ]
    assert len(tarkey) == 1
    tarkey = tarkey[0]
    assert tarkey
    # declare the tarball key to be available from the uncurl remote
    repo.call_annex(['setpresentkey', tarkey, uncurl_uuid, '1'])
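    # note: setpresentkey records the key as present at the given remote
    # UUID without any verification; actual retrieval will go through the
    # uncurl remote and the URL registered above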
    # here we register the archivist special remote, to claim
    # the dl+archive: URLs registered below
    repo.call_annex([
        'initremote',
        'archivist',
        'type=external',
        'externaltype=archivist',
        'encryption=none',
        # auto-enabling is cheap (makes no connection attempts), and convenient
        'autoenable=true',
    ])
    archivist_uuid = repo.call_annex_records(
        ['info', 'archivist'])[0]['uuid']
    assert archivist_uuid
    # load dicom metadata
    dicoms = read_json_file(metapath_files)
    # add to dataset
    dicom_recs = ds.addurls(
        dicoms,
        f'dl+archive:{tarkey}#path={{path}}&size={{size}}',
        '{path}',
        key='et:MD5-s{size}--{md5}',
        # field names are limited to alphanumerics (and [_-.]),
        # and are case insensitive
        meta=[
            f'{dmk.lower()}={{{dmk}}}'
            for dmk in dicom_metadata_keys
        ],
    )
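    # for illustration, the URL template above expands per DICOM file to
    # something like (hypothetical key, path, and size)
    #   dl+archive:MD5-s98765--fedcba9876543210fedcba9876543210#path=dicoms/0001.dcm&size=1024
    # i.e. each file is addressed as a member of the tarball's annex key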
    # assure availability for each DICOM
    dicomkeys = [
        r['annexkey']
        for r in dicom_recs if r.get('action') == 'fromkey'
    ]
    for dicomkey in dicomkeys:
        repo.call_annex(['setpresentkey', dicomkey, archivist_uuid, '1'])
    repo.call_git([
        'remote', 'add',
        # the remote name is arbitrary, it will not end up in the resulting
        # deposit
        'store',
        # this is a little twisted:
        # the first line is an f-string, because we need to get the base URL
        # pointing to the study directory into the remote URL
        f'datalad-annex::?type=external&externaltype=uncurl&url=file://{repobasepath}'
        # this second line is NOT an f-string, and braces are quoted!!
        # this is because datalad-annex:: will pass this URL to uncurl
        # (removing the quoting; it can do placeholders too!), and uncurl
        # will then fill in the annex key of the deposit in order to get
        # the final upload URL
        '{{annex_key}}&encryption=none'
    ])
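    # for illustration, for a store rooted at /data/store this assembles to
    #   datalad-annex::?type=external&externaltype=uncurl&url=file:///data/store/<study>/<visit>_{{annex_key}}&encryption=none
    # and uncurl substitutes {annex_key} to address the two deposit files
    # (<visit>_XDLRA--refs, <visit>_XDLRA--repo-export)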
    # probe the availability metadata. this appears to be necessary at
    # times to make git-annex commit the metadata operations performed
    # above, so that everything can actually be pushed
    repo.call_annex(['whereis', '--key', dicomkeys[0]])
    ds.push(
        to='store',
        # under no circumstances do we want to push annexed content,
        # and there also should be none
        data='nothing',
    )
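

# For reference: a deposit created this way can be cloned again with a
# datalad-annex:: URL analogous to the push URL assembled above (a sketch,
# assuming the datalad-next extension providing datalad-annex::, uncurl,
# and archivist is installed):
#
#   datalad clone \
#     'datalad-annex::?type=external&externaltype=uncurl&url=<store-url>/<study-id>/<visit-id>_{{annex_key}}&encryption=none'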


def read_json_file(file_path):
    """
    Load and return the content of a JSON file
    """
    with open(file_path) as f:
        return json.load(f)


if __name__ == '__main__':
    import argparse
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument(
        '--id', nargs=2, metavar=('STUDY-ID', 'VISIT-ID'), required=True,
        help="study and visit identifiers, used to "
             "locate the visit archive in the storage organization."
    )
    p.add_argument(
        "-o", "--store-dir", metavar='PATH', default=os.getcwd(),
        help="root directory of the data store. "
             "Visit data will be read from it, and the DataLad dataset "
             "will be deposited into it."
    )
    p.add_argument(
        '--store-url', metavar='URL', default='https://data.inm-icf.de',
        help="base URL of the DICOM data store. This URL is used to "
             "register TAR archive download URLs in the generated DataLad "
             "dataset."
    )
    args = p.parse_args()
    main(store_dir=args.store_dir,
         store_url=args.store_url,
         study_id=args.id[0],
         visit_id=args.id[1],
         )