#!/usr/bin/env python3
"""
This command reads the metadata deposit from `deposit_visit_metadata` for a
visit in a study (given by their respective identifiers) from the data store,
and generates a DataLad dataset from it. This DataLad dataset provides
versioned access to the visit's DICOM data, down to single-image granularity.
Moreover, all DICOM files are annotated with basic DICOM tags that enable
on-demand dataset views for particular applications (e.g., DICOMs sorted
by image series and protocol name). The DataLad dataset is deposited in
two files in the study directory:

- `{visit_id}_XDLRA--refs`
- `{visit_id}_XDLRA--repo-export`

where the former enables `datalad/git clone` operations, and the latter
represents the actual dataset as a compressed archive.
"""

import json
import os
import tempfile
from pathlib import Path

import datalad.api as dl

# which DICOM tags to extract from DICOM files and store as
# git-annex metadata (e.g., to enable metadata-driven views
# of visit datasets)
dicom_metadata_keys = [
    "SeriesDescription",
    "SeriesNumber",
    "Modality",
    "MRAcquisitionType",
    "ProtocolName",
    "PulseSequenceName",
]
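
# In a cloned visit dataset these fields (lowercased by `addurls` below) can
# drive git-annex views. An illustrative command (not executed by this
# script) that arranges DICOMs by series description would be:
#
#   git annex view "seriesdescription=*"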


def main(
    store_dir: str,
    store_url: str,
    study_id: str,
    visit_id: str,
):
    store_base_dir = Path(store_dir)
    # where to deposit the final DataLad dataset
    repo_base_path = store_base_dir / study_id / f'{visit_id}_'
    deposit_conflicts = [str(p) for p in repo_base_path.parent.glob(
        f'{repo_base_path.name}XDLRA*')]
    if deposit_conflicts:
        # be safe
        raise ValueError(
            f'existing dataset deposit {deposit_conflicts}, '
            'refusing to overwrite')
    # locate metadata files
    dataset_metadata_path = store_base_dir / study_id / \
        f'{visit_id}_metadata_tarball.json'
    file_metadata_path = store_base_dir / study_id / \
        f'{visit_id}_metadata_dicoms.json'

    with tempfile.TemporaryDirectory(prefix='dataladify_visit_') as wdir:
        deposit_dataset(
            # workdir
            wdir,
            # path to deposited dataset metadata
            dataset_metadata_path.absolute(),
            # path to deposited file metadata
            file_metadata_path.absolute(),
            # base URL of the store to complete access URLs
            store_url,
            # path to deposit the repo at
            repo_base_path.absolute(),
        )


def deposit_dataset(
    wdir: Path,
    metapath_dataset: Path,
    metapath_files: Path,
    store_url: str,
    repobasepath: Path,
):
    # read tar metadata dict
    tar_metadata = read_json_file(metapath_dataset)
    expected_keys = ('size', 'md5', 'dspath', 'storepath')
    if not all(k in tar_metadata for k in expected_keys):
        raise ValueError(f'incomplete tarball metadata at {metapath_dataset}')

    # create visit dataset
    ds = dl.create(wdir)
    # alias for speed, `.repo` access is really expensive
    repo = ds.repo
    # enable the uncurl remote to have the tarball URL be claimed by it,
    # and to future-proof access (via its reconfiguration possibilities)
    # without having to touch the annex record
    repo.call_annex([
        'initremote',
        'uncurl',
        'type=external',
        'externaltype=uncurl',
        'encryption=none',
        # auto-enabling is cheap (makes no connection attempts), and convenient
        'autoenable=true',
    ])
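    # should the store layout or protocol ever change, access could be
    # redirected without rewriting the annex, e.g. (hypothetical template):
    #
    #   git annex enableremote uncurl url='https://new.example.com/{annex_key}'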
    # we need its UUID later
    uncurl_uuid = repo.call_annex_records(['info', 'uncurl'])[0]['uuid']
    assert uncurl_uuid
    # register the URL of the tarball
    tar_metadata['url'] = f"{store_url}/{tar_metadata['storepath']}"
    res = ds.addurls(
        [tar_metadata],
        '{url}',
        '{dspath}',
        key='et:MD5-s{size}--{md5}',
    )
    # fish out the annex key of the tarball.
    # we could also construct that, but let's not duplicate the setup above
    tarpath = Path(tar_metadata.get('dspath'))
    tarkey = [
        r.get('annexkey') for r in res
        if r.get('action') == 'fromkey'
        and r.get('path', '').endswith(tarpath.name)
    ]
    assert len(tarkey) == 1
    tarkey = tarkey[0]
    assert tarkey
    # assure tar key availability
    repo.call_annex(['setpresentkey', tarkey, uncurl_uuid, '1'])

    # here we register the archivist special remote, to claim
    # the dl+archives URLs registered below
    repo.call_annex([
        'initremote',
        'archivist',
        'type=external',
        'externaltype=archivist',
        'encryption=none',
        # auto-enabling is cheap (makes no connection attempts), and convenient
        'autoenable=true',
    ])
    archivist_uuid = repo.call_annex_records(
        ['info', 'archivist'])[0]['uuid']
    assert archivist_uuid

    # load DICOM metadata
    dicoms = read_json_file(metapath_files)
    # add to dataset
    dicom_recs = ds.addurls(
        dicoms,
        f'dl+archive:{tarkey}#path={{path}}&size={{size}}',
        '{path}',
        key='et:MD5-s{size}--{md5}',
        # field names are limited to alphanumerics (and [_-.]),
        # and are case insensitive
        meta=[
            f'{dmk.lower()}={{{dmk}}}'
            for dmk in dicom_metadata_keys
        ],
    )
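    # the registered URLs take this form (illustrative key and in-archive
    # path; the key before '#' is the annex key of the visit tarball above):
    #
    #   dl+archive:MD5-s1234567--d41d8cd98f00b204e9800998ecf8427e#path=sub-01/dicom/001.dcm&size=2054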

    # assure availability for each DICOM
    dicomkeys = [
        r['annexkey']
        for r in dicom_recs if r.get('action') == 'fromkey'
    ]
    for dicomkey in dicomkeys:
        repo.call_annex(['setpresentkey', dicomkey, archivist_uuid, '1'])

    repo.call_git([
        'remote', 'add',
        # the remote name is arbitrary, it will not end up in the resulting
        # deposit
        'store',
        # this is a little twisted:
        # the first line is an f-string, because we need to get the base URL
        # pointing to the study directory into the remote URL
        f'datalad-annex::?type=external&externaltype=uncurl&url=file://{repobasepath}'
        # this second line is NOT an f-string, and the braces are quoted!
        # this is because datalad-annex:: will pass this URL to uncurl
        # (removing the quoting; it can do placeholders too!), and uncurl
        # will then fill in the annex key of the deposit in order to get
        # the final upload URL
        '{{annex_key}}&encryption=none'
    ])
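    # a consumer could later clone the deposit analogously, with the store's
    # HTTP URL in place of file:// (a sketch; the URL layout is assumed to
    # match the deposit location in the study directory):
    #
    #   datalad clone 'datalad-annex::?type=external&externaltype=uncurl&url=https://data.inm-icf.de/<study>/<visit>_{{annex_key}}&encryption=none'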

    # probe the availability metadata. This seems to be necessary at times
    # to get git-annex to commit the metadata operations performed above,
    # such that everything can actually be pushed
    repo.call_annex(['whereis', '--key', dicomkeys[0]])
    ds.push(
        to='store',
        # under no circumstances do we want to push annexed content,
        # and there also should be none
        data='nothing',
    )


def read_json_file(file_path):
    """
    Load and return the content of a JSON metadata file
    """
    with open(file_path) as f:
        return json.load(f)


if __name__ == '__main__':
    import argparse
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument(
        '--id', nargs=2, metavar=('STUDY-ID', 'VISIT-ID'), required=True,
        help="study and visit identifiers, used to "
             "locate the visit archive in the storage organization."
    )
    p.add_argument(
        "-o", "--store-dir", metavar='PATH', default=os.getcwd(),
        help="root directory of the data store. "
             "Visit data will be read from it, and the DataLad dataset will "
             "be deposited into it."
    )
    p.add_argument(
        '--store-url', metavar='URL', default='https://data.inm-icf.de',
        help="base URL of the DICOM data store. This URL is used to "
             "register TAR archive download URLs in the generated DataLad "
             "dataset."
    )
    args = p.parse_args()
    main(store_dir=args.store_dir,
         store_url=args.store_url,
         study_id=args.id[0],
         visit_id=args.id[1],
         )