#!/usr/bin/env python3
"""
This command reads the metadata deposited by `deposit_visit_metadata` for a
visit in a study (given by their respective identifiers) from the data store,
and generates a DataLad dataset from it. This DataLad dataset provides
versioned access to the visit's DICOM data, up to single-image granularity.
Moreover, all DICOM files are annotated with basic DICOM tags that enable
on-demand dataset views for particular applications (e.g., DICOMs sorted
by image series and protocol name). The DataLad dataset is deposited in
two files in the study directory:
- `{visit_id}_XDLRA--refs`
- `{visit_id}_XDLRA--repo-export`
where the former enables `datalad/git clone` operations, and the latter
represents the actual dataset as a compressed archive.
"""
import json
import os
from pathlib import Path
import tempfile

import datalad.api as dl

# which DICOM tags to extract from DICOM files and store as
# git-annex metadata (e.g., to enable metadata-driven views
# of visit datasets)
dicom_metadata_keys = [
    "SeriesDescription",
    "SeriesNumber",
    "Modality",
    "MRAcquisitionType",
    "ProtocolName",
    "PulseSequenceName",
]
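
# For illustration: given these tags, the addurls() call below attaches
# git-annex metadata with lowercased field names to each DICOM file, e.g.
# (hypothetical values)
#   seriesdescription=anat-T1w
#   modality=MR
# which metadata-driven dataset views can then query.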


def main(
    store_dir: str,
    store_url: str,
    study_id: str,
    visit_id: str,
):
    store_base_dir = Path(store_dir)
    # where to deposit the final datalad dataset
    repo_base_path = store_base_dir / study_id / f'{visit_id}_'
    deposit_conflicts = [str(p) for p in repo_base_path.parent.glob(
        f'{repo_base_path.name}XDLRA*')]
    if deposit_conflicts:
        # be safe
        raise ValueError(
            f'existing dataset deposit {deposit_conflicts}, '
            'refusing to overwrite')
    # locate metadata files
    dataset_metadata_path = store_base_dir / study_id / \
        f'{visit_id}_metadata_tarball.json'
    file_metadata_path = store_base_dir / study_id / \
        f'{visit_id}_metadata_dicoms.json'
    with tempfile.TemporaryDirectory(prefix='dataladify_visit_') as wdir:
        deposit_dataset(
            # workdir
            wdir,
            # path to deposited dataset metadata
            dataset_metadata_path.absolute(),
            # path to deposited file metadata
            file_metadata_path.absolute(),
            # base URL of the store to complete access URLs
            store_url,
            # path to deposit the repo at
            repo_base_path.absolute(),
        )


def deposit_dataset(
    wdir: Path,
    metapath_dataset: Path,
    metapath_files: Path,
    store_url: str,
    repobasepath: Path,
):
    # read tar metadata dict
    tar_metadata = read_json_file(metapath_dataset)
    expected_keys = ('size', 'md5', 'dspath', 'storepath')
    if not all(k in tar_metadata for k in expected_keys):
        raise ValueError(f'incomplete tarball metadata at {metapath_dataset}')
    # create visit dataset
    ds = dl.create(wdir)
    # alias for speed, `.repo` is really expensive
    repo = ds.repo
    # enable the uncurl special remote to have the tarball URL be claimed
    # by it, and to future-proof access (via its reconfiguration
    # capabilities) without having to touch the annex records
    repo.call_annex([
        'initremote',
        'uncurl',
        'type=external',
        'externaltype=uncurl',
        'encryption=none',
        # auto-enabling is cheap (makes no connection attempts), and convenient
        'autoenable=true',
    ])
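    # note: initremote performs no connection attempt either; presence of
    # keys at this remote is declared explicitly via setpresentkey below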
    # we need its UUID later
    uncurl_uuid = repo.call_annex_records(['info', 'uncurl'])[0]['uuid']
    assert uncurl_uuid
    # register the URL of the tarball
    tar_metadata['url'] = f"{store_url}/{tar_metadata['storepath']}"
    res = ds.addurls(
        [tar_metadata],
        '{url}',
        '{dspath}',
        key='et:MD5-s{size}--{md5}',
    )
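    # for illustration, the key template above expands to an annex key like
    #   MD5-s12345--0123456789abcdef0123456789abcdef
    # (hypothetical size and checksum); this registers the tarball from its
    # known key alone, without downloading any content (see the 'fromkey'
    # action in the results below)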
    # fish out the annex key of the tarball.
    # we could also construct it, but let's not duplicate the setup above
    tarpath = Path(tar_metadata.get('dspath'))
    tarkey = [
        r.get('annexkey') for r in res
        if r.get('action') == 'fromkey'
        and r.get('path', '').endswith(tarpath.name)
    ]
    assert len(tarkey) == 1
    tarkey = tarkey[0]
    assert tarkey
    # declare the tarball key to be available from the uncurl remote
    repo.call_annex(['setpresentkey', tarkey, uncurl_uuid, '1'])
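    # note: setpresentkey records the key as present at the given remote
    # UUID without any verification; actual retrieval will go through the
    # uncurl remote and the URL registered above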
    # here we register the archivist special remote, to claim
    # the dl+archive: URLs registered below
    repo.call_annex([
        'initremote',
        'archivist',
        'type=external',
        'externaltype=archivist',
        'encryption=none',
        # auto-enabling is cheap (makes no connection attempts), and convenient
        'autoenable=true',
    ])
    archivist_uuid = repo.call_annex_records(
        ['info', 'archivist'])[0]['uuid']
    assert archivist_uuid
    # load dicom metadata
    dicoms = read_json_file(metapath_files)
    # add to dataset
    dicom_recs = ds.addurls(
        dicoms,
        f'dl+archive:{tarkey}#path={{path}}&size={{size}}',
        '{path}',
        key='et:MD5-s{size}--{md5}',
        # field names are limited to alphanumerics (and [_-.]),
        # and are case insensitive
        meta=[
            f'{dmk.lower()}={{{dmk}}}'
            for dmk in dicom_metadata_keys
        ],
    )
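    # for illustration, the URL template above expands per DICOM file to
    # something like (hypothetical key, path, and size)
    #   dl+archive:MD5-s98765--fedcba9876543210fedcba9876543210#path=dicoms/0001.dcm&size=1024
    # i.e. each file is addressed as a member of the tarball's annex key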
    # assure availability for each DICOM
    dicomkeys = [
        r['annexkey']
        for r in dicom_recs if r.get('action') == 'fromkey'
    ]
    for dicomkey in dicomkeys:
        repo.call_annex(['setpresentkey', dicomkey, archivist_uuid, '1'])
    repo.call_git([
        'remote', 'add',
        # the remote name is arbitrary, it will not end up in the resulting
        # deposit
        'store',
        # this is a little twisted:
        # the first line is an f-string, because we need to get the base URL
        # pointing to the study directory into the remote URL
        f'datalad-annex::?type=external&externaltype=uncurl&url=file://{repobasepath}'
        # this second line is NOT an f-string, and braces are quoted!!
        # this is because datalad-annex:: will pass this URL to uncurl
        # (removing the quoting; it can do placeholders too!), and uncurl
        # will then fill in the annex key of the deposit in order to get
        # the final upload URL
        '{{annex_key}}&encryption=none'
    ])
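    # for illustration, for a store rooted at /data/store this assembles to
    #   datalad-annex::?type=external&externaltype=uncurl&url=file:///data/store/<study>/<visit>_{{annex_key}}&encryption=none
    # and uncurl substitutes {annex_key} to address the two deposit files
    # (<visit>_XDLRA--refs, <visit>_XDLRA--repo-export)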
    # probe the availability metadata. this appears to be necessary at
    # times to make git-annex commit the metadata operations performed
    # above, so that everything can actually be pushed
    repo.call_annex(['whereis', '--key', dicomkeys[0]])
    ds.push(
        to='store',
        # under no circumstances do we want to push annexed content,
        # and there also should be none
        data='nothing',
    )
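

# For reference: a deposit created this way can be cloned again with a
# datalad-annex:: URL analogous to the push URL assembled above (a sketch,
# assuming the datalad-next extension providing datalad-annex::, uncurl,
# and archivist is installed):
#
#   datalad clone \
#     'datalad-annex::?type=external&externaltype=uncurl&url=<store-url>/<study-id>/<visit-id>_{{annex_key}}&encryption=none'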


def read_json_file(file_path):
    """
    Load and return the content of a JSON file
    """
    with open(file_path) as f:
        return json.load(f)


if __name__ == '__main__':
    import argparse
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument(
        '--id', nargs=2, metavar=('STUDY-ID', 'VISIT-ID'), required=True,
        help="study and visit identifiers, used to "
             "locate the visit archive in the storage organization."
    )
    p.add_argument(
        "-o", "--store-dir", metavar='PATH', default=os.getcwd(),
        help="root directory of the data store. "
             "Visit data will be read from it, and the DataLad dataset "
             "will be deposited into it."
    )
    p.add_argument(
        '--store-url', metavar='URL', default='https://data.inm-icf.de',
        help="base URL of the DICOM data store. This URL is used to "
             "register TAR archive download URLs in the generated DataLad "
             "dataset."
    )
    args = p.parse_args()
    main(store_dir=args.store_dir,
         store_url=args.store_url,
         study_id=args.id[0],
         visit_id=args.id[1],
         )