inm-icf-utilities/bin/catalogify_studyvisit_from_meta

#!/usr/bin/env python3
"""Generate or update a study's DataLad catalog with an entry for a single
visit, based on previously extracted tarball and DICOM metadata.
"""
import json
import math
import os
from pathlib import Path
from uuid import uuid4

from datalad.api import catalog_add
from datalad_catalog.webcatalog import WebCatalog
from datalad_catalog.schema_utils import get_metadata_item

# this points to the top of the ICF data store.
# internally it will be amended with the missing components
# for study and visit deposit locations
icfstore_baseurl = 'https://data.inm-icf.de'
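# (for example, a deposit for a hypothetical visit 'visit01' of study
# 'study01' would be addressed as https://data.inm-icf.de/study01/visit01...;
# see generate_visit_entry() below)
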
# which DICOM tags to extract from DICOM files and store as
# git-annex metadata (e.g., to enable metadata-driven views
# of visit datasets)
dicom_metadata_keys = [
    "SeriesDescription",
    "SeriesNumber",
    "Modality",
    "MRAcquisitionType",
    "ProtocolName",
    "PulseSequenceName",
]
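# (with such metadata in place, a metadata-driven query inside a visit
# dataset could be, e.g., 'git annex view "ProtocolName=*"'; this is an
# illustration, not something this script runs)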


def main(store_dir: str,
         study_id: str,
         visit_id: str):
    store_base_dir = Path(store_dir)
    # where to add the catalog entry
    study_catalog_path = store_base_dir / study_id / 'catalog'
    # locate metadata files
    dataset_metadata_path = store_base_dir / study_id / \
        f'{visit_id}_metadata_tarball.json'
    file_metadata_path = store_base_dir / study_id / \
        f'{visit_id}_metadata_dicoms.json'
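    # (both files are expected to exist already, presumably produced by an
    # earlier metadata-extraction step)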
    # Grab or create the catalog (WebCatalog class)
    # (creating includes generating and adding a study entry for the catalog)
    ctlg = get_catalog(study_id, study_catalog_path)
    # Generate and add a visit entry for the catalog
    visit_entry = generate_visit_entry(
        study_id,
        visit_id,
        dataset_metadata_path,
        file_metadata_path)
    catalog_add(
        catalog=study_catalog_path,
        metadata=visit_entry,
    )
    # Add visit entry as subdataset to study entry
    super_dict = ctlg.get_main_dataset()
    subdatasets = [
        {
            'dataset_id': visit_entry['dataset_id'],
            'dataset_version': visit_entry['dataset_version'],
            'dataset_path': visit_id,
        }
    ]
    update_entry(
        super_dict['dataset_id'],
        super_dict['dataset_version'],
        study_id,
        'subdatasets',
        subdatasets,
        study_catalog_path)


def get_catalog(study_id, catalog_path):
    """Get the study's WebCatalog, creating it first if it does not exist"""
    package_path = Path(__file__).resolve().parent.parent
    # Instantiate WebCatalog object
    ctlg = WebCatalog(location=str(catalog_path))
    # If catalog does not exist:
    if not ctlg.is_created():
        # 1. create it
        ctlg.create(
            config_file=str(package_path / 'assets' / 'catalog_config.json'),
        )
        # 2. generate and add the study-level catalog entry
        study_entry = generate_study_entry(study_id)
        catalog_add(
            catalog=catalog_path,
            metadata=study_entry,
        )
        # 3. set catalog home page
        ctlg.set_main_dataset(
            dataset_id=study_entry.get('dataset_id'),
            dataset_version=study_entry.get('dataset_version'),
        )
    return ctlg


def generate_study_entry(study_id):
    """Generate a catalog entry for the study itself (the catalog homepage)"""
    desc = f"""This data catalog presents the DICOM data collected
    for all visits of the study: {study_id}. Browse through details
    of all study visits in the 'Subdatasets' tab below."""
    meta_item = get_metadata_item(
        item_type='dataset',
        dataset_id=str(uuid4()),
        dataset_version='latest',
        source_name='automated_addition',
        source_version='0.1.0',
    )
    meta_item['name'] = study_id
    meta_item['description'] = desc
    return meta_item
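
# for reference, the study entry generated above is roughly of this form
# (the exact field set depends on the datalad-catalog schema version):
# {
#     "type": "dataset",
#     "dataset_id": "<uuid4>",
#     "dataset_version": "latest",
#     "name": "<study-id>",
#     "description": "This data catalog presents ...",
#     ...
# }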


def update_entry(ds_id, ds_version, ds_name, key, value, study_catalog_path):
    """Add or update a single key of an already cataloged entry

    Works by submitting a minimal metadata item for the same dataset
    id/version, which the catalog merges into the existing entry.
    """
    meta_item = get_metadata_item(
        item_type='dataset',
        dataset_id=ds_id,
        dataset_version=ds_version,
        source_name='automated_addition',
        source_version='0.1.0',
    )
    meta_item['name'] = ds_name
    meta_item.update({key: value})
    catalog_add(
        catalog=study_catalog_path,
        metadata=meta_item,
    )
    return meta_item


def generate_visit_entry(study_id, visit_id, metapath_dataset, metapath_file):
    """Generate a catalog entry for a single visit of the study"""
    # Create base visit entry
    desc = f"""This page presents the DICOM data collected for the visit
    {visit_id} during the imaging study {study_id}. Browse through details
    of this particular study visit in the 'DICOM' tab below."""
    meta_item = get_metadata_item(
        item_type='dataset',
        dataset_id=str(uuid4()),
        dataset_version='latest',
        source_name='automated_addition',
        source_version='0.1.0',
    )
    meta_item['name'] = visit_id
    meta_item['description'] = desc
    # Load tarball metadata
    tar_metadata = read_json_file(metapath_dataset)
    expected_keys = ('size', 'md5', 'dspath', 'storepath')
    if not all(k in tar_metadata for k in expected_keys):
        raise ValueError(f'incomplete tarball metadata at {metapath_dataset}')
    # add dataset url
    # ('{{annex_key}}' stays a literal placeholder for DataLad's
    # datalad-annex machinery to fill in; it is not an f-string field here)
    access_url_pre = 'datalad-annex::?type=external&externaltype=uncurl&url='
    access_url_post = '_{{annex_key}}&encryption=none'
    access_url = f'{access_url_pre}{icfstore_baseurl}/{study_id}/{visit_id}{access_url_post}'
    meta_item.update(dict(url=access_url))
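    # the resulting URL is directly cloneable, e.g. (with hypothetical
    # identifiers, and datalad-next installed):
    #   datalad clone 'datalad-annex::?type=external&externaltype=uncurl&url=https://data.inm-icf.de/study01/visit01_{{annex_key}}&encryption=none'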
    # Load dicom metadata and derive some summary measures
    dicoms = read_json_file(metapath_file)
    nr_files = len(dicoms)
    unique_tag_vals = {}
    for k in dicom_metadata_keys:
        # unique, non-empty values of each tag across all DICOM files
        # (d.get() tolerates files that lack a tag entirely)
        unique_tag_vals[k] = list(filter(None, set(d.get(k) for d in dicoms)))
    additional_keyvals = {
        "keywords": unique_tag_vals['ProtocolName'] + unique_tag_vals['Modality'],
        "additional_display": [
            {
                "name": "DICOM",
                "icon": "far fa-file-image",
                "content": unique_tag_vals,
            }
        ],
        "top_display": [
            {
                "name": "TAR file size",
                "value": format_bytes(tar_metadata['size']),
            },
            {
                "name": "Nr of DICOMs",
                "value": nr_files,
            },
        ],
    }
    # add summary measures to visit entry
    meta_item.update(additional_keyvals)
    return meta_item


def read_json_file(file_path):
    """
    Load content from catalog metadata file for current node
    """
    try:
        with open(file_path) as f:
            return json.load(f)
    except OSError as err:
        raise RuntimeError(f"OS error while reading {file_path}: {err}") from err
    except json.JSONDecodeError as err:
        raise RuntimeError(f"Invalid JSON in {file_path}: {err}") from err


def format_bytes(size, decimals=2):
    """Render a byte count as a human-readable string"""
    if size == 0:
        return "0 Bytes"
    k = 1024
    dm = 0 if decimals < 0 else decimals
    sizes = ["Bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"]
    # largest power of 1024 that fits (clamped to the last known unit)
    i = min(int(math.floor(math.log(size) / math.log(k))), len(sizes) - 1)
    return f"{round(size / math.pow(k, i), dm)} {sizes[i]}"


if __name__ == '__main__':
    import argparse
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument(
        "-o", "--store-dir", metavar='PATH', default=os.getcwd(),
        help="Root directory of the ICF data store. "
        "Visit metadata will be read from it, and the catalog will be "
        "deposited into it."
    )
    p.add_argument(
        '--id', nargs=2, metavar=('STUDY-ID', 'VISIT-ID'), required=True,
        help="The study and visit identifiers, used to "
        "locate the visit's metadata in the storage organization."
    )
    args = p.parse_args()
    main(store_dir=args.store_dir,
         study_id=args.id[0],
         visit_id=args.id[1],
         )
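
# example invocation (hypothetical store path and identifiers):
#   catalogify_studyvisit_from_meta -o /data/icf-store --id study01 visit01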