inm-icf-utilities/bin/catalogify_studyvisit_from_meta

#!/usr/bin/env python3
"""

"""
from datetime import datetime
import json
import math
import os
from pathlib import Path
import subprocess
import sys
import tempfile
from uuid import uuid4

from datalad_catalog.catalog import Catalog
from datalad_catalog.webcatalog import WebCatalog

# Base URL pointing to the top of the ICF data store.
# generate_visit_entry() appends the study and visit components
# to it when building the access URL of a visit dataset.
icfstore_baseurl = 'https://data.inm-icf.de'

# DICOM tags of interest. In this script, the unique values of each tag
# across all DICOM records of a visit are collected and displayed in the
# visit's catalog entry (see generate_visit_entry()).
# NOTE(review): the original remark about storing these tags as git-annex
# metadata appears to describe a sibling script -- confirm.
dicom_metadata_keys = [
    "SeriesDescription",
    "SeriesNumber",
    "Modality",
    "MRAcquisitionType",
    "ProtocolName",
    "PulseSequenceName",
]

# Module-level instance of the datalad-catalog command interface;
# add_to_catalog() calls it as catalog_api("add", ...)
catalog_api = Catalog()


def main(store_dir: str,
         study_id: str,
         visit_id: str):
    """Add the catalog entry of one visit to the catalog of its study.

    Parameters
    ----------
    store_dir: str
      Root directory of the data store; the study directory is
      ``<store_dir>/<study_id>``.
    study_id: str
      Identifier of the study (name of the study directory).
    visit_id: str
      Identifier of the visit (prefix of the visit's metadata files).
    """
    study_dir = Path(store_dir) / study_id
    # the catalog lives inside the study directory
    study_catalog_path = study_dir / 'catalog'
    # both metadata files sit next to each other in the study directory
    dataset_metadata_path = study_dir / f'{visit_id}_metadata_tarball.json'
    file_metadata_path = study_dir / f'{visit_id}_metadata_dicoms.json'

    # Obtain the catalog; if it had to be created, it already contains
    # the study-level entry afterwards
    ctlg = get_catalog(study_id, study_catalog_path)

    # Build the visit entry and put it into the catalog
    visit_entry = generate_visit_entry(
        study_id, visit_id, dataset_metadata_path, file_metadata_path)
    add_to_catalog(visit_entry, str(study_catalog_path))

    # Link the visit to the study entry: the study entry is identified via
    # the catalog's 'super.json' and re-submitted with a 'subdatasets' field
    super_dict = read_json_file(ctlg.location / 'metadata' / 'super.json')
    visit_link = {
        field: visit_entry[field]
        for field in ('dataset_id', 'dataset_version')
    }
    visit_link['dataset_path'] = visit_id
    update_entry(
        ds_id=super_dict['dataset_id'],
        ds_version=super_dict['dataset_version'],
        ds_name=study_id,
        key='subdatasets',
        value=[visit_link],
        study_catalog_path=study_catalog_path,
    )


def get_catalog(study_id, catalog_path):
    """Return the WebCatalog at `catalog_path`, creating it if necessary.

    When the catalog does not exist yet, it is created, a study-level
    entry for `study_id` is generated and added, and that entry is set as
    the catalog's main dataset (home page).
    """
    # the catalog configuration ships two directory levels above this script
    config_path = (
        Path(__file__).resolve().parent.parent
        / 'assets'
        / 'catalog_config.json'
    )
    ctlg = WebCatalog(
        location=str(catalog_path),
        config_file=str(config_path),
        catalog_action='create',
    )
    if ctlg.is_created():
        # nothing to set up, reuse the existing catalog
        return ctlg

    # fresh catalog: create it on disk ...
    ctlg.create()
    # ... populate it with the study-level entry ...
    study_entry = generate_study_entry(study_id)
    add_to_catalog(study_entry, str(catalog_path))
    # ... and make that entry the catalog home page
    ctlg.main_id = study_entry.get('dataset_id')
    ctlg.main_version = study_entry.get('dataset_version')
    ctlg.set_main_dataset()
    return ctlg


def generate_study_entry(study_id):
    """Build the study-level dataset entry for the catalog.

    The entry gets a freshly generated UUID as dataset ID, the fixed
    version 'latest', `study_id` as its name, and a generic description.
    """
    study_description = f"""This data catalog presents the DICOM data collected
    for all visits of the study: {study_id}. Browse through details
    of all study visits in the 'Subdatasets' tab below."""
    entry_id = str(uuid4())
    return new_dataset_meta_item(
        entry_id,
        'latest',
        ds_name=study_id,
        ds_description=study_description,
    )


def update_entry(ds_id, ds_version, ds_name, key, value, study_catalog_path):
    """Submit a single additional field for a dataset entry to the catalog.

    A dataset record identified by `ds_id`/`ds_version` (plus name and
    metadata source information) is built, extended with ``{key: value}``,
    and passed to add_to_catalog() for the catalog at `study_catalog_path`.

    Returns the record that was submitted.
    """
    meta_item = dict(
        type='dataset',
        dataset_id=ds_id,
        dataset_version=ds_version,
        name=ds_name,
        metadata_sources=get_metadata_source(),
    )
    meta_item[key] = value
    add_to_catalog(meta_item, str(study_catalog_path))
    return meta_item


def generate_visit_entry(study_id, visit_id, metapath_dataset, metapath_file):
    """Build the catalog entry for a single study visit.

    Parameters
    ----------
    study_id: str
      Identifier of the study the visit belongs to.
    visit_id: str
      Identifier of the visit; used as the entry name.
    metapath_dataset: path-like
      JSON file with the tarball metadata. It must contain the keys
      'size', 'md5', 'dspath' and 'storepath'.
    metapath_file: path-like
      JSON file with a list of per-DICOM-file metadata records. Each
      record is expected to provide all keys in `dicom_metadata_keys`.

    Returns
    -------
    dict
      Dataset entry with access URL, keywords, a 'DICOM' tab listing the
      unique values per DICOM tag, and summary values (tarball size,
      number of DICOM files).

    Raises
    ------
    ValueError
      If the tarball metadata lacks any of the expected keys.
    """
    # Create base visit entry
    desc=f"""This page presents the DICOM data collected for the visit
    {visit_id} during the imaging study {study_id}. Browse through details
    of this particular study visit in the 'DICOM' tab below."""
    meta_item = new_dataset_meta_item(
        ds_id=str(uuid4()),
        ds_version='latest',
        ds_name=visit_id,
        ds_description=desc)
    # Load tarball metadata
    tar_metadata = read_json_file(metapath_dataset)
    expected_keys = ('size', 'md5', 'dspath', 'storepath')
    if not all(k in tar_metadata for k in expected_keys):
        raise ValueError(f'incomplete tarball metadata at {metapath_dataset}')
    # add dataset url
    # (the post-fix is a plain string on purpose: the doubled braces around
    # 'annex_key' must end up verbatim in the URL)
    access_url_pre = 'datalad-annex::?type=external&externaltype=uncurl&url='
    access_url_post = '_{{annex_key}}&encryption=none'
    access_url = f'{access_url_pre}{icfstore_baseurl}/{study_id}/{visit_id}{access_url_post}'
    meta_item.update(dict(url=access_url))
    # Load dicom metadata and derive some summary measures
    dicoms = read_json_file(metapath_file)
    nr_files = len(dicoms)
    # Unique, non-empty values per tag. dict.fromkeys() de-duplicates while
    # keeping first-seen order, so the generated entry is identical across
    # runs (iterating a set of strings is not, due to hash randomization).
    unique_tag_vals = {
        k: [v for v in dict.fromkeys(d[k] for d in dicoms) if v]
        for k in dicom_metadata_keys
    }
    additional_keyvals = {
        "keywords": unique_tag_vals['ProtocolName'] + unique_tag_vals['Modality'],
        "additional_display": [
            {
                "name": "DICOM",
                "icon": "far fa-file-image",
                "content": unique_tag_vals
            }
        ],
        "top_display": [
            {
                "name": "TAR file size",
                "value": format_bytes(tar_metadata['size'])
            },
            {
                "name": "Nr of DICOMs",
                "value": nr_files
            },
        ]
    }
    # add summary measures to visit entry
    meta_item.update(additional_keyvals)
    return meta_item


def add_to_catalog(meta_entry: dict, catalog_dir: str):
    """Add one metadata record to the catalog located at `catalog_dir`.

    The record is serialized as JSON into a temporary file, whose path is
    handed to the catalog's "add" command. The temporary file is removed
    when the command has returned.
    """
    serialized = json.dumps(meta_entry)
    with tempfile.NamedTemporaryFile(mode="w+t") as tmp:
        tmp.write(serialized)
        # rewinding also pushes the buffered content out to disk, so the
        # catalog command can read the file via its name
        tmp.seek(0)
        catalog_api("add", catalog_dir=catalog_dir, metadata=tmp.name)


def read_json_file(file_path):
    """Load and return the content of a JSON file.

    Parameters
    ----------
    file_path: path-like
      Location of the JSON file to read.

    Returns
    -------
    The deserialized JSON document (e.g., a dict or a list).

    Raises
    ------
    OSError
      If the file cannot be opened or read (e.g., FileNotFoundError).
      The original exception is passed on unchanged, it already names
      the offending path.
    ValueError
      If the file content is not valid JSON. The message names the file.
    """
    # JSON documents are UTF-8 by specification; do not depend on the locale
    with open(file_path, encoding='utf-8') as f:
        try:
            return json.load(f)
        except json.JSONDecodeError as err:
            # a real exception must be raised here -- raising a plain string
            # or tuple only produces an unrelated TypeError
            raise ValueError(
                f'invalid JSON content in {file_path}: {err}') from err


def get_gitconfig(conf_name):
    """Return the value `git config <conf_name>` prints, or '' if none.

    The exit code of the git call is not inspected; only its standard
    output is decoded and stripped of trailing whitespace.
    """
    git_call = subprocess.run(
        ['git', 'config', conf_name],
        capture_output=True,
    )
    raw_value = git_call.stdout
    return raw_value.decode().rstrip()


def get_metadata_source():
    """Create metadata_sources dict required by catalog schema

    The single source record names this automated addition as the source,
    is stamped with the current time (seconds since the epoch), and
    identifies the agent via the git configuration ('user.name' and
    'user.email') of the executing user.

    Returns
    -------
    dict
      With the keys 'key_source_map' (empty dict) and 'sources' (list
      containing one source record).
    """
    source = {
        'key_source_map': {},
        'sources': [
            {
                'source_name': 'automated_addition',
                'source_version': '0.1.0',
                'source_time': datetime.now().timestamp(),
                # name and email must come from the matching git settings
                # (they were swapped before)
                'agent_email': get_gitconfig('user.email'),
                'agent_name': get_gitconfig('user.name'),
            }
        ],
    }
    return source


def new_dataset_meta_item(ds_id, ds_version, ds_name = '', ds_description = ''):
    """Create a minimal valid dataset metadata blob in catalog schema

    The returned dict carries the record type 'dataset', the given
    ID/version/name/description, and freshly generated metadata source
    information.
    """
    return dict(
        type='dataset',
        dataset_id=ds_id,
        dataset_version=ds_version,
        name=ds_name,
        description=ds_description,
        metadata_sources=get_metadata_source(),
    )


def format_bytes(bytes, decimals=2):
    """Format a byte count as a human-readable string with a binary unit.

    Parameters
    ----------
    bytes: int or float
      Non-negative number of bytes.
    decimals: int
      Number of decimals the value is rounded to. Negative values are
      treated as 0.

    Returns
    -------
    str
      '<value> <unit>', e.g. '1.5 KB'. Units are powers of 1024. Values
      below 1024 (including fractions below 1) are reported in 'Bytes';
      values beyond the largest unit are reported in that unit ('YB').

    Raises
    ------
    ValueError
      If `bytes` is negative.
    """
    if bytes == 0:
        return "0 Bytes"
    if bytes < 0:
        raise ValueError(f"byte count must not be negative, got {bytes!r}")
    k = 1024
    dm = 0 if decimals < 0 else decimals
    sizes = ["Bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"]
    # Determine the unit by repeated division instead of a float logarithm:
    # dividing by a power of two is exact, so there is no rounding issue at
    # unit boundaries, and the index can never leave the unit table.
    value = float(bytes)
    i = 0
    while value >= k and i < len(sizes) - 1:
        value /= k
        i += 1
    return f"{round(value, dm)} {sizes[i]}"


if __name__ == '__main__':
    # Command line entry point: parse the store location and the
    # study/visit identifiers, then hand over to main()
    import argparse
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument(
        "-o", "--store-dir", metavar='PATH', default=os.getcwd(),
        # defaults to the current working directory
        help="Root directory of the ICF data store. "
        "Visit metadata files will be read from it, and the study catalog "
        "will be created or updated in it."
    )
    p.add_argument(
        '--id', nargs=2, metavar=('STUDY-ID', 'VISIT-ID'), required=True,
        help="The study and visit identifiers, used to "
        "locate the visit metadata files in the storage organization."
    )
    args = p.parse_args()
    # --id yields a two-item list: [study ID, visit ID]
    main(store_dir=args.store_dir,
         study_id=args.id[0],
         visit_id=args.id[1],
    )