inm-icf-utilities/bin/catalogify_studyvisit_from_meta

#!/usr/bin/env python3
"""Generate or update a study's DataLad catalog with an entry for a single
visit, based on previously extracted tarball and DICOM metadata.
"""
import json
import math
import os
from pathlib import Path
from uuid import uuid4

from datalad.api import catalog_add
from datalad_catalog.webcatalog import WebCatalog
from datalad_catalog.schema_utils import get_metadata_item

# this points to the top of the ICF data store.
# internally it will be amended with the missing components
# for study and visit deposit locations
icfstore_baseurl = 'https://data.inm-icf.de'
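# (for example, a deposit for a hypothetical visit 'visit01' of study
# 'study01' would be addressed as https://data.inm-icf.de/study01/visit01...;
# see generate_visit_entry() below)
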
# which DICOM tags to extract from DICOM files and store as
# git-annex metadata (e.g., to enable metadata-driven views
# of visit datasets)
dicom_metadata_keys = [
    "SeriesDescription",
    "SeriesNumber",
    "Modality",
    "MRAcquisitionType",
    "ProtocolName",
    "PulseSequenceName",
]
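# (with such metadata in place, a metadata-driven query inside a visit
# dataset could be, e.g., 'git annex view "ProtocolName=*"'; this is an
# illustration, not something this script runs)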


def main(store_dir: str,
         study_id: str,
         visit_id: str):
    store_base_dir = Path(store_dir)
    # where to add the catalog entry
    study_catalog_path = store_base_dir / study_id / 'catalog'
    # locate metadata files
    dataset_metadata_path = store_base_dir / study_id / \
        f'{visit_id}_metadata_tarball.json'
    file_metadata_path = store_base_dir / study_id / \
        f'{visit_id}_metadata_dicoms.json'
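    # (both files are expected to exist already, presumably produced by an
    # earlier metadata-extraction step)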
    # Grab or create the catalog (WebCatalog class)
    # (creating includes generating and adding a study entry for the catalog)
    ctlg = get_catalog(study_id, study_catalog_path)
    # Generate and add a visit entry for the catalog
    visit_entry = generate_visit_entry(
        study_id,
        visit_id,
        dataset_metadata_path,
        file_metadata_path)
    catalog_add(
        catalog=study_catalog_path,
        metadata=visit_entry,
    )
    # Add visit entry as subdataset to study entry
    super_dict = ctlg.get_main_dataset()
    subdatasets = [
        {
            'dataset_id': visit_entry['dataset_id'],
            'dataset_version': visit_entry['dataset_version'],
            'dataset_path': visit_id,
        }
    ]
    update_entry(
        super_dict['dataset_id'],
        super_dict['dataset_version'],
        study_id,
        'subdatasets',
        subdatasets,
        study_catalog_path)


def get_catalog(study_id, catalog_path):
    """Get the study's WebCatalog, creating it first if it does not exist"""
    package_path = Path(__file__).resolve().parent.parent
    # Instantiate WebCatalog object
    ctlg = WebCatalog(location=str(catalog_path))
    # If catalog does not exist:
    if not ctlg.is_created():
        # 1. create it
        ctlg.create(
            config_file=str(package_path / 'assets' / 'catalog_config.json'),
        )
        # 2. generate and add the study-level catalog entry
        study_entry = generate_study_entry(study_id)
        catalog_add(
            catalog=catalog_path,
            metadata=study_entry,
        )
        # 3. set catalog home page
        ctlg.set_main_dataset(
            dataset_id=study_entry.get('dataset_id'),
            dataset_version=study_entry.get('dataset_version'),
        )
    return ctlg


def generate_study_entry(study_id):
    """Generate a catalog entry for the study itself (the catalog homepage)"""
    desc = f"""This data catalog presents the DICOM data collected
    for all visits of the study: {study_id}. Browse through details
    of all study visits in the 'Subdatasets' tab below."""
    meta_item = get_metadata_item(
        item_type='dataset',
        dataset_id=str(uuid4()),
        dataset_version='latest',
        source_name='automated_addition',
        source_version='0.1.0',
    )
    meta_item['name'] = study_id
    meta_item['description'] = desc
    return meta_item
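
# for reference, the study entry generated above is roughly of this form
# (the exact field set depends on the datalad-catalog schema version):
# {
#     "type": "dataset",
#     "dataset_id": "<uuid4>",
#     "dataset_version": "latest",
#     "name": "<study-id>",
#     "description": "This data catalog presents ...",
#     ...
# }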


def update_entry(ds_id, ds_version, ds_name, key, value, study_catalog_path):
    """Add or update a single key of an already cataloged entry

    Works by submitting a minimal metadata item for the same dataset
    id/version, which the catalog merges into the existing entry.
    """
    meta_item = get_metadata_item(
        item_type='dataset',
        dataset_id=ds_id,
        dataset_version=ds_version,
        source_name='automated_addition',
        source_version='0.1.0',
    )
    meta_item['name'] = ds_name
    meta_item.update({key: value})
    catalog_add(
        catalog=study_catalog_path,
        metadata=meta_item,
    )
    return meta_item


def generate_visit_entry(study_id, visit_id, metapath_dataset, metapath_file):
    """Generate a catalog entry for a single visit of the study"""
    # Create base visit entry
    desc = f"""This page presents the DICOM data collected for the visit
    {visit_id} during the imaging study {study_id}. Browse through details
    of this particular study visit in the 'DICOM' tab below."""
    meta_item = get_metadata_item(
        item_type='dataset',
        dataset_id=str(uuid4()),
        dataset_version='latest',
        source_name='automated_addition',
        source_version='0.1.0',
    )
    meta_item['name'] = visit_id
    meta_item['description'] = desc
    # Load tarball metadata
    tar_metadata = read_json_file(metapath_dataset)
    expected_keys = ('size', 'md5', 'dspath', 'storepath')
    if not all(k in tar_metadata for k in expected_keys):
        raise ValueError(f'incomplete tarball metadata at {metapath_dataset}')
    # add dataset url
    # ('{{annex_key}}' stays a literal placeholder for DataLad's
    # datalad-annex machinery to fill in; it is not an f-string field here)
    access_url_pre = 'datalad-annex::?type=external&externaltype=uncurl&url='
    access_url_post = '_{{annex_key}}&encryption=none'
    access_url = f'{access_url_pre}{icfstore_baseurl}/{study_id}/{visit_id}{access_url_post}'
    meta_item.update(dict(url=access_url))
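    # the resulting URL is directly cloneable, e.g. (with hypothetical
    # identifiers, and datalad-next installed):
    #   datalad clone 'datalad-annex::?type=external&externaltype=uncurl&url=https://data.inm-icf.de/study01/visit01_{{annex_key}}&encryption=none'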
    # Load dicom metadata and derive some summary measures
    dicoms = read_json_file(metapath_file)
    nr_files = len(dicoms)
    unique_tag_vals = {}
    for k in dicom_metadata_keys:
        # unique, non-empty values of each tag across all DICOM files
        # (d.get() tolerates files that lack a tag entirely)
        unique_tag_vals[k] = list(filter(None, set(d.get(k) for d in dicoms)))
    additional_keyvals = {
        "keywords": unique_tag_vals['ProtocolName'] + unique_tag_vals['Modality'],
        "additional_display": [
            {
                "name": "DICOM",
                "icon": "far fa-file-image",
                "content": unique_tag_vals,
            }
        ],
        "top_display": [
            {
                "name": "TAR file size",
                "value": format_bytes(tar_metadata['size']),
            },
            {
                "name": "Nr of DICOMs",
                "value": nr_files,
            },
        ],
    }
    # add summary measures to visit entry
    meta_item.update(additional_keyvals)
    return meta_item


def read_json_file(file_path):
    """
    Load content from catalog metadata file for current node
    """
    try:
        with open(file_path) as f:
            return json.load(f)
    except OSError as err:
        raise RuntimeError(f"OS error while reading {file_path}: {err}") from err
    except json.JSONDecodeError as err:
        raise RuntimeError(f"Invalid JSON in {file_path}: {err}") from err


def format_bytes(size, decimals=2):
    """Render a byte count as a human-readable string"""
    if size == 0:
        return "0 Bytes"
    k = 1024
    dm = 0 if decimals < 0 else decimals
    sizes = ["Bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"]
    # largest power of 1024 that fits (clamped to the last known unit)
    i = min(int(math.floor(math.log(size) / math.log(k))), len(sizes) - 1)
    return f"{round(size / math.pow(k, i), dm)} {sizes[i]}"


if __name__ == '__main__':
    import argparse
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument(
        "-o", "--store-dir", metavar='PATH', default=os.getcwd(),
        help="Root directory of the ICF data store. "
        "Visit metadata will be read from it, and the catalog will be "
        "deposited into it."
    )
    p.add_argument(
        '--id', nargs=2, metavar=('STUDY-ID', 'VISIT-ID'), required=True,
        help="The study and visit identifiers, used to "
        "locate the visit's metadata in the storage organization."
    )
    args = p.parse_args()
    main(store_dir=args.store_dir,
         study_id=args.id[0],
         visit_id=args.id[1],
         )
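
# example invocation (hypothetical store path and identifiers):
#   catalogify_studyvisit_from_meta -o /data/icf-store --id study01 visit01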