inm-icf-utilities/bin/catalogify_studyvisit_from_meta
2023-05-31 10:46:46 +02:00

269 lines
8.3 KiB
Python
Executable file

#!/usr/bin/env python3
"""Register a study visit in the data catalog of its study.

The catalog is located at ``<store-dir>/<study-id>/catalog``. If it does not
exist yet, it is created together with a study-level entry. The visit entry
is generated from the two metadata files
``<store-dir>/<study-id>/<visit-id>_metadata_tarball.json`` and
``<store-dir>/<study-id>/<visit-id>_metadata_dicoms.json``, added to the
catalog, and finally linked to the study entry as a subdataset.
"""
from datetime import datetime
import json
import math
import os
from pathlib import Path
import subprocess
import sys
import tempfile
from uuid import uuid4
import datalad.api as dl
from datalad_catalog.catalog import Catalog
from datalad_catalog.webcatalog import WebCatalog
# Root URL of the ICF data store. Only the top level is given here; the
# components for study and visit deposit locations are appended internally.
icfstore_baseurl = "https://data.inm-icf.de"

# DICOM tags to extract from DICOM files and store as git-annex metadata
# (e.g., to enable metadata-driven views of visit datasets).
dicom_metadata_keys = [
    'SeriesDescription',
    'SeriesNumber',
    'Modality',
    'MRAcquisitionType',
    'ProtocolName',
    'PulseSequenceName',
]
# Instantiate the datalad-catalog `Catalog` interface object once, at import
# time. It is invoked like a function further down in this file, with the
# action name ("add") as first argument.
catalog_api = Catalog()
def main(store_dir: str,
         study_id: str,
         visit_id: str):
    """Add one visit to the catalog of its study and link it to the study.

    Parameters:
      store_dir: root directory of the data store
      study_id: identifier of the study; also the name of the study
        directory underneath `store_dir`
      visit_id: identifier of the visit; prefix of the metadata file names
    """
    study_dir = Path(store_dir) / study_id

    # the catalog lives inside the study directory ...
    study_catalog_path = study_dir / 'catalog'
    # ... right next to the per-visit metadata files
    dataset_metadata_path = study_dir / f'{visit_id}_metadata_tarball.json'
    file_metadata_path = study_dir / f'{visit_id}_metadata_dicoms.json'

    # Obtain the catalog (WebCatalog instance). When it has to be created
    # first, a study-level entry is generated and added along the way.
    ctlg = get_catalog(study_id, study_catalog_path)

    # Build the visit-level entry and put it into the catalog
    visit_entry = generate_visit_entry(
        study_id, visit_id, dataset_metadata_path, file_metadata_path)
    add_to_catalog(visit_entry, str(study_catalog_path))

    # Look up id/version of the catalog's main (study) dataset, and register
    # the visit as a subdataset of it
    super_dict = read_json_file(ctlg.location / 'metadata' / 'super.json')
    visit_link = {
        'dataset_id': visit_entry['dataset_id'],
        'dataset_version': visit_entry['dataset_version'],
        'dataset_path': visit_id,
    }
    update_entry(
        super_dict['dataset_id'],
        super_dict['dataset_version'],
        study_id,
        'subdatasets',
        [visit_link],
        study_catalog_path,
    )
def get_catalog(study_id, catalog_path):
    """Return the WebCatalog for `catalog_path`, creating it when missing.

    A newly created catalog immediately receives a study-level entry for
    `study_id`, and that entry is set as the catalog's main dataset.
    An already existing catalog is returned without modification.
    """
    # the catalog configuration ships with this package, two levels above
    # the location of this script
    config_path = (
        Path(__file__).resolve().parents[1] / 'assets' / 'catalog_config.json'
    )
    ctlg = WebCatalog(
        location=str(catalog_path),
        config_file=str(config_path),
        catalog_action='create',
    )
    if ctlg.is_created():
        # nothing to set up
        return ctlg

    # 1. create the catalog itself
    ctlg.create()
    # 2. generate the study-level entry and add it
    study_entry = generate_study_entry(study_id)
    add_to_catalog(study_entry, str(catalog_path))
    # 3. make the study entry the home page of the catalog
    ctlg.main_id = study_entry.get('dataset_id')
    ctlg.main_version = study_entry.get('dataset_version')
    ctlg.set_main_dataset()
    return ctlg
def generate_study_entry(study_id):
    """Return a new study-level dataset entry for `study_id`.

    The entry gets a freshly generated UUID as dataset ID, the fixed
    version label 'latest', the study ID as name, and a generic description.
    """
    # NOTE: the continuation lines of this text are flush-left, so that no
    # code indentation ends up inside the description.
    study_description = f"""This data catalog presents the DICOM data collected
for all visits of the study: {study_id}. Browse through details
of all study visits in the 'Subdatasets' tab below."""
    return new_dataset_meta_item(
        str(uuid4()),
        'latest',
        ds_name=study_id,
        ds_description=study_description,
    )
def update_entry(ds_id, ds_version, ds_name, key, value, study_catalog_path):
    """Add a single `key`/`value` pair to a dataset entry of the catalog.

    A minimal dataset record identifying the entry (ID, version, name) is
    built, extended with the given property, and passed to the catalog at
    `study_catalog_path`. The record that was submitted is returned.
    """
    entry = dict(
        type='dataset',
        dataset_id=ds_id,
        dataset_version=ds_version,
        name=ds_name,
        metadata_sources=get_metadata_source(),
    )
    entry[key] = value
    add_to_catalog(entry, str(study_catalog_path))
    return entry
def generate_visit_entry(study_id, visit_id, metapath_dataset, metapath_file):
    """Build the catalog entry for a single study visit.

    Parameters:
      study_id: identifier of the study the visit belongs to
      visit_id: identifier of the visit
      metapath_dataset: path of the JSON file with the tarball metadata;
        must contain the keys 'size', 'md5', 'dspath', and 'storepath'
        (only 'size' is used here)
      metapath_file: path of the JSON file with per-DICOM metadata; it is
        iterated as a sequence of records that are queried by DICOM tag name

    Returns:
      dict with the dataset entry, including the access URL, keywords, and
      display items summarizing the DICOM metadata.

    Raises:
      ValueError: if the tarball metadata lacks one of the expected keys.
    """
    # Create base visit entry.
    # NOTE: the continuation lines of this text are flush-left, so that no
    # code indentation ends up inside the description.
    desc = f"""This page presents the DICOM data collected for the visit
{visit_id} during the imaging study {study_id}. Browse through details
of this particular study visit in the 'DICOM' tab below."""
    meta_item = new_dataset_meta_item(
        ds_id=str(uuid4()),
        ds_version='latest',
        ds_name=visit_id,
        ds_description=desc)

    # Load tarball metadata
    tar_metadata = read_json_file(metapath_dataset)
    expected_keys = ('size', 'md5', 'dspath', 'storepath')
    if not all(k in tar_metadata for k in expected_keys):
        raise ValueError(f'incomplete tarball metadata at {metapath_dataset}')

    # Add dataset url.
    # `access_url_post` is a plain string (no f-prefix), hence the doubled
    # braces are kept literally and show up as `{{annex_key}}` in the URL.
    # NOTE(review): presumably a placeholder that is resolved by the
    # consumer of the URL -- confirm.
    access_url_pre = 'datalad-annex::?type=external&externaltype=uncurl&url='
    access_url_post = '_{{annex_key}}&encryption=none'
    access_url = f'{access_url_pre}{icfstore_baseurl}/{study_id}/{visit_id}{access_url_post}'
    meta_item['url'] = access_url

    # Load dicom metadata and derive some summary measures
    dicoms = read_json_file(metapath_file)
    nr_files = len(dicoms)
    unique_tag_vals = {}
    for k in dicom_metadata_keys:
        # Collect unique values in order of first appearance. Unlike going
        # through a set(), this yields the same list on every run.
        values = []
        for d in dicoms:
            # a record without this tag is treated like one with no value
            v = d.get(k)
            # only skip "no value" markers; a legitimate 0 (e.g., a series
            # number) is kept
            if v is None or v == '' or v in values:
                continue
            values.append(v)
        unique_tag_vals[k] = values

    additional_keyvals = {
        "keywords": unique_tag_vals['ProtocolName'] + unique_tag_vals['Modality'],
        "additional_display": [
            {
                "name": "DICOM",
                "icon": "far fa-file-image",
                "content": unique_tag_vals
            }
        ],
        "top_display": [
            {
                "name": "TAR file size",
                "value": format_bytes(tar_metadata['size'])
            },
            {
                "name": "Nr of DICOMs",
                "value": nr_files
            },
        ]
    }
    # add summary measures to visit entry
    meta_item.update(additional_keyvals)
    return meta_item
def add_to_catalog(meta_entry: dict, catalog_dir: str):
    """Add a metadata entry to the catalog located at `catalog_dir`.

    The entry is serialized as JSON into a named temporary file, and the
    path of that file is passed to the "add" action of the catalog API.
    The temporary file is removed when the `with` block is left.
    """
    with tempfile.NamedTemporaryFile(mode="w+t") as tmp_file:
        tmp_file.write(json.dumps(meta_entry))
        # rewind; this also pushes the buffered text out to the file, so
        # that the catalog API finds the complete record under the file name
        tmp_file.seek(0)
        catalog_api("add", catalog_dir=catalog_dir, metadata=tmp_file.name)
def read_json_file(file_path):
    """Load and return the content of the JSON file at `file_path`.

    Parameters:
      file_path: path (str or path-like) of the file to read

    Returns:
      The deserialized JSON document.

    Raises:
      OSError: if the file cannot be opened or read. The exception is
        propagated unchanged, it already names the offending file.
      ValueError: if the file content is not valid JSON.
    """
    # JSON documents are UTF-8 by specification; do not depend on the locale
    with open(file_path, encoding='utf-8') as f:
        try:
            return json.load(f)
        except json.JSONDecodeError as err:
            # add the file name, which the decoder error does not carry
            raise ValueError(f'invalid JSON in {file_path}: {err}') from err
def get_gitconfig(conf_name):
    """Return what `git config <conf_name>` prints, without trailing whitespace.

    The exit status of git is not checked; the return value is whatever
    the command wrote to its standard output.
    """
    completed = subprocess.run(
        ['git', 'config', conf_name],
        capture_output=True,
    )
    return completed.stdout.decode().rstrip()
def get_metadata_source():
    """Create metadata_sources dict required by catalog schema

    The single source record carries the current time as a POSIX timestamp,
    and identifies the agent via the `user.name` and `user.email` settings
    of the Git configuration.
    """
    source = {
        'key_source_map': {},
        'sources': [
            {
                'source_name': 'automated_addition',
                'source_version': '0.1.0',
                'source_time': datetime.now().timestamp(),
                # name goes to name, email to email (these two lookups
                # used to be swapped)
                'agent_name': get_gitconfig('user.name'),
                'agent_email': get_gitconfig('user.email'),
            }
        ],
    }
    return source
def new_dataset_meta_item(ds_id, ds_version, ds_name='', ds_description=''):
    """Return a minimal dataset metadata record in the catalog schema.

    The record has the type 'dataset', carries the given ID, version, name,
    and description, and a newly generated `metadata_sources` property.
    """
    return dict(
        type='dataset',
        dataset_id=ds_id,
        dataset_version=ds_version,
        name=ds_name,
        description=ds_description,
        metadata_sources=get_metadata_source(),
    )
def format_bytes(bytes, decimals=2):
    """Return a human-readable label for a byte count, e.g. '1.5 KB'.

    Units advance in steps of 1024, from "Bytes" up to "YB". Values too
    large for the biggest unit are expressed in that unit; values below
    1024 (including fractions) are reported in "Bytes".

    Parameters:
      bytes: non-negative number of bytes
      decimals: number of decimal places to round to; negative values are
        treated as 0

    Raises:
      ValueError: if `bytes` is negative.
    """
    if bytes == 0:
        return "0 Bytes"
    if bytes < 0:
        raise ValueError(f"byte count must not be negative, got {bytes}")
    k = 1024
    dm = 0 if decimals < 0 else decimals
    sizes = ["Bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"]
    # Scale by repeated division rather than via a logarithm: dividing a
    # float by 1024 is exact, so the unit cannot be off by one at a
    # boundary, and the index cannot leave the range of `sizes`.
    value = float(bytes)
    i = 0
    while value >= k and i < len(sizes) - 1:
        value /= k
        i += 1
    return f"{round(value, dm)} {sizes[i]}"
# Command line entry point: parse the store location and the study/visit
# identifiers, and hand them to main()
if __name__ == '__main__':
    import argparse
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument(
        "-o", "--store-dir", metavar='PATH', default=os.getcwd(),
        # this script reads metadata files and writes the catalog; it does
        # not deposit a DataLad dataset
        help="Root directory of the ICF data store. "
        "Visit metadata will be read from it, and the study catalog will be "
        "created or updated in it."
    )
    p.add_argument(
        '--id', nargs=2, metavar=('STUDY-ID', 'VISIT-ID'), required=True,
        help="The study and visit identifiers, used to "
        "locate the visit metadata files in the storage organization."
    )
    args = p.parse_args()
    # nargs=2 guarantees exactly two values, in the order given by metavar
    study_id, visit_id = args.id
    main(store_dir=args.store_dir,
         study_id=study_id,
         visit_id=visit_id,
         )