#!/usr/bin/env python3
"""
This command locates the DICOM tarball for a particular visit of a study
(both given by their respective identifiers) in the data store, and
extracts a minimal set of metadata for each DICOM image and for the TAR
archive as a whole. These metadata are then deposited in two JSON files
in the study directory:

- `{visit_id}_metadata_tarball.json`

  JSON object with basic properties of the archive, such as 'size' and
  'md5'.

- `{visit_id}_metadata_dicoms.json`

  JSON array with essential properties for each DICOM image file, such as
  'path' (relative path inside the TAR archive), 'md5' (MD5 checksum of
  the DICOM file), 'size' (in bytes), and select standard DICOM tags,
  such as "SeriesDescription", "SeriesNumber", "Modality",
  "MRAcquisitionType", "ProtocolName", and "PulseSequenceName". The
  latter enable a rough, technical characterization of the images in the
  TAR archive.
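
For illustration, a deposit could look like this (all values are
hypothetical):

  {visit_id}_metadata_tarball.json:

    {"size": 123456789, "md5": "6cdf...", ...}

  {visit_id}_metadata_dicoms.json:

    [{"path": "visit1/0001.dcm", "md5": "a3f8...", "size": 131072,
      "SeriesDescription": "t1w_mprage", "Modality": "MR", ...}, ...]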
"""
import logging
import os
from pathlib import Path

import json
# this implementation works with pydicom 2.x
from pydicom import dcmread
from pydicom.errors import InvalidDicomError

import datalad.api as dl
from datalad.utils import md5sum


lgr = logging.getLogger('inm-icf-utilities')

# which DICOM tags to extract from DICOM files and store as
# git-annex metadata (e.g., to enable metadata-driven views
# of visit datasets)
dicom_metadata_keys = [
    "SeriesDescription",
    "SeriesNumber",
    "Modality",
    "MRAcquisitionType",
    "ProtocolName",
    "PulseSequenceName",
]
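# (a hypothetical downstream use: once these tags are recorded as
# git-annex metadata, 'git annex view "SeriesDescription=*"' would
# organize a visit dataset into one directory per acquisition series)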


def main(store_dir: str,
         study_id: str,
         visit_id: str):
    store_base_dir = Path(store_dir)
    # where to create metadata
    dataset_metadata_path = store_base_dir / study_id / \
        f'{visit_id}_metadata_tarball.json'
    file_metadata_path = store_base_dir / study_id / \
        f'{visit_id}_metadata_dicoms.json'
    # do not overwrite existing metadata
    for fp in [dataset_metadata_path, file_metadata_path]:
        deposit_conflicts = [str(p) for p in fp.parent.glob(fp.name)]
        if deposit_conflicts:
            # be safe
            raise ValueError(
                f'existing metadata deposit {deposit_conflicts}, '
                'refusing to overwrite')
    # locate input tarball
    tar_path = store_base_dir / study_id / f'{visit_id}_dicom.tar'
    if not tar_path.exists():
        raise ValueError(f'no tarball at {tar_path}')

    describe_tarball(
        # source visit tarball
        tar_path.resolve(),
        # source visit tarball URL, relative to store
        f'{study_id}/{tar_path.name}',
        # path to deposit dataset metadata
        dataset_metadata_path.absolute(),
        # path to deposit file metadata
        file_metadata_path.absolute(),
    )


def describe_tarball(tarpath, tarurl, metapath_dataset, metapath_file):
    # construct and dump dataset metadata
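    # 'dspath' anticipates where the tarball would be placed inside a
    # visit dataset (under 'icf/'); 'storepath' locates it relative to
    # the store root, for use as a store-relative URL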
    tar_meta = {
        'size': tarpath.stat().st_size,
        'md5': md5sum(tarpath),
        'dspath': str(Path('icf', tarpath.name)),
        'storepath': tarurl,
    }
    with open(metapath_dataset, "w") as f:
        json.dump(tar_meta, f)
    # construct and dump dicom file metadata
    dicoms = []
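    # ls_file_collection() is provided by the datalad-next extension;
    # for each tarball member it yields a result record with basic
    # properties (and the requested MD5 hash), plus an open file-like
    # object under the 'fp' key for file-type items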
    for r in dl.ls_file_collection(
            'tarfile',
            tarpath,
            hash=['md5'],
            result_renderer='disabled',
            return_type='generator'):
        fp = r.get('fp')
        if fp is None:
            # not a file
            continue
        d = dict(path=str(r['item']), md5=r['hash-md5'], size=r['size'])
        # rewind the file pointer, so dcmread() reads from the start
        fp.seek(0)
        try:
            with dcmread(
                fp,
                stop_before_pixels=True,
                specific_tags=dicom_metadata_keys,
            ) as dcm:
                # extract target DICOM tags for metadata
                d.update({
                    dmk: getattr(dcm, dmk, '')
                    for dmk in dicom_metadata_keys
                })
                dicoms.append(d)
        except InvalidDicomError:
            lgr.info('skipping non-DICOM file: %s', r['item'])
    with open(metapath_file, "w") as f:
        json.dump(dicoms, f)


if __name__ == '__main__':
    import argparse
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument(
        "-o", "--store-dir", metavar='PATH', default=os.getcwd(),
        help="Root directory of the data store. "
        "Visit data will be read from it, and extracted metadata will be "
        "deposited into it."
    )
    p.add_argument(
        '--id', nargs=2, metavar=('STUDY-ID', 'VISIT-ID'), required=True,
        help="The study and visit identifiers, used to "
        "locate the visit archive in the storage organization."
    )
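    # example invocation (store path and identifiers are hypothetical):
    #
    #   <this-script> -o /data/store --id study1 visit1
    #
    # this reads /data/store/study1/visit1_dicom.tar and deposits
    # /data/store/study1/visit1_metadata_tarball.json and
    # /data/store/study1/visit1_metadata_dicoms.json next to it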
    args = p.parse_args()
    main(store_dir=args.store_dir,
         study_id=args.id[0],
         visit_id=args.id[1],
         )