From 25428f36ed16ebb886c3a11b1c20a7c15df71a35 Mon Sep 17 00:00:00 2001
From: Taylor Salo
Date: Sun, 1 Sep 2024 11:01:59 -0400
Subject: [PATCH] Fix things.

---
 src/fmripost_phase/utils/bids.py     | 385 +++++++++++++++++++++++++++
 src/fmripost_phase/workflows/base.py |   2 -
 2 files changed, 385 insertions(+), 2 deletions(-)
 create mode 100644 src/fmripost_phase/utils/bids.py

diff --git a/src/fmripost_phase/utils/bids.py b/src/fmripost_phase/utils/bids.py
new file mode 100644
index 0000000..7036226
--- /dev/null
+++ b/src/fmripost_phase/utils/bids.py
@@ -0,0 +1,385 @@
+"""Utilities to handle BIDS inputs."""
+
+from __future__ import annotations
+
+import json
+from collections import defaultdict
+from pathlib import Path
+
+from bids.layout import BIDSLayout
+from bids.utils import listify
+from niworkflows.utils.spaces import SpatialReferences
+
+from fmripost_phase.data import load as load_data
+
+
+def extract_entities(file_list: str | list[str]) -> dict:
+    """Return a dictionary of common entities given a list of files.
+
+    Parameters
+    ----------
+    file_list : str | list[str]
+        File path or list of file paths.
+
+    Returns
+    -------
+    entities : dict
+        Dictionary of entities shared across the input files.
+        Entities with multiple values are returned as sorted lists.
+
+    Examples
+    --------
+    >>> extract_entities("sub-01/anat/sub-01_T1w.nii.gz")
+    {'subject': '01', 'suffix': 'T1w', 'datatype': 'anat', 'extension': '.nii.gz'}
+    >>> extract_entities(["sub-01/anat/sub-01_T1w.nii.gz"] * 2)
+    {'subject': '01', 'suffix': 'T1w', 'datatype': 'anat', 'extension': '.nii.gz'}
+    >>> extract_entities(["sub-01/anat/sub-01_run-1_T1w.nii.gz",
+    ...                   "sub-01/anat/sub-01_run-2_T1w.nii.gz"])
+    {'subject': '01', 'run': [1, 2], 'suffix': 'T1w', 'datatype': 'anat', 'extension': '.nii.gz'}
+
+    """
+    from collections import defaultdict
+
+    from bids.layout import parse_file_entities
+
+    entities = defaultdict(list)
+    for e, v in [
+        ev_pair for f in listify(file_list) for ev_pair in parse_file_entities(f).items()
+    ]:
+        entities[e].append(v)
+
+    def _unique(inlist):
+        inlist = sorted(set(inlist))
+        if len(inlist) == 1:
+            return inlist[0]
+        return inlist
+
+    return {k: _unique(v) for k, v in entities.items()}
+
+
+def collect_derivatives(
+    raw_dataset: Path | BIDSLayout | None,
+    derivatives_dataset: Path | BIDSLayout | None,
+    entities: dict | None,
+    fieldmap_id: str | None,
+    spec: dict | None = None,
+    patterns: list[str] | None = None,
+    allow_multiple: bool = False,
+    spaces: SpatialReferences | None = None,
+) -> dict:
+    """Gather existing derivatives and compose a cache.
+
+    TODO: Ingress 'spaces' and search for BOLD+mask in the spaces *or* xfms.
+
+    Parameters
+    ----------
+    raw_dataset : Path | BIDSLayout | None
+        Path to the raw dataset or a BIDSLayout instance.
+    derivatives_dataset : Path | BIDSLayout | None
+        Path to the derivatives dataset or a BIDSLayout instance.
+    entities : dict | None
+        Dictionary of entities to use for filtering.
+    fieldmap_id : str | None
+        Fieldmap ID to use for filtering.
+    spec : dict | None
+        Specification dictionary.
+    patterns : list[str] | None
+        List of patterns to use for filtering.
+    allow_multiple : bool
+        Allow multiple files to be returned for a given query.
+    spaces : SpatialReferences | None
+        Spatial references to select for.
+
+    Returns
+    -------
+    derivs_cache : dict
+        Dictionary mapping derivative names to file paths
+        (or lists of file paths when ``allow_multiple`` is enabled).
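+
+    Examples
+    --------
+    A minimal sketch of a typical call, assuming an fMRIPrep-style derivatives
+    directory; the paths and entities are illustrative, so the doctest is skipped:
+
+    >>> cache = collect_derivatives(
+    ...     raw_dataset=Path('/data/bids'),
+    ...     derivatives_dataset=Path('/data/derivatives/fmriprep'),
+    ...     entities={'subject': '01', 'task': 'rest'},
+    ...     fieldmap_id=None,
+    ... )  # doctest: +SKIP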
+    """
+    if not entities:
+        entities = {}
+
+    if spec is None or patterns is None:
+        _spec = json.loads(load_data.readable('io_spec.json').read_text())
+
+        if spec is None:
+            spec = _spec['queries']
+
+        if patterns is None:
+            patterns = _spec['patterns']
+
+    # Search for derivatives data
+    derivs_cache = defaultdict(list)
+    if derivatives_dataset is not None:
+        layout = derivatives_dataset
+        if isinstance(layout, Path):
+            layout = BIDSLayout(
+                layout,
+                config=['bids', 'derivatives'],
+                validate=False,
+            )
+
+        for k, q in spec['derivatives'].items():
+            # Combine entities with query. Query values override file entities.
+            query = {**entities, **q}
+            item = layout.get(return_type='filename', **query)
+            if not item:
+                derivs_cache[k] = None
+            elif not allow_multiple and len(item) > 1:
+                raise ValueError(f'Multiple files found for {k}: {item}')
+            else:
+                derivs_cache[k] = item[0] if len(item) == 1 else item
+
+        for k, q in spec['transforms'].items():
+            # Combine entities with query. Query values override file entities.
+            # TODO: Drop functional entities (task, run, etc.) from anat transforms.
+            query = {**entities, **q}
+            if k == 'boldref2fmap':
+                query['to'] = fieldmap_id
+
+            item = layout.get(return_type='filename', **query)
+            if not item:
+                derivs_cache[k] = None
+            elif not allow_multiple and len(item) > 1:
+                raise ValueError(f'Multiple files found for {k}: {item}')
+            else:
+                derivs_cache[k] = item[0] if len(item) == 1 else item
+
+        # Search for requested output spaces
+        if spaces is not None:
+            # Put the output-space files/transforms in lists so they can be
+            # parallelized with template_iterator_wf.
+            spaces_found, bold_outputspaces, bold_mask_outputspaces = [], [], []
+            for space in spaces.references:
+                # First try to find processed BOLD+mask files in the requested space
+                bold_query = {**entities, **spec['derivatives']['bold_mni152nlin6asym']}
+                bold_query['space'] = space.space
+                bold_query = {**bold_query, **space.spec}
+                bold_item = layout.get(return_type='filename', **bold_query)
+                bold_outputspaces.append(bold_item[0] if bold_item else None)
+
+                mask_query = {**entities, **spec['derivatives']['bold_mask_mni152nlin6asym']}
+                mask_query['space'] = space.space
+                mask_query = {**mask_query, **space.spec}
+                mask_item = layout.get(return_type='filename', **mask_query)
+                bold_mask_outputspaces.append(mask_item[0] if mask_item else None)
+
+                spaces_found.append(bool(bold_item) and bool(mask_item))
+
+            if all(spaces_found):
+                derivs_cache['bold_outputspaces'] = bold_outputspaces
+                derivs_cache['bold_mask_outputspaces'] = bold_mask_outputspaces
+            else:
+                # The requested spaces were not found, try to find transforms
+                print(
+                    'Not all requested output spaces were found. '
+                    'We will try to find transforms to these spaces and apply them '
+                    'to the BOLD data.',
+                    flush=True,
+                )
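+
+                # One transform (or None) per requested space, in the same order
+                # as spaces.references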
' + 'We will try to find transforms to these spaces and apply them to the BOLD data.', + flush=True, + ) + + spaces_found, anat2outputspaces_xfm = [], [] + for space in spaces.references: + # First try to find processed BOLD+mask files in the requested space + anat2space_query = {**entities, **spec['transforms']['anat2mni152nlin6asym']} + anat2space_query['to'] = space.space + item = layout.get(return_type='filename', **anat2space_query) + anat2outputspaces_xfm.append(item[0] if item else None) + spaces_found.append(bool(item)) + + if all(spaces_found): + derivs_cache['anat2outputspaces_xfm'] = anat2outputspaces_xfm + else: + missing_spaces = ', '.join( + [s.space for s, found in zip(spaces.references, spaces_found) if not found] + ) + raise ValueError( + f'Transforms to the following requested spaces not found: {missing_spaces}.' + ) + + # Search for raw BOLD data + if not derivs_cache and raw_dataset is not None: + if isinstance(raw_dataset, Path): + raw_layout = BIDSLayout(raw_dataset, config=['bids'], validate=False) + else: + raw_layout = raw_dataset + + for k, q in spec['raw'].items(): + # Combine entities with query. Query values override file entities. + query = {**entities, **q} + item = raw_layout.get(return_type='filename', **query) + if not item: + derivs_cache[k] = None + elif not allow_multiple and len(item) > 1: + raise ValueError(f'Multiple files found for {k}: {item}') + else: + derivs_cache[k] = item[0] if len(item) == 1 else item + + return derivs_cache + + +def write_bidsignore(deriv_dir): + bids_ignore = ( + '*.html', + 'logs/', + 'figures/', # Reports + '*_xfm.*', # Unspecified transform files + '*.surf.gii', # Unspecified structural outputs + # Unspecified functional outputs + '*_boldref.nii.gz', + '*_bold.func.gii', + '*_mixing.tsv', + '*_timeseries.tsv', + ) + ignore_file = Path(deriv_dir) / '.bidsignore' + + ignore_file.write_text('\n'.join(bids_ignore) + '\n') + + +def write_derivative_description(bids_dir, deriv_dir, dataset_links=None): + import os + + from fmripost_phase import __version__ + + DOWNLOAD_URL = f'https://github.com/nipreps/fmripost_phase/archive/{__version__}.tar.gz' + + bids_dir = Path(bids_dir) + deriv_dir = Path(deriv_dir) + desc = { + 'Name': 'fMRIPost-AROMA- ICA-AROMA Postprocessing Outputs', + 'BIDSVersion': '1.9.0dev', + 'DatasetType': 'derivative', + 'GeneratedBy': [ + { + 'Name': 'fMRIPost-AROMA', + 'Version': __version__, + 'CodeURL': DOWNLOAD_URL, + } + ], + 'HowToAcknowledge': 'Please cite fMRIPost-AROMA when using these results.', + } + + # Keys that can only be set by environment + if 'FMRIPOST_AROMA_DOCKER_TAG' in os.environ: + desc['GeneratedBy'][0]['Container'] = { + 'Type': 'docker', + 'Tag': f"nipreps/fmriprep:{os.environ['FMRIPOST_AROMA__DOCKER_TAG']}", + } + if 'FMRIPOST_AROMA__SINGULARITY_URL' in os.environ: + desc['GeneratedBy'][0]['Container'] = { + 'Type': 'singularity', + 'URI': os.getenv('FMRIPOST_AROMA__SINGULARITY_URL'), + } + + # Keys deriving from source dataset + orig_desc = {} + fname = bids_dir / 'dataset_description.json' + if fname.exists(): + orig_desc = json.loads(fname.read_text()) + + if 'DatasetDOI' in orig_desc: + desc['SourceDatasets'] = [ + {'URL': f'https://doi.org/{orig_desc["DatasetDOI"]}', 'DOI': orig_desc['DatasetDOI']} + ] + if 'License' in orig_desc: + desc['License'] = orig_desc['License'] + + # Add DatasetLinks + if dataset_links: + desc['DatasetLinks'] = {k: str(v) for k, v in dataset_links.items()} + if 'templateflow' in dataset_links: + desc['DatasetLinks']['templateflow'] = 
+    (deriv_dir / 'dataset_description.json').write_text(json.dumps(desc, indent=4))
+
+
+def validate_input_dir(exec_env, bids_dir, participant_label, need_T1w=True):
+    """Validate the input dataset, ignoring issues and warnings that should not
+    influence this workflow."""
+    import subprocess
+    import sys
+    import tempfile
+
+    validator_config_dict = {
+        'ignore': [
+            'EVENTS_COLUMN_ONSET',
+            'EVENTS_COLUMN_DURATION',
+            'TSV_EQUAL_ROWS',
+            'TSV_EMPTY_CELL',
+            'TSV_IMPROPER_NA',
+            'VOLUME_COUNT_MISMATCH',
+            'BVAL_MULTIPLE_ROWS',
+            'BVEC_NUMBER_ROWS',
+            'DWI_MISSING_BVAL',
+            'INCONSISTENT_SUBJECTS',
+            'INCONSISTENT_PARAMETERS',
+            'BVEC_ROW_LENGTH',
+            'B_FILE',
+            'PARTICIPANT_ID_COLUMN',
+            'PARTICIPANT_ID_MISMATCH',
+            'TASK_NAME_MUST_DEFINE',
+            'PHENOTYPE_SUBJECTS_MISSING',
+            'STIMULUS_FILE_MISSING',
+            'DWI_MISSING_BVEC',
+            'EVENTS_TSV_MISSING',
+            'ACQTIME_FMT',
+            'Participants age 89 or higher',
+            'DATASET_DESCRIPTION_JSON_MISSING',
+            'FILENAME_COLUMN',
+            'WRONG_NEW_LINE',
+            'MISSING_TSV_COLUMN_CHANNELS',
+            'MISSING_TSV_COLUMN_IEEG_CHANNELS',
+            'MISSING_TSV_COLUMN_IEEG_ELECTRODES',
+            'UNUSED_STIMULUS',
+            'CHANNELS_COLUMN_SFREQ',
+            'CHANNELS_COLUMN_LOWCUT',
+            'CHANNELS_COLUMN_HIGHCUT',
+            'CHANNELS_COLUMN_NOTCH',
+            'CUSTOM_COLUMN_WITHOUT_DESCRIPTION',
+            'SUSPICIOUSLY_LONG_EVENT_DESIGN',
+            'SUSPICIOUSLY_SHORT_EVENT_DESIGN',
+            'MALFORMED_BVEC',
+            'MALFORMED_BVAL',
+            'MISSING_TSV_COLUMN_EEG_ELECTRODES',
+            'MISSING_SESSION',
+        ],
+        'error': ['NO_T1W'] if need_T1w else [],
+        'ignoredFiles': ['/dataset_description.json', '/participants.tsv'],
+    }
+    # Limit validation only to data from requested participants
+    if participant_label:
+        all_subs = {s.name[4:] for s in bids_dir.glob('sub-*')}
+        selected_subs = {s[4:] if s.startswith('sub-') else s for s in participant_label}
+        bad_labels = selected_subs.difference(all_subs)
+        if bad_labels:
+            error_msg = (
+                'Data for the requested participant label(s) was not found. Could '
+                'not find data for participant(s): %s. Please verify the requested '
+                'participant labels.'
+            )
+            if exec_env == 'docker':
+                error_msg += (
+                    ' This error can be caused by the input data not being '
+                    'accessible inside the docker container. Please make sure all '
+                    'volumes are mounted properly (see https://docs.docker.com/'
+                    'engine/reference/commandline/run/#mount-volume--v---read-only)'
+                )
+            if exec_env == 'singularity':
+                error_msg += (
+                    ' This error can be caused by the input data not being '
+                    'accessible inside the singularity container. Please make sure '
+                    'all paths are mapped properly (see https://www.sylabs.io/'
+                    'guides/3.0/user-guide/bind_paths_and_mounts.html)'
+                )
+            raise RuntimeError(error_msg % ','.join(bad_labels))
+
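+        # Skip validation of files from subjects that were not requested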
+        ignored_subs = all_subs.difference(selected_subs)
+        if ignored_subs:
+            for sub in ignored_subs:
+                validator_config_dict['ignoredFiles'].append(f'/sub-{sub}/**')
+
+    with tempfile.NamedTemporaryFile(mode='w+', suffix='.json') as temp:
+        temp.write(json.dumps(validator_config_dict))
+        temp.flush()
+        try:
+            subprocess.check_call(['bids-validator', str(bids_dir), '-c', temp.name])  # noqa: S607
+        except FileNotFoundError:
+            print('bids-validator does not appear to be installed', file=sys.stderr)
diff --git a/src/fmripost_phase/workflows/base.py b/src/fmripost_phase/workflows/base.py
index 999eac5..fa71ffd 100644
--- a/src/fmripost_phase/workflows/base.py
+++ b/src/fmripost_phase/workflows/base.py
@@ -285,9 +285,7 @@ def init_single_run_wf(bold_file):
     from fmriprep.workflows.bold.stc import init_bold_stc_wf
     from nipype.interfaces import utility as niu
     from niworkflows.engine.workflows import LiterateWorkflow as Workflow
-    from niworkflows.interfaces.fixes import FixHeaderApplyTransforms as ApplyTransforms
     from niworkflows.interfaces.header import ValidateImage
-    from templateflow.api import get as get_template
 
     from fmripost_phase.interfaces.bids import DerivativesDataSink
     from fmripost_phase.interfaces.laynii import LayNiiPhaseJolt