Source code for bids.variables.io

""" Tools for reading/writing BIDS data files. """

import warnings
import json

import numpy as np
import pandas as pd

from bids.utils import listify
from .entities import NodeIndex
from .variables import SparseRunVariable, DenseRunVariable, SimpleVariable


BASE_ENTITIES = ['subject', 'session', 'task', 'run']
ALL_ENTITIES = BASE_ENTITIES + ['datatype', 'suffix', 'acquisition']


def load_variables(layout, types=None, levels=None, skip_empty=True,
                   dataset=None, scope='all', regex_search=None,
                   **kwargs):
    """A convenience wrapper for one or more load_*_variables() calls.

    Parameters
    ----------
    layout : :obj:`bids.layout.BIDSLayout`
        BIDSLayout containing variable files.
    types : str or list
        Types of variables to retrieve. All valid values reflect the
        filename stipulated in the BIDS spec for each kind of variable.
        Valid values include: 'events', 'physio', 'stim', 'scans',
        'participants', 'sessions', and 'regressors'.
    levels : str or list
        Optional level(s) of variables to load. Valid values are 'run',
        'session', 'subject', or 'dataset'. This is simply a shorthand
        way to specify types--e.g., 'run' will be converted to
        types=['events', 'physio', 'stim', 'regressors'].
    skip_empty : bool
        Whether or not to skip empty Variables (i.e., where there are no
        rows/records in a file after applying any filtering operations
        like dropping NaNs).
    dataset : NodeIndex
        An existing NodeIndex container to store the loaded data in. Can
        be used to iteratively construct a dataset that contains
        otherwise heterogeneous sets of variables. If None, a new
        NodeIndex is used.
    scope : str or list
        The scope of the space to search for variables. See docstring
        for BIDSLayout for details and valid predefined values.
    regex_search : bool
        Whether to use regex search when filtering rows of .tsv files on
        the passed selectors. If None, falls back on the BIDSLayout's
        own regex_search setting.
    kwargs : dict
        Optional keyword arguments to pass onto the individual
        load_*_variables() calls.

    Returns
    -------
    A NodeIndex instance.

    Examples
    --------
    >>> load_variables(layout, ['events', 'physio'], subject='01')  # doctest: +SKIP
    # returns all variables stored in _events.tsv and _physio.tsv.gz files
    # for runs that belong to subject with id '01'.
    """
    TYPES = ['events', 'physio', 'stim', 'scans', 'participants',
             'sessions', 'regressors']

    types = listify(types)

    if types is None:
        if levels is not None:
            types = []
            lev_map = {
                'run': ['events', 'physio', 'stim', 'regressors'],
                'session': ['scans'],
                'subject': ['sessions', 'scans'],
                'dataset': ['participants']
            }
            for level in listify(levels):
                types.extend(lev_map[level.lower()])
        else:
            types = TYPES

    bad_types = set(types) - set(TYPES)
    if bad_types:
        raise ValueError("Invalid variable types: %s" % bad_types)

    dataset = dataset or NodeIndex()

    run_types = list({'events', 'physio', 'stim', 'regressors'} - set(types))
    type_flags = {t: False for t in run_types}
    if len(type_flags) < 4:
        _kwargs = kwargs.copy()
        _kwargs.update(type_flags)
        dataset = _load_time_variables(layout, dataset, scope=scope,
                                       **_kwargs)

    for t in ({'scans', 'sessions', 'participants'} & set(types)):
        kwargs.pop('suffix', None)  # suffix is always one of the values above
        dataset = _load_tsv_variables(layout, t, dataset, scope=scope,
                                      regex_search=regex_search, **kwargs)

    return dataset
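
# A minimal usage sketch of the wrapper above (the `bids_root` path is
# hypothetical; any valid BIDS dataset root would do):
#
#     from bids.layout import BIDSLayout
#     layout = BIDSLayout(bids_root)
#     index = load_variables(layout, types=['events'], subject='01')
#     run_nodes = index.get_nodes('run')
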
def _get_nvols(img_f):
    import nibabel as nb
    img = nb.load(img_f)
    nvols = 0
    # Nifti1Image is a subclass of Nifti1Pair, so this covers both
    if isinstance(img, nb.Nifti1Pair):
        nvols = img.shape[3]
    elif isinstance(img, nb.Cifti2Image):
        for ax in map(img.header.get_axis, range(len(img.header.matrix))):
            if isinstance(ax, nb.cifti2.SeriesAxis):
                nvols = ax.size
                break
        else:
            raise ValueError("No series axis found in %s" % img_f)
    elif isinstance(img, nb.GiftiImage):
        nvols = len(img.get_arrays_from_intent('time series'))
    else:
        raise ValueError("Unknown image type %s: %s" %
                         (img.__class__, img_f))
    return nvols


def _load_time_variables(layout, dataset=None, columns=None, scan_length=None,
                         drop_na=True, events=True, physio=True, stim=True,
                         regressors=True, skip_empty=True, scope='all',
                         **selectors):
    """Loads run-level variables (events, confound regressors, and
    physiological/stimulus recordings) and returns them in a NodeIndex.

    Parameters
    ----------
    layout : :obj:`bids.layout.BIDSLayout`
        A BIDSLayout to scan.
    dataset : NodeIndex
        A BIDS NodeIndex container. If None, a new one is initialized.
    columns : list
        Optional list of names specifying which columns in the event
        files to read. By default, reads all columns found.
    scan_length : float
        Optional duration of runs (in seconds). By default, this will be
        extracted from the BOLD image. However, in cases where the user
        doesn't have access to the images (e.g., because only file
        handles are locally available), a fixed duration can be manually
        specified as a fallback.
    drop_na : bool
        If True, removes all events where amplitude is n/a. If False,
        leaves n/a values intact. Note that in the latter case,
        transformations that require numeric values may fail.
    events : bool
        If True, extracts variables from events.tsv files.
    physio : bool
        If True, extracts variables from _physio files.
    stim : bool
        If True, extracts variables from _stim files.
    regressors : bool
        If True, extracts variables from confound files (_regressors or
        _timeseries suffixes).
    skip_empty : bool
        Whether or not to skip empty Variables (i.e., where there are no
        rows/records in a file, or all onsets, durations, and amplitudes
        are 0).
    scope : str or list
        The scope of the space to search for variables. See docstring
        for BIDSLayout for details and valid predefined values.
    selectors : dict
        Optional keyword arguments passed on to the BIDSLayout instance's
        get() method; can be used to constrain which data are loaded.

    Returns
    -------
    A NodeIndex instance.
    """
    # Work on a copy of the selectors so we don't mutate the caller's dict
    selectors = selectors.copy()

    if dataset is None:
        dataset = NodeIndex()

    selectors['datatype'] = 'func'
    selectors['suffix'] = 'bold'
    exts = selectors.pop('extension',
                         ['.nii', '.nii.gz', '.func.gii', '.dtseries.nii'])

    images = layout.get(return_type='object', scope=scope, extension=exts,
                        **selectors)

    if not images:
        raise ValueError("No functional images that match criteria found.")

    # Main loop over images
    for img_obj in images:

        entities = img_obj.entities
        img_f = img_obj.path

        # Run is not mandatory, but we need a default for proper indexing
        if 'run' in entities:
            entities['run'] = int(entities['run'])

        tr = img_obj.get_metadata()["RepetitionTime"]

        # Get duration of run: first try to get it directly from the image
        # header; if that fails, look for a scan_length argument.
        try:
            nvols = _get_nvols(img_f)
            duration = nvols * tr
        except Exception as e:
            if scan_length is not None:
                duration = scan_length
                nvols = int(np.rint(scan_length / tr))
            else:
                msg = ("Unable to extract scan duration from one or more "
                       "BOLD runs, and no scan_length argument was provided "
                       "as a fallback. Please check that the image files are "
                       "available, or manually specify the scan duration.")
                raise ValueError(msg) from e

        # We don't want to pass all the image file's entities onto
        # get_node(), as there can be unhashable nested slice timing
        # values, and this also slows down querying unnecessarily.
        # Instead, pick out files only based on the core BIDS entities and
        # any entities explicitly passed as selectors.
        # TODO: one downside of this approach is the stripped entities also
        # won't be returned in the resulting node due to the way things are
        # implemented. Consider adding a flag to control this.
        select_on = {k: v for (k, v) in entities.items()
                     if k in BASE_ENTITIES or k in selectors}

        # If a matching node already exists, reuse it
        result = dataset.get_nodes('run', select_on)

        if result:
            if len(result) > 1:
                raise ValueError("More than one existing Node matches the "
                                 "specified entities! You may need to pass "
                                 "additional selectors to narrow the search.")
            run = result[0]
        else:
            # Otherwise create a new node and use that.
            # We first convert any entity values that are currently
            # collections to JSON strings to prevent nasty hashing problems
            # downstream. Note that isinstance() isn't as foolproof as
            # actually trying to hash the value, but the latter is likely
            # to be slower, and since values are coming from JSON or
            # filenames, there's no real chance of encountering anything
            # but a list or dict.
            entities = {
                k: (json.dumps(v) if isinstance(v, (list, dict)) else v)
                for (k, v) in entities.items()
            }

            run = dataset.create_node('run', entities, image_file=img_f,
                                      duration=duration, repetition_time=tr,
                                      n_vols=nvols)

        run_info = run.get_info()

        # Process event files
        if events:
            dfs = layout.get_nearest(
                img_f, extension='.tsv', suffix='events', all_=True,
                full_search=True,
                ignore_strict_entities=['suffix', 'extension'])
            for _data in dfs:
                _data = pd.read_csv(_data, sep='\t')
                if 'amplitude' in _data.columns:
                    if (_data['amplitude'].astype(int) == 1).all() and \
                            'trial_type' in _data.columns:
                        msg = ("Column 'amplitude' with constant value 1 "
                               "is unnecessary in event files; ignoring it.")
                        _data = _data.drop('amplitude', axis=1)
                    else:
                        msg = ("Column name 'amplitude' is reserved; "
                               "renaming it to 'amplitude_'.")
                        _data = _data.rename(
                            columns={'amplitude': 'amplitude_'})
                    warnings.warn(msg)

                _data = _data.replace('n/a', np.nan)  # Replace BIDS' n/a
                _data = _data.apply(pd.to_numeric, errors='ignore')

                _cols = columns or list(set(_data.columns.tolist()) -
                                        {'onset', 'duration'})

                # Construct a DataFrame for each extra column
                for col in _cols:
                    df = _data[['onset', 'duration']].copy()
                    df['amplitude'] = _data[col].values

                    # Add in all of the run's entities as new columns for
                    # index
                    for entity, value in entities.items():
                        if entity in ALL_ENTITIES:
                            df[entity] = value

                    if drop_na:
                        df = df.dropna(subset=['amplitude'])

                    if df.empty:
                        continue

                    var = SparseRunVariable(
                        name=col, data=df, run_info=run_info,
                        source='events')
                    run.add_variable(var)

        # Process confound files
        if regressors:
            sub_ents = {k: v for k, v in entities.items()
                        if k in BASE_ENTITIES}
            confound_files = layout.get(suffix=['regressors', 'timeseries'],
                                        scope=scope, extension='.tsv',
                                        **sub_ents)
            for cf in confound_files:
                _data = pd.read_csv(cf.path, sep='\t', na_values='n/a')
                if columns is not None:
                    conf_cols = list(set(_data.columns) & set(columns))
                    _data = _data.loc[:, conf_cols]
                for col in _data.columns:
                    sr = 1. / run.repetition_time
                    var = DenseRunVariable(name=col, values=_data[[col]],
                                           run_info=run_info,
                                           source='regressors',
                                           sampling_rate=sr)
                    run.add_variable(var)

        # Process recording files
        rec_types = []
        if physio:
            rec_types.append('physio')
        if stim:
            rec_types.append('stim')

        if rec_types:
            rec_files = layout.get_nearest(
                img_f, extension='.tsv.gz', all_=True, suffix=rec_types,
                ignore_strict_entities=['suffix', 'extension'],
                full_search=True)
            for rf in rec_files:
                metadata = layout.get_metadata(rf)
                if not metadata:
                    raise ValueError("No .json sidecar found for '%s'." % rf)
                data = pd.read_csv(rf, sep='\t')
                freq = metadata['SamplingFrequency']
                st = metadata['StartTime']
                rf_cols = metadata['Columns']
                data.columns = rf_cols

                # Filter columns if user passed names
                if columns is not None:
                    rf_cols = list(set(rf_cols) & set(columns))
                    data = data.loc[:, rf_cols]

                n_cols = len(rf_cols)
                if not n_cols:
                    continue

                # Keep only in-scan samples: a negative StartTime means the
                # recording began before the scan, so drop the pre-scan rows
                if st < 0:
                    start_ind = int(np.floor(-st * freq))
                    values = data.values[start_ind:, :]
                else:
                    values = data.values

                # A positive StartTime means the recording began after scan
                # onset; zero-pad the front so samples align with the scan
                if st > 0:
                    n_pad = int(freq * st)
                    pad = np.zeros((n_pad, n_cols))
                    values = np.r_[pad, values]

                # Truncate or zero-pad the end to exactly match run duration
                n_rows = int(run.duration * freq)
                if len(values) > n_rows:
                    values = values[:n_rows, :]
                elif len(values) < n_rows:
                    pad = np.zeros((n_rows - len(values), n_cols))
                    values = np.r_[values, pad]

                df = pd.DataFrame(values, columns=rf_cols)
                source = 'physio' if '_physio.tsv' in rf else 'stim'
                for col in df.columns:
                    var = DenseRunVariable(name=col, values=df[[col]],
                                           run_info=run_info, source=source,
                                           sampling_rate=freq)
                    run.add_variable(var)
    return dataset


def _load_tsv_variables(layout, suffix, dataset=None, columns=None,
                        prepend_type=False, scope='all', regex_search=None,
                        **selectors):
    """Reads variables from scans.tsv, sessions.tsv, and participants.tsv.

    Parameters
    ----------
    layout : :obj:`bids.layout.BIDSLayout`
        The BIDSLayout to use.
    suffix : str
        The suffix of the files to read from. Must be one of 'scans',
        'sessions', or 'participants'.
    dataset : NodeIndex
        A BIDS NodeIndex container. If None, a new one is initialized.
    columns : list
        Optional list of names specifying which columns in the files to
        return. If None, all columns are returned.
    prepend_type : bool
        If True, variable names are prepended with the type name (e.g.,
        'age' becomes 'participants.age').
    scope : str or list
        The scope of the space to search for variables. See docstring
        for BIDSLayout for details and valid predefined values.
    regex_search : bool
        Whether to use regex search when filtering rows on the passed
        selectors. If None, falls back on the BIDSLayout's own setting.
    selectors : dict
        Optional keyword arguments passed onto the BIDSLayout instance's
        get() method; can be used to constrain which data are loaded.

    Returns
    -------
    A NodeIndex instance.
    """
    if regex_search is None:
        regex_search = layout.regex_search

    # Sanitize the selectors: only keep entities at current level or above
    valid_entities_map = {
        'scans': ['subject', 'session'],
        'sessions': ['subject'],
        'participants': []
    }
    valid_entities = valid_entities_map[suffix]
    layout_kwargs = {k: v for k, v in selectors.items()
                     if k in valid_entities}

    if dataset is None:
        dataset = NodeIndex()

    files = layout.get(extension='.tsv', suffix=suffix, scope=scope,
                       **layout_kwargs)

    for f in files:

        _data = f.get_df(include_timing=False)

        # Entities can be defined either within the first column of the
        # .tsv file (for entities that vary by row), or from the full file
        # path (for entities constant over all rows in the file). We
        # extract both and store them in the main DataFrame alongside
        # other variables (as they'll be extracted when the BIDSVariable
        # is initialized anyway).
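        # Illustrative (hypothetical) example: for a file at
        # sub-01/ses-02/sub-01_ses-02_scans.tsv, f.entities contributes
        # {'subject': '01', 'session': '02'} for every row, while the
        # per-row 'filename' column (handled below for scans.tsv) supplies
        # entities that vary across rows.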
        for ent_name, ent_val in f.entities.items():
            if ent_name in ALL_ENTITIES:
                _data[ent_name] = ent_val

        # Handling is a bit more convoluted for scans.tsv, because the
        # first column contains the run filename, which we also need to
        # parse.
        if suffix == 'scans':

            # Suffix is guaranteed to be present in each filename, so drop
            # the constant column with value 'scans' to make way for it
            # and prevent two 'suffix' columns.
            _data.drop(columns=['suffix'], inplace=True)

            image = _data['filename']
            _data = _data.drop('filename', axis=1)
            dn = f._dirname
            paths = [str(dn / p) for p in image.values]
            ent_recs = [dict(layout.files[p].entities) for p in paths
                        if p in layout.files]
            ent_cols = pd.DataFrame.from_records(ent_recs)

            # Remove entity columns found in both DFs
            dupes = list(set(ent_cols.columns) & set(_data.columns))
            to_drop = ['extension'] + dupes
            ent_cols.drop(columns=to_drop, inplace=True)

            _data = pd.concat([_data, ent_cols], axis=1, sort=True)

        # The BIDS spec requires ID columns to be named 'session_id',
        # 'participant_id', etc., and IDs to begin with entity prefixes
        # (e.g., 'sub-01'). To ensure consistent internal handling, we
        # strip these suffixes and prefixes.
        elif suffix == 'sessions':
            _data = _data.rename(columns={'session_id': 'session'})
            _data['session'] = _data['session'].str.replace('ses-', '')
        elif suffix == 'participants':
            _data = _data.rename(columns={'participant_id': 'subject'})
            _data['subject'] = _data['subject'].str.replace('sub-', '')

        def make_patt(x, regex_search=False):
            patt = '%s' % x
            if isinstance(x, (int, float)):
                # Allow for leading zeros if a number was specified,
                # regardless of regex_search
                patt = '0*' + patt
            if not regex_search:
                patt = '^%s$' % patt
            return patt

        # Filter rows on all selectors
        comm_cols = list(set(_data.columns) & set(selectors.keys()))
        for col in comm_cols:
            vals = listify(selectors.get(col))
            if regex_search and any(isinstance(val, str) for val in vals):
                patt = '|'.join(make_patt(x, regex_search=True)
                                for x in vals)
                _data = _data[_data[col].str.contains(patt)]
            else:
                _data = _data[_data[col].isin(vals)]

        level = {'scans': 'session', 'sessions': 'subject',
                 'participants': 'dataset'}[suffix]
        node = dataset.get_or_create_node(level, f.entities)

        ent_cols = list(set(ALL_ENTITIES) & set(_data.columns))
        amp_cols = list(set(_data.columns) - set(ent_cols))

        if columns is not None:
            amp_cols = list(set(amp_cols) & set(columns))

        for col_name in amp_cols:

            # Rename columns: values must be in 'amplitude'
            df = _data.loc[:, [col_name] + ent_cols]
            df.columns = ['amplitude'] + ent_cols

            if prepend_type:
                col_name = '%s.%s' % (suffix, col_name)

            node.add_variable(SimpleVariable(name=col_name, data=df,
                                             source=suffix))

    return dataset
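
# A minimal sketch of loading dataset-level variables through the public
# wrapper above (the `bids_root` path is hypothetical):
#
#     from bids.layout import BIDSLayout
#     layout = BIDSLayout(bids_root)
#     index = load_variables(layout, types=['participants'])
#     dataset_node = index.get_nodes('dataset')[0]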