Source code for ccobra.data

""" CCOBRA data container.

"""

import copy
import logging

from . import convert_to_basic_types
from .item import Item

# Initialize module-level logger
logger = logging.getLogger(__name__)

class CCobraData():
    """ CCOBRA experimental data container.

    """

    def __init__(self, data, target_columns):
        """ Initializes the CCOBRA data container by passing a data frame
        and validating its contents.

        Parameters
        ----------
        data : pd.DataFrame
            DataFrame to store in the CCOBRA data container. Must contain
            the required columns ['id', 'sequence', 'task', 'choices',
            'response_type', 'domain'] as well as the target columns.

        target_columns : list(str)
            List of target columns (e.g., ['response']) the models are
            evaluated on. Appended to the list of required fields.

        """

        self.target_columns = target_columns
        self.required_fields = [
            'id', 'sequence', 'task', 'choices', 'response_type', 'domain'
        ] + target_columns

        # Verify and store the data
        self.verify_data(data)
        self._data = data

        # Normalize the data container
        self.prepare_data()

        # Extract meta information
        self.n_subjects = len(self._data['_unique_id'].unique())
        self.domains = self._data['domain'].unique().tolist()
        self.response_types = self._data['response_type'].unique().tolist()
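    # Usage sketch (not part of the original module). Assuming a hypothetical
    # CSV file 'data.csv' whose columns include the required fields plus a
    # 'response' target column, a container could be built as follows:
    #
    #     import pandas as pd
    #     df = pd.read_csv('data.csv')
    #     data = CCobraData(df, target_columns=['response'])
    #     print(data.n_subjects, data.domains, data.response_types)
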
    def verify_data(self, data):
        """ Verifies that all required fields are present in the data.

        Parameters
        ----------
        data : pd.DataFrame
            DataFrame to verify.

        Raises
        ------
        ValueError
            Thrown if data does not contain the required columns.

        """

        missing = set(self.required_fields) - set(data.columns)
        if missing:
            raise ValueError(
                'Data does not contain columns: {}'.format(missing))
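    # Illustration (not in the original module): a DataFrame missing, e.g.,
    # the 'choices' column would raise
    # ValueError: Data does not contain columns: {'choices'}
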
    def prepare_data(self):
        """ Prepares the dataset by adding internally used columns.

        """

        assert '_unique_id' not in self._data

        # Add a unique subject identifier column based on the id column
        self._data['_unique_id'] = self._data['id']
    def prefix_identifiers(self, prefix='_train_'):
        """ Prefixes the subject identifier keys.

        Parameters
        ----------
        prefix : str
            Prefix to prepend to the subject identifiers.

        """

        self._data['_unique_id'] = self._data['_unique_id'].apply(
            lambda x: prefix + str(x))
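    # Illustration (not in the original module): with the default prefix, a
    # subject identifier 1 becomes the string '_train_1'. Presumably this
    # keeps the identifiers of separate datasets (e.g., training and test
    # data) disjoint when both are loaded; the original code does not state
    # this.
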
    def get(self):
        """ Returns the contained data.

        Returns
        -------
        pd.DataFrame
            Dataframe containing the data.

        """

        return self._data
    def head(self):
        """ Returns the first five rows of the dataframe (the default of
        pd.DataFrame.head).

        Returns
        -------
        pd.DataFrame
            First five rows of the contained data.

        """

        return self._data.head()
    def to_eval_dict(self):
        """ Converts the dataset to an evaluation dictionary mapping from
        individuals to their experimental data.

        Returns
        -------
        dict(object, list)
            Dictionary mapping from subject identifiers to lists of
            experimental data.

        """

        # Prepare the dictionary of subjects containing lists of tasks they
        # responded to
        df = self._data
        dataset = {}
        for subj, subj_df in df.groupby('_unique_id'):
            assert subj not in dataset

            subj_df = subj_df.sort_values('sequence')

            subj_data = []
            for _, task_series in subj_df.iterrows():
                task_dict = {}

                # Extract the task information
                item = Item(
                    task_series['id'], task_series['domain'],
                    task_series['task'], task_series['response_type'],
                    task_series['choices'], task_series['sequence'])
                task_dict['item'] = item

                # Parse the main response ('|' separates alternatives, '/'
                # separates clauses, ';' separates terms within a clause)
                responses = None
                if isinstance(task_series['response'], str):
                    responses = []
                    for response in task_series['response'].split('|'):
                        responses.append(
                            [x.split(';') for x in response.split('/')])

                    if task_series['response_type'] != 'multiple-choice':
                        responses = responses[0]
                else:
                    responses = task_series['response']
                task_dict['response'] = convert_to_basic_types(responses)

                # Parse the auxiliary targets
                for target_col in self.target_columns:
                    if target_col == 'response':
                        continue

                    if isinstance(task_series[target_col], str):
                        responses = []
                        for response in task_series[target_col].split('|'):
                            responses.append(
                                [x.split(';') for x in response.split('/')])
                    else:
                        responses = task_series[target_col]
                    task_dict[target_col] = responses

                # Add auxiliary elements from the data
                aux = {}
                for key, value in task_series.items():
                    if key not in self.required_fields + ['_unique_id']:
                        aux[key] = value
                task_dict['aux'] = aux

                task_dict['full'] = copy.deepcopy(task_dict['aux'])
                for target_col in self.target_columns:
                    task_dict['full'][target_col] = task_dict[target_col]

                subj_data.append(task_dict)

            dataset[subj] = subj_data

        return dataset
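
# Minimal usage sketch (not part of the original module). It builds a
# single-subject dataset with illustrative column values and prints the
# resulting evaluation dictionary. The task/choices/response encodings
# ('/' separates clauses, ';' separates terms, '|' separates alternatives)
# follow the parsing logic in to_eval_dict above; the concrete values are
# assumptions for demonstration only.
if __name__ == '__main__':
    import pandas as pd

    demo_df = pd.DataFrame([{
        'id': 1,
        'sequence': 0,
        'task': 'All;A;B/All;B;C',
        'choices': 'All;A;C|NVC',
        'response_type': 'single-choice',
        'domain': 'syllogistic',
        'response': 'All;A;C',
        'age': 25  # extra column, ends up in the 'aux' dictionary
    }])

    demo_data = CCobraData(demo_df, target_columns=['response'])
    eval_dict = demo_data.to_eval_dict()

    # The response string 'All;A;C' is parsed into [['All', 'A', 'C']]
    for subj, tasks in eval_dict.items():
        for task in tasks:
            print(subj, task['response'], task['aux'])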