Source code for ccobra.data

""" CCOBRA data container.

"""

import copy
import logging

from . import convert_to_basic_types
from .item import Item

# Initialize module-level logger
logger = logging.getLogger(__name__)

class CCobraData():
    """ CCOBRA experimental data container.

    """

    def __init__(self, data, target_columns):
        """ Initializes the CCOBRA data container by passing a data frame
        and validating its contents.

        Parameters
        ----------
        data : pd.DataFrame
            DataFrame to store in the CCOBRA data container. Must contain
            the required columns ['id', 'sequence', 'task', 'choices',
            'response_type', 'domain'] as well as the target columns.

        target_columns : list(str)
            List of target columns (e.g., ['response']) the models are
            evaluated on. Appended to the list of required fields.

        """

        self.target_columns = target_columns
        self.required_fields = [
            'id', 'sequence', 'task', 'choices', 'response_type', 'domain'
        ] + target_columns

        # Verify and store the data
        self.verify_data(data)
        self._data = data

        # Normalize the data container
        self.prepare_data()

        # Extract meta information
        self.n_subjects = len(self._data['_unique_id'].unique())
        self.domains = self._data['domain'].unique().tolist()
        self.response_types = self._data['response_type'].unique().tolist()
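    # Usage sketch (not part of the original module). Assuming a hypothetical
    # CSV file 'data.csv' whose columns include the required fields plus a
    # 'response' target column, a container could be built as follows:
    #
    #     import pandas as pd
    #     df = pd.read_csv('data.csv')
    #     data = CCobraData(df, target_columns=['response'])
    #     print(data.n_subjects, data.domains, data.response_types)
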
    def verify_data(self, data):
        """ Verifies that all required fields are present in the data.

        Parameters
        ----------
        data : pd.DataFrame
            DataFrame to verify.

        Raises
        ------
        ValueError
            Thrown if data does not contain the required columns.

        """

        missing = set(self.required_fields) - set(data.columns)
        if missing:
            raise ValueError(
                'Data does not contain columns: {}'.format(missing))
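    # Illustration (not in the original module): a DataFrame missing, e.g.,
    # the 'choices' column would raise
    # ValueError: Data does not contain columns: {'choices'}
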
    def prepare_data(self):
        """ Prepares the dataset by adding internally used columns.

        """

        assert '_unique_id' not in self._data

        # Add a unique subject identifier column based on the id column
        self._data['_unique_id'] = self._data['id']
    def prefix_identifiers(self, prefix='_train_'):
        """ Prefixes the subject identifier keys.

        Parameters
        ----------
        prefix : str
            Prefix to prepend to the subject identifiers.

        """

        self._data['_unique_id'] = self._data['_unique_id'].apply(
            lambda x: prefix + str(x))
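    # Illustration (not in the original module): with the default prefix, a
    # subject identifier 1 becomes the string '_train_1'. Presumably this
    # keeps the identifiers of separate datasets (e.g., training and test
    # data) disjoint when both are loaded; the original code does not state
    # this.
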
    def get(self):
        """ Returns the contained data.

        Returns
        -------
        pd.DataFrame
            Dataframe containing the data.

        """

        return self._data
    def head(self):
        """ Returns the first five rows of the dataframe (the default of
        pd.DataFrame.head).

        Returns
        -------
        pd.DataFrame
            First five rows of the contained data.

        """

        return self._data.head()
    def to_eval_dict(self):
        """ Converts the dataset to an evaluation dictionary mapping from
        individuals to their experimental data.

        Returns
        -------
        dict(object, list)
            Dictionary mapping from subject identifiers to lists of
            experimental data.

        """

        # Prepare the dictionary of subjects containing lists of tasks they
        # responded to
        df = self._data
        dataset = {}
        for subj, subj_df in df.groupby('_unique_id'):
            assert subj not in dataset

            subj_df = subj_df.sort_values('sequence')

            subj_data = []
            for _, task_series in subj_df.iterrows():
                task_dict = {}

                # Extract the task information
                item = Item(
                    task_series['id'], task_series['domain'],
                    task_series['task'], task_series['response_type'],
                    task_series['choices'], task_series['sequence'])
                task_dict['item'] = item

                # Parse the main response ('|' separates alternatives, '/'
                # separates clauses, ';' separates terms within a clause)
                responses = None
                if isinstance(task_series['response'], str):
                    responses = []
                    for response in task_series['response'].split('|'):
                        responses.append(
                            [x.split(';') for x in response.split('/')])

                    if task_series['response_type'] != 'multiple-choice':
                        responses = responses[0]
                else:
                    responses = task_series['response']
                task_dict['response'] = convert_to_basic_types(responses)

                # Parse the auxiliary targets
                for target_col in self.target_columns:
                    if target_col == 'response':
                        continue

                    if isinstance(task_series[target_col], str):
                        responses = []
                        for response in task_series[target_col].split('|'):
                            responses.append(
                                [x.split(';') for x in response.split('/')])
                    else:
                        responses = task_series[target_col]
                    task_dict[target_col] = responses

                # Add auxiliary elements from the data
                aux = {}
                for key, value in task_series.items():
                    if key not in self.required_fields + ['_unique_id']:
                        aux[key] = value
                task_dict['aux'] = aux

                task_dict['full'] = copy.deepcopy(task_dict['aux'])
                for target_col in self.target_columns:
                    task_dict['full'][target_col] = task_dict[target_col]

                subj_data.append(task_dict)

            dataset[subj] = subj_data

        return dataset
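
# Minimal usage sketch (not part of the original module). It builds a
# single-subject dataset with illustrative column values and prints the
# resulting evaluation dictionary. The task/choices/response encodings
# ('/' separates clauses, ';' separates terms, '|' separates alternatives)
# follow the parsing logic in to_eval_dict above; the concrete values are
# assumptions for demonstration only.
if __name__ == '__main__':
    import pandas as pd

    demo_df = pd.DataFrame([{
        'id': 1,
        'sequence': 0,
        'task': 'All;A;B/All;B;C',
        'choices': 'All;A;C|NVC',
        'response_type': 'single-choice',
        'domain': 'syllogistic',
        'response': 'All;A;C',
        'age': 25  # extra column, ends up in the 'aux' dictionary
    }])

    demo_data = CCobraData(demo_df, target_columns=['response'])
    eval_dict = demo_data.to_eval_dict()

    # The response string 'All;A;C' is parsed into [['All', 'A', 'C']]
    for subj, tasks in eval_dict.items():
        for task in tasks:
            print(subj, task['response'], task['aux'])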