# Source code for ccobra.benchmark.evaluator

""" CCOBRA evaluation module.


import copy
import logging
import time

import numpy as np
import pandas as pd

from ..model import CCobraModel

from . import contextmanager
from . import modelimporter

# Initialize module-level logger
logger = logging.getLogger(__name__)

class Evaluator():
    """ CCOBRA evaluation routine.

    Prepares the benchmark's datasets on construction and, via
    :meth:`evaluate`, queries every benchmark model for predictions on the
    test data.

    """

    def __init__(self, benchmark, is_silent=False, cache_df=None):
        """ Initializes the evaluator object by preparing the data
        representations and precomputing the required training and adaption
        steps.

        Parameters
        ----------
        benchmark : ccobra.Benchmark
            Benchmark container.

        is_silent : bool, optional
            Flag indicating that output is supposed to be suppressed.

        cache_df : pandas.DataFrame, optional
            Cache result dataframe.

        """

        logger.info('Setting up evaluator...')

        # Store the information
        self.benchmark = benchmark
        self.is_silent = is_silent
        self.cache_df = cache_df

        # Extract the dataset information
        self.dict_test = benchmark.data_test.to_eval_dict()
        self.dict_pre_train = None
        self.dict_pre_train_person = None
        self.dict_pre_person_background = None

        if benchmark.data_pre_train is not None:
            logger.debug('Supplied training data to evaluation.')
            self.dict_pre_train = benchmark.data_pre_train.to_eval_dict()

        if benchmark.data_pre_train_person is not None:
            logger.debug('Supplied person training data to evaluation.')
            self.dict_pre_train_person = benchmark.data_pre_train_person.to_eval_dict()

        if benchmark.data_pre_person_background is not None:
            logger.debug('Supplied person background data to evaluation.')
            self.dict_pre_person_background = \
                benchmark.data_pre_person_background.to_eval_dict()

        # Coverage benchmarks use the test data itself as person training
        # data, overriding any person training data supplied above.
        if benchmark.type == 'coverage':
            self.dict_pre_train_person = self.dict_test

        # Extract the functionality to apply
        has_pre_train = self.dict_pre_train is not None
        self.do_adapt = (benchmark.type == 'adaption')
        self.do_pre_train_global = has_pre_train and not benchmark.corresponding_data
        self.do_pre_train_leaveoneout = has_pre_train and benchmark.corresponding_data
        self.do_pre_train_person = (self.dict_pre_train_person is not None)
        self.do_pre_person_background = (self.dict_pre_person_background is not None)

        logger.debug('Evaluation ready:')
        logger.debug(' do_adapt: %s', self.do_adapt)
        logger.debug(' do_pre_train_global: %s', self.do_pre_train_global)
        logger.debug(' do_pre_train_leaveoneout: %s', self.do_pre_train_leaveoneout)
        logger.debug(' do_pre_train_person: %s', self.do_pre_train_person)
        logger.debug(' do_pre_person_background: %s', self.do_pre_person_background)

    def evaluate(self):
        """ Core evaluation routine.

        Returns
        -------
        tuple(pd.DataFrame, dict)
            Pandas dataframe containing the evaluation results (concatenated
            with the cache dataframe if one was supplied) and a dictionary
            mapping model names to the per-subject logging information the
            models stored in ``end_participant``.

        """

        logger.info('Starting evaluation routine...')

        model_logging_results = {}

        # Names already present in the cache count as taken, so that freshly
        # evaluated models never collide with cached results.
        model_name_cache = set()
        if self.cache_df is not None:
            model_name_cache = set(self.cache_df['model'].unique())

        n_models = len(self.benchmark.models)
        for model_idx, modelinfo in enumerate(self.benchmark.models):
            # Print the progress
            log_str = "Evaluating '{}' ({}/{})...".format(
                modelinfo.path, model_idx + 1, n_models)
            logger.debug('=' * 80)
            logger.info(log_str)
            logger.debug('=' * 80)

            if not self.is_silent:
                print(log_str)

            # Initialize the dictionary for the models logging output
            model_logging_dict = {}

            # Setup model context
            with contextmanager.dir_context(modelinfo.path):
                # Dynamically import the CCOBRA model
                importer = modelimporter.ModelImporter(
                    modelinfo.path,
                    CCobraModel,
                    load_specific_class=modelinfo.load_specific_class
                )

                # Instantiate and prepare the model for predictions
                pre_model = importer.instantiate(modelinfo.args)
                pre_model.setup_environment(self.benchmark.type)

                # Check if model is applicable to domains/response types
                self.check_model_applicability(pre_model)

                # Only use the model's name if no override is specified
                model_name = modelinfo.override_name
                if not model_name:
                    model_name = pre_model.name

                # Ensure that names are unique
                model_name = self._unique_model_name(model_name, model_name_cache)

                # Only perform general pre-training if training data is
                # supplied and corresponding data is false. Otherwise, the
                # model has to be re-trained for each subject.
                if self.do_pre_train_global:
                    logger.debug('General pre-training for %s...', model_name)
                    pre_model.pre_train(list(self.dict_pre_train.values()))

                # Iterate subject
                for subj_key_identifier, subj_data in self.dict_test.items():
                    subj_id, model_log = self._evaluate_subject(
                        pre_model, model_name, subj_key_identifier, subj_data)
                    if model_log:
                        model_logging_dict[subj_id] = model_log

                # Save the models logging information if available
                if model_logging_dict:
                    model_logging_results[model_name] = model_logging_dict

                # Unload the imported model and its dependencies. Might cause
                # garbage collection issues
                importer.unimport()

        res_df = self._collect_results()

        # Integrate cache
        if self.cache_df is None:
            logger.debug('Empty cache. Returning only result dataframe.')
            return res_df, model_logging_results

        if res_df.empty:
            logger.debug('Empty result dataframe. Returning cache only.')
            return self.cache_df, {}

        logger.debug('Merging cache and result dataframe...')
        assert sorted(list(res_df)) == sorted(list(self.cache_df)), 'Incompatible cache'
        return pd.concat([res_df, self.cache_df]), model_logging_results

    @staticmethod
    def _unique_model_name(model_name, model_name_cache):
        """ Returns a name not yet contained in the cache by appending
        apostrophes, registers it in the cache and warns on changes.

        """

        original_model_name = model_name
        while model_name in model_name_cache:
            model_name = model_name + '\''
        model_name_cache.add(model_name)

        if model_name != original_model_name:
            logger.warning(
                'Duplicate model name detected ("%s"). Changed to "%s".',
                original_model_name, model_name
            )

        return model_name

    def _evaluate_subject(self, pre_model, model_name, subj_key_identifier, subj_data):
        """ Runs a fresh copy of the model through all tasks of one subject
        and returns the subject id together with the model's log dictionary.

        """

        start_subject = time.time()
        subj_id = subj_data[0]['item'].identifier

        # Every subject starts from the same (globally pre-trained) state
        model = copy.deepcopy(pre_model)

        # Set the model to new participant
        model.start_participant(id=subj_id)

        # Perform pre-training for individual subjects only if
        # corresponding data is set to true
        if self.do_pre_train_leaveoneout:
            logger.debug('Individual pre-training for %s...', model_name)
            # NOTE(review): exclusion compares the training dict keys with
            # subj_id, while the lookups below use subj_key_identifier --
            # confirm both refer to the same key space.
            cur_train_data = [
                value for key, value in self.dict_pre_train.items() if key != subj_id]
            model.pre_train(cur_train_data)

        # Perform background fitting
        if self.do_pre_person_background:
            logger.debug('Person background training for %s...', model_name)
            cur_train_data = self.dict_pre_person_background.get(subj_key_identifier, [])
            model.pre_person_background(cur_train_data)

        # Perform person training
        if self.do_pre_train_person:
            logger.debug('Person training for %s...', model_name)
            subj_person_train_data = self.dict_pre_train_person.get(subj_key_identifier, [])
            model.pre_train_person(subj_person_train_data)

        # Iterate over individual tasks
        start_eval = time.time()
        for task_idx, task in enumerate(subj_data):
            start_task = time.time()
            logger.debug('Querying for task %s/%s...', task_idx + 1, len(subj_data))

            # Integrity checks
            assert task['item'].identifier == subj_id

            # Query models for predictions
            for eh in self.benchmark.evaluation_handlers:
                target = task[eh.data_column]
                eh.predict(model, model_name, task['item'], target, task['aux'])

            # Perform model adaption (only after all handlers have queried
            # their predictions for this task)
            if self.do_adapt:
                for eh in self.benchmark.evaluation_handlers:
                    eh.adapt(model, task['item'], task['full'])

            logger.debug(
                'Task {} took {:.4f}s'.format(task_idx + 1, time.time() - start_task))

        # Finalize subject evaluation and allow the model to store parameters
        model_log = {}
        model.end_participant(subj_id, model_log)

        logger.debug('Subject evaluation took {:.4}s'.format(time.time() - start_eval))
        logger.debug('Subject {} done. took {:.4}s'.format(
            subj_id, time.time() - start_subject))

        return subj_id, model_log

    def _collect_results(self):
        """ Merges the result dataframes of all evaluation handlers into a
        single dataframe.

        """

        res_df = None
        on_list = [
            'model', 'id', 'domain', 'response_type', 'sequence', 'task', 'choices'
        ]

        for enc in self.benchmark.evaluation_handlers:
            if res_df is None:
                logger.debug('Preparing new result dataframe based on evaluation handler')
                res_df = enc.get_result_df()
            else:
                logger.debug('Adding evaluation handler result to result dataframe')
                res_df = res_df.merge(
                    enc.get_result_df(), on=on_list, suffixes=('', '_' + enc.data_column))

        # Rename score column
        return res_df.rename(columns={'score': 'score_response'})

    def check_model_applicability(self, pre_model):
        """ Verifies the applicability of a model by checking its supported
        domains and response types and comparing them with the evaluation
        dataset.

        Parameters
        ----------
        pre_model : CCobraModel
            Model to check applicability for.

        Raises
        ------
        ValueError
            Exception thrown when model is not applicable to some domains or
            response types in the test data.

        """

        data_test = self.benchmark.data_test

        missing_domains = set(data_test.domains) - set(pre_model.supported_domains)
        if missing_domains:
            raise ValueError(
                'Model {} is not applicable to domains {} found in '
                'the test dataset.'.format(pre_model.name, missing_domains))

        missing_response_types = \
            set(data_test.response_types) - set(pre_model.supported_response_types)
        if missing_response_types:
            raise ValueError(
                'Model {} is not applicable to response_types {} '
                'found in the test dataset.'.format(pre_model.name, missing_response_types))