#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Victor Calderon
# Created : 2018-05-17
# Last Modified: 2018-05-31
from __future__ import absolute_import, division, print_function
__author__ = ['Victor Calderon']
__copyright__ = ["Copyright 2018 Victor Calderon"]
__email__ = ['victor.calderon@vanderbilt.edu']
__maintainer__ = ['Victor Calderon']
__all__ = ["data_preprocessing",
"train_test_dataset",
"scoring_methods"]
# Importing modules
import scipy
import numpy as np
import pandas as pd
from cosmo_utils.utils import file_utils as fd
from cosmo_utils.utils import gen_utils as gu
from cosmo_utils.custom_exceptions import LSSUtils_Error
# ML modules
import sklearn.metrics as skmetrics
import sklearn.model_selection as skms
import sklearn.preprocessing as skpre
# Extra modules
## Functions
# Data preprocessing
def data_preprocessing(feat_arr, pre_opt='min_max', reshape=False):
"""
    Preprocesses the input data in order to clean it and make it more
    suitable for machine learning algorithms.
Parameters
-----------
feat_arr : `numpy.ndarray`, `list`, `pandas.DataFrame`
Array of feature values. This array is used for training a
ML algorithm.
pre_opt : {'min_max', 'standard', 'normalize', 'no'} `str`, optional
Type of preprocessing to do on `feat_arr`.
Options:
            - 'min_max' : Scales `feat_arr` to values in the range (0, 1)
            - 'standard' : Uses the `~sklearn.preprocessing.StandardScaler` method
- 'normalize' : Uses the `~sklearn.preprocessing.Normalizer` method
- 'no' : No preprocessing on `feat_arr`
reshape : `bool`, optional
        If True, it reshapes `feat_arr` into a 1d array if its shape is
        equal to (ncols, 1), where `ncols` is the number of columns.
This variable is set to `False` by default.
Returns
-----------
feat_arr_scaled : `numpy.ndarray`
Rescaled version of `feat_arr` based on the choice of `pre_opt`.
Notes
-----------
For more information on how to pre-process your data, see
`http://scikit-learn.org/stable/modules/preprocessing.html`_.
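    Examples
    -----------
    A minimal usage sketch; the feature values below are illustrative
    only, and the printed output is indicative rather than exact.

    >>> import numpy as np
    >>> feat_arr = np.array([[1., 10.], [2., 20.], [3., 30.]])
    >>> data_preprocessing(feat_arr, pre_opt='min_max') # doctest: +SKIP
    array([[0. , 0. ],
           [0.5, 0.5],
           [1. , 1. ]])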
"""
file_msg = fd.Program_Msg(__file__)
## Checking input parameters
# `feat_arr`
feat_arr_type_valid = (list, np.ndarray, pd.DataFrame)
if not (isinstance(feat_arr, feat_arr_type_valid)):
msg = '{0} `feat_arr` ({1}) is not a valid input type'.format(
file_msg, type(feat_arr))
raise LSSUtils_Error(msg)
# `pre_opt`
pre_opt_valid = ['min_max', 'standard', 'normalize', 'no']
if not (pre_opt in pre_opt_valid):
msg = '{0} `pre_opt` ({1}) is not a valid input'.format(
file_msg, pre_opt)
raise LSSUtils_Error(msg)
##
## Reshaping `feat_arr`
if reshape:
feat_arr = gu.reshape_arr_1d(feat_arr)
##
## Scaling `feat_arr`
if (pre_opt == 'min_max'):
# Scaler
scaler = skpre.MinMaxScaler(feature_range=(0, 1))
# Rescaling
feat_arr_scaled = scaler.fit_transform(feat_arr)
## Standardize Data
if pre_opt == 'standard':
# Scaler
scaler = skpre.StandardScaler().fit(feat_arr)
# Rescaling
feat_arr_scaled = scaler.transform(feat_arr)
## Normalize Data
if pre_opt == 'normalize':
# Scaler
scaler = skpre.Normalizer().fit(feat_arr)
# Rescaling
feat_arr_scaled = scaler.transform(feat_arr)
## No Preprocessing
if pre_opt == 'no':
feat_arr_scaled = feat_arr
return feat_arr_scaled
# Train-Test Data Split
def train_test_dataset(pred_arr, feat_arr, pre_opt='min_max',
shuffle_opt=True, random_state=0, test_size=0.25, reshape=False,
return_idx=False):
"""
    Creates the training and testing datasets for a given feature array
    and predicted array.
Parameters
-----------
    pred_arr : `pandas.DataFrame`, `numpy.ndarray`, or array-like, shape (n_samples, n_outcomes)
Array consisting of the `predicted values`. The dimensions of
`pred_arr` are `n_samples` by `n_outcomes`, where `n_samples` is the
number of observations, and `n_outcomes` the number of predicted
outcomes.
feat_arr : `numpy.ndarray`, `pandas.DataFrame` or array-like, shape (n_samples, n_features)
        Array consisting of the `feature values`. The dimensions of
`feat_arr` are `n_samples` by `n_features`, where `n_samples`
is the number of observations, and `n_features` the number of
features used.
pre_opt : {'min_max', 'standard', 'normalize', 'no'} `str`, optional
Type of preprocessing to do on `feat_arr`.
Options:
            - 'min_max' : Scales `feat_arr` to values in the range (0, 1)
            - 'standard' : Uses the `sklearn.preprocessing.StandardScaler` method
- 'normalize' : Uses the `sklearn.preprocessing.Normalizer` method
- 'no' : No preprocessing on `feat_arr`
shuffle_opt : `bool`, optional
If True, the data is shuffled before splitting into testing and
training datasets. This variable is set to True by default.
random_state : int, optional
Random state number used for when splitting into training and
testing datasets. If set, it will always have the same seed
`random_state`. This variable is set to `0` by default.
test_size : float, optional
        Fraction of the catalogue assigned to the `testing` dataset.
        This variable must be between (0,1).
This variable is set to `0.25` by default.
reshape : `bool`, optional
        If True, it reshapes `feat_arr` into a 1d array if its shape is
        equal to (ncols, 1), where `ncols` is the number of columns.
This variable is set to `False` by default.
return_idx : `bool`, optional
If `True`, it returns the indices of the `training` and `testing`
datasets. This variable is set to `False` by default.
Returns
-----------
train_dict : `dict`
Dictionary containing the `training` data from the catalogue.
test_dict : `dict`
Dictionary containing the `testing` data from the catalogue.
See also
-----------
data_preprocessing : Function to preprocess a dataset.
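    Examples
    -----------
    A minimal sketch of splitting an illustrative dataset into training
    and testing dictionaries; the array shapes below are for
    demonstration only.

    >>> import numpy as np
    >>> feat_arr = np.random.rand(100, 3)  # 100 samples, 3 features
    >>> pred_arr = np.random.rand(100)     # 100 predicted outcomes
    >>> train_dict, test_dict = train_test_dataset(pred_arr, feat_arr,
    ...     pre_opt='standard', test_size=0.25) # doctest: +SKIP
    >>> train_dict['X_train'].shape # doctest: +SKIP
    (75, 3)
    >>> test_dict['X_test'].shape # doctest: +SKIP
    (25, 3)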
"""
file_msg = fd.Program_Msg(__file__)
## Checking input parameters
# `pred_arr`
pred_arr_type_valid = (list, np.ndarray, pd.DataFrame)
if not (isinstance(pred_arr, pred_arr_type_valid)):
msg = '{0} `pred_arr` ({1}) is not a valid input type'.format(
file_msg, type(pred_arr))
raise LSSUtils_Error(msg)
# `feat_arr`
feat_arr_type_valid = (list, np.ndarray, pd.DataFrame)
if not (isinstance(feat_arr, feat_arr_type_valid)):
msg = '{0} `feat_arr` ({1}) is not a valid input type'.format(
file_msg, type(feat_arr))
raise LSSUtils_Error(msg)
# `pre_opt`
pre_opt_valid = ['min_max', 'standard', 'normalize', 'no']
if not (pre_opt in pre_opt_valid):
msg = '{0} `pre_opt` ({1}) is not a valid input'.format(
file_msg, pre_opt)
raise LSSUtils_Error(msg)
# `shuffle_opt`
shuffle_opt_type_valid = (bool)
if not (isinstance(shuffle_opt, shuffle_opt_type_valid)):
msg = '{0} `shuffle_opt` ({1}) is not a valid input type'.format(
file_msg, type(shuffle_opt))
raise LSSUtils_Error(msg)
# `random_state`
random_state_type_valid = (int)
if not (isinstance(random_state, random_state_type_valid)):
msg = '{0} `random_state` ({1}) is not a valid input'.format(
file_msg, random_state)
raise LSSUtils_Error(msg)
# `test_size`
if not ((test_size > 0) and (test_size < 1.)):
msg = '{0} `test_size` ({1}) must be in range (0,1)'.format(
file_msg, test_size)
raise LSSUtils_Error(msg)
##
## Checking indices of `pred_arr` and `feat_arr`
if return_idx:
# If object is a DataFrame
if ( isinstance(pred_arr, pd.DataFrame) and
isinstance(feat_arr, pd.DataFrame)):
pred_arr_idx = pred_arr.index.values
feat_arr_idx = feat_arr.index.values
else:
pred_arr_idx = np.arange(len(pred_arr))
feat_arr_idx = np.arange(len(feat_arr))
# Reshaping if necessary
if reshape:
pred_arr_idx = gu.reshape_arr_1d(pred_arr_idx)
feat_arr_idx = gu.reshape_arr_1d(feat_arr_idx)
##
## Checking dimensions of `pred_arr` and `feat_arr`
pred_arr = np.asarray(pred_arr)
feat_arr = np.asarray(feat_arr)
# Dimensions
if reshape:
pred_arr = gu.reshape_arr_1d(pred_arr)
feat_arr = gu.reshape_arr_1d(feat_arr)
# Shape
if (len(pred_arr) != len(feat_arr)):
        msg = '{0} `pred_arr` (length: {1}) and `feat_arr` (length: {2}) '
        msg += 'must have the same length'
msg = msg.format(file_msg, len(pred_arr), len(feat_arr))
raise LSSUtils_Error(msg)
##
## Rescaling Dataset
feat_arr_scaled = data_preprocessing(feat_arr, pre_opt=pre_opt,
reshape=reshape)
##
## Splitting into `Training` and `Testing` datasets.
# Scaled
( X_train, X_test,
Y_train, Y_test) = skms.train_test_split( feat_arr_scaled,
pred_arr,
test_size=test_size,
shuffle=shuffle_opt,
random_state=random_state)
# Not-scaled
( X_train_ns, X_test_ns,
Y_train_ns, Y_test_ns) = skms.train_test_split( feat_arr,
pred_arr,
test_size=test_size,
shuffle=shuffle_opt,
random_state=random_state)
# Returning indices if necessary
if return_idx:
# Splitting to `training` and `testing`
( X_train_idx, X_test_idx,
Y_train_idx, Y_test_idx) = skms.train_test_split(
feat_arr_idx,
pred_arr_idx,
test_size=test_size,
shuffle=shuffle_opt,
random_state=random_state)
if not (np.array_equal(X_train_idx, Y_train_idx) and
np.array_equal(X_test_idx, Y_test_idx)):
            msg = '{0} Index arrays are not equal to each other!'.format(
                file_msg)
raise LSSUtils_Error(msg)
##
## Assigning `training` and `testing` datasets to dictionaries
# Saving indices if necessary
if return_idx:
# Adding 'indices' to dictionaries
train_dict = { 'X_train': X_train, 'Y_train': Y_train,
'X_train_ns': X_train_ns, 'Y_train_ns': Y_train_ns,
'train_idx': X_train_idx}
test_dict = {'X_test': X_test, 'Y_test': Y_test,
'X_test_ns': X_test_ns, 'Y_test_ns': Y_test_ns,
'test_idx': X_test_idx}
else:
train_dict = { 'X_train': X_train, 'Y_train': Y_train,
'X_train_ns': X_train_ns, 'Y_train_ns': Y_train_ns}
test_dict = {'X_test': X_test, 'Y_test': Y_test,
'X_test_ns': X_test_ns, 'Y_test_ns': Y_test_ns}
return train_dict, test_dict
# Scoring methods
def scoring_methods(truth_arr, feat_arr=None, pred_arr=None, model=None,
score_method='perc', threshold=0.1, perc=0.68):
"""
    Determines the overall score for given arrays, i.e. the `predicted`
    array and the `truth` array.
Parameters
-----------
truth_arr : `numpy.ndarray` or array-like, shape (n_samples, n_outcomes)
Array consisting of the `true` values for the `n_samples`
observations. The dimensions of `truth_arr` are
`n_samples` by `n_outcomes`, where `n_samples` is the
number of observations, and `n_outcomes` the number of predicted
outcomes.
feat_arr : `numpy.ndarray`, array-like, or `NoneType`, shape (n_samples, n_features)
        Array consisting of the `feature values`. The dimensions of
`feat_arr` are `n_samples` by `n_features`, where `n_samples`
is the number of observations, and `n_features` the number of
features used. This variable is set to `None` by default.
pred_arr : `numpy.ndarray`, array-like, or `NoneType`, shape (n_samples, n_outcomes)
Array of predicted values from `feat_arr`. If ``model == None``,
this variable must be an array-like object. If ``model != None``,
this variable will not be used, and will be calculated using
the `model` object. This variable is set to `None` by default.
model : scikit-learn model object or `NoneType`
Model used to estimate the score if ``score_method == 'model_score'``
This variable is set to `None` by default.
score_method : {'perc', 'threshold', 'model_score', 'r2'} `str`, optional
Type of scoring to use when determining how well an algorithm
is performing.
Options:
- 'perc' : Use percentage and rank-ordering of the values
            - 'threshold' : Score based on differences of `threshold` or less from the true value.
            - 'model_score' : Out-of-the-box method from `sklearn` to determine success.
            - 'r2' : R-squared statistic for error calculation.
    threshold : float, optional
        Value to use when calculating the error within `threshold` value
        from the truth. This variable is set to `0.1` by default.
    perc : float, optional
        Percentile value, in the range [0, 1], used when determining the
        score. This variable is set to `0.68` by default.
Returns
-----------
method_score : float
Overall score from `pred_arr` to predict `truth_arr`.
Notes
-----------
    For more information on how to evaluate and score your models, see
`http://scikit-learn.org/stable/modules/model_evaluation.html`_.
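    Examples
    -----------
    A minimal sketch of scoring a set of predictions against the true
    values; the arrays and scores below are illustrative only.

    >>> import numpy as np
    >>> truth_arr = np.array([1.00, 2.00, 3.00, 4.00])
    >>> pred_arr  = np.array([1.05, 1.95, 3.20, 3.90])
    >>> # Fraction of predictions within `threshold` of the true values
    >>> scoring_methods(truth_arr, pred_arr=pred_arr,
    ...     score_method='threshold', threshold=0.1) # doctest: +SKIP
    0.75
    >>> # R-squared statistic between predictions and true values
    >>> scoring_methods(truth_arr, pred_arr=pred_arr,
    ...     score_method='r2') # doctest: +SKIP
    0.989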
"""
file_msg = fd.Program_Msg(__file__)
## Checking input parameters
# `feat_arr`
feat_arr_type_valid = (list, np.ndarray, type(None))
if not (isinstance(feat_arr, feat_arr_type_valid)):
msg = '{0} `feat_arr` ({1}) is not a valid input type'.format(
file_msg, type(feat_arr))
raise LSSUtils_Error(msg)
# `truth_arr`
truth_arr_type_valid = (list, np.ndarray)
if not (isinstance(truth_arr, truth_arr_type_valid)):
msg = '{0} `truth_arr` ({1}) is not a valid input type'.format(
file_msg, type(truth_arr))
raise LSSUtils_Error(msg)
# `score_method` - Type
score_method_type_valid = (str)
if not (isinstance(score_method, score_method_type_valid)):
msg = '{0} `score_method` ({1}) is not a valid input type'.format(
file_msg, type(score_method))
raise LSSUtils_Error(msg)
# `score_method` - Value
score_method_valid = ['perc', 'threshold', 'model_score', 'r2']
if not (score_method in score_method_valid):
msg = '{0} `score_method` ({1}) is not a valid input!'.format(
file_msg, score_method)
        raise LSSUtils_Error(msg)
# `threshold` - Type
threshold_valid = (float, int)
if not (isinstance(threshold, threshold_valid)):
msg = '{0} `threshold` ({1}) is not a valid input type'.format(
file_msg, type(threshold))
raise LSSUtils_Error(msg)
# `threshold` - Value
if not (threshold >= 0.):
        msg = '{0} `threshold` ({1}) must be larger than or equal to 0!'.format(
file_msg, threshold)
raise LSSUtils_Error(msg)
##
## Checking for `model`, `pred_arr` and `feat_arr`
# If both are none
if ((model is None) and (pred_arr is None)):
msg = '{0} `model` and `pred_arr` cannot both be `None`. '
msg += 'Only one can be `None`'
msg = msg.format(file_msg)
raise LSSUtils_Error(msg)
# If `feat_arr` and `pred_arr` are `None`
if ((feat_arr is None) and (pred_arr is None)):
msg = '{0} `feat_arr` and `pred_arr` cannot both be `None`'.format(
file_msg)
raise TypeError(msg)
# `pred_arr` - Type
    pred_arr_valid = (list, np.ndarray)
if (model is None):
if not (isinstance(pred_arr, pred_arr_valid)):
msg = '{0} `pred_arr` ({1}) is not a valid input type!'.format(
file_msg, type(pred_arr))
raise LSSUtils_Error(msg)
##
## Choosing scoring method
# Percentile method
if (score_method == 'perc'):
# Checking for `pred_arr`
if (pred_arr is None):
pred_arr = model.predict(feat_arr)
# Checking for `model`
if (model is None):
pred_arr = np.asarray(pred_arr)
        # Error calculation
pred_err = np.abs(pred_arr - truth_arr)
method_score = scipy.stats.scoreatpercentile(pred_err, 100. * perc)
# Threshold method
if (score_method == 'threshold'):
# Checking for `pred_arr`
if (pred_arr is None):
pred_arr = model.predict(feat_arr)
# Checking for `model`
if (model is None):
pred_arr = np.asarray(pred_arr)
        # Error calculation
pred_err = np.abs(pred_arr - truth_arr)
pred_thresh = len(pred_err[pred_err <= threshold])
method_score = pred_thresh / len(pred_arr)
# R-squared method
if (score_method == 'r2'):
# Checking for `pred_arr`
if (pred_arr is None):
pred_arr = model.predict(feat_arr)
# Checking for `model`
if (model is None):
pred_arr = np.asarray(pred_arr)
        # Error calculation
method_score = skmetrics.r2_score(truth_arr, pred_arr)
# Model method
if (score_method == 'model_score'):
method_score = model.score(feat_arr, truth_arr)
return method_score