Source code for cosmo_utils.utils.stats_funcs

#! /usr/bin/env python
# -*- coding: utf-8 -*-

# Victor Calderon
# Created      : 2018-04-28
# Last Modified: 2018-04-28
from __future__ import absolute_import, division, print_function
__author__     = ['Victor Calderon']
__copyright__  = ["Copyright 2018 Victor Calderon"]
__email__      = ['victor.calderon@vanderbilt.edu']
__maintainer__ = ['Victor Calderon']
__all__        = [  "myceil",
                    "myfloor",
                    "Bins_array_create",
                    "sigma_calcs",
                    "Stats_one_arr"]
"""
Set of statistical functions
"""

## Import modules
import math
import numpy as np
from   cosmo_utils.utils             import file_utils as fd
from   cosmo_utils.custom_exceptions import LSSUtils_Error

## Functions

# Upper-bound values
[docs]def myceil(x, base=10): """ Determines the upper-bound interger for a given number with a given base. Parameters ---------- x : float Number to be approximated to closest number to `base` base : float Base used to calculate the closest largest number Returns ---------- y : float Closest float number to `x`, i.e. upper-bound float Examples ---------- >>> myceil(12, 10) 20.0 >>> myceil(12.05, 1.) 13.0 >>> myceil(12.05, 0.5) 12.5 """ y = float(base * math.ceil(float(x)/base)) return y
## Lower-bound values
[docs]def myfloor(x, base=10): """ Determines the lower-bound interger for a given number with a given base. Parameters ---------- x : float Number to be approximated to closest number to `base` base : float Base used to calculate the closest largest number Returns ---------- y : float Closest float number to `x`, i.e. upper-bound float Examples ---------- >>> myfloor(12, 10) 10.0 >>> myfloor(12.05, 1.) 12.0 >>> myfloor(12.05, 0.2) 12.0 """ y = float(base * math.floor(float(x)/base)) return y
## Generation of bins evenly spaced out
[docs]def Bins_array_create(arr, base=10): """ Generates an evenly-spaced array between the minimum and maximum value of a given array, Parameters ---------- arr : array_like Array of of numbers or floats base : int or float, optional Interval used to create the evenly-spaced array of elements Returns ---------- bins_arr : `numpy.ndarray` Array of elements separated in intervals of `base` """ file_msg = fd.Program_Msg(__file__) # Transforming input data base = float(base) arr = np.asarray(arr) # Checking array dimensions if arr.ndim != 1: msg = '{0} The input array is not of dimension 1, but of `{1}`'.format( file_msg, arr.ndim) raise LSSUtils_Error(msg) # Creating evenly-spaced array arr_min = myfloor(arr.min(), base=base) arr_max = myceil(arr.max(), base=base) bins_arr = np.arange(arr_min, arr_max + 0.5*base, base) return bins_arr
## Calculations of percentiles and sigmas
[docs]def sigma_calcs(data_arr, type_sigma='std', perc_arr=[68., 95., 99.7], return_mean_std=False): """ Calcualates the 1-, 2-, and 3-sigma ranges for `data_arr` Parameters ----------- data_arr : `numpy.ndarray`, shape( param_dict['nrpbins'], param_dict['itern_tot']) array of values, from which to calculate percentiles or St. Dev. type_sigma : {'perc', 'std'} string, optional (default = 'std') Option for calculating either `percentiles` or `standard deviations` - 'perc': calculates percentiles - 'std' : uses standard deviations as 1-, 2-, and 3-sigmas perc_arr : array_like, optional (default = [68., 95., 99.7]) Array of percentiles to calculate return_mean_std : `bool`, optional (default = False) Option for returning mean and St. Dev. along with `sigma_dict` Return ---------- sigma_dict: python dicitionary dictionary containg the 1-, 2-, and 3-sigma upper and lower ranges for `data-arr` mark_mean: array_like array of the mean value of `data_arr`. Only returned if `return_mean_std == True` mark_std: array_like array of the St. Dev. value of `data_arr`. Only returned if `return_mean_std == True` """ file_msg = fd.Program_Msg(__file__) ## Checking input variables # `data_arr` data_arr_valid_types = (np.ndarray, list) if not (isinstance(data_arr, data_arr_valid_types)): msg = '{0} `data_arr` ({1}) is not a valid type!'.format( file_msg, type(data_arr)) raise LSSUtils_Error(msg) else: data_arr = np.asarray(data_arr) # `type_sigma` type_sigma_valid = ['perc', 'std'] if not (isinstance(type_sigma, str)): msg = '{0} `type_sigma` ({1}) is not a valid type!'.format( file_msg, type(type_sigma)) raise LSSUtils_Error(msg) if not (type_sigma in type_sigma_valid): msg = '{0} `type_sigma` ({1}) is not a valid input choice!'.format( file_msg, type_sigma) ## Determining shape of `data_arr` if data_arr.ndim == 1: axis = 0 else: axis = 1 ## Creating dictionary for saving `sigma`s sigma_dict = {} for ii in range(len(perc_arr)): sigma_dict[ii] = [] ## Using Percentiles to estimate errors if type_sigma == 'perc': for ii, perc_ii in enumerate(perc_arr): mark_lower = np.nanpercentile(data_arr, 50.-(perc_ii/2.),axis=axis) mark_upper = np.nanpercentile(data_arr, 50.+(perc_ii/2.),axis=axis) # Saving to dictionary sigma_dict[ii] = np.column_stack((mark_lower, mark_upper)).T ## Using standard deviations to estimate errors if type_sigma == 'std': mean_val = np.nanmean(data_arr, axis=axis) std_val = np.nanstd( data_arr, axis=axis) for ii in range(len(perc_arr)): mark_lower = mean_val - ((ii+1) * std_val) mark_upper = mean_val + ((ii+1) * std_val) # Saving to dictionary sigma_dict[ii] = np.column_stack((mark_lower, mark_upper)).T ## ## Estimating mean and St. Dev. of `data_arr` mark_mean = np.nanmean(data_arr, axis=axis) mark_std = np.nanstd( data_arr, axis=axis) ## Fixing values for when `axis == 0` if data_arr.ndim == 1: for ii in range(len(sigma_dict.keys())): sigma_dict[ii] = sigma_dict[ii].flatten() if return_mean_std: return sigma_dict, mark_mean, mark_std else: return sigma_dict
## Main framework for `Stats_one_arr` and `Stats_two_arr`
[docs]def Stats_one_arr(x, y, base=1., arr_len=0, arr_digit='n', weights=None, statfunc=np.nanmean, bin_statval='average', return_perc=False, failval=np.nan): """ Calculates statists for 2 arrays Parameters ---------- x, y : array_like, shape(N,) Sets of elements for the 1st and 2nd observable base : float, optional Bin width in units of `x`. This variable is set to 1. by default. arr_len : int, optional Minimum number of elements in each bin of `x` arr_digit : {'n', 'y', 'o'} str, optional Option for which elements to return. Options: - 'n' : Returns `x_stat`, `y_stat`, `y_std`, `y_std_err` - 'y' : Returns `x_stat`, `y_stat`, `y_std`, `y_std_err`, `x_bins_data`, `y_bins_data` - 'o' : Returns `x_bins_data`, `y_bins_data` weights : array_like or NoneType, optional Array of weights for values in `y`. This is set to None by default. statfunc : {`numpy.nanmean`, `numpy.nanmedian`} statistical func, optional Numerical function used to calculate on bins of data. By default, this variable is set to `numpy.nanmean` bin_statval : {'average', 'left', 'right'} str, optional Option for where to put the bin values of `x` and `y`. By default, this variable is set to `average`, which means that the values are those of the averages of the bins in `x` and `y`. return_perc : `bool`, optional If true, it also returns the `percentiles` of the data. Last item in the return list. This variable is set to False by default. failval : int, float, NoneType, or NaN, optional This is the value used when no data is available for the bin. This is set to `numpy.nan` by default Returns ---------- x_stat, y_stat : array_like Binned array of elements from `x` y_std : array_like Standard deviation of the binned array in `x` y_std_err : array_like Error in the `statfunc` of `y` x_bins_data : array_like, optional Elements of `x` in each bin with spacing of `base`. Only returned if `arr_digit` == 'y' or 'o' y_bins_data : array_like, optional Elements of `y` in each bin with spacing of `base`. Only returned if `arr_digit` == 'y' or 'o' perc_lims : array_like, shape(N,3) Percentiles in each bin of `x_stat`. Only returned if `arr_digit` == 'y' or 'o' """ file_msg = fd.Program_Msg(__file__) ## Verifying input values # `arr_digit` if not ((arr_digit == 'y') or (arr_digit == 'n') or (arr_digit == 'o')): msg = '{0} `arr_digit` ({1}) is not a valid input. Exiting'.format( file_msg, arr_digit) raise LSSUtils_Error(msg) # Array dimensions if not ((len(x) > 0) and (len(y) > 0)): msg = '{0} The arrays `x` and `y` must have at least one value' msg = msg.format(file_msg) raise LSSUtils_Error(msg) if not ((np.asarray(x).ndim == 1) and (np.asarray(y).ndim == 1)): msg = '{0} The arrays `x` and `y` must have dimension of `1`' msg = msg.format(file_msg) raise LSSUtils_Error(msg) # `arr_len` if not (arr_len >= 0): msg = '{0} `arr_len` ({1}) must be greater or equal than zero!'.format( file_msg, arr_len) raise LSSUtils_Error(msg) # `bin_statval` if not (bin_statval in ['average', 'left', 'right']): msg = '{0} `bin_statval` ({1}) is not a valid input! Exiting'.format( file_msg, bin_statval) raise LSSUtils_Error(msg) ## ## Converting arrays to numpy arrays x = np.asarray(x) y = np.asarray(y) arr_len = int(arr_len - 1.) if arr_len != 0 else int(arr_len) ## ## Statistics calculations x_bins = Bins_array_create(x, base=base) x_digits = np.digitize(x, x_bins) ## ## Determining which bins to use ## These are the bins that meet the criteria of `arr_len` x_digits_bins = np.array([int(ii) for ii in range(1, len(x_bins)) if len(x_digits[x_digits == ii]) > arr_len]) ## Elements in each bin # X-values x_bins_data = np.array([x[x_digits == ii] for ii in x_digits_bins]) # Y-values y_bins_data = np.array([y[x_digits == ii] for ii in x_digits_bins]) ## ## Selecting data in bins # Centered around the average if (bin_statval == 'average'): x_stat = np.array([statfunc(ii) if len(ii) > arr_len else failval for ii in x_bins_data]) # Left-hand side of the bin if (bin_statval == 'left'): x_stat = np.array([x_bins[:-1][ii] if len(x_bins_data[ii]) > arr_len else failval for ii in range(len(x_bins_data))]) # Right-hand side of the bin if (bin_statval == 'right'): x_stat = np.array([x_bins[1:][ii] if len(x_bins_data[ii]) > arr_len else failval for ii in range(len(x_bins_data))]) ## ## Determining the values in `y` # `stat_function` y_stat = np.array([statfunc(ii) if len(ii) > arr_len else failval for ii in y_bins_data]) # Standard Deviation y_std = np.array([np.nanstd(ii) if len(ii) > arr_len else failval for ii in y_bins_data]) # Error in the mean/median y_std_err = np.array([ np.nanstd(ii)/math.sqrt(len(ii)) if len(ii) > arr_len else failval for ii in y_bins_data]) ## ## Correcting error inf `statfunc` == `numpy.nanmedian` if statfunc == np.nanmedian: y_std_err *= 1.253 ## ## Returning percentiles if return_perc: perc_arr_lims = sigma_calcs(y_stat) ## ## Returning values if return_perc: if arr_digit == 'n': return_val = [ x_stat, y_stat, y_std, y_std_err, perc_arr_lims] if arr_digit == 'y': return_val = [ x_stat, y_stat, y_std, y_std_err, x_bins_data, y_bins_data, perc_arr_lims] if arr_digit == 'o': return_val = [ x_bins_data, y_bins_data, perc_arr_lims] else: if arr_digit == 'n': return_val = [ x_stat, y_stat, y_std, y_std_err] if arr_digit == 'y': return_val = [ x_stat, y_stat, y_std, y_std_err, x_bins_data, y_bins_data] if arr_digit == 'o': return_val = [ x_bins_data, y_bins_data] return return_val