Source code for cosmo_utils.utils.file_readers

#! /usr/bin/env python
# -*- coding: utf-8 -*-

# Victor Calderon
# Created      : 2018-05-02
# Last Modified: 2018-05-02
from __future__ import absolute_import, division, print_function
# Module metadata (authorship and contact information).
__author__     = ['Victor Calderon']
__copyright__  = ["Copyright 2018 Victor Calderon"]
__email__      = ['victor.calderon@vanderbilt.edu']
__maintainer__ = ['Victor Calderon']
# Public API of this module: the names exported by `from ... import *`.
__all__        = [  "IDL_read_file",
                    "fast_food_reader",
                    "read_pandas_hdf5",
                    "read_hdf5_file_to_pandas_DF",
                    "pandas_file_to_hdf5_file",
                    "pandas_df_to_hdf5_file",
                    "concatenate_pd_df"]
# NOTE(review): the string below describes the module, but because it follows
# other statements it is a plain expression and is not stored as `__doc__`.
"""
Set of functions to read various types of files
"""

# Import modules
import struct
import os
import sys
import numpy as np
import pandas as pd
import h5py
from   scipy.io.idl import readsav
from   cosmo_utils.utils import file_utils as fd
from   cosmo_utils.custom_exceptions import LSSUtils_Error

## Functions

# Reads in IDL catalogue as Python dictionary
def IDL_read_file(idl_file):
    """
    Reads an IDL file and converts it to a Python dictionary

    Parameters
    ----------
    idl_file : string
        Path to the filename being used

    Returns
    ----------
    idl_dict : python dictionary
        Dictionary with the data from `idl_file`

    Raises
    ----------
    LSSUtils_Error : Exception
        If `readsav` fails to parse `idl_file`.
    """
    # Checking that file exists (delegated to `fd.File_Exists`)
    fd.File_Exists(idl_file)
    # Converting to dictionary
    try:
        idl_dict = readsav(idl_file, python_dict=True)
    except Exception:
        # `{1}` is the file name; the message previously repeated `{0}` and
        # never showed which file was rejected.
        msg = '{0} `idl_file` {1} is not an IDL file'.format(
            fd.Program_Msg(__file__), idl_file)
        raise LSSUtils_Error(msg)

    return idl_dict
# Reads in `fastfood`-type files and converts them to an array
def fast_food_reader(key, nitems, filename):
    """
    Reads one record of a `fastfood`-type file and converts it to an array

    A record, as read by this function, consists of a 4-byte integer
    padding, `nitems` values of type `key`, and a second 4-byte integer
    padding. Both paddings must hold the size of the payload in bytes.

    Parameters
    ----------
    key : {'int', 'float', 'double', 'long'}, str
        Type of the element to extract.

    nitems : int
        Number of items to expect and extract.

    filename : file object
        Open binary file object, i.e. any object with a `read(nbytes)`
        method that returns bytes. Despite its name, this is NOT a path:
        the record is read from the current position of the stream, so
        consecutive calls read consecutive records.

    Returns
    ----------
    items_arr : np.ndarray, shape (`nitems`,)
        Array of elements from `filename` with length `nitems`.

    Raises
    ----------
    ValueError
        If `key` is not supported, if the stream ends before the whole
        record was read, or if the paddings do not match each other or the
        expected payload size.
    """
    # Supported element types: `key` -> (size in bytes, `struct` format char).
    # 'long' uses 'q' (8 bytes on every platform) rather than 'l', whose
    # native size is platform dependent and would not always match the
    # 8 bytes per item that are read from the stream.
    key_formats = {'int'   : (4, 'i'),
                   'float' : (4, 'f'),
                   'long'  : (8, 'q'),
                   'double': (8, 'd')}
    # Validating `key` before touching the stream, so that a bad call does
    # not consume part of the record.
    if key not in key_formats:
        msg = '`key` ({0}) is not a valid input! Options: {1}'.format(
            key, sorted(key_formats))
        raise ValueError(msg)
    size_type, type_str = key_formats[key]
    pad_size, pad_str   = key_formats['int']
    ##
    # Top padding (it should contain 4 bytes for `.ff` files)
    # The number of bytes is checked BEFORE unpacking; `struct.unpack` would
    # otherwise fail with its own error on a short read.
    nbyte1_read = filename.read(pad_size)
    if len(nbyte1_read) != pad_size:
        errno = -10
        raise ValueError('Read error: file empty?. \nError: ' + str(errno))
    nbyte1_val = int(struct.unpack(pad_str, nbyte1_read)[0])
    # Extracting data
    nbytes_data = size_type * nitems
    nitem1_read = filename.read(nbytes_data)
    if len(nitem1_read) != nbytes_data:
        raise ValueError('Read Error: {0} items expected. Read {1}'.format(
            nitems, len(nitem1_read) // size_type))
    items_arr = struct.unpack('{0}{1}'.format(nitems, type_str), nitem1_read)
    # Bottom padding
    nbyte2_read = filename.read(pad_size)
    if len(nbyte2_read) != pad_size:
        errno = -30
        raise ValueError('Read Error: File too short?\n Errno: ' + str(errno))
    nbyte2_val = int(struct.unpack(pad_str, nbyte2_read)[0])
    # Checking top and bottom
    if nbyte1_val != nbyte2_val:
        errno = -1
        Err_msg  = 'Read warning. Byte numbers do not match \n '
        Err_msg += 'nbyte1 = {0}, nbyte2 = {1}\n'.format(nbyte1_val,
            nbyte2_val)
        raise ValueError(Err_msg + 'Errno: {0}'.format(errno))
    # Checking that nbyte1_val = nitems * size_type
    if nbyte1_val != nbytes_data:
        errno = -2
        Err_msg  = 'Read warning. Byte numbers do not match \n '
        Err_msg += 'nbyte1 = {0}, nitems = {1}\n'.format(nbyte1_val, nitems)
        raise ValueError(Err_msg + 'Errno: {0}'.format(errno))
    #
    # Converting values to numpy array
    items_arr = np.asarray(items_arr)

    return items_arr
# Reads in a pandas DataFrame from a HDF5 file
def _print_hdf5_keys(hdf5_keys):
    """Prints the enumerated list of keys found in a HDF5 file."""
    for key_i, name in enumerate(hdf5_keys):
        print('\t Key {0}: {1}'.format(key_i, name))


def read_pandas_hdf5(hdf5_file, key=None, ret=False):
    """
    Reads a HDF5 file that contains one or many datasets.
    It converts it into a pandas DataFrame.

    Parameters
    ----------
    hdf5_file : str
        Path to the HDF5 file containing one or more pandas DataFrame(s).

    key : str or NoneType
        If provided, it will extract the `key` value as a pandas DataFrame.
        The key is accepted with or without its leading `/`.
        This value is set to `None` by default.

    ret : `bool`, optional
        If True, it returns the value of the `key`.
        By default, it is set to False.

    Returns
    ----------
    df : `pandas.DataFrame` or NoneType
        DataFrame from the `hdf5_file` with the data from the `key` directory.
        If `ret` is True, the tuple `(df, key)` is returned instead.
        If the DataFrame cannot be read (`key` is needed but was not given,
        or `key` is not in the file), the available keys are printed and
        `None` is returned.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking that file exists
    fd.File_Exists(hdf5_file)
    # Checking number of keys.
    # The store is opened read-only (the default mode is append, which needs
    # write access), and the `with` block closes it even if `keys` fails.
    with pd.HDFStore(hdf5_file, mode='r') as hdf5_obj:
        hdf5_keys = list(hdf5_obj.keys())
    # Reading in HDF5 file
    if key is None:
        try:
            df = pd.read_hdf(hdf5_file)
            if ret:
                return df, hdf5_keys[0]
            else:
                return df
        except Exception:
            msg  = '{0} Must specify which key to use:\n\t'.format(file_msg)
            msg += 'Possible keys: \n'
            print(msg)
            _print_hdf5_keys(hdf5_keys)
            return None
    # The store lists absolute keys (with a leading `/`), while
    # `pandas.read_hdf` also accepts the key without it.
    key_abs = '/' + str(key).lstrip('/')
    if (key not in hdf5_keys) and (key_abs not in hdf5_keys):
        print('{0} Key not in the file: '.format(file_msg))
        print('Possible Keys:\n')
        _print_hdf5_keys(hdf5_keys)
        return None
    df = pd.read_hdf(hdf5_file, key=key)
    if ret:
        return df, key
    else:
        return df
# Reads HDF5 files and converts them to pandas dataframe
def read_hdf5_file_to_pandas_DF(hdf5_file, key=None):
    """
    Reads content of HDF5 file and converts it to a Pandas DataFrame

    Parameters
    ----------
    hdf5_file : str
        Path to the HDF5 file. This is the file that will be converted
        to a pandas DataFrame.

    key : str or NoneType, optional
        Key or path in `hdf5_file` for the pandas DataFrame and the normal
        HDF5 file.

    Returns
    ----------
    df : `pandas.DataFrame`
        DataFrame from `hdf5_file` under the `key` directory.

    Raises
    ----------
    LSSUtils_Error : Exception
        If `pandas.read_hdf` fails to read `hdf5_file`.
    """
    file_msg = fd.Program_Msg(__file__)
    fd.File_Exists(hdf5_file)
    # Reading in Pandas DataFrame
    try:
        df = pd.read_hdf(hdf5_file, key=key)
    except Exception:
        msg = '{0} Could not read `hdf5_file` ({1})! Please check if it exists'
        msg = msg.format(file_msg, hdf5_file)
        # Raising the full message; only the bare prefix `file_msg` used to
        # be raised, which hid the actual description of the problem.
        raise LSSUtils_Error(msg)

    return df
# Converts pandas DataFrame to HDF5 file format
def pandas_file_to_hdf5_file(df_file, hdf5_file, key=None, mode='w'):
    """
    Reads a HDF5 file in pandas format and converts it to a normal HDF5 file

    Parameters
    ----------
    df_file : str
        Path to the `df_file` containing the pandas DataFrame to be converted

    hdf5_file : str
        Path to the output HDF5 file containing arrays as keys

    key : str or NoneType, optional
        Key or path in HDF5 file for the `df_file` and `hdf5_file`

    mode : str, optional
        Mode used to open `hdf5_file` with `h5py.File`. This is set to `w`
        by default.

    Raises
    ----------
    LSSUtils_Error : Exception
        If no DataFrame could be read from `df_file`.
    """
    file_msg = fd.Program_Msg(__file__)
    fd.File_Exists(df_file)
    # Reading in DataFrame
    if not key:
        df_key = read_pandas_hdf5(df_file, key=None, ret=True)
        # `read_pandas_hdf5` returns `None` when it cannot pick a DataFrame
        data, key = df_key if df_key is not None else (None, key)
    else:
        data = read_pandas_hdf5(df_file, key=key)
    if data is None:
        msg = '{0} Could not read a DataFrame from `df_file` ({1})'.format(
            file_msg, df_file)
        raise LSSUtils_Error(msg)
    # Rearranging data
    # `np.dtype` needs a list of `(name, dtype)` tuples; a bare `zip` object
    # is an iterator in Python 3 and is rejected.
    data_dtypes = np.dtype([(str(name), dtype_ii.str) for name, dtype_ii in
        zip(data.dtypes.index.values, data.dtypes.values)])
    dataset = np.recarray((len(data),), dtype=data_dtypes)
    for name in dataset.dtype.names:
        dataset[name] = data[name].values
    # Saving file to HDF5 format
    # The `with` block closes the file even if `create_dataset` fails.
    with h5py.File(hdf5_file, mode=mode) as hdf5_obj:
        hdf5_obj.create_dataset(key, data=dataset)
    msg = '{0} HDF5 file created: {1}'.format(file_msg, hdf5_file)
    print(msg)
# Saves a pandas DataFrame into a normal or a `pandas` HDF5 file
def pandas_df_to_hdf5_file(df, hdf5_file, key=None, mode='w',
    complevel=8):
    """
    Saves a `pandas.DataFrame` into a `pandas` HDF5 FILE.

    Parameters
    ----------
    df : `pandas.DataFrame`
        DataFrame to be converted and saved into a HDF5 file.

    hdf5_file : str
        Path to the output HDF5 file

    key : str or NoneType, optional
        Key or path, under which `df` will be saved in the `hdf5_file`.

    mode : {'w','a'}, optional
        Mode to handle `hdf5_file`. This value is set to `w` by default,
        which stands for `write`.

    complevel : int, optional
        Level of compression for `hdf5_file`.
        The range of `complevel` is 0-9.
        This is set to a default of 8.

    Raises
    ----------
    LSSUtils_Error : Exception
        If `df` could not be written to `hdf5_file`. The message includes
        the type and text of the underlying error.
    """
    file_msg = fd.Program_Msg(__file__)
    # Saving DataFrame to `hdf5_file`
    # `key` is passed by keyword; newer versions of pandas deprecate passing
    # it positionally.
    try:
        df.to_hdf(hdf5_file, key=key, mode=mode, complevel=complevel)
    except Exception as err:
        # Keeping the reason of the failure, so that it can be diagnosed.
        msg = '{0} Could not create HDF5 file ({1}: {2})'.format(
            file_msg, type(err).__name__, err)
        raise LSSUtils_Error(msg)
    msg = '{0} HDF5 file created: {1}'.format(file_msg, hdf5_file)
    print(msg)
# Concatenating pandas DataFrame into a single DataFrame
def concatenate_pd_df(directory, filetype='hdf5', foutput=None, outonly=True):
    """
    Concatenates pandas DataFrames into a single DataFrame

    Parameters
    ----------
    directory : str
        Path to the folder containing multiple pandas-HDF5 files

    filetype : str, optional
        File format of the file in `directory` to be read
        This is set to `hdf5` by default.

    foutput : str or NoneType
        If not `None`, it is the basename of the output file in HDF5 format.
        The file is saved in `directory`, under the key `/Main`.

    outonly : `bool`, optional
        If True, it returns the pandas DataFrame.
        If False, it only saves the concatenated `pandas.DataFrame` (when
        `foutput` is given) and returns `None`.

    Returns
    ----------
    df_conc : `pandas.DataFrame`
        DataFrame containing the combined datasets from the files in
        `directory`. Only returned if `outonly` is True.

    Raises
    ----------
    LSSUtils_Error : Exception
        If `directory` does not exist, or if no files are found in
        `directory`.
    """
    file_msg = fd.Program_Msg(__file__)
    # Checking that `directory` exists
    if not os.path.exists(directory):
        msg = '{0} `directory` {1} is not a valid path! Exiting!'.format(
            file_msg, directory)
        raise LSSUtils_Error(msg)
    # Concatenating files
    files_arr = fd.index(directory, '.' + filetype, sort=True)
    print('{0} Found `{1}` files'.format(file_msg, len(files_arr)))
    if len(files_arr) == 0:
        msg = '{0} No files in `{1}` with extension `{2}`'.format(file_msg,
            directory, filetype)
        raise LSSUtils_Error(msg)
    # Looping over HDF5 (pandas) files
    df_arr = [read_pandas_hdf5(file_ii) for file_ii in files_arr]
    # Concatenating arrays
    df_conc = pd.concat(df_arr, ignore_index=True)
    # Deciding name of resulting output file
    if isinstance(foutput, str):
        foutput_file = os.path.join(directory,
            '{0}.{1}'.format(foutput, filetype))
        # Saving resulting DataFrame
        pandas_df_to_hdf5_file(df_conc, foutput_file, key='/Main')
        # Checking file exists
        fd.File_Exists(foutput_file)
        # `{1}` is the output path; the message previously used `{2}`, which
        # does not exist and made `format` fail after the file was saved.
        print('{0} Output file saved in: {1}'.format(
            file_msg, foutput_file))
    # If only outputting concatenated DataFrame
    if outonly:
        return df_conc
    return None