Source code for cosmo_utils.utils.web_utils

#! /usr/bin/env python
# -*- coding: utf-8 -*-

# Victor Calderon
# Created      : 2018-05-09
# Last Modified: 2018-05-09
from __future__ import absolute_import, division, print_function
__author__     = ['Victor Calderon']
__copyright__  = ["Copyright 2018 Victor Calderon"]
__email__      = ['victor.calderon@vanderbilt.edu']
__maintainer__ = ['Victor Calderon']
__all__        = [  "url_checker",
                    "url_file_list",
                    "url_files_download"]

"""
Tools used to interact with web-related objects
"""

## Import modules
import os
import wget
from   bs4 import BeautifulSoup
import numpy as np
import requests
from   tqdm import tqdm
from   cosmo_utils.utils             import file_utils as fd
from   cosmo_utils.custom_exceptions import LSSUtils_Error
from   cosmo_utils.utils             import file_utils as cfutils

## Functions

## Checking if URL is valid
[docs]def url_checker(url_str): """ Checks if the URL is valid or not. Parameters ----------- url_str : `str` URL of the website to evaluate. Raises ---------- LSSUtils_Error : `Exception` Program exception if input parameters are accepted """ file_msg = fd.Program_Msg(__file__) ## Checking input parameters if not (isinstance(url_str, str)): msg = '{0} `url_str` ({1}) is not a STRING!'.format(file_msg, type(url_str)) raise LSSUtils_Error(msg) ## ## Checking Website request_url = requests.get(url_str) if (request_url.status_code != 200): msg = '{0} `url_str` ({1}) does not exist!'.format(file_msg, url_str) raise LSSUtils_Error(msg)
## Listing the files in a directory
[docs]def url_file_list(url, ext): """ Lists the files from a URL taht have a specific file extension. Parameters ----------- url : `str` String of the URL ext : `str` File extension of the files in the URL. Returns ----------- files_arr : `numpy.ndarray`, shape (N,) Array of the file in `url` that match the file extension `ext`. """ file_msg = fd.Program_Msg(__file__) ## Checking for file type # 'URL' if not isinstance(url, str): msg = '{0} `url` ({1}) is not a valid type. It must be a STRING!' msg = msg.format(file_msg, type(url)) raise TypeError(msg) # File extension if not isinstance(ext, str): msg = '{0} `ext` ({1}) is not a valid type. It must be a STRING!' msg = msg.format(file_msg, type(ext)) raise TypeError(msg) ## Reformatting URL # Removing whitespaces url = url.strip() # Removing trailing slach if url.endswith('/'): url = url[:-1] # Checking if URL exists url_checker(url) # Reading in HTML from page page = requests.get(url).text # Converting to BeautifulSoup format soup = BeautifulSoup(page, 'html.parser') ## Obtaining list of files # Removing files that are NOT strings files_arr_pre = np.array([ xx.get('href') for xx in soup.find_all('a') if isinstance(xx.get('href'), str)]) # Only those finishing with certain extension files_pre_ext = np.array([xx for xx in files_arr_pre if xx.endswith(ext)]) # Checking if file contains string 'http://' files_pre_web = np.array([(url + '/' + xx) if not ('//' in xx) else xx for xx in files_pre_ext]) # Sorting out file array files_arr = np.sort(files_pre_web) return files_arr
## Downloads the files from a URL to a local directory
[docs]def url_files_download(url, ext, outdir, check_exist=False, create_dir=False, remove_files=False, bar_opt='tqdm'): """ Downloads the files from a URL to a local directory. The files that match a specific file extension, `ext`. Parameters ----------- url : `str` String of the URL ext : `str` File extension of the files in the URL. outdir : `str` Path to the output directory. This is the directory, to which the files with extensions `ext` will be saved. check_exist : `bool`, optional If `True`, it checks for whether or not the file exists. This variable is set to `False` by default. create_dir : `bool`, optional If `True`, it creates the directory if it does not exist. This variable is set to `False` by default. remove_files : `bool`, optional If `True`, local files that are present that match the files at the URL will be replaced by the new versions. This variable is set to ``False`` by default. bar_opt : {'tqdm', 'native'} Option for which type of progress bar to use when downloading files. This variable is set to `tqdm` by default. Options: - 'tqdm' : Uses a tqdm-based progress bar - 'native': Used the wget-based native progress bar. """ file_msg = fd.Program_Msg(__file__) ## Checking for file type # 'URL' if not isinstance(url, str): msg = '{0} `url` ({1}) is not a valid type. It must be a STRING!' msg = msg.format(file_msg, type(url)) raise TypeError(msg) # File extension if not isinstance(ext, str): msg = '{0} `ext` ({1}) is not a valid type. It must be a STRING!' msg = msg.format(file_msg, type(ext)) raise TypeError(msg) # Output directory if not isinstance(outdir, str): msg = '{0} `outdir` ({1}) is not a valid type. It must be a STRING!' msg = msg.format(file_msg, type(outdir)) raise TypeError(msg) # `check_exist` if not (isinstance(check_exist, bool)): msg = '`check_exist` ({0}) must be of `boolean` type!'.format( type(check_exist)) raise TypeError(msg) # `create_dir` if not (isinstance(create_dir, bool)): msg = '`create_dir` ({0}) must be of `boolean` type!'.format( type(create_dir)) raise TypeError(msg) # `bar` - Type if not (isinstance(bar_opt, str)): msg = '`bar_opt` ({0}) must be of `boolean` type!'.format( type(bar_opt)) raise TypeError(msg) # Progress bar - Value if not (bar_opt in ['tqdm', 'native']): msg = '{0} `bar_opt` ({1}) is not a valid option! Exiting' msg = msg.format(file_msg, bar_opt) raise LSSUtils_Error(msg) ## ## List of files in the URL files_arr = url_file_list(url, ext) # Creating directory if create_dir: cfutils.Path_Folder(outdir) # Check for its existence if check_exist: if not (os.path.exists(outdir)): msg = '`outdir` ({0}) was not found!'.format( outdir) raise FileNotFoundError(msg) ## ## Downloading files to output directory if len(files_arr) > 0: if (bar_opt == 'tqdm'): tqdm_desc = 'Downloading files: ' for file_ii in tqdm(files_arr, desc=tqdm_desc): # Local file file_ii_local = os.path.join( outdir, os.path.basename(file_ii)) # Checking if local file exists if os.path.exists(file_ii_local): if remove_files: os.remove(file_ii_local) wget_opt = True else: wget_opt = False else: wget_opt = True ## ## Only downloading if necessary if wget_opt: wget.download(file_ii, out=outdir, bar=None) elif (bar_opt == 'native'): for file_ii in files_arr: # Local file file_ii_local = os.path.join( outdir, os.path.basename(file_ii)) # Checking if local file exists if os.path.exists(file_ii_local): if remove_files: os.remove(file_ii_local) wget_opt = True else: wget_opt = False else: wget_opt = True ## ## Only downloading if necessary if wget_opt: wget.download(file_ii, out=outdir) else: msg = '{0} Number of files is ZERO!'.format(file_msg) print(msg)