Source code for yellowbrick.datasets.path

# yellowbrick.datasets.path
# Helper functions for looking up dataset paths.
#
# Author:  Benjamin Bengfort
# Created: Thu Jul 26 14:10:51 2018 -0400
#
# Copyright (C) 2018 The scikit-yb developers
# For license information, see LICENSE.txt
#
# ID: path.py [7082742] benjamin@bengfort.com $

"""
Helper functions for looking up dataset paths.
"""

##########################################################################
## Imports
##########################################################################

import os
import shutil

from .signature import sha256sum
from yellowbrick.exceptions import DatasetsError


##########################################################################
## Fixtures
##########################################################################

FIXTURES = os.path.join(os.path.dirname(__file__), "fixtures")


##########################################################################
## Dataset path utilities
##########################################################################


def get_data_home(path=None):
    """
    Return the path of the Yellowbrick data directory. This folder is used by
    dataset loaders to avoid downloading data several times.

    By default, this folder is colocated with the code in the install directory
    so that data shipped with the package can be easily located. Alternatively
    it can be set by the ``$YELLOWBRICK_DATA`` environment variable, or
    programmatically by giving a folder path. Note that the ``'~'`` symbol is
    expanded to the user home directory, and environment variables are also
    expanded when resolving the path.
    """
    if path is None:
        path = os.environ.get("YELLOWBRICK_DATA", FIXTURES)

    path = os.path.expanduser(path)
    path = os.path.expandvars(path)

    if not os.path.exists(path):
        os.makedirs(path)

    return path
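
# Usage sketch (illustrative, not part of the original module): resolving the
# data directory. The override path below is a hypothetical example.
#
#   >>> from yellowbrick.datasets.path import get_data_home
#   >>> get_data_home()                       # $YELLOWBRICK_DATA, or the bundled fixtures dir
#   >>> get_data_home("~/.yellowbrick/data")  # '~' is expanded; the folder is created if missing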


def find_dataset_path(dataset, data_home=None, fname=None, ext=".csv.gz", raises=True):
    """
    Looks up the path to the dataset specified in the data home directory,
    which is found using the ``get_data_home`` function. By default data home
    is colocated with the code, but it can be modified with the
    YELLOWBRICK_DATA environment variable or by passing in a different
    directory.

    By default the file returned is the dataset in compressed CSV format;
    other filenames and extensions can be passed in to locate other data types
    or auxiliary files.

    If the dataset is not found a ``DatasetsError`` is raised by default.

    Parameters
    ----------
    dataset : str
        The name of the dataset; should either be a folder in data home or
        specified in the yellowbrick.datasets.DATASETS variable.

    data_home : str, optional
        The path on disk where data is stored. If not passed in, it is looked
        up from YELLOWBRICK_DATA or the default returned by ``get_data_home``.

    fname : str, optional
        The filename to look up in the dataset path; by default it will be the
        name of the dataset. The fname must include an extension.

    ext : str, default: ".csv.gz"
        The extension of the data to look up in the dataset path. If fname is
        specified then the ext parameter is ignored. If ext is None then the
        directory of the dataset will be returned.

    raises : bool, default: True
        If the path does not exist, raises a DatasetsError unless this flag is
        set to False, at which point None is returned (e.g. for checking if
        the path exists or not).

    Returns
    -------
    path : str or None
        A path to the requested file, guaranteed to exist if an exception is
        not raised during processing of the request (unless None is returned).

    Raises
    ------
    DatasetsError
        If raises is True and the path does not exist.
    """
    # Figure out the root directory of the datasets
    data_home = get_data_home(data_home)

    # Figure out the relative path to the dataset
    if fname is None:
        if ext is None:
            path = os.path.join(data_home, dataset)
        else:
            path = os.path.join(data_home, dataset, "{}{}".format(dataset, ext))
    else:
        path = os.path.join(data_home, dataset, fname)

    # Determine if the path exists
    if not os.path.exists(path):

        # Suppress exceptions if required
        if not raises:
            return None

        raise DatasetsError(
            "could not find dataset at {} - does it need to be downloaded?".format(path)
        )

    return path


def dataset_exists(dataset, data_home=None):
    """
    Checks to see if a directory with the name of the specified dataset exists
    in the data home directory, found with ``get_data_home``.

    Parameters
    ----------
    dataset : str
        The name of the dataset; should either be a folder in data home or
        specified in the yellowbrick.datasets.DATASETS variable.

    data_home : str, optional
        The path on disk where data is stored. If not passed in, it is looked
        up from YELLOWBRICK_DATA or the default returned by ``get_data_home``.

    Returns
    -------
    exists : bool
        True if a folder with the dataset name is in the data home directory.
    """
    data_home = get_data_home(data_home)
    path = os.path.join(data_home, dataset)

    return os.path.exists(path) and os.path.isdir(path)
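
# Usage sketch (illustrative, not part of the original module): the dataset
# name "concrete" and the auxiliary filename below are example values and
# assume the dataset has already been downloaded into data home.
#
#   >>> from yellowbrick.datasets.path import dataset_exists, find_dataset_path
#   >>> dataset_exists("concrete")                        # True only if the folder exists
#   >>> find_dataset_path("concrete")                     # .../concrete/concrete.csv.gz
#   >>> find_dataset_path("concrete", fname="meta.json")  # look up an auxiliary file
#   >>> find_dataset_path("concrete", raises=False)       # returns None instead of raising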


def dataset_archive(dataset, signature, data_home=None, ext=".zip"):
    """
    Checks to see if the dataset archive file exists in the data home
    directory, found with ``get_data_home``. By specifying the signature, this
    function also checks to see if the archive is the latest version by
    comparing the sha256sum of the local archive with the specified signature.

    Parameters
    ----------
    dataset : str
        The name of the dataset; should either be a folder in data home or
        specified in the yellowbrick.datasets.DATASETS variable.

    signature : str
        The SHA 256 signature of the dataset, used to determine if the archive
        is the latest version of the dataset or not.

    data_home : str, optional
        The path on disk where data is stored. If not passed in, it is looked
        up from YELLOWBRICK_DATA or the default returned by ``get_data_home``.

    ext : str, default: ".zip"
        The extension of the archive file.

    Returns
    -------
    exists : bool
        True if the dataset archive exists and is the latest version.
    """
    data_home = get_data_home(data_home)
    path = os.path.join(data_home, dataset + ext)

    if os.path.exists(path) and os.path.isfile(path):
        return sha256sum(path) == signature

    return False


def cleanup_dataset(dataset, data_home=None, ext=".zip"):
    """
    Removes the dataset directory and archive file from the data home
    directory.

    Parameters
    ----------
    dataset : str
        The name of the dataset; should either be a folder in data home or
        specified in the yellowbrick.datasets.DATASETS variable.

    data_home : str, optional
        The path on disk where data is stored. If not passed in, it is looked
        up from YELLOWBRICK_DATA or the default returned by ``get_data_home``.

    ext : str, default: ".zip"
        The extension of the archive file.

    Returns
    -------
    removed : int
        The number of objects removed from data_home.
    """
    removed = 0
    data_home = get_data_home(data_home)

    # Paths to remove
    datadir = os.path.join(data_home, dataset)
    archive = os.path.join(data_home, dataset + ext)

    # Remove directory and contents
    if os.path.exists(datadir):
        shutil.rmtree(datadir)
        removed += 1

    # Remove the archive file
    if os.path.exists(archive):
        os.remove(archive)
        removed += 1

    return removed
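
# Usage sketch (illustrative, not part of the original module): verifying a
# local archive against a known signature before deciding to re-download, then
# removing the dataset entirely. The name and signature are placeholders.
#
#   >>> from yellowbrick.datasets.path import dataset_archive, cleanup_dataset
#   >>> up_to_date = dataset_archive("concrete", "3e9f...")  # False if missing or stale
#   >>> cleanup_dataset("concrete")                          # number of objects removed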