Source code for yellowbrick.datasets.path

# yellowbrick.datasets.path
# Helper functions for looking up dataset paths.
#
# Author:  Benjamin Bengfort
# Created: Thu Jul 26 14:10:51 2018 -0400
#
# Copyright (C) 2018 The scikit-yb developers
# For license information, see LICENSE.txt
#
# ID: path.py [7082742] benjamin@bengfort.com $

"""
Helper functions for looking up dataset paths.
"""

##########################################################################
## Imports
##########################################################################

import os
import shutil

from .signature import sha256sum
from yellowbrick.exceptions import DatasetsError


##########################################################################
## Fixtures
##########################################################################

FIXTURES = os.path.join(os.path.dirname(__file__), "fixtures")


##########################################################################
## Dataset path utilities
##########################################################################


def get_data_home(path=None):
    """
    Return the path of the Yellowbrick data directory. This folder is used by
    dataset loaders to avoid downloading data several times.

    By default, this folder is colocated with the code in the install directory
    so that data shipped with the package can be easily located. Alternatively
    it can be set by the ``$YELLOWBRICK_DATA`` environment variable, or
    programmatically by giving a folder path. Note that the ``'~'`` symbol is
    expanded to the user home directory, and environment variables are also
    expanded when resolving the path.
    """
    if path is None:
        path = os.environ.get("YELLOWBRICK_DATA", FIXTURES)

    path = os.path.expanduser(path)
    path = os.path.expandvars(path)

    if not os.path.exists(path):
        os.makedirs(path)

    return path
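
# Usage sketch (illustrative, not part of the original module): resolving the
# data directory. The override path below is a hypothetical example.
#
#   >>> from yellowbrick.datasets.path import get_data_home
#   >>> get_data_home()                       # $YELLOWBRICK_DATA, or the bundled fixtures dir
#   >>> get_data_home("~/.yellowbrick/data")  # '~' is expanded; the folder is created if missing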


def find_dataset_path(dataset, data_home=None, fname=None, ext=".csv.gz", raises=True):
    """
    Looks up the path to the dataset specified in the data home directory,
    which is found using the ``get_data_home`` function. By default data home
    is colocated with the code, but it can be modified with the
    YELLOWBRICK_DATA environment variable or by passing in a different
    directory.

    By default the file returned is the dataset in compressed CSV format;
    other filenames and extensions can be passed in to locate other data types
    or auxiliary files.

    If the dataset is not found a ``DatasetsError`` is raised by default.

    Parameters
    ----------
    dataset : str
        The name of the dataset; should either be a folder in data home or
        specified in the yellowbrick.datasets.DATASETS variable.

    data_home : str, optional
        The path on disk where data is stored. If not passed in, it is looked
        up from YELLOWBRICK_DATA or the default returned by ``get_data_home``.

    fname : str, optional
        The filename to look up in the dataset path; by default it will be the
        name of the dataset. The fname must include an extension.

    ext : str, default: ".csv.gz"
        The extension of the data to look up in the dataset path. If fname is
        specified then the ext parameter is ignored. If ext is None then the
        directory of the dataset will be returned.

    raises : bool, default: True
        If the path does not exist, raises a DatasetsError unless this flag is
        set to False, at which point None is returned (e.g. for checking if
        the path exists or not).

    Returns
    -------
    path : str or None
        A path to the requested file, guaranteed to exist if an exception is
        not raised during processing of the request (unless None is returned).

    Raises
    ------
    DatasetsError
        If raises is True and the path does not exist.
    """
    # Figure out the root directory of the datasets
    data_home = get_data_home(data_home)

    # Figure out the relative path to the dataset
    if fname is None:
        if ext is None:
            path = os.path.join(data_home, dataset)
        else:
            path = os.path.join(data_home, dataset, "{}{}".format(dataset, ext))
    else:
        path = os.path.join(data_home, dataset, fname)

    # Determine if the path exists
    if not os.path.exists(path):

        # Suppress exceptions if required
        if not raises:
            return None

        raise DatasetsError(
            "could not find dataset at {} - does it need to be downloaded?".format(path)
        )

    return path


def dataset_exists(dataset, data_home=None):
    """
    Checks to see if a directory with the name of the specified dataset exists
    in the data home directory, found with ``get_data_home``.

    Parameters
    ----------
    dataset : str
        The name of the dataset; should either be a folder in data home or
        specified in the yellowbrick.datasets.DATASETS variable.

    data_home : str, optional
        The path on disk where data is stored. If not passed in, it is looked
        up from YELLOWBRICK_DATA or the default returned by ``get_data_home``.

    Returns
    -------
    exists : bool
        True if a folder with the dataset name is in the data home directory.
    """
    data_home = get_data_home(data_home)
    path = os.path.join(data_home, dataset)

    return os.path.exists(path) and os.path.isdir(path)
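
# Usage sketch (illustrative, not part of the original module): the dataset
# name "concrete" and the auxiliary filename below are example values and
# assume the dataset has already been downloaded into data home.
#
#   >>> from yellowbrick.datasets.path import dataset_exists, find_dataset_path
#   >>> dataset_exists("concrete")                        # True only if the folder exists
#   >>> find_dataset_path("concrete")                     # .../concrete/concrete.csv.gz
#   >>> find_dataset_path("concrete", fname="meta.json")  # look up an auxiliary file
#   >>> find_dataset_path("concrete", raises=False)       # returns None instead of raising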


def dataset_archive(dataset, signature, data_home=None, ext=".zip"):
    """
    Checks to see if the dataset archive file exists in the data home
    directory, found with ``get_data_home``. By specifying the signature, this
    function also checks to see if the archive is the latest version by
    comparing the sha256sum of the local archive with the specified signature.

    Parameters
    ----------
    dataset : str
        The name of the dataset; should either be a folder in data home or
        specified in the yellowbrick.datasets.DATASETS variable.

    signature : str
        The SHA 256 signature of the dataset, used to determine if the archive
        is the latest version of the dataset or not.

    data_home : str, optional
        The path on disk where data is stored. If not passed in, it is looked
        up from YELLOWBRICK_DATA or the default returned by ``get_data_home``.

    ext : str, default: ".zip"
        The extension of the archive file.

    Returns
    -------
    exists : bool
        True if the dataset archive exists and is the latest version.
    """
    data_home = get_data_home(data_home)
    path = os.path.join(data_home, dataset + ext)

    if os.path.exists(path) and os.path.isfile(path):
        return sha256sum(path) == signature

    return False


def cleanup_dataset(dataset, data_home=None, ext=".zip"):
    """
    Removes the dataset directory and archive file from the data home
    directory.

    Parameters
    ----------
    dataset : str
        The name of the dataset; should either be a folder in data home or
        specified in the yellowbrick.datasets.DATASETS variable.

    data_home : str, optional
        The path on disk where data is stored. If not passed in, it is looked
        up from YELLOWBRICK_DATA or the default returned by ``get_data_home``.

    ext : str, default: ".zip"
        The extension of the archive file.

    Returns
    -------
    removed : int
        The number of objects removed from data_home.
    """
    removed = 0
    data_home = get_data_home(data_home)

    # Paths to remove
    datadir = os.path.join(data_home, dataset)
    archive = os.path.join(data_home, dataset + ext)

    # Remove directory and contents
    if os.path.exists(datadir):
        shutil.rmtree(datadir)
        removed += 1

    # Remove the archive file
    if os.path.exists(archive):
        os.remove(archive)
        removed += 1

    return removed
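
# Usage sketch (illustrative, not part of the original module): verifying a
# local archive against a known signature before deciding to re-download, then
# removing the dataset entirely. The name and signature are placeholders.
#
#   >>> from yellowbrick.datasets.path import dataset_archive, cleanup_dataset
#   >>> up_to_date = dataset_archive("concrete", "3e9f...")  # False if missing or stale
#   >>> cleanup_dataset("concrete")                          # number of objects removed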