Skip to content

bookshelf_producer.notebook#

bookshelf_producer.notebook #

Functions to run/manage notebooks

get_available_versions(name, include_private=False) #

Get a list of available versions of a book

Parameters:

Name Type Description Default
name str

Package name

required
include_private bool

If True, also include private versions

False

Returns:

Type Description
tuple[str, ...]

Tuple of available versions

Source code in packages/bookshelf-producer/src/bookshelf_producer/notebook.py
def get_available_versions(name: str, include_private: bool = False) -> tuple[str, ...]:
    """
    List the versions that are configured for a book

    Parameters
    ----------
    name
        Package name
    include_private
        If True, versions flagged as private are listed as well

    Returns
    -------
    :
        Versions, in the order they appear in the configuration
    """
    # Only the parsed configuration is needed here; the raw data is discarded
    nb_config, _ = _load_nb_config(name)

    # Private entries are dropped unless the caller explicitly asked for them
    return tuple(
        entry.version for entry in nb_config.versions if include_private or not entry.private
    )

load_nb_metadata(name, version=None, nb_directory=None) #

Load notebook metadata

Attempts to search nb_directory for a metadata YAML file. This YAML file contains information about the dataset that is being processed. See NotebookMetadata for a description of the available options.

The assumed filename format for versioned data is {name}_{version}.yaml, where name matches both the notebook name and the name specified in the NotebookMetadata.

Parameters:

Name Type Description Default
name str

Filename to load. Should match the notebook name (not checked)

required
version str

Version of the metadata to load. If none is provided, the last version will be used

None
nb_directory str | None

Directory to search for the metadata file. If a non-absolute path is provided as name, it is assumed to be relative to nb_directory

None

Raises:

Type Description
UnknownVersion

A matching version is not in the configuration

Returns:

Type Description
NotebookMetadata

Metadata about the notebook including the target package and version

Source code in packages/bookshelf-producer/src/bookshelf_producer/notebook.py
def load_nb_metadata(
    name: str,
    version: Version | None = None,
    nb_directory: str | None = None,
) -> NotebookMetadata:
    """
    Load the metadata describing a notebook

    Attempts to search `nb_directory` for a metadata YAML file. This YAML file
    contains information about the dataset that is being processed. See NotebookMetadata
    for a description of the available options.

    The assumed filename format for versioned data is `{name}_{version}.yaml`, where
    name matches both the notebook name and the name specified in the NotebookMetadata.

    Parameters
    ----------
    name
        Filename to load. Should match the notebook name (not checked)
    version
        Version of the metadata to load. If no version is provided, the last
        version in the configuration is used
    nb_directory
        Directory to search for the metadata file. If a non-absolute path is
        provided, it is assumed to be relative to `nb_directory`

    Raises
    ------
    ValueError
        The raw metadata already contains a ``version`` entry and it differs from
        the requested `version`
    UnknownVersion
        A matching version is not in the configuration

    Returns
    -------
    :
        Metadata about the notebook including the target package and version
    """
    config, raw_data = _load_nb_config(name, nb_directory)

    # A "version" key in the raw data means a version has already been selected,
    # so the request has to agree with it
    version_preselected = "version" in raw_data
    if version_preselected and version != raw_data["version"]:
        raise ValueError("Requested version does not match the metadata")

    if not version:
        # Nothing was requested: fall back to the last configured version
        selected_version = config.versions[-1]
    else:
        # If several entries share the requested version, the last one wins
        matching = [candidate for candidate in config.versions if candidate.version == version]
        selected_version = matching[-1] if matching else None

    if selected_version is None:
        raise UnknownVersion(config.name, version)

    # Merge the notebook-level data with the fields of the selected version
    return NotebookMetadata(**raw_data, **selected_version.dict())

run_notebook(name, nb_directory=None, output_directory=None, force=False, version=None) #

Run a notebook to generate a new Book

The jupytext .py version of the notebook is used.

The template file and configuration are copied to the output directory. The template .py file is then used to create a notebook which is run using papermill. The local_bookshelf parameter is also set to the output directory.

Parameters:

Name Type Description Default
name str

Name of the notebook

required
nb_directory str

Directory containing the notebooks.

This defaults to the notebooks/ directory in this project

None
output_directory str

Where the output directory will be created.

This defaults to data/processing/{name}/{version}

None
force bool

If True, override the existing data in the output directory

False
version str

Version to extract

None

Returns:

Type Description
LocalBook

The generated book

Source code in packages/bookshelf-producer/src/bookshelf_producer/notebook.py
def run_notebook(
    name: str,
    nb_directory: str | None = None,
    output_directory: str | None = None,
    force: bool = False,
    version: Version | None = None,
) -> LocalBook:
    """
    Run a notebook to generate a new Book

    The jupytext `.py` version of the notebook is used.

    The template file and configuration are copied to the output directory. The
    template `.py` file is then used to create a notebook which is run using
    `papermill`. The `local_bookshelf` parameter is also set to the output
    directory.

    Parameters
    ----------
    name
        Name of the notebook
    nb_directory
        Directory containing the notebooks.

        This defaults to the `notebooks/` directory in this project
    output_directory
        Where the output directory will be created.

        This defaults to `data/processing/{name}/{version}`. When a directory is
        provided, a subdirectory named after the resolved version is still
        appended to it
    force
        If True, continue even if the output directory already contains data
    version
        Version to extract. If no version is provided, the version resolved by
        `load_nb_metadata` is used

    Raises
    ------
    ImportError
        `papermill` or `jupytext` is not installed
    FileNotFoundError
        The `.py` notebook next to the metadata file could not be found
    ValueError
        The name in the metadata does not match the notebook name, or the output
        directory is not empty and `force` is False

    Returns
    -------
    :
        The generated book
    """
    if not has_papermill:
        raise ImportError("papermill is not installed. Run 'pip install bookshelf[notebooks]'")
    if not has_jupytext:
        raise ImportError("jupytext is not installed. Run 'pip install bookshelf[notebooks]'")

    short_name = name.split("/")[-1]

    # Verify metadata
    metadata = load_nb_metadata(name, version=version, nb_directory=nb_directory)
    # Swap only the file extension: a plain str.replace would also rewrite any
    # ".yaml" that appears elsewhere in the path (e.g. in a directory name)
    nb_fname = os.path.splitext(metadata.source_file)[0] + ".py"

    if not os.path.exists(nb_fname):
        raise FileNotFoundError(f"Could not find notebook: {nb_fname}")

    logger.info(f"Loaded metadata from {metadata.source_file}")
    if metadata.name != short_name:  # pragma: no cover
        # Report the value that was actually compared (the short name)
        raise ValueError(
            "name in metadata does not match the name of the notebook "
            f"({metadata.name} != {short_name})"
        )
    logger.info(f"Processing {metadata.long_name()}")

    if output_directory is None:
        output_directory = os.path.join(PROCESSED_DATA_DIR, short_name)

    # Each version is written to its own subdirectory
    output_directory = os.path.join(output_directory, metadata.version)
    if os.path.exists(output_directory) and os.listdir(output_directory):
        logger.warning(f"{output_directory} is not empty")
        if not force:
            raise ValueError(f"{output_directory} is not empty")
    os.makedirs(output_directory, exist_ok=True)

    # Copy required files
    logger.info(f"Copying {nb_fname} to {output_directory}")
    shutil.copyfile(nb_fname, os.path.join(output_directory, f"{short_name}.py"))
    logger.info(f"Copying metadata to {output_directory}")
    with open(os.path.join(output_directory, f"{short_name}.yaml"), "w") as fh:
        yaml.safe_dump(metadata.dict(), fh)

    # Template and run notebook
    output_nb_fname = os.path.join(output_directory, f"{short_name}.ipynb")
    logger.info(f"Creating notebook {output_nb_fname} from {nb_fname}")
    notebook_jupytext = jupytext.read(nb_fname)
    jupytext.write(
        notebook_jupytext,
        output_nb_fname,
        fmt="ipynb",
    )
    # The notebook is executed in place. Pass the resolved version rather than
    # the raw argument (which may be None) so the notebook processes the same
    # version that the output directory and the book lookup below are based on
    papermill.execute_notebook(
        output_nb_fname,
        output_nb_fname,
        parameters={"local_bookshelf": output_directory, "version": metadata.version},
    )
    # Attempt to load the book from the output directory
    shelf = BookShelf(path=output_directory)
    book = shelf.load(short_name, metadata.version, edition=metadata.edition)
    logger.info(f"Notebook run successfully with hash: {book.hash()}")
    return book