# Source code for item.structure.template

import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Mapping, Optional

import numpy as np
import pandas as pd
import sdmx
import sdmx.model as m

from item.common import paths
from item.structure.sdmx import _get_anno, generate, merge_dsd

log = logging.getLogger(__name__)


def add_unit(key: Dict, concept: m.Concept) -> None:
    """Add units to a key.

    The "preferred_unit" annotation of `concept` is looked up with
    :func:`_get_anno`. If it is a single string, it is stored as ``key["unit"]``.
    Otherwise it is treated as a mapping from conditions of the form
    ``"DIM == value"`` to units, and the unit of the first condition satisfied by
    `key` is stored. If no condition matches, `key` is left unchanged.

    `key` is modified in place; nothing is returned.
    """
    # Retrieve the unit information, stored by read_items()
    preferred = _get_anno(concept, "preferred_unit")

    if isinstance(preferred, str):
        key["unit"] = preferred
        return

    # Conditional units: use the first condition that `key` satisfies. A distinct
    # loop variable is used so the mapping being iterated is not rebound.
    for condition, conditional_unit in preferred.items():
        dim, value = condition.split(" == ")
        if dim in key and key[dim] == value:
            key["unit"] = conditional_unit
            return


def collapse(row: pd.Series) -> pd.Series:
    """Collapse multiple concepts into fewer columns.

    - VARIABLE label is formatted using the labels for LCA_SCOPE, POLLUTANT, and/or
      FLEET.
    - MODE label is formatted using the labels for SERVICE, VEHICLE, AUTOMATION and/or
      OPERATOR.

    Parameters
    ----------
    row : pandas.Series
        One row of labels. It must contain the entries VARIABLE, MODE, FLEET,
        LCA_SCOPE, POLLUTANT, SERVICE, VEHICLE, OPERATOR and AUTOMATION, each a
        string; any other entries are passed through unchanged.

    Returns
    -------
    pandas.Series
        The same entries, minus the seven that were folded into VARIABLE and MODE.
    """
    data = row.to_dict()

    # Combine 3 concepts with the measure name ("VARIABLE")
    fleet = data.pop("FLEET")
    lca_scope = data.pop("LCA_SCOPE")
    pollutant = data.pop("POLLUTANT")

    variable = f"{pollutant} {data['VARIABLE']}"
    if len(lca_scope):
        variable += f" ({lca_scope})"
    if fleet not in ("Total", ""):
        variable += f" ({fleet.lower()} vehicles)"
    # strip() removes the leading space left by an empty POLLUTANT label
    data["VARIABLE"] = variable.strip()

    # Combine 4 concepts with "MODE"
    service = data.pop("SERVICE")
    vehicle = data.pop("VEHICLE")
    operator = data.pop("OPERATOR")
    automation = data.pop("AUTOMATION")

    # The operator/automation suffix is only used for light-duty vehicles
    if len(operator) and len(automation) and data["MODE"] == "Light-duty vehicle":
        automation = "" if automation == "Human" else " AV"
        oa = f" ({operator.lower()}{automation})"
    else:
        oa = ""

    # An empty label is treated like "Total", as is done for FLEET above. Otherwise
    # an empty VEHICLE followed by the suffix would yield a double space.
    parts = []
    if service not in ("Total", ""):
        parts.append(service)
    parts.append(data["MODE"])
    if vehicle not in ("Total", ""):
        parts.append(vehicle)
    data["MODE"] = (" ".join(parts) + oa).strip()

    return pd.Series(data)


def name_for_id(
    dsd: m.DataStructureDefinition, ids: List[str]
) -> Mapping[str, Dict[str, str]]:
    """Return a nested dict for use with :meth:`pandas.DataFrame.replace`.

    For each dimension of `dsd` named in `ids` (e.g. 'MODE'), the
    :attr:`~.IdentifiableArtefact.id` attribute of every code in the dimension's
    enumerated code list (e.g. 'air') is mapped to its
    :attr:`~.NameableArtefact.name` (e.g. 'Aviation').

    Two special cases:

    - The code with ID "_Z" is mapped to the empty string.
    - A code whose default localized name is empty is mapped to its ID in title
      case.

    Parameters
    ----------
    dsd :
        Data structure definition whose dimensions are looked up.
    ids : list of str
        IDs of the dimensions to include.

    Returns
    -------
    dict of (str -> dict of (str -> str))
        Outer keys are the entries of `ids`; inner dicts map code ID to label.
    """
    result: Dict[str, Dict[str, str]] = defaultdict(dict)

    for dim_id in ids:
        codelist = dsd.dimensions.get(
            dim_id
        ).local_representation.enumerated  # type: ignore [union-attr]
        assert codelist is not None

        for code in codelist:
            if code.id == "_Z":
                name = ""
            else:
                # Fall back to the title-cased ID when the code has no name
                name = code.name.localized_default() or code.id.title()

            result[dim_id][code.id] = name

    return result


def make_template(output_path: Optional[Path] = None, verbose: bool = True) -> None:
    """Generate a data template.

    Outputs files containing all keys specified for the iTEM ``HISTORICAL`` data
    structure definition. The file is produced in two formats:

    - :file:`*.csv`: comma-separated values
    - :file:`*.xlsx`: Microsoft Excel.

    …and in three variants:

    - :file:`full.*`: with full dimensionality for every concept.
    - :file:`condensed.*`: with a reduced number of dimensions, with labels for some
      dimensions combining labels for others in shorter, conventional, human-readable
      form.
    - :file:`index.*`: an index or map between the two above versions.

    Parameters
    ----------
    output_path : pathlib.Path, optional
        Directory in which the files are written. If not given, ``paths["output"]``
        from :mod:`item.common` is used.
    verbose : bool, optional
        Currently unused; kept so that existing callers continue to work.

    See also
    --------
    .collapse
    """
    # TODO Use SDMX constraints to filter on concepts that are parents of other concepts

    sm = generate()

    ds = merge_dsd(
        sm,
        "HISTORICAL",
        [
            "GDP",
            "POPULATION",
            "PRICE_FUEL",
            "PRICE_POLLUTANT",
            "ACTIVITY_VEHICLE",
            "ACTIVITY",
            "ENERGY",
            "EMISSIONS",
            "ENERGY_INTENSITY",
            "SALES",
            "STOCK",
            "LOAD_FACTOR",
        ],
    )

    # Convert to pd.DataFrame
    df0 = sdmx.to_pandas(ds).reset_index()

    # Save in multiple formats. Path() also accepts a plain string from the caller.
    output_path = Path(output_path or paths["output"])
    log.info("Output to %s/{full,condensed,index}.{csv,xlsx}", output_path)

    # "Full" format: only simple replacements, full dimensionality.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    df1 = df0.replace({"_Z": "", np.nan: "", "(REF_AREA)": "…", "(TIME_PERIOD)": "…"})

    df1.to_csv(output_path / "full.csv")
    df1.to_excel(output_path / "full.xlsx")

    # "Condensed" format: more human-readable

    dsd = sm.structure["HISTORICAL"]

    # Use names instead of IDs for labels in these dimensions
    replacements = name_for_id(
        dsd,
        (
            "AUTOMATION FLEET FUEL MODE OPERATOR POLLUTANT SERVICE TECHNOLOGY VARIABLE "
            "VEHICLE"
        ).split(),
    )

    # Rename all columns except "Value" using data structure info
    columns = {}
    for dim_id in df1.columns:
        try:
            name = (
                dsd.dimensions.get(dim_id)
                .concept_identity.name.localized_default()  # type: ignore [union-attr]
            )
        except (KeyError, AttributeError):
            # Use the dimension ID in title case for VARIABLE and VALUE, which do not
            # have a .concept_identity
            name = dim_id.title()
        # Assigned after the try block, not in `finally`, so that an unexpected
        # exception propagates as itself instead of being masked by a NameError
        columns[dim_id] = name

    # Apply replacements; use collapse() above to reduce number of columns
    df2 = df1.replace(replacements).apply(collapse, axis=1).rename(columns=columns)

    df2.to_csv(output_path / "condensed.csv", index=False)
    df2.to_excel(output_path / "condensed.xlsx", index=False)

    # Output the index: the full (df1) and condensed (df2) variants side by side,
    # aligned on the shared row index, so each full key maps to its condensed labels
    df3 = pd.concat({"FULL": df1, "CONDENSED": df2}, axis=1)
    df3.to_csv(output_path / "index.csv")
    df3.to_excel(output_path / "index.xlsx")