Source code for item.structure.template

import logging
from collections import defaultdict
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Mapping, Optional

import numpy as np
import pandas as pd
import sdmx
import sdmx.model.common as m

from item.common import paths
from item.structure.sdmx import _get_anno, generate, merge_dsd

if TYPE_CHECKING:
    import sdmx.model.common

# Module-level logger, named after this module.
log = logging.getLogger(__name__)


def add_unit(key: Dict, concept: m.Concept) -> None:
    """Add units to a key.

    The "preferred_unit" annotation of `concept` is looked up with ``_get_anno``. It
    is either:

    - a single string, which is stored as ``key["unit"]`` unconditionally, or
    - a mapping from conditions of the form ``"DIM == VALUE"`` to unit strings. The
      unit of the first condition satisfied by `key` is stored as ``key["unit"]``;
      if no condition is satisfied, `key` is left unchanged.

    `key` is modified in place; nothing is returned.
    """
    # Retrieve the unit information, stored by read_items()
    preferred = _get_anno(concept, "preferred_unit")

    if isinstance(preferred, str):
        # Unconditional unit: applies to every key
        key["unit"] = preferred
        return

    # Conditional units: stop at the first condition that `key` satisfies
    for condition, candidate in preferred.items():
        dim, value = condition.split(" == ")
        if dim not in key or key[dim] != value:
            continue
        key["unit"] = candidate
        return
def collapse(row: pd.Series) -> pd.Series:
    """Collapse multiple concepts into fewer columns.

    - VARIABLE label is formatted using the labels for LCA_SCOPE, POLLUTANT, and/or
      FLEET.
    - MODE label is formatted using the labels for SERVICE, VEHICLE, AUTOMATION and/or
      OPERATOR.

    The labels "Total" and "" (empty) for FLEET, SERVICE and VEHICLE carry no extra
    information and are omitted from the combined labels. The operator/automation
    suffix is only added to MODE when both labels are non-empty and MODE is
    "Light-duty vehicle".

    Parameters
    ----------
    row : pandas.Series
        One row with (at least) the string entries AUTOMATION, FLEET, LCA_SCOPE, MODE,
        OPERATOR, POLLUTANT, SERVICE, VARIABLE and VEHICLE.

    Returns
    -------
    pandas.Series
        The same entries as `row`, in the same order, minus AUTOMATION, FLEET,
        LCA_SCOPE, OPERATOR, POLLUTANT, SERVICE and VEHICLE, and with the combined
        VARIABLE and MODE labels.

    Raises
    ------
    KeyError
        If any of the entries listed above is missing from `row`.
    """
    data = row.to_dict()

    # Labels that add nothing to a combined label. Treating "" the same as "Total" for
    # every concept avoids a doubled space in e.g. "Light-duty vehicle  (private)" when
    # VEHICLE is empty but the operator/automation suffix is present.
    omit = ("Total", "")

    # Combine 3 concepts with the measure name ("VARIABLE")
    fleet = data.pop("FLEET")
    lca_scope = data.pop("LCA_SCOPE")
    pollutant = data.pop("POLLUTANT")

    variable = f"{pollutant} {data['VARIABLE']}"
    if len(lca_scope):
        variable += f" ({lca_scope})"
    if fleet not in omit:
        variable += f" ({fleet.lower()} vehicles)"
    # strip() removes the leading space left by an empty POLLUTANT label
    data["VARIABLE"] = variable.strip()

    # Combine 4 concepts with "MODE"
    service = data.pop("SERVICE")
    vehicle = data.pop("VEHICLE")
    operator = data.pop("OPERATOR")
    automation = data.pop("AUTOMATION")

    parts = []
    if service not in omit:
        parts.append(service)
    parts.append(data["MODE"])
    if vehicle not in omit:
        parts.append(vehicle)
    if len(operator) and len(automation) and data["MODE"] == "Light-duty vehicle":
        # Human-driven is the unmarked case; anything else is labelled "AV"
        av = "" if automation == "Human" else " AV"
        parts.append(f"({operator.lower()}{av})")
    data["MODE"] = " ".join(parts).strip()

    return pd.Series(data)
def name_for_id(
    dsd: "sdmx.model.common.BaseDataStructureDefinition", ids: List[str]
) -> Mapping[str, Dict[str, str]]:
    """Return a nested dict for use with :meth:`pandas.DataFrame.replace`.

    For each of the dimensions `ids` (e.g. 'mode') of `dsd`, the
    :attr:`~.IdentifiableArtefact.id` attribute of a particular code in the
    dimension's enumerated code list (e.g. 'air') is mapped to its
    :attr:`~.NameableArtefact.name` (e.g. 'Aviation'), with two exceptions:

    - the ID "_Z" is mapped to the empty string, and
    - a code with an empty name is mapped to its ID in title case.

    Parameters
    ----------
    dsd :
        Data structure definition whose dimensions are looked up.
    ids : list of str
        IDs of the dimensions to include.

    Returns
    -------
    dict
        Mapping from dimension ID to a mapping from code ID to label.

    Raises
    ------
    ValueError
        If a dimension in `ids` has no enumerated code list.
    """
    result: Dict[str, Dict[str, str]] = defaultdict(dict)

    for dim_id in ids:
        codelist = dsd.dimensions.get(dim_id).local_representation.enumerated  # type: ignore [union-attr]
        # Explicit check rather than `assert`, which is stripped under `python -O`
        if codelist is None:
            raise ValueError(f"Dimension {dim_id!r} has no enumerated code list")

        labels = result[dim_id]
        for code in codelist:
            if code.id == "_Z":
                labels[code.id] = ""
                continue

            # Fall back to the title-cased ID for codes without a name
            name = code.name.localized_default()
            labels[code.id] = name if len(name) else code.id.title()

    return result
def make_template(output_path: Optional[Path] = None, verbose: bool = True):
    """Generate a data template.

    Outputs files containing all keys specified for the iTEM ``HISTORICAL`` data
    structure definition. The file is produced in two formats:

    - :file:`*.csv`: comma-separated values
    - :file:`*.xlsx`: Microsoft Excel.

    …and in three variants:

    - :file:`full.*`: with full dimensionality for every concept.
    - :file:`condensed.*`: with a reduced number of dimensions, with labels for some
      dimensions combining labels for others in shorter, conventional, human-readable
      form.
    - :file:`index.*`: an index or map between the two above versions.

    Parameters
    ----------
    output_path : pathlib.Path, optional
        Directory in which the files are written. If not given, ``paths["output"]``
        from :mod:`item.common` is used.
    verbose : bool, optional
        Currently unused.

    See also
    --------
    .collapse
    """
    # TODO Use SDMX constraints to filter on concepts that are parents of other concepts

    sm = generate()

    ds = merge_dsd(
        sm,
        "HISTORICAL",
        [
            "GDP",
            "POPULATION",
            "PRICE_FUEL",
            "PRICE_POLLUTANT",
            "ACTIVITY_VEHICLE",
            "ACTIVITY",
            "ENERGY",
            "EMISSIONS",
            "ENERGY_INTENSITY",
            "SALES",
            "STOCK",
            "LOAD_FACTOR",
        ],
    )

    # Looked up once, after merge_dsd(); used for both labels and column names below
    dsd = sm.structure["HISTORICAL"]

    # Convert to pd.DataFrame
    df0 = sdmx.to_pandas(ds).reset_index()

    # Save in multiple formats
    output_path = Path(output_path or paths["output"])
    log.info("Output to %s/{full,condensed,index}.{csv,xlsx}", output_path)

    # "Full" format: only simple replacements, full dimensionality
    df1 = df0.replace({"_Z": "", np.nan: "", "(REF_AREA)": "…", "(TIME_PERIOD)": "…"})

    df1.to_csv(output_path / "full.csv")
    df1.to_excel(output_path / "full.xlsx")

    # "Condensed" format: more human-readable

    # Use names instead of IDs for labels in these dimensions
    replacements = name_for_id(
        dsd,
        (
            "AUTOMATION FLEET FUEL MODE OPERATOR POLLUTANT SERVICE TECHNOLOGY VARIABLE "
            "VEHICLE"
        ).split(),
    )

    # Rename all columns except "Value" using data structure info
    columns = {}
    for dim_id in df1.columns:
        try:
            name = (
                dsd.dimensions.get(dim_id).concept_identity.name.localized_default()  # type: ignore [union-attr]
            )
        except (KeyError, AttributeError):
            # Use the dimension ID in title case for VARIABLE and VALUE, which do not
            # have a .concept_identity
            name = dim_id.title()
        # Assigned after the try block (not in `finally`) so that an unexpected
        # exception propagates instead of storing a stale or unbound name
        columns[dim_id] = name

    # Apply replacements; use collapse() above to reduce number of columns
    df2 = df1.replace(replacements).apply(collapse, axis=1).rename(columns=columns)

    df2.to_csv(output_path / "condensed.csv", index=False)
    df2.to_excel(output_path / "condensed.xlsx", index=False)

    # Output the index: rows of the full and condensed variants side by side. Both
    # frames share the row index of df0, so concat() aligns them row for row.
    df3 = pd.concat({"FULL": df1, "CONDENSED": df2}, axis=1)

    df3.to_csv(output_path / "index.csv")
    df3.to_excel(output_path / "index.xlsx")