import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Mapping
import numpy as np
import pandas as pd
import sdmx
import sdmx.model as m
from item.common import paths
from item.structure.sdmx import _get_anno, generate, merge_dsd
# Module-level logger; make_template() uses it to report where files are written
log = logging.getLogger(__name__)
def add_unit(key: Dict, concept: m.Concept) -> None:
    """Add units to a key.

    The "preferred_unit" annotation of `concept` is either a single string, which is
    applied unconditionally, or a mapping from conditions of the form
    ``"DIM == value"`` to units. In the latter case, the unit for the first condition
    satisfied by `key` is used; if no condition is satisfied, `key` is left unchanged.

    Parameters
    ----------
    key : dict
        Mapping from dimension IDs to values. Modified in place: a "unit" entry is
        added.
    concept : sdmx.model.Concept
        Concept carrying the "preferred_unit" annotation.
    """
    # Retrieve the unit information, stored by read_items()
    preferred = _get_anno(concept, "preferred_unit")

    if isinstance(preferred, str):
        # Unconditional unit
        key["unit"] = preferred
        return

    # Conditional units: the first condition matched by `key` wins
    for condition, unit in preferred.items():
        dim, value = condition.split(" == ")
        if dim in key and key[dim] == value:
            key["unit"] = unit
            return
def collapse(row: pd.Series) -> pd.Series:
    """Collapse multiple concepts into fewer columns.

    - VARIABLE label is formatted using the labels for LCA_SCOPE, POLLUTANT, and/or
      FLEET.
    - MODE label is formatted using the labels for SERVICE, VEHICLE, AUTOMATION and/or
      OPERATOR.

    The FLEET, SERVICE and VEHICLE labels are omitted when they are "Total" or empty;
    the LCA_SCOPE label is omitted when it is empty. The OPERATOR/AUTOMATION suffix is
    only added for the "Light-duty vehicle" mode, and only if both labels are
    non-empty.

    Parameters
    ----------
    row : pandas.Series
        One row of the template, with string labels for (at least) the concepts
        named above.

    Returns
    -------
    pandas.Series
        The same row without the FLEET, LCA_SCOPE, POLLUTANT, SERVICE, VEHICLE,
        OPERATOR and AUTOMATION entries, and with updated VARIABLE and MODE labels.
    """
    data = row.to_dict()

    # Labels that contribute nothing to a combined label
    omitted = ("Total", "")

    # Combine 3 concepts with the measure name ("VARIABLE")
    fleet = data.pop("FLEET")
    lca_scope = data.pop("LCA_SCOPE")
    pollutant = data.pop("POLLUTANT")

    variable = f"{pollutant} {data['VARIABLE']}"
    if lca_scope:
        variable += f" ({lca_scope})"
    if fleet not in omitted:
        variable += f" ({fleet.lower()} vehicles)"
    # An empty POLLUTANT leaves a leading space; remove it
    data["VARIABLE"] = variable.strip()

    # Combine 4 concepts with "MODE"
    service = data.pop("SERVICE")
    vehicle = data.pop("VEHICLE")
    operator = data.pop("OPERATOR")
    automation = data.pop("AUTOMATION")

    if operator and automation and data["MODE"] == "Light-duty vehicle":
        # Human-driven vehicles are the default, so only automated ones are marked
        av = "" if automation == "Human" else " AV"
        suffix = f" ({operator.lower()}{av})"
    else:
        suffix = ""

    mode = data["MODE"]
    if service not in omitted:
        mode = f"{service} {mode}"
    # Skipping an empty VEHICLE (not only "Total") avoids a doubled space before the
    # operator/automation suffix
    if vehicle not in omitted:
        mode = f"{mode} {vehicle}"
    data["MODE"] = (mode + suffix).strip()

    return pd.Series(data)
def name_for_id(
    dsd: m.DataStructureDefinition, ids: List[str]
) -> Mapping[str, Dict[str, str]]:
    """Return a nested dict for use with :meth:`pandas.DataFrame.replace`.

    For the concept schemes `ids` (e.g. 'mode'), the
    :attr:`~.IdentifiableArtefact.id` attribute of a particular
    :class:`.Concept` (e.g. 'air') is replaced with its
    :attr:`~.NameableArtefact.name` (e.g. 'Aviation').

    Two special cases apply:

    - The code "_Z" is mapped to an empty string.
    - A code with an empty name is mapped to its ID in title case.

    Parameters
    ----------
    dsd : sdmx.model.DataStructureDefinition
        Data structure whose dimensions are looked up.
    ids : list of str
        IDs of dimensions of `dsd`; each must be represented by an enumerated code
        list.

    Returns
    -------
    dict of (str -> dict of (str -> str))
        Outer keys are the dimension IDs; inner dicts map code IDs to names.
    """
    result: Mapping[str, Dict[str, str]] = defaultdict(dict)

    for dim_id in ids:
        codelist = dsd.dimensions.get(
            dim_id
        ).local_representation.enumerated  # type: ignore [union-attr]
        assert codelist is not None

        for code in codelist:
            if code.id == "_Z":
                name = ""
            else:
                # Fall back to the title-cased ID for codes without a name
                name = code.name.localized_default() or code.id.title()

            result[dim_id][code.id] = name

    return result
def make_template(output_path: Path = None, verbose: bool = True):
    """Generate a data template.

    Outputs files containing all keys specified for the iTEM ``HISTORICAL`` data
    structure definition. The file is produced in two formats:

    - :file:`*.csv`: comma-separated values
    - :file:`*.xlsx`: Microsoft Excel.

    …and in three variants:

    - :file:`full.*`: with full dimensionality for every concept.
    - :file:`condensed.*`: with a reduced number of dimensions, with labels for some
      dimensions combining labels for others in shorter, conventional, human-readable
      form.
    - :file:`index.*`: an index or map between the two above versions.

    Parameters
    ----------
    output_path : pathlib.Path, optional
        Directory in which to write the files. If not given, ``paths["output"]`` from
        :mod:`item.common` is used.
    verbose : bool, optional
        Currently unused.

    See also
    --------
    .collapse
    """
    # TODO Use SDMX constraints to filter on concepts that are parents of other concepts

    sm = generate()

    ds = merge_dsd(
        sm,
        "HISTORICAL",
        [
            "GDP",
            "POPULATION",
            "PRICE_FUEL",
            "PRICE_POLLUTANT",
            "ACTIVITY_VEHICLE",
            "ACTIVITY",
            "ENERGY",
            "EMISSIONS",
            "ENERGY_INTENSITY",
            "SALES",
            "STOCK",
            "LOAD_FACTOR",
        ],
    )

    # Data structure definition used below to look up names for IDs
    dsd = sm.structure["HISTORICAL"]

    # Convert to pd.DataFrame
    df0 = sdmx.to_pandas(ds).reset_index()

    # Save in multiple formats
    output_path = output_path or paths["output"]
    log.info("Output to %s/{full,condensed,index}.{csv,xlsx}", output_path)

    # "Full" variant: only simple replacements, full dimensionality.
    # np.nan (not the np.NaN alias, which NumPy 2.0 removed) matches missing values.
    df1 = df0.replace({"_Z": "", np.nan: "", "(REF_AREA)": "…", "(TIME_PERIOD)": "…"})

    df1.to_csv(output_path / "full.csv")
    df1.to_excel(output_path / "full.xlsx")

    # "Condensed" variant: more human-readable

    # Use names instead of IDs for labels in these dimensions
    replacements = name_for_id(
        dsd,
        (
            "AUTOMATION FLEET FUEL MODE OPERATOR POLLUTANT SERVICE TECHNOLOGY VARIABLE "
            "VEHICLE"
        ).split(),
    )

    # Rename all columns except "Value" using data structure info
    columns = {}
    for dim_id in df1.columns:
        try:
            name = (
                dsd.dimensions.get(dim_id)
                .concept_identity.name.localized_default()  # type: ignore [union-attr]
            )
        except (KeyError, AttributeError):
            # Use the dimension ID in title case for VARIABLE and VALUE, which do not
            # have a .concept_identity
            name = dim_id.title()
        # Assign only after a name was determined, so that any other exception
        # propagates unchanged instead of storing a stale or unbound `name`
        columns[dim_id] = name

    # Apply replacements; use collapse() above to reduce number of columns
    df2 = df1.replace(replacements).apply(collapse, axis=1).rename(columns=columns)

    df2.to_csv(output_path / "condensed.csv", index=False)
    df2.to_excel(output_path / "condensed.xlsx", index=False)

    # Output the index: the full and condensed variants side by side. Both share the
    # row index of df0, so rows are aligned one-to-one.
    df3 = pd.concat({"FULL": df1, "CONDENSED": df2}, axis=1)
    df3.to_csv(output_path / "index.csv")
    df3.to_excel(output_path / "index.xlsx")