Source code for item.historical.T004

"""Data cleaning code and configuration for T004.

Notes:

- The input data do not express the units, which are single vehicles.

.. todo::
   - The input data have labels like "- LPG" in the "Fuel type" column, with the hyphen
     possibly indicating a hierarchical code list. Find a reference to this code list.
   - The code currently uses some inconsistent labels, such as:

     - "Liquid-Bio" (no spaces) vs. "Liquid - Fossil" (spaces).
     - "Natural Gas Vehicle" vs. "Conventional" (word "Vehicle" is omitted).

     Fix these after :pull:`62` is merged by using code lists for these dimensions.
   - Add code to fetch this source automatically. It does not have a clearly-defined
     API.
   - Capture and preserve the metadata provided by the UNECE data interface.

"""

from functools import lru_cache

import pandas as pd

#: Field delimiter to pass as ``sep`` to :func:`pandas.read_csv` for the raw file.
CSV_SEP = ";"

#: ID of the iTEM data flow that the data from this source belong to.
DATAFLOW = "SALES"

#: Dimensions and attributes with a single, fixed value for every observation in
#: this data set.
COMMON_DIMS = {
    "source": "UNECE",  # Agency id, not full name
    "variable": "Sales",
    "unit": "vehicle",
    "mode": "Road",
    "fleet": "NEW",
}

#: Column handling for the raw data; the columns listed under "drop" are removed.
COLUMNS = {"drop": ["Frequency"]}

#: Mapping between existing values and values to be assigned.
#:
#: The outer keys are column names in the raw data. Each inner mapping has a special
#: "_dims" entry naming the two dimensions to populate; every other entry maps a raw
#: label to a 2-tuple of values, in the same order as "_dims".
MAP = {
    "Type of vehicle": {
        # Dimensions to which the values should be assigned
        "_dims": ("SERVICE", "VEHICLE"),
        # Key is the value appearing in the variable column; values are a tuple for the
        # two columns
        "New lorries (vehicle wt over 3500 kg)": ("F", "Heavy Truck"),
        "New road tractors": ("F", "Medium Truck"),
        "New passenger cars": ("P", "LDV"),
        # Coaches and buses carry passengers, so they share the service code used for
        # passenger cars rather than the one used for goods vehicles.
        "New motor coaches, buses and trolley buses": ("P", "Bus"),
        "New light goods vehicles": ("F", "Light Truck"),
    },
    "Fuel type": {
        "_dims": ("TECHNOLOGY", "FUEL"),
        # Labels with a leading "- " appear to be sub-categories of the preceding
        # un-prefixed label (see the to-do item in the module docstring).
        "Diesel": ("IC", "DIESEL"),
        "- Diesel (excluding hybrids)": ("NONHYB", "DIESEL"),
        "- Biodiesel": ("IC", "BIODIESEL"),
        "- Hybrid electric-diesel": ("HYBRID", "DIESEL"),
        # The PHEV suffix presumably names the liquid fuel: "-D" diesel, "-G" gasoline.
        "- Plug-in hybrid diesel-electric": ("PHEV-D", "ELEC"),
        "Petrol": ("IC", "GASOLINE"),
        "- Petrol (excluding hybrids)": ("NONHYB", "GASOLINE"),
        "- Bioethanol": ("IC", "BIOETH"),
        # Same fuel code as the other petrol rows, so that they aggregate together.
        "- Hybrid electric-petrol": ("HYBRID", "GASOLINE"),
        "- Plug-in hybrid petrol-electric": ("PHEV-G", "ELEC"),
        "Alternative (total)": ("Alternative", "Alternative"),
        # NOTE(review): "bi-fuel" usually means a vehicle that can run on two fuels,
        # which is not the same as "biofuel" -- confirm the intended FUEL code.
        "- Bi-fuel vehicles": ("IC", "BIOFUEL"),
        "- Compressed natural gas (CNG)": ("IC", "CNG"),
        "- Electricity": ("BEV", "ELEC"),
        "- Hydrogen and fuel cells": ("FC", "H2"),
        "- Liquefied natural gas (LNG)": ("IC", "LNG"),
        "- LPG": ("IC", "LPG"),
        "Total": ("_T", "_T"),
    },
}


def process(df):
    """Clean the raw T004 data in `df` and return the result as a new data frame.

    The "Date" column is renamed to "TIME_PERIOD". The labels in the "Type of
    vehicle" and "Fuel type" columns are then each passed through
    :func:`map_column`, and the resulting columns (named by the "_dims" entry of the
    respective :data:`MAP` section) are appended to the right of the existing
    columns. The original columns are kept.

    A label that has no entry in :data:`MAP` raises :class:`KeyError`.
    """
    renamed = df.rename(columns={"Date": "TIME_PERIOD"})

    # Start from the renamed input, then add one block of mapped columns for each
    # source column, in this fixed order.
    pieces = [renamed]
    for column in ("Type of vehicle", "Fuel type"):
        pieces.append(renamed[column].apply(map_column, args=(column,)))

    return pd.concat(pieces, axis=1)



[docs]
@lru_cache()
def map_column(value, column):
    """Look up the dimension values that :data:`MAP` assigns to `value` in `column`.

    Parameters
    ----------
    value
        A label appearing in `column` of the raw data.
    column
        Name of the raw data column; a top-level key of :data:`MAP`.

    Returns
    -------
    pandas.Series
        The values mapped to `value`, indexed by the dimension names in the
        "_dims" entry for `column`.

    Raises
    ------
    KeyError
        If `column` is not in :data:`MAP`, or `value` has no entry for `column`.

    Results are memoized with :func:`functools.lru_cache`, so repeated labels return
    the same cached Series object; callers should not modify it in place.
    """
    table = MAP[column]
    return pd.Series(table[value], index=table["_dims"])