# Source code for item.historical.T002

"""Data cleaning code and configuration for T002."""
from functools import lru_cache

import pandas as pd

from item.util import dropna_logged

#: Identifier of the iTEM data flow that matches the data from this source.
DATAFLOW = "ACTIVITY"

#: Dimensions and attributes whose value is the same for every row of this
#: data set.
COMMON_DIMS = {
    # Every row originates from the same source.
    "source": "International Transport Forum",
    # All of the data relate to freight, so the service is "Freight".
    "service": "Freight",
    "vehicle": "Container",
    # The data set provides no information on the following four dimensions,
    # so each receives the default "All" value, coded as "_T".
    "automation": "_T",
    "fuel": "_T",
    "operator": "_T",
    "technology": "_T",
}

#: Column handling for the raw data. The ``drop`` entry lists the columns to
#: remove from the raw data.
COLUMNS = {
    "drop": [
        "COUNTRY",
        "VARIABLE",
        "YEAR",
        "Unit Code",
        "PowerCode Code",
        "PowerCode",
        "Reference Period Code",
        "Reference Period",
        "Flag Codes",
        "Flags",
    ],
    # Name of the column holding the country name, used to determine ISO 3166
    # alpha-3 codes and iTEM regions. Left commented out because this is
    # already the default value.
    # country_name='Country',
}


def process(df):
    """Process data set T002.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. The "Variable" and "Unit" columns are read directly; the
        names "Value" and "Country" are passed on to
        :func:`item.util.dropna_logged`.

    Returns
    -------
    pandas.DataFrame
        The input without its original "Variable" and "Unit" columns, plus the
        derived columns "MODE", "VARIABLE" and "UNIT".
    """
    # NOTE(review): presumably drops rows with a missing "Value" and logs the
    # affected "Country" entries — confirm against item.util.dropna_logged.
    df = df.pipe(dropna_logged, "Value", ["Country"])

    # Assign 'Mode', 'Variable', and 'Unit' values. Each mapper returns a
    # pd.Series per element, so .apply() expands the result into a data frame
    # whose columns are the keys of that series.
    return pd.concat(
        [
            df.drop(columns=["Variable", "Unit"]),
            df["Variable"].apply(map_variable),
            df["Unit"].apply(map_unit),
        ],
        axis=1,
    )
@lru_cache()
def map_variable(value):
    """Return the "MODE" and "VARIABLE" entries for one raw "Variable" label.

    The mode is "Rail" when *value* contains the substring "Rail" and
    "Shipping" otherwise. The variable is "Freight (TEU)" when *value*
    contains "TEU" and "Freight (Weight)" otherwise.

    Results are memoized per distinct *value*, so repeated labels share one
    cached :class:`pandas.Series`; callers should not modify it in place.
    """
    return pd.Series(
        {
            "MODE": "Rail" if "Rail" in value else "Shipping",
            "VARIABLE": "Freight ({})".format("TEU" if "TEU" in value else "Weight"),
        }
    )


@lru_cache()
def map_unit(value):
    """Return the "UNIT" entry for one raw "Unit" label.

    "Tonnes" is replaced by "10^3 tonne / year"; any other *value* is passed
    through unchanged.

    Results are memoized per distinct *value*, so repeated labels share one
    cached :class:`pandas.Series`; callers should not modify it in place.
    """
    return pd.Series({"UNIT": "10^3 tonne / year" if value == "Tonnes" else value})