# Source code for item.historical.T002

"""Data cleaning code and configuration for T002."""
from functools import lru_cache

import pandas as pd

from item.util import dropna_logged

# NOTE(review): presumably the ID of a data flow defined elsewhere in the
# package; this module only declares the value — confirm where it is consumed.
#: iTEM data flow matching the data from this source.
DATAFLOW = "ACTIVITY"

#: Dimensions and attributes which do not vary across this data set.
COMMON_DIMS = {
    # Every row originates from the same publisher, so the source is constant.
    "source": "International Transport Forum",
    # The whole data set concerns freight, hence a single "Freight" service.
    "service": "Freight",
    "vehicle": "Container",
    # The data set provides no information on the four dimensions below, so
    # each one receives "_T", the placeholder used here for "All".
    "automation": "_T",
    "fuel": "_T",
    "operator": "_T",
    "technology": "_T",
}

#: Column handling for the raw data; the ``drop`` entry lists the columns to
#: drop.
COLUMNS = {
    "drop": [
        "COUNTRY",
        "VARIABLE",
        "YEAR",
        "Unit Code",
        "PowerCode Code",
        "PowerCode",
        "Reference Period Code",
        "Reference Period",
        "Flag Codes",
        "Flags",
    ],
    # A "country_name" entry would name the column holding the country name,
    # used to determine ISO 3166 alpha-3 codes and iTEM regions. It is left
    # commented out because "Country" is already the default value.
    # "country_name": "Country",
}


def process(df):
    """Process data set T002.

    The raw "Variable" and "Unit" columns are replaced by the "MODE",
    "VARIABLE" and "UNIT" columns that :func:`map_variable` and
    :func:`map_unit` derive from them; all other columns pass through.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. The columns "Variable" and "Unit" are read here; "Value" and
        "Country" are handed to :func:`item.util.dropna_logged`.

    Returns
    -------
    pandas.DataFrame
        The filtered data without "Variable" and "Unit", plus the derived
        "MODE", "VARIABLE" and "UNIT" columns.
    """
    # NOTE(review): presumably drops rows with a missing "Value" and logs the
    # affected "Country" entries — confirm against item.util.dropna_logged.
    df = df.pipe(dropna_logged, "Value", ["Country"])

    # Assign 'Mode', 'Variable', and 'Unit' values. The mapping functions
    # return a Series per element, so each .apply() yields a data frame that
    # is aligned on the index of `df` and joined column-wise.
    return pd.concat(
        [
            df.drop(columns=["Variable", "Unit"]),
            df["Variable"].apply(map_variable),
            df["Unit"].apply(map_unit),
        ],
        axis=1,
    )


@lru_cache()
def map_variable(value):
    """Derive the "MODE" and "VARIABLE" entries from a raw "Variable" label.

    The mode is "Rail" when the label contains "Rail" and "Shipping"
    otherwise. The variable is "Freight (TEU)" when the label contains "TEU"
    and "Freight (Weight)" otherwise.

    Results are memoized, so a repeated label returns the very same
    :class:`pandas.Series` object; callers should not modify it in place.
    """
    if "Rail" in value:
        mode = "Rail"
    else:
        mode = "Shipping"

    if "TEU" in value:
        measure = "TEU"
    else:
        measure = "Weight"

    entries = {
        "MODE": mode,
        "VARIABLE": "Freight ({})".format(measure),
    }
    return pd.Series(entries)


@lru_cache()
def map_unit(value):
    """Derive the "UNIT" entry from a raw "Unit" label.

    "Tonnes" is rewritten as "10^3 tonne / year"; any other label is passed
    through unchanged.

    Results are memoized, so a repeated label returns the very same
    :class:`pandas.Series` object; callers should not modify it in place.
    """
    unit = value
    if value == "Tonnes":
        unit = "10^3 tonne / year"

    entries = {"UNIT": unit}
    return pd.Series(entries)