Source code for item.historical.T000

"""Data cleaning code and configuration for T000."""
from functools import lru_cache

import pandas as pd

from item.util import convert_units, dropna_logged

#: iTEM data flow matching the data from this source.
DATAFLOW = "ACTIVITY"

#: Dimensions and attributes which do not vary across this data set.
COMMON_DIMS = dict(
    variable="Activity",
    # Add the same source to all rows since all data comes from the same source
    source="International Transport Forum",
    service="P",  # Passenger
    unit="10^9 passenger-km / yr",
    # The dataset does not provide any data on the following columns, so we add the
    # default value of "All" in both cases
    technology="_T",
    automation="_T",
    operator="_T",
)

#: Columns to drop from the raw data.
COLUMNS = dict(
    drop=[
        "COUNTRY",
        "VARIABLE",
        "YEAR",
        "Unit",
        "Unit Code",
        "PowerCode Code",
        "PowerCode",
        "Reference Period Code",
        "Reference Period",
        "Flag Codes",
        "Flags",
    ],
)


def check(df):
    # Input data have the expected units
    assert df["PowerCode"].unique() == ["Millions"]
    assert df["Unit"].unique() == ["Passenger-kilometres"]


[docs]def process(df): """Process data set T000.""" # Drop rows with nulls in "Value"; log corresponding values in "Country" df = dropna_logged(df, "Value", ["Country"]) # Assigning mode and vehicle type based on the variable name df = pd.concat([df, df["Variable"].apply(mode_and_vehicle_type)], axis=1) # 1. Drop null values. # 2. Convert to the preferred iTEM units. df = df.dropna().pipe(convert_units, "Mpassenger km/year", "Gpassenger km/year") return df
[docs]@lru_cache() def mode_and_vehicle_type(variable_name): """Determine 'mode' and 'vehicle type' from 'variable'. The rules implemented are: ============================================= ===== ============ Variable Mode Vehicle type ============================================= ===== ============ Rail passenger transport Rail All Road passenger transport by buses and coaches Road Bus Road passenger transport by passenger cars Road LDV Total inland passenger transport All All ============================================= ===== ============ """ if "Rail" in variable_name: mode = "Rail" vehicle = "_T" elif "Road" in variable_name: mode = "Road" if "by buses" in variable_name: vehicle = "Bus" elif "by passenger" in variable_name: vehicle = "LDV" else: vehicle = "_T" else: mode = "_T" vehicle = "_T" return pd.Series({"VEHICLE": vehicle, "MODE": mode})