# Source code for item.historical.scripts.T000

"""Data cleaning code and configuration for T000."""
from functools import lru_cache

import pandas as pd

from item.historical.util import dropna_logged
from item.structure import column_name
from item.utils import convert_units

#: Dimensions and attributes which do not vary across this data set.
COMMON_DIMS = {
    "variable": "Passenger Activity",
    # Every row comes from the same source, so it is recorded once here.
    "source": "International Transport Forum",
    # All of the data describe passenger transport, so the service is "Passenger".
    "service": "Passenger",
    # The data set carries no technology or fuel detail; use the default "All"
    # for both.
    "technology": "All",
    "fuel": "All",
    "unit": "10^9 passenger-km / yr",
}

#: Columns to drop from the raw data.
COLUMNS = {
    "drop": [
        "COUNTRY",
        "VARIABLE",
        "YEAR",
        "Unit",
        "Unit Code",
        "PowerCode Code",
        "PowerCode",
        "Reference Period Code",
        "Reference Period",
        "Flag Codes",
        "Flags",
    ],
    # The column holding the country name (used to determine ISO 3166 alpha-3
    # codes and iTEM regions) is left commented out because 'Country' is already
    # the default value.
    # "country_name": "Country",
}


def check(df):
    """Assert that the raw input data are expressed in the expected units.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; must contain the columns "PowerCode" and "Unit".

    Raises
    ------
    AssertionError
        If "PowerCode" contains anything other than the single value
        "Millions", or "Unit" anything other than the single value
        "Passenger-kilometres".
    """
    # ``Series.unique()`` returns an array, and comparing an array to a list with
    # ``==`` is element-wise. Asserting on that result only works when there is
    # exactly one distinct value; with several (or none) it fails with an
    # unrelated error instead of AssertionError. Compare plain lists so the
    # check always yields a single bool.
    power_codes = list(df["PowerCode"].unique())
    assert power_codes == ["Millions"], f"unexpected PowerCode value(s): {power_codes}"

    units = list(df["Unit"].unique())
    assert units == ["Passenger-kilometres"], f"unexpected Unit value(s): {units}"


def process(df):
    """Process data set T000.

    The steps are:

    1. Drop rows with a null "Value", logging the affected "Country" values.
    2. Append the mode and vehicle type columns derived from "Variable" by
       :func:`mode_and_vehicle_type`.
    3. Drop any remaining rows that contain a null in any column.
    4. Convert from "Mpassenger km/year" to "Gpassenger km/year".

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain the columns "Value", "Country" and
        "Variable".

    Returns
    -------
    pandas.DataFrame
        The result of the unit conversion applied to the cleaned data.
    """
    # Drop rows with nulls in "Value"; log corresponding values in "Country"
    df = dropna_logged(df, "Value", ["Country"])

    # Assign mode and vehicle type based on the variable name; the two derived
    # columns are appended alongside the existing ones.
    df = pd.concat([df, df["Variable"].apply(mode_and_vehicle_type)], axis=1)

    # 1. Drop null values (with default arguments, any row with a null in any
    #    column is removed).
    # 2. Convert to the preferred iTEM units.
    df = df.dropna().pipe(convert_units, "Mpassenger km/year", "Gpassenger km/year")

    return df


@lru_cache()
def mode_and_vehicle_type(variable_name):
    """Determine 'mode' and 'vehicle type' from 'variable'.

    The rules implemented are:

    ============================================= ===== ============
    Variable                                      Mode  Vehicle type
    ============================================= ===== ============
    Rail passenger transport                      Rail  All
    Road passenger transport by buses and coaches Road  Bus
    Road passenger transport by passenger cars    Road  LDV
    Total inland passenger transport              All   All
    ============================================= ===== ============

    Matching is by case-sensitive substring, so any variable name containing
    neither "Rail" nor "Road" maps to All/All.

    Parameters
    ----------
    variable_name : str
        Value from the "Variable" column of the raw data. Must be hashable,
        because results are memoized with :func:`functools.lru_cache`.

    Returns
    -------
    pandas.Series
        Two entries, indexed by ``column_name("VEHICLE")`` and
        ``column_name("MODE")``. Because of the cache, repeated calls with the
        same argument return the *same* Series object; callers should not
        modify it in place.
    """
    if "Rail" in variable_name:
        mode = "Rail"
        vehicle_type = "All"
    elif "Road" in variable_name:
        mode = "Road"

        # Only road transport is broken down further by vehicle type.
        if "by buses" in variable_name:
            vehicle_type = "Bus"
        elif "by passenger" in variable_name:
            vehicle_type = "LDV"
        else:
            vehicle_type = "All"
    else:
        # E.g. "Total inland passenger transport"
        mode = "All"
        vehicle_type = "All"

    # The order of the values must match the order of the index labels.
    return pd.Series(
        [vehicle_type, mode],
        index=[column_name("VEHICLE"), column_name("MODE")],
    )