Source code for item.historical.scripts.T000

"""Data cleaning code and configuration for T000."""
from functools import lru_cache

import pandas as pd

from item.historical.util import dropna_logged
from item.structure import column_name
from item.utils import convert_units

#: Dimensions and attributes which do not vary across this data set.
COMMON_DIMS = {
    "variable": "Passenger Activity",
    # Every row originates from the same provider, so a single source label
    # is applied to all of them
    "source": "International Transport Forum",
    # All of the data describe passenger transport, so the service is
    # "Passenger"
    "service": "Passenger",
    # The data set carries no technology or fuel detail, so both take the
    # default value "All"
    "technology": "All",
    "fuel": "All",
    "unit": "10^9 passenger-km / yr",
}

#: Columns to drop from the raw data.
COLUMNS = {
    "drop": [
        "COUNTRY",
        "VARIABLE",
        "YEAR",
        "Unit",
        "Unit Code",
        "PowerCode Code",
        "PowerCode",
        "Reference Period Code",
        "Reference Period",
        "Flag Codes",
        "Flags",
    ],
    # Column containing the country name, used for determining ISO 3166
    # alpha-3 codes and iTEM regions. Left commented out because "Country" is
    # already the default value.
    # "country_name": "Country",
}


def check(df):
    """Check that the input data are expressed in the expected units.

    The "PowerCode" column of `df` must contain only the value "Millions" and
    the "Unit" column only the value "Passenger-kilometres".

    Raises
    ------
    AssertionError
        If either column is empty or contains any other value.
    """
    # ``Series.unique()`` returns an array, and ``array == list`` is an
    # element-wise comparison. Asserting on that result raises ValueError
    # (ambiguous truth value) instead of AssertionError as soon as a column
    # holds more than one distinct value, so compare plain lists instead.
    power_codes = list(df["PowerCode"].unique())
    assert power_codes == ["Millions"], f"Unexpected PowerCode: {power_codes}"

    units = list(df["Unit"].unique())
    assert units == ["Passenger-kilometres"], f"Unexpected Unit: {units}"


def process(df):
    """Process data set T000.

    Rows without a "Value" are removed, the mode and vehicle type columns are
    derived from "Variable", remaining nulls are dropped, and the result is
    converted to the preferred iTEM units.
    """
    # Discard rows with nulls in "Value"; the corresponding "Country" values
    # are logged
    df = dropna_logged(df, "Value", ["Country"])

    # Derive mode and vehicle type from each variable name and attach them as
    # additional columns
    derived = df["Variable"].apply(mode_and_vehicle_type)
    df = pd.concat([df, derived], axis=1)

    # Drop any remaining null values, then convert to the preferred iTEM units
    without_nulls = df.dropna()
    return convert_units(without_nulls, "Mpassenger km/year", "Gpassenger km/year")
@lru_cache()
def mode_and_vehicle_type(variable_name):
    """Determine 'mode' and 'vehicle type' from 'variable'.

    The rules implemented are:

    ============================================= ===== ============
    Variable                                      Mode  Vehicle type
    ============================================= ===== ============
    Rail passenger transport                      Rail  All
    Road passenger transport by buses and coaches Road  Bus
    Road passenger transport by passenger cars    Road  LDV
    Total inland passenger transport              All   All
    ============================================= ===== ============

    The result is a :class:`pandas.Series` holding the vehicle type and the
    mode, in that order, indexed by the corresponding column names.
    """
    # Start from "All" for both; only the rail and road branches override it
    mode = "All"
    vehicle_type = "All"

    if "Rail" in variable_name:
        mode = "Rail"
    elif "Road" in variable_name:
        mode = "Road"
        # Only road transport is broken down further by vehicle type
        if "by buses" in variable_name:
            vehicle_type = "Bus"
        elif "by passenger" in variable_name:
            vehicle_type = "LDV"

    labels = [column_name("VEHICLE"), column_name("MODE")]
    return pd.Series([vehicle_type, mode], index=labels)