"""Data cleaning code and configuration for T000."""fromfunctoolsimportlru_cacheimportpandasaspdfromitem.utilimportconvert_units,dropna_logged#: iTEM data flow matching the data from this source.DATAFLOW="ACTIVITY"#: Dimensions and attributes which do not vary across this data set.COMMON_DIMS=dict(variable="Activity",# Add the same source to all rows since all data comes from the same sourcesource="International Transport Forum",service="P",# Passengerunit="10^9 passenger-km / yr",# The dataset does not provide any data on the following columns, so we add the# default value of "All" in both casestechnology="_T",automation="_T",operator="_T",)#: Columns to drop from the raw data.COLUMNS=dict(drop=["COUNTRY","VARIABLE","YEAR","Unit","Unit Code","PowerCode Code","PowerCode","Reference Period Code","Reference Period","Flag Codes","Flags",],)defcheck(df):# Input data have the expected unitsassertdf["PowerCode"].unique()==["Millions"]assertdf["Unit"].unique()==["Passenger-kilometres"]
def process(df):
    """Process data set T000.

    The steps are:

    1. Drop rows with nulls in "Value", logging the corresponding values in "Country".
    2. Append the "VEHICLE" and "MODE" columns derived from each row's "Variable" by
       :func:`mode_and_vehicle_type`.
    3. Drop any rows that still contain null values.
    4. Convert from "Mpassenger km/year" to the preferred iTEM units,
       "Gpassenger km/year".

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with (at least) the columns "Value", "Country" and "Variable".

    Returns
    -------
    pandas.DataFrame
        The result of :func:`convert_units` applied to the cleaned data.
    """
    # Rows without a "Value" carry no information; the helper logs which countries
    # they belonged to
    with_values = dropna_logged(df, "Value", ["Country"])

    # One row of mode / vehicle type labels per input row, from the variable name
    labels = with_values["Variable"].apply(mode_and_vehicle_type)
    labelled = pd.concat([with_values, labels], axis=1)

    # Remove remaining nulls before converting units
    complete = labelled.dropna()

    return convert_units(complete, "Mpassenger km/year", "Gpassenger km/year")
@lru_cache()
def mode_and_vehicle_type(variable_name):
    """Determine 'mode' and 'vehicle type' from 'variable'.

    The rules implemented are:

    ============================================= ===== ============
    Variable                                      Mode  Vehicle type
    ============================================= ===== ============
    Rail passenger transport                      Rail  All
    Road passenger transport by buses and coaches Road  Bus
    Road passenger transport by passenger cars    Road  LDV
    Total inland passenger transport              All   All
    ============================================= ===== ============

    "All" in the table is returned as the code "_T". Matching is by substring, and
    "Rail" is tested before "Road". Results are memoized per `variable_name` via
    :func:`functools.lru_cache`, so repeated calls return the same object.

    Parameters
    ----------
    variable_name : str
        Value from the "Variable" column of the raw data.

    Returns
    -------
    pandas.Series
        With the labels "VEHICLE" and "MODE", in that order.
    """
    # Each branch yields a (mode, vehicle type) pair
    if "Rail" in variable_name:
        labels = ("Rail", "_T")
    elif "Road" not in variable_name:
        # Neither rail nor road, e.g. the inland total
        labels = ("_T", "_T")
    elif "by buses" in variable_name:
        labels = ("Road", "Bus")
    elif "by passenger" in variable_name:
        labels = ("Road", "LDV")
    else:
        # Road transport with no recognized vehicle type
        labels = ("Road", "_T")

    mode, vehicle = labels
    return pd.Series({"VEHICLE": vehicle, "MODE": mode})