# Source code for item.historical.scripts.T001

"""Data cleaning code and configuration for T001.

This module:

- Detects and corrects :issue:`32`, a data error in the upstream source where China
  observation values for years 1990 to 2001 inclusive are too low by 2 orders of
  magnitude.

"""
import logging

from item.historical.util import dropna_logged
from item.utils import convert_units

log = logging.getLogger(__name__)


#: Dimensions and attributes which do not vary across this data set:
#:
#: - ``variable``: only one activity is performed in this data set, so every row
#:   gets the variable "Freight Activity".
#: - ``source``: all data come from the same source, so it is added to all rows.
#: - ``service``: all the data are associated with freight.
#: - ``technology``, ``fuel``: the data set provides no information about these
#:   two columns, so both take the default value "All".
#: - ``mode``: all the data are about shipping.
#: - ``vehicle_type``: all the data are associated with coastal shipping.
COMMON_DIMS = {
    "variable": "Freight Activity",
    "source": "International Transport Forum",
    "service": "Freight",
    "technology": "All",
    "fuel": "All",
    "mode": "Shipping",
    "vehicle_type": "Coastal",
}

#: Columns to drop from the raw data.
COLUMNS = {
    "drop": [
        "COUNTRY",
        "VARIABLE",
        "YEAR",
        "Flag Codes",
        "Flags",
        "PowerCode Code",
        "PowerCode",
        "Reference Period Code",
        "Reference Period",
        "Unit Code",
        "Unit",
    ],
    # The column holding the country name, used to determine ISO 3166 alpha-3
    # codes and iTEM regions, is left commented out because it equals the
    # default value.
    # "country_name": "Country",
}


def _require_single_value(df, column, expected):
    """Raise :class:`AssertionError` unless `column` of `df` holds only `expected`."""
    # Compare as sets: comparing the array returned by .unique() directly with a
    # list raises ValueError (ambiguous truth value) when there is more than one
    # distinct value, instead of a clean assertion failure.
    found = set(df[column].unique())
    if found != {expected}:
        raise AssertionError(
            f"Column {column!r} should contain only {expected!r}; "
            f"found {sorted(map(str, found))}"
        )


def check(df):
    """Check data set T001.

    The following are verified:

    - The "Variable" column contains only "Coastal shipping (national transport)".
    - The "PowerCode" and "Unit" columns contain only "Millions" and
      "Tonnes-kilometres", respectively.
    - The data error #32 is present: for rows with COUNTRY "CHN", the values
      between the first and last selected years (1985 to 2002) are lower than a
      linear interpolation between those two end points by a factor of at least
      95.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with at least the columns "Variable", "PowerCode", "Unit",
        "COUNTRY", "Year" and "Value".

    Raises
    ------
    AssertionError
        If any of the checks fails. The checks are raised explicitly rather than
        through ``assert`` statements, so they also run under ``python -O``.
    """
    # Input data contain only the expected variable name
    _require_single_value(df, "Variable", "Coastal shipping (national transport)")

    # Input data have the expected units
    _require_single_value(df, "PowerCode", "Millions")
    _require_single_value(df, "Unit", "Tonnes-kilometres")

    # Detect #32
    # Data for CHN, including one year before and after the error
    obs = df.query("COUNTRY == 'CHN' and Year >= 1985 and Year <= 2002").set_index(
        "Year"
    )["Value"]

    # Delete the erroneous data, keeping only the first and last observations
    empty = obs.copy()
    empty.iloc[1:-1] = None

    # Expected values: interpolated between the two correct values
    expected = empty.interpolate("index")

    # Ratio of interpolated and observed values is about 100 for the years containing
    # the error.
    # NOTE: if fewer than three CHN rows are selected, the slice below is empty and
    #       the check passes vacuously.
    # TODO if the data is corrected in the original, this check will fail;
    #      then remove this code and the corresponding correction in process().
    ratio = (expected / obs).iloc[1:-1]
    if not (ratio >= 95).all():
        raise AssertionError(
            "Expected China values to be about 100 times too low (#32); "
            "the upstream data may have been corrected"
        )

    log.info("Confirmed 10² magnitude error in China 1990–2001")
def process(df):
    """Process data set T001.

    - Drop null values.
    - Convert from Mt km / year to Gt km / year.
    - Correct #32: multiply "Value" by 100 for rows where "Country" is "China" and
      "Year" lies strictly between 1985 and 2002.
    """
    # Remove rows lacking a "Value", logging the affected "Country" entries
    with_values = dropna_logged(df, "Value", ["Country"])

    # Remove any remaining rows containing nulls, then convert to the preferred
    # iTEM units.
    # TODO read the preferred units (here 'Gt km / year') from a common
    #      location
    df = convert_units(with_values.dropna(), "Mt km / year", "Gt km / year")

    # Correct #32: scale the affected China rows on a separate copy, then write
    # them back into the full data, aligned on the index.
    china_fix = df.query("Country == 'China' and Year > 1985 and Year < 2002").copy()
    china_fix["Value"] = china_fix["Value"] * 100.0
    df.update(china_fix)

    return df