# Source code for item.historical.T001

"""Data cleaning code and configuration for T001.

This module:

- Detects and corrects :issue:`32`, a data error in the upstream source where China
  observation values for years 1990 to 2001 inclusive are too low by 2 orders of
  magnitude (see also :issue:`57`).

"""
import logging

from item.util import convert_units, dropna_logged

log = logging.getLogger(__name__)

#: iTEM data flow matching the data from this source.
DATAFLOW = "ACTIVITY"

#: Dimensions and attributes which do not vary across this data set.
COMMON_DIMS = {
    # TODO move the comments below into the #: comment above, so they also
    #      appear in the built documentation.
    #
    # Only one activity is reported in this data set (freight activity), so
    # every row gets the same variable.
    "variable": "Activity",
    # All rows come from the same source.
    "source": "International Transport Forum",
    # All the data relate to freight, so the service is "F".
    "service": "F",
    # The data set gives no information on this dimension, so the default
    # value ("All") is used.
    "technology": "_T",
    # All the data are about shipping.
    "mode": "Shipping",
    # All the data relate to coastal shipping, so the vehicle type is
    # "Coastal".
    "vehicle": "Coastal",
    # NOTE(review): the original note spoke of "two columns" receiving the
    # default value without naming them; presumably the same reasoning as for
    # "technology" applies to these. Confirm.
    "automation": "_T",
    "operator": "_T",
}

#: Column handling for the raw data; the "drop" entry lists the columns to drop.
COLUMNS = {
    "drop": [
        "COUNTRY",
        "VARIABLE",
        "YEAR",
        "Flag Codes",
        "Flags",
        "PowerCode Code",
        "PowerCode",
        "Reference Period Code",
        "Reference Period",
        "Unit Code",
        "Unit",
    ],
}

#: Whether :issue:`32` was detected by :func:`check` and should therefore be fixed by
#: :func:`process`. Starts out as :obj:`False`.
FIX_32 = False


def check(df):
    """Check data set T001.

    Verify that the raw data contain only the expected variable name and units, then
    detect :issue:`32` and record the outcome in the module-level flag ``FIX_32`` for
    later use by :func:`process`. The flag is reset to :obj:`False` at the start of
    every call, so the result always reflects the `df` passed to *this* call.

    :param df: raw data (:class:`pandas.DataFrame`). The columns "Variable",
        "PowerCode", "Unit", "COUNTRY", "Year" and "Value" are read.
    :raises AssertionError: if "Variable", "PowerCode" or "Unit" contain anything other
        than the single expected value, or if only some of the China observations
        between the reference end points look erroneous.
    """
    global FIX_32

    # Start from a clean state: a flag set by an earlier call (possibly on different
    # data) must not leak into this one, including when a check below fails
    FIX_32 = False

    # Input data contain only the expected variable name and units.
    #
    # Compare as sets: comparing the array returned by .unique() directly with a list
    # is element-wise, and its truth value is ambiguous (ValueError rather than
    # AssertionError) as soon as the column holds more than one distinct value.
    for column, expected_value in (
        ("Variable", "Coastal shipping (national transport)"),
        ("PowerCode", "Millions"),
        ("Unit", "Tonnes-kilometres"),
    ):
        assert set(df[column].unique()) == {
            expected_value
        }, f"Values in {column!r} column"

    # Detect #32

    # Data for CHN, including one year before and after the error. Sort by year,
    # because both the positional slicing and the index-based interpolation below rely
    # on the first and last rows being the reference end points.
    obs = (
        df.query("COUNTRY == 'CHN' and Year >= 1985 and Year <= 2002")
        .set_index("Year")["Value"]
        .sort_index()
    )

    # Delete the (possibly) erroneous data, keeping only the two end points
    reference = obs.copy()
    reference.iloc[1:-1] = None

    # Expected values: interpolated between the two correct values
    expected = reference.interpolate("index")

    # Ratio of interpolated and observed values is about 100 for the years containing
    # the error; 95 is the threshold used to decide this
    suspect = (expected / obs).iloc[1:-1] >= 95

    if not suspect.any():
        # Also reached when there are no observations between the end points; an empty
        # selection must not count as confirmation of the error
        log.info("10² magnitude error in China 1990–2001 absent")
    elif suspect.all():
        log.info("Confirmed 10² magnitude error in China 1990–2001")
        FIX_32 = True
    else:
        raise AssertionError(f"Ambiguous:\n{repr(suspect)}")


def process(df):
    """Process data set T001.

    - Drop null values.
    - Convert from Mt km / year to Gt km / year.
    - If ``FIX_32`` was set by :func:`check`, correct :issue:`32` by multiplying the
      affected values for China by 100.

    :param df: data to process (:class:`pandas.DataFrame`). The columns "Value",
        "Country" and "Year" are read.
    :returns: the processed data.
    """
    # Drop rows with nulls in "Value"; log corresponding values in "Country"
    # TODO read the preferred units (here 'Gt km / year') from a common location
    df = df.pipe(dropna_logged, "Value", ["Country"]).pipe(
        convert_units, "Mt km / year", "Gt km / year"
    )

    # Correct #32
    if FIX_32:
        # Rows strictly between 1985 and 2002, i.e. everything between the two
        # reference years that check() treats as correct end points.
        # NOTE(review): this matches the documented 1990–2001 range only if there are
        # no China rows for 1986–1989; confirm against the upstream data.
        corrected = df.query(
            "Country == 'China' and Year > 1985 and Year < 2002"
        ).copy()
        # Values are too low by 2 orders of magnitude
        corrected["Value"] *= 100.0
        # Write the corrected rows back, aligned on the index
        df.update(corrected)

    return df