Source code for item.historical.T012

"""Data cleaning code and configuration for T012."""

from item.util import convert_units, dropna_logged

#: iTEM data flow matching the data from this source.
DATAFLOW = "POPULATION"

#: Dimensions and attributes which do not vary across this data set.
COMMON_DIMS = dict(
    source="United Nations",
    variable="Population",
    unit="10^6 people",
)

#: Column names:
#:
#: - ``drop``: to drop from the raw data.
#: - ``country_name``: to map to ISO 3166 codes.
COLUMNS = dict(
    drop=[
        "Index",
        "Variant",
        "Notes",
        "Country code",
        "Parent code",
    ],
    #
    country_name="Region, subregion, country or area *",
)



[docs]
def process(df):
    """Process data set T012.

    - Select only rows with ``Type == "Country/Area"``; then drop this column.
    - Rename "Channel Islands" (ISO 3166 numeric code 830) with 831 (Jersey), the larger
      (compared to 832/Guernsey) of the two Channel Islands. Code 830 does not exist.
    - Melt from wide to long format.
    - Remove spaces from strings in the "Value" column; convert to numeric.
    - Drop null values.
    - Convert units from 10³ persons to 10⁶ persons.
    """
    return (
        df.query("Type == 'Country/Area'")
        .drop("Type", axis=1)
        .replace("Channel Islands", "Jersey")
        .melt(
            id_vars=[COLUMNS["country_name"]],
            var_name="TIME_PERIOD",
            value_name="Value",
        )
        .assign(
            Value=lambda df_: df_["Value"]
            .str.replace(" ", "")
            .replace("...", "NaN")
            .astype(float)
        )
        .pipe(dropna_logged, "Value", [COLUMNS["country_name"]])
        .pipe(convert_units, "kpassenger", "Mpassenger")
    )