Source code for item.historical.scripts.util.managers.dataframe

import os
from enum import Enum

import pandas as pd


[docs]class ColumnName(Enum):
    """Column names for processed historical data.

    The order of definition below is the standard order.
    """

    SOURCE = "Source"
    COUNTRY = "Country"
    ISO_CODE = "ISO Code"
    ITEM_REGION = "Region"
    VARIABLE = "Variable"
    UNIT = "Unit"
    SERVICE = "Service"
    MODE = "Mode"
    VEHICLE_TYPE = "Vehicle Type"
    TECHNOLOGY = "Technology"
    FUEL = "Fuel"
    VALUE = "Value"
    YEAR = "Year"
    ID = "ID"


class DataframeManager:
    def __init__(self, dataset_id):
        self.dataset_id = dataset_id
        self.column_order = [ev.value for ev in ColumnName]
        self.column_order.pop()  # Remove 'ID' from the list

    def get_dataframe_from_csv_file(self, path_to_file, delimeter=","):
        if os.path.exists(path_to_file):
            return pd.read_csv(path_to_file, delimeter)
        else:
            return pd.DataFrame({"Empty": []})

    def simple_column_insert(self, dataframe, column_name, cell_value, position=0):
        column_content = [cell_value] * len(dataframe)
        dataframe.insert(position, column_name, column_content, True)

    def reorder_columns(self, dataframe):
        return dataframe.reindex(columns=self.column_order)

    def rename_column(self, df, current_name, new_name):
        df.rename(columns={current_name: new_name}, inplace=True)

    def create_programming_friendly_file(self, dataframe):
        dataframe[ColumnName.ID.value] = [self.dataset_id] * len(dataframe)
        filename = "{}_cleaned_PF.csv".format(self.dataset_id)
        cwd = os.getcwd()
        path = "{}/{}".format(cwd, filename)
        dataframe.to_csv(path, index=False)
        print("> PF File saved at: {}".format(cwd))

    def create_user_friendly_file(self, df):
        # Get the columns to preserve
        columns_to_preserve = self.column_order.copy()
        columns_to_preserve.remove(ColumnName.YEAR.value)
        columns_to_preserve.remove(ColumnName.VALUE.value)

        # Grouping by country
        group_by_country = df.groupby(df.Country)

        # Getting the list of countries
        list_of_countries = list(group_by_country.groups.keys())

        # Saving the dict of all the final dataframes of each country
        dict_of_final_dataframes_per_country = {}

        # For each country, perform the following algorithm
        for country in list_of_countries:

            # Get the df corresponding to the given country
            df_country_X = group_by_country.get_group(country)

            # Get the list of years available for the given year
            list_of_years_for_country_X = list(set(df_country_X[ColumnName.YEAR.value]))

            # Group the data of country X by year
            group_by_year_country_X = df_country_X.groupby(
                df_country_X[ColumnName.YEAR.value]
            )

            # Create a structure that will hold the dataframes of each year
            df_per_year_for_country_X = {}

            # Obtain the dataframe for each year
            for name, group in group_by_year_country_X:
                df_per_year_for_country_X[name] = group

            # Do the necessary processing required in the DF of each year
            for year in list_of_years_for_country_X:

                # Obtain the dataframe for country X in year Y
                df_country_X_in_year_Y = df_per_year_for_country_X[year]

                # Renaming and droping columns
                df_country_X_in_year_Y.rename(
                    columns={ColumnName.VALUE.value: year}, inplace=True
                )
                df_country_X_in_year_Y.drop(
                    columns=[ColumnName.YEAR.value], inplace=True
                )

            # Concatenating all the dataframes of a given country into one df
            list_of_all_df_for_country_X = list(df_per_year_for_country_X.values())
            df_concat_all_dfs_for_country_x = pd.concat(
                list_of_all_df_for_country_X,
                sort=False,
                verify_integrity=True,
                join="outer",
            )

            # Creating the final df for country X by eliminating all NAN
            final_df_for_country_x = (
                df_concat_all_dfs_for_country_x.groupby(columns_to_preserve)[
                    list_of_years_for_country_X
                ]
                .first()
                .reset_index()
            )

            # Saving the final df of country X in the list of all countries df
            dict_of_final_dataframes_per_country[country] = final_df_for_country_x

        # Concatenate all the dataframes of the countries
        list_df_for_all_countries_final = list(
            dict_of_final_dataframes_per_country.values()
        )
        df_with_all_countries_data = pd.concat(
            list_df_for_all_countries_final,
            sort=False,
            verify_integrity=True,
            join="outer",
            ignore_index=True,
        )

        # Reordering the dataframe columns
        all_column_names = set(df_with_all_countries_data.keys())
        none_year_columns = set(columns_to_preserve)
        numberic_columns = list(all_column_names - none_year_columns)
        numberic_columns.sort()
        order_of_columns = columns_to_preserve + numberic_columns
        df_with_all_countries_data = df_with_all_countries_data.reindex(
            columns=order_of_columns
        )

        # Setting the column id for the dataframe
        df_with_all_countries_data[ColumnName.ID.value] = [self.dataset_id] * len(
            df_with_all_countries_data
        )

        # Exporting the final dataframe
        filename = "{}_cleaned_UF.csv".format(self.dataset_id)
        cwd = os.getcwd()
        path = "{}/{}".format(cwd, filename)
        df_with_all_countries_data.to_csv(path, index=False)
        print("> UF File saved at: {}".format(cwd))