Source code for item.model

import errno
import os
import pickle
from importlib import import_module
from os import makedirs
from os.path import join
from typing import Dict

import pandas as pd
import pycountry
import xarray as xr
import yaml

from item.common import log, paths
from item.model.common import as_xarray, concat_versions, select, tidy, to_wide
from item.model.dimensions import INDEX, load_template

__all__ = [

# Versions of the database, in ascending order. The last entry is used as the
# default version by get_model_names() and make_regions_csv().
VERSIONS = [1, 2]

# Information about the models, keyed by model name. Empty until
# load_models_info() fills it from models.yaml; code in this module reads the
# "versions" and "format" keys of each entry.
MODELS: Dict[str, dict] = {}

def coverage(models):
    """Display some basic data coverage information.

    Counts the quantities requested by the reporting template and, for each
    model in *models*, the quantities for which any data is present. The
    resulting table is logged and written to ``output/coverage.csv`` under the
    model data path.
    """
    log("Checking data coverage.\n")

    data_root = paths["model data"]
    summed_dims = ["Mode", "Technology", "Fuel"]

    # Quantities requested by the reporting template: count the True/not-null
    # entries for each variable
    requested = (
        load_template(data_root).notnull().sum(summed_dims).to_array(name="Requested")
    )
    log("Quantities requested in reporting template: %d\n", requested.sum())

    # Parallel lists of xr.DataArrays and their labels, concatenated below
    arrays = [requested]
    labels = ["Requested"]

    # Models skipped due to a data issue
    skipped = ("itf", "exxonmobil", "roadmap")

    for name in sorted(models.keys()):
        if name in skipped:
            continue

        log("Loading data for %s" % name)

        frame = pd.read_csv(os.path.join(data_root, "model", name, "data.csv"))
        log(frame.head())

        # A series counts as populated if it has a data value for *any*
        # scenario, region and year
        populated = as_xarray(frame).notnull().any(["Scenario", "Region", "Year"])
        arrays.append(populated.sum(summed_dims).to_array())
        labels.append(name)

    # Combine into a single table with one column per label
    combined = xr.concat(arrays, pd.Index(labels, name="model")).fillna(0)
    table = combined.to_dataframe().unstack("model")
    table.columns = table.columns.droplevel(0)

    # Totals: number of models reporting each quantity, and column sums
    table["# of models"] = (table.loc[:, "bp":] > 0).sum(axis="columns")
    table.loc["Total", :] = table.sum(axis="rows")
    table = table.astype(int)

    log(table)
    table.to_csv(os.path.join(data_root, "output", "coverage.csv"))
def get_model_info(name, version):
    """Return the metadata entry for model *name* in database *version*.

    Raises
    ------
    ValueError
        If *name* is not a known model, or if the model is not present in
        database *version*.
    """
    load_models_info()

    # Keep the try body minimal, so that only a failed lookup of *name* is
    # reported as an unknown model
    try:
        model_info = MODELS[name]
    except KeyError:
        raise ValueError(f"Model {repr(name)} not among {MODELS.keys()}") from None

    if version not in model_info["versions"]:
        raise ValueError(
            "model '{}' not present in database version {}".format(name, version)
        )

    return model_info


def get_model_names(version=VERSIONS[-1]):
    """Return the names of all models in *version*."""
    load_models_info()
    return [name for name, info in MODELS.items() if version in info["versions"]]


class _CsvModel:
    """Stand-in for a model module whose raw data is already a CSV file."""

    def import_data(self, data_path, metadata_path):
        """Read *data_path* as CSV; there are no notes for such models."""
        return pd.read_csv(data_path), None


def process_raw(version, models):
    """Process raw data submissions.

    Data for *models* are imported from the raw data directory. If *models* is
    empty or None, all models in database *version* are processed. Names that
    are not known models are logged and skipped.

    Raises
    ------
    ValueError
        If a known model is not present in database *version*.
    """
    # Process arguments
    if models is None or not len(models):
        models = get_model_names(version)

    log("Processing raw data for: {}".format(" ".join(models)))

    # Populate MODELS so that unknown names can be detected directly;
    # get_model_info() raises ValueError (not KeyError) for them
    load_models_info()

    for name in models:
        if name not in MODELS:
            log(" unknown model '%s', skipping" % name)
            continue

        info = get_model_info(name, version)

        if info["format"] is None:
            log(" model '{}' needs no import".format(name))
            continue
        elif info["format"] == "csv":
            model = _CsvModel()
        else:
            # Model-specific import code lives in a submodule of item.model
            model = import_module("item.model.%s" % name)

        _process_raw(name, model, version, info)


def _process_raw(name, model, version, info):
    """Import, tidy and write the raw data for a single model.

    *model* is any object with an ``import_data(data_path, metadata_path)``
    method returning a ``(data, notes)`` pair; *info* is the metadata entry
    for the model.
    """
    log("Processing raw data for {}".format(name))

    # Path to raw data: this hold the contents of the Dropbox folder
    # 'ITEM2/Scenario_data_for_comparison/Data_submission_1/Raw_data'
    raw_data = join(
        paths["model raw"], str(version), "{}.{}".format(name, info["format"])
    )
    metadata = join(paths["data"], "model", name)

    log(" raw data: {}\n metadata: {}".format(raw_data, metadata))

    # Load the data
    data, notes = model.import_data(raw_data, metadata)

    # Put columns in a canonical order
    data = tidy(data)

    # Log some diagnostic information. Non-index columns are listed once each,
    # in column order, so that the logged list is deterministic
    index_cols = set(INDEX)
    iy = [c for c in dict.fromkeys(data.columns) if c not in index_cols]
    log(" %d non-zero values beginning %s", data.loc[:, iy].notnull().sum().sum(), iy)

    # Create a subdirectory under item2-data/model, if it does not already
    # exist
    model_dir = join(paths["model processed"], str(version), name)
    makedirs(model_dir, exist_ok=True)

    # TODO log the last-changed date of the file used for import, or a
    # checksum

    # Write data
    data.to_csv(
        join(paths["model processed"], str(version), "%s.csv" % name), index=False
    )

    # Write the region list for this model
    pd.Series(data["region"].unique(), name="region").to_csv(
        join(model_dir, "region.csv"), index=False
    )

    # Write the model comments. notes is None (or otherwise lacks to_csv) when
    # no comments are provided for this data set; errors raised while actually
    # writing are not suppressed
    write_notes = getattr(notes, "to_csv", None)
    if write_notes is not None:
        write_notes(join(model_dir, "note.csv"), index=False)
def load_model_data(
    version, skip_cache=False, cache=True, fmt=pd.DataFrame, options=()
):
    """Load model database.

    Parameters
    ----------
    version
        Version of the database; converted with :func:`int`.
    skip_cache : bool
        If True, do not read previously cached data.
    cache : bool
        If True, cache data that had to be read from file.
    fmt
        Return format: one of pd.DataFrame, xr.DataArray or xr.Dataset.
    options : iterable of str
        Optional additional processing. The only recognized entry is
        "squash scenarios". The iterable is not modified.

    Raises
    ------
    ValueError
        For an invalid *version*, an unknown *fmt*, or unrecognized *options*.
    """
    # Check arguments
    version = int(version)

    try:
        path = paths["models-%d" % version]
    except KeyError:
        raise ValueError("invalid model database version: %s" % version) from None

    if fmt not in [pd.DataFrame, xr.DataArray, xr.Dataset]:
        raise ValueError("unknown return format: %s" % fmt)

    # Validate options up front, without mutating the caller's object
    options = list(options)
    unknown = [opt for opt in options if opt != "squash scenarios"]
    if unknown:
        raise ValueError("unknown options: {!r}".format(unknown))

    # Path for cached data
    cache_path = os.path.join(paths["cache"], "model-%d.pkl" % version)

    data = None

    # Read data from cache
    if not skip_cache:
        try:
            with open(cache_path, "rb") as f:
                # NOTE(review): unpickling runs arbitrary code; the cache
                # directory must only contain files written by this function
                data = pickle.load(f)
        except FileNotFoundError:
            # No cache yet; fall through to reading from file
            pass
        except OSError as e:
            # Cache exists but is unreadable: report it, then read from file
            log("Could not read cache {}: {}".format(cache_path, e))

    # Read data from file
    if data is None:
        data = tidy(pd.read_csv(path))

        # Convert to long format, drop empty rows
        data = pd.melt(data, id_vars=INDEX, var_name="year").dropna(subset=["value"])

        # Cache the result
        if cache:
            with open(cache_path, "wb") as f:
                pickle.dump(data, f)

    # Optional additional processing; applied after caching, so the cache
    # always holds the unprocessed data
    if "squash scenarios" in options:
        data = squash_scenarios(data, version)

    if fmt in [xr.Dataset, xr.DataArray]:
        # Convert to an xarray format
        return as_xarray(data, version, fmt)
    else:
        # return as-is
        return data
def load_models_info():
    """Load the models metadata into the MODELS global.

    The metadata are read from models.yaml once; later calls return
    immediately. MODELS is updated in place rather than rebound, so references
    obtained with ``from ... import MODELS`` also see the loaded contents.
    """
    if MODELS:  # Already loaded
        return

    with open(join(paths["data"], "model", "models.yaml")) as f:
        # An empty file loads as None; treat it as no models
        MODELS.update(yaml.safe_load(f) or {})


def load_model_regions(name, version):
    """Load regions.yaml for model *name* in database *version*.

    Returns a dictionary where:

    - Keys are codes or names of model regions.
    - Values are dictionaries with the keys:

      - description (optional): a longer name or description of the region
      - countries: a list of ISO 3166 alpha-3 codes for countries in the
        region.

    The special *name* "item" (any case) loads the overall regions.yaml.

    Raises
    ------
    ValueError
        If *name* is neither "item" nor a model in database *version*.
    """
    # IDEA load from either regions-1.yaml or regions-2.yaml

    try:
        get_model_info(name, version)
    except ValueError:
        if name.lower() == "item":
            # Use an empty path in the join() call below; this causes the
            # overall regions.yaml to be loaded
            name = ""
        else:
            raise

    with open(join(paths["data"], "model", name, "regions.yaml")) as f:
        return yaml.safe_load(f)
def load_model_scenarios(name, version):
    """Load scenarios.yaml for model *name* in database *version*.

    Returns a dictionary where:

    - Keys are codes or names of scenarios.
    - Values are dictionaries with the key:

      - ``category``: either 'reference' or 'policy'.
    """
    # Called only to validate the arguments; the result is not needed
    get_model_info(name, version)

    scenarios_path = join(paths["data"], "model", name, "scenarios.yaml")
    with open(scenarios_path) as stream:
        by_version = yaml.safe_load(stream)

    # The file holds one mapping per database version
    return by_version[version]
def make_regions_csv(out_file, models=None, compare=None):
    """Produce a CSV *out_file* with a country→region map for *models*.

    The table is created by parsing the regions.yaml files in the iTEM model
    database metadata. It is indexed by ISO 3166 (alpha-3) codes, and has one
    column for each model in *models* (if no models are specified, all models
    are included), plus a column for the overall "item" regions and a "name"
    column with the country name.

    If *compare* is given, it is the path to a CSV file with an "iso" column;
    the table then has entries only where the generated value and the value in
    *compare* differ.
    NOTE(review): the end of this sentence was missing from the source this
    was recovered from; confirm the intended comparison.
    """
    version = VERSIONS[-1]  # Version 2 only

    models = models or get_model_names(version)

    def _invert(data):
        """Turn a region → info mapping into a country → region mapping."""
        return {c: region for region, info in data.items() for c in info["countries"]}

    def _load(name):
        """Return the country → region map for model *name* as a pd.Series."""
        return pd.Series(
            _invert(load_model_regions(name, version)),
            name=name if len(name) else "item",
        )

    result = pd.concat([_load(model) for model in ["item", *models]], axis=1)

    def _get_name(row):
        """Return the country name for the ISO 3166 alpha-3 code of *row*."""
        # With apply(axis=1), row.name is the index label, i.e. the code.
        # NOTE(review): the lookup arguments were missing from the source this
        # was recovered from; alpha_3=<code> is inferred from the table index
        # and the AttributeError handling (a failed lookup has no .name).
        code = row.name
        error = None
        try:
            name = pycountry.countries.get(alpha_3=code).name
        except AttributeError:
            try:
                name = pycountry.historic_countries.get(alpha_3=code).name
                error = "historical"
            except AttributeError:
                name = ""
                error = "nonexistent"

        # Report only codes that are not current ISO 3166 entries
        if error is not None:
            print(
                "{} ISO 3166 code '{}' in models: {}".format(
                    error, code, ", ".join(row.dropna().index)
                )
            )
        return name

    result["name"] = result.apply(_get_name, axis=1)

    if compare is not None:
        other = pd.read_csv(compare)
        other.columns = [column.lower() for column in other.columns]
        other = other.set_index("iso")
        other.index = [code.upper() for code in other.index]
        # NOTE(review): the condition was missing from the source this was
        # recovered from; keeping only entries that differ from *compare* is
        # inferred from the docstring.
        result = result.where(result.ne(other))

    with open(out_file, "w") as f:
        result.to_csv(f)
def make_regions_yaml(in_file, country, region, out_file):
    """Convert a country→region map from CSV *in_file* to YAML *out_file*.

    *country* and *region* are columns in *in_file* with country codes and
    region names, respectively. Country codes are written in upper case, and
    each region gets an empty ``description``.
    """
    table = pd.read_csv(in_file)[[region, country]]
    table = table.sort_values([region, country])
    table[country] = table[country].apply(str.upper)

    # One entry per region, listing the countries that belong to it
    mapping = {
        label: {"description": "", "countries": list(members[country])}
        for label, members in table.groupby(region)
    }

    with open(out_file, "w") as stream:
        yaml.dump(mapping, stream, default_flow_style=False)
def squash_scenarios(data, version):
    """Replace the per-model scenario names with scenario categories.

    *data* is a pd.DataFrame. *version* is the version of the iTEM model
    database. Returns the result of replacing values in the "scenario" column.
    """
    # Map every scenario name of every model in *version* to its category; a
    # name used by several models takes the category from the last one
    category_of = {
        scenario: meta["category"]
        for model in get_model_names(version)
        for scenario, meta in load_model_scenarios(model, version).items()
    }

    return data.replace({"scenario": category_of})