Source code for item.historical.diagnostic

"""Diagnostics for historical data sets."""
from pathlib import Path

import pandas as pd

from item.historical import fetch_source, source_str

from . import A003

# Quality checks
QUALITY = [A003.compute]

# Jinja2 template for diagnostics index page
INDEX_TEMPLATE = """<html><body>
{% for group_name, paths in groups.items() %}
<h1>{{ group_name|title }}</h1>

<ul>
{% for path in paths %}
  <li><a href="./{{ path }}">{{ path }}</a></li>
{% endfor %}
</ul>
{% endfor %}
</body></html>
"""

# Template for coverage()
COV_TEXT = """{N_area} areas: {areas}
{N_measures} measures: {measures}
{N_periods} periods: {periods[0]}–{last_period}

Measure-by-area coverage:
"""


[docs]def coverage(df, area="COUNTRY", measure="VARIABLE", period="TIME_PERIOD"): """Return information about the coverage of a data set.""" # String report areas = sorted(df[area].unique()) measures = sorted(df[measure].unique()) periods = sorted(df[period].unique()) result = COV_TEXT.format( N_area=len(areas), areas=" ".join(areas), N_measures=len(measures), measures=measures, N_periods=len(periods), periods=periods, last_period=periods[-1], ) counts = df.groupby([measure, area]).count() for m, df0 in df.groupby(measure): result += f"\n{m}\n" for a, df1 in df0.groupby(area): # Number of observations. Some observations have a status, but no # value obs = max(counts.xs((m, a))[["value", "OBS_STATUS"]]) # Observation value values = counts.xs((m, a))["value"] # Periods appearing in this series gp = sorted(df1[period].unique()) missing = (periods.index(gp[-1]) + 1 - periods.index(gp[0])) - obs # Assemble line result += ( f" {a}: {obs} obs {gp[0]}–{gp[-1]}" + (f" ({missing} gaps)" if missing else "") + f"; {values} values\n" ) return result
[docs]def run_all(output_path): """Run all diagnostics.""" from zipfile import ZIP_DEFLATED, ZipFile from jinja2 import Template output_path = Path(output_path) output_path.mkdir(parents=True, exist_ok=True) data_files = [] # Coverage groups = {"Coverage": [], "Quality": []} for source_id in [0, 1, 2, 3]: # Output filename filename = f"{source_str(source_id)}.txt" groups["Coverage"].append(filename) # Read source data data_files.append(fetch_source(source_id, use_cache=True)) data = pd.read_csv(data_files[-1]) # Generate coverage and write to file # TODO this doesn't allow for column names other than the defaults to # coverage(), above; generalize (output_path / filename).write_text(coverage(data)) # Quality from item.historical import process for check in QUALITY: # Output filename filename = f"{check.__name__.split('.')[-1]}.csv" groups["Quality"].append(filename) data_files.append(output_path / filename) # TODO this is specific to A003; generalize check(process(3), process(9)).to_csv(data_files[-1]) # Archive data files zf = ZipFile( output_path / "data.zip", mode="w", compression=ZIP_DEFLATED, compresslevel=9 ) for path in data_files: zf.write(filename=path, arcname=path.name) groups["Cached raw source data"] = ["data.zip"] # Generate index file t = Template(INDEX_TEMPLATE) (output_path / "index.html").write_text(t.render(groups=groups))