"""Diagnostics for historical data sets."""fromimportlibimportimport_modulefrompathlibimportPathimportpandasaspdfromitem.historicalimportfetch_source,source_str# Quality checksQUALITY=["A001","A002","A003"]# Jinja2 template for diagnostics index pageINDEX_TEMPLATE="""<html><body>{% for group_name, paths in groups.items() %}<h1>{{ group_name|title }}</h1><ul>{% for path in paths %} <li><a href="./{{ path }}">{{ path }}</a></li>{% endfor %}</ul>{% endfor %}</body></html>"""# Template for coverage()COV_TEXT="""{N_area} areas: {areas}{N_measures} measures: {measures}{N_periods} periods: {periods[0]}–{last_period}Measure-by-area coverage:"""
def coverage(df, area="COUNTRY", measure="VARIABLE", period="TIME_PERIOD"):
    """Return information about the coverage of a data set.

    The report starts with a header (from ``COV_TEXT``) that lists the distinct
    areas, measures and periods in `df`. It is followed, for every measure, by
    one line per area giving the number of observations, the first and last
    period of the series, the number of gaps (omitted when zero) and the number
    of non-null values.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to describe. Besides the columns named by `area`, `measure` and
        `period`, it must contain "value" and "OBS_STATUS" columns whenever it
        has at least one row.
    area : str, optional
        Name of the column that identifies the area.
    measure : str, optional
        Name of the column that identifies the measure.
    period : str, optional
        Name of the column that identifies the time period.

    Returns
    -------
    str
        The report. For an empty `df`, only the header is returned, with zero
        counts and blank first/last periods.
    """
    # Distinct labels along each dimension, for the header
    areas = sorted(df[area].unique())
    measures = sorted(df[measure].unique())
    periods = sorted(df[period].unique())

    # COV_TEXT indexes periods[0]; substitute a blank placeholder so that an
    # empty data set yields a header-only report instead of IndexError
    shown_periods = periods or [""]

    # Collect pieces of the report and join them once at the end
    lines = [
        COV_TEXT.format(
            N_area=len(areas),
            # str() so that non-string area labels do not break the join
            areas=" ".join(map(str, areas)),
            N_measures=len(measures),
            measures=measures,
            N_periods=len(periods),
            periods=shown_periods,
            last_period=shown_periods[-1],
        )
    ]

    # Position of each period in the overall, sorted list of periods
    position = {p: i for i, p in enumerate(periods)}

    # Non-null count of every other column, per (measure, area) series
    counts = df.groupby([measure, area]).count()

    for m, df0 in df.groupby(measure):
        lines.append(f"\n{m}\n")

        for a, df1 in df0.groupby(area):
            series_counts = counts.xs((m, a))

            # Number of observations. Some observations have a status, but no
            # value
            obs = max(series_counts[["value", "OBS_STATUS"]])

            # Number of observations that have a value
            values = series_counts["value"]

            # Periods appearing in this series
            gp = sorted(df1[period].unique())

            # Periods spanned by the series, first to last inclusive, less the
            # periods that actually have an observation
            missing = (position[gp[-1]] + 1 - position[gp[0]]) - obs
            gaps = f" ({missing} gaps)" if missing else ""

            # Assemble line
            lines.append(
                f" {a}: {obs} obs {gp[0]}–{gp[-1]}{gaps}; {values} values\n"
            )

    return "".join(lines)
def run_all(output_path):
    """Run all diagnostics.

    The following files are written to `output_path`:

    - "<source>.txt" for each of the sources 0–3, containing the coverage()
      report for the fetched source data; "<source>" is given by source_str().
    - "<check>.csv" for each entry in ``QUALITY``, containing the result of
      that check's ``compute()`` function.
    - "data.zip", an archive of the fetched source data files and the quality
      check CSV files.
    - "index.html", an index page linking to all of the above.

    Parameters
    ----------
    output_path : str or pathlib.Path
        Directory for the output files. It is created, including any missing
        parents, if it does not exist.
    """
    from zipfile import ZIP_DEFLATED, ZipFile

    from jinja2 import Template

    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)

    # Files to be added to the archive
    data_files = []

    # Paths to list on the index page, by group
    groups = {"Coverage": [], "Quality": []}

    # Coverage
    for source_id in [0, 1, 2, 3]:
        # Output filename
        filename = f"{source_str(source_id)}.txt"
        groups["Coverage"].append(filename)

        # Read source data. Path() leaves a Path unchanged and guards against a
        # plain str, because the archive step below needs the .name attribute
        source_file = Path(fetch_source(source_id, use_cache=True))
        data_files.append(source_file)
        data = pd.read_csv(source_file)

        # Generate coverage and write to file. The report contains non-ASCII
        # characters, so do not rely on the locale's default encoding.
        # TODO this doesn't allow for column names other than the defaults to
        #      coverage(), above; generalize
        (output_path / filename).write_text(coverage(data), encoding="utf-8")

    # Quality checks
    from item.historical import process

    for check in QUALITY:
        # Import
        check_module = import_module(f"item.historical.diagnostic.{check}")

        # Output filename
        filename = f"{check}.csv"
        groups["Quality"].append(filename)
        check_file = output_path / filename
        data_files.append(check_file)

        # Generate inputs
        inputs = [process(arg) for arg in check_module.ARGS]

        # Compute and save
        check_module.compute(*inputs).to_csv(check_file)

    # Archive data files. The context manager closes the archive, which is what
    # writes its central directory; without it the file may be left incomplete
    with ZipFile(
        output_path / "data.zip", mode="w", compression=ZIP_DEFLATED, compresslevel=9
    ) as zf:
        for path in data_files:
            zf.write(filename=path, arcname=path.name)

    groups["Cached raw source data"] = ["data.zip"]

    # Generate index file
    index_html = Template(INDEX_TEMPLATE).render(groups=groups)
    (output_path / "index.html").write_text(index_html, encoding="utf-8")