"""Source code for datenguidepy.query_helper."""

from datenguidepy.query_builder import Query
from datenguidepy.query_execution import (
    QueryExecutioner,
    ExecutionResults,
    DEFAULT_STATISTICS_META_DATA_PROVIDER,
)
from datenguidepy.translation import DEFAULT_TRANSLATION_PROVIDER, TranslationProvider

from typing import Dict, Any, cast, Optional, List
import pandas as pd
from functools import partial

import os

# Name of the directory (next to this module) that holds bundled data files.
PACKAGE_DATA_DIR = "package_data"
# Absolute path to the bundled package data, resolved relative to this file.
PACKAGE_DATA_PATH = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), PACKAGE_DATA_DIR
)
# Region hierarchy table shipped with the package, loaded once at import
# time for performance.  Indexed by region_id; the helpers below rely on
# its "name", "parent" and "level" columns.
ALL_REGIONS: pd.DataFrame = pd.read_csv(
    os.path.join(PACKAGE_DATA_PATH, "regions.csv"), index_col="region_id"
)


class ConfigMapping:
    """Attribute-style, read-only view of a plain mapping.

    Keys become attributes, ``dir()`` lists them, iteration yields the
    mapped values, and ``repr`` renders one ``name.... value`` row per
    entry (used e.g. for the federal state id lookup below).

    :param mapping: mapping of attribute names to values
    :type mapping: Dict[str, Any]
    """

    def __init__(self, mapping: Dict[str, Any]):
        self._mapping = mapping

    def __getattr__(self, k: str) -> Any:
        try:
            return self._mapping[k]
        except KeyError:
            # __getattr__ must raise AttributeError (not KeyError) so that
            # hasattr(), copy and pickling behave correctly.
            raise AttributeError(k) from None

    def __dir__(self):
        return list(self._mapping.keys())

    def __repr__(self):
        # "value" instead of the original "id" to avoid shadowing the builtin.
        return "\n".join(
            name.ljust(30, ".") + " " + value
            for name, value in self._mapping.items()
        )

    def __iter__(self):
        return self._mapping.values().__iter__()
def hirachy_up(
    lowestids: List[str], hirachy_frame: Optional[pd.DataFrame] = None
) -> pd.DataFrame:
    """Collect the given regions together with all of their ancestors.

    Starting from ``lowestids`` the ``parent`` column is followed upwards
    until regions without a parent are reached.  (The "hirachy" spelling
    is kept for backward compatibility with existing callers.)

    :param lowestids: iterable of region ids to start from
    :param hirachy_frame: region hierarchy table, defaults to ALL_REGIONS
    :raises ValueError: if none of ``lowestids`` is present in the frame
        (``pd.concat`` of an empty list)
    :return: the start regions plus all ancestors, sorted by region id
    :rtype: pd.DataFrame
    """
    if hirachy_frame is None:
        # Resolve the default lazily instead of using the module-level
        # DataFrame as a (mutable) default argument.
        hirachy_frame = ALL_REGIONS
    anscestors = []
    current_ids = lowestids
    while len(current_ids) > 0:
        current_regions = hirachy_frame[hirachy_frame.index.isin(current_ids)]
        anscestors.append(current_regions)
        # Rows with a NaN parent are roots; dropping them terminates the climb.
        current_ids = current_regions.dropna().parent.unique()
    return pd.concat(anscestors).sort_index()
def hirachy_down(
    highest_ids: List[str],
    lowest_level: str = "lau",
    hirachy_frame: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """Collect the given regions together with their descendants.

    Starting from ``highest_ids`` the ``parent`` column is followed
    downwards, level by level, stopping once a generation containing
    ``lowest_level`` has been collected or no more children exist.

    :param highest_ids: iterable of region ids to start from
    :param lowest_level: statistical level at which to stop,
        defaults to "lau"
    :param hirachy_frame: region hierarchy table, defaults to ALL_REGIONS
    :return: the start regions plus their descendants, sorted by region id
    :rtype: pd.DataFrame
    """
    if hirachy_frame is None:
        # Resolve the default lazily instead of using the module-level
        # DataFrame as a (mutable) default argument.
        hirachy_frame = ALL_REGIONS
    descendents = [hirachy_frame[hirachy_frame.index.isin(highest_ids)]]
    current_ids = highest_ids
    while len(current_ids) > 0:
        current_regions = hirachy_frame[hirachy_frame["parent"].isin(current_ids)]
        descendents.append(current_regions)
        current_ids = current_regions.dropna().index.unique()
        # Stop as soon as the requested level appears in this generation.
        if lowest_level in current_regions.level.unique():
            break
    return pd.concat(descendents).sort_index()
def siblings(
    region_id: str, hirachy_frame: Optional[pd.DataFrame] = None
) -> pd.DataFrame:
    """Return all regions that share a parent with the given region.

    The result includes ``region_id`` itself, since it trivially has the
    same parent.

    :param region_id: id of the region whose siblings are requested
    :param hirachy_frame: region hierarchy table, defaults to ALL_REGIONS
    :raises IndexError: if ``region_id`` is not present in the frame
    :return: all regions with the same parent, including the region itself
    :rtype: pd.DataFrame
    """
    if hirachy_frame is None:
        # Resolve the default lazily instead of using the module-level
        # DataFrame as a (mutable) default argument.
        hirachy_frame = ALL_REGIONS
    parent_id = (
        hirachy_frame[hirachy_frame.index == region_id].loc[:, "parent"].iloc[0]
    )
    return hirachy_frame[hirachy_frame["parent"] == parent_id]
def get_regions() -> pd.DataFrame:
    """Return all regions together with their hierarchy structure.

    Each row holds a region's name, its parent region and its statistical
    classification (nuts/lau), indexed by the region id that queries
    require.  For performance the table is simply read from disk at
    import time; regions are not expected to change significantly over
    time, but an up to date version can be fetched with
    ``download_all_regions``.

    :return: DataFrame with all regions.
    """
    # Hand out a copy so callers cannot mutate the module-level table.
    return ALL_REGIONS.copy()
# Convenience lookup: attribute access from federal state name to region id.
state_regions: pd.DataFrame = get_regions().query('level == "nuts1"')
federal_state_dictionary = {}
for state in state_regions.itertuples():
    # Hyphens are not valid in attribute names, so replace them.
    federal_state_dictionary[state.name.replace("-", "_")] = state.Index
federal_states = ConfigMapping(federal_state_dictionary)
def get_statistics(
    search: Optional[str] = None,
    stat_meta_data_provider=None,
    target_language: str = "de",
    translation_provider: Optional["TranslationProvider"] = None,
) -> pd.DataFrame:
    """List all currently available statistics.

    Returns a DataFrame of all available statistics, containing the
    statistic code (required to build queries) along with a short and a
    long description.  By default all statistics are returned; a search
    keyword can be supplied to filter them.  The original descriptions
    are in German, but a machine translated version can be requested.

    :param search: search term, matched non-case-sensitively against the
        short description.
    :param stat_meta_data_provider: source object for the statistic
        descriptions; uses the module default if None.
    :param target_language: language to translate the descriptions to.
        The default translation provider currently supports 'de' and 'en'.
    :param translation_provider: object used for translating the
        statistics; defaults to the default translation provider if None.
    :raises ValueError: if ``target_language`` is not supported by the
        translation provider.
    :return: table with available statistics, indexed by statistic code.
    """
    if stat_meta_data_provider is None:
        stat_meta_data_provider = DEFAULT_STATISTICS_META_DATA_PROVIDER
    if translation_provider is None:
        translation_provider = DEFAULT_TRANSLATION_PROVIDER
    # Validate the target language before doing any work; 'de' is the
    # source language and never needs translation.
    if target_language != "de" and not translation_provider.is_valid_language_code(
        target_language
    ):
        valid_language_codes = str(translation_provider.get_valid_language_codes())
        raise ValueError(
            "Target language {0} is invalid or not available for translation provider, "
            "please use one of {1}".format(target_language, valid_language_codes)
        )
    stat_descr = stat_meta_data_provider.get_stat_descriptions()
    stat_frame = pd.DataFrame(
        [(stat, *descriptions) for stat, descriptions in stat_descr.items()],
        columns=["statistic", "short_description", "long_description"],
    ).set_index("statistic")
    if target_language != "de":
        # NOTE(review): the return value is discarded, so the provider
        # presumably translates stat_frame in place — confirm.
        translation_provider.translate_data_frame_from_german(
            stat_frame, target_language
        )
    if search is not None:
        search_string = cast(str, search)  # noqa: F841
        return stat_frame.query(
            "short_description.str.contains(@search_string,case=False)"
        )
    else:
        return stat_frame
def get_availability_summary() -> pd.DataFrame:
    """Summary of available data for region/statistic combinations.

    Although many regions and statistics exist in the datenguide API,
    data is not available for every combination, and some statistics are
    discontinued after a certain point in time.  This function returns a
    precomputed availability analysis covering all statistics for all
    nuts1, nuts2 and nuts3 regions: for each region/statistic pair it
    lists the ids, the number of database entries and, where applicable,
    the first and last year the statistic appeared.  lau regions and
    statistic drilldowns (e.g. availability for men and women separately)
    are not covered.

    :return: Table with available statistics.
    """
    overview_path = os.path.join(PACKAGE_DATA_PATH, "overview.csv")
    # region_id must stay a string: leading zeros are significant.
    summary = pd.read_csv(overview_path, converters={"region_id": lambda x: str(x)})
    return summary.set_index(["region_id", "statistic"])
def download_all_regions() -> pd.DataFrame:
    """Download all current regions and their hierarchy structure.

    Queries the datenguide API for the full region list and for each
    nuts/lau level separately, joins the level information onto the full
    list and derives each region's parent via longest-prefix match on the
    region ids.  nuts1 regions get the fixed parent "DG" (Germany).

    :raises RuntimeError: if the full region download fails.
    :raises RuntimeError: if the download for one of the levels fails.
    :return: DataFrame indexed by region id with level and parent columns.
    :rtype: pd.DataFrame
    """

    def nuts_query(nuts_level):
        # Query restricted to a single nuts level.
        return Query.all_regions(nuts=nuts_level)

    def lau_query(lau_level):
        # Query restricted to a single lau level.
        return Query.all_regions(lau=lau_level)

    qb_all = Query.all_regions()
    qe = QueryExecutioner()
    # The prints are coarse progress markers for this long-running download.
    print("start")
    all_regions = qe.run_query(qb_all)
    print("all")
    r_nuts1 = qe.run_query(nuts_query(1))
    print("nuts1")
    r_nuts2 = qe.run_query(nuts_query(2))
    print("nuts2")
    r_nuts3 = qe.run_query(nuts_query(3))
    print("nuts3")
    r_lau1 = qe.run_query(lau_query(1))
    print("lau")
    # currently no distinction between different laus
    # on datenguide side
    # r_lau2 = qe.run_query(lau_query(2))
    levels = {
        "nuts1": r_nuts1,
        "nuts2": r_nuts2,
        "nuts3": r_nuts3,
        "lau": r_lau1,
        # 'lau2':r_lau2
    }

    def isAnscestor(region_id, candidate):
        """Return True if candidate id is a proper prefix of region_id."""
        return region_id.startswith(candidate) and candidate != region_id

    def parent(region_id, region_details):
        """Return the id of the closest ancestor of region_id, or None.

        region_details must carry a ``level`` column holding the id
        length, so the longest matching prefix is the direct parent.
        """
        desc = region_details.assign(
            ansc=lambda df: df.index.map(lambda i: isAnscestor(region_id, i))
        ).query("ansc")
        max_lev = desc.level.max()  # noqa: F841
        parent_frame = desc.query("level == @max_lev")
        if not parent_frame.empty:
            # .name of the row Series is its index label, i.e. the parent id.
            return parent_frame.iloc[0, :].name
        else:
            # Fixed: the original had a bare `None` expression here; the
            # explicit return makes the (identical) fall-through obvious.
            return None

    if all_regions is None:
        raise RuntimeError("Was not able to download all regions")
    for k in levels:
        if levels[k] is None:
            raise RuntimeError(f"Was not able to download {k} regions")
    # Flatten the paginated query results into one frame per download.
    all_regions_df = pd.concat(
        [
            pd.DataFrame(page["data"]["allRegions"]["regions"])
            for page in cast(List[ExecutionResults], all_regions)[0].query_results
        ]
    ).set_index("id")
    level_df = pd.concat(
        pd.concat(
            [
                pd.DataFrame(page["data"]["allRegions"]["regions"])
                for page in cast(List[ExecutionResults], levels[k])[0].query_results
            ]
        ).assign(level=k)
        for k in levels
    )
    all_rg_parents = all_regions_df.join(
        level_df.set_index("id").loc[:, "level"]
    ).assign(
        parent=lambda df: df.index.map(
            partial(
                parent,
                # Use the id length as a proxy level for prefix matching.
                region_details=all_regions_df.assign(
                    level=lambda df: df.index.map(len)
                ),
            )
        )
    )
    # nuts1 regions hang directly below the country node "DG".
    all_rg_parents.loc[all_rg_parents.level == "nuts1", "parent"] = "DG"
    return all_rg_parents