"""Source code for datenguidepy.query_helper."""

from datenguidepy.query_builder import Query
from datenguidepy.query_execution import (
    QueryExecutioner,
    ExecutionResults,
    DEFAULT_STATISTICS_META_DATA_PROVIDER,
)
from datenguidepy.translation import DEFAULT_TRANSLATION_PROVIDER, TranslationProvider

from typing import Dict, Any, cast, Optional, List
import pandas as pd
from functools import partial

import os

# Name of the directory (next to this module) that holds bundled data files.
PACKAGE_DATA_DIR = "package_data"
# Absolute path to the bundled package data, resolved relative to this file.
PACKAGE_DATA_PATH = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), PACKAGE_DATA_DIR
)
# Region hierarchy table shipped with the package, loaded once at import
# time for performance.  Indexed by region_id; the helpers below rely on
# its "name", "parent" and "level" columns.
ALL_REGIONS: pd.DataFrame = pd.read_csv(
    os.path.join(PACKAGE_DATA_PATH, "regions.csv"), index_col="region_id"
)


class ConfigMapping:
    """Attribute-style, read-only view of a plain mapping.

    Keys become attributes, ``dir()`` lists them, iteration yields the
    mapped values, and ``repr`` renders one ``name.... value`` row per
    entry (used e.g. for the federal state id lookup below).

    :param mapping: mapping of attribute names to values
    :type mapping: Dict[str, Any]
    """

    def __init__(self, mapping: Dict[str, Any]):
        self._mapping = mapping

    def __getattr__(self, k: str) -> Any:
        try:
            return self._mapping[k]
        except KeyError:
            # __getattr__ must raise AttributeError (not KeyError) so that
            # hasattr(), copy and pickling behave correctly.
            raise AttributeError(k) from None

    def __dir__(self):
        return list(self._mapping.keys())

    def __repr__(self):
        # "value" instead of the original "id" to avoid shadowing the builtin.
        return "\n".join(
            name.ljust(30, ".") + " " + value
            for name, value in self._mapping.items()
        )

    def __iter__(self):
        return self._mapping.values().__iter__()
def hirachy_up(
    lowestids: List[str], hirachy_frame: Optional[pd.DataFrame] = None
) -> pd.DataFrame:
    """Collect the given regions together with all of their ancestors.

    Starting from ``lowestids`` the ``parent`` column is followed upwards
    until regions without a parent are reached.  (The "hirachy" spelling
    is kept for backward compatibility with existing callers.)

    :param lowestids: iterable of region ids to start from
    :param hirachy_frame: region hierarchy table, defaults to ALL_REGIONS
    :raises ValueError: if none of ``lowestids`` is present in the frame
        (``pd.concat`` of an empty list)
    :return: the start regions plus all ancestors, sorted by region id
    :rtype: pd.DataFrame
    """
    if hirachy_frame is None:
        # Resolve the default lazily instead of using the module-level
        # DataFrame as a (mutable) default argument.
        hirachy_frame = ALL_REGIONS
    anscestors = []
    current_ids = lowestids
    while len(current_ids) > 0:
        current_regions = hirachy_frame[hirachy_frame.index.isin(current_ids)]
        anscestors.append(current_regions)
        # Rows with a NaN parent are roots; dropping them terminates the climb.
        current_ids = current_regions.dropna().parent.unique()
    return pd.concat(anscestors).sort_index()
def hirachy_down(
    highest_ids: List[str],
    lowest_level: str = "lau",
    hirachy_frame: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """Collect the given regions together with their descendants.

    Starting from ``highest_ids`` the ``parent`` column is followed
    downwards, level by level, stopping once a generation containing
    ``lowest_level`` has been collected or no more children exist.

    :param highest_ids: iterable of region ids to start from
    :param lowest_level: statistical level at which to stop,
        defaults to "lau"
    :param hirachy_frame: region hierarchy table, defaults to ALL_REGIONS
    :return: the start regions plus their descendants, sorted by region id
    :rtype: pd.DataFrame
    """
    if hirachy_frame is None:
        # Resolve the default lazily instead of using the module-level
        # DataFrame as a (mutable) default argument.
        hirachy_frame = ALL_REGIONS
    descendents = [hirachy_frame[hirachy_frame.index.isin(highest_ids)]]
    current_ids = highest_ids
    while len(current_ids) > 0:
        current_regions = hirachy_frame[hirachy_frame["parent"].isin(current_ids)]
        descendents.append(current_regions)
        current_ids = current_regions.dropna().index.unique()
        # Stop as soon as the requested level appears in this generation.
        if lowest_level in current_regions.level.unique():
            break
    return pd.concat(descendents).sort_index()
def siblings(
    region_id: str, hirachy_frame: Optional[pd.DataFrame] = None
) -> pd.DataFrame:
    """Return all regions that share a parent with the given region.

    The result includes ``region_id`` itself, since it trivially has the
    same parent.

    :param region_id: id of the region whose siblings are requested
    :param hirachy_frame: region hierarchy table, defaults to ALL_REGIONS
    :raises IndexError: if ``region_id`` is not present in the frame
    :return: all regions with the same parent, including the region itself
    :rtype: pd.DataFrame
    """
    if hirachy_frame is None:
        # Resolve the default lazily instead of using the module-level
        # DataFrame as a (mutable) default argument.
        hirachy_frame = ALL_REGIONS
    parent_id = (
        hirachy_frame[hirachy_frame.index == region_id].loc[:, "parent"].iloc[0]
    )
    return hirachy_frame[hirachy_frame["parent"] == parent_id]
def get_regions() -> pd.DataFrame:
    """Return all regions together with their hierarchy structure.

    Each row holds a region's name, its parent region and its statistical
    classification (nuts/lau), indexed by the region id that queries
    require.  For performance the table is simply read from disk at
    import time; regions are not expected to change significantly over
    time, but an up to date version can be fetched with
    ``download_all_regions``.

    :return: DataFrame with all regions.
    """
    # Hand out a copy so callers cannot mutate the module-level table.
    return ALL_REGIONS.copy()
# Convenience lookup: attribute access from federal state name to region id.
state_regions: pd.DataFrame = get_regions().query('level == "nuts1"')
federal_state_dictionary = {}
for state in state_regions.itertuples():
    # Hyphens are not valid in attribute names, so replace them.
    federal_state_dictionary[state.name.replace("-", "_")] = state.Index
federal_states = ConfigMapping(federal_state_dictionary)
def get_statistics(
    search: Optional[str] = None,
    stat_meta_data_provider=None,
    target_language: str = "de",
    translation_provider: Optional["TranslationProvider"] = None,
) -> pd.DataFrame:
    """List all currently available statistics.

    Returns a DataFrame of all available statistics, containing the
    statistic code (required to build queries) along with a short and a
    long description.  By default all statistics are returned; a search
    keyword can be supplied to filter them.  The original descriptions
    are in German, but a machine translated version can be requested.

    :param search: search term, matched non-case-sensitively against the
        short description.
    :param stat_meta_data_provider: source object for the statistic
        descriptions; uses the module default if None.
    :param target_language: language to translate the descriptions to.
        The default translation provider currently supports 'de' and 'en'.
    :param translation_provider: object used for translating the
        statistics; defaults to the default translation provider if None.
    :raises ValueError: if ``target_language`` is not supported by the
        translation provider.
    :return: table with available statistics, indexed by statistic code.
    """
    if stat_meta_data_provider is None:
        stat_meta_data_provider = DEFAULT_STATISTICS_META_DATA_PROVIDER
    if translation_provider is None:
        translation_provider = DEFAULT_TRANSLATION_PROVIDER
    # Validate the target language before doing any work; 'de' is the
    # source language and never needs translation.
    if target_language != "de" and not translation_provider.is_valid_language_code(
        target_language
    ):
        valid_language_codes = str(translation_provider.get_valid_language_codes())
        raise ValueError(
            "Target language {0} is invalid or not available for translation provider, "
            "please use one of {1}".format(target_language, valid_language_codes)
        )
    stat_descr = stat_meta_data_provider.get_stat_descriptions()
    stat_frame = pd.DataFrame(
        [(stat, *descriptions) for stat, descriptions in stat_descr.items()],
        columns=["statistic", "short_description", "long_description"],
    ).set_index("statistic")
    if target_language != "de":
        # NOTE(review): the return value is discarded, so the provider
        # presumably translates stat_frame in place — confirm.
        translation_provider.translate_data_frame_from_german(
            stat_frame, target_language
        )
    if search is not None:
        search_string = cast(str, search)  # noqa: F841
        return stat_frame.query(
            "short_description.str.contains(@search_string,case=False)"
        )
    else:
        return stat_frame
def get_availability_summary() -> pd.DataFrame:
    """Summary of available data for region/statistic combinations.

    Although many regions and statistics exist in the datenguide API,
    data is not available for every combination, and some statistics are
    discontinued after a certain point in time.  This function returns a
    precomputed availability analysis covering all statistics for all
    nuts1, nuts2 and nuts3 regions: for each region/statistic pair it
    lists the ids, the number of database entries and, where applicable,
    the first and last year the statistic appeared.  lau regions and
    statistic drilldowns (e.g. availability for men and women separately)
    are not covered.

    :return: Table with available statistics.
    """
    overview_path = os.path.join(PACKAGE_DATA_PATH, "overview.csv")
    # region_id must stay a string: leading zeros are significant.
    summary = pd.read_csv(overview_path, converters={"region_id": lambda x: str(x)})
    return summary.set_index(["region_id", "statistic"])
def download_all_regions() -> pd.DataFrame:
    """Download all current regions and their hierarchy structure.

    Queries the datenguide API for the full region list and for each
    nuts/lau level separately, joins the level information onto the full
    list and derives each region's parent via longest-prefix match on the
    region ids.  nuts1 regions get the fixed parent "DG" (Germany).

    :raises RuntimeError: if the full region download fails.
    :raises RuntimeError: if the download for one of the levels fails.
    :return: DataFrame indexed by region id with level and parent columns.
    :rtype: pd.DataFrame
    """

    def nuts_query(nuts_level):
        # Query restricted to a single nuts level.
        return Query.all_regions(nuts=nuts_level)

    def lau_query(lau_level):
        # Query restricted to a single lau level.
        return Query.all_regions(lau=lau_level)

    qb_all = Query.all_regions()
    qe = QueryExecutioner()
    # The prints are coarse progress markers for this long-running download.
    print("start")
    all_regions = qe.run_query(qb_all)
    print("all")
    r_nuts1 = qe.run_query(nuts_query(1))
    print("nuts1")
    r_nuts2 = qe.run_query(nuts_query(2))
    print("nuts2")
    r_nuts3 = qe.run_query(nuts_query(3))
    print("nuts3")
    r_lau1 = qe.run_query(lau_query(1))
    print("lau")
    # currently no distinction between different laus
    # on datenguide side
    # r_lau2 = qe.run_query(lau_query(2))
    levels = {
        "nuts1": r_nuts1,
        "nuts2": r_nuts2,
        "nuts3": r_nuts3,
        "lau": r_lau1,
        # 'lau2':r_lau2
    }

    def isAnscestor(region_id, candidate):
        """Return True if candidate id is a proper prefix of region_id."""
        return region_id.startswith(candidate) and candidate != region_id

    def parent(region_id, region_details):
        """Return the id of the closest ancestor of region_id, or None.

        region_details must carry a ``level`` column holding the id
        length, so the longest matching prefix is the direct parent.
        """
        desc = region_details.assign(
            ansc=lambda df: df.index.map(lambda i: isAnscestor(region_id, i))
        ).query("ansc")
        max_lev = desc.level.max()  # noqa: F841
        parent_frame = desc.query("level == @max_lev")
        if not parent_frame.empty:
            # .name of the row Series is its index label, i.e. the parent id.
            return parent_frame.iloc[0, :].name
        else:
            # Fixed: the original had a bare `None` expression here; the
            # explicit return makes the (identical) fall-through obvious.
            return None

    if all_regions is None:
        raise RuntimeError("Was not able to download all regions")
    for k in levels:
        if levels[k] is None:
            raise RuntimeError(f"Was not able to download {k} regions")
    # Flatten the paginated query results into one frame per download.
    all_regions_df = pd.concat(
        [
            pd.DataFrame(page["data"]["allRegions"]["regions"])
            for page in cast(List[ExecutionResults], all_regions)[0].query_results
        ]
    ).set_index("id")
    level_df = pd.concat(
        pd.concat(
            [
                pd.DataFrame(page["data"]["allRegions"]["regions"])
                for page in cast(List[ExecutionResults], levels[k])[0].query_results
            ]
        ).assign(level=k)
        for k in levels
    )
    all_rg_parents = all_regions_df.join(
        level_df.set_index("id").loc[:, "level"]
    ).assign(
        parent=lambda df: df.index.map(
            partial(
                parent,
                # Use the id length as a proxy level for prefix matching.
                region_details=all_regions_df.assign(
                    level=lambda df: df.index.map(len)
                ),
            )
        )
    )
    # nuts1 regions hang directly below the country node "DG".
    all_rg_parents.loc[all_rg_parents.level == "nuts1", "parent"] = "DG"
    return all_rg_parents