"""Source code for filip.utils.data"""

import os
import importlib
import logging
from pathlib import Path
import pickle
from typing import Dict
import pandas as pd
from pandas import DataFrame
from pandas_datapackage_reader import read_datapackage
from filip.utils.validators import validate_http_url

logger = logging.getLogger(__name__)


def load_datapackage(url: str, package_name: str) -> Dict[str, pd.DataFrame]:
    """Download a data package from an online source and cache it locally.

    On the first call the package is downloaded from ``url`` and stored as
    a directory of CSV files (one file per table) under
    ``filip/data/<package_name>``. Subsequent calls read the cached CSV
    files instead of downloading again.

    Args:
        url (str): Valid url to where the data package is hosted
        package_name (str): name of the cached package directory

    Returns:
        Dict of dataframes, keyed by table name (dashes replaced by
        underscores)
    """
    # validate arguments (original called this twice; once is enough)
    validate_http_url(url=url)
    # create directory for data if it does not exist
    path = Path(__file__).parent.parent.absolute().joinpath('data')
    path.mkdir(parents=True, exist_ok=True)
    package_path = path.joinpath(package_name)
    if package_path.is_dir():
        # read cached data from filip.data if it exists
        logger.info("Found existing data package in 'filip.data'")
        data = {}
        for file in os.listdir(package_path):
            # drop the '.csv' suffix to recover the table name
            file_name = Path(file).stem
            # read in each file as one dataframe; na_filter=False prevents
            # empty strings from being turned into NaN on the round-trip
            frame = pd.read_csv(package_path.joinpath(file),
                                index_col=0,
                                header=0,
                                na_filter=False)
            data[file_name] = frame
    else:
        # download external data and store it in the cache
        logger.info("Could not find data package in 'filip.data'. Will "
                    "try to download from %s", url)
        try:
            data = read_datapackage(url)
            # rename keys so they are valid python identifiers
            data = {k.replace('-', '_'): v for k, v in data.items()}
            package_path.mkdir()
            # store data in filip.data
            for k, v in data.items():
                v: DataFrame = v
                # stringify all cells so the CSV round-trip is lossless
                v.loc[:, :] = v[:].applymap(str)
                # build the file path portably; the original hard-coded a
                # Windows '\\' separator, which breaks on POSIX systems
                table_filepath = package_path.joinpath(f"{k}.csv")
                v.to_csv(table_filepath)
        except Exception:
            # narrowed from a bare 'except:' so KeyboardInterrupt and
            # SystemExit are not swallowed; error is logged and re-raised
            logger.error("Failed to load data package!")
            raise
    return data