Coverage for filip/utils/data.py: 71%
35 statements
« prev ^ index » next coverage.py v7.4.4, created at 2025-02-19 11:48 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2025-02-19 11:48 +0000
1import os
2import importlib
3import logging
4from pathlib import Path
5import pickle
6from typing import Dict
7import pandas as pd
8from pandas import DataFrame
9from pandas_datapackage_reader import read_datapackage
10from filip.utils.validators import validate_http_url
logger = logging.getLogger(__name__)


def load_datapackage(url: str, package_name: str) -> Dict[str, pd.DataFrame]:
    """
    Load a data package, using a local CSV cache below ``filip/data``.

    If the directory ``filip/data/<package_name>`` already exists, every
    ``*.csv`` file in it is read into one dataframe. Otherwise the package is
    downloaded from ``url``, each table is written to
    ``filip/data/<package_name>/<table>.csv`` and the downloaded tables are
    returned.

    Args:
        url (str): Valid url to where the data package is hosted
        package_name (str): Name of the cache directory inside ``filip/data``.

    Returns:
        Dict of dataframes keyed by table name. For downloaded packages any
        "-" in a table name is replaced by "_"; for cached packages the key
        is the csv file name without its extension.

    Raises:
        Exception: Any error raised while downloading or storing the package
            is logged and re-raised unchanged.
    """
    # validate arguments
    validate_http_url(url=url)

    # create directory for data if not exists
    path = Path(__file__).parent.parent.absolute().joinpath("data")
    path.mkdir(parents=True, exist_ok=True)
    package_path = path.joinpath(package_name)

    if package_path.is_dir():
        # read data from filip.data if exists
        logger.info("Found existing data package in 'filip.data'")

        data = {}
        # Only consider csv files so that stray entries (e.g. OS metadata
        # files) are neither parsed nor turned into bogus table names.
        for csv_file in sorted(package_path.glob("*.csv")):
            # read in each file as one dataframe; na_filter=False keeps the
            # stored cell text instead of converting it to NaN
            data[csv_file.stem] = pd.read_csv(
                csv_file, index_col=0, header=0, na_filter=False
            )
        return data

    # download external data and store data
    logger.info(
        "Could not find data package in 'filip.data'. Will "
        "try to download from %s",
        url,
    )
    try:
        data = read_datapackage(url)
        # rename keys
        data = {k.replace("-", "_"): v for k, v in data.items()}
        package_path.mkdir()

        # store data in filip.data
        for table_name, frame in data.items():
            # every cell is stored (and returned) as its string representation
            frame.loc[:, :] = frame[:].applymap(str)
            # joinpath uses the platform's separator; a hard-coded "\\" would
            # create a wrongly named file outside the package directory on
            # non-Windows systems and leave the cache directory empty
            frame.to_csv(package_path.joinpath(f"{table_name}.csv"))

    except Exception:
        logger.error("Failed to load data package!")
        raise
    return data