Coverage for filip/utils/data.py: 71%
35 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-11-20 16:54 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2024-11-20 16:54 +0000
1import os
2import importlib
3import logging
4from pathlib import Path
5import pickle
6from typing import Dict
7import pandas as pd
8from pandas import DataFrame
9from pandas_datapackage_reader import read_datapackage
10from filip.utils.validators import validate_http_url
12logger = logging.getLogger(__name__)
def load_datapackage(url: str, package_name: str) -> Dict[str, pd.DataFrame]:
    """
    Load a data package, using a local CSV cache inside the ``data``
    directory of the filip package.

    If the directory ``<filip>/data/<package_name>`` already exists, every
    ``*.csv`` file in it is read into one dataframe and nothing is
    downloaded. Otherwise the package is downloaded from ``url``, each table
    is converted to strings and stored as ``<table_name>.csv`` in that
    directory.

    Args:
        url (str): Valid url to where the data package is hosted
        package_name (str): Name of the directory the package is cached in

    Returns:
        Dict of dataframes, keyed by table name. Names of downloaded tables
        have '-' replaced by '_'.

    Raises:
        Exception: Any error raised while downloading or storing the package
            is logged and re-raised unchanged. Errors from the url
            validation are propagated as raised by ``validate_http_url``.
    """
    # validate arguments
    validate_http_url(url=url)

    # create directory for data if not exists
    data_dir = Path(__file__).parent.parent.absolute().joinpath('data')
    data_dir.mkdir(parents=True, exist_ok=True)
    package_path = data_dir.joinpath(package_name)

    if package_path.is_dir():
        # read data from filip.data if exists
        logger.info("Found existing data package in 'filip.data'")

        data = {}
        for csv_file in package_path.iterdir():
            # only the tables stored by this function are part of the cache;
            # skip stray entries (e.g. OS metadata files)
            if csv_file.suffix.lower() != '.csv' or not csv_file.is_file():
                continue
            # read in each file as one dataframe, prevents the deletion of
            # NaN values with na_filter=False
            data[csv_file.stem] = pd.read_csv(csv_file,
                                              index_col=0,
                                              header=0,
                                              na_filter=False)
        return data

    # download external data and store data
    logger.info("Could not find data package in 'filip.data'. Will "
                "try to download from %s", url)
    created_cache_dir = False
    try:
        data = read_datapackage(url)
        # rename keys
        data = {k.replace('-', '_'): v for k, v in data.items()}
        os.mkdir(package_path)
        created_cache_dir = True

        # store data in filip.data
        for table_name, frame in data.items():
            # NOTE(review): DataFrame.applymap is deprecated in recent
            # pandas releases in favour of DataFrame.map - confirm the
            # minimum supported pandas version before switching.
            frame.loc[:, :] = frame[:].applymap(str)
            # build the path with pathlib so the separator is correct on
            # every platform (a hard-coded '\\' only works on Windows)
            frame.to_csv(package_path.joinpath(f"{table_name}.csv"))

    except Exception:
        logger.error("Failed to load data package!")
        if created_cache_dir:
            # do not leave a partially written cache behind; a later call
            # would otherwise silently read it as a complete package
            import shutil
            shutil.rmtree(package_path, ignore_errors=True)
        raise
    return data