Coverage for filip/utils/data.py: 71%

35 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2025-02-19 11:48 +0000

1import os 

2import importlib 

3import logging 

4from pathlib import Path 

5import pickle 

6from typing import Dict 

7import pandas as pd 

8from pandas import DataFrame 

9from pandas_datapackage_reader import read_datapackage 

10from filip.utils.validators import validate_http_url 

11 

12logger = logging.getLogger(__name__) 

13 

14 

def load_datapackage(url: str, package_name: str) -> Dict[str, pd.DataFrame]:
    """
    Load a data package, preferring a local CSV cache over a fresh download.

    The cache lives in a ``data`` directory next to the parent package of
    this module, in a sub-directory named ``package_name``. Every table of
    the package is stored there as ``<table_name>.csv``. If that directory
    already contains CSV files they are read back; otherwise the package is
    downloaded from ``url``, all values are converted to strings, the tables
    are written to the cache and then returned.

    Args:
        url (str): Valid url to where the data package is hosted
        package_name (str): name of the cache directory for this package.

    Returns:
        Dict of dataframes, keyed by table name (``-`` replaced by ``_``
        for freshly downloaded packages).

    Raises:
        Exception: whatever the download or the writing of the cache
            raises is logged and re-raised unchanged.
    """
    # validate arguments
    validate_http_url(url=url)

    # create directory for data if not exists
    path = Path(__file__).parent.parent.absolute().joinpath("data")
    path.mkdir(parents=True, exist_ok=True)
    package_path = path.joinpath(package_name)

    # Only ``*.csv`` entries count as cached tables. A directory that exists
    # but holds no CSV file (e.g. left behind by an interrupted download) is
    # treated like a missing cache so that it gets repopulated.
    cached_files = sorted(package_path.glob("*.csv")) if package_path.is_dir() else []

    if cached_files:
        # read data from filip.data if exists
        logger.info("Found existing data package in 'filip.data'")

        # read in each file as one dataframe, prevents the deletion of NaN
        # values with na_filter=False
        return {
            csv_file.stem: pd.read_csv(
                csv_file, index_col=0, header=0, na_filter=False
            )
            for csv_file in cached_files
        }

    # download external data and store data
    logger.info(
        "Could not find data package in 'filip.data'. Will "
        "try to download from %s",
        url,
    )
    try:
        downloaded = read_datapackage(url)
        package_path.mkdir(exist_ok=True)

        # ``DataFrame.applymap`` was renamed to ``DataFrame.map`` in newer
        # pandas releases; pick whichever the installed version provides.
        if hasattr(pd.DataFrame, "map"):
            elementwise = pd.DataFrame.map
        else:
            elementwise = pd.DataFrame.applymap

        # store data in filip.data
        data = {}
        for key, frame in downloaded.items():
            # rename keys
            table_name = key.replace("-", "_")
            frame_as_str = elementwise(frame, str)
            # join with pathlib so the file lands inside the package
            # directory on every operating system
            frame_as_str.to_csv(package_path.joinpath(f"{table_name}.csv"))
            data[table_name] = frame_as_str

    except Exception:
        logger.error("Failed to load data package!")
        raise
    return data