Coverage for filip/utils/data.py: 71%

35 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-11-20 16:54 +0000

1import os 

2import importlib 

3import logging 

4from pathlib import Path 

5import pickle 

6from typing import Dict 

7import pandas as pd 

8from pandas import DataFrame 

9from pandas_datapackage_reader import read_datapackage 

10from filip.utils.validators import validate_http_url 

11 

12logger = logging.getLogger(__name__) 

13 

14 

def load_datapackage(url: str, package_name: str) -> Dict[str, pd.DataFrame]:
    """
    Loads a data package and caches it as csv-files in 'filip.data'.

    If the directory <package_name> inside the 'data' directory of the
    package already contains csv-files, these are read and returned.
    Otherwise the data package is downloaded from the given url, every table
    is stored as <table_name>.csv in that directory and the downloaded
    tables are returned.

    Args:
        url (str): Valid url to where the data package is hosted
        package_name (str): name of the directory the package is cached in.

    Returns:
        Dict of dataframes, keyed by table name. For downloaded packages
        every '-' in a table name is replaced by '_'.

    Raises:
        Exception: Any error raised while downloading or storing the data
            package is logged and re-raised unchanged.
    """
    # validate arguments
    validate_http_url(url=url)

    # create directory for data if not exists
    path = Path(__file__).parent.parent.absolute().joinpath('data')
    path.mkdir(parents=True, exist_ok=True)
    package_path = path.joinpath(package_name)

    # Only a directory that actually holds csv-files counts as a cache hit.
    # An empty directory (e.g. left behind by an interrupted download) must
    # not be returned as an empty package forever.
    cached_files = sorted(package_path.glob('*.csv')) \
        if package_path.is_dir() else []

    if cached_files:
        # read data from filip.data if exists
        logger.info("Found existing data package in 'filip.data'")

        data = {}
        for csv_file in cached_files:
            # read in each file as one dataframe, prevents the deletion of
            # NaN values with na_filter=False
            data[csv_file.stem] = pd.read_csv(csv_file,
                                              index_col=0,
                                              header=0,
                                              na_filter=False)
        return data

    # download external data and store data
    logger.info("Could not find data package in 'filip.data'. Will "
                "try to download from %s", url)
    try:
        data = read_datapackage(url)
        # rename keys
        data = {k.replace('-', '_'): v for k, v in data.items()}
        # exist_ok: the directory may already exist but be empty (see above)
        package_path.mkdir(exist_ok=True)

        # store data in filip.data
        for k, v in data.items():
            # Convert every cell to str in place, so the returned frames
            # hold the same string values that are written to disk.
            # NOTE(review): DataFrame.applymap is reported as deprecated in
            # recent pandas releases (DataFrame.map) - confirm the supported
            # pandas range before switching.
            v.loc[:, :] = v[:].applymap(str)
            # Build the path with pathlib instead of a hard-coded '\\' so
            # the file ends up inside the package directory on every OS.
            v.to_csv(package_path.joinpath(f"{k}.csv"))

    except Exception:
        logger.error("Failed to load data package!")
        raise
    return data