Coverage for ebcpy/utils/reproduction.py: 76%

145 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2025-01-04 08:02 +0000

1""" 

2This module contains scripts to extract information 

3out of simulation / programming based research and 

4enable a reproduction of the results at a later stage. 

5""" 

6import json 

7import pathlib 

8import sys 

9import platform 

10import os 

11import logging 

12from typing import List, Tuple, Union 

13import zipfile 

14from datetime import datetime 

15from dataclasses import dataclass 

16from importlib.metadata import distributions 

17 

18logger = logging.getLogger(__name__) 

19 

20 

@dataclass
class ReproductionFile:
    """
    Data-class describing a text file which will be written to the zip.

    :param str filename:
        Name of the file in the zip. Can be a relative path.
    :param str content:
        Content of the text file.
    """
    filename: str
    content: str

34 

35 

@dataclass
class CopyFile:
    """
    Data-class for information on a file
    which will be copied to the zip.

    :param str filename:
        Name of the file in the zip. Can be a relative path.
    :param pathlib.Path sourcepath:
        Path on the current machine where the file to copy
        is located. Plain strings are accepted as well.
    :param bool remove:
        Default is False. If True, the file will be moved
        instead of just copied.
    """
    filename: str
    sourcepath: Union[pathlib.Path, str]
    # Defaults to the non-destructive behaviour so that callers have to
    # opt in explicitly before a source file gets deleted.
    remove: bool = False

53 

54 

def save_reproduction_archive(
        title: str = None,
        path: Union[pathlib.Path, str] = None,
        log_message: str = None,
        files: List[Union[ReproductionFile, CopyFile]] = None,
        file: Union[pathlib.Path, str] = None,
        search_on_pypi: bool = False
):
    """
    Function to save a reproduction archive which contains
    files to reproduce any simulation/software based study.

    :param str title:
        Title of the study. If not given, the name of ``file``
        without the ``.py`` suffix is used.
    :param pathlib.Path path:
        Where to store the .zip file. If not given, os.getcwd() is used.
    :param str log_message:
        Specific message for this run of the study. If given,
        you are not asked at the end of your script to give the
        log_message.
    :param list files:
        List of files to save along the standard ones.
        Examples would be plots, tables etc.
        Entries may be ReproductionFile, CopyFile or a string with the
        path of an existing file (stored under ``Results/`` in the zip).
        The given list itself is not modified.
    :param pathlib.Path file:
        The file which is used to run.
        Default is __file__ of __main__ module
    :param bool search_on_pypi:
        If True, all python packages which are
        not a git-repo are checked for availability on pypi
        Default is False. Does not work if no internet connection
        is available.

    :return pathlib.Path:
        Path of the created .zip file.

    :raises TypeError:
        If an entry of ``files`` is neither a string,
        a ReproductionFile nor a CopyFile.
    """
    _py_requirements_name = "python/requirements.txt"
    if path is None:
        path = os.getcwd()
    if file is None:
        file = sys.modules['__main__'].__file__
    # Convert once, up front: `file` may be passed as a plain string,
    # which has no `.name` attribute.
    file_running = pathlib.Path(file).absolute()
    if title is None:
        # Only strip a trailing '.py'; a '.py' somewhere inside the
        # file name must stay part of the title.
        if file_running.suffix == ".py":
            title = file_running.stem
        else:
            title = file_running.name
    path = pathlib.Path(path)
    os.makedirs(path, exist_ok=True)
    # Work on a copy so the list of the caller is never extended
    # with the automatically generated entries.
    files = [] if files is None else list(files)
    current_time = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Start with the file currently running:
    files.append(ReproductionFile(
        filename=file_running.name,
        content=file_running.read_text()
    ))
    # Check if it's a git-repo:
    for _dir_path in [file_running] + list(file_running.parents):
        repo_info = get_git_information(
            path=_dir_path,
            zip_folder_path="study_repository"
        )
        if repo_info is not None:  # That means it's a repo
            files.extend(repo_info.pop("difference_files", []))
            files.append(ReproductionFile(
                filename="study_repository/repo_info.txt",
                content=json.dumps(repo_info, indent=2)
            ))
            break
    # Get log
    if log_message is None:
        log_message = input("Please enter the specifications / log for this study: ")
        if not log_message:
            log_message = "The user was to lazy to pass any useful information on " \
                          "what made this research study different to others."

    # The study log is appended to, so that it lists all runs of a study.
    with open(path.joinpath(f"Study_Log_{title}.txt"), "a+") as f:
        f.write(f"{current_time}: {log_message}\n")

    # General info
    files.append(ReproductionFile(
        filename="Information_to_reproduce.txt",
        content=_get_general_information(
            title=title,
            log_message=log_message,
            current_time=current_time
        )
    ))

    # Python-Reproduction:
    py_requirements_content, diff_files, pip_version = _get_python_package_information(
        search_on_pypi=search_on_pypi
    )
    files.append(ReproductionFile(
        filename=_py_requirements_name,
        content=py_requirements_content,
    ))
    files.extend(diff_files)

    py_repro = _get_python_reproduction(
        title=title,
        pip_version=pip_version
    )
    files.append(ReproductionFile(
        filename="python/Reproduce_python_environment.txt",
        content=py_repro,
    ))

    zip_file_name = path.joinpath(
        f"{current_time}_{title}.zip"
    )
    with zipfile.ZipFile(zip_file_name, "w", zipfile.ZIP_DEFLATED) as zip_file:
        # Save all result files:
        for entry in files:
            if isinstance(entry, str):
                if os.path.exists(entry):
                    zip_file.write(entry, f"Results/{pathlib.Path(entry).name}")
                else:
                    logger.error("Given file '%s' is a string but "
                                 "not an existing file. Skipping...", entry)
            elif isinstance(entry, ReproductionFile):
                zip_file.writestr(entry.filename, entry.content)
            elif isinstance(entry, CopyFile):
                zip_file.write(entry.sourcepath, entry.filename)
                if entry.remove:
                    # Removing is best effort: the file is already
                    # stored in the zip at this point.
                    try:
                        os.remove(entry.sourcepath)
                    except PermissionError:
                        logger.error("Could not remove %s", entry.sourcepath)
            else:
                raise TypeError(
                    f"Given file '{entry}' has no "
                    f"valid type. Type is '{type(entry)}'")
    return zip_file_name

184 

185 

def get_git_information(
        path: pathlib.Path,
        name: str = None,
        zip_folder_path: str = None
):
    """
    Function to get the git information for a given path.

    :param pathlib.Path path:
        Path to possible git repo
    :param str name:
        Name of the repo.
        If not given, the name in the URL will be used. If the
        repository has no remote, the name of ``path`` is used.
    :param str zip_folder_path:
        If given, the PATH of the difference_files for the .zip
        will be zip_folder_path plus WARNING_GIT_DIFFERENCE...

    Returns:
        If the path is not a git repository, this function returns None.
        Else, a dictionary with the keys 'url', 'commit' and 'difference_files'.
        'url' is an empty string if the repository has no remote.

    :raises ImportError:
        If GitPython can not be imported.
    """
    try:
        from git import Repo, InvalidGitRepositoryError, RemoteReference
    except ImportError as err:
        raise ImportError(
            "Could not save data for reproduction, install GitPython using "
            "`pip install GitPython`: " + str(err)
        ) from err
    try:
        repo = Repo(path)
    except InvalidGitRepositoryError:
        return None
    commit = repo.head.commit
    commit_hex = commit.hexsha
    diff_last_cmt = repo.git.diff(commit)
    diff_remote_main = ""
    remote_main_cmt = ""
    for ref in repo.references:
        if isinstance(ref, RemoteReference) and ref.name in ('origin/master', 'origin/main'):
            diff_remote_main = repo.git.diff(ref.commit)
            remote_main_cmt = ref.commit.hexsha
            break
    # A purely local repository has no remote at all; indexing
    # repo.remotes[0] would raise an IndexError in that case.
    url = next(repo.remotes[0].urls, "") if repo.remotes else ""
    data = {
        "url": url,
        "commit": commit_hex,
        "difference_files": []
    }

    if name is None:
        # Get last part of url, without a trailing '.git'
        name = url.rstrip("/").split("/")[-1]
        if name.endswith(".git"):
            name = name[:-len(".git")]
        if not name:
            # No usable url: fall back to the name of the repo folder
            name = pathlib.Path(path).absolute().name
    if zip_folder_path is None:
        zip_folder_path = ""
    else:
        zip_folder_path += "/"
    # Check new files
    if diff_last_cmt:
        data["difference_files"].append(ReproductionFile(
            filename=f"{zip_folder_path}WARNING_GIT_DIFFERENCE_{name}_to_local_head.txt",
            content=diff_last_cmt,
        ))
    # Check if pushed to remote
    if not repo.git.branch("-r", contains=commit_hex):
        if remote_main_cmt:
            # Reference the remote main commit and ship the
            # difference to it, as the local commit is not available
            # on the remote.
            data["commit"] = remote_main_cmt
        else:
            # Neither origin/master nor origin/main exists. Keep the
            # local commit instead of replacing it with an empty string
            # and explain the situation in the warning file.
            diff_remote_main = (
                f"No remote reference 'origin/master' or 'origin/main' found. "
                f"Commit {commit_hex} is only available locally."
            )
        data["difference_files"].append(ReproductionFile(
            filename=f"{zip_folder_path}WARNING_GIT_DIFFERENCE_{name}_to_remote_main.txt",
            content=diff_remote_main,
        ))
    return data

255 

256 

def creat_copy_files_from_dir(foldername: str,
                              sourcepath: pathlib.Path,
                              remove: bool = False):
    """
    Creates a list with CopyFiles for each file in a directory
    which will be saved in the zip under the foldername
    with all subdirectories.

    :param str foldername:
        Name of the folder in the zip. Can be a relative path.
    :param pathlib.Path sourcepath:
        Path on the current machine where the directory to copy
        is located. A plain string is accepted as well.
    :param bool remove:
        Default is False. If True, the files in the directory
        will be moved instead of just copied.

    :return list:
        Returns a list with CopyFiles for each file in the directory source path.
    """
    source_root = pathlib.Path(sourcepath)
    files = []
    for dirpath, _dirnames, filenames in os.walk(source_root):
        # Location of the current directory relative to the source root.
        # Splitting the path string at the folder name instead breaks as
        # soon as a subdirectory contains that name again.
        relative_parts = pathlib.Path(dirpath).relative_to(source_root).parts
        for file_name in filenames:
            # Names inside the zip always use forward slashes
            zip_name = "/".join([foldername, *relative_parts, file_name])
            files.append(CopyFile(
                # Kept as a string, as before, for existing callers
                sourcepath=os.path.join(dirpath, file_name),
                filename=zip_name,
                remove=remove
            ))
    return files

287 

288 

289def _get_general_information(title: str, log_message: str, current_time: str): 

290 """ 

291 Function to save the general information of the study. 

292 Time, machine information, and an intro on how to reproduce 

293 the study is given. 

294 """ 

295 

296 info_header = f"""This folder contains information necessary to reproduce the python based research study named '{title}'. 

297Reason the user performed this study: 

298"%s" 

299 

300To reproduce, make sure you have installed the following programs: 

301- Anaconda 

302- Dymola (If a folder named Dymola exists in this zip) 

303 

304Run the lines in the file 'python/reproduce_python_environment.txt' in a shell with the PATH variable pointing to anaconda (or in Anaconda Prompt). 

305After execution, make sure to check for any differences in git-based python code. 

306These files are included in this folder and are named e.g. "WARNING_GIT_DIFFERENCE_some_package". 

307If this happens, make sure to change the files in the git-based python packages after installation. 

308For future use, be sure to commit and push your changes before running any research study. 

309""" % log_message 

310 _data = { 

311 "Time": current_time, 

312 "Author": os.getlogin(), 

313 "Machine": platform.machine(), 

314 "Version": platform.version(), 

315 "Platform": platform.platform(), 

316 "System": platform.system(), 

317 "Processor": platform.processor(), 

318 } 

319 _content_lines = [ 

320 info_header + "\n", 

321 "General system information of performed study:", 

322 ] + [f"{k}: {v}" for k, v in _data.items()] 

323 return "\n".join(_content_lines) 

324 

325 

def get_installed_packages() -> List[dict]:
    """
    Returns a list of dictionaries with the keys 'name', 'version'
    and 'location' for all installed Python packages.

    Distributions without a name in their metadata are skipped,
    as no valid requirement can be created for them.
    """
    packages = []
    for dist in distributions():
        # Left-over or broken metadata folders have no 'Name' entry
        name = dist.metadata.get('Name')
        if not name:
            continue
        packages.append(dict(
            name=name,
            version=dist.version,
            location=os.path.normpath(str(dist.locate_file('')))
        ))
    return packages

339 

340 

def _get_python_package_information(search_on_pypi: bool):
    """
    Function to get the content of python packages installed
    as a requirement.txt format content.

    :param bool search_on_pypi:
        If True, each package which is not a git-repo
        is searched on pypi.

    :return tuple:
        The content of the requirements file as a string, the list of
        difference files of all git based packages and the pip version
        as '==<version>' (empty string if pip was not found).

    :raises ModuleNotFoundError:
        If search_on_pypi is True and a package is
        neither a git-repo nor found on pypi.
    """
    installed_packages = get_installed_packages()
    diff_paths = []
    requirement_txt_content = []
    pip_version = ""
    # Most packages share one location (site-packages). Remember the
    # locations known not to be a repo to avoid checking them again.
    non_repo_locations = set()
    for package in installed_packages:
        location = package["location"]
        repo_info = None
        if location not in non_repo_locations:
            repo_info = get_git_information(
                path=location,
                name=package["name"],
                zip_folder_path="python"
            )
            if repo_info is None:
                non_repo_locations.add(location)
        if repo_info is None:
            # Check if in python path:
            if package["name"] == "pip":  # exclude pip in requirements and give info to _get_python_reproduction
                pip_version = f'=={package["version"]}'
            else:
                requirement_txt_content.append(
                    f'{package["name"]}=={package["version"]}'
                )
                if search_on_pypi:
                    from pypisearch.search import Search
                    res = Search(package["name"]).result
                    if not res:
                        # Exceptions do not format logging-style
                        # arguments, so build the message directly.
                        raise ModuleNotFoundError(
                            f"Package '{package['name']}' is neither a git "
                            "repo nor a package on pypi. "
                            "Won't be able to reproduce it!"
                        )
        else:
            cmt_sha = repo_info["commit"]
            url = repo_info["url"]
            # The url may already end with '.git';
            # avoid creating '.git.git'
            if url.endswith(".git"):
                url = url[:-len(".git")]
            requirement_txt_content.append(
                f"git+{url}.git@{cmt_sha}#egg={package['name']}"
            )
            diff_paths.extend(repo_info["difference_files"])
    return "\n".join(requirement_txt_content), diff_paths, pip_version

381 

382 

383def _get_python_reproduction(title: str, pip_version: str): 

384 """ 

385 Get the content of a script to reproduce the python 

386 environment used for the study. 

387 """ 

388 _v = sys.version_info 

389 py_version = ".".join([str(_v.major), str(_v.minor), str(_v.micro)]) 

390 env_name = f"py_{title}" 

391 py_reproduce_content = [ 

392 f"conda create -n {env_name} python={py_version} -y", 

393 f"conda activate {env_name}", 

394 f"python -m pip install pip{pip_version}", 

395 f"pip install -r requirements.txt", 

396 ] 

397 return "\n".join(py_reproduce_content) 

398 

399 

if __name__ == "__main__":
    # Example call: stores the archive of this module in a fixed local folder.
    save_reproduction_archive(path=r"D:\00_temp\reproduction", title="my_study")