Coverage for ebcpy/utils/reproduction.py: 89%

140 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-09-19 12:21 +0000

1""" 

2This module contains scripts to extract information 

3out of simulation / programming based research and 

4enable a reproduction of the results at a later stage. 

5""" 

6import json 

7import pathlib 

8import sys 

9import platform 

10import os 

11import logging 

12from typing import List, Union 

13import zipfile 

14from datetime import datetime 

15from dataclasses import dataclass 

16 

17logger = logging.getLogger(__name__) 

18 

19 

@dataclass
class ReproductionFile:
    """
    Description of a text file that is written into the zip archive.

    :param str filename:
        Name of the file inside the zip. Can be a relative path.
    :param str content:
        Text which is stored in the file.
    """
    filename: str
    content: str

33 

34 

@dataclass
class CopyFile:
    """
    Data-class for information on a file
    which will be copied to the zip.

    :param str filename:
        Name of the file in the zip. Can be a relative path.
    :param pathlib.Path sourcepath:
        Path on the current machine where the file to copy
        is located.
    :param bool remove:
        Default is False. If True, the file will be moved
        (deleted at ``sourcepath`` after it was written to the zip)
        instead of just copied.
    """
    filename: str
    sourcepath: pathlib.Path
    # Copy-only is the safe default; deleting the source must be requested explicitly.
    remove: bool = False

52 

53 

def save_reproduction_archive(
        title: str = None,
        path: Union[pathlib.Path, str] = None,
        log_message: str = None,
        files: List[Union[ReproductionFile, CopyFile]] = None,
        file: Union[pathlib.Path, str] = None,
        search_on_pypi: bool = False
):
    """
    Function to save a reproduction archive which contains
    files to reproduce any simulation/software based study.

    Besides the zip archive, a line with the time stamp and the
    log message is appended to ``Study_Log_<title>.txt`` in ``path``.

    :param str title:
        Title of the study. If not given, the name of ``file``
        without the ``.py`` suffix is used.
    :param pathlib.Path path:
        Where to store the .zip file. If not given, os.getcwd() is used.
        The directory is created if it does not exist.
    :param str log_message:
        Specific message for this run of the study. If given,
        you are not asked at the end of your script to give the
        log_message.
    :param list files:
        List of files to save along the standard ones.
        Examples would be plots, tables etc.
        Entries may be ReproductionFile, CopyFile or a string with
        the path of an existing file (stored under ``Results/``).
        The given list itself is not modified.
    :param pathlib.Path file:
        The file which is used to run.
        Default is __file__ of __main__ module
    :param bool search_on_pypi:
        If True, all python packages which are
        not a git-repo are checked for availability on pypi
        Default is False. Does not work if no internet connection
        is available.

    :return pathlib.Path:
        Path of the created .zip file.

    :raises TypeError:
        If an entry of ``files`` has none of the supported types.
    """
    _py_requirements_name = "python/requirements.txt"
    if path is None:
        path = os.getcwd()
    if file is None:
        file = sys.modules['__main__'].__file__
    # Convert first: ``file`` may be a plain string, which has no ``.name``.
    file_running = pathlib.Path(file).absolute()
    if title is None:
        title = file_running.name
        # Only strip the suffix, not a ".py" somewhere inside the name.
        if title.endswith(".py"):
            title = title[:-len(".py")]
    path = pathlib.Path(path)
    os.makedirs(path, exist_ok=True)
    # Work on a copy so the list of the caller is not extended as a side effect.
    files = [] if files is None else list(files)
    current_time = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Start with the file currently running:
    files.append(ReproductionFile(
        filename=file_running.name,
        content=file_running.read_text()
    ))
    # Check if it's a git-repo. Walk upwards and stop at the first hit:
    for _dir_path in [file_running] + list(file_running.parents):
        repo_info = get_git_information(
            path=_dir_path,
            zip_folder_path="study_repository"
        )
        if repo_info is not None:  # That means it's a repo
            files.extend(repo_info.pop("difference_files", []))
            files.append(ReproductionFile(
                filename="study_repository/repo_info.txt",
                content=json.dumps(repo_info, indent=2)
            ))
            break
    # Get log
    if log_message is None:
        log_message = input("Please enter the specifications / log for this study: ")
        if not log_message:
            log_message = "The user was to lazy to pass any useful information on " \
                          "what made this research study different to others."

    with open(path.joinpath(f"Study_Log_{title}.txt"), "a+") as f:
        f.write(f"{current_time}: {log_message}\n")

    # General info
    files.append(ReproductionFile(
        filename="Information_to_reproduce.txt",
        content=_get_general_information(
            title=title,
            log_message=log_message,
            current_time=current_time
        )
    ))

    # Python-Reproduction:
    py_requirements_content, diff_files, pip_version = _get_python_package_information(
        search_on_pypi=search_on_pypi
    )
    files.append(ReproductionFile(
        filename=_py_requirements_name,
        content=py_requirements_content,
    ))
    files.extend(diff_files)

    py_repro = _get_python_reproduction(
        title=title,
        pip_version=pip_version
    )
    files.append(ReproductionFile(
        filename="python/Reproduce_python_environment.txt",
        content=py_repro,
    ))

    zip_file_name = path.joinpath(
        f"{current_time}_{title}.zip"
    )
    with zipfile.ZipFile(zip_file_name, "w", zipfile.ZIP_DEFLATED) as zip_file:
        # Save all result files:
        for entry in files:
            if isinstance(entry, str):
                if os.path.exists(entry):
                    zip_file.write(entry, f"Results/{pathlib.Path(entry).name}")
                else:
                    logger.error("Given file '%s' is a string but "
                                 "not an existing file. Skipping...", entry)
            elif isinstance(entry, ReproductionFile):
                zip_file.writestr(entry.filename, entry.content)
            elif isinstance(entry, CopyFile):
                zip_file.write(entry.sourcepath, entry.filename)
                if entry.remove:
                    # Best effort: a locked file must not abort the archive.
                    try:
                        os.remove(entry.sourcepath)
                    except PermissionError:
                        logger.error("Could not remove %s", entry.sourcepath)
            else:
                raise TypeError(
                    f"Given file '{entry}' has no "
                    f"valid type. Type is '{type(entry)}'")
    return zip_file_name

183 

184 

def get_git_information(
        path: pathlib.Path,
        name: str = None,
        zip_folder_path: str = None
):
    """
    Function to get the git information for a given path.

    :param pathlib.Path path:
        Path to possible git repo
    :param str name:
        Name of the repo.
        If not given, the name in the URL will be used. If the
        repo has no remote, the name of ``path`` is used instead.
    :param str zip_folder_path:
        If given, the PATH of the difference_files for the .zip
        will be zip_folder_path plus WARNING_GIT_DIFFERENCE...

    Returns:
        If the path is not a git repository, this function returns None.
        Else, a dictionary with the keys 'url', 'commit' and 'difference_files'.
        'url' is an empty string if the repository has no remote.
        If the local head commit is not contained in any remote branch,
        'commit' is the commit of origin/master or origin/main (empty
        string if no such reference exists) and the diff to that
        commit is added to 'difference_files'.

    :raises ImportError:
        If GitPython is not installed.
    """
    try:
        from git import Repo, InvalidGitRepositoryError, RemoteReference
    except ImportError as err:
        raise ImportError(
            "Could not save data for reproduction, install GitPython using "
            "`pip install GitPython`: " + str(err)
        ) from err
    try:
        repo = Repo(path)
    except InvalidGitRepositoryError:
        return None
    commit = repo.head.commit
    commit_hex = commit.hexsha
    diff_last_cmt = repo.git.diff(commit)
    diff_remote_main = ""
    remote_main_cmt = ""
    for ref in repo.references:
        if isinstance(ref, RemoteReference) and ref.name in ['origin/master', 'origin/main']:
            diff_remote_main = repo.git.diff(ref.commit)
            remote_main_cmt = ref.commit.hexsha
            break
    # A purely local repository has no remote; do not fail with an IndexError.
    remotes = repo.remotes
    data = {
        "url": next(remotes[0].urls) if remotes else "",
        "commit": commit_hex,
        "difference_files": []
    }

    if name is None:
        # Get last part of url
        name = data["url"].rstrip("/").split("/")[-1]
        # Strip only a trailing ".git", not a ".git" inside the name
        if name.endswith(".git"):
            name = name[:-len(".git")]
        if not name:
            name = pathlib.Path(path).name
    if zip_folder_path is None:
        zip_folder_path = ""
    else:
        zip_folder_path += "/"
    # Check new files
    if diff_last_cmt:
        data["difference_files"].append(ReproductionFile(
            filename=f"{zip_folder_path}WARNING_GIT_DIFFERENCE_{name}_to_local_head.txt",
            content=diff_last_cmt,
        ))
    # Check if pushed to remote
    if not repo.git.branch("-r", contains=commit_hex):
        data["difference_files"].append(ReproductionFile(
            filename=f"{zip_folder_path}WARNING_GIT_DIFFERENCE_{name}_to_remote_main.txt",
            content=diff_remote_main,
        ))
        data["commit"] = remote_main_cmt
    return data

254 

255 

def creat_copy_files_from_dir(foldername: str,
                              sourcepath: pathlib.Path,
                              remove: bool = False):
    """
    Creates a list with CopyFiles for each file in a directory
    which will be saved in the zip under the foldername
    with all subdirectories.

    :param str foldername:
        Name of the folder in the zip. Can be a relative path.
    :param pathlib.Path sourcepath:
        Path on the current machine where the directory to copy
        is located
    :param bool remove:
        Default is False. If True, the files in the directory
        will be moved instead of just copied.

    :return list:
        Returns a list with CopyFiles for each file in the directory source path.
    """
    source_root = pathlib.PurePath(sourcepath)
    files = []
    for dirpath, _dirnames, filenames in os.walk(sourcepath):
        # Use the real relative path instead of splitting the string at the
        # folder name, which breaks if that name occurs again further down.
        relative_parts = pathlib.PurePath(dirpath).relative_to(source_root).parts
        for file in filenames:
            files.append(CopyFile(
                sourcepath=os.path.join(dirpath, file),
                # Zip member names always use forward slashes
                filename="/".join([foldername, *relative_parts, file]),
                remove=remove
            ))
    return files

286 

287 

288def _get_general_information(title: str, log_message: str, current_time: str): 

289 """ 

290 Function to save the general information of the study. 

291 Time, machine information, and an intro on how to reproduce 

292 the study is given. 

293 """ 

294 

295 info_header = f"""This folder contains information necessary to reproduce the python based research study named '{title}'. 

296Reason the user performed this study: 

297"%s" 

298 

299To reproduce, make sure you have installed the following programs: 

300- Anaconda 

301- Dymola (If a folder named Dymola exists in this zip) 

302 

303Run the lines in the file 'python/reproduce_python_environment.txt' in a shell with the PATH variable pointing to anaconda (or in Anaconda Prompt). 

304After execution, make sure to check for any differences in git-based python code. 

305These files are included in this folder and are named e.g. "WARNING_GIT_DIFFERENCE_some_package". 

306If this happens, make sure to change the files in the git-based python packages after installation. 

307For future use, be sure to commit and push your changes before running any research study. 

308""" % log_message 

309 _data = { 

310 "Time": current_time, 

311 "Author": os.getlogin(), 

312 "Machine": platform.machine(), 

313 "Version": platform.version(), 

314 "Platform": platform.platform(), 

315 "System": platform.system(), 

316 "Processor": platform.processor(), 

317 } 

318 _content_lines = [ 

319 info_header + "\n", 

320 "General system information of performed study:", 

321 ] + [f"{k}: {v}" for k, v in _data.items()] 

322 return "\n".join(_content_lines) 

323 

324 

def _get_python_package_information(search_on_pypi: bool):
    """
    Function to get the content of python packages installed
    as a requirement.txt format content.

    :param bool search_on_pypi:
        If True, each package which is not a git-repo is
        searched on pypi using ``pypisearch``.

    :return tuple:
        The content of the requirements.txt as a string, the list
        of difference files of all git-based packages and the
        pip version specifier (e.g. ``==24.0``, empty if pip was not found).

    :raises ModuleNotFoundError:
        If search_on_pypi is True and a package is
        neither a git-repo nor found on pypi.
    """
    import pkg_resources
    installed_packages = list(pkg_resources.working_set)
    diff_paths = []
    requirement_txt_content = []
    pip_version = ""
    for package in installed_packages:
        repo_info = get_git_information(
            path=package.location,
            name=package.key,
            zip_folder_path="python"
        )
        if repo_info is not None:
            cmt_sha = repo_info["commit"]
            url = repo_info["url"]
            # The remote url may already end with ".git"; avoid ".git.git"
            if url.endswith(".git"):
                url = url[:-len(".git")]
            requirement_txt_content.append(
                f"git+{url}.git@{cmt_sha}#egg={package.key}"
            )
            diff_paths.extend(repo_info["difference_files"])
            continue
        # Check if in python path:
        if package.key == "pip":  # exclude pip in requirements and give info to _get_python_reproduction
            pip_version = f"=={package.version}"
        else:
            requirement_txt_content.append(
                f"{package.key}=={package.version}"
            )
        if search_on_pypi:
            from pypisearch.search import Search
            res = Search(package.key).result
            if not res:
                # Exceptions do not interpolate logging-style arguments,
                # so the message has to be formatted here.
                raise ModuleNotFoundError(
                    f"Package '{package.key}' is neither a git "
                    "repo nor a package on pypi. "
                    "Won't be able to reproduce it!"
                )
    return "\n".join(requirement_txt_content), diff_paths, pip_version

366 

367 

368def _get_python_reproduction(title: str, pip_version: str): 

369 """ 

370 Get the content of a script to reproduce the python 

371 environment used for the study. 

372 """ 

373 _v = sys.version_info 

374 py_version = ".".join([str(_v.major), str(_v.minor), str(_v.micro)]) 

375 env_name = f"py_{title}" 

376 py_reproduce_content = [ 

377 f"conda create -n {env_name} python={py_version} -y", 

378 f"conda activate {env_name}", 

379 f"python -m pip install pip{pip_version}", 

380 f"pip install -r requirements.txt", 

381 ] 

382 return "\n".join(py_reproduce_content) 

383 

384 

if __name__ == '__main__':
    # Manual demo run: no ``file`` is passed, so the running script is archived.
    save_reproduction_archive(path=r"D:\00_temp\reproduction", title="my_study")