Coverage for ebcpy/utils/reproduction.py: 76%

147 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2025-08-26 09:12 +0000

1""" 

2This module contains scripts to extract information 

3out of simulation / programming based research and 

4enable a reproduction of the results at a later stage. 

5""" 

6import json 

7import pathlib 

8import sys 

9import platform 

10import os 

11import logging 

12from typing import List, Tuple, Union 

13import zipfile 

14from datetime import datetime 

15from dataclasses import dataclass 

16from importlib.metadata import distributions 

17 

18logger = logging.getLogger(__name__) 

19 

20 

@dataclass
class ReproductionFile:
    """
    Data-class for an in-memory file which will be written to the zip.

    Arguments:
        filename: str
            Name of the file in the zip. Can be a relative path.
        content: str or bytes
            Content of the file. Text is encoded when the archive
            is written, bytes are stored unchanged.
    """
    # Archive-internal name (may contain sub-folders)
    filename: str
    # Payload that ends up in the archive
    content: Union[str, bytes]

34 

35 

@dataclass
class CopyFile:
    """
    Data-class describing a file on disk
    which will be copied into the zip.

    :param str filename:
        Name of the file in the zip. Can be a relative path.
    :param pathlib.Path sourcepath:
        Location of the file to copy on the current machine.
    :param bool remove:
        If True, the source file is deleted after it was added
        to the zip, i.e. the file is moved instead of copied.
    """
    # Archive-internal name (may contain sub-folders)
    filename: str
    # Where the file currently lives
    sourcepath: pathlib.Path
    # Delete the source after archiving?
    remove: bool

53 

54 

def save_reproduction_archive(
        title: str = None,
        path: Union[pathlib.Path, str] = None,
        log_message: str = None,
        files: List[Union[ReproductionFile, CopyFile]] = None,
        file: Union[pathlib.Path, str] = None,
        search_on_pypi: bool = False
):
    """
    Function to save a reproduction archive which contains
    files to reproduce any simulation/software based study.

    Besides writing the .zip, one line with the time stamp and the
    log message is appended to ``Study_Log_<title>.txt`` in ``path``.

    :param str title:
        Title of the study. If not given, the name of ``file``
        with ".py" removed is used.
    :param pathlib.Path path:
        Where to store the .zip file. If not given, os.getcwd() is used.
        The directory is created if it does not exist.
    :param str log_message:
        Specific message for this run of the study. If given,
        you are not asked at the end of your script to give the
        log_message.
    :param list files:
        List of files to save along the standard ones.
        Examples would be plots, tables etc.
        Entries may be ReproductionFile, CopyFile or a string with
        the path of an existing file (stored below "Results/").
        The given list is not modified.
    :param pathlib.Path file:
        The file which is used to run.
        Default is __file__ of __main__ module
    :param bool search_on_pypi:
        If True, all python packages which are
        not a git-repo are checked for availability on pypi
        Default is False. Does not work if no internet connection
        is available.

    :return pathlib.Path:
        Path of the created .zip file.

    :raises TypeError:
        If an entry of ``files`` is of an unsupported type.
    """
    _py_requirements_name = "python/requirements.txt"
    if file is None:
        file = sys.modules['__main__'].__file__
    # Normalise before use, so that a plain string is accepted as well
    # when the title has to be derived from the file name.
    file_running = pathlib.Path(file).absolute()
    if title is None:
        title = file_running.name.replace(".py", "")
    path = pathlib.Path(os.getcwd() if path is None else path)
    os.makedirs(path, exist_ok=True)
    # Work on a copy: the caller's list must not grow with every call.
    files = [] if files is None else list(files)
    current_time = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Start with the file currently running:
    files.append(ReproductionFile(
        filename=file_running.name,
        content=file_running.read_text()
    ))
    # Check if it's a git-repo: walk upwards until a repository is found
    for _dir_path in [file_running] + list(file_running.parents):
        repo_info = get_git_information(
            path=_dir_path,
            zip_folder_path="study_repository"
        )
        if repo_info is not None:  # That means it's a repo
            # The difference files are stored separately,
            # the remaining keys are dumped as json.
            files.extend(repo_info.pop("difference_files", []))
            files.append(ReproductionFile(
                filename="study_repository/repo_info.txt",
                content=json.dumps(repo_info, indent=2)
            ))
            break
    # Get log
    if log_message is None:
        log_message = input("Please enter the specifications / log for this study: ")
        if not log_message:
            log_message = "The user was to lazy to pass any useful information on " \
                          "what made this research study different to others."

    with open(path.joinpath(f"Study_Log_{title}.txt"), "a+") as f:
        f.write(f"{current_time}: {log_message}\n")

    # General info
    files.append(ReproductionFile(
        filename="Information_to_reproduce.txt",
        content=_get_general_information(
            title=title,
            log_message=log_message,
            current_time=current_time
        )
    ))

    # Python-Reproduction:
    py_requirements_content, diff_files, pip_version = _get_python_package_information(
        search_on_pypi=search_on_pypi
    )
    files.append(ReproductionFile(
        filename=_py_requirements_name,
        content=py_requirements_content,
    ))
    files.extend(diff_files)

    py_repro = _get_python_reproduction(
        title=title,
        pip_version=pip_version
    )
    files.append(ReproductionFile(
        filename="python/Reproduce_python_environment.txt",
        content=py_repro,
    ))

    zip_file_name = path.joinpath(
        f"{current_time}_{title}.zip"
    )
    with zipfile.ZipFile(zip_file_name, "w", zipfile.ZIP_DEFLATED) as zip_file:
        # Save all result files:
        for entry in files:
            if isinstance(entry, str):
                if os.path.exists(entry):
                    zip_file.write(entry, f"Results/{pathlib.Path(entry).name}")
                else:
                    logger.error("Given file '%s' is a string but "
                                 "not an existing file. Skipping...", entry)
            elif isinstance(entry, ReproductionFile):
                if isinstance(entry.content, str):
                    # Use surrogateescape to make sure all characters survive
                    zip_file.writestr(
                        entry.filename,
                        entry.content.encode("utf-8", errors="surrogateescape")
                    )
                else:
                    zip_file.writestr(entry.filename, entry.content)
            elif isinstance(entry, CopyFile):
                zip_file.write(entry.sourcepath, entry.filename)
                if entry.remove:
                    # Best effort: a locked file must not abort the archive
                    try:
                        os.remove(entry.sourcepath)
                    except PermissionError:
                        logger.error("Could not remove %s", entry.sourcepath)
            else:
                raise TypeError(
                    f"Given file '{entry}' has no "
                    f"valid type. Type is '{type(entry)}'")
    return zip_file_name

188 

189 

def get_git_information(
        path: pathlib.Path,
        name: str = None,
        zip_folder_path: str = None
):
    """
    Function to get the git information for a given path.

    :param pathlib.Path path:
        Path to possible git repo
    :param str name:
        Name of the repo.
        If not given, the name in the URL will be used.
    :param str zip_folder_path:
        If given, the PATH of the difference_files for the .zip
        will be zip_folder_path plus WARNING_GIT_DIFFERENCE...

    Returns:
        If the path does not exist or is not a git repository,
        this function returns None.
        Else, a dictionary with the keys 'url', 'commit' and 'difference_files'.
        'url' is an empty string if the repository has no remote.
        'difference_files' is a list of ReproductionFile objects holding
        the output of ``git diff`` where local and committed / pushed
        state differ.

    Raises:
        ImportError: If GitPython is not installed.
    """
    try:
        from git import Repo, InvalidGitRepositoryError, NoSuchPathError, RemoteReference
    except ImportError as err:
        raise ImportError(
            "Could not save data for reproduction, install GitPython using "
            "`pip install GitPython`: " + str(err)
        ) from err
    try:
        repo = Repo(path)
    except (InvalidGitRepositoryError, NoSuchPathError):
        # Neither a repository nor an existing path: nothing to report
        return None
    commit = repo.head.commit
    commit_hex = commit.hexsha
    # Uncommitted local changes compared to the current HEAD
    diff_last_cmt = repo.git.diff(commit)
    diff_remote_main = ""
    remote_main_cmt = ""
    for ref in repo.references:
        if isinstance(ref, RemoteReference) and ref.name in ['origin/master', 'origin/main']:
            diff_remote_main = repo.git.diff(ref.commit)
            remote_main_cmt = ref.commit.hexsha
            break
    # Purely local repositories have no remote; indexing remotes[0]
    # unconditionally would raise an IndexError for them.
    remote_url = next(repo.remotes[0].urls, "") if repo.remotes else ""
    data = {
        "url": remote_url,
        "commit": commit_hex,
        "difference_files": []
    }

    if name is None:
        # Get last part of url
        name = data["url"].split("/")[-1].replace(".git", "")
    if zip_folder_path is None:
        zip_folder_path = ""
    else:
        zip_folder_path += "/"
    # Check new files
    if diff_last_cmt:
        data["difference_files"].append(ReproductionFile(
            filename=f"{zip_folder_path}WARNING_GIT_DIFFERENCE_{name}_to_local_head.txt",
            content=diff_last_cmt,
        ))
    # Check if pushed to remote
    if not repo.git.branch("-r", contains=commit_hex):
        data["difference_files"].append(ReproductionFile(
            filename=f"{zip_folder_path}WARNING_GIT_DIFFERENCE_{name}_to_remote_main.txt",
            content=diff_remote_main,
        ))
        # HEAD is not contained in any remote branch, so the commit of the
        # remote main/master branch is reported instead (empty string if
        # no such reference was found above).
        data["commit"] = remote_main_cmt
    return data

259 

260 

def creat_copy_files_from_dir(foldername: str,
                              sourcepath: pathlib.Path,
                              remove: bool = False):
    """
    Creates a list with CopyFiles for each file in a directory
    which will be saved in the zip under the foldername
    with all subdirectories.

    :param str foldername:
        Name of the folder in the zip. Can be a relative path.
    :param pathlib.Path sourcepath:
        Path on the current machine where the directory to copy
        is located. A string is accepted as well.
    :param bool remove:
        Default is False. If True, the files in the directory
        will be moved instead of just copied.

    :return list:
        Returns a list with CopyFiles for each file in the directory source path.
    """
    source_root = pathlib.Path(sourcepath)
    files = []
    for dirpath, _, filenames in os.walk(source_root):
        # Position of the current directory below the source root.
        # Splitting the path string at the directory name breaks as soon
        # as the name occurs a second time further down the tree.
        relative_dir = pathlib.Path(dirpath).relative_to(source_root).as_posix()
        # as_posix() yields "." for the root itself; the archive name
        # must not contain that component.
        prefix = foldername if relative_dir == "." else f"{foldername}/{relative_dir}"
        for file in filenames:
            files.append(CopyFile(
                sourcepath=os.path.join(dirpath, file),
                filename=f"{prefix}/{file}",
                remove=remove
            ))
    return files

291 

292 

def _get_general_information(title: str, log_message: str, current_time: str):
    """
    Function to save the general information of the study.
    Time, machine information, and an intro on how to reproduce
    the study is given.

    :param str title:
        Title of the study, inserted into the introduction.
    :param str log_message:
        Reason / log message of this run of the study.
    :param str current_time:
        Time stamp of the study as a string.

    :return str:
        The text content of the information file.
    """
    # title and log_message are free text. Both are inserted through the
    # f-string only: applying %-formatting afterwards fails as soon as the
    # title contains a '%' character.
    info_header = f"""This folder contains information necessary to reproduce the python based research study named '{title}'.
Reason the user performed this study:
"{log_message}"

To reproduce, make sure you have installed the following programs:
- Anaconda
- Dymola (If a folder named Dymola exists in this zip)

Run the lines in the file 'python/Reproduce_python_environment.txt' in a shell with the PATH variable pointing to anaconda (or in Anaconda Prompt).
After execution, make sure to check for any differences in git-based python code.
These files are included in this folder and are named e.g. "WARNING_GIT_DIFFERENCE_some_package".
If this happens, make sure to change the files in the git-based python packages after installation.
For future use, be sure to commit and push your changes before running any research study.
"""
    try:
        author = os.getlogin()
    except OSError:
        # os.getlogin() needs a controlling terminal, which is missing
        # e.g. for services, containers or CI runners.
        import getpass
        try:
            author = getpass.getuser()
        except (OSError, KeyError, ImportError):
            author = "unknown"
    _data = {
        "Time": current_time,
        "Author": author,
        "Machine": platform.machine(),
        "Version": platform.version(),
        "Platform": platform.platform(),
        "System": platform.system(),
        "Processor": platform.processor(),
    }
    _content_lines = [
        info_header + "\n",
        "General system information of performed study:",
    ] + [f"{k}: {v}" for k, v in _data.items()]
    return "\n".join(_content_lines)

328 

329 

def get_installed_packages() -> List[dict]:
    """
    Collect all installed Python distributions.

    :return list:
        One dictionary per distribution with the keys
        'name', 'version' and 'location'.
    """
    return [
        {
            "name": dist.metadata['Name'],
            "version": dist.version,
            # Normalised string form of the folder the distribution is located in
            "location": os.path.normpath(str(dist.locate_file(''))),
        }
        for dist in distributions()
    ]

343 

344 

def _get_python_package_information(search_on_pypi: bool):
    """
    Function to get the content of python packages installed
    as a requirement.txt format content.

    :param bool search_on_pypi:
        If True, every package which is neither pip nor located in
        a git-repo is looked up on pypi.

    :return tuple:
        The content of the requirements.txt as a string, the list of
        difference files of all git based packages, and the pip version
        as a '==<version>' specifier (empty string if pip was not found).

    :raises ModuleNotFoundError:
        If search_on_pypi is True and a package is not found on pypi.
    """
    if search_on_pypi:
        # Optional dependency, only required for the lookup
        from pypisearch.search import Search
    installed_packages = get_installed_packages()
    diff_paths = []
    requirement_txt_content = []
    pip_version = ""
    for package in installed_packages:
        repo_info = get_git_information(
            path=package["location"],
            name=package["name"],
            zip_folder_path="python"
        )
        if repo_info is not None:
            url = repo_info["url"]
            # Remote URLs frequently end with ".git" already; appending it
            # unconditionally yields an invalid "<url>.git.git".
            if not url.endswith(".git"):
                url += ".git"
            requirement_txt_content.append(
                f"git+{url}@{repo_info['commit']}#egg={package['name']}"
            )
            diff_paths.extend(repo_info["difference_files"])
            continue
        if package["name"] == "pip":
            # exclude pip in requirements and give info to _get_python_reproduction
            pip_version = f'=={package["version"]}'
            continue
        requirement_txt_content.append(
            f'{package["name"]}=={package["version"]}'
        )
        if search_on_pypi and not Search(package["name"]).result:
            # Exceptions do not interpolate logging-style arguments,
            # so the message is formatted explicitly.
            raise ModuleNotFoundError(
                f"Package '{package['name']}' is neither a git "
                "repo nor a package on pypi. "
                "Won't be able to reproduce it!"
            )
    return "\n".join(requirement_txt_content), diff_paths, pip_version

385 

386 

def _get_python_reproduction(title: str, pip_version: str):
    """
    Build the shell commands which recreate the python
    environment used for the study.

    :param str title:
        Title of the study, the environment is named 'py_<title>'.
    :param str pip_version:
        Version specifier appended to 'pip', e.g. '==24.0'.
        May be an empty string.

    :return str:
        The commands, one per line.
    """
    version = sys.version_info
    python_spec = f"{version.major}.{version.minor}.{version.micro}"
    conda_env = f"py_{title}"
    commands = (
        f"conda create -n {conda_env} python={python_spec} -y",
        f"conda activate {conda_env}",
        f"python -m pip install pip{pip_version}",
        "pip install -r requirements.txt",
    )
    return "\n".join(commands)

402 

403 

if __name__ == "__main__":
    # Manual example run: archives this module itself as a study.
    # The target folder is machine specific.
    _example_arguments = dict(
        title="my_study",
        path=r"D:\00_temp\reproduction",
    )
    save_reproduction_archive(**_example_arguments)