Coverage for ebcpy/utils/reproduction.py: 89%
140 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-09-19 12:21 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2024-09-19 12:21 +0000
1"""
2This module contains scripts to extract information
3out of simulation / programming based research and
4enable a reproduction of the results at a later stage.
5"""
6import json
7import pathlib
8import sys
9import platform
10import os
11import logging
12from typing import List, Union
13import zipfile
14from datetime import datetime
15from dataclasses import dataclass
# Module-level logger named after this module; used for error reports
# while writing the reproduction archive.
logger = logging.getLogger(__name__)
@dataclass
class ReproductionFile:
    """
    Text file whose content is written directly into the zip archive.

    Arguments:
        filename: str
            Name of the entry inside the zip. May be a relative path.
        content: str
            Text stored under ``filename``.
    """
    filename: str
    content: str
@dataclass
class CopyFile:
    """
    Description of an existing file on disk that is copied into the zip.

    :param str filename:
        Name of the entry inside the zip. May be a relative path.
    :param pathlib.Path sourcepath:
        Location of the file to copy on the current machine.
    :param bool remove:
        If True, the source file is deleted after it was added
        to the zip, i.e. the file is moved instead of copied.
    """
    filename: str
    sourcepath: pathlib.Path
    remove: bool
def save_reproduction_archive(
        title: str = None,
        path: Union[pathlib.Path, str] = None,
        log_message: str = None,
        files: List[Union[ReproductionFile, CopyFile]] = None,
        file: Union[pathlib.Path, str] = None,
        search_on_pypi: bool = False
):
    """
    Function to save a reproduction archive which contains
    files to reproduce any simulation/software based study.

    Besides the zip, one line per call is appended to the text file
    ``Study_Log_{title}.txt`` inside ``path``.

    :param str title:
        Title of the study. If not given, the name of ``file``
        without ".py" is used.
    :param pathlib.Path path:
        Where to store the .zip file. If not given, os.getcwd() is used.
        The directory is created if it does not exist.
    :param str log_message:
        Specific message for this run of the study. If given,
        you are not asked at the end of your script to give the
        log_message.
    :param list files:
        List of files to save along the standard ones.
        Examples would be plots, tables etc.
        Entries may be ReproductionFile, CopyFile or plain string
        paths; the latter are stored under ``Results/`` in the zip.
        The given list itself is not modified.
    :param pathlib.Path file:
        The file which is used to run.
        Default is __file__ of __main__ module
    :param bool search_on_pypi:
        If True, all python packages which are
        not a git-repo are checked for availability on pypi
        Default is False. Does not work if no internet connection
        is available.

    :return pathlib.Path:
        Path of the written .zip file.
    :raises TypeError:
        If an entry of ``files`` is neither a str, a ReproductionFile
        nor a CopyFile.
    """
    _py_requirements_name = "python/requirements.txt"
    if path is None:
        path = os.getcwd()
    if file is None:
        file = sys.modules['__main__'].__file__
    # Normalise before any attribute access: `file` may be passed as plain str.
    file_running = pathlib.Path(file).absolute()
    if title is None:
        title = file_running.name.replace(".py", "")
    path = pathlib.Path(path)
    os.makedirs(path, exist_ok=True)
    # Work on a copy so the list of the caller is not extended in place.
    files = [] if files is None else list(files)
    current_time = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Start with the file currently running:
    files.append(ReproductionFile(
        filename=file_running.name,
        content=file_running.read_text()
    ))
    # Check if it's a git-repo. Walk from the file up to the root and
    # stop at the first path which is recognised as a repository:
    for _dir_path in [file_running] + list(file_running.parents):
        repo_info = get_git_information(
            path=_dir_path,
            zip_folder_path="study_repository"
        )
        if repo_info is not None:  # That means it's a repo
            # The difference files are stored separately, everything else as json
            files.extend(repo_info.pop("difference_files", []))
            files.append(ReproductionFile(
                filename="study_repository/repo_info.txt",
                content=json.dumps(repo_info, indent=2)
            ))
            break
    # Get log
    if log_message is None:
        log_message = input("Please enter the specifications / log for this study: ")
        if not log_message:
            log_message = "The user was to lazy to pass any useful information on " \
                          "what made this research study different to others."

    with open(path.joinpath(f"Study_Log_{title}.txt"), "a+") as f:
        f.write(f"{current_time}: {log_message}\n")

    # General info
    files.append(ReproductionFile(
        filename="Information_to_reproduce.txt",
        content=_get_general_information(
            title=title,
            log_message=log_message,
            current_time=current_time
        )
    ))

    # Python-Reproduction:
    py_requirements_content, diff_files, pip_version = _get_python_package_information(
        search_on_pypi=search_on_pypi
    )
    files.append(ReproductionFile(
        filename=_py_requirements_name,
        content=py_requirements_content,
    ))
    files.extend(diff_files)

    py_repro = _get_python_reproduction(
        title=title,
        pip_version=pip_version
    )
    files.append(ReproductionFile(
        filename="python/Reproduce_python_environment.txt",
        content=py_repro,
    ))

    zip_file_name = path.joinpath(
        f"{current_time}_{title}.zip"
    )
    with zipfile.ZipFile(zip_file_name, "w", zipfile.ZIP_DEFLATED) as zip_file:
        # Save all result files:
        for entry in files:
            if isinstance(entry, str):
                if os.path.exists(entry):
                    zip_file.write(entry, f"Results/{pathlib.Path(entry).name}")
                else:
                    logger.error("Given file '%s' is a string but "
                                 "not an existing file. Skipping...", entry)
            elif isinstance(entry, ReproductionFile):
                zip_file.writestr(entry.filename, entry.content)
            elif isinstance(entry, CopyFile):
                zip_file.write(entry.sourcepath, entry.filename)
                if entry.remove:
                    # Best effort: a locked file must not abort the archive
                    try:
                        os.remove(entry.sourcepath)
                    except PermissionError:
                        logger.error("Could not remove %s", entry.sourcepath)
            else:
                raise TypeError(
                    f"Given file '{entry}' has no "
                    f"valid type. Type is '{type(entry)}'")
    return zip_file_name
def get_git_information(
        path: pathlib.Path,
        name: str = None,
        zip_folder_path: str = None
):
    """
    Function to get the git information for a given path.

    :param pathlib.Path path:
        Path to possible git repo
    :param str name:
        Name of the repo.
        If not given, the last part of the URL of the first remote
        (without a trailing ".git") will be used.
    :param str zip_folder_path:
        If given, the PATH of the difference_files for the .zip
        will be zip_folder_path plus WARNING_GIT_DIFFERENCE...

    Returns:
        If the path is not a git repository, this function returns None.
        Else, a dictionary with the keys 'url', 'commit' and 'difference_files'.
        'difference_files' is a list of ReproductionFile objects holding
        the diff to the local head (if any) and, if the local head is not
        contained in any remote branch, the diff to origin/master or
        origin/main. In the latter case, 'commit' is the commit of that
        remote branch instead of the local head.

    :raises ImportError:
        If GitPython is not installed.
    """
    try:
        from git import Repo, InvalidGitRepositoryError, RemoteReference
    except ImportError as err:
        # Chain the original error so the real import problem stays visible
        raise ImportError(
            "Could not save data for reproduction, install GitPython using "
            "`pip install GitPython`: " + str(err)
        ) from err
    try:
        repo = Repo(path)
    except InvalidGitRepositoryError:
        return None
    commit = repo.head.commit
    commit_hex = commit.hexsha
    # Uncommitted changes with respect to the local head
    diff_last_cmt = repo.git.diff(commit)
    diff_remote_main = ""
    remote_main_cmt = ""
    for ref in repo.references:
        if isinstance(ref, RemoteReference) and ref.name in ['origin/master', 'origin/main']:
            diff_remote_main = repo.git.diff(ref.commit)
            remote_main_cmt = ref.commit.hexsha
            break
    # NOTE(review): this indexes the first remote; a repository without
    # any remote presumably fails here - confirm and decide how to handle.
    data = {
        "url": next(repo.remotes[0].urls),
        "commit": commit_hex,
        "difference_files": []
    }

    if name is None:
        # Get last part of url and strip only a trailing ".git",
        # names like "my.github.io" must stay intact.
        name = data["url"].split("/")[-1]
        if name.endswith(".git"):
            name = name[:-len(".git")]
    if zip_folder_path is None:
        zip_folder_path = ""
    else:
        zip_folder_path += "/"
    # Check new files
    if diff_last_cmt:
        data["difference_files"].append(ReproductionFile(
            filename=f"{zip_folder_path}WARNING_GIT_DIFFERENCE_{name}_to_local_head.txt",
            content=diff_last_cmt,
        ))
    # Check if pushed to remote
    if not repo.git.branch("-r", contains=commit_hex):
        data["difference_files"].append(ReproductionFile(
            filename=f"{zip_folder_path}WARNING_GIT_DIFFERENCE_{name}_to_remote_main.txt",
            content=diff_remote_main,
        ))
        # The local head is unknown to the remote, hence reference the
        # remote main commit (empty string if no such branch was found).
        data["commit"] = remote_main_cmt
    return data
def creat_copy_files_from_dir(foldername: str,
                              sourcepath: pathlib.Path,
                              remove: bool = False):
    """
    Creates a list with CopyFiles for each file in a directory,
    which will be saved in the zip under the foldername
    with all subdirectories.

    :param str foldername:
        Name of the folder in the zip. Can be a relative path.
    :param pathlib.Path sourcepath:
        Path on the current machine where the directory to copy
        is located. A plain string path is accepted as well.
    :param bool remove:
        Default is False. If True, the files in the directory
        will be moved instead of just copied.

    :return list:
        Returns a list with CopyFiles for each file in the directory source path.
        The list is empty if the directory contains no files.
    """
    source_root = pathlib.Path(sourcepath)
    files = []
    for dirpath, _dirnames, filenames in os.walk(source_root):
        # Position of the current directory relative to the root. Computed
        # from the path itself instead of splitting the string at the root
        # name, which broke for nested directories named like the root.
        relative_parts = pathlib.Path(dirpath).relative_to(source_root).parts
        for file in filenames:
            # Always use "/" so the names are valid zip member names on any OS
            filename = "/".join([foldername, *relative_parts, file])
            files.append(CopyFile(
                sourcepath=pathlib.Path(dirpath, file),
                filename=filename,
                remove=remove
            ))
    return files
288def _get_general_information(title: str, log_message: str, current_time: str):
289 """
290 Function to save the general information of the study.
291 Time, machine information, and an intro on how to reproduce
292 the study is given.
293 """
295 info_header = f"""This folder contains information necessary to reproduce the python based research study named '{title}'.
296Reason the user performed this study:
297"%s"
299To reproduce, make sure you have installed the following programs:
300- Anaconda
301- Dymola (If a folder named Dymola exists in this zip)
303Run the lines in the file 'python/reproduce_python_environment.txt' in a shell with the PATH variable pointing to anaconda (or in Anaconda Prompt).
304After execution, make sure to check for any differences in git-based python code.
305These files are included in this folder and are named e.g. "WARNING_GIT_DIFFERENCE_some_package".
306If this happens, make sure to change the files in the git-based python packages after installation.
307For future use, be sure to commit and push your changes before running any research study.
308""" % log_message
309 _data = {
310 "Time": current_time,
311 "Author": os.getlogin(),
312 "Machine": platform.machine(),
313 "Version": platform.version(),
314 "Platform": platform.platform(),
315 "System": platform.system(),
316 "Processor": platform.processor(),
317 }
318 _content_lines = [
319 info_header + "\n",
320 "General system information of performed study:",
321 ] + [f"{k}: {v}" for k, v in _data.items()]
322 return "\n".join(_content_lines)
def _get_python_package_information(search_on_pypi: bool):
    """
    Function to get the content of python packages installed
    as a requirement.txt format content.

    :param bool search_on_pypi:
        If True, each package which is not a git-repo is searched
        on pypi and an error is raised if it is not found.

    :return tuple:
        The content of the requirements file as a string, the list of
        difference files of all git based packages, and the pip version
        as "==<version>" (empty string if pip is not installed).
    :raises ModuleNotFoundError:
        If search_on_pypi is True and a package is neither a
        git-repo nor found on pypi.
    """
    import pkg_resources
    installed_packages = list(pkg_resources.working_set)
    diff_paths = []
    requirement_txt_content = []
    pip_version = ""
    for package in installed_packages:
        repo_info = get_git_information(
            path=package.location,
            name=package.key,
            zip_folder_path="python"
        )
        if repo_info is not None:
            # Git based package: reference the exact commit.
            # Remote URLs may already end with ".git", avoid ".git.git".
            url = repo_info["url"]
            if not url.endswith(".git"):
                url += ".git"
            cmt_sha = repo_info["commit"]
            requirement_txt_content.append(
                f"git+{url}@{cmt_sha}#egg={package.key}"
            )
            diff_paths.extend(repo_info["difference_files"])
            continue
        if package.key == "pip":
            # exclude pip in requirements and give info to _get_python_reproduction
            pip_version = f"=={package.version}"
            continue
        requirement_txt_content.append(
            f"{package.key}=={package.version}"
        )
        if search_on_pypi:
            from pypisearch.search import Search
            res = Search(package.key).result
            if not res:
                # Exceptions do not interpolate logging-style arguments,
                # hence format the message explicitly.
                raise ModuleNotFoundError(
                    f"Package '{package.key}' is neither a git "
                    "repo nor a package on pypi. "
                    "Won't be able to reproduce it!"
                )
    return "\n".join(requirement_txt_content), diff_paths, pip_version
368def _get_python_reproduction(title: str, pip_version: str):
369 """
370 Get the content of a script to reproduce the python
371 environment used for the study.
372 """
373 _v = sys.version_info
374 py_version = ".".join([str(_v.major), str(_v.minor), str(_v.micro)])
375 env_name = f"py_{title}"
376 py_reproduce_content = [
377 f"conda create -n {env_name} python={py_version} -y",
378 f"conda activate {env_name}",
379 f"python -m pip install pip{pip_version}",
380 f"pip install -r requirements.txt",
381 ]
382 return "\n".join(py_reproduce_content)
if __name__ == '__main__':
    # Example invocation when this module is executed as a script.
    # The log message is requested interactively as none is passed.
    save_reproduction_archive(title="my_study", path=r"D:\00_temp\reproduction")
389 )