-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathtraindata_analysis_more_para.py
133 lines (117 loc) · 4.55 KB
/
traindata_analysis_more_para.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# Analyze train-data
# JamesMisaka update in 20230208
# Extracts more information from the train-data
# Parallel version, well tested; gives the same result as the single-process version
import numpy as np
import pandas as pd
import sys
import os
from multiprocessing import Pool
from allstr_new import AllStr
from structure_new import Str
import time
def get_info_from_struc(stru: "Str") -> tuple:
    '''Extract summary information from one structure in the train-data.

    Unit of work for the parallel analysis below; must stay picklable and
    self-contained so it can run inside a multiprocessing worker.

    :param stru: instance of Str class; must provide ``sp`` (element ->
        atom-count mapping), ``natom`` and ``get_basic_shape()``
    :returns: tuple ``(ele_conb, atom_conb, natom, cell_type, cell_and_atom)``
        where ``ele_conb`` is the tuple of element symbols, ``atom_conb`` is
        the composition string (e.g. "H2O1"), ``natom`` the atom count,
        ``cell_type`` the shape label, and ``cell_and_atom`` the
        ``(atom_conb, cell_type)`` pair used as a combined counting key.
        (The original annotation said ``dict``, but a tuple is returned.)
    '''
    ele_conb = tuple(stru.sp.keys())
    # e.g. {"H": 2, "O": 1} -> "H2O1"
    atom_conb = "".join(f"{atom}{num}" for atom, num in stru.sp.items())
    cell_type = stru.get_basic_shape()  # key time-consuming step
    cell_and_atom = (atom_conb, cell_type)
    return ele_conb, atom_conb, stru.natom, cell_type, cell_and_atom
# Default training-data file names, used when nothing is given on the CLI.
strfile = "TrainStr.txt"
forfile = "TrainFor.txt"

# Load the training data. Command line may override the defaults:
#   argv[1] -> structure file, argv[2] (optional) -> force file.
allstr_raw = AllStr()
n_args = len(sys.argv)
if n_args == 2:
    allstr_raw.train_data_init(sys.argv[1])
elif n_args > 2:
    allstr_raw.train_data_init(sys.argv[1], sys.argv[2])
else:
    allstr_raw.train_data_init(strfile, forfile)
# out of range: TrainStr not related to TrainFor
# Basic train-data statistics.
size = len(allstr_raw)
all_elements = allstr_raw.get_all_element()
max_for = 0
max_stress = 0

# Statistics via sorting: after an ascending sort the last entry
# (index size-1) carries the maximum value.
allstr_sort_by_force = allstr_raw.sort_by_force()
max_for = allstr_sort_by_force[size - 1].maxF
allstr_sort_by_stress = allstr_raw.sort_by_stress()
max_stress = max(allstr_sort_by_stress[size - 1].stress)
# ---- Per-structure classification, accumulated into counting dicts ----
# atom_conb_dict:          composition string (e.g. "H2O1") -> #structures
# elements_conb_dict:      tuple of element symbols         -> #structures
# natom_dict:              composition string -> atom count (overwritten on
#                          repeats; same composition implies same natom)
# cell_type_dict:          cell type ("bulk"/"layer"/"cluster" per the table
#                          built below) -> #structures
# cell_and_atom_type_dict: (composition, cell type) -> #structures
atom_conb_dict = {}
elements_conb_dict = {}
natom_dict = {}
cell_type_dict = {}
cell_and_atom_type_dict = {}
# Parallel classification: get_basic_shape() is the expensive step (see
# get_info_from_struc), so structures are farmed out to a process pool.
start_time = time.perf_counter()
# method from chbpku
with Pool(processes=None) as p:  # processes=None -> one worker per CPU
    # NOTE: the third positional argument of imap is the chunksize;
    # batching by os.cpu_count() cuts inter-process communication overhead.
    for info_list in p.imap(get_info_from_struc,
                allstr_raw, os.cpu_count()):
        # Unpack the per-structure summary tuple and update every tally.
        ele_conb, atom_conb , natom, cell_type, cell_and_atom = info_list
        natom_dict[atom_conb] = natom
        elements_conb_dict[ele_conb] = elements_conb_dict.get(ele_conb, 0) + 1
        atom_conb_dict[atom_conb] = atom_conb_dict.get(atom_conb, 0) + 1
        cell_type_dict[cell_type] = cell_type_dict.get(cell_type, 0) + 1
        cell_and_atom_type_dict[cell_and_atom] = cell_and_atom_type_dict.get(cell_and_atom, 0) + 1
end_time = time.perf_counter()
time_consumed = end_time - start_time  # wall-clock seconds for the pool run
# Assemble the human-readable summary report.
print(f" ---- Processing Done in {time_consumed} s")
print(" ---- Getting Traindata infomation ...")
# Collect the report line by line, then join once at the end.
report_lines = ["---- Traindata Analysis Result ----\n"]
report_lines.append(f" Data Size: {size} (structures)\n")
report_lines.append(f" Max Force: {max_for} (eV/Ang)\n")
report_lines.append(f" Max Stress {max_stress} (GPa)\n")
report_lines.append("Elements Conbinations in Train-data: \n")
for ele_conb, count in elements_conb_dict.items():
    report_lines.append(f"{ele_conb}: {count} \n")
report_lines.append("Atoms Conbinations in Train-data: \n")
for atom_conb, count in atom_conb_dict.items():
    report_lines.append(f"{atom_conb}: {count} \n")
report_lines.append("Structure Types in Train-data: \n")
for cell_type, count in cell_type_dict.items():
    report_lines.append(f"{cell_type}: {count}\n")
report_lines.append("Atoms Conbinations and their Types in Train-data: \n")
for cell_and_atoms, count in cell_and_atom_type_dict.items():
    report_lines.append(f"{cell_and_atoms}: {count}\n")
report_lines.append("---- DONE! ----")
info_string = "".join(report_lines)
print(" ---- Printing Traindata infomation table ...")
# Write a CSV summary table: one row per atom combination, plus a totals row.
atoms_and_type_info = {}
for atom_conb, count in atom_conb_dict.items():
    atoms_and_type_info[atom_conb] = {
        "natom": natom_dict.get(atom_conb, 0),
        "bulk": cell_and_atom_type_dict.get((atom_conb, "bulk"), 0),
        "layer": cell_and_atom_type_dict.get((atom_conb, "layer"), 0),
        "cluster": cell_and_atom_type_dict.get((atom_conb, "cluster"), 0),
        "sum": count,
    }
# Totals row across all atom combinations (natom is meaningless here).
atoms_and_type_info["Total"] = {
    "natom": None,
    "bulk": cell_type_dict.get("bulk", 0),
    "layer": cell_type_dict.get("layer", 0),
    "cluster": cell_type_dict.get("cluster", 0),
    "sum": size,
}
atoms_df = pd.DataFrame(atoms_and_type_info).T  # transpose: rows = atom combinations
# BUG FIX: DataFrame.sort_index() is not in-place; the original discarded the
# sorted result, so the CSV was written unsorted. Rebind to apply the sort.
atoms_df = atoms_df.sort_index()
atoms_df.to_csv("Traindata_analysis_table.csv")
print(info_string)