PyLamarr
Pythonizations for the ultra-fast simulation option for the LHCb experiment
 
Loading...
Searching...
No Matches
EDM4hepValidator.py
1import PyLamarr
2import numpy as np
3from PyLamarr import RemoteResource
4import io
5import re
6import sys
7from dataclasses import dataclass, field
8from pydantic import Field, validate_arguments
9from typing import Dict, Any, Union
10
11import pandas as pd
12
# Default location of the EDM4hep data-model description (YAML) that is parsed
# when the caller does not supply another one.
DEFAULT_EDM4HEP_YAML = (
    "https://raw.githubusercontent.com/key4hep/EDM4hep/main/edm4hep.yaml"
)


# Query returning the name and the CREATE statement of every table registered
# in the SQLite schema table.
SQLITE_MASTER_QUERY = """
    SELECT name, sql
    FROM sqlite_master
    WHERE type=='table'
"""
23
def __init__ (self,
        output_stream: Union[str, io.TextIOWrapper] = sys.stdout,
        edm4hep_yaml: Union[str, RemoteResource] = DEFAULT_EDM4HEP_YAML,
        sql_edm_yaml: Union[str, io.TextIOWrapper] = 'sql_edm.yaml',
        ):
    """Load the EDM4hep description and prepare the validator state.

    Args:
        output_stream: Text stream, or path of a file opened for writing,
            kept on the instance as ``self.output_stream``.
        edm4hep_yaml: Either a ``RemoteResource`` or a string that is
            wrapped into one; the YAML text is read from its ``file``
            attribute (presumably the path of a local copy -- confirm
            against ``RemoteResource``).
        sql_edm_yaml: Text stream, or path of a file opened for writing,
            kept on the instance as ``self.sql_edm_yaml``.
    """
    # NOTE: the default for ``sql_edm_yaml`` is a path rather than an
    # already opened file. A default of ``open(...)`` is evaluated once,
    # when the function is defined, which truncates the file as a side
    # effect of merely importing the module and leaks the handle.
    if isinstance(edm4hep_yaml, str):
        edm4hep_yaml = RemoteResource(edm4hep_yaml)

    if isinstance(output_stream, str):
        output_stream = open(output_stream, "w")

    if isinstance(sql_edm_yaml, str):
        sql_edm_yaml = open(sql_edm_yaml, "w")

    with open(edm4hep_yaml.file) as input_file:
        loaded_yaml = input_file.read()

    # Expected schema, one row per (table, member).
    self.config = self.initialize_config(loaded_yaml)
    # One DataFrame per processed batch, filled by ``__call__``.
    self.batch_dfs = []
    # Keep the normalised streams; previously ``output_stream`` was opened
    # and then dropped, so the argument had no effect.
    self.output_stream = output_stream
    self.sql_edm_yaml = sql_edm_yaml
45
@staticmethod
def initialize_config(loaded_yaml):
    """Turn an EDM4hep YAML description into a flat table of expected columns.

    Args:
        loaded_yaml: The YAML document as text (parsed with
            ``yaml.safe_load``), or an already parsed mapping exposing a
            ``'datatypes'`` entry.

    Returns:
        A ``pandas.DataFrame`` with columns ``table``, ``member_type``,
        ``member_name`` and ``requires_table``; one row per scalar column.
        Composite members (vectors, covariance matrices, quantities,
        ``std::array``) are expanded to one row per component, integer C++
        types are reported as ``'INTEGER'`` and ``float``/``double`` as
        ``'REAL'``. Relations are integer columns whose ``requires_table``
        names the table they refer to.

    Raises:
        ValueError: If a datatype name, member or relation cannot be parsed.
        NotImplementedError: If a member uses an ``edm4hep::`` type for
            which no expansion is defined.
    """
    if isinstance(loaded_yaml, (str, bytes)):
        import yaml
        raw_cfg = yaml.safe_load(loaded_yaml)
    else:
        # Already parsed by the caller.
        raw_cfg = loaded_yaml
    datatypes = raw_cfg['datatypes']

    member_re = re.compile(r'([\w:0-9]+(?:<[\w,0-9: _]+>)?) *([\w0-9]+)')
    relation_re = re.compile(r'edm4hep::([\w0-9_]+) *([\w0-9_]+)')
    vector_re = re.compile(r'edm4hep::Vector([23])([dfi])')
    cov_matrix_re = re.compile(r'edm4hep::CovMatrix([345])[df]')
    array_re = re.compile(r'std::array<((?:edm4hep::)?\w+), *([0-9]+)>')

    def strip_comment(declaration):
        # ``str.find`` returns -1 when there is no comment, and slicing with
        # ``[:-1]`` would silently drop the last character of the name.
        return declaration.split("//", 1)[0]

    def expand_member(dtype, member):
        # Return the (type, column_name) pairs describing one member.
        vector = vector_re.match(dtype)
        if vector:
            size, kind = vector.groups()
            ctype = 'int' if kind == 'i' else 'float'
            return [(ctype, f"{member}_{axis}") for axis in "xyz"[:int(size)]]

        cov_matrix = cov_matrix_re.match(dtype)
        if cov_matrix:
            size = int(cov_matrix.group(1))
            # Upper triangle, row by row: 00, 01, ..., 11, 12, ...
            return [
                ('float', f"{member}_{i}{j}")
                for i in range(size) for j in range(i, size)
            ]

        if re.match(r'edm4hep::Quantity', dtype):
            return [('float', f"{member}_best"), ('float', f"{member}_err")]

        array = array_re.match(dtype)
        if array:
            atype, asize = array.groups()
            return [(atype, f"{member}_{i:02d}") for i in range(int(asize))]

        if 'edm4hep::' in dtype:
            raise NotImplementedError(f"Unexpected type {dtype}")

        return [(dtype, member)]

    def parse_relation(pointer):
        # Return (target_table, relation_name) for one relation declaration.
        matches = relation_re.findall(strip_comment(pointer))
        if len(matches) == 0:
            raise ValueError(f"Could not parse {pointer}")
        return matches[0]

    rows = []
    for table, desc in datatypes.items():
        table_match = re.search(r'edm4hep::([\w0-9_]+)', table)
        if table_match is None:
            raise ValueError(f"Could not parse {table}")
        this_table = table_match.group(1)

        for member_cpp in desc['Members']:
            member_cpp = strip_comment(member_cpp)
            matches = member_re.findall(member_cpp)
            if len(matches) == 0:
                raise ValueError(f"Could not parse {member_cpp}")
            dtype, member = matches[0]
            rows += [
                (this_table, ctype, name, None)
                for ctype, name in expand_member(dtype, member)
            ]

        for pointer in desc.get('OneToOneRelations') or ():
            target_table, name = parse_relation(pointer)
            rows.append((this_table, 'int', name, target_table))

        for pointer in desc.get('OneToManyRelations') or ():
            target_table, name = parse_relation(pointer)
            # The linking table is named after both ends, sorted so that the
            # name does not depend on which side declares the relation.
            zip_table = "__".join(sorted([this_table, target_table]))
            rows.append((this_table, 'int', name, zip_table))

    INTEGER_TYPES = ('uint64_t', 'int64_t', 'uint32_t', 'int32_t', 'int')
    REAL_TYPES = ('float', 'double')
    sql_types = {i: 'INTEGER' for i in INTEGER_TYPES}
    sql_types.update({r: 'REAL' for r in REAL_TYPES})
    return (
        pd.DataFrame(
            rows,
            columns=["table", "member_type", "member_name", "requires_table"],
        )
        .replace({'member_type': sql_types})
    )
169
170
@PyLamarr.method
def __call__(self, db):
    """Record the tables and column types found in ``db`` as a new batch.

    Args:
        db: Database connection; it must provide ``execute`` and be
            accepted by ``pandas.read_sql``.

    One row is read from every table listed by ``SQLITE_MASTER_QUERY`` and
    the pandas dtype of each column is noted. The resulting DataFrame
    (columns ``table``, ``member_type``, ``member_name``, ``batch_id``) is
    appended to ``self.batch_dfs``.
    """
    rows = []
    for name, _create_sql in db.execute(SQLITE_MASTER_QUERY).fetchall():
        # Skip every table whose name contains "sqlite" (this covers
        # SQLite's own bookkeeping tables such as ``sqlite_sequence``).
        if "sqlite" in name:
            continue

        # Quote the identifier so that table names with unusual characters
        # cannot break, or alter, the statement.
        quoted_name = '"' + name.replace('"', '""') + '"'
        df = pd.read_sql(f"SELECT * FROM {quoted_name} LIMIT 1", db)
        rows += [
            (name, str(dtype), column) for column, dtype in df.dtypes.items()
        ]

    INTEGER_TYPES = ('int64', 'int32')
    REAL_TYPES = ('float64', 'float32')
    sql_types = {i: 'INTEGER' for i in INTEGER_TYPES}
    sql_types.update({r: 'REAL' for r in REAL_TYPES})

    # Batches are numbered in the order in which they are recorded.
    batch_id = len(self.batch_dfs)
    self.batch_dfs.append (
        pd.DataFrame(rows, columns=['table', 'member_type', 'member_name'])
        .replace({'member_type': sql_types})
        .assign(batch_id=batch_id)
    )
190
def report_row(self, table, message, column=None):
    """Write one fixed-width line of the validation report.

    Args:
        table: Table name, left-aligned in the first field.
        message: Text of the report, padded to 80 characters.
        column: Optional column name; when given, the line has three
            fields (table, column, message) instead of two.

    The line goes to ``self.output_stream`` when the instance has one and
    to standard output otherwise.
    """
    # ``print(file=None)`` falls back to ``sys.stdout``.
    stream = getattr(self, "output_stream", None)
    if column is None:
        line = f"{table:<33s} | {message:80s}"
    else:
        line = f"{table:<20s} | {column:<10s} | {message:80s}"
    print (line, file=stream)
196
def summary(self):
    """Compare the recorded batches with the expected tables and report.

    For every table in ``self.config`` one line is emitted through
    ``self.report_row``: ``"Not found"``, ``"Found"`` or
    ``"Found in <k>/<n>"`` when the table is present in only some of the
    recorded batches. The columns actually seen for each table are then
    dumped to ``self.sql_edm_yaml`` (as YAML when PyYAML can be imported,
    as JSON otherwise). The recorded batches are cleared.
    """
    if self.batch_dfs:
        df = pd.concat(self.batch_dfs)
    else:
        # ``pd.concat`` refuses an empty list; with no batch recorded every
        # expected table is simply reported as missing.
        df = pd.DataFrame(
            columns=['table', 'member_type', 'member_name', 'batch_id']
        )
    print (df)
    self.batch_dfs = []

    n_batches = df['batch_id'].nunique()
    for expected_table, _ in self.config.groupby('table'):
        found = df[df['table'] == expected_table]
        if len(found) == 0:
            self.report_row(expected_table, "Not found")
            continue

        n_found = found['batch_id'].nunique()
        if n_found < n_batches:
            self.report_row(expected_table,
                            f"Found in {n_found}/{n_batches}")
        else:
            self.report_row(expected_table, "Found")

    try:
        import yaml as output_fmt
    except ImportError:
        import json as output_fmt

    report = {
        actual_table: np.unique(acdf['member_name']).tolist()
        for actual_table, acdf in df.groupby('table')
    }

    output_fmt.dump(report, self.sql_edm_yaml)
    # The stream is not closed here, so push the report out of the buffer;
    # otherwise it may not reach the file until the object is collected.
    flush = getattr(self.sql_edm_yaml, "flush", None)
    if callable(flush):
        flush()
225
226
227
228
229
230
231# print (self.config['datatypes'].keys())
232# print (self.config['datatypes']['edm4hep::MCRecoCaloAssociation']['Members'])
233# print (self.config['datatypes']['edm4hep::MCRecoCaloAssociation']['OneToOneRelations'])
234#
235# for dtype, desc in self.config['datatypes'].items():
236# if 'OneToManyRelations' in desc:
237# print (dtype, desc['OneToManyRelations'])
238#
239
if __name__ == '__main__':
    # Manual check: build a validator with the default arguments and show
    # the table of expected columns derived from the EDM4hep description.
    validator = EDM4hepValidator()
    print (validator.config)
244
245
246
Resource on the Internet, locally cached.