26 output_stream: Union[str, io.TextIOWrapper] = sys.stdout,
27 edm4hep_yaml: Union[str, RemoteResource] = DEFAULT_EDM4HEP_YAML,
28 sql_edm_yaml: Union[str, io.TextIOWrapper] = open(
'sql_edm.yaml',
'w'),
30 if isinstance(edm4hep_yaml, str):
33 if isinstance(output_stream, str):
34 output_stream = open(output_stream,
"w")
36 if isinstance(sql_edm_yaml, str):
37 sql_edm_yaml = open(sql_edm_yaml,
"w")
39 with open(edm4hep_yaml.file)
as input_file:
40 loaded_yaml = input_file.read()
47 def initialize_config(loaded_yaml):
# Build the expected table layout from EDM4hep YAML text.
#
# `loaded_yaml` is parsed with yaml.safe_load and its 'datatypes' mapping is
# walked; each datatype contributes a list of (type, name[, required table])
# tuples that are finally flattened into a pandas DataFrame with columns
# table / member_type / member_name / requires_table.
#
# NOTE(review): this excerpt has gaps — several statements (accumulator
# initialisation, the `buff += [` / `]` wrappers around the tuple lists, the
# `matches = ...` assignment, the final return) are presumably on lines that
# are not visible here; confirm against the full file.
49 raw_cfg = yaml.safe_load(loaded_yaml)
50 datatypes = raw_cfg[
'datatypes']
# One iteration per datatype; `table` is the YAML key, `desc` its description.
53 for table, desc
in datatypes.items():
# The single-target unpacking requires exactly one regex match, otherwise
# a ValueError is raised by the unpacking itself.
54 this_table, = re.findall(
r"edm4hep::([\w0-9_]+).*", table)
56 for member_cpp
in desc[
'Members']:
# Cut the trailing `// ...` comment off the member declaration.
# NOTE(review): when no "//" is present, find() returns -1 and the slice
# silently drops the last character of the declaration.
57 member_cpp = member_cpp[:member_cpp.find(
"//")]
# Pattern: a type (optionally with a <...> template argument list)
# followed by the member name.
59 r'([\w:0-9]+(?:<[\w,0-9: _]+>)?) *([\w0-9]+)', member_cpp
62 raise ValueError(f
"Could not parse {member_cpp}")
# Only the first match is used; any further matches are discarded.
63 (dtype, member), *_ = matches
# Composite EDM4hep types are expanded into one scalar column per component.
64 if re.match(
"edm4hep::Vector2[df].*", dtype):
65 buff += [(
'float', f
"{member}_x"), (
'float', f
"{member}_y")]
66 elif re.match(
"edm4hep::Vector3[df].*", dtype):
68 (
'float', f
"{member}_x"),
69 (
'float', f
"{member}_y"),
70 (
'float', f
"{member}_z"),
# Integer vectors get 'int' components instead of 'float'.
72 elif re.match(
"edm4hep::Vector2[i].*", dtype):
73 buff += [(
'int', f
"{member}_x"), (
'int', f
"{member}_y")]
74 elif re.match(
"edm4hep::Vector3[i].*", dtype):
76 (
'int', f
"{member}_x"),
77 (
'int', f
"{member}_y"),
78 (
'int', f
"{member}_z"),
# Covariance matrices: one float column per index pair ij with i <= j
# (6 entries for the 3x3 case).
80 elif re.match(
"edm4hep::CovMatrix3[df].*", dtype):
82 (
'float', f
"{member}_00"),
83 (
'float', f
"{member}_01"),
84 (
'float', f
"{member}_02"),
85 (
'float', f
"{member}_11"),
86 (
'float', f
"{member}_12"),
87 (
'float', f
"{member}_22"),
# 4x4 case: 10 entries, same ij-with-i<=j suffix scheme.
89 elif re.match(
"edm4hep::CovMatrix4[df].*", dtype):
91 (
'float', f
"{member}_00"),
92 (
'float', f
"{member}_01"),
93 (
'float', f
"{member}_02"),
94 (
'float', f
"{member}_03"),
95 (
'float', f
"{member}_11"),
96 (
'float', f
"{member}_12"),
97 (
'float', f
"{member}_13"),
98 (
'float', f
"{member}_22"),
99 (
'float', f
"{member}_23"),
100 (
'float', f
"{member}_33"),
# 5x5 case: 15 entries, same suffix scheme.
102 elif re.match(
"edm4hep::CovMatrix5[df].*", dtype):
104 (
'float', f
"{member}_00"),
105 (
'float', f
"{member}_01"),
106 (
'float', f
"{member}_02"),
107 (
'float', f
"{member}_03"),
108 (
'float', f
"{member}_04"),
109 (
'float', f
"{member}_11"),
110 (
'float', f
"{member}_12"),
111 (
'float', f
"{member}_13"),
112 (
'float', f
"{member}_14"),
113 (
'float', f
"{member}_22"),
114 (
'float', f
"{member}_23"),
115 (
'float', f
"{member}_24"),
116 (
'float', f
"{member}_33"),
117 (
'float', f
"{member}_34"),
118 (
'float', f
"{member}_44"),
# Quantity members become two float columns with `_best` and `_err` suffixes.
120 elif re.match(
"edm4hep::Quantity", dtype):
122 (
'float', f
"{member}_best"),
123 (
'float', f
"{member}_err"),
# Fixed-size arrays of plain types: one column per element, suffixed with a
# zero-padded two-digit index.
# NOTE(review): these std::array patterns are not raw strings although they
# contain `\w`; the other patterns in this function use the r'' prefix.
# NOTE(review): the branch test below allows no whitespace after the comma,
# while the extraction pattern right after it does (`, *`) — a declaration
# such as `std::array<float, 3>` would not enter this branch.
125 elif re.match(
"std::array<\w+,[0-9]+>", dtype):
126 (atype, asize), = re.findall(
"std::array<(\w+), *([0-9]+)>", dtype)
# NOTE(review): `asize` is a regex capture (a str) while range() needs an
# int; the conversion is presumably on a line not visible here — confirm.
128 buff += [ (atype, f
"{member}_{i:02d}")
for i
in range(asize) ]
# Fixed-size arrays of edm4hep types: same expansion; the element type
# (still an `edm4hep::` name) is stored as the column type unchanged.
129 elif re.match(
"std::array<edm4hep::\w+, *[0-9]+>", dtype):
130 (atype, asize), = re.findall(
"std::array<(edm4hep::\w+), *([0-9]+)>", dtype)
132 buff += [ (atype, f
"{member}_{i:02d}")
for i
in range(asize) ]
# Any other edm4hep:: type has no expansion rule and is rejected.
134 elif 'edm4hep::' in dtype:
135 raise NotImplementedError(f
"Unexpected type {dtype}")
# Remaining case: the member is stored as a (type, name) pair as-is.
137 buff.append ((dtype, member))
# One-to-one relations: an 'int' column named after the relation, with the
# target table recorded as the third tuple element.
139 if 'OneToOneRelations' in desc:
140 for pointer
in desc[
'OneToOneRelations']:
# Same `//` stripping as for members (same -1 caveat when "//" is absent).
141 pointer = pointer[:pointer.find(
"//")]
142 (target_table, name), *_ = re.findall(
143 r'edm4hep::([\w0-9_]+) *([\w0-9_]+)', pointer
146 buff += [(
'int', name, target_table)]
# One-to-many relations: the third tuple element is a table name built from
# the two table names sorted alphabetically and joined with "__".
148 if 'OneToManyRelations' in desc:
149 for pointer
in desc[
'OneToManyRelations']:
150 pointer = pointer[:pointer.find(
"//")]
151 (target_table, name), *_ = re.findall(
152 r'edm4hep::([\w0-9_]+) *([\w0-9_]+)', pointer
155 zip_table =
"%s__%s" % tuple(sorted([this_table, target_table]))
157 buff += [(
'int', name, zip_table)]
# Normalise every entry to a 3-tuple: 2-tuples are padded with None.
159 tables[this_table] = [b
if len(b)==3
else (*b,
None)
for b
in buff]
# Type names that are rewritten to 'INTEGER' / 'REAL' in the member_type column.
161 INTEGER_TYPES = (
'uint64_t',
'int64_t',
'uint32_t',
'int32_t',
'int')
162 REAL_TYPES = (
'float',
'double')
# Flatten {table: [(type, name, requires), ...]} into one row per member.
# Only exact matches of the listed type names are replaced; any other
# member_type value is left untouched.
# NOTE(review): the statement consuming this DataFrame (presumably a return)
# is not visible in this excerpt.
164 pd.DataFrame([(k, *v)
for k, row
in tables.items()
for v
in row],
165 columns = [
"table",
"member_type",
"member_name",
"requires_table"])
166 .replace({
'member_type': {i:
'INTEGER' for i
in INTEGER_TYPES}})
167 .replace({
'member_type': {i:
'REAL' for i
in REAL_TYPES}})
172 def __call__(self, db):
# Inspect the tables of `db` and record, per table, the pandas dtype and
# name of every column.
#
# NOTE(review): this excerpt has gaps — the initialisation of `rows` and the
# statement that receives the DataFrame built below are presumably on lines
# not visible here; confirm against the full file.
#
# Each row of the SQLITE_MASTER_QUERY result is unpacked as (name, create);
# `create` is not used in the visible lines.
174 for name, create
in db.execute(SQLITE_MASTER_QUERY).fetchall():
# Skip every entry whose name contains the substring "sqlite".
175 if "sqlite" in name:
continue
# Read a single row only: the column names and dtypes are what is needed.
# NOTE(review): `name` is interpolated unquoted into the SQL text; a table
# name that needs quoting would break this query.
177 df = pd.read_sql(f
"SELECT * FROM {name} LIMIT 1", db)
178 columns = list(df.columns)
# One (table, dtype-as-string, column) tuple per column.
179 rows += [(name, str(df[c].dtype), c)
for c
in columns]
# pandas dtype names that are rewritten to 'INTEGER' / 'REAL'.
181 INTEGER_TYPES = (
'int64',
'int32')
182 REAL_TYPES = (
'float64',
'float32')
# Any dtype string not listed above (e.g. 'object') is left unchanged by
# the replace() calls.
185 pd.DataFrame(rows, columns=[
'table',
'member_type',
'member_name'])
186 .replace({
'member_type': {i:
'INTEGER' for i
in INTEGER_TYPES}})
187 .replace({
'member_type': {i:
'REAL' for i
in REAL_TYPES}})
# batch_id is the length of self.batch_dfs at the time assign() runs.
188 .assign(batch_id=
lambda _: len(self.
batch_dfs))
191 def report_row(self, table, message, column=None):
# Print one line of the report, either table-level or column-level.
#
# NOTE(review): the branch selecting between the two prints (presumably
# based on `column`) is on lines not visible in this excerpt.
#
# Table-level form: table name left-aligned in 33 characters, then the
# message padded to at least 80 characters.
193 print (f
"{table:<33s} | {message:80s}")
# Column-level form: table in 20 characters, column in 10, then the message;
# the `s` format spec requires `column` to be a string here.
195 print (f
"{table:<20s} | {column:<10s} | {message:80s}")
201 all_batches = np.unique(df.batch_id)
202 for expected_table, xpdf
in self.
config.groupby(
'table'):
203 found = df[df.table == expected_table]
204 found_in_batches = np.unique(found.batch_id)
209 if len(np.setdiff1d(all_batches, found_in_batches)) > 0:
211 f
"Found in {len(found_in_batches)}/{len(all_batches)}")
216 import yaml
as output_fmt
218 import json
as output_fmt
221 for actual_table, acdf
in df.groupby(
'table'):
222 report[actual_table] = np.unique(acdf.member_name).tolist()