bes Updated for version 3.21.1
The Backend Server (BES) is the lower two tiers of the Hyrax data server
check_dmrpp.cc
1#include <iostream>
2#include<fstream>
3#include <string>
4#include <vector>
5using namespace std;
6
7bool find_var_helper(const string &str, const vector<string> &var_type_list,
8 vector<string> &var_type, vector<string> &var_name);
9
10bool find_var(const string &str, const vector<string> &var_type_list,
11 vector<string> &var_type, vector<string> &var_name,
12 vector<unsigned int> &var_lines, unsigned int line_num);
13bool find_endvar(const string &str,const string &vtype);
14
15bool find_raw_data_location_info(const string &str);
16bool find_fillValue_in_chunks(const string &str);
17bool find_data_offset(const string &str);
18bool find_embedded_data_info(const string &str);
19
20string obtain_var_grp_paths(const vector<unsigned int> &gs_line_nums,
21 const vector<unsigned int> &ge_line_nums,
22 const vector<string> &grp_names,
23 unsigned int var_line);
24
25int obtain_gse_line_index(const vector<unsigned int> &gse_line_nums, unsigned int var_line);
26
27bool find_grp(const string &str, unsigned int line_num, vector<string> &grp_names,
28 vector<unsigned int> &grp_lines, vector<unsigned int> &end_grp_lines);
29
30bool find_end_grp(const string &dmrpp_line, unsigned int line_num, vector<unsigned int> &end_grp_lines);
31
32bool obtain_grp_info(const string &fname, vector<string> &grp_names, vector<unsigned int> &grp_lines,
33 vector<unsigned int> &end_grp_lines);
34
35int main (int argc, char** argv)
36{
37 // Provide the dmrpp file name and the file name to store the variables that miss values
38 if (argc <3 || argc >4) {
39
40 cout<<"Provide the dmrpp file name to be checked and the output file name that includes the missing data variables."<<endl;
41 cout<<"If you need to use the DAP2 constraint delimiter for the missing data variables, add -dap2 as the last argument."<<endl;
42 return -1;
43 }
44
45 bool dap2_output = false;
46 if (argc == 4) {
47 string dap2_str(argv[3]);
48 if (dap2_str =="-dap2")
49 dap2_output = true;
50 else {
51 cout<<"To use the DAP2 constraint delimiter for the missing data variables, the last argument must be -dap2."<<endl;
52 cout<<"The program is terminated. "<<endl;
53 return -1;
54 }
55 }
56
57 string fname(argv[1]);
58 ifstream dmrpp_fstream;
59 dmrpp_fstream.open(fname.c_str(),ifstream::in);
60 string dmrpp_line;
61
62 // DAP4 supported atomic datatype
63 vector<string> var_type_list;
64 var_type_list.push_back("Float32");
65 var_type_list.push_back("Int32");
66 var_type_list.push_back("Float64");
67 var_type_list.push_back("Byte");
68 var_type_list.push_back("Int16");
69 var_type_list.push_back("UInt16");
70 var_type_list.push_back("String");
71 var_type_list.push_back("UInt32");
72 var_type_list.push_back("Int8");
73 var_type_list.push_back("Int64");
74 var_type_list.push_back("UInt64");
75 var_type_list.push_back("UInt8");
76 var_type_list.push_back("Char");
77 var_type_list.push_back("Structure");
78
79 // var_type and var_name should be var data type and var name in the dmrpp file
80 vector<string> var_type;
81 vector<string> var_name;
82 vector<unsigned int> var_lines;
83
84 //The vector to check if there is raw data info inside this var block(<var ..> </var>)
85 vector<bool> data_exist;
86
87 // The following flags are used to check the variables that miss the values.
88 // In a dmrpp file, an example of variable block may start from
89 // <Float32 name="temperature"> and end with </Float32>
90 // fin_vb_start: flag to find the start of the var block
91 // fin_vb_end: flag to find the end of the var block
92 // data_found: flag to find if raw data information is inside the var block
93 bool fin_vb_start = false;
94 bool fin_vb_end = false;
95 bool data_found = false;
96
97 unsigned int line_num = 0;
98
99 // Check every line of the dmrpp file.
100 // We also remember the line number for every variable.
101 while (getline(dmrpp_fstream,dmrpp_line)) {
102
103 // If we find the start of the var block(<var..>)
104 if (true == fin_vb_start) {
105
106 // var data type must exist.
107 if (var_type.empty()) {
108 cout<<"Doesn't have the variable datatype, abort for dmrpp file "<<fname << endl;
109 return -1;
110 }
111 // Not find the end of var block. try to find it.
112 if (false == fin_vb_end)
113 fin_vb_end = find_endvar(dmrpp_line, var_type[var_type.size()-1]);
114
115 // If find the end of var block, check if the raw data info is already found in the var block.
116 if (true == fin_vb_end) {
117
118 if (false == data_found)
119 data_exist.push_back(false);
120
121 // If we find the end of this var block,
122 // reset all bools for the next variable.
123 fin_vb_start = false;
124 fin_vb_end = false;
125 data_found = false;
126 }
127 else {// Check if having raw data info. within this var block.
128
129 if (false == data_found) {
130
131 data_found = find_raw_data_location_info(dmrpp_line);
132
133 // When finding the raw data location info in this dmrpp file, update the data_exist vector.
134 if (true == data_found)
135 data_exist.push_back(true);
136
137 }
138 }
139 }
140 else // Continue finding the var block
141 fin_vb_start = find_var(dmrpp_line,var_type_list,var_type,var_name, var_lines,line_num);
142
143 line_num++;
144 }
145
146 //Sanity check to make sure the data_exist vector is the same as var_type vector.
147 //If not, something is wrong with this dmrpp file.
148 if (data_exist.size() != var_type.size()) {
149 cout<<"Number of chunk check is not consistent with the number of var check."<<endl;
150 cout<< "The dmrpp file is "<<fname<<endl;
151 return -1;
152 }
153
154 bool has_missing_info = false;
155 size_t last_missing_chunk_index = 0;
156
157 // Check if there is any missing variable information.
158 // Here we need to remember the last missing chunking index for the final output.
159 if (!var_type.empty()) {
160 auto ritr = var_type.rbegin();
161 size_t i = var_type.size() - 1;
162 while (ritr != var_type.rend()) {
163 if (!data_exist[i]) {
164 has_missing_info = true;
165 last_missing_chunk_index = i;
166 break;
167 }
168 ritr++;
169 i--;
170 }
171 }
172
173 // Report the final output.
174 if (true == has_missing_info) {
175
176 // Check group and var_grp names for group hierarchy enhancement.
177 vector<string> grp_names;
178 vector<unsigned int> grp_lines;
179 vector<unsigned int> end_grp_lines;
180
181 bool has_grps = obtain_grp_info(fname,grp_names,grp_lines,end_grp_lines);
182 if (grp_lines.size() != end_grp_lines.size()) {
183 cout<<"The number of group bracket is NOT the same as the number of end group bracket."<<endl;
184 return -1;
185 }
186
187 // fname2 is the output file that contains the missing variable information.
188 ofstream dmrpp_ofstream;
189 string fname2(argv[2]);
190 dmrpp_ofstream.open(fname2.c_str(),ofstream::out | ofstream::trunc);
191
192 // We need to loop through every variable. Note: we just another index to obtain the corresponding
193 // variable lines and variable names.
194 size_t i = 0;
195 for (auto vt:var_type) {
196 if(!data_exist[i]) {
197 string var_str;
198 if (has_grps) {
199 // We need to obtain the missing variable FQN.
200 string var_path = obtain_var_grp_paths(grp_lines,end_grp_lines,grp_names,var_lines[i]);
201 var_str = var_path + "/" + var_name[i];
202 }
203 else
204 var_str = var_name[i];
205
206 // Note: DAP4 constraint syntax needs semicolon(';") whereas DAP2 constraint needs comma(',');
207 // Essentially the DAP4 constraint is general enough to cover everything, however, the current
208 // get_dmrpp still uses the DAP2 constraint. To keep it compatible with get_dmrpp for the non-group case,
209 // I still keep comma.
210 if (i != last_missing_chunk_index) {
211 if (dap2_output)
212 dmrpp_ofstream<<var_str <<",";
213 else
214 dmrpp_ofstream<<var_str <<";";
215 }
216 else
217 dmrpp_ofstream<<var_str;
218 }
219 i++;
220 }
221 }
222 return 0;
223
224}
225
// Recognize the start of a variable block such as <Int16 name="foo"> and,
// on success, append the variable type to var_type and the variable name to var_name.
//
// A line is accepted only when all of the following hold:
//  - it starts with a space and its first non-space character is '<',
//  - its last character is '>',
//  - the text between '<' and the first ' name="' equals one entry of var_type_list,
//  - a closing '"' follows the name.
// Returns true when a variable was recorded; otherwise the vectors are untouched.
bool find_var_helper(const string &str, const vector<string> &var_type_list,
                     vector<string> &var_type,vector<string> &var_name) {

    // Every var block will have spaces before '<'.
    if (str[0] != ' ')
        return false;

    // Lines made only of spaces, or whose first visible character is not '<', are ignored.
    const size_t tag_pos = str.find_first_not_of(' ');
    if (tag_pos == string::npos || str[tag_pos] != '<')
        return false;

    // Something must follow '<', and the line must end with '>' (maybe too strict).
    if (tag_pos + 1 >= str.size() || str[str.size() - 1] != '>')
        return false;

    // Cheap pre-filter: the type name has to begin with the first letter
    // of one of the possible variable types.
    switch (str[tag_pos + 1]) {
        case 'F':
        case 'I':
        case 'U':
        case 'B':
        case 'S':
        case 'C':
            break;
        default:
            return false;
    }

    // Locate ' name="' after the first character of the type, like <Int16 name="d16_1">
    const string name_attr = " name=\"";
    const size_t attr_pos = str.find(name_attr, tag_pos + 2);
    if (attr_pos == string::npos)
        return false;

    // Everything between '<' and ' name="' must be exactly one supported type.
    const string type_token = str.substr(tag_pos + 1, attr_pos - tag_pos - 1);
    const string *matched_type = nullptr;
    for (const auto &candidate : var_type_list) {
        if (type_token == candidate) {
            matched_type = &candidate;
            break;
        }
    }
    if (matched_type == nullptr)
        return false;

    // The closing quote is searched starting one character into the name,
    // i.e. the name is taken to be at least one character long.
    const size_t name_begin = attr_pos + name_attr.size();
    const size_t name_end = str.find('"', name_begin + 1);
    if (name_end == string::npos)
        return false;

    // Both var type and var name are known; store them.
    var_type.push_back(*matched_type);
    var_name.push_back(str.substr(name_begin, name_end - name_begin));
    return true;
}
301
302// Find if this var block contains the raw data info.
303bool find_raw_data_location_info(const string &dmrpp_line) {
304
305 bool ret = false;
306
307 // Check if this var contains data storage key word fillValue.
308 ret = find_fillValue_in_chunks(dmrpp_line);
309
310 // Check if this var contains the key word chunk or block and offset.
311 if (ret == false)
312 ret = find_data_offset(dmrpp_line);
313
314 // Also need to find if having a key word such as dmrpp:missingdata.
315 // These key words indicate the data is stored inside the dmrpp file.
316 if (false == ret)
317 ret = find_embedded_data_info(dmrpp_line);
318
319 return ret;
320
321}
322
// Return true when the line contains "<dmrpp:chunks " and the key word
// "fillValue" appears somewhere after that marker.
bool find_fillValue_in_chunks(const string &str) {

    const string chunks_tag = "<dmrpp:chunks ";

    const size_t tag_pos = str.find(chunks_tag);
    if (tag_pos == string::npos)
        return false;

    // Only a fillValue that follows the marker counts.
    return str.find("fillValue", tag_pos + chunks_tag.size()) != string::npos;
}
338
// Find whether the line describes a chunk or a block of stored data.
// Any chunk info (chunked or contiguous) should include
// "<dmrpp:chunk " or "<dmrpp:block " followed, later on the same line, by "offset".
bool find_data_offset(const string &str) {

    // True when `tag` occurs in the line and "offset" occurs after it.
    const auto has_offset_after = [&str](const string &tag) {
        const size_t tag_pos = str.find(tag);
        return tag_pos != string::npos
               && str.find("offset", tag_pos + tag.size()) != string::npos;
    };

    return has_offset_after("<dmrpp:chunk ") || has_offset_after("<dmrpp:block ");
}
360
// Find whether the line opens an element whose data is embedded in the dmrpp file.
// Currently the recognized elements are <dmrpp:compact>, <dmrpp:missingdata>,
// <dmrpp:vlsa> and <dmrpp:specialstructuredata>; each must appear exactly
// in that form (including the closing '>') somewhere on the line.
bool find_embedded_data_info(const string &str) {

    const char *const embedded_tags[] = {"<dmrpp:compact>",
                                         "<dmrpp:missingdata>",
                                         "<dmrpp:vlsa>",
                                         "<dmrpp:specialstructuredata>"};

    for (const char *tag : embedded_tags) {
        if (str.find(tag) != string::npos)
            return true;
    }
    return false;
}
381
382
// Find the end of a var block such as </Int32>.
// The closing tag is built from vtype; the line matches when the first
// occurrence of that tag is also the very end of the line, so anything
// (for example spaces) may precede the tag but nothing may follow it.
bool find_endvar(const string &str, const string &vtype) {

    const string closing_tag = "</" + vtype + '>';
    const size_t tag_pos = str.find(closing_tag);

    return tag_pos != string::npos
           && tag_pos + closing_tag.size() == str.size();
}
396
397bool find_var(const string &str, const vector<string> &var_type_list,
398 vector<string> &var_type,vector<string> &var_name,
399 vector<unsigned int> &var_lines, unsigned int line_num) {
400
401 bool ret_value = find_var_helper(str,var_type_list,var_type,var_name);
402 if (ret_value == true)
403 var_lines.push_back(line_num);
404 return ret_value;
405}
406
407// obtain the group names, group line numbers and end group line numbers.
408// The return value is true if there are any groups.
409bool obtain_grp_info(const string &fname, vector<string> &grp_names,
410 vector<unsigned int> &grp_lines,vector<unsigned int> &end_grp_lines)
411{
412
413 string dmrpp_line;
414
415 // find <Group>
416 bool find_grp_start = false;
417 unsigned int line_num = 0;
418
419 ifstream dmrpp_fstream;
420 dmrpp_fstream.open(fname.c_str(),ifstream::in);
421
422 while(getline(dmrpp_fstream,dmrpp_line)) {
423
424 find_grp_start = find_grp(dmrpp_line,line_num,grp_names,grp_lines,end_grp_lines);
425 if (find_grp_start == false)
426 find_end_grp(dmrpp_line,line_num,end_grp_lines);
427 line_num++;
428 }
429
430 return !(grp_names.empty());
431
432}
433
// Recognize the start of a group such as <Group name="foo"> and, on success,
// append the group name to grp_names and line_num to grp_lines.
//
// A line is accepted only when it starts with a space, its first non-space
// character is '<' immediately followed by 'G', its last character is '>',
// and it contains 'Group name="' followed by a closing '"'.
// An empty group written as <Group name="foo"/> also gets line_num appended
// to end_grp_lines, because the same line closes the group.
// Returns true when a group was recorded.
bool find_grp(const string &str, unsigned int line_num, vector<string> &grp_names,
              vector<unsigned int> &grp_lines, vector<unsigned int> &end_grp_lines) {

    // Every group block will have spaces before '<'.
    if (str[0] != ' ')
        return false;

    // Lines made only of spaces, or whose first visible character is not '<', are ignored.
    const size_t tag_pos = str.find_first_not_of(' ');
    if (tag_pos == string::npos || str[tag_pos] != '<')
        return false;

    // Something must follow '<', and the line must end with '>' (maybe too strict).
    if (tag_pos + 1 >= str.size() || str[str.size() - 1] != '>')
        return false;

    // The element name has to begin with 'G'.
    if (str[tag_pos + 1] != 'G')
        return false;

    // Locate 'Group name="', like <Group name="HDFEOS">
    const string group_attr = "Group name=\"";
    const size_t attr_pos = str.find(group_attr, tag_pos + 1);
    if (attr_pos == string::npos)
        return false;

    // The closing quote is searched starting one character into the name,
    // i.e. the name is taken to be at least one character long.
    const size_t name_begin = attr_pos + group_attr.size();
    const size_t name_end = str.find('"', name_begin + 1);
    if (name_end == string::npos)
        return false;

    grp_names.push_back(str.substr(name_begin, name_end - name_begin));
    grp_lines.push_back(line_num);

    // Empty group case, like <Group name="FILE_ATTRIBUTES"/>:
    // remember this line also as the end group line.
    if (name_end + 1 < str.size() && str[name_end + 1] == '/')
        end_grp_lines.push_back(line_num);

    return true;
}
496
// Find the end of a group block, i.e. a line that ends with </Group>.
// There may be space (or anything else) before </Group>, but the first
// occurrence of the tag must also be the very end of the line.
// On a match line_num is appended to end_grp_lines and true is returned.
bool find_end_grp(const string &dmrpp_line,unsigned int line_num, vector<unsigned int> &end_grp_lines) {

    const string closing_tag = "</Group>";
    const size_t tag_pos = dmrpp_line.find(closing_tag);

    if (tag_pos == string::npos)
        return false;
    if (tag_pos + closing_tag.size() != dmrpp_line.size())
        return false;

    end_grp_lines.push_back(line_num);
    return true;
}
512
513
514// Obtain the variable path.
515string obtain_var_grp_paths(const vector<unsigned int> &gs_line_nums,
516 const vector<unsigned int> &ge_line_nums,
517 const vector<string> & grp_names,
518 unsigned int var_line) {
519 string ret_value;
520
521 vector<unsigned int> gse_line_nums;
522 vector<bool> is_group_start;
523
524 unsigned int end_grp_index = 0;
525 unsigned int start_grp_index = 0;
526
527 // The maximum index of the group is the number of groups minus 1 since index is from 0.
528 unsigned int max_grp_index = gs_line_nums.size() -1;
529
530 // We combine both group lines and end_group lines to one vector.
531 // Another vector of bool with the same size is created to mark if
532 // this line is a start_of_a_group or an end_of_a_group.
533 // During this process, we elimiate the trivial groups.
534
535 while (end_grp_index <= max_grp_index) {
536
537 while (start_grp_index <= max_grp_index) {
538
539 if (gs_line_nums[start_grp_index] < ge_line_nums[end_grp_index]) {
540 gse_line_nums.push_back(gs_line_nums[start_grp_index]);
541 is_group_start.push_back(true);
542 start_grp_index++;
543 }
544 else if (gs_line_nums[start_grp_index] == ge_line_nums[end_grp_index]) {
545 // Exclude the case when the starting group line is equal to the ending group line.
546 // This is the empty group case.
547 start_grp_index++;
548 end_grp_index++;
549 }
550 else {
551 gse_line_nums.push_back(ge_line_nums[end_grp_index]);
552 is_group_start.push_back(false);
553 end_grp_index++;
554 }
555 }
556 // end group </Group> will always be at last.
557 if (end_grp_index < (max_grp_index+1)) {
558 gse_line_nums.push_back(ge_line_nums[end_grp_index]);
559 is_group_start.push_back(false);
560 end_grp_index++;
561 }
562 }
563
564 // No need to check this. It should always be true.
565#if 0
566 if (is_group_start.size() != gse_line_nums.size()) {
567 cerr<<"The group "<<endl;
568 return ret_value;
569 }
570#endif
571
572 // Debugging info, leave the block now.
573#if 0
574for (unsigned int i =0; i<gse_line_nums.size();i++) {
575 cerr<<"gse_line["<<i<<"] = "<<gse_line_nums[i] <<endl;
576 cerr<<"is_group_start["<<i<<"] = "<<is_group_start[i] <<endl;
577}
578#endif
579
580 // Obtain the start_end_group line index just before the the variable line.
581 int gse_line_index= obtain_gse_line_index(gse_line_nums,var_line);
582
583#if 0
584cerr<<"gse_line_index: "<<gse_line_index <<endl;
585#endif
586
587 // obtain group lines that this variable belongs to.
588 vector<unsigned int> grp_path_lines;
589
590 if (gse_line_index >= 0) {
591
592 int temp_index = gse_line_index;
593
594 // temp_rem_grp_index indicates the groups we need to remove for this var.
595 // The removed groups are groups that don't contain this variable.
596 unsigned int temp_rem_grp_index = 0;
597
598 // We have to search backward.
599 while (temp_index >= 0) {
600
601 // Encounter an end-group, we need to increase the index.
602 if (is_group_start[temp_index] == false)
603 temp_rem_grp_index++;
604 else {
605 // Only when the number of end-group counter is 0 for this block,
606 // does this group path belong to this variable.
607 if (temp_rem_grp_index == 0)
608 grp_path_lines.push_back(gse_line_nums[temp_index]);
609 else
610 temp_rem_grp_index--; //Cancel one start-group and end-group
611 }
612 temp_index--;
613 }
614 }
615
616 // For debugging
617#if 0
618for (const auto &gpl:grp_path_lines)
619cerr<<"grp_path_lines "<<gpl <<endl;
620for (const auto &gsn:gs_line_nums)
621cerr<<"gs_lines "<<gsn <<endl;
622for (const auto &gn:grp_names)
623cerr<<"group name is "<<gn <<endl;
624#endif
625
626 // Both the group path for this var and the group lines are sorted.
627 // group path is from backward. So we match the group line backward.
628 int gl_index = gs_line_nums.size() - 1; // gl_index should start with size-1 since we count backwards to zero
629
630 for (const auto & gpl:grp_path_lines) {
631
632 // Note: gl_index is modified. This is intentionally since
633 // we don't need to search the lines already visited.
634 // We just need to prepend the group path as we search backward.
635 for (; gl_index >= 0; gl_index--) {
636
637 if (gpl == gs_line_nums[gl_index]) {
638
639 ret_value = "/" + grp_names[gl_index] + ret_value;
640 gl_index--;
641 break;
642 }
643 }
644 }
645
646#if 0
647cerr<<"ret_value is "<<ret_value <<endl;
648#endif
649
650 return ret_value;
651
652}
653
// Obtain the start_end_group line index just before the variable line:
// scanning from the back, the index of the first entry of gse_line_nums
// that is not greater than var_line.
// The returned value is -1 if there is no such entry (no group before this var).
int obtain_gse_line_index(const vector<unsigned int> &gse_line_nums, unsigned int var_line) {

    for (size_t remaining = gse_line_nums.size(); remaining > 0; --remaining) {
        const size_t candidate = remaining - 1;
        if (gse_line_nums[candidate] <= var_line)
            return static_cast<int>(candidate);
    }
    return -1;
}
674
675
676
677
678
679
680
681