// Forward declarations for the helpers that are defined after main().

// --- Variable block detection ---------------------------------------------

// Recognises a variable start tag and appends its type and name.
bool find_var_helper(const std::string &str, const std::vector<std::string> &var_type_list,
                     std::vector<std::string> &var_type, std::vector<std::string> &var_name);

// Same as find_var_helper(), and also records the line number of the match.
bool find_var(const std::string &str, const std::vector<std::string> &var_type_list,
              std::vector<std::string> &var_type, std::vector<std::string> &var_name,
              std::vector<unsigned int> &var_lines, unsigned int line_num);

// Tests whether a line ends with the closing tag of the given variable type.
bool find_endvar(const std::string &str, const std::string &vtype);

// --- Data location detection ----------------------------------------------

bool find_raw_data_location_info(const std::string &str);
bool find_fillValue_in_chunks(const std::string &str);
bool find_data_offset(const std::string &str);
bool find_embedded_data_info(const std::string &str);

// --- Group handling -------------------------------------------------------

// Builds the group path of the variable that starts on var_line.
std::string obtain_var_grp_paths(const std::vector<unsigned int> &gs_line_nums,
                                 const std::vector<unsigned int> &ge_line_nums,
                                 const std::vector<std::string> &grp_names,
                                 unsigned int var_line);

int obtain_gse_line_index(const std::vector<unsigned int> &gse_line_nums, unsigned int var_line);

bool find_grp(const std::string &str, unsigned int line_num, std::vector<std::string> &grp_names,
              std::vector<unsigned int> &grp_lines, std::vector<unsigned int> &end_grp_lines);

bool find_end_grp(const std::string &dmrpp_line, unsigned int line_num,
                  std::vector<unsigned int> &end_grp_lines);

bool obtain_grp_info(const std::string &fname, std::vector<std::string> &grp_names,
                     std::vector<unsigned int> &grp_lines,
                     std::vector<unsigned int> &end_grp_lines);
// Entry point of the DMR++ checker.
// Reads the DMR++ file named by argv[1] line by line, tracks each recognised
// variable block, and records in data_exist whether a data-location element
// was seen inside it. When some variable has none, variable names are written
// to the file named by argv[2].
// NOTE(review): a number of source lines of this function (braces, returns,
// else-branches, increments) are not present in this listing. The comments
// below describe only the statements that are visible.
35int main (
int argc,
char** argv)
// Usage check: only two or three command-line arguments are accepted.
38 if (argc <3 || argc >4) {
40 cout<<
"Provide the dmrpp file name to be checked and the output file name that includes the missing data variables."<<endl;
41 cout<<
"If you need to use the DAP2 constraint delimiter for the missing data variables, add -dap2 as the last argument."<<endl;
// Optional third argument: must be exactly "-dap2".
// NOTE(review): argv[3] is only valid when argc == 4; the guard is not
// visible here - confirm it exists.
45 bool dap2_output =
false;
47 string dap2_str(argv[3]);
48 if (dap2_str ==
"-dap2")
51 cout<<
"To use the DAP2 constraint delimiter for the missing data variables, the last argument must be -dap2."<<endl;
52 cout<<
"The program is terminated. "<<endl;
// Open the DMR++ file for reading.
57 string fname(argv[1]);
58 ifstream dmrpp_fstream;
59 dmrpp_fstream.open(fname.c_str(),ifstream::in);
// Type names that find_var() is asked to recognise as variable start tags.
63 vector<string> var_type_list;
64 var_type_list.push_back(
"Float32");
65 var_type_list.push_back(
"Int32");
66 var_type_list.push_back(
"Float64");
67 var_type_list.push_back(
"Byte");
68 var_type_list.push_back(
"Int16");
69 var_type_list.push_back(
"UInt16");
70 var_type_list.push_back(
"String");
71 var_type_list.push_back(
"UInt32");
72 var_type_list.push_back(
"Int8");
73 var_type_list.push_back(
"Int64");
74 var_type_list.push_back(
"UInt64");
75 var_type_list.push_back(
"UInt8");
76 var_type_list.push_back(
"Char");
77 var_type_list.push_back(
"Structure");
// Per-variable records filled by find_var(): type, name and line number.
80 vector<string> var_type;
81 vector<string> var_name;
82 vector<unsigned int> var_lines;
// One flag per variable: true when data-location info was found in its block.
85 vector<bool> data_exist;
// Scanner state: inside a variable block / end tag seen / data info seen.
93 bool fin_vb_start =
false;
94 bool fin_vb_end =
false;
95 bool data_found =
false;
97 unsigned int line_num = 0;
// Main scan over the lines of the DMR++ file.
101 while (getline(dmrpp_fstream,dmrpp_line)) {
// Inside a variable block.
104 if (
true == fin_vb_start) {
// A started block implies find_var() pushed a type; an empty list is reported.
107 if (var_type.empty()) {
108 cout<<
"Doesn't have the variable datatype, abort for dmrpp file "<<fname << endl;
// Look for the closing tag of the most recently started variable.
112 if (
false == fin_vb_end)
113 fin_vb_end = find_endvar(dmrpp_line, var_type[var_type.size()-1]);
// Block finished: if no data info was seen, record the variable as missing data.
116 if (
true == fin_vb_end) {
118 if (
false == data_found)
119 data_exist.push_back(
false);
123 fin_vb_start =
false;
// Still looking for data-location info for the current variable.
129 if (
false == data_found) {
131 data_found = find_raw_data_location_info(dmrpp_line);
134 if (
true == data_found)
135 data_exist.push_back(
true);
// Try to detect the start of a variable block on this line.
// NOTE(review): presumably the else-branch of the fin_vb_start test - confirm.
141 fin_vb_start = find_var(dmrpp_line,var_type_list,var_type,var_name, var_lines,line_num);
// Sanity check: exactly one data_exist flag per detected variable.
148 if (data_exist.size() != var_type.size()) {
149 cout<<
"Number of chunk check is not consistent with the number of var check."<<endl;
150 cout<<
"The dmrpp file is "<<fname<<endl;
// Scan from the back for a variable whose data_exist flag is false and keep
// its index in last_missing_chunk_index.
154 bool has_missing_info =
false;
155 size_t last_missing_chunk_index = 0;
159 if (!var_type.empty()) {
160 auto ritr = var_type.rbegin();
161 size_t i = var_type.size() - 1;
162 while (ritr != var_type.rend()) {
163 if (!data_exist[i]) {
164 has_missing_info =
true;
165 last_missing_chunk_index = i;
// Output stage: only entered when some variable lacks data info.
174 if (
true == has_missing_info) {
// Group start and end lines are collected by re-reading the same file.
177 vector<string> grp_names;
178 vector<unsigned int> grp_lines;
179 vector<unsigned int> end_grp_lines;
181 bool has_grps = obtain_grp_info(fname,grp_names,grp_lines,end_grp_lines);
182 if (grp_lines.size() != end_grp_lines.size()) {
183 cout<<
"The number of group bracket is NOT the same as the number of end group bracket."<<endl;
// The output file named by argv[2] is truncated on open.
188 ofstream dmrpp_ofstream;
189 string fname2(argv[2]);
190 dmrpp_ofstream.open(fname2.c_str(),ofstream::out | ofstream::trunc);
// NOTE(review): 'auto vt' copies each string, and the visible statements
// index by i rather than using vt.
195 for (
auto vt:var_type) {
// Variable name, with its group path prepended when one is computed.
200 string var_path = obtain_var_grp_paths(grp_lines,end_grp_lines,grp_names,var_lines[i]);
201 var_str = var_path +
"/" + var_name[i];
204 var_str = var_name[i];
// Every entry except the one at last_missing_chunk_index is followed by a
// delimiter. NOTE(review): presumably "," for -dap2 and ";" otherwise - the
// selecting condition is not visible here.
210 if (i != last_missing_chunk_index) {
212 dmrpp_ofstream<<var_str <<
",";
214 dmrpp_ofstream<<var_str <<
";";
217 dmrpp_ofstream<<var_str;
/**
 * @brief Detect a variable start tag such as `<Int16 name="d16_1">`.
 *
 * The line must consist of optional leading spaces, then '<', and must end
 * with '>'. The text between '<' and the first ` name="` has to equal one of
 * the entries of @p var_type_list.
 *
 * @param str            One line of the DMR++ file.
 * @param var_type_list  Type names that count as variables.
 * @param var_type       Receives the matched type name on success.
 * @param var_name       Receives the value of the name attribute on success.
 * @return true when a variable tag was recognised and both vectors were
 *         appended to; false otherwise (the vectors are left untouched).
 */
bool find_var_helper(const std::string &str, const std::vector<std::string> &var_type_list,
                     std::vector<std::string> &var_type, std::vector<std::string> &var_name) {

    // Ignore lines that are empty or contain spaces only.
    const size_t non_space_char_pos = str.find_first_not_of(' ');
    if (non_space_char_pos == std::string::npos)
        return false;

    // The first non-space character must open a tag.
    if (str[non_space_char_pos] != '<')
        return false;

    // At least one more character has to follow '<'.
    if (str.size() <= (non_space_char_pos + 1))
        return false;

    // The tag must close at the very end of the line.
    if (str[str.size() - 1] != '>')
        return false;

    // Cheap pre-filter on the character right after '<'.
    const char char_2 = str[non_space_char_pos + 1];
    const std::string v_1char_list = "FIUBSC";
    if (v_1char_list.find_first_of(char_2) == std::string::npos)
        return false;

    // Locate the name attribute; the type name lies between '<' and it.
    const std::string sep = " name=\"";
    const size_t sep_pos = str.find(sep, non_space_char_pos + 2);
    if (sep_pos == std::string::npos)
        return false;

    // Identify the variable type by exact comparison with the supported list.
    size_t var_index = 0;
    bool found = false;
    for (size_t i = 0; i < var_type_list.size() && !found; i++) {
        if (str.compare(non_space_char_pos + 1, sep_pos - non_space_char_pos - 1, var_type_list[i]) == 0) {
            var_index = i;
            found = true;
        }
    }
    if (!found)
        return false;

    // Find the quote that terminates the name. The search starts one
    // character into the name, so the name has at least one character.
    const char end_quote = '"';
    const size_t end_name_pos = str.find(end_quote, sep_pos + sep.size() + 1);
    if (end_name_pos == std::string::npos)
        return false;

    const std::string var_name_line = str.substr(sep_pos + sep.size(), end_name_pos - sep_pos - sep.size());
    var_type.push_back(var_type_list[var_index]);
    var_name.push_back(var_name_line);
    return true;
}
303bool find_raw_data_location_info(
const string &dmrpp_line) {
308 ret = find_fillValue_in_chunks(dmrpp_line);
312 ret = find_data_offset(dmrpp_line);
317 ret = find_embedded_data_info(dmrpp_line);
/**
 * @brief Check whether a line has a `<dmrpp:chunks ` element followed by the
 *        text `fillValue`.
 *
 * @param str One line of the DMR++ file.
 * @return true when `fillValue` occurs after the `<dmrpp:chunks ` marker.
 */
bool find_fillValue_in_chunks(const std::string &str) {
    const std::string fvalue_mark = "fillValue";
    const std::string dmrpp_chunks_mark = "<dmrpp:chunks ";

    const size_t dmrpp_chunks_mark_pos = str.find(dmrpp_chunks_mark);
    if (dmrpp_chunks_mark_pos == std::string::npos)
        return false;

    // Only text after the element marker counts.
    return str.find(fvalue_mark, dmrpp_chunks_mark_pos + dmrpp_chunks_mark.size()) != std::string::npos;
}
/**
 * @brief Check whether a line has a `<dmrpp:chunk ` or `<dmrpp:block `
 *        element followed by the text `offset`.
 *
 * @param str One line of the DMR++ file.
 * @return true when `offset` occurs after one of the two element markers.
 */
bool find_data_offset(const std::string &str) {
    const std::string offset_mark = "offset";
    const std::vector<std::string> data_storage_mark_list = {"<dmrpp:chunk ", "<dmrpp:block "};

    for (const auto &data_storage_mark : data_storage_mark_list) {
        const size_t data_storage_mark_pos = str.find(data_storage_mark);
        if (data_storage_mark_pos == std::string::npos)
            continue;

        // Only text after the element marker counts.
        if (str.find(offset_mark, data_storage_mark_pos + data_storage_mark.size()) != std::string::npos)
            return true;
    }
    return false;
}
/**
 * @brief Check whether a line contains one of the embedded-data elements
 *        `<dmrpp:compact>`, `<dmrpp:missingdata>` or
 *        `<dmrpp:specialstructuredata>`.
 *
 * @param str One line of the DMR++ file.
 * @return true when any of the markers occurs anywhere in the line.
 */
bool find_embedded_data_info(const std::string &str) {
    // NOTE(review): confirm this list covers every embedded-data element the
    // DMR++ writer can emit.
    const std::vector<std::string> embedded_data_block_list = {"<dmrpp:compact>",
                                                              "<dmrpp:missingdata>",
                                                              "<dmrpp:specialstructuredata>"};

    for (const auto &embedded_data_block : embedded_data_block_list) {
        const size_t embedded_data_block_pos = str.find(embedded_data_block);
        if (embedded_data_block_pos != std::string::npos)
            return true;
    }
    return false;
}
/**
 * @brief Check whether a line ends with the closing tag of a variable block.
 *
 * The closing tag is `</` + @p vtype + `>`. Its first occurrence in the line
 * must run up to the last character of the line.
 *
 * @param str   One line of the DMR++ file.
 * @param vtype Type name of the variable whose block is open.
 * @return true when the first occurrence of the closing tag ends the line.
 */
bool find_endvar(const std::string &str, const std::string &vtype) {
    const std::string end_var = "</" + vtype + '>';
    const size_t vb_end_pos = str.find(end_var);
    if (vb_end_pos == std::string::npos)
        return false;

    // Nothing, not even trailing spaces, may follow the closing tag.
    return (vb_end_pos + end_var.size()) == str.size();
}
397bool find_var(
const string &str,
const vector<string> &var_type_list,
398 vector<string> &var_type,vector<string> &var_name,
399 vector<unsigned int> &var_lines,
unsigned int line_num) {
401 bool ret_value = find_var_helper(str,var_type_list,var_type,var_name);
402 if (ret_value ==
true)
403 var_lines.push_back(line_num);
// Re-reads the DMR++ file named by fname and collects group information.
// Each line is passed to find_grp(), which appends to grp_names, grp_lines
// and end_grp_lines. Lines that find_grp() rejects are passed to
// find_end_grp(), which appends to end_grp_lines.
// Returns true when at least one group name was collected.
// NOTE(review): the statement that advances line_num is not visible in this
// listing. Its position must match the numbering used for var_lines in
// main() - confirm.
409bool obtain_grp_info(
const string &fname, vector<string> &grp_names,
410 vector<unsigned int> &grp_lines,vector<unsigned int> &end_grp_lines)
416 bool find_grp_start =
false;
417 unsigned int line_num = 0;
// NOTE(review): the stream state is not checked after open() in the visible lines.
419 ifstream dmrpp_fstream;
420 dmrpp_fstream.open(fname.c_str(),ifstream::in);
422 while(getline(dmrpp_fstream,dmrpp_line)) {
// Try a group start first; only otherwise look for a group end on this line.
424 find_grp_start = find_grp(dmrpp_line,line_num,grp_names,grp_lines,end_grp_lines);
425 if (find_grp_start ==
false)
426 find_end_grp(dmrpp_line,line_num,end_grp_lines);
430 return !(grp_names.empty());
/**
 * @brief Detect a group start tag such as `<Group name="g1">`.
 *
 * The line must consist of optional leading spaces, then '<', and must end
 * with '>'. On a match the group name and @p line_num are recorded. When the
 * character right after the closing quote of the name is '/', as in
 * `<Group name="g1"/>`, the same line is also recorded as the group's end.
 *
 * @param str            One line of the DMR++ file.
 * @param line_num       Line number of @p str.
 * @param grp_names      Receives the group name on success.
 * @param grp_lines      Receives @p line_num on success.
 * @param end_grp_lines  Receives @p line_num for a self-closing group tag.
 * @return true when a group start tag was recognised.
 */
bool find_grp(const std::string &str, unsigned int line_num, std::vector<std::string> &grp_names,
              std::vector<unsigned int> &grp_lines, std::vector<unsigned int> &end_grp_lines) {

    // Ignore lines that are empty or contain spaces only.
    const size_t non_space_char_pos = str.find_first_not_of(' ');
    if (non_space_char_pos == std::string::npos)
        return false;

    // The first non-space character must open a tag.
    if (str[non_space_char_pos] != '<')
        return false;

    // At least one more character has to follow '<'.
    if (str.size() <= (non_space_char_pos + 1))
        return false;

    // The tag must close at the very end of the line.
    if (str[str.size() - 1] != '>')
        return false;

    // Cheap pre-filter: a group tag starts with 'G' right after '<'.
    const char char_2 = str[non_space_char_pos + 1];
    if (char_2 != 'G')
        return false;

    const std::string sep = "Group name=\"";
    const size_t sep_pos = str.find(sep, non_space_char_pos + 1);
    if (sep_pos == std::string::npos)
        return false;

    // Find the quote that terminates the name. The search starts one
    // character into the name, so the name has at least one character.
    const char end_quote = '"';
    const size_t end_name_pos = str.find(end_quote, sep_pos + sep.size() + 1);
    if (end_name_pos == std::string::npos)
        return false;

    const std::string grp_name = str.substr(sep_pos + sep.size(), end_name_pos - sep_pos - sep.size());
    grp_names.push_back(grp_name);
    grp_lines.push_back(line_num);

    // A self-closing tag starts and ends the group on the same line.
    if ((str.size() > (end_name_pos + 1)) && str[end_name_pos + 1] == '/')
        end_grp_lines.push_back(line_num);

    return true;
}
/**
 * @brief Detect a group end tag and remember the line it was found on.
 *
 * The first occurrence of `</Group>` must run up to the last character of
 * the line. On a match @p line_num is appended to @p end_grp_lines.
 *
 * @param dmrpp_line     One line of the DMR++ file.
 * @param line_num       Line number of @p dmrpp_line.
 * @param end_grp_lines  Receives @p line_num on success.
 * @return true when the line was recorded as a group end.
 */
bool find_end_grp(const std::string &dmrpp_line, unsigned int line_num,
                  std::vector<unsigned int> &end_grp_lines) {

    const std::string end_grp = "</Group>";
    const size_t end_grp_pos = dmrpp_line.find(end_grp);
    if (end_grp_pos == std::string::npos)
        return false;

    // Nothing, not even trailing spaces, may follow the end tag.
    if ((end_grp_pos + end_grp.size()) != dmrpp_line.size())
        return false;

    end_grp_lines.push_back(line_num);
    return true;
}
// Builds the group path for the variable found on line var_line.
// gs_line_nums and ge_line_nums hold the line numbers of group start and
// group end tags; grp_names is indexed like gs_line_nums (see the lookup at
// the end of the function). The path is assembled in ret_value by prepending
// "/" + group name for every group selected below.
// NOTE(review): several source lines (branch bodies, index increments, the
// declaration of ret_value, the return) are not present in this listing. The
// comments describe only the visible statements.
515string obtain_var_grp_paths(
const vector<unsigned int> &gs_line_nums,
516 const vector<unsigned int> &ge_line_nums,
517 const vector<string> & grp_names,
518 unsigned int var_line) {
// Combined list of start and end line numbers, plus a parallel flag vector
// (true = group start, false = group end).
521 vector<unsigned int> gse_line_nums;
522 vector<bool> is_group_start;
524 unsigned int end_grp_index = 0;
525 unsigned int start_grp_index = 0;
// NOTE(review): size() - 1 is evaluated in unsigned arithmetic and wraps to a
// huge value if gs_line_nums is empty - confirm callers never pass an empty list.
528 unsigned int max_grp_index = gs_line_nums.size() -1;
// Walk the start and end lists together, comparing their current entries.
535 while (end_grp_index <= max_grp_index) {
537 while (start_grp_index <= max_grp_index) {
// A start line that precedes the current end line is appended as a start.
539 if (gs_line_nums[start_grp_index] < ge_line_nums[end_grp_index]) {
540 gse_line_nums.push_back(gs_line_nums[start_grp_index]);
541 is_group_start.push_back(
true);
// Start and end share a line number.
544 else if (gs_line_nums[start_grp_index] == ge_line_nums[end_grp_index]) {
// An end line is appended with a false flag.
551 gse_line_nums.push_back(ge_line_nums[end_grp_index]);
552 is_group_start.push_back(
false);
// While an end index is still in range, its line is appended as an end.
557 if (end_grp_index < (max_grp_index+1)) {
558 gse_line_nums.push_back(ge_line_nums[end_grp_index]);
559 is_group_start.push_back(
false);
// Both vectors are filled in lock-step; a size mismatch is reported on cerr.
566 if (is_group_start.size() != gse_line_nums.size()) {
567 cerr<<
"The group "<<endl;
// Diagnostic dump of the combined list.
// NOTE(review): looks like debug output - confirm whether it is compiled in.
574for (
unsigned int i =0; i<gse_line_nums.size();i++) {
575 cerr<<
"gse_line["<<i<<
"] = "<<gse_line_nums[i] <<endl;
576 cerr<<
"is_group_start["<<i<<
"] = "<<is_group_start[i] <<endl;
// Index into the combined list for var_line; negative values skip the
// path search below.
581 int gse_line_index= obtain_gse_line_index(gse_line_nums,var_line);
584cerr<<
"gse_line_index: "<<gse_line_index <<endl;
// Start lines of the groups selected for the path.
588 vector<unsigned int> grp_path_lines;
590 if (gse_line_index >= 0) {
592 int temp_index = gse_line_index;
// Counter that is incremented on each end entry seen while walking backwards.
596 unsigned int temp_rem_grp_index = 0;
599 while (temp_index >= 0) {
602 if (is_group_start[temp_index] ==
false)
603 temp_rem_grp_index++;
// With a zero counter the entry's line is kept for the path.
// NOTE(review): the decrement presumably sits in an else-branch; if it ran
// with the counter at zero the unsigned value would wrap - confirm.
607 if (temp_rem_grp_index == 0)
608 grp_path_lines.push_back(gse_line_nums[temp_index]);
610 temp_rem_grp_index--;
// Diagnostic dumps (see the note on debug output above).
618for (
const auto &gpl:grp_path_lines)
619cerr<<
"grp_path_lines "<<gpl <<endl;
620for (
const auto &gsn:gs_line_nums)
621cerr<<
"gs_lines "<<gsn <<endl;
622for (
const auto &gn:grp_names)
623cerr<<
"group name is "<<gn <<endl;
// Map each selected start line back to its group name. gl_index is shared by
// all iterations of the outer loop and only moves towards 0.
628 int gl_index = gs_line_nums.size() - 1;
630 for (
const auto & gpl:grp_path_lines) {
635 for (; gl_index >= 0; gl_index--) {
637 if (gpl == gs_line_nums[gl_index]) {
// Prepend, so names matched later end up in front of earlier ones.
639 ret_value =
"/" + grp_names[gl_index] + ret_value;
647cerr<<
"ret_value is "<<ret_value <<endl;
656int obtain_gse_line_index(
const vector<unsigned int> &gse_line_nums,
unsigned int var_line) {
659 auto total_gse_lines = (
unsigned int)(gse_line_nums.size());
661 if (total_gse_lines > 0) {
663 for (
int i = total_gse_lines-1; i >= 0 ; i--) {
664 if (gse_line_nums[i] >var_line)