bes Updated for version 3.21.1
The Backend Server (BES) is the lower two tiers of the Hyrax data server
merge_dmrpp.cc
1#include <iostream>
2#include <fstream>
3#include <sstream>
4#include <string>
5#include <vector>
6#include <set>
7#include <algorithm>
8
9using namespace std;
10
11// I. The following block of functions retrieves the "missing" variable type, variable name and data value information.
12// 1. General
13bool obtain_var_info(const string &miss_dmrpp_info, const vector<string> &var_type_check_list,
14 vector<string> &var_types, vector<string> &var_names, vector<string> &chunk_info_list,
15 bool &is_chunk_mark1);
16bool find_var_name(const string &str, size_t &str_pos, size_t &var_name_pos_start, size_t &var_name_pos_end);
17bool find_end_var_block(const string &str, const string &var_type, const size_t &str_pos, size_t &var_end_pos);
18bool find_chunk_info(const string &str, const size_t &str_pos, size_t &chunk_info_pos_start, size_t &chunk_info_pos_end,
19 const size_t &var_end_pos, bool &is_mark1);
20
21// 2. group handling
22bool obtain_var_path_info(const string &fname, const vector<string> &var_type_list, vector<string> &var_path,
23 vector<string>& var_type, vector<string> &var_name, vector<unsigned int> &var_lines);
24
25bool obtain_var_grp_info(const string &fname, const vector<string> &var_type_list, vector<string> &grp_names,
26 vector<unsigned int> &grp_lines, vector<unsigned int> &end_grp_lines,
27 vector<string> &var_type, vector<string> &var_name, vector<unsigned int> &var_lines);
28
29string obtain_var_grp_paths(const vector<unsigned int> &gs_line_nums,const vector<unsigned int> &ge_line_nums,
30 const vector<string> &grp_names, unsigned int var_line);
31
32int obtain_gse_line_index(const vector<unsigned int> &gse_line_nums, unsigned int var_line);
33
34bool find_grp(const string &str, unsigned int line_num, vector<string> &grp_names,
35 vector<unsigned int> &grp_lines, vector<unsigned int> &end_grp_lines);
36
37bool find_end_grp(const string &dmrpp_line,unsigned int line_num, vector<unsigned int> &end_grp_lines);
38
39bool find_var_helper(const string &dmrpp_line, const vector<string> &var_type_list, vector<string>& var_type,
40 vector<string>&var_name);
41
42bool find_var(const string &str, const vector<string> &var_type_list, vector<string> &var_type, vector<string> &var_name,
43 vector<unsigned int> &var_lines, unsigned int line_num);
44
45bool merge_chunk_info_g(const string &str, const vector<string> &var_type, const vector<string> &var_name,
46 const vector<string> &var_fqn, const vector<string> &miss_var_fqn,
47 const vector<string> &miss_var_type, const vector<string> &chunk_info);
48
49bool obtain_miss_var_candidate_pos(const string &dmrpp_str, const vector<string> &var_type,
50 const vector<string> &var_name, vector<size_t> &var_pos);
51
52void obtain_final_miss_var_info(const vector <string> &var_fqn, const vector <string> &miss_var_fqn,
53 const vector <string> &var_type, vector <string> &final_var_type,
54 const vector <size_t> &var_candidate_pos, vector<size_t> &var_pos,
55 const vector <string> &chunk_info, vector<string> &ordered_chunk_info);
56
57bool merge_chunk_info_vec(vector <string> &dmrpp_vec, const vector<string> &miss_var_type,
58 const vector<string> &ordered_chunk_info);
59
60bool insert_chunk_info_to_vec(string &dmrpp_block_str, const string &var_type, const string &chunk_info);
61
// The following block of functions adds the file address
// (usually the absolute path of the HDF5 file that stores the data values) to the chunk block.
64bool add_faddr_chunk_info_simple(vector<string> &chunk_info_list, bool is_dmrpp_mark1, const string& faddr_source = "");
65
66// The original function considers more factors that seem not necessary.
67#if 0
68//bool add_faddr_chunk_info(const string& miss_dmrpp_info,vector<string>&chunk_info_list,bool is_dmrpp_mark1, const string faddr_source = "");
69#endif
70
71bool add_faddr_contig_line(string &chunk_info, const string &file_addr);
72bool add_faddr_chunk_comp_lines(string &chunk_info,const string &file_addr);
73
// The following block of functions merges the "missing" variable data value information into the original dmrpp file.
// These routines are used when there are no groups in the dmrpp file; the algorithm for that case is faster.
76bool add_missing_info_to_file(const string &fname2,const vector<string> &var_types,const vector<string> &var_names,
77 const vector<string> &chunk_info_list);
78void gen_block(const vector<string> &var_type_list,const vector<string> &var_name_list,vector<string> &block_begin,
79 vector<string> &block_end);
80bool check_overlap_intervals(const vector<size_t> &sort_block_pos, const vector<size_t> &block_pos_start);
81void obtain_bindex_in_modified_string(const vector<size_t> &block_pos_start, vector<int> &block_index);
82bool split_string(const string &str, vector<string> &str_vec, const vector<string> &block_begin,
83 const vector<string> &block_end, vector<int> &block_index);
84bool convert_dmrppstr_to_vec(const string &dmrpp_str, vector<string> &dmrpp_str_vec, const vector<string> &var_types,
85 const vector<string> &var_names,vector<int> &block_index);
86void add_missing_info_to_vec(vector<string> &dmrpp_str_vec,const vector<string> &chunk_info_list,
87 const vector<int> &block_index);
88void write_vec_to_file(const string &fname,const vector<string> &dmrpp_str_vec);
89
90// The following two functions are helper functions
91void file_to_string(const string &filename, string &out);
92bool string_tokenize(const string &in_str, const char delim, vector<string>&out_vec);
93bool string_tokenize_by_pos(const string &in_str, const vector<size_t> &var_pos, vector<string> &out_vec);
94
95
96int main(int argc,char**argv)
97{
98 string dmrpp_line;
99 vector<string> var_types;
100 vector<string> var_names;
101 vector<string> chunk_info_list;
102
103 bool add_dmrpp_info = false;
104 bool is_chunk_mark1 = true;
105
106 string missing_dmrpp_str;
107
108 if(argc != 5) {
109 cout<<"Please provide four arguments: "<< endl;
110 cout<<" The first is the dmrpp file that contains the information of the variable of which";
111 cout<<" the data cannot be found in the original HDF5/4 file but can be found"
112 <<" from the HDF5 file pointed by this dmrpp file. "<<endl;
113 cout<<" The second is the dmrpp file for the original HDF5/4 file. "<<endl;
114 cout<<" The third one is the href to HDF5/HDF4 file of which the missing data is stored. "<<endl;
115 cout<<" The fourth one is the text file that includes the variable path of which"
116 <<" the data cannot be found in the original HDF5/4 file. "<<endl;
117 cout <<endl;
118 cout <<" Warning: before running this program, one must run the check_dmrpp program first to see "
119 <<"if the original dmrpp file contains any missing variable of which the data cannot be found "
120 <<"in the original HDF5/HDF4 file. "<<endl;
121 return 0;
122 }
123
124 // We only consider the atomic datatype for the missing variables.
125 vector<string> var_type_check_list;
126
127 var_type_check_list.emplace_back("Float32");
128 var_type_check_list.emplace_back("Int32");
129 var_type_check_list.emplace_back("Float64");
130 var_type_check_list.emplace_back("Byte");
131 var_type_check_list.emplace_back("Int16");
132 var_type_check_list.emplace_back("UInt16");
133 var_type_check_list.emplace_back("String");
134 var_type_check_list.emplace_back("UInt32");
135 var_type_check_list.emplace_back("Int8");
136 var_type_check_list.emplace_back("Int64");
137 var_type_check_list.emplace_back("UInt64");
138 var_type_check_list.emplace_back("UInt8");
139 var_type_check_list.emplace_back("Char");
140
141 // Obtain the dmrpp file name that contains the missing variable value.
142 string fname(argv[1]);
143
144 // Read the "missing dmrpp file" to a string
145 file_to_string(fname,missing_dmrpp_str);
146
147 // Obtain the missing chunk information from the dmrpp file.
148 add_dmrpp_info = obtain_var_info(missing_dmrpp_str,var_type_check_list,var_types,var_names,
149 chunk_info_list,is_chunk_mark1);
150
151 // Just output a warning if there is no chunk info, in the supplemental dmrpp file.
152 if (false == add_dmrpp_info) {
153 cout<<"Cannot find the corresponding chunk info. from the supplemental dmrpp file."<<endl;
154 cout<<"You may need to check if there is any variable in the dmrpp file. "<<endl;
155 cout<<"The dmrpp file is "<<fname <<endl;
156 }
157
158 // Sanity check
159 if (var_types.size() != var_names.size() || var_names.size() != chunk_info_list.size()) {
160 cout <<"Var type, var name and chunk_info must have the same number of elements. "<<endl;
161 cout <<"The dmrpp file is "<<fname <<endl;
162 return 0;
163 }
164
165 // For debugging
166#if 0
167 for (size_t i =0; i<var_names.size();i++) {
168cout<<"var type["<<i<<"] "<< var_types[i]<<endl;
169cout<<"var name["<<i<<"] "<< var_names[i]<<endl;
170cout<<"chunk_info_list["<<i<<"] "<< chunk_info_list[i] << endl;
171
172 }
173#endif
174
175 // We need to erase those variables that are not really missing but are added by the generation program
176 string mvar_fname(argv[4]);
177 string missing_vname_str;
178
179 // Read the missing variable names to a string and tokenize the string to a vector of string.
180 file_to_string(mvar_fname,missing_vname_str);
181 // The data-missing variable list must not be empty.
182 if (missing_vname_str.empty()) {
183 cout<<" The text file that has the data-missing variable path is empty." <<endl;
184 cout<<" Please check the file. "<<endl;
185 return 0;
186 }
187
188 if (missing_vname_str[missing_vname_str.size()-1]=='\n')
189 missing_vname_str = missing_vname_str.substr(0,missing_vname_str.size()-1);
190
191#if 0
192cout<<"missing_vname_str: "<<missing_vname_str<<endl;
193#endif
194
195 vector<string> missing_vname_list;
196
197 // Here we come to the different syntax of DAP2 and DAP4 constraints.
198 // DAP2 uses comma(,) whereas DAP4 uses semicolon(;). We need to support both.
199 char delim=';';
200 bool has_delim = string_tokenize(missing_vname_str,delim,missing_vname_list);
201 if (!has_delim) {
202 delim=',';
203 missing_vname_list.clear();
204 string_tokenize(missing_vname_str,delim,missing_vname_list);
205 }
206
207 // Check if the dmrpp file that contains just the missing variables has groups.
208 // Note: we don't need to consider if there are groups in the original dmrpp file since
209 // we only care about the insertion of the chunk info for the missing variables.
210 bool handle_grp = false;
211 for (const auto &mv_name:missing_vname_list) {
212 // Find the last path position.
213 size_t path_pos = mv_name.find_last_of('/');
214
215 // The missing variables under the root group are treated as no-group.
216 if (path_pos !=string::npos && path_pos!=0) {
217 handle_grp = true;
218 break;
219 }
220 }
221
222 // For debugging
223#if 0
224 for(size_t i = 0;i<missing_vname_list.size();i++)
225 cout <<"missing_vname_list["<<i<<"]= "<<missing_vname_list[i]<<endl;
226#endif
227
228 // We need to handle differently if finding group(s) in the missing dmrpp file.
229 if (handle_grp == true) {
230
231 // Obtain the variable path of all the variables in the missing dmrpp string.
232 vector<string> mdp_var_fqn;
233 vector<string> mdp_var_names_g;
234 vector<string> mdp_var_types_g;
235 vector<unsigned int> mdp_var_lines;
236
237 if (false == obtain_var_path_info(fname, var_type_check_list, mdp_var_fqn, mdp_var_types_g,
238 mdp_var_names_g, mdp_var_lines))
239 return -1;
240
241 // Now we can use the var_path to match the missing vars,
242 // Remove the additional variables added by the filenetCDF-4 module.
243 vector<string> new_var_types;
244 vector<string> new_var_names;
245 vector<string> new_var_fqns;
246 vector<string> new_chunk_info_list;
247
248 if (mdp_var_names_g != var_names) {
249 cout <<" Internal error: variable names should be the same even retrieved with different methods."<<endl;
250 return -1;
251 }
252
253 for (size_t i =0; i<mdp_var_fqn.size();i++) {
254 for (const auto & mvl:missing_vname_list) {
255 if (mdp_var_fqn[i] == mvl) {
256 new_var_names.push_back(mdp_var_names_g[i]);
257 new_var_fqns.push_back(mdp_var_fqn[i]);
258 new_var_types.push_back(mdp_var_types_g[i]);
259 new_chunk_info_list.push_back(chunk_info_list[i]);
260 break;
261 }
262 }
263 }
264
265 // Add the file location to each chunk. Mostly the file location is the absolute path of the HDF5 file.
266 string fadd_source(argv[3]);
267 add_faddr_chunk_info_simple(new_chunk_info_list,is_chunk_mark1,fadd_source);
268#if 0
269 //add_faddr_chunk_info(missing_dmrpp_str,new_chunk_info_list,is_chunk_mark1,fadd_source);
270#endif
271
272 // For debugging
273#if 0
274for (const auto &nc_info:new_chunk_info_list)
275cout <<"chunk_info "<<nc_info <<endl;
276#endif
277
278 // Now go to the original dmrpp. Find the missing var blocks based on the var_path.
279 // Obtain the variable path of all the variables in the missing dmrpp string.
280 string fname2(argv[2]);
281 vector<string> odp_var_fqn;
282 vector<string> odp_var_names_g;
283 vector<string> odp_var_types_g;
284 vector<unsigned int> odp_var_lines;
285
286 // Note: if the original dmrpp file contains many variables and groups, this may take some time. We will see if
287 // the performance is an issue.
288 if (false == obtain_var_path_info(fname2,var_type_check_list,odp_var_fqn,
289 odp_var_types_g,odp_var_names_g,odp_var_lines))
290 return -1;
291
292 // To reduce the comparison of all the variable path in the original dmrpp file with the variables in the
293 // missing dmrpp file,we further select only the relevant variables: the variables that hold the same names as
294 // those of missing variables.
295 // For example, the missing variable path is /foo/missing1, /foo2/missing2.
296 // In the original dmrpp we may have 100 variables,
297 // the relevant variables may just include missing1, /foo/missing1, /foo2/missing2, /foo/foo1/missing2.
298 // We only need to compare these four variable path with the missing variable path to identify the location of
299 // the variables that miss the chunk information in the original dmrpp file.
300
301 vector<unsigned int> final_odp_var_lines;
302 vector<string> final_odp_var_fqns;
303 vector<string> final_odp_var_names;
304 vector<string> final_odp_var_types;
305
306 for (unsigned int i = 0; i < odp_var_names_g.size();i++) {
307 for (unsigned int j = 0; j < new_var_names.size(); j++) {
308 if ((odp_var_names_g[i] == new_var_names[j]) && (odp_var_types_g[i] == new_var_types[j])) {
309 final_odp_var_lines.push_back(odp_var_lines[i]);
310 final_odp_var_fqns.push_back(odp_var_fqn[i]);
311 final_odp_var_names.push_back(odp_var_names_g[i]);
312 final_odp_var_types.push_back(odp_var_types_g[i]);
313 break;
314 }
315 }
316 }
317
318 // debugging info
319#if 0
320cout <<" Before the final step "<<endl;
321for (unsigned int i = 0; i <final_odp_var_types.size(); i++) {
322cout <<"vtype: "<<final_odp_var_types[i] <<endl;
323cout <<"vname: "<<final_odp_var_names[i] <<endl;
324cout <<"vfqn: "<<final_odp_var_fqns[i] <<endl;
325cout <<"new vfqn: "<<new_var_fqns[i] <<endl;
326}
327#endif
328
329 // Merge the missing chunk info to the original dmrpp file.
330 merge_chunk_info_g(fname2,final_odp_var_types,final_odp_var_names,
331 final_odp_var_fqns,new_var_fqns,new_var_types,
332 new_chunk_info_list);
333
334 // This is not necessary now. But leave it for now.
335#if 0
336 // Now go to the original dmrpp. Find the missing var blocks based on the var_path.
337 // We need to compare the var full path in the missing dmrpp(new_var_fqns) with the var full path
338 // in the original dmrpp(odp_var_fqn).
339 // We want to find the corresponding line number in the original dmrpp to insert the missing chunk info.
340 vector <unsigned int> missing_chunk_info_lines;
341 for (unsigned int i = 0; i < new_var_fqns.size(); i++) {
342 for (unsigned int j = 0; j <final_odp_var_fqns.size(); j++) {
343 if (new_var_fqns[i] == final_odp_var_fqns[j]) {
344 missing_chunk_info_lines.push_back(final_odp_var_lines[j]);
345 break;
346 }
347 }
348 }
349#endif
350
351// for debugging info
352#if 0
353for (const auto &mcil:missing_chunk_info_lines)
354 cout <<"missing chunk info line is: "<<mcil <<endl;
355#endif
356
357 }
358 else {
359
360#if 0
361cout <<"coming to the nogroup case"<<endl;
362#endif
363
364 // Remove the additional variables added by the filenetCDF-4 module.
365 vector<string> new_var_types;
366 vector<string> new_var_names;
367 vector<string> new_chunk_info_list;
368
369 // Trim missing_vname_list if the missing_vname_list includes the root path /.
370 vector<string> missing_vname_list_trim;
371 for (const auto &mvname:missing_vname_list) {
372 string temp_str = mvname;
373 if (temp_str[0] == '/')
374 temp_str = temp_str.substr(1);
375 missing_vname_list_trim.emplace_back(temp_str);
376 }
377
378 for (size_t i =0; i<var_names.size();i++) {
379 for (const auto &mvname:missing_vname_list_trim) {
380 if (var_names[i] == mvname) {
381 new_var_names.push_back(var_names[i]);
382 new_var_types.push_back(var_types[i]);
383 new_chunk_info_list.push_back(chunk_info_list[i]);
384 break;
385 }
386 }
387 }
388
389 // Add the file location to each chunk. Mostly the file location is the absolute path of the HDF5 file.
390 string fadd_source(argv[3]);
391
392#if 0
393 //add_faddr_chunk_info(missing_dmrpp_str,new_chunk_info_list,is_chunk_mark1,fadd_source);
394#endif
395 add_faddr_chunk_info_simple(new_chunk_info_list,is_chunk_mark1,fadd_source);
396
397#if 0
398for (size_t i =0; i<new_var_types.size();i++) {
399 cout<<"new chunk_info_list["<<i<<"]"<< endl;
400 cout<<new_chunk_info_list[i]<<endl;
401}
402#endif
403
404 string fname2(argv[2]);
405
406 // Add the missing chunk info to the original dmrpp file.
407 bool well_formed = add_missing_info_to_file(fname2,new_var_types,new_var_names,
408 new_chunk_info_list);
409
410 if (false == well_formed) {
411 cout <<"The dmrpp file to be modified is either not well-formed or contains nested variable blocks ";
412 cout <<"that cannot be supported by this routine. " <<endl;
413 cout <<"The dmrpp file is "<<fname2<<endl;
414 }
415 }
416
417 return 0;
418
419}
420
421// Obtain the var info from the supplemental(missing) dmrpp file.
422// The variable types we checked are limited to DAP2 data types plus 64-bit integers.
423bool obtain_var_info(const string &miss_dmrpp_info,const vector<string> &var_type_check_list,
424 vector<string> &var_types, vector<string> &var_names,vector<string> &chunk_info_list,
425 bool &is_chunk_mark1) {
426
427 bool ret = false;
428
429 size_t var_type_pos_start = 0;
430 size_t var_name_pos_start = 0;
431 size_t var_name_pos_end = 0;
432 size_t chunk_pos_start = 0;
433 size_t chunk_pos_end = 0;
434 size_t var_end_pos = 0;
435 size_t str_pos = 0;
436
437 if (miss_dmrpp_info.empty())
438 return ret;
439
440 size_t str_last_char_pos = miss_dmrpp_info.size() - 1;
441 bool well_formed = true;
442
443 // Go through the whole missing dmrpp string
444 while (str_pos <= str_last_char_pos && well_formed) {
445
446 size_t i = 0;
447 string var_sign;
448 string temp_var_sign;
449 size_t temp_var_type_pos_start = string::npos;
450 int var_type_index = -1;
451
452 // Go through the var_type_check_list to obtain the var data type,
453 // We need to find the index in the var_type_check_list to
454 // obtain the correct var datatype.
455 while (i < var_type_check_list.size()) {
456
457 var_sign = "<" + var_type_check_list[i] + " name=\"";
458 var_type_pos_start = miss_dmrpp_info.find(var_sign, str_pos);
459
460 if (var_type_pos_start == string::npos) {
461 i++;
462 continue;
463 } else {
464
465 // We want to make sure we don't skip any vars.
466 if (temp_var_type_pos_start > var_type_pos_start) {
467 temp_var_type_pos_start = var_type_pos_start;
468 var_type_index = i;
469 temp_var_sign = var_sign;
470 }
471 i++;
472 }
473 }
474
475 // Ensure all variables are scanned.
476 if (temp_var_type_pos_start != string::npos) {
477 var_type_pos_start = temp_var_type_pos_start;
478 var_sign = temp_var_sign;
479 }
480
481 // This line will ignore the datatypes that are not in the var_type_check_list
482 if (var_type_pos_start == string::npos) {
483 str_pos = string::npos;
484 continue;
485 } else
486 str_pos = var_type_pos_start + var_sign.size();
487
488 // Now we can retrieve var name, var type and the corresponding chunk info.
489 // Sanity check is also applied.
490 if (false == find_var_name(miss_dmrpp_info, str_pos, var_name_pos_start, var_name_pos_end))
491 well_formed = false;
492 else if (false == find_end_var_block(miss_dmrpp_info, var_type_check_list[var_type_index],
493 str_pos, var_end_pos))
494 well_formed = false;
495 else if (false == find_chunk_info(miss_dmrpp_info, str_pos, chunk_pos_start, chunk_pos_end,
496 var_end_pos, is_chunk_mark1))
497 well_formed = false;
498 else {
499
500 // Move the string search pos to the next block
501 str_pos = var_end_pos + 1;
502
503 // Obtain var type, var name and chunk info. and save them to vectors.
504 var_types.push_back(var_type_check_list[var_type_index]);
505 var_names.push_back(miss_dmrpp_info.substr(var_name_pos_start, var_name_pos_end - var_name_pos_start));
506 string temp_chunk_info = miss_dmrpp_info.substr(chunk_pos_start, chunk_pos_end - chunk_pos_start);
507 if (true == is_chunk_mark1)
508 temp_chunk_info += "</dmrpp:chunks>";
509 else
510 temp_chunk_info += "/>";
511 chunk_info_list.push_back(temp_chunk_info);
512 }
513 }
514
515 return well_formed;
516}
517
// Locate a variable name in the dmrpp content.
// On entry str_pos points at the first character of the name; the name ends at the next
// double quote, e.g. name="temperature".
// var_name_pos_start receives the entry value of str_pos, var_name_pos_end the position of the
// closing quote (string::npos when there is none). On success str_pos is moved to the closing
// quote and true is returned; otherwise str_pos is left alone and false is returned.
bool find_var_name(const string &str, size_t &str_pos, size_t &var_name_pos_start, size_t &var_name_pos_end) {

    var_name_pos_start = str_pos;
    var_name_pos_end = str.find("\"", str_pos);

    if (var_name_pos_end == string::npos) {
        // debugging info
#if 0
        cout<<"cannot find var name"<<endl;
#endif
        return false;
    }

    str_pos = var_name_pos_end;
    return true;
}
538
// Locate the end tag of a variable block, e.g. </Float32> for the type Float32.
// The search starts at str_pos; var_end_pos receives the position of the end tag or
// string::npos. Returns true when the end tag is found.
bool find_end_var_block(const string&str, const string&var_type, const size_t &str_pos, size_t &var_end_pos) {

    const string closing_tag = "</" + var_type + '>';
    var_end_pos = str.find(closing_tag, str_pos);
    const bool found = (var_end_pos != string::npos);

    // debugging info
#if 0
    if (!found)
        cout<<"cannot find end var block"<<endl;
#endif

    return found;

}
554
// Locate the chunk information of ONE variable block.
// The chunk info is delimited either by <dmrpp:chunks ...> and </dmrpp:chunks> (is_mark1 is set
// to true) or it is a single <dmrpp:chunk ... /> element (is_mark1 is set to false).
//
// str:                  the dmrpp content
// str_pos:              position inside the variable block at which the search starts
// chunk_info_pos_start: receives the start of the chunk info, including the spaces that
//                       directly precede the start mark
// chunk_info_pos_end:   receives the position of the end mark (the end mark itself is excluded)
// var_end_pos:          position of the end tag of the variable block
// is_mark1:             receives which of the two forms was found
//
// Both the start mark and the end mark must lie before var_end_pos; marks that belong to a later
// variable are never picked up. Returns true on success. On failure false is returned and both
// output positions are set to string::npos.
bool find_chunk_info(const string &str, const size_t &str_pos, size_t &chunk_info_pos_start,
                     size_t &chunk_info_pos_end, const size_t &var_end_pos, bool &is_mark1) {

    const string chunk_start_mark1 = "<dmrpp:chunks";
    const string chunk_end_mark1 = "</dmrpp:chunks>";
    const string chunk_start_mark2 = "<dmrpp:chunk ";
    const string chunk_end_mark2 = "/>";
    const char wspace = ' ';

    // Work on copies so that the inputs stay valid while the outputs are written.
    const size_t search_from = str_pos;
    const size_t block_end = var_end_pos;

#if 0
    cout<<"str_pos is "<<search_from <<endl;
    cout<<"var_end_pos is "<<block_end <<endl;
    cout<<"substr is "<<str.substr(search_from,block_end-search_from)<<endl;
#endif

    chunk_info_pos_start = string::npos;
    chunk_info_pos_end = string::npos;

    // Prefer the <dmrpp:chunks> form, but only when it starts inside this variable block.
    size_t mark_pos = str.find(chunk_start_mark1, search_from);
    is_mark1 = (mark_pos != string::npos && mark_pos < block_end);

    if (!is_mark1) {
        // Fall back to the single <dmrpp:chunk .../> form, again restricted to this block.
        mark_pos = str.find(chunk_start_mark2, search_from);
        if (mark_pos == string::npos || mark_pos >= block_end)
            return false;
    }

    // The end mark is searched from the start mark on, so that a "/>" of an element located
    // between the variable name and the chunk info (a dimension, for instance) cannot match.
    const string &end_mark = is_mark1 ? chunk_end_mark1 : chunk_end_mark2;
    const size_t end_pos = str.find(end_mark, mark_pos);
    if (end_pos == string::npos || end_pos >= block_end)
        return false;

    // Include the spaces right in front of the start mark (the indentation of the line).
    size_t start_pos = 0;
    if (mark_pos > 0) {
        const size_t last_non_space = str.find_last_not_of(wspace, mark_pos - 1);
        start_pos = (last_non_space == string::npos) ? 0 : last_non_space + 1;
    }

    chunk_info_pos_start = start_pos;
    chunk_info_pos_end = end_pos;
    return true;
}
602
603// We need to add the supplemental file path to the chunk info.
604// It seems that we don't need the sanity check of the missing data dmrpp based on the current get_dmrpp implementation.
605// Make this routine simpler.
606bool add_faddr_chunk_info_simple(vector<string>& chunk_info, bool is_dmrpp_mark1, const string &faddr_source) {
607
608 if (chunk_info.size() == 0)
609 return true;
610 string addr_mark = "dmrpp:href=\"";
611
612 // The missing DMRPP file can have file address specified along with chunk info.
613 // But we assume if they do this for one chunk, they should do this for all chunks.
614 // If this is the case, no need to find address.
615 if (chunk_info[0].find(addr_mark)!=string::npos)
616 return true;
617
618 // Retrieve name and reference
619 string hdf5_faddr;
620 string end_delim1 ="\"";
621
622 // The string for use in each missing_variable <chunk href:"value" >
623 hdf5_faddr = " href=\"" + faddr_source + end_delim1;
624
625#if 0
626//cout<<"hdf5_faddr is "<<hdf5_faddr <<endl;
627#endif
628
629 for (size_t i = 0; i<chunk_info.size(); i++) {
630
631 //If is_dmrpp_mark1 is true,
632 //add hdf5_faddr to each chunk line(The chunk line should have offset==)
633 //However, the variable may also use the contiguous storage.
634 //That chunk line marks with (nbyte==). Essentially it is not a chunk but
635 //the dmrpp still starts with the dmrpp:chunk.
636 if (true == is_dmrpp_mark1)
637 add_faddr_chunk_comp_lines(chunk_info[i],hdf5_faddr);
638 else
639 add_faddr_contig_line(chunk_info[i],hdf5_faddr);
640
641 }
642 return true;
643
644}
645
// The routine below is compiled out (#if 0); it is kept for reference only.
#if 0
// Add the supplemental file path to the chunk info.
// The file name is taken from the first name="..." of the dmrpp content and an existing
// address from dmrpp:href="..."; the inserted address is always built from faddr_source.
bool add_faddr_chunk_info(const string &str,vector<string>& chunk_info,bool is_dmrpp_mark1, const string faddr_source) {

    if (chunk_info.empty())
        return true;

    // An address that is already present in the first chunk info is assumed to be present
    // in all of them; nothing has to be done then.
    const string addr_mark = "dmrpp:href=\"";
    if (chunk_info[0].find(addr_mark) != string::npos)
        return true;

    bool well_formed = true;
    const string name_mark = " name=\"";
    const string quote = "\"";

    // A valid hdf5 file name must be found.
    const size_t fname_begin = str.find(name_mark);
    if (string::npos == fname_begin)
        well_formed = false;
    const size_t fname_end = str.find(quote, fname_begin + name_mark.size());
    if (string::npos == fname_end)
        well_formed = false;
    const string hdf5_fname = str.substr(fname_begin + name_mark.size(),
                                         fname_end - fname_begin - name_mark.size());
    if (hdf5_fname == "")
        well_formed = false;

    // A file location, when present, must be terminated properly.
    string hdf5_faddr;
    const size_t faddr_begin = str.find(addr_mark);
    if (string::npos != faddr_begin) {
        const size_t faddr_end = str.find(quote, faddr_begin + addr_mark.size());
        if (string::npos == faddr_end)
            well_formed = false;
        hdf5_faddr = str.substr(faddr_begin + addr_mark.size(),
                                faddr_end - faddr_begin - addr_mark.size());
    }

    // The attribute text inserted into each chunk element.
    hdf5_faddr = " href=\"" + faddr_source + quote;

    // Former variant that derived the address from the dmrpp content:
    // if (hdf5_faddr.rfind(hdf5_fname) == string::npos) {
    //     //trim hdf5 file address.
    //     hdf5_faddr = " href=\"" +hdf5_faddr+'/'+hdf5_fname+end_delim1;
    // }
    // else {
    //     hdf5_faddr = " href=\"" +hdf5_faddr+end_delim1;
    // }

    //cout<<"hdf5_faddr is "<<hdf5_faddr <<endl;

    // With is_dmrpp_mark1 every chunk line of the block is updated; otherwise the block is a
    // single chunk element, as used for a variable with contiguous storage.
    for (auto &entry : chunk_info) {
        if (is_dmrpp_mark1)
            add_faddr_chunk_comp_lines(entry, hdf5_faddr);
        else
            add_faddr_contig_line(entry, hdf5_faddr);
    }

    return well_formed;

}
#endif
719
// Insert the file address into every chunk line of a <dmrpp:chunks> block.
// A chunk line starts with "<dmrpp:chunk offset=" and must end with "/>"; file_addr is
// inserted right in front of that "/>". The text after the last chunk line is appended with a
// leading space, provided that it contains "</dmrpp:chunks>".
//
// chunk_info is replaced by the patched text as soon as at least one chunk line was processed,
// even when false is returned afterwards. Returns false when there is no chunk line, when a
// chunk line is not terminated by "/>" or when "</dmrpp:chunks>" is missing.
bool add_faddr_chunk_comp_lines(string & chunk_info, const string &file_addr) {

    const string chunk_line_mark = "<dmrpp:chunk offset=";
    const string chunk_line_end_mark = "/>";
    const string chunk_stop_mark = "</dmrpp:chunks>";

    string patched;
    size_t cursor = 0;
    bool saw_chunk_line = false;
    bool well_formed = true;

    for (;;) {
        const size_t line_start = chunk_info.find(chunk_line_mark, cursor);

        if (line_start == string::npos) {
            // No further chunk line: what is left must hold the closing </dmrpp:chunks>.
            // The space separates the last inserted address from the "/>" that follows it.
            if (chunk_info.find(chunk_stop_mark, cursor) == string::npos)
                well_formed = false;
            else {
                patched += ' ';
                patched += chunk_info.substr(cursor);
            }
            break;
        }

        // Each chunk offset line must end with "/>".
        const size_t line_end = chunk_info.find(chunk_line_end_mark, line_start);
        if (line_end == string::npos) {
            well_formed = false;
            break;
        }

        // Copy everything up to the "/>" and put the address behind it; the "/>" itself is
        // copied by the next round.
        saw_chunk_line = true;
        patched.append(chunk_info, cursor, line_end - cursor);
        patched += file_addr;
        cursor = line_end;
    }

    if (!saw_chunk_line)
        return false;

    chunk_info = patched;
    return well_formed;

}
771
// Insert the file address into the chunk element of a variable with contiguous storage.
// file_addr followed by one space is inserted in front of the first "/>" of chunk_info; this
// is expected to be the end of the first (and only) chunk line.
// Returns false and leaves chunk_info unchanged when there is no "/>".
bool add_faddr_contig_line(string &chunk_info, const string &file_addr) {

    const string chunk_line_end_mark = "/>";

    // Just find the end of the line and change it; a check that the line starts at
    // position 0 may be added later.
    const size_t close_pos = chunk_info.find(chunk_line_end_mark);
    if (close_pos == string::npos)
        return false;

    chunk_info.insert(close_pos, file_addr + ' ');
    return true;
}
793
794// Add the missing info to the original dmrpp file.
795bool add_missing_info_to_file(const string &fname,const vector<string> &var_types,const vector<string> &var_names,
796 const vector<string> &chunk_info_list) {
797
798 bool well_formed = true;
799 string dmrpp_str;
800
801 // The original dmrpp file to string
802 file_to_string(fname,dmrpp_str);
803
804 vector<string> dmrpp_str_vec;
805 vector<int> block_index;
806
807 // Convert the original DMRPP string to vector according to var_types and var_names.
808 // We need to remember the block index of the missing variables
809 // since the missing variable order in the supplemental dmrpp
810 // may be different from the original one.
811 well_formed = convert_dmrppstr_to_vec(dmrpp_str,dmrpp_str_vec,var_types,var_names,block_index);
812
813 // Release the memory of dmpstr. For a >10MB dmrpp file, this is not a small value.
814 string().swap(dmrpp_str);
815
816 // adding the missing chunk info to the dmrpp vector and then write back to the file.
817 if (true == well_formed) {
818 add_missing_info_to_vec(dmrpp_str_vec,chunk_info_list,block_index);
819 write_vec_to_file(fname,dmrpp_str_vec);
820 }
821 return well_formed;
822}
823
824// Convert the original dmrpp to vectors according to the *missing* variables.
825// Here we should NOT tokenize the orginal dmrpp according to every variable in it.
826// We only care about feeding those variables that miss the value information.
827bool convert_dmrppstr_to_vec(const string &dmrpp_str, vector<string> &dmrpp_str_vec,
828 const vector<string> &var_types, const vector<string> &var_names,
829 vector<int> &block_index) {
830
831 vector<string>block_begin;
832 block_begin.resize(var_types.size());
833 vector<string>block_end;
834 block_end.resize(var_types.size());
835 gen_block(var_types,var_names,block_begin,block_end);
836
837#if 0
838for(size_t i =0; i<block_begin.size();i++)
839{
840cout<<"block_begin["<<i<<"]= "<<block_begin[i]<<endl;
841cout<<"block_end["<<i<<"]= "<<block_end[i]<<endl;
842
843}
844#endif
845
846 bool well_formed = split_string(dmrpp_str,dmrpp_str_vec,block_begin,block_end,block_index);
847 return well_formed;
848
849}
850
// Insert the chunk information into the variable blocks of the split dmrpp.
//   dmrpp_str_vec:   the pieces of the original dmrpp. The pieces at the odd
//                    positions 1,3,5,... are the variable blocks to patch; the
//                    pieces around them are left alone.
//                    Example: to turn "Moses gre up i Egypt." into
//                    "Moses grew up in Egypt." the string is kept as the five
//                    pieces "Moses ","gre"," up ","i"," Egypt." and only
//                    "gre" and "i" are patched.
//   chunk_info_list: the chunk information, in supplemental-file order.
//   block_index:     block_index[n] selects the chunk_info_list entry that
//                    belongs to the n-th variable block.
// For every block a newline followed by the chunk information is placed right
// after the second-to-last '>' of the block.
void add_missing_info_to_vec(vector<string> &dmrpp_str_vec,const vector<string> &chunk_info_list,
                             const vector<int> &block_index) {

    const char tag_close = '>';

    for (size_t n = 0; n < block_index.size(); ++n) {

        string &var_block = dmrpp_str_vec[2*n + 1];

        // The last '>' is searched first, then the '>' before it.
        const size_t last_gt = var_block.rfind(tag_close);
        const size_t prev_gt = var_block.rfind(tag_close, last_gt - 1);

        // block_index[n] picks the chunk info that matches this block.
        var_block.insert(prev_gt + 1, '\n' + chunk_info_list[block_index[n]]);
    }
}
886
// Final step: write the pieces of the patched dmrpp back to the file.
// The dmrpp is relatively small, so rewriting the whole file is still the fast way.
//   fname:         the file to (over)write.
//   dmrpp_str_vec: the pieces; they are joined in order without any separator.
void write_vec_to_file(const string &fname, const vector<string> &dmrpp_str_vec) {

    // Join everything first so the file receives one single write.
    string whole_text;
    for (const auto &piece : dmrpp_str_vec)
        whole_text.append(piece);

    // The stream is closed when it goes out of scope.
    ofstream out_stream(fname.c_str());
    out_stream << whole_text;
}
901
// Build the opening and closing markers of every variable block.
// For type "Int32" and name "lat" the markers are
//   <Int32 name="lat">   and   </Int32>
//   var_type_list/var_name_list: parallel lists; var_name_list must hold at
//                                least as many entries as var_type_list.
//   block_begin/block_end:       output. Entry i receives the markers of
//                                variable i. The vectors are grown when they
//                                are too small; entries beyond the number of
//                                variables are never touched.
void gen_block(const vector<string> &var_type_list,const vector<string> &var_name_list,
               vector<string> &block_begin, vector<string> &block_end) {

    const size_t num_vars = var_type_list.size();

    // Do not rely on the caller to pre-size the output vectors.
    if (block_begin.size() < num_vars)
        block_begin.resize(num_vars);
    if (block_end.size() < num_vars)
        block_end.resize(num_vars);

    for (size_t i = 0; i < num_vars; i++) {
        block_begin[i] = '<' + var_type_list[i] + " name=\"" + var_name_list[i] + "\">";
        block_end[i] = "</" + var_type_list[i] + '>';
    }
}
911
912// Split the string into different blocks.
913bool split_string(const string &str, vector<string> &str_vec, const vector<string> &block_begin,
914 const vector<string> &block_end,vector<int> &block_index) {
915
916 bool well_formed = true;
917 vector<size_t> block_begin_pos;
918 vector<size_t> block_end_pos;
919 block_begin_pos.resize(block_begin.size());
920 block_end_pos.resize(block_end.size());
921
922 // Note:
923 // 1) We just want to split the string according to the variables that miss values.
924 // 2) block_begin_pos in the original dmrpp file may NOT be sorted.
925 // However, when we read back the string vector, we want to read from beginning to the end.
926 // So we need to remember the index of each <var block> of the supplemental dmrpp file
927 // in the original dmrpp file so that the correct chunk info can be given to the var block that misses the values.
928 for(size_t i = 0; i<block_begin.size(); i++) {
929 block_begin_pos[i] = str.find(block_begin[i]);
930 block_end_pos[i] = str.find(block_end[i],block_begin_pos[i])+(block_end[i].size());
931 }
932
933 obtain_bindex_in_modified_string(block_begin_pos,block_index);
934
935#if 0
936for(size_t i = 0; i<block_index.size();i++)
937cout<<"block_index["<<i<<"] is: "<<block_index[i] <<endl;
938#endif
939 vector<size_t> block_pos;
940 block_pos.resize(2*block_begin_pos.size());
941 for (size_t i = 0; i < block_begin.size(); i++) {
942 block_pos[2*i] = block_begin_pos[i];
943 block_pos[2*i+1] = block_end_pos[i];
944 }
945
946 // This will ensure the string vector is kept from beginning to the end.
947 sort(block_pos.begin(),block_pos.end());
948
949 // Use a set: resume a different set, compare with the previous one. set_difference
950 // This will ensure that each <var block> doesn't overlap with others.
951 // It is a sanity check.
952 well_formed = check_overlap_intervals(block_pos,block_begin_pos);
953
954 // We need to consider the starting and the ending of the string
955 // So the string vector size is block_size + 1.
956 // Examples:
957 // string: Moses grew up in Egypt. It has four space intervals but five substrings.
958 if (true == well_formed) {
959
960 size_t str_block_pos = 0;
961 str_vec.resize(block_pos.size()+1);
962 for (size_t i =0; i < block_pos.size(); i++) {
963 str_vec[i] = str.substr(str_block_pos,block_pos[i]-str_block_pos);
964 str_block_pos = block_pos[i];
965 }
966 str_vec[block_pos.size()] = str.substr(str_block_pos);
967
968#if 0
969for(size_t i = 0; i <str_vec.size();i++)
970 cout<<"str_vec["<<i<<"] is: "<<str_vec[i] <<endl;
971#endif
972 }
973
974 return well_formed;
975
976}
977
// Check that no two var blocks overlap.
//   sort_block_pos:  the start and end positions of all blocks, sorted; it must
//                    hold two entries per block.
//   block_pos_start: the start position of every block, in any order.
// Without overlaps the sorted positions alternate start,end,start,end,... so
// the entries at the even positions are exactly the block starts. Comparing
// the two as sets takes O(nlogn) rather than O(n*n) time.
// Returns true when there is no overlap.
bool check_overlap_intervals(const vector<size_t> &sort_block_pos, const vector<size_t> &block_pos_start){

    const set<size_t> expected_starts(block_pos_start.begin(), block_pos_start.end());

    set<size_t> even_slots;
    const size_t num_positions = 2*block_pos_start.size();
    for (size_t k = 0; k < num_positions; k += 2)
        even_slots.insert(sort_block_pos[k]);

    return even_slots == expected_starts;
}
993
// Obtain, for the var blocks taken in ascending position order, the index each
// block has in the supplemental dmrpp file.
// An example:
//   ex.h5.dmrpp has the variables in the order: ex1,ex2,lon,ex3,fakedim,lat.
//   It misses the values of lon,fakedim,lat.
//   In the supplemental dmrpp that has the value information, the variable
//   order is lat,lon,fakedim, i.e. lat is 0, lon is 1 and fakedim is 2.
//   Walking ex.h5.dmrpp from the beginning meets lon,fakedim,lat, so the
//   result is 1,2,0. While adding the value info, this index tells which
//   chunk info to fill in without searching the string again.
//   block_pos_start: entry i is the start position, in the original dmrpp, of
//                    the i-th variable of the supplemental dmrpp.
//   block_index:     output; the indices are APPENDED to this vector.
void obtain_bindex_in_modified_string(const vector<size_t> &block_pos_start, vector<int> &block_index) {

    // Tag every position with the index it came from.
    vector<pair<size_t,int> > tagged_pos;
    tagged_pos.reserve(block_pos_start.size());
    int origin = 0;
    for (const auto &start_pos : block_pos_start)
        tagged_pos.emplace_back(start_pos, origin++);

    // Pairs are ordered by position first, then by index.
    sort(tagged_pos.begin(), tagged_pos.end());

    for (const auto &entry : tagged_pos)
        block_index.push_back(entry.second);
}
1020
// Helper function: read the whole content of a file into a string.
//   filename: the file to read.
//   out_str:  output; its previous content is replaced. No error is reported
//             here when the file cannot be opened.
void file_to_string(const string &filename, string &out_str) {

    // The stream is closed when it goes out of scope.
    ifstream in_stream(filename.c_str());

    // Pull everything through the stream buffer in one go.
    ostringstream content;
    content << in_stream.rdbuf();

    out_str = content.str();
}
1035
// Tokenize the string into a vector of strings according to the delimiter.
//   in_str:  the string to split.
//   delim:   the delimiter character.
//   out_vec: output; the tokens are APPENDED. An empty token after a trailing
//            delimiter is not produced, and an empty in_str produces no token.
// Returns true when in_str contains the delimiter at least once.
bool string_tokenize(const string &in_str, const char delim, vector<string> &out_vec) {

    const bool has_delim = (in_str.find(delim) != string::npos);

    istringstream reader(in_str);
    for (string token; getline(reader, token, delim); )
        out_vec.push_back(token);

    return has_delim;
}
1045
// Tokenize the string into a vector of strings according to positions.
// The positions are assumed to be pre-sorted from small to large.
//   in_str:  the string to cut.
//   pos:     the cut positions; each position starts a new piece.
//   out_vec: output; pos.size()+1 pieces are APPENDED on success.
// Returns false, appending nothing, when pos is empty, when the first position
// is 0, or when the last position is not inside in_str.
bool string_tokenize_by_pos(const string &in_str,const vector<size_t>& pos, vector<string> &out_vec) {

    if (pos.empty())
        return false;
    if (pos.front() == 0 || (pos.back() + 1) > in_str.size())
        return false;

    // Every cut position closes the piece that started at the previous cut.
    size_t piece_start = 0;
    for (const auto &cut : pos) {
        out_vec.push_back(in_str.substr(piece_start, cut - piece_start));
        piece_start = cut;
    }

    // The rest of the string after the last cut.
    out_vec.push_back(in_str.substr(piece_start));

    return true;
}
1065
// Find the var type and the var name in a line like
//     <Int16 name="foo">
//   str:           one line of the dmrpp file.
//   var_type_list: the supported variable types.
//   var_type/var_name: output; the type and the name are appended when the
//                  line starts a variable of a supported type.
// A line is accepted only when it starts with at least one space, its first
// non-space character is '<', its last character is '>' (maybe this is too
// strict), the text between '<' and ' name="' is one of var_type_list, and a
// closing quote follows the name.
// Returns true when a variable is found; nothing is appended otherwise.
bool find_var_helper(const string &str, const vector<string> &var_type_list,
                     vector<string> &var_type, vector<string> &var_name) {

    // Every var block has spaces before '<'; a line that does not start with
    // a space or that only consists of spaces is ignored.
    const size_t lt_pos = str.find_first_not_of(' ');
    if (lt_pos == 0 || lt_pos == string::npos)
        return false;

    // '<' opens the element, something must follow it and '>' ends the line.
    if (str[lt_pos] != '<' || str.size() <= (lt_pos + 1) || str[str.size() - 1] != '>')
        return false;

    // Quick rejection: the character right after '<' must be the first
    // character of one of the possible variable types.
    const string type_initials = "FIUBSC";
    if (type_initials.find(str[lt_pos + 1]) == string::npos)
        return false;

    // Find ' name="' after the first type character, like <Int16 name="d16_1">
    const string name_attr = " name=\"";
    const size_t attr_pos = str.find(name_attr, lt_pos + 2);
    if (attr_pos == string::npos)
        return false;

    // The text between '<' and ' name="' must be a supported type.
    const string type_token = str.substr(lt_pos + 1, attr_pos - lt_pos - 1);
    const vector<string>::const_iterator matched_type =
        std::find(var_type_list.begin(), var_type_list.end(), type_token);
    if (matched_type == var_type_list.end())
        return false;

    // Find the end quote of the variable name. The search starts one
    // character after the beginning of the name.
    const size_t name_begin = attr_pos + name_attr.size();
    const size_t name_end = str.find('"', name_begin + 1);
    if (name_end == string::npos)
        return false;

    // Both the var type and the var name are found. Store them.
    var_type.push_back(*matched_type);
    var_name.push_back(str.substr(name_begin, name_end - name_begin));
    return true;
}
1147
1148bool find_var(const string &str, const vector<string> &var_type_list, vector<string>&var_type,
1149 vector<string>&var_name, vector<unsigned int> &var_lines,unsigned int line_num) {
1150
1151 bool ret_value = find_var_helper(str,var_type_list,var_type,var_name);
1152 if (ret_value == true)
1153 var_lines.push_back(line_num);
1154 return ret_value;
1155}
1156
// Find the beginning of a group in a line like
//     <Group name="foo">
//   str:           one line of the dmrpp file.
//   line_num:      the number of this line.
//   grp_names:     output; the group name is appended when a group is found.
//   grp_lines:     output; line_num is appended when a group is found.
//   end_grp_lines: output; line_num is also appended here for an empty group,
//                  i.e. when '/' directly follows the closing quote of the
//                  name, like <Group name="FILE_ATTRIBUTES"/>
// A line is accepted only when it starts with at least one space, its first
// non-space character is '<' followed by 'G', its last character is '>'
// (maybe this is too strict) and it contains 'Group name="' with a closing
// quote after the name.
// Returns true when a group is found; nothing is appended otherwise.
bool find_grp(const string &str, unsigned int line_num, vector<string> &grp_names,
              vector<unsigned int> &grp_lines, vector<unsigned int> &end_grp_lines) {

    // Every group block has spaces before '<'; a line that does not start
    // with a space or that only consists of spaces is ignored.
    const size_t lt_pos = str.find_first_not_of(' ');
    if (lt_pos == 0 || lt_pos == string::npos)
        return false;

    // '<' opens the element, something must follow it and '>' ends the line.
    if (str[lt_pos] != '<' || str.size() <= (lt_pos + 1) || str[str.size() - 1] != '>')
        return false;

    // The character right after '<' must be 'G'.
    if (str[lt_pos + 1] != 'G')
        return false;

    // Find 'Group name="', like <Group name="g1">
    const string grp_attr = "Group name=\"";
    const size_t attr_pos = str.find(grp_attr, lt_pos + 1);
    if (attr_pos == string::npos)
        return false;

    // Find the end quote of the group name. The search starts one character
    // after the beginning of the name.
    const size_t name_begin = attr_pos + grp_attr.size();
    const size_t name_end = str.find('"', name_begin + 1);
    if (name_end == string::npos)
        return false;

    grp_names.push_back(str.substr(name_begin, name_end - name_begin));
    grp_lines.push_back(line_num);

    // The empty group case: this line is the end of the group as well.
    const size_t after_quote = name_end + 1;
    if (after_quote < str.size() && str[after_quote] == '/')
        end_grp_lines.push_back(line_num);

    return true;
}
1221
// Find the end of a group block, i.e. a line that ends with </Group>
// There may be anything (usually spaces) before </Group>, but the first
// </Group> of the line must also be the very end of the line.
//   dmrpp_line:    one line of the dmrpp file.
//   line_num:      the number of this line.
//   end_grp_lines: output; line_num is appended when the line ends a group.
// Returns true when the line ends a group.
bool find_end_grp(const string &dmrpp_line,unsigned int line_num, vector<unsigned int> &end_grp_lines) {

    const string close_tag = "</Group>";

    const size_t tag_pos = dmrpp_line.find(close_tag);
    if (tag_pos == string::npos || (tag_pos + close_tag.size()) != dmrpp_line.size())
        return false;

    end_grp_lines.push_back(line_num);
    return true;
}
1236
1237
1238
1239// Obtain the variable path.
1240string obtain_var_grp_paths(const vector<unsigned int> &gs_line_nums,
1241 const vector<unsigned int> &ge_line_nums,
1242 const vector<string> &grp_names,
1243 unsigned int var_line) {
1244 string ret_value;
1245
1246 vector<unsigned int> gse_line_nums;
1247 vector<bool> is_group_start;
1248
1249 unsigned int end_grp_index = 0;
1250 unsigned int start_grp_index = 0;
1251
1252 // The maximum index of the group is the number of groups minus 1 since index is from 0.
1253 unsigned int max_grp_index = gs_line_nums.size() -1;
1254
1255 // We combine both group lines and end_group lines to one vector.
1256 // Another vector of bool with the same size is created to mark if
1257 // this line is a start_of_a_group or an end_of_a_group.
1258 // During this process, we also eliminate the trivial groups.
1259
1260 while (end_grp_index <= max_grp_index) {
1261
1262 while (start_grp_index <= max_grp_index) {
1263
1264 if (gs_line_nums[start_grp_index] < ge_line_nums[end_grp_index]) {
1265 gse_line_nums.push_back(gs_line_nums[start_grp_index]);
1266 is_group_start.push_back(true);
1267 start_grp_index++;
1268 }
1269 else if (gs_line_nums[start_grp_index] == ge_line_nums[end_grp_index]) {
1270 // Exclude the case when the starting group line is equal to the ending group line.
1271 // This is the empty group case.
1272 start_grp_index++;
1273 end_grp_index++;
1274 }
1275 else {
1276 gse_line_nums.push_back(ge_line_nums[end_grp_index]);
1277 is_group_start.push_back(false);
1278 end_grp_index++;
1279 }
1280 }
1281 if (end_grp_index < (max_grp_index+1)) {
1282 gse_line_nums.push_back(ge_line_nums[end_grp_index]);
1283 is_group_start.push_back(false);
1284 end_grp_index++;
1285 }
1286 }
1287
1288 // No need to check this. It should always be true.
1289#if 0
1290 if (is_group_start.size() != gse_line_nums.size()) {
1291 cerr<<"The group "<<endl;
1292 return ret_value;
1293 }
1294#endif
1295
1296 // Debugging info, leave the block now.
1297#if 0
1298for (unsigned int i =0; i<gse_line_nums.size();i++) {
1299 cerr<<"gse_line["<<i<<"] = "<<gse_line_nums[i] <<endl;
1300 cerr<<"is_group_start["<<i<<"] = "<<is_group_start[i] <<endl;
1301}
1302#endif
1303
1304 // Obtain the start_end_group line index just before the the variable line.
1305 int gse_line_index= obtain_gse_line_index(gse_line_nums,var_line);
1306
1307#if 0
1308cerr<<"gse_line_index: "<<gse_line_index <<endl;
1309#endif
1310
1311 // obtain group lines that this variable belongs to.
1312 vector<unsigned int> grp_path_lines;
1313
1314 if (gse_line_index >= 0) {
1315
1316 int temp_index = gse_line_index;
1317
1318 // temp_rem_grp_index indicates the groups we need to remove for this var.
1319 unsigned int temp_rem_grp_index = 0;
1320
1321 // We have to search backward.
1322 while (temp_index >= 0) {
1323
1324 // Encounter an end-group, we need to increase the index.
1325 if (is_group_start[temp_index] == false)
1326 temp_rem_grp_index++;
1327 else {
1328 // Only when the number of end-group counter is 0 for this block,
1329 // does this group path belong to this variable.
1330 if (temp_rem_grp_index == 0)
1331 grp_path_lines.push_back(gse_line_nums[temp_index]);
1332 else
1333 temp_rem_grp_index--; //Cancel one start-group and end-group
1334 }
1335 temp_index--;
1336 }
1337 }
1338
1339 // For debugging
1340#if 0
1341for (const auto &gpl:grp_path_lines)
1342cerr<<"grp_path_lines "<<gpl <<endl;
1343for (const auto &gsn:gs_line_nums)
1344cerr<<"gs_lines "<<gsn <<endl;
1345for (const auto &gn:grp_names)
1346cerr<<"group name is "<<gn <<endl;
1347#endif
1348
1349 // Both the group path for this var and the group lines are sorted.
1350 // group path is from backward. So we match the group line backward.
1351 int gl_index = gs_line_nums.size() - 1; // An gl_index starts at size-1
1352
1353 for (const auto &gpl:grp_path_lines) {
1354
1355 // Note: gl_index is modified. This is intentionally since
1356 // we don't need to search the lines already visited.
1357 // We just need to prepend the group path as we search backward.
1358 for (; gl_index >= 0; gl_index--) {
1359
1360 if (gpl == gs_line_nums[gl_index]) {
1361
1362 ret_value = "/" + grp_names[gl_index] + ret_value;
1363 gl_index--;
1364 break;
1365 }
1366 }
1367 }
1368
1369#if 0
1370cerr<<"ret_value is "<<ret_value <<endl;
1371#endif
1372
1373 return ret_value;
1374
1375}
1376
// Obtain the start_end_group line index just before the variable line, i.e.
// the largest index whose line number is not greater than var_line.
//   gse_line_nums: the combined group-start and group-end line numbers.
//   var_line:      the line number of the variable.
// Returns -1 when there is no such line before this var.
int obtain_gse_line_index(const vector<unsigned int> &gse_line_nums, unsigned int var_line) {

    // Walk from the last entry toward the first one.
    for (size_t remaining = gse_line_nums.size(); remaining > 0; --remaining) {

        const size_t candidate = remaining - 1;
        if (gse_line_nums[candidate] <= var_line)
            return static_cast<int>(candidate);
    }

    return -1;
}
1398
1399bool obtain_var_path_info(const string &fname, const vector<string> &var_type_list, vector<string> &var_fqn,
1400 vector<string> &var_type, vector<string> &var_name, vector<unsigned int> &var_lines) {
1401
1402 vector<string> grp_names;
1403 vector<unsigned int> grp_lines;
1404 vector<unsigned int> end_grp_lines;
1405
1406 bool has_group = obtain_var_grp_info(fname,var_type_list,grp_names,grp_lines,end_grp_lines,var_type, var_name,var_lines);
1407 if (!has_group) {
1408 cout <<" the missing variable info shows this dmrpp has groups, however, no group is found. "<<endl;
1409 return false;
1410 }
1411 for (unsigned int i =0; i <var_lines.size(); i++) {
1412 string var_path = obtain_var_grp_paths(grp_lines,end_grp_lines,grp_names,var_lines[i]);
1413 string vfqn = var_path + "/" + var_name[i];
1414 var_fqn.push_back(vfqn);
1415 }
1416
1417// For debugging
1418#if 0
1419for (unsigned int i = 0; i <var_lines.size(); i++) {
1420cerr<<" var fqn: "<<var_fqn[i] <<endl;
1421cerr<<" var line: "<<var_lines[i] <<endl;
1422
1423}
1424#endif
1425
1426 return true;
1427}
1428
1429bool obtain_var_grp_info(const string &fname,const vector<string> &var_type_list, vector<string> &grp_names,
1430 vector<unsigned int> &grp_lines, vector<unsigned int> &end_grp_lines,
1431 vector<string> &var_type, vector<string> &var_name, vector<unsigned int> &var_lines) {
1432
1433 string dmrpp_line;
1434
1435 // find <Group> and </Group>
1436 bool find_grp_start = false;
1437 bool find_grp_end = false;
1438
1439 unsigned int line_num = 0;
1440
1441 ifstream dmrpp_fstream;
1442 dmrpp_fstream.open(fname.c_str(),ifstream::in);
1443
1444 while(getline(dmrpp_fstream,dmrpp_line)) {
1445
1446 find_grp_start = find_grp(dmrpp_line,line_num,grp_names,grp_lines,end_grp_lines);
1447
1448 if (find_grp_start == false)
1449 find_grp_end = find_end_grp(dmrpp_line,line_num,end_grp_lines);
1450 if (!find_grp_start && !find_grp_end)
1451 find_var(dmrpp_line,var_type_list,var_type, var_name,var_lines,line_num);
1452 line_num++;
1453 }
1454
1455 return !(grp_names.empty());
1456
1457}
1458
1459bool merge_chunk_info_g(const string &fname, const vector<string> &var_type,const vector<string> &var_name,
1460 const vector<string> &var_candidate_fqn, const vector<string> &miss_var_fqn,
1461 const vector<string> &miss_var_type, const vector<string> &chunk_info) {
1462
1463 string dmrpp_str;
1464 bool ret_value = true;
1465
1466 // Read the "original dmrpp file" to a string
1467 file_to_string(fname,dmrpp_str);
1468
1469#if 0
1470cout <<"dmrpp_str is "<<dmrpp_str<<endl;
1471#endif
1472
1473 // Now find the *possible* missing variable positions in the dmrpp_str based on the variable names
1474 // and types in the original dmrpp file.
1475 vector<size_t> var_candidate_pos;
1476 ret_value = obtain_miss_var_candidate_pos(dmrpp_str, var_type, var_name,var_candidate_pos);
1477
1478 if (ret_value == false)
1479 return ret_value;
1480#if 0
1481for (const auto &vcp:var_candidate_pos)
1482 cout <<"pos is: "<<vcp <<endl;
1483#endif
1484
1485
1486 // Convert string according to the string positions.
1487 vector<string> dmrpp_vec;
1488 vector<string> ordered_chunk_info;
1489
1490 // Find the positions and the datatypes of the final missing variables in the original dmrpp file.
1491 vector<string> final_var_type;
1492 vector<size_t> var_pos;
1493 obtain_final_miss_var_info(var_candidate_fqn,miss_var_fqn,miss_var_type,final_var_type,var_candidate_pos, var_pos,chunk_info,ordered_chunk_info);
1494
1495#if 0
1496for (const auto &oci:ordered_chunk_info)
1497 cout << "chunk info: "<<oci <<endl;
1498for (const auto &fvt:final_var_type)
1499 cout << "fvt: "<<fvt <<endl;
1500
1501#endif
1502
1503 string_tokenize_by_pos(dmrpp_str, var_pos, dmrpp_vec);
1504
1505 ret_value = merge_chunk_info_vec(dmrpp_vec, final_var_type, ordered_chunk_info);
1506 if (ret_value == true)
1507 write_vec_to_file(fname,dmrpp_vec);
1508
1509 return ret_value;
1510}
1511
// Obtain the positions of the candidate variables in the dmrpp string.
// For every variable the text <TYPE name="NAME"> is searched, starting right
// behind the match of the previous variable, so the variables must be given
// in the order they appear in the string.
//   dmrpp_str: the content of the original dmrpp file.
//   var_type/var_name: parallel lists holding the type and the name of every
//              candidate variable.
//   var_pos:   output; the position of every variable found is appended.
// Returns false, after printing a message, when var_type holds fewer entries
// than var_name or when a variable cannot be found. The positions found before
// the failing variable stay in var_pos.
bool obtain_miss_var_candidate_pos(const string &dmrpp_str, const vector<string> &var_type,
                                   const vector<string> &var_name, vector<size_t> &var_pos) {

    // Every name needs its type.
    if (var_type.size() < var_name.size()) {
        cout << "The number of var types is smaller than the number of var names " << endl;
        return false;
    }

    size_t str_start_pos = 0;
    for (size_t i = 0; i < var_name.size(); i++) {

        const string var_sign = "<" + var_type[i] + " name=\"" + var_name[i] + "\">";
        const size_t v_pos = dmrpp_str.find(var_sign, str_start_pos);
        if (v_pos == string::npos) {
            cout << "Cannot find the var name " << var_name[i] << " in the original dmrpp file " << endl;
            return false;
        }

        var_pos.push_back(v_pos);

        // The next variable is searched behind this one.
        str_start_pos = v_pos + var_sign.size();
    }

    return true;
}
1532
// Select, among the candidate variables, the ones that are really missing and
// put the type and the chunk information of each of them in candidate order.
//   var_fqn:           the fully qualified names of the candidate variables.
//   miss_var_fqn/miss_var_type/chunk_info: parallel lists holding the fully
//                      qualified name, the type and the chunk information of
//                      every missing variable.
//   var_candidate_pos: the position of every candidate variable, parallel to var_fqn.
//   var_pos/final_var_type/ordered_chunk_info: output; for every candidate
//                      that is also a missing variable its position, the type
//                      and the chunk information of the first missing variable
//                      with the same name are appended.
void obtain_final_miss_var_info(const vector<string> &var_fqn, const vector<string> &miss_var_fqn,
                                const vector<string> &miss_var_type, vector<string> &final_var_type,
                                const vector<size_t> &var_candidate_pos, vector<size_t> &var_pos,
                                const vector<string> &chunk_info, vector<string> &ordered_chunk_info) {

    for (size_t cand = 0; cand < var_fqn.size(); ++cand) {

        const vector<string>::const_iterator hit =
            std::find(miss_var_fqn.begin(), miss_var_fqn.end(), var_fqn[cand]);
        if (hit == miss_var_fqn.end())
            continue;

        // Following the candidate order assures that the chunk info is put in
        // the right place of a missing variable in the original dmrpp file.
        const size_t miss_idx = hit - miss_var_fqn.begin();
        var_pos.push_back(var_candidate_pos[cand]);
        final_var_type.push_back(miss_var_type[miss_idx]);
        ordered_chunk_info.push_back(chunk_info[miss_idx]);
    }
}
1552
1553
1554bool merge_chunk_info_vec(vector<string> &dmrpp_vec, const vector<string> &miss_var_type,
1555 const vector<string> &ordered_chunk_info) {
1556
1557 bool ret_value = true;
1558 // Note: the first element of the dmrpp_vec doesn't have chunk info.
1559 for (unsigned int i = 1; i < dmrpp_vec.size(); i++) {
1560 string temp_dmrpp_seg = dmrpp_vec[i];
1561 ret_value = insert_chunk_info_to_vec(temp_dmrpp_seg, miss_var_type[i-1], ordered_chunk_info[i-1]);
1562 if (ret_value == false)
1563 break;
1564 else
1565 dmrpp_vec[i] = temp_dmrpp_seg;
1566 }
1567
1568 return ret_value;
1569}
1570
// Insert the chunk information into one segment of the split dmrpp.
// A newline followed by chunk_info is placed right after the last '>' that
// precedes the first </var_type> of the segment.
//   dmrpp_block_str: in/out; the segment, only changed on success.
//   var_type:        the type of the variable, used to build </var_type>.
//   chunk_info:      the chunk information to insert.
// Returns false, after printing a message, when </var_type> or a '>' in front
// of it cannot be found.
bool insert_chunk_info_to_vec(string &dmrpp_block_str, const string &var_type, const string &chunk_info) {

    const string end_var = "</" + var_type + '>';
    const size_t end_var_pos = dmrpp_block_str.find(end_var);
    if (end_var_pos == string::npos) {
        cout << "Cannot find:\n "<<end_var << " \n in the string \n"<<dmrpp_block_str <<endl;
        return false;
    }

    // The chunk info goes behind the element that comes right before </var_type>.
    const char add_chunk_mark = '>';
    const size_t chunk_mark_pos = dmrpp_block_str.rfind(add_chunk_mark, end_var_pos);
    if (chunk_mark_pos == string::npos) {
        cout << "Cannot find:\n "<<add_chunk_mark << " \n in the string \n"<<dmrpp_block_str <<endl;
        return false;
    }

    dmrpp_block_str.insert(chunk_mark_pos + 1, '\n' + chunk_info);
    return true;
}
1598