// Forward declarations of the helper routines used by this dmrpp merge tool.

// Scan the text of a dmrpp file for variable blocks and collect, per variable,
// its type, its name and the raw chunk description.
bool obtain_var_info(const string &miss_dmrpp_info, const vector<string> &var_type_check_list,
                     vector<string> &var_types, vector<string> &var_names, vector<string> &chunk_info_list,
                     bool &is_chunk_mark1);
bool find_var_name(const string &str, size_t &str_pos, size_t &var_name_pos_start, size_t &var_name_pos_end);
bool find_end_var_block(const string &str, const string &var_type, const size_t &str_pos, size_t &var_end_pos);
bool find_chunk_info(const string &str, const size_t &str_pos, size_t &chunk_info_pos_start,
                     size_t &chunk_info_pos_end, const size_t &var_end_pos, bool &is_mark1);

// Line-oriented scan of a dmrpp file: groups, variables and fully qualified names.
bool obtain_var_path_info(const string &fname, const vector<string> &var_type_list, vector<string> &var_path,
                          vector<string> &var_type, vector<string> &var_name, vector<unsigned int> &var_lines);
bool obtain_var_grp_info(const string &fname, const vector<string> &var_type_list, vector<string> &grp_names,
                         vector<unsigned int> &grp_lines, vector<unsigned int> &end_grp_lines,
                         vector<string> &var_type, vector<string> &var_name, vector<unsigned int> &var_lines);
string obtain_var_grp_paths(const vector<unsigned int> &gs_line_nums, const vector<unsigned int> &ge_line_nums,
                            const vector<string> &grp_names, unsigned int var_line);
int obtain_gse_line_index(const vector<unsigned int> &gse_line_nums, unsigned int var_line);
bool find_grp(const string &str, unsigned int line_num, vector<string> &grp_names,
              vector<unsigned int> &grp_lines, vector<unsigned int> &end_grp_lines);
bool find_end_grp(const string &dmrpp_line, unsigned int line_num, vector<unsigned int> &end_grp_lines);
bool find_var_helper(const string &dmrpp_line, const vector<string> &var_type_list, vector<string> &var_type,
                     vector<string> &var_name);
bool find_var(const string &str, const vector<string> &var_type_list, vector<string> &var_type,
              vector<string> &var_name, vector<unsigned int> &var_lines, unsigned int line_num);

// Merging of chunk information when the dmrpp file contains groups.
bool merge_chunk_info_g(const string &str, const vector<string> &var_type, const vector<string> &var_name,
                        const vector<string> &var_fqn, const vector<string> &miss_var_fqn,
                        const vector<string> &miss_var_type, const vector<string> &chunk_info);
bool obtain_miss_var_candidate_pos(const string &dmrpp_str, const vector<string> &var_type,
                                   const vector<string> &var_name, vector<size_t> &var_pos);
void obtain_final_miss_var_info(const vector<string> &var_fqn, const vector<string> &miss_var_fqn,
                                const vector<string> &var_type, vector<string> &final_var_type,
                                const vector<size_t> &var_candidate_pos, vector<size_t> &var_pos,
                                const vector<string> &chunk_info, vector<string> &ordered_chunk_info);
bool merge_chunk_info_vec(vector<string> &dmrpp_vec, const vector<string> &miss_var_type,
                          const vector<string> &ordered_chunk_info);
bool insert_chunk_info_to_vec(string &dmrpp_block_str, const string &var_type, const string &chunk_info);

// Insertion of the data-file address (href) into chunk descriptions.
bool add_faddr_chunk_info_simple(vector<string> &chunk_info_list, bool is_dmrpp_mark1,
                                 const string &faddr_source = "");
bool add_faddr_contig_line(string &chunk_info, const string &file_addr);
bool add_faddr_chunk_comp_lines(string &chunk_info, const string &file_addr);

// Merging of chunk information when the dmrpp file has no groups.
bool add_missing_info_to_file(const string &fname2, const vector<string> &var_types,
                              const vector<string> &var_names, const vector<string> &chunk_info_list);
void gen_block(const vector<string> &var_type_list, const vector<string> &var_name_list,
               vector<string> &block_begin, vector<string> &block_end);
bool check_overlap_intervals(const vector<size_t> &sort_block_pos, const vector<size_t> &block_pos_start);
void obtain_bindex_in_modified_string(const vector<size_t> &block_pos_start, vector<int> &block_index);
bool split_string(const string &str, vector<string> &str_vec, const vector<string> &block_begin,
                  const vector<string> &block_end, vector<int> &block_index);
bool convert_dmrppstr_to_vec(const string &dmrpp_str, vector<string> &dmrpp_str_vec,
                             const vector<string> &var_types, const vector<string> &var_names,
                             vector<int> &block_index);
void add_missing_info_to_vec(vector<string> &dmrpp_str_vec, const vector<string> &chunk_info_list,
                             const vector<int> &block_index);
void write_vec_to_file(const string &fname, const vector<string> &dmrpp_str_vec);

// General file and string utilities.
void file_to_string(const string &filename, string &out);
bool string_tokenize(const string &in_str, const char delim, vector<string> &out_vec);
bool string_tokenize_by_pos(const string &in_str, const vector<size_t> &var_pos, vector<string> &out_vec);
// Entry point of the dmrpp merge tool.
// Inputs used below: argv[1] = supplemental dmrpp file, argv[2] = dmrpp file to
// be patched, argv[3] = href of the data file that holds the missing data,
// argv[4] = text file listing the missing variable paths (see the usage text).
// NOTE(review): this listing carries fused line-number residue and appears to
// have lost its short lines (braces, `return`/`else`, the guard around the
// usage text, the declaration of `delim`, the assignment to `handle_grp`, ...).
// Confirm the control flow against the pristine file.
96int main(
int argc,
char**argv)
// Parallel per-variable results filled by obtain_var_info().
99 vector<string> var_types;
100 vector<string> var_names;
101 vector<string> chunk_info_list;
103 bool add_dmrpp_info =
false;
104 bool is_chunk_mark1 =
true;
106 string missing_dmrpp_str;
// Usage text. NOTE(review): the condition guarding it is not visible here.
109 cout<<
"Please provide four arguments: "<< endl;
110 cout<<
" The first is the dmrpp file that contains the information of the variable of which";
111 cout<<
" the data cannot be found in the original HDF5/4 file but can be found"
112 <<
" from the HDF5 file pointed by this dmrpp file. "<<endl;
113 cout<<
" The second is the dmrpp file for the original HDF5/4 file. "<<endl;
114 cout<<
" The third one is the href to HDF5/HDF4 file of which the missing data is stored. "<<endl;
115 cout<<
" The fourth one is the text file that includes the variable path of which"
116 <<
" the data cannot be found in the original HDF5/4 file. "<<endl;
118 cout <<
" Warning: before running this program, one must run the check_dmrpp program first to see "
119 <<
"if the original dmrpp file contains any missing variable of which the data cannot be found "
120 <<
"in the original HDF5/HDF4 file. "<<endl;
// Element names that are treated as variable types when scanning dmrpp text.
125 vector<string> var_type_check_list;
127 var_type_check_list.emplace_back(
"Float32");
128 var_type_check_list.emplace_back(
"Int32");
129 var_type_check_list.emplace_back(
"Float64");
130 var_type_check_list.emplace_back(
"Byte");
131 var_type_check_list.emplace_back(
"Int16");
132 var_type_check_list.emplace_back(
"UInt16");
133 var_type_check_list.emplace_back(
"String");
134 var_type_check_list.emplace_back(
"UInt32");
135 var_type_check_list.emplace_back(
"Int8");
136 var_type_check_list.emplace_back(
"Int64");
137 var_type_check_list.emplace_back(
"UInt64");
138 var_type_check_list.emplace_back(
"UInt8");
139 var_type_check_list.emplace_back(
"Char");
// Load the supplemental dmrpp file (argv[1]) and extract type/name/chunk info
// for every variable found in it.
142 string fname(argv[1]);
145 file_to_string(fname,missing_dmrpp_str);
148 add_dmrpp_info = obtain_var_info(missing_dmrpp_str,var_type_check_list,var_types,var_names,
149 chunk_info_list,is_chunk_mark1);
// Nothing usable was extracted: report it.
152 if (
false == add_dmrpp_info) {
153 cout<<
"Cannot find the corresponding chunk info. from the supplemental dmrpp file."<<endl;
154 cout<<
"You may need to check if there is any variable in the dmrpp file. "<<endl;
155 cout<<
"The dmrpp file is "<<fname <<endl;
// The three result vectors are indexed in parallel below, so their sizes must agree.
159 if (var_types.size() != var_names.size() || var_names.size() != chunk_info_list.size()) {
160 cout <<
"Var type, var name and chunk_info must have the same number of elements. "<<endl;
161 cout <<
"The dmrpp file is "<<fname <<endl;
// Dump of the extracted information. NOTE(review): the column-0 statements
// look like debug output that is compiled out in the pristine file - confirm.
167 for (
size_t i =0; i<var_names.size();i++) {
168cout<<
"var type["<<i<<
"] "<< var_types[i]<<endl;
169cout<<
"var name["<<i<<
"] "<< var_names[i]<<endl;
170cout<<
"chunk_info_list["<<i<<
"] "<< chunk_info_list[i] << endl;
// Load the list of missing variable paths (argv[4]).
176 string mvar_fname(argv[4]);
177 string missing_vname_str;
180 file_to_string(mvar_fname,missing_vname_str);
182 if (missing_vname_str.empty()) {
183 cout<<
" The text file that has the data-missing variable path is empty." <<endl;
184 cout<<
" Please check the file. "<<endl;
// Drop one trailing newline so it does not end up in the last name.
188 if (missing_vname_str[missing_vname_str.size()-1]==
'\n')
189 missing_vname_str = missing_vname_str.substr(0,missing_vname_str.size()-1);
192cout<<
"missing_vname_str: "<<missing_vname_str<<endl;
// Split the text into individual variable paths.
// NOTE(review): `delim` is declared on a line not visible here, and the list
// is cleared and tokenized a second time; the condition and the change of
// `delim` between the two calls are not visible - confirm.
195 vector<string> missing_vname_list;
200 bool has_delim = string_tokenize(missing_vname_str,delim,missing_vname_list);
203 missing_vname_list.clear();
204 string_tokenize(missing_vname_str,delim,missing_vname_list);
// Look for a '/' somewhere other than position 0 in any missing variable path.
// NOTE(review): the body of this check is missing; presumably it sets
// `handle_grp` - confirm.
210 bool handle_grp =
false;
211 for (
const auto &mv_name:missing_vname_list) {
213 size_t path_pos = mv_name.find_last_of(
'/');
216 if (path_pos !=string::npos && path_pos!=0) {
224 for(
size_t i = 0;i<missing_vname_list.size();i++)
225 cout <<
"missing_vname_list["<<i<<
"]= "<<missing_vname_list[i]<<endl;
// ---- Group case: variables are matched by fully qualified name ----
229 if (handle_grp ==
true) {
// Re-scan the supplemental dmrpp file line by line to obtain fully qualified
// names (fqn) for its variables.
232 vector<string> mdp_var_fqn;
233 vector<string> mdp_var_names_g;
234 vector<string> mdp_var_types_g;
235 vector<unsigned int> mdp_var_lines;
237 if (
false == obtain_var_path_info(fname, var_type_check_list, mdp_var_fqn, mdp_var_types_g,
238 mdp_var_names_g, mdp_var_lines))
243 vector<string> new_var_types;
244 vector<string> new_var_names;
245 vector<string> new_var_fqns;
246 vector<string> new_chunk_info_list;
// Both scans of the same file must yield the same names in the same order,
// because chunk_info_list[i] is paired with mdp_var_fqn[i] below.
248 if (mdp_var_names_g != var_names) {
249 cout <<
" Internal error: variable names should be the same even retrieved with different methods."<<endl;
// Keep only the variables whose fqn is listed as missing.
253 for (
size_t i =0; i<mdp_var_fqn.size();i++) {
254 for (
const auto & mvl:missing_vname_list) {
255 if (mdp_var_fqn[i] == mvl) {
256 new_var_names.push_back(mdp_var_names_g[i]);
257 new_var_fqns.push_back(mdp_var_fqn[i]);
258 new_var_types.push_back(mdp_var_types_g[i]);
259 new_chunk_info_list.push_back(chunk_info_list[i]);
// Add the data-file address (argv[3]) to the selected chunk descriptions.
266 string fadd_source(argv[3]);
267 add_faddr_chunk_info_simple(new_chunk_info_list,is_chunk_mark1,fadd_source);
274for (
const auto &nc_info:new_chunk_info_list)
275cout <<
"chunk_info "<<nc_info <<endl;
// Scan the dmrpp file to be patched (argv[2]) the same way.
280 string fname2(argv[2]);
281 vector<string> odp_var_fqn;
282 vector<string> odp_var_names_g;
283 vector<string> odp_var_types_g;
284 vector<unsigned int> odp_var_lines;
288 if (
false == obtain_var_path_info(fname2,var_type_check_list,odp_var_fqn,
289 odp_var_types_g,odp_var_names_g,odp_var_lines))
// Candidates in the file to be patched: same name and same type as a selected
// missing variable (the fqn is compared later).
301 vector<unsigned int> final_odp_var_lines;
302 vector<string> final_odp_var_fqns;
303 vector<string> final_odp_var_names;
304 vector<string> final_odp_var_types;
306 for (
unsigned int i = 0; i < odp_var_names_g.size();i++) {
307 for (
unsigned int j = 0; j < new_var_names.size(); j++) {
308 if ((odp_var_names_g[i] == new_var_names[j]) && (odp_var_types_g[i] == new_var_types[j])) {
309 final_odp_var_lines.push_back(odp_var_lines[i]);
310 final_odp_var_fqns.push_back(odp_var_fqn[i]);
311 final_odp_var_names.push_back(odp_var_names_g[i]);
312 final_odp_var_types.push_back(odp_var_types_g[i]);
320cout <<
" Before the final step "<<endl;
321for (
unsigned int i = 0; i <final_odp_var_types.size(); i++) {
322cout <<
"vtype: "<<final_odp_var_types[i] <<endl;
323cout <<
"vname: "<<final_odp_var_names[i] <<endl;
324cout <<
"vfqn: "<<final_odp_var_fqns[i] <<endl;
325cout <<
"new vfqn: "<<new_var_fqns[i] <<endl;
// Merge the selected chunk descriptions into the file to be patched.
330 merge_chunk_info_g(fname2,final_odp_var_types,final_odp_var_names,
331 final_odp_var_fqns,new_var_fqns,new_var_types,
332 new_chunk_info_list);
// Line numbers (in the file to be patched) of the variables whose fqn matches
// a missing variable. Only used by the print below in the visible code.
340 vector <unsigned int> missing_chunk_info_lines;
341 for (
unsigned int i = 0; i < new_var_fqns.size(); i++) {
342 for (
unsigned int j = 0; j <final_odp_var_fqns.size(); j++) {
343 if (new_var_fqns[i] == final_odp_var_fqns[j]) {
344 missing_chunk_info_lines.push_back(final_odp_var_lines[j]);
353for (
const auto &mcil:missing_chunk_info_lines)
354 cout <<
"missing chunk info line is: "<<mcil <<endl;
// ---- No-group case: variables are matched by plain name ----
// NOTE(review): the `else` that opens this branch is not visible - confirm.
361cout <<
"coming to the nogroup case"<<endl;
365 vector<string> new_var_types;
366 vector<string> new_var_names;
367 vector<string> new_chunk_info_list;
// Strip a leading '/' so the listed paths compare equal to bare variable names.
370 vector<string> missing_vname_list_trim;
371 for (
const auto &mvname:missing_vname_list) {
372 string temp_str = mvname;
373 if (temp_str[0] ==
'/')
374 temp_str = temp_str.substr(1);
375 missing_vname_list_trim.emplace_back(temp_str);
// Keep only the variables that are listed as missing.
378 for (
size_t i =0; i<var_names.size();i++) {
379 for (
const auto &mvname:missing_vname_list_trim) {
380 if (var_names[i] == mvname) {
381 new_var_names.push_back(var_names[i]);
382 new_var_types.push_back(var_types[i]);
383 new_chunk_info_list.push_back(chunk_info_list[i]);
// Add the data-file address (argv[3]) to the selected chunk descriptions.
390 string fadd_source(argv[3]);
395 add_faddr_chunk_info_simple(new_chunk_info_list,is_chunk_mark1,fadd_source);
398for (
size_t i =0; i<new_var_types.size();i++) {
399 cout<<
"new chunk_info_list["<<i<<
"]"<< endl;
400 cout<<new_chunk_info_list[i]<<endl;
// Patch the dmrpp file (argv[2]) in place and report a failure.
404 string fname2(argv[2]);
407 bool well_formed = add_missing_info_to_file(fname2,new_var_types,new_var_names,
408 new_chunk_info_list);
410 if (
false == well_formed) {
411 cout <<
"The dmrpp file to be modified is either not well-formed or contains nested variable blocks ";
412 cout <<
"that cannot be supported by this routine. " <<endl;
413 cout <<
"The dmrpp file is "<<fname2<<endl;
// Walks through the text of a dmrpp file and, for every variable block found,
// appends its type to var_types, its name to var_names and its chunk
// description to chunk_info_list.
//   miss_dmrpp_info     - whole dmrpp file as one string
//   var_type_check_list - element names that count as variable types
//   is_chunk_mark1      - passed on to find_chunk_info(); selects which
//                         terminator is appended to the extracted chunk text
// NOTE(review): short lines (declarations of str_pos / i / var_sign, loop
// increments, `break`, `else`, `return`) are missing from this listing; the
// comments below describe only what the visible statements do.
423bool obtain_var_info(
const string &miss_dmrpp_info,
const vector<string> &var_type_check_list,
424 vector<string> &var_types, vector<string> &var_names,vector<string> &chunk_info_list,
425 bool &is_chunk_mark1) {
429 size_t var_type_pos_start = 0;
430 size_t var_name_pos_start = 0;
431 size_t var_name_pos_end = 0;
432 size_t chunk_pos_start = 0;
433 size_t chunk_pos_end = 0;
434 size_t var_end_pos = 0;
// Guard for empty input; size() - 1 below would wrap around for an empty string.
437 if (miss_dmrpp_info.empty())
440 size_t str_last_char_pos = miss_dmrpp_info.size() - 1;
441 bool well_formed =
true;
// One iteration per variable block, scanning forward from str_pos.
444 while (str_pos <= str_last_char_pos && well_formed) {
448 string temp_var_sign;
449 size_t temp_var_type_pos_start = string::npos;
450 int var_type_index = -1;
// Try the opening tag `<TYPE name="` of every known type and remember the
// occurrence closest to str_pos (smallest position).
455 while (i < var_type_check_list.size()) {
457 var_sign =
"<" + var_type_check_list[i] +
" name=\"";
458 var_type_pos_start = miss_dmrpp_info.find(var_sign, str_pos);
460 if (var_type_pos_start == string::npos) {
466 if (temp_var_type_pos_start > var_type_pos_start) {
467 temp_var_type_pos_start = var_type_pos_start;
469 temp_var_sign = var_sign;
// Adopt the closest match, if any.
476 if (temp_var_type_pos_start != string::npos) {
477 var_type_pos_start = temp_var_type_pos_start;
478 var_sign = temp_var_sign;
// No further variable block: str_pos becomes npos, which fails the outer loop condition.
482 if (var_type_pos_start == string::npos) {
483 str_pos = string::npos;
// Skip the opening tag; str_pos now sits on the first character of the name.
486 str_pos = var_type_pos_start + var_sign.size();
// Locate the name, the end of the block and the chunk text, in that order.
490 if (
false == find_var_name(miss_dmrpp_info, str_pos, var_name_pos_start, var_name_pos_end))
492 else if (
false == find_end_var_block(miss_dmrpp_info, var_type_check_list[var_type_index],
493 str_pos, var_end_pos))
495 else if (
false == find_chunk_info(miss_dmrpp_info, str_pos, chunk_pos_start, chunk_pos_end,
496 var_end_pos, is_chunk_mark1))
// Continue scanning behind this variable block.
501 str_pos = var_end_pos + 1;
// Record the variable. The chunk text is cut before its end marker, so the
// marker is appended again here.
// NOTE(review): the `else` between the two appends is presumably on a
// missing line - confirm.
504 var_types.push_back(var_type_check_list[var_type_index]);
505 var_names.push_back(miss_dmrpp_info.substr(var_name_pos_start, var_name_pos_end - var_name_pos_start));
506 string temp_chunk_info = miss_dmrpp_info.substr(chunk_pos_start, chunk_pos_end - chunk_pos_start);
507 if (
true == is_chunk_mark1)
508 temp_chunk_info +=
"</dmrpp:chunks>";
510 temp_chunk_info +=
"/>";
511 chunk_info_list.push_back(temp_chunk_info);
// Delimits a variable name inside `str`.
// The name is taken to start at `str_pos` and to end just before the next
// double quote.
//   str                - text being scanned
//   str_pos            - in: first character of the name;
//                        out: position of the closing quote (unchanged on failure)
//   var_name_pos_start - out: first character of the name (always set)
//   var_name_pos_end   - out: position of the closing quote, or string::npos
// Returns true when a closing quote was found.
bool find_var_name(const string &str, size_t &str_pos, size_t &var_name_pos_start, size_t &var_name_pos_end) {

    var_name_pos_start = str_pos;
    var_name_pos_end = str.find('"', str_pos);
    if (var_name_pos_end == string::npos)
        return false;

    // Leave the cursor on the closing quote so the caller continues behind the name.
    str_pos = var_name_pos_end;
    return true;
}
// Finds the closing tag `</var_type>` of a variable block.
//   str         - text being scanned
//   var_type    - element name of the variable, e.g. "Int32"
//   str_pos     - position from which the search starts
//   var_end_pos - out: position of the '<' of the closing tag, or string::npos
// Returns true when the closing tag was found.
bool find_end_var_block(const string &str, const string &var_type, const size_t &str_pos, size_t &var_end_pos) {

    const string end_var = "</" + var_type + '>';
    var_end_pos = str.find(end_var, str_pos);
    return var_end_pos != string::npos;
}
// Locates the chunk description of one variable block inside `str`.
//   str                  - text being scanned
//   str_pos              - position from which the search starts
//   chunk_info_pos_start - out: start of the chunk text
//   chunk_info_pos_end   - out: position of the end marker of the chunk text
//   var_end_pos          - end of the enclosing variable block
//   is_mark1             - NOTE(review): presumably reports which of the two
//                          markers was found; the assignment is not visible
// NOTE(review): `wspace`, the `else` branches and the statements that set the
// result are on lines missing from this listing - confirm.
556bool find_chunk_info(
const string &str,
const size_t&str_pos,
size_t &chunk_info_pos_start,
size_t &chunk_info_pos_end,
557 const size_t&var_end_pos,
bool & is_mark1){
// Two forms are recognised: a `<dmrpp:chunks ...> ... </dmrpp:chunks>` element
// (mark1) and a single self-closing `<dmrpp:chunk .../>` element (mark2).
560 string chunk_start_mark1 =
"<dmrpp:chunks";
561 string chunk_end_mark1 =
"</dmrpp:chunks>";
562 string chunk_start_mark2 =
"<dmrpp:chunk ";
563 string chunk_end_mark2 =
"/>";
567cout<<
"str_pos is "<<str_pos <<endl;
568cout<<
"var_end_pos is "<<var_end_pos <<endl;
569cout<<
"substr is "<<str.substr(str_pos,var_end_pos-str_pos)<<endl;
// Try the mark1 form first and fall back to the mark2 form.
572 chunk_info_pos_start = str.find(chunk_start_mark1,str_pos);
574 if (string::npos == chunk_info_pos_start) {
575 chunk_info_pos_start = str.find(chunk_start_mark2,str_pos);
576 if(string::npos != chunk_info_pos_start)
577 chunk_info_pos_end = str.find(chunk_end_mark2,str_pos);
// Move the start back over the whitespace preceding the element, so it sits
// right behind the last non-whitespace character before it.
581 chunk_info_pos_start = str.find_last_not_of(wspace,chunk_info_pos_start-1) + 1;
585 chunk_info_pos_start = str.find_last_not_of(wspace,chunk_info_pos_start-1) + 1;
586 chunk_info_pos_end = str.find(chunk_end_mark1,str_pos);
// Both positions must exist, and the end marker must lie before var_end_pos.
592 if (string::npos == chunk_info_pos_start || string::npos == chunk_info_pos_end)
594 else if (var_end_pos <= chunk_info_pos_end)
598 cout<<
"cannot find_chunk_info "<<endl;
// Adds the address of the data file to every chunk description in `chunk_info`.
//   chunk_info     - chunk descriptions, modified in place
//   is_dmrpp_mark1 - true: descriptions are handled by add_faddr_chunk_comp_lines();
//                    otherwise by add_faddr_contig_line()
//   faddr_source   - address written as ` href="<faddr_source>"`
// NOTE(review): the statements following the two early checks, the declaration
// of `hdf5_faddr`, the `else` and the final `return` are on lines missing from
// this listing - confirm the returned values.
606bool add_faddr_chunk_info_simple(vector<string>& chunk_info,
bool is_dmrpp_mark1,
const string &faddr_source) {
// Nothing to do for an empty list.
608 if (chunk_info.size() == 0)
610 string addr_mark =
"dmrpp:href=\"";
// Only the first description is inspected for an address that is already present.
615 if (chunk_info[0].find(addr_mark)!=string::npos)
620 string end_delim1 =
"\"";
// Attribute text to insert; it starts with a blank.
623 hdf5_faddr =
" href=\"" + faddr_source + end_delim1;
629 for (
size_t i = 0; i<chunk_info.size(); i++) {
636 if (
true == is_dmrpp_mark1)
637 add_faddr_chunk_comp_lines(chunk_info[i],hdf5_faddr);
639 add_faddr_contig_line(chunk_info[i],hdf5_faddr);
// Variant of add_faddr_chunk_info_simple() that additionally inspects the
// dmrpp text `str` for a ` name="..."` attribute and a `dmrpp:href="..."`
// attribute before adding the address to every entry of `chunk_info`.
// NOTE(review): neither a forward declaration nor a call of this function is
// visible in this listing. The declarations of `hdf5_fname` / `hdf5_faddr`,
// the statements following the checks, the `else` branches and the `return`
// are on missing lines; in particular it cannot be told from here which of the
// two assignments to `hdf5_faddr` takes effect - confirm.
650bool add_faddr_chunk_info(
const string &str,vector<string>& chunk_info,
bool is_dmrpp_mark1,
const string faddr_source) {
652 bool well_formed=
true;
653 if(chunk_info.size()==0)
655 string addr_mark =
"dmrpp:href=\"";
// Only the first description is inspected for an address that is already present.
660 if(chunk_info[0].find(addr_mark)!=string::npos)
666 string name_mark =
" name=\"";
667 string end_delim1 =
"\"";
// Text between the first ` name="` and the following quote.
670 size_t hdf5_fname_start_pos = str.find(name_mark);
671 if(hdf5_fname_start_pos == string::npos)
673 size_t hdf5_fname_end_pos = str.find(end_delim1,hdf5_fname_start_pos+name_mark.size());
674 if(hdf5_fname_end_pos == string::npos)
676 hdf5_fname = str.substr(hdf5_fname_start_pos+name_mark.size(),hdf5_fname_end_pos-hdf5_fname_start_pos-name_mark.size());
// Text between the first `dmrpp:href="` and the following quote, when present.
681 size_t hdf5_faddr_start_pos = str.find(addr_mark);
682 if(hdf5_faddr_start_pos != string::npos) {
683 size_t hdf5_faddr_end_pos = str.find(end_delim1,hdf5_faddr_start_pos+addr_mark.size());
684 if(hdf5_faddr_end_pos == string::npos)
686 hdf5_faddr = str.substr(hdf5_faddr_start_pos+addr_mark.size(),hdf5_faddr_end_pos-hdf5_faddr_start_pos-addr_mark.size());
// Attribute text built from the caller-supplied address.
690 hdf5_faddr =
" href=\"" + faddr_source + end_delim1;
702 for (
size_t i = 0;i<chunk_info.size();i++) {
709 if(
true == is_dmrpp_mark1)
710 add_faddr_chunk_comp_lines(chunk_info[i],hdf5_faddr);
712 add_faddr_contig_line(chunk_info[i],hdf5_faddr);
// Inserts `file_addr` in front of the "/>" terminator of every
// `<dmrpp:chunk offset=` line inside one chunk description.
//   chunk_info - chunk description, replaced in place when at least one chunk
//                line was found
//   file_addr  - text to insert
// NOTE(review): the declarations of `str_pos`, `temp_pos` and `temp_str`, the
// `else` branches, any assignment to `well_formed` and the `return` are on
// lines missing from this listing - confirm.
721bool add_faddr_chunk_comp_lines(
string & chunk_info,
const string &file_addr) {
723 string chunk_line_mark =
"<dmrpp:chunk offset=";
724 string chunk_line_end_mark =
"/>";
725 string chunk_stop_mark =
"</dmrpp:chunks>";
728 size_t chunk_line_end_pos = 0;
729 bool loop_continue =
true;
731 bool well_formed =
true;
732 bool find_chunk_line =
false;
// Each pass handles one chunk line, starting the search at str_pos.
735 while (
true == loop_continue) {
736 temp_pos = chunk_info.find(chunk_line_mark,str_pos);
737 if (temp_pos != string::npos) {
739 chunk_line_end_pos = chunk_info.find(chunk_line_end_mark,temp_pos);
740 if (chunk_line_end_pos != string::npos) {
741 find_chunk_line =
true;
// Copy everything up to (not including) "/>", then the address. str_pos stays
// on "/>", so the terminator is copied at the start of the next piece.
742 temp_str += chunk_info.substr(str_pos,chunk_line_end_pos-str_pos);
743 temp_str += file_addr;
744 str_pos = chunk_line_end_pos;
747 loop_continue =
false;
// The loop also ends here, after searching for the closing tag of the chunks element.
753 temp_pos = chunk_info.find(chunk_stop_mark,str_pos);
754 loop_continue =
false;
// Append the rest of the description, separated by one blank.
758 if (temp_pos != string::npos)
759 temp_str +=
' '+ chunk_info.substr(str_pos);
// Leave chunk_info untouched when no chunk line was found.
764 if (
true == find_chunk_line)
765 chunk_info = temp_str;
// Inserts `file_addr` into a single-chunk description.
// The text is placed in front of the first "/>" and followed by one blank, so
//   <dmrpp:chunk offset="10" nBytes="4"/>
// becomes
//   <dmrpp:chunk offset="10" nBytes="4"<file_addr> />
//   chunk_info - chunk description, modified in place
//   file_addr  - text to insert (the caller supplies any leading blank)
// Returns false, leaving chunk_info untouched, when no "/>" is present.
bool add_faddr_contig_line(string &chunk_info, const string &file_addr) {

    const string chunk_line_end_mark = "/>";

    const size_t chunk_line_end_pos = chunk_info.find(chunk_line_end_mark);
    if (string::npos == chunk_line_end_pos)
        return false;

    chunk_info.insert(chunk_line_end_pos, file_addr + ' ');
    return true;
}
795bool add_missing_info_to_file(
const string &fname,
const vector<string> &var_types,
const vector<string> &var_names,
796 const vector<string> &chunk_info_list) {
798 bool well_formed =
true;
802 file_to_string(fname,dmrpp_str);
804 vector<string> dmrpp_str_vec;
805 vector<int> block_index;
811 well_formed = convert_dmrppstr_to_vec(dmrpp_str,dmrpp_str_vec,var_types,var_names,block_index);
814 string().swap(dmrpp_str);
817 if (
true == well_formed) {
818 add_missing_info_to_vec(dmrpp_str_vec,chunk_info_list,block_index);
819 write_vec_to_file(fname,dmrpp_str_vec);
827bool convert_dmrppstr_to_vec(
const string &dmrpp_str, vector<string> &dmrpp_str_vec,
828 const vector<string> &var_types,
const vector<string> &var_names,
829 vector<int> &block_index) {
831 vector<string>block_begin;
832 block_begin.resize(var_types.size());
833 vector<string>block_end;
834 block_end.resize(var_types.size());
835 gen_block(var_types,var_names,block_begin,block_end);
838for(
size_t i =0; i<block_begin.size();i++)
840cout<<
"block_begin["<<i<<
"]= "<<block_begin[i]<<endl;
841cout<<
"block_end["<<i<<
"]= "<<block_end[i]<<endl;
846 bool well_formed = split_string(dmrpp_str,dmrpp_str_vec,block_begin,block_end,block_index);
// Inserts a chunk description into every variable block of a split dmrpp text.
// Variable blocks are the odd-numbered elements of dmrpp_str_vec; the i-th
// block receives chunk_info_list[block_index[i]]. The description is inserted,
// preceded by a newline, right behind the second-to-last '>' of the block,
// i.e. in front of the line that holds the closing tag.
//   dmrpp_str_vec   - pieces of the dmrpp text, modified in place
//   chunk_info_list - chunk descriptions
//   block_index     - for the i-th block, the index into chunk_info_list
// Blocks or indices that are out of range, and blocks with fewer than two '>'
// characters, are skipped.
void add_missing_info_to_vec(vector<string> &dmrpp_str_vec, const vector<string> &chunk_info_list,
                             const vector<int> &block_index) {

    const char insert_mark = '>';

    for (size_t i = 0; i < block_index.size(); i++) {

        const size_t block_slot = 2 * i + 1;
        const int chunk_slot = block_index[i];
        if (block_slot >= dmrpp_str_vec.size() || chunk_slot < 0 ||
            static_cast<size_t>(chunk_slot) >= chunk_info_list.size())
            continue;

        string &var_block = dmrpp_str_vec[block_slot];

        // The last '>' ends the closing tag; the one before it ends the last
        // element inside the block.
        const size_t closing_tag_end = var_block.find_last_of(insert_mark);
        if (closing_tag_end == string::npos || closing_tag_end == 0)
            continue;
        const size_t insert_pos = var_block.find_last_of(insert_mark, closing_tag_end - 1);
        if (insert_pos == string::npos)
            continue;

        var_block.insert(insert_pos + 1, '\n' + chunk_info_list[chunk_slot]);
    }
}
// Writes the pieces of a split dmrpp text back to a file, in order and without
// any separator. An existing file is overwritten.
//   fname         - file to write
//   dmrpp_str_vec - pieces to write
void write_vec_to_file(const string &fname, const vector<string> &dmrpp_str_vec) {

    ofstream out_file(fname.c_str());

    // Stream the pieces one by one; the file content is their concatenation.
    for (const auto &piece : dmrpp_str_vec)
        out_file << piece;

    out_file.close();
}
// Builds the opening and closing tag of every variable block:
//   block_begin[i] = <TYPE name="NAME">      block_end[i] = </TYPE>
//   var_type_list - element names of the variables
//   var_name_list - names of the variables; must be at least as long as var_type_list
//   block_begin   - out: opening tags; grown when it has fewer entries than var_type_list
//   block_end     - out: closing tags; grown likewise
void gen_block(const vector<string> &var_type_list, const vector<string> &var_name_list,
               vector<string> &block_begin, vector<string> &block_end) {

    const size_t num_vars = var_type_list.size();

    // Callers may pass pre-sized vectors; only grow them when they are too small.
    if (block_begin.size() < num_vars)
        block_begin.resize(num_vars);
    if (block_end.size() < num_vars)
        block_end.resize(num_vars);

    for (size_t i = 0; i < num_vars; i++) {
        block_begin[i] = '<' + var_type_list[i] + ' ' + "name=\"" + var_name_list[i] + "\">";
        block_end[i] = "</" + var_type_list[i] + '>';
    }
}
913bool split_string(
const string &str, vector<string> &str_vec,
const vector<string> &block_begin,
914 const vector<string> &block_end,vector<int> &block_index) {
916 bool well_formed =
true;
917 vector<size_t> block_begin_pos;
918 vector<size_t> block_end_pos;
919 block_begin_pos.resize(block_begin.size());
920 block_end_pos.resize(block_end.size());
928 for(
size_t i = 0; i<block_begin.size(); i++) {
929 block_begin_pos[i] = str.find(block_begin[i]);
930 block_end_pos[i] = str.find(block_end[i],block_begin_pos[i])+(block_end[i].size());
933 obtain_bindex_in_modified_string(block_begin_pos,block_index);
936for(
size_t i = 0; i<block_index.size();i++)
937cout<<
"block_index["<<i<<
"] is: "<<block_index[i] <<endl;
939 vector<size_t> block_pos;
940 block_pos.resize(2*block_begin_pos.size());
941 for (
size_t i = 0; i < block_begin.size(); i++) {
942 block_pos[2*i] = block_begin_pos[i];
943 block_pos[2*i+1] = block_end_pos[i];
947 sort(block_pos.begin(),block_pos.end());
952 well_formed = check_overlap_intervals(block_pos,block_begin_pos);
958 if (
true == well_formed) {
960 size_t str_block_pos = 0;
961 str_vec.resize(block_pos.size()+1);
962 for (
size_t i =0; i < block_pos.size(); i++) {
963 str_vec[i] = str.substr(str_block_pos,block_pos[i]-str_block_pos);
964 str_block_pos = block_pos[i];
966 str_vec[block_pos.size()] = str.substr(str_block_pos);
969for(
size_t i = 0; i <str_vec.size();i++)
970 cout<<
"str_vec["<<i<<
"] is: "<<str_vec[i] <<endl;
// Checks that a set of blocks neither overlap nor nest.
//   sort_block_pos  - start and end positions of all blocks, sorted ascending
//   block_pos_start - start position of every block, in any order
// When the blocks are separate, starts and ends alternate in the sorted list,
// so the entries at even indices are exactly the block starts. Returns true in
// that case; returns false otherwise, or when sort_block_pos holds fewer than
// two entries per block.
bool check_overlap_intervals(const vector<size_t> &sort_block_pos, const vector<size_t> &block_pos_start) {

    if (sort_block_pos.size() < 2 * block_pos_start.size())
        return false;

    set<size_t> sort_start_pos;
    for (size_t i = 0; i < block_pos_start.size(); i++)
        sort_start_pos.insert(sort_block_pos[2 * i]);

    const set<size_t> start_pos(block_pos_start.begin(), block_pos_start.end());

    return sort_start_pos == start_pos;
}
// Orders block indices by the position of the blocks in the text.
//   block_pos_start - start position of every block
//   block_index     - out: indices into block_pos_start, in ascending order of
//                     position; entries are appended, existing ones are kept
void obtain_bindex_in_modified_string(const vector<size_t> &block_pos_start, vector<int> &block_index) {

    // Pair every position with its original index, then sort by position.
    vector<pair<size_t, int> > pos_index;
    pos_index.reserve(block_pos_start.size());
    for (size_t i = 0; i < block_pos_start.size(); i++)
        pos_index.push_back(make_pair(block_pos_start[i], static_cast<int>(i)));

    sort(pos_index.begin(), pos_index.end());

    for (const auto &entry : pos_index)
        block_index.push_back(entry.second);
}
// Reads a whole file into a string.
//   filename - file to read
//   out_str  - out: content of the file; empty when the file cannot be opened
void file_to_string(const string &filename, string &out_str) {

    ifstream in_file(filename.c_str());
    if (!in_file.is_open()) {
        out_str.clear();
        return;
    }

    stringstream str_stream;
    str_stream << in_file.rdbuf();
    in_file.close();

    out_str = str_stream.str();
}
// Splits a string at every occurrence of a delimiter.
//   in_str  - text to split
//   delim   - delimiter character
//   out_vec - out: the tokens, appended in order; an empty token behind a
//             trailing delimiter is not produced
// Returns true when in_str contains the delimiter at least once.
bool string_tokenize(const string &in_str, const char delim, vector<string> &out_vec) {

    stringstream ss_str(in_str);
    string token;
    while (getline(ss_str, token, delim))
        out_vec.push_back(token);

    return in_str.find(delim) != string::npos;
}
// Cuts a string at the given positions.
//   in_str  - text to cut
//   pos     - cut positions; expected in ascending order
//   out_vec - out: pos.size()+1 pieces, appended in order: the text before
//             pos[0], the text between consecutive positions, and the text
//             from the last position to the end
// Returns false, producing nothing, when pos is empty, when the first position
// is 0, or when the last position lies outside in_str.
bool string_tokenize_by_pos(const string &in_str, const vector<size_t> &pos, vector<string> &out_vec) {

    if (pos.empty() || pos.front() == 0 || (pos.back() + 1) > in_str.size())
        return false;

    out_vec.push_back(in_str.substr(0, pos[0]));
    for (size_t i = 0; i + 1 < pos.size(); i++)
        out_vec.push_back(in_str.substr(pos[i], pos[i + 1] - pos[i]));
    out_vec.push_back(in_str.substr(pos.back()));

    return true;
}
// Checks whether one line of a dmrpp file opens a variable element, i.e. has
// the form `<TYPE name="NAME"...>` with TYPE taken from var_type_list.
//   str           - one line of the dmrpp file
//   var_type_list - element names that count as variable types
//   var_type      - out: TYPE is appended when the line matches
//   var_name      - out: NAME is appended when the line matches
// Returns true when the line matches.
bool find_var_helper(const string &str, const vector<string> &var_type_list,
                     vector<string> &var_type, vector<string> &var_name) {

    // The first non-blank character must be '<', and something must follow it.
    const size_t non_space_char_pos = str.find_first_not_of(' ');
    if (non_space_char_pos == string::npos)
        return false;
    if (str[non_space_char_pos] != '<')
        return false;
    if (str.size() <= (non_space_char_pos + 1))
        return false;

    // The line must end with '>'.
    if (str[str.size() - 1] != '>')
        return false;

    // Quick rejection: every supported type name starts with one of these letters.
    const char char_2 = str[non_space_char_pos + 1];
    const string v_1char_list = "FIUBSC";
    if (v_1char_list.find_first_of(char_2) == string::npos)
        return false;

    const string sep = " name=\"";
    const size_t sep_pos = str.find(sep, non_space_char_pos + 2);
    if (sep_pos == string::npos)
        return false;

    // The text between '<' and ` name="` must be one of the known type names.
    bool found_var_index = false;
    size_t var_index = 0;
    for (size_t i = 0; i < var_type_list.size(); i++) {
        if (str.compare(non_space_char_pos + 1, sep_pos - non_space_char_pos - 1, var_type_list[i]) == 0) {
            var_index = i;
            found_var_index = true;
            break;
        }
    }
    if (!found_var_index)
        return false;

    // The search for the closing quote starts one character into the name.
    const size_t name_pos = sep_pos + sep.size();
    const size_t end_name_pos = str.find('"', name_pos + 1);
    if (end_name_pos == string::npos)
        return false;

    var_type.push_back(var_type_list[var_index]);
    var_name.push_back(str.substr(name_pos, end_name_pos - name_pos));

    return true;
}
1148bool find_var(
const string &str,
const vector<string> &var_type_list, vector<string>&var_type,
1149 vector<string>&var_name, vector<unsigned int> &var_lines,
unsigned int line_num) {
1151 bool ret_value = find_var_helper(str,var_type_list,var_type,var_name);
1152 if (ret_value ==
true)
1153 var_lines.push_back(line_num);
// Checks whether one line of a dmrpp file opens a group element and, if so,
// records the group name and the line number.
//   str           - one line of the dmrpp file
//   line_num      - number of the line being examined
//   grp_names     - out: name of the group, appended on a match
//   grp_lines     - out: line_num, appended on a match
//   end_grp_lines - out: line_num is also appended here when a '/' directly
//                   follows the closing quote of the name
// NOTE(review): the statements following each check (presumably early
// returns), the use made of `char_2`, the declaration of `end_quote` and the
// final `return` are on lines missing from this listing - confirm.
1158bool find_grp(
const string &str,
unsigned int line_num, vector<string> &grp_names,
1159 vector<unsigned int> &grp_lines, vector<unsigned int> &end_grp_lines) {
// The first non-blank character must be '<', and something must follow it.
1168 size_t non_space_char_pos = str.find_first_not_of(
' ');
1169 if (non_space_char_pos == string::npos)
1173 if (str[non_space_char_pos]!=
'<')
1177 if (str.size() <= (non_space_char_pos+1))
// The line must end with '>'.
1182 if (str[str.size()-1]!=
'>' )
1186 char char_2 = str[non_space_char_pos+1];
1191 string sep=
"Group name=\"";
1192 size_t sep_pos = str.find(sep,non_space_char_pos+1);
1195 if (sep_pos == string::npos){
// The search for the closing quote starts one character into the name.
1201 size_t end_name_pos = str.find(end_quote,sep_pos+sep.size()+1);
1202 if (end_name_pos == string::npos)
1206 string grp_name = str.substr(sep_pos+sep.size(),end_name_pos-sep_pos-sep.size());
1207 grp_names.push_back(grp_name);
1208 grp_lines.push_back(line_num);
// A '/' right behind the name: the group is opened and closed on the same line.
1213 if ((str.size() >(end_name_pos+1)) && str[end_name_pos+1]==
'/')
1214 end_grp_lines.push_back(line_num);
// Checks whether one line of a dmrpp file closes a group, i.e. ends with
// `</Group>`.
//   dmrpp_line    - one line of the dmrpp file
//   line_num      - number of the line being examined
//   end_grp_lines - out: line_num, appended on a match
// Returns true when the line ends with the closing tag.
bool find_end_grp(const string &dmrpp_line, unsigned int line_num, vector<unsigned int> &end_grp_lines) {

    const string end_grp = "</Group>";

    const size_t end_grp_pos = dmrpp_line.find(end_grp);
    if (end_grp_pos == string::npos)
        return false;

    // The first occurrence of the tag must be the very last thing on the line.
    if ((end_grp_pos + end_grp.size()) != dmrpp_line.size())
        return false;

    end_grp_lines.push_back(line_num);
    return true;
}
// Builds the group path ("/grp1/grp2...") of the variable found on line
// `var_line`.
//   gs_line_nums - line numbers on which groups start
//   ge_line_nums - line numbers on which groups end
//   grp_names    - group names, indexed like gs_line_nums
//   var_line     - line number of the variable
// NOTE(review): the declaration of `ret_value`, the loop increments, `break`,
// `else` and `return` statements are on lines missing from this listing; the
// comments describe only what the visible statements do. The column-0
// statements look like debug output that is compiled out - confirm.
1240string obtain_var_grp_paths(
const vector<unsigned int> &gs_line_nums,
1241 const vector<unsigned int> &ge_line_nums,
1242 const vector<string> &grp_names,
1243 unsigned int var_line) {
// Group start and end lines merged into one sequence; is_group_start tells,
// per entry, whether it is a start (true) or an end (false).
1246 vector<unsigned int> gse_line_nums;
1247 vector<bool> is_group_start;
1249 unsigned int end_grp_index = 0;
1250 unsigned int start_grp_index = 0;
// NOTE(review): unsigned arithmetic - this wraps around when gs_line_nums is empty.
1253 unsigned int max_grp_index = gs_line_nums.size() -1;
// Merge the two lists by line number.
1260 while (end_grp_index <= max_grp_index) {
1262 while (start_grp_index <= max_grp_index) {
// A start that precedes the current end is recorded as a start entry.
1264 if (gs_line_nums[start_grp_index] < ge_line_nums[end_grp_index]) {
1265 gse_line_nums.push_back(gs_line_nums[start_grp_index]);
1266 is_group_start.push_back(
true);
// Start and end fall on the same line.
1269 else if (gs_line_nums[start_grp_index] == ge_line_nums[end_grp_index]) {
1276 gse_line_nums.push_back(ge_line_nums[end_grp_index]);
1277 is_group_start.push_back(
false);
1281 if (end_grp_index < (max_grp_index+1)) {
1282 gse_line_nums.push_back(ge_line_nums[end_grp_index]);
1283 is_group_start.push_back(
false);
// Consistency check of the two parallel vectors.
1290 if (is_group_start.size() != gse_line_nums.size()) {
1291 cerr<<
"The group "<<endl;
1298for (
unsigned int i =0; i<gse_line_nums.size();i++) {
1299 cerr<<
"gse_line["<<i<<
"] = "<<gse_line_nums[i] <<endl;
1300 cerr<<
"is_group_start["<<i<<
"] = "<<is_group_start[i] <<endl;
// Entry of the merged sequence that belongs to var_line.
1305 int gse_line_index= obtain_gse_line_index(gse_line_nums,var_line);
1308cerr<<
"gse_line_index: "<<gse_line_index <<endl;
// Start lines of the groups that make up the path of the variable.
1312 vector<unsigned int> grp_path_lines;
1314 if (gse_line_index >= 0) {
1316 int temp_index = gse_line_index;
// Counts end entries seen while walking backwards that still need a matching
// start entry.
1319 unsigned int temp_rem_grp_index = 0;
// Walk backwards through the merged sequence.
1322 while (temp_index >= 0) {
1325 if (is_group_start[temp_index] ==
false)
1326 temp_rem_grp_index++;
// A start entry is kept when no end entry is pending; otherwise it cancels one
// pending end entry.
1330 if (temp_rem_grp_index == 0)
1331 grp_path_lines.push_back(gse_line_nums[temp_index]);
1333 temp_rem_grp_index--;
1341for (
const auto &gpl:grp_path_lines)
1342cerr<<
"grp_path_lines "<<gpl <<endl;
1343for (
const auto &gsn:gs_line_nums)
1344cerr<<
"gs_lines "<<gsn <<endl;
1345for (
const auto &gn:grp_names)
1346cerr<<
"group name is "<<gn <<endl;
// Translate the collected start lines into names. gl_index is not reset
// between iterations, so every search continues downwards from the previous
// position. Each name is prepended to the path built so far.
1351 int gl_index = gs_line_nums.size() - 1;
1353 for (
const auto &gpl:grp_path_lines) {
1358 for (; gl_index >= 0; gl_index--) {
1360 if (gpl == gs_line_nums[gl_index]) {
1362 ret_value =
"/" + grp_names[gl_index] + ret_value;
1370cerr<<
"ret_value is "<<ret_value <<endl;
// Determines the entry of gse_line_nums that belongs to line `var_line`.
//   gse_line_nums - line numbers of group starts and ends
//   var_line      - line number of a variable
// The list is scanned from its last entry towards the first, comparing every
// entry with var_line.
// NOTE(review): the declaration of the result, the statements executed for
// either outcome of the comparison and the `return` are on lines missing from
// this listing. The caller treats a negative result as "no entry"; presumably
// the result is the index of the last entry that is not greater than
// var_line - confirm.
1379int obtain_gse_line_index(
const vector<unsigned int> &gse_line_nums,
unsigned int var_line) {
1382 unsigned int total_gse_lines = gse_line_nums.size();
// The guard keeps total_gse_lines-1 from being evaluated for an empty list.
1384 if (total_gse_lines > 0) {
1386 for (
int i = total_gse_lines-1; i>=0 ; i--) {
1387 if (gse_line_nums[i] >var_line)
1399bool obtain_var_path_info(
const string &fname,
const vector<string> &var_type_list, vector<string> &var_fqn,
1400 vector<string> &var_type, vector<string> &var_name, vector<unsigned int> &var_lines) {
1402 vector<string> grp_names;
1403 vector<unsigned int> grp_lines;
1404 vector<unsigned int> end_grp_lines;
1406 bool has_group = obtain_var_grp_info(fname,var_type_list,grp_names,grp_lines,end_grp_lines,var_type, var_name,var_lines);
1408 cout <<
" the missing variable info shows this dmrpp has groups, however, no group is found. "<<endl;
1411 for (
unsigned int i =0; i <var_lines.size(); i++) {
1412 string var_path = obtain_var_grp_paths(grp_lines,end_grp_lines,grp_names,var_lines[i]);
1413 string vfqn = var_path +
"/" + var_name[i];
1414 var_fqn.push_back(vfqn);
1419for (
unsigned int i = 0; i <var_lines.size(); i++) {
1420cerr<<
" var fqn: "<<var_fqn[i] <<endl;
1421cerr<<
" var line: "<<var_lines[i] <<endl;
// Reads a dmrpp file line by line and records where groups start, where they
// end and where variables are declared.
//   fname         - dmrpp file to scan
//   var_type_list - element names that count as variable types
//   grp_names     - out: names of the groups found
//   grp_lines     - out: line numbers on which groups start
//   end_grp_lines - out: line numbers on which groups end
//   var_type      - out: types of the variables found
//   var_name      - out: names of the variables found
//   var_lines     - out: line numbers of the variables found
// Returns true when at least one group was found.
// NOTE(review): the declaration of `dmrpp_line` and the statement that
// advances `line_num` are on lines missing from this listing, so it cannot be
// told from here whether line numbers start at 0 or at 1 - confirm.
1429bool obtain_var_grp_info(
const string &fname,
const vector<string> &var_type_list, vector<string> &grp_names,
1430 vector<unsigned int> &grp_lines, vector<unsigned int> &end_grp_lines,
1431 vector<string> &var_type, vector<string> &var_name, vector<unsigned int> &var_lines) {
1436 bool find_grp_start =
false;
1437 bool find_grp_end =
false;
1439 unsigned int line_num = 0;
1441 ifstream dmrpp_fstream;
1442 dmrpp_fstream.open(fname.c_str(),ifstream::in);
// Each line is classified as group start, group end or variable declaration.
1444 while(getline(dmrpp_fstream,dmrpp_line)) {
1446 find_grp_start = find_grp(dmrpp_line,line_num,grp_names,grp_lines,end_grp_lines);
// A line is checked for a group end only when it does not start a group.
// find_grp_end keeps its previous value when this check is skipped.
1448 if (find_grp_start ==
false)
1449 find_grp_end = find_end_grp(dmrpp_line,line_num,end_grp_lines);
// The variable check runs only when both flags are false.
1450 if (!find_grp_start && !find_grp_end)
1451 find_var(dmrpp_line,var_type_list,var_type, var_name,var_lines,line_num);
1455 return !(grp_names.empty());
1459bool merge_chunk_info_g(
const string &fname,
const vector<string> &var_type,
const vector<string> &var_name,
1460 const vector<string> &var_candidate_fqn,
const vector<string> &miss_var_fqn,
1461 const vector<string> &miss_var_type,
const vector<string> &chunk_info) {
1464 bool ret_value =
true;
1467 file_to_string(fname,dmrpp_str);
1470cout <<
"dmrpp_str is "<<dmrpp_str<<endl;
1475 vector<size_t> var_candidate_pos;
1476 ret_value = obtain_miss_var_candidate_pos(dmrpp_str, var_type, var_name,var_candidate_pos);
1478 if (ret_value ==
false)
1481for (
const auto &vcp:var_candidate_pos)
1482 cout <<
"pos is: "<<vcp <<endl;
1487 vector<string> dmrpp_vec;
1488 vector<string> ordered_chunk_info;
1491 vector<string> final_var_type;
1492 vector<size_t> var_pos;
1493 obtain_final_miss_var_info(var_candidate_fqn,miss_var_fqn,miss_var_type,final_var_type,var_candidate_pos, var_pos,chunk_info,ordered_chunk_info);
1496for (
const auto &oci:ordered_chunk_info)
1497 cout <<
"chunk info: "<<oci <<endl;
1498for (
const auto &fvt:final_var_type)
1499 cout <<
"fvt: "<<fvt <<endl;
1503 string_tokenize_by_pos(dmrpp_str, var_pos, dmrpp_vec);
1505 ret_value = merge_chunk_info_vec(dmrpp_vec, final_var_type, ordered_chunk_info);
1506 if (ret_value ==
true)
1507 write_vec_to_file(fname,dmrpp_vec);
/**
 * @brief Find the offset of each variable's opening tag inside a dmrpp document string.
 *
 * For entry i the text `<TYPE name="NAME">` is built from var_type[i] and
 * var_name[i] and searched for in dmrpp_str. Each search starts right after
 * the previous match, so the variables must occur in the document in the same
 * order as they are listed in var_name.
 *
 * @param dmrpp_str Document text to search.
 * @param var_type Variable types; must hold at least var_name.size() entries.
 * @param var_name Variable names, in document order.
 * @param[out] var_pos Receives the offset of every tag that was found.
 * @return true when every tag was found; false at the first one that was not
 *         (a message is written to standard output).
 *
 * NOTE(review): abandoning the search at the first miss is reconstructed -
 * confirm against the upstream file.
 */
bool obtain_miss_var_candidate_pos(const string &dmrpp_str, const vector<string> &var_type,
                                   const vector<string> &var_name, vector<size_t> &var_pos) {

    size_t search_from = 0;

    for (size_t i = 0; i < var_name.size(); i++) {

        const string var_sign = "<" + var_type[i] + " name=\"" + var_name[i] + "\">";
        const size_t v_pos = dmrpp_str.find(var_sign, search_from);

        if (v_pos == string::npos) {
            // A space was added before "in": the original message ran the
            // variable name and the following word together.
            cout << "Cannot find the var name " << var_name[i] << " in the original dmrpp file " << endl;
            return false;
        }

        var_pos.push_back(v_pos);

        // Continue after this tag so that a later variable is never matched
        // at or before an earlier one.
        search_from = v_pos + var_sign.size();
    }

    return true;
}
/**
 * @brief Select the candidate variables that are listed as missing, in candidate order.
 *
 * Every entry of var_fqn is compared with the entries of miss_var_fqn. For a
 * match (i, j) the candidate position var_candidate_pos[i], the type
 * miss_var_type[j] and the chunk info chunk_info[j] are appended to the three
 * output vectors, which therefore follow the order of var_fqn.
 *
 * @param var_fqn Fully qualified names of the candidates.
 * @param miss_var_fqn Fully qualified names of the missing variables.
 * @param miss_var_type Types, parallel to miss_var_fqn.
 * @param[out] final_var_type Receives the type of every selected variable.
 * @param var_candidate_pos Positions, parallel to var_fqn.
 * @param[out] var_pos Receives the position of every selected variable.
 * @param chunk_info Chunk information, parallel to miss_var_fqn.
 * @param[out] ordered_chunk_info Receives the chunk info of every selected variable.
 *
 * NOTE(review): stopping at the first match for each candidate is
 * reconstructed; it only matters if miss_var_fqn holds duplicates - confirm.
 */
void obtain_final_miss_var_info(const vector<string> &var_fqn, const vector<string> &miss_var_fqn,
                                const vector<string> &miss_var_type, vector<string> &final_var_type,
                                const vector<size_t> &var_candidate_pos, vector<size_t> &var_pos,
                                const vector<string> &chunk_info, vector<string> &ordered_chunk_info) {

    for (size_t i = 0; i < var_fqn.size(); i++) {
        for (size_t j = 0; j < miss_var_fqn.size(); j++) {

            if (var_fqn[i] != miss_var_fqn[j])
                continue;

            var_pos.push_back(var_candidate_pos[i]);
            final_var_type.push_back(miss_var_type[j]);
            ordered_chunk_info.push_back(chunk_info[j]);
            break;
        }
    }
}
1554bool merge_chunk_info_vec(vector<string> &dmrpp_vec,
const vector<string> &miss_var_type,
1555 const vector<string> &ordered_chunk_info) {
1557 bool ret_value =
true;
1559 for (
unsigned int i = 1; i < dmrpp_vec.size(); i++) {
1560 string temp_dmrpp_seg = dmrpp_vec[i];
1561 ret_value = insert_chunk_info_to_vec(temp_dmrpp_seg, miss_var_type[i-1], ordered_chunk_info[i-1]);
1562 if (ret_value ==
false)
1565 dmrpp_vec[i] = temp_dmrpp_seg;
1571bool insert_chunk_info_to_vec(
string &dmrpp_block_str,
const string &var_type,
const string &chunk_info) {
1573 bool ret_value =
true;
1574 string end_var =
"</" + var_type +
'>';
1575 size_t end_var_pos = dmrpp_block_str.find(end_var);
1577 if (end_var_pos == string::npos) {
1578 cout <<
"Cannot find:\n "<<end_var <<
" \n in the string \n"<<dmrpp_block_str <<endl;
1583 char add_chunk_mark =
'>';
1584 size_t chunk_mark_pos = dmrpp_block_str.rfind(add_chunk_mark,end_var_pos);
1585 if (chunk_mark_pos == string::npos) {
1586 cout <<
"Cannot find:\n "<<add_chunk_mark <<
" \n in the string \n"<<dmrpp_block_str <<endl;
1590 string before_chunk_info_str = dmrpp_block_str.substr(0,chunk_mark_pos+1);
1591 string after_chunk_info_str = dmrpp_block_str.substr(chunk_mark_pos+1);
1592 dmrpp_block_str = before_chunk_info_str +
'\n' + chunk_info + after_chunk_info_str;