bes Updated for version 3.21.1
The Backend Server (BES) is the lower two tiers of the Hyrax data server
reduce_mdf.cc
1#include <iostream>
2#include <sstream>
3#include <fstream>
4#include <string.h>
5#include <stdio.h>
6#include <vector>
7#include <openssl/sha.h>
8#include <unistd.h>
9
10using namespace std;
11
12bool obtain_offset_nbytes(const vector<string>& str_vec, vector<size_t>& offsets, vector<size_t>& nbytes);
13
14void string_tokenize(const string &in_str,const char delim,vector<string>&out_vec);
15
16size_t string_to_size_t(const string& str);
17
18bool retrieve_chunk_info(FILE*,vector<size_t> &offsets,vector<size_t> &nbytes);
19
20string retrieve_data_sha256(FILE*,const vector<size_t> &offsets,const vector<size_t> &nbytes);
21
22short write_sha256_file(char* m_dmrpp_fname,char* m_h5_fname,char* m_sha256_fname,const string & sha256_buf);
23
24short update_sha256_file(char* m_dmrpp_fname,char* m_h5_fname,char* m_sha256_fname,char*stored_fname,const string & sha256_buf);
25
// Convert one byte to its two-character, lowercase, zero-padded hexadecimal form.
//
// Every byte must yield exactly two digits. Without the padding a byte below
// 0x10 collapses to a single digit, so a concatenation of converted bytes has
// no fixed width and different byte sequences can produce the same text
// (0x01,0x23 and 0x12,0x03 would both print "123").
//
// NOTE(review): text produced by an unpadded conversion will not compare equal
// to the output of this function whenever a byte is below 0x10 - confirm whether
// previously recorded values need to be regenerated.
//
// s: the byte to convert.
// Returns a two-character string such as "0a" or "ff".
std::string to_hex(unsigned char s) {
    static const char hex_digits[] = "0123456789abcdef";

    std::string out(2, '0');
    out[0] = hex_digits[(s >> 4) & 0x0f];   // high nibble
    out[1] = hex_digits[s & 0x0f];          // low nibble
    return out;
}
31
32// If the return value is 0, the sha256 exists, no need to use the generated HDF5 file.
33// If the return value is 1, the sha256 doesn't exist, need to use the generated HDF5 file.
34int main(int argc,char **argv ) {
35
36 if(argc !=5) {
37 cout<<"Please provide four arguments: "<< endl;
38 cout<<" The first is the dmrpp file that contains the missing variable value information. "<<endl;
39 cout<<" The second is the hdf5 file path that stores the missing variable values. "<<endl;
40 cout<<" The third is the text file that stores the file path and the sha256 value." <<endl;
41 cout<<" The fourth is the text file that stores the final HDF5 file path for this dmrpp file. "<<endl;
42 }
43
44 // Retrieve the chunk info from the dmrpp file.
45 FILE* fp_dmrpp = fopen(argv[1],"r");
46 if(fp_dmrpp == NULL) {
47 cout<<"The dmrpp file doesn't exist"<<endl;
48 return -1;
49 }
50
51 vector<size_t>offsets;
52 vector<size_t>nbytes;
53 bool ret_chunk = retrieve_chunk_info(fp_dmrpp,offsets,nbytes);
54 if(false == ret_chunk) {
55 cout<<"Cannot retrieve the chunk info from the dmrpp file successfully. "<<endl;
56 return -1;
57 }
58 fclose(fp_dmrpp);
59
60 // Obtain the sha256.
61 FILE* fp_h5 = fopen(argv[2],"r");
62 if(fp_h5 == NULL) {
63 cout<<"The HDF5 file doesn't exist"<<endl;
64 return -1;
65 }
66
67 string sha256_buf = retrieve_data_sha256(fp_h5,offsets,nbytes);
68 if(sha256_buf=="") {
69 cout<<"The sha256 of this file doesn't exist"<<endl;
70 return -1;
71 }
72 fclose(fp_h5);
73
74 // Store the sha256 if necessary to a file.
75 short ret_value = update_sha256_file(argv[1],argv[2],argv[3],argv[4],sha256_buf);
76 //return ret_value;
77 return ret_value;
78}
79
// Append one record to the sha256 recording file.
//
// The record is a single line of the form
//   <m_h5_fname> <m_dmrpp_fname> <sha256_buf>\n
// and the file is opened in append mode, so it is created when absent.
//
// m_dmrpp_fname:  path of the dmrpp file.
// m_h5_fname:     path of the HDF5 file that holds the missing data.
// m_sha256_fname: path of the recording file to append to.
// sha256_buf:     the sha256 text to record.
//
// Returns 1 when the whole record was written, -1 when the file cannot be
// opened, the write is short, or closing (flushing) the file fails.
short write_sha256_file(char* m_dmrpp_fname,char* m_h5_fname,char* m_sha256_fname,const std::string & sha256_buf) {

    FILE* fp = fopen(m_sha256_fname,"a");
    if(fp == nullptr)
        return -1;

    const std::string record = std::string(m_h5_fname) + ' ' + std::string(m_dmrpp_fname) + ' ' + sha256_buf + '\n';
    const size_t written = fwrite(record.data(),1,record.size(),fp);

    // fclose flushes buffered output; a failure means the record may be incomplete on disk.
    const bool closed_ok = (fclose(fp) == 0);

    if(written != record.size() || !closed_ok)
        return -1;
    return 1;
}
97
98// Update the sha256 in the recording file if necessary.
99short update_sha256_file(char* m_dmrpp_fname,char* m_h5_fname,char* m_sha256_fname,char* store_h5_fname,const string & sha256_buf) {
100
101 // If the recording file that stores thesha256 doesn't exist,
102 // just create this file and write the sha256 etc information to the file.
103
104 /* removed 11.10.20 SBL
105 * removed due to sonarcloud claiming that having the following before the if statement on ln 126
106 * causes a race condition and a security vulnerability
107 */
108 //if(access(m_sha256_fname,F_OK)==-1)
109 // return write_sha256_file(m_dmrpp_fname,m_h5_fname,m_sha256_fname,sha256_buf);
110
111 //
112 // If the recording file exists, open this file and see if the sha256 of
113 // this missing data can be found from the recording file.
114 // If the sha256 can be found,then the missing data file exists, we don't
115 // need to create a new one, otherwise, a new one needs to be created.
116 // If the sha256 can be found, we need to create a temp. text file to store
117 // the missing data file name so that this information can be passed to
118 // the patched dmrpp program afterwards.
119 short ret_value = 1;
120 ifstream sha_fstream;
121 sha_fstream.open(m_sha256_fname,ifstream::in);
122 /* added 11.10.20 SBL
123 * added check to fix race condition vulnerability detected by SonarCloud
124 * checks if stream was opened correctly and if not creates the file and opens again
125 */
126 if (!sha_fstream.is_open()){
127 return write_sha256_file(m_dmrpp_fname,m_h5_fname,m_sha256_fname,sha256_buf);
128#if 0
129 //sha_fstream.open(m_sha256_fname,ifstream::in);
130#endif
131 }
132 string sha_line;
133 char space_char=' ';
134 //char end_line='\n';
135 bool space_fname_ret = true;
136 bool need_add_sha256 = true;
137
138 while(getline(sha_fstream,sha_line)) {
139
140 size_t fname_epos = sha_line.find(space_char);
141 if(fname_epos==string::npos) {
142 space_fname_ret = false;
143 break;
144 }
145
146 size_t dname_epos = sha_line.find(space_char,fname_epos+1);
147 if(dname_epos==string::npos) {
148 space_fname_ret = false;
149 break;
150 }
151
152 string f_sha256_buf = sha_line.substr(dname_epos+1);
153 if(f_sha256_buf == sha256_buf) {
154
155 need_add_sha256 = false;
156
157 string exist_m_h5_name = sha_line.substr(0,fname_epos);
158 string exist_m_dmrpp_name = sha_line.substr(fname_epos+1,dname_epos-fname_epos-1);
159
160 // Open the file to store the HDF5 and dmrpp file
161 FILE*fp = fopen(store_h5_fname,"a");
162 string file_content = exist_m_h5_name +' '+exist_m_dmrpp_name;
163 vector<char>buf(file_content.begin(),file_content.end());
164 size_t fsize = fwrite(buf.data(),1,file_content.size(),fp);
165 if(fsize != file_content.size())
166 ret_value = -1;
167 fclose(fp);
168 break;
169 }
170 }
171 sha_fstream.close();
172
173
174 if(false == space_fname_ret)
175 ret_value = -1;
176 if(false == need_add_sha256)
177 ret_value = 0;
178
179 // sha256 is not found, append this sha256 and the missing data file name to the recording file.
180 if(true == space_fname_ret) {
181 if(true == need_add_sha256) {
182 ret_value = write_sha256_file(m_dmrpp_fname,m_h5_fname,m_sha256_fname,sha256_buf);
183 }
184 }
185
186 return ret_value;
187}
188
189// Obtain the sha256 from the data values.
190string retrieve_data_sha256(FILE*fp,const vector<size_t> &offsets,const vector<size_t> &nbytes){
191
192 string ret_str;
193 size_t fSize = 0;
194 unsigned char hash[SHA256_DIGEST_LENGTH];
195
196 // This is the buffer size
197 for(size_t i = 0; i <nbytes.size();i++)
198 fSize+=nbytes[i];
199
200 // Read in the offset and byte information.
201 vector<char>buf;
202 buf.resize(fSize);
203
204 size_t cur_size = 0;
205 for(size_t i = 0; i<offsets.size();i++) {
206 // Seek according to offset
207 if(fseek(fp,offsets[i],SEEK_SET)!=0)
208 return ret_str;
209 /* unused size_t result =*/ fread(&buf[cur_size],1,nbytes[i],fp);
210 cur_size +=nbytes[i];
211 }
212
213 // Calculate the hash
214 SHA256((const unsigned char*)buf.data(),fSize,hash);
215
216 string output="";
217
218 // Change 256 to hex and to a string
219 for(int i =0; i<SHA256_DIGEST_LENGTH;i++)
220 output+=to_hex(hash[i]);
221
222 return output;
223}
224
225// Retrieve the offsets and number of bytes of variable values.
226bool retrieve_chunk_info(FILE*fp,vector<size_t> &offsets,vector<size_t> &nbytes) {
227
228 size_t fSize = 0;
229
230 // Read in the offset and byte information.
231 if(fseek(fp,0,SEEK_END)!=0)
232 return false;
233 fSize = ftell(fp);
234#if 0
235 // fSize is unsigned. jhrg 11/23/21
236 if(fSize <0)
237 return false;
238#endif
239
240 if(fseek(fp,0,SEEK_SET)!=0)
241 return false;
242
243 vector<char>buf;
244 buf.resize((size_t)fSize);
245 size_t result = fread(buf.data(),1,fSize,fp);
246 if(result != fSize)
247 return false;
248
249 string str(buf.begin(),buf.end());
250 char delim='\n';
251 vector<string> str_vec;
252 string_tokenize(str,delim,str_vec);
253
254 bool get_offset_nbytes = obtain_offset_nbytes(str_vec,offsets,nbytes);
255 if(false == get_offset_nbytes) {
256 cout<<"cannot successfully obtain the offset and nbytes. \n";
257 return false;
258 }
259
260#if 0
261 for (int i = 0; i <offsets.size();i++) {
262 cout<<"offset["<<i<<"]= " <<offsets[i] <<endl;
263 cout<<"nbyte["<<i<<"]= " <<nbytes[i] <<endl;
264 }
265#endif
266
267 return get_offset_nbytes;
268
269}
270
271// Obtain the offset and number of bytes from the dmrpp file.
272// Here we don't need to worry about the filters. We just want to
273// make sure the data values(either in compressed form or uncompressed form)
274// can be retrieved.
275bool obtain_offset_nbytes(const vector<string>& str_vec, vector<size_t>& offsets, vector<size_t>& nbytes){
276
277 bool ret=true;
278 vector<string>chunk_info_str;
279 string delim1 ="chunk offset=\"";
280 string delim2 ="nBytes=\"";
281 string delim3="\"";
282
283 vector<size_t> unfiltered_offsets;
284 vector<size_t> unfiltered_nbytes;
285
286 // Pick up the line that includes chunk offset and save them to a vector.
287 for(size_t i = 0; i <str_vec.size(); i++)
288 if(str_vec[i].find(delim1)!=string::npos)
289 chunk_info_str.push_back(str_vec[i]);
290
291 // Obtain the offsets and number of bytes and save them to vectors.
292 for(size_t i = 0; i<chunk_info_str.size();i++) {
293 size_t co_spos = chunk_info_str[i].find(delim1);
294 size_t co_epos = chunk_info_str[i].find(delim3,co_spos+delim1.size());
295 if(co_epos==string::npos) {
296 ret = false;
297 break;
298 }
299 string temp_offset=chunk_info_str[i].substr(co_spos+delim1.size(),co_epos-co_spos-delim1.size());
300 unfiltered_offsets.push_back(string_to_size_t(temp_offset));
301
302 size_t nb_spos = chunk_info_str[i].find(delim2,co_epos);
303 size_t nb_epos = chunk_info_str[i].find(delim3,nb_spos+delim2.size());
304 if(nb_epos==string::npos) {
305 ret = false;
306 break;
307 }
308 string temp_nbyte=chunk_info_str[i].substr(nb_spos+delim2.size(),nb_epos-nb_spos-delim2.size());
309 unfiltered_nbytes.push_back(string_to_size_t(temp_nbyte));
310
311 }
312
313 // Remove nbyte = 0 case. This is a bug caused by build_dmrpp. Before that is fixed, we
314 // remove this case since this fortuately doesn't affect our purpose and the patch_dmrpp program.
315 if(true == ret) {
316 for(size_t i = 0; i<unfiltered_nbytes.size();i++) {
317 if(unfiltered_nbytes[i] != 0) {
318 offsets.push_back(unfiltered_offsets[i]);
319 nbytes.push_back(unfiltered_nbytes[i]);
320 }
321 }
322 }
323
324 return ret;
325}
326
// Split in_str at every occurrence of delim and append the pieces, in order,
// to out_vec. Adjacent delimiters produce empty pieces; a delimiter at the very
// end of the input does not produce a trailing empty piece, and an empty input
// appends nothing. Existing contents of out_vec are kept.
void string_tokenize(const std::string &in_str,const char delim,std::vector<std::string>&out_vec) {
    std::string::size_type piece_begin = 0;
    while (piece_begin < in_str.size()) {
        const std::string::size_type piece_end = in_str.find(delim,piece_begin);
        if (piece_end == std::string::npos) {
            // Last piece: no delimiter follows it.
            out_vec.push_back(in_str.substr(piece_begin));
            return;
        }
        out_vec.push_back(in_str.substr(piece_begin,piece_end-piece_begin));
        piece_begin = piece_end + 1;
    }
}
335
// Convert string to size_t.
//
// Parses a leading unsigned decimal number from str using stream extraction;
// leading whitespace is skipped and trailing non-digit text is ignored.
//
// str: the text to convert.
// Returns the parsed number, or 0 when str is empty or contains only
// whitespace. The result starts at 0 because in that case the extraction never
// stores a value and the variable would otherwise be returned uninitialized.
size_t string_to_size_t(const std::string& str) {
    std::stringstream sstream(str);
    size_t str_num = 0;
    sstream >> str_num;
    return str_num;
}
343
344