bes Updated for version 3.21.1
The Backend Server (BES) is the lower two tiers of the Hyrax data server
CurlUtils.cc
1// -*- mode: c++; c-basic-offset:4 -*-
2// This file is part of the BES http package, part of the Hyrax data server.
3//
4// Copyright (c) 2020 OPeNDAP, Inc.
5// Author: Nathan Potter <ndp@opendap.org>
6//
7// This library is free software; you can redistribute it and/or
8// modify it under the terms of the GNU Lesser General Public
9// License as published by the Free Software Foundation; either
10// version 2.1 of the License, or (at your option) any later version.
11//
12// This library is distributed in the hope that it will be useful,
13// but WITHOUT ANY WARRANTY; without even the implied warranty of
14// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15// Lesser General Public License for more details.
16//
17// You should have received a copy of the GNU Lesser General Public
18// License along with this library; if not, write to the Free Software
19// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20//
21// You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
22// Authors:
23// ndp Nathan Potter <ndp@opendap.org>
24
25#include "config.h"
26
27#include <unistd.h>
28#include <fcntl.h>
29#include <ctime>
30#include <cstring>
31
32#include <curl/curl.h>
33
34#include <sstream>
35#include <vector>
36#include <algorithm> // std::for_each
37
38#include "BESContextManager.h"
39#include "BESSyntaxUserError.h"
40#include "BESInternalError.h"
41#include "HttpError.h"
42#include "BESDebug.h"
43#include "BESRegex.h"
44#include "TheBESKeys.h"
45#include "BESLog.h"
46#include "BESStopWatch.h"
47
48#include "HttpNames.h"
49
50#include "HttpUtils.h"
51#include "ProxyConfig.h"
52#include "AllowedHosts.h"
53#include "CurlUtils.h"
54#include "CredentialsManager.h"
55#include "AccessCredentials.h"
56#include "RequestServiceTimer.h"
57
58#include "awsv4.h"
59#include "url_impl.h"
60
61#define MODULE "curl"
62#define CURL_TIMING "curl:timing"
63
64using namespace AWSV4;
65using namespace http;
66using namespace std;
67
68#define prolog std::string("CurlUtils::").append(__func__).append("() - ")
69
70namespace curl {
71
// Forward declaration: the file-descriptor-aware overload is defined further down in this
// file, after the single-argument super_easy_perform() that calls it.
static void super_easy_perform(CURL *c_handle, int fd);

// Number of attempts super_easy_perform() makes before it gives up and throws.
const unsigned int retry_limit = 3; // 10; // Amazon's suggestion
// Initial pause between attempts; super_easy_perform() doubles its working copy after each failure.
const useconds_t url_retry_time = 250'000; // 1/4 second in micro seconds

// Set this to 1 to turn on libcurl's verbose mode (for debugging).
const int curl_trace = 0;
79
// Inclusive range of HTTP client-error codes that have an entry in http_client_errors.
// The table is indexed by (code - CLIENT_ERR_MIN), so it must hold exactly
// (CLIENT_ERR_MAX - CLIENT_ERR_MIN + 1) entries, in code order.
const int CLIENT_ERR_MIN = 400;
const int CLIENT_ERR_MAX = 417;
const std::vector<std::string> http_client_errors = {
        "Bad Request:",
        "Unauthorized: Contact the server administrator.",
        "Payment Required.",
        "Forbidden: Contact the server administrator.",
        "Not Found: The underlying data source or server could not be found.",
        "Method Not Allowed.",
        "Not Acceptable.",
        "Proxy Authentication Required.",
        "Request Time-out.",
        "Conflict.",
        "Gone.",
        "Length Required.",
        "Precondition Failed.",
        "Request Entity Too Large.",
        "Request URI Too Large.",
        "Unsupported Media Type.",
        "Requested Range Not Satisfiable.",
        "Expectation Failed."
};

// Inclusive range of HTTP server-error codes that have an entry in http_server_errors.
// Indexed by (code - SERVER_ERR_MIN); same sizing rule as the client table.
const int SERVER_ERR_MIN = 500;
const int SERVER_ERR_MAX = 505;
const std::vector<std::string> http_server_errors = {
        "Internal Server Error.",
        "Not Implemented.",
        "Bad Gateway.",
        "Service Unavailable.",
        "Gateway Time-out.",
        "HTTP Version Not Supported."
};

/**
 * @brief Translate an HTTP status code into a short human-readable description.
 *
 * @param code The HTTP status code.
 * @return The table entry for codes in [CLIENT_ERR_MIN, CLIENT_ERR_MAX] or
 * [SERVER_ERR_MIN, SERVER_ERR_MAX]; otherwise "Unknown HTTP Error: " followed by the code.
 */
static std::string http_code_to_string(long code) {
    const bool is_client_error = CLIENT_ERR_MIN <= code && code <= CLIENT_ERR_MAX;
    if (is_client_error) {
        const auto slot = static_cast<std::size_t>(code - CLIENT_ERR_MIN);
        return http_client_errors[slot];
    }

    const bool is_server_error = SERVER_ERR_MIN <= code && code <= SERVER_ERR_MAX;
    if (is_server_error) {
        const auto slot = static_cast<std::size_t>(code - SERVER_ERR_MIN);
        return http_server_errors[slot];
    }

    // Anything outside the two tables (including success and redirect codes).
    return "Unknown HTTP Error: " + std::to_string(code);
}
131
137static string getCurlAuthTypeName(unsigned long auth_type) {
138
139 string authTypeString;
140 unsigned long match;
141
142 match = auth_type & CURLAUTH_BASIC;
143 if (match) {
144 authTypeString += "CURLAUTH_BASIC";
145 }
146
147 match = auth_type & CURLAUTH_DIGEST;
148 if (match) {
149 if (!authTypeString.empty())
150 authTypeString += " ";
151 authTypeString += "CURLAUTH_DIGEST";
152 }
153
154 match = auth_type & CURLAUTH_DIGEST_IE;
155 if (match) {
156 if (!authTypeString.empty())
157 authTypeString += " ";
158 authTypeString += "CURLAUTH_DIGEST_IE";
159 }
160
161 match = auth_type & CURLAUTH_GSSNEGOTIATE;
162 if (match) {
163 if (!authTypeString.empty())
164 authTypeString += " ";
165 authTypeString += "CURLAUTH_GSSNEGOTIATE";
166 }
167
168 match = auth_type & CURLAUTH_NTLM;
169 if (match) {
170 if (!authTypeString.empty())
171 authTypeString += " ";
172 authTypeString += "CURLAUTH_NTLM";
173 }
174
175 return authTypeString;
176}
177
182#define CURL_WRITE_TO_FILE_TIMEOUT_MSG "The function curl::writeToOpenFileDescriptor() was unable to complete the download process because it ran out of time."
183
184static size_t writeToOpenFileDescriptor(const char *data, size_t /* size */, size_t nmemb, const void *userdata) {
185
186 const auto fd = static_cast<const int *>(userdata);
187
188 BESDEBUG(MODULE, prolog << "Bytes received: " << nmemb << endl);
189 size_t bytes_written = write(*fd, data, nmemb);
190 BESDEBUG(MODULE, prolog << " Bytes written: " << bytes_written << endl);
191
192 // Verify the request hasn't exceeded bes_timeout, and throw if it has...
193 RequestServiceTimer::TheTimer()->throw_if_timeout_expired(CURL_WRITE_TO_FILE_TIMEOUT_MSG, __FILE__, __LINE__);
194
195 return bytes_written;
196}
197
198
221static size_t save_http_response_headers(void *ptr, size_t size, size_t nmemb, void *resp_hdrs) {
222 BESDEBUG(MODULE, prolog << "Inside the header parser." << endl);
223 auto hdrs = static_cast<vector<string> * >(resp_hdrs);
224
225 // Grab the header, minus the trailing newline. Or \r\n pair.
226 string complete_line;
227 if (nmemb > 1 && *(static_cast<char *>(ptr) + size * (nmemb - 2)) == '\r')
228 complete_line.assign(static_cast<char *>(ptr), size * (nmemb - 2));
229 else
230 complete_line.assign(static_cast<char *>(ptr), size * (nmemb - 1));
231
232 // Store all non-empty headers that are not HTTP codes
233 if (!complete_line.empty() && complete_line.find("HTTP") == string::npos) {
234 BESDEBUG(MODULE, prolog << "Header line: " << complete_line << endl);
235 hdrs->push_back(complete_line);
236 }
237
238 return size * nmemb;
239}
240
248static int curl_debug(const CURL *, curl_infotype info, const char *msg, size_t size, const void *) {
249 string message(msg, size);
250
251 switch (info) {
252 case CURLINFO_TEXT:
253 BESDEBUG(MODULE, prolog << "Text: " << message << endl);
254 break;
255 case CURLINFO_HEADER_IN:
256 BESDEBUG(MODULE, prolog << "Header in: " << message << endl);
257 break;
258 case CURLINFO_HEADER_OUT:
259 BESDEBUG(MODULE, prolog << "Header out: " << endl << message << endl);
260 break;
261 case CURLINFO_DATA_IN:
262 BESDEBUG(MODULE, prolog << "Data in: " << message << endl);
263 break;
264 case CURLINFO_DATA_OUT:
265 BESDEBUG(MODULE, prolog << "Data out: " << message << endl);
266 break;
267 case CURLINFO_END:
268 BESDEBUG(MODULE, prolog << "End: " << message << endl);
269 break;
270#ifdef CURLINFO_SSL_DATA_IN
271 case CURLINFO_SSL_DATA_IN:
272 BESDEBUG(MODULE, prolog << "SSL Data in: " << message << endl ); break;
273#endif
274#ifdef CURLINFO_SSL_DATA_OUT
275 case CURLINFO_SSL_DATA_OUT:
276 BESDEBUG(MODULE, prolog << "SSL Data out: " << message << endl ); break;
277#endif
278 default:
279 BESDEBUG(MODULE, prolog << "Curl info: " << message << endl);
280 break;
281 }
282 return 0;
283}
284
/**
 * @brief Detach the error buffer from a cURL easy handle.
 *
 * Calls set_error_buffer() with a null buffer. Code in this file calls it once it has
 * finished with a function-local error buffer that was registered on the handle.
 * NOTE(review): presumably set_error_buffer() sets CURLOPT_ERRORBUFFER - confirm against
 * its definition.
 *
 * @param ceh The cURL easy handle.
 */
static void unset_error_buffer(CURL *ceh) {
    set_error_buffer(ceh, nullptr);
}
293
316static bool configure_curl_handle_for_proxy(CURL *ceh, const string &target_url) {
317 BESDEBUG(MODULE, prolog << "BEGIN." << endl);
318
319 bool using_proxy = http::ProxyConfig::theOne()->is_configured();
320 if (using_proxy) {
321
322 BESDEBUG(MODULE, prolog << "Proxy has been configured..." << endl);
323
324 http::ProxyConfig *proxy = http::ProxyConfig::theOne();
325
326 // TODO remove these local variables (if possible) and pass the values into curl_easy_setopt() directly from HttpUtils
327 string proxyHost = proxy->host();
328 int proxyPort = proxy->port();
329 string proxyPassword = proxy->proxy_password();
330 string proxyUser = proxy->user();
331 string proxyUserPW = proxy->password();
332 int proxyAuthType = proxy->auth_type();
333 string no_proxy_regex = proxy->no_proxy_regex();
334
335
336 // Don't set up the proxy server for URLs that match the 'NoProxy'
337 // regex set in the gateway.conf file.
338
339 // Don't create the regex if the string is empty
340 if (!no_proxy_regex.empty()) {
341 BESDEBUG(MODULE, prolog << "Found NoProxyRegex." << endl);
342 BESRegex r(no_proxy_regex.c_str());
343 if (r.match(target_url.c_str(), static_cast<int>(target_url.size())) != -1) {
344 BESDEBUG(MODULE,
345 prolog << "Found NoProxy match. BESRegex: " << no_proxy_regex << "; Url: " << target_url
346 << endl);
347 using_proxy = false;
348 }
349 }
350
351 if (using_proxy) {
352 CURLcode res;
353 vector<char> error_buffer(CURL_ERROR_SIZE, (char) 0);
354
355 BESDEBUG(MODULE, prolog << "Setting up a proxy server." << endl);
356 BESDEBUG(MODULE, prolog << "Proxy host: " << proxyHost << endl);
357 BESDEBUG(MODULE, prolog << "Proxy port: " << proxyPort << endl);
358
359 set_error_buffer(ceh, error_buffer.data());
360
361 res = curl_easy_setopt(ceh, CURLOPT_PROXY, proxyHost.data());
362 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXY", error_buffer.data(), __FILE__, __LINE__);
363
364 res = curl_easy_setopt(ceh, CURLOPT_PROXYPORT, proxyPort);
365 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXYPORT", error_buffer.data(), __FILE__, __LINE__);
366
367 // oddly "#ifdef CURLOPT_PROXYAUTH" doesn't work - even though CURLOPT_PROXYAUTH is defined and valued at 111 it
368 // fails the test. Eclipse hover over the CURLOPT_PROXYAUTH symbol shows: "CINIT(PROXYAUTH, LONG, 111)",
369 // for what that's worth
370
371 // According to http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTPROXYAUTH
372 // As of 4/21/08 only NTLM, Digest and Basic work.
373
374 res = curl_easy_setopt(ceh, CURLOPT_PROXYAUTH, proxyAuthType);
375 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXYAUTH", error_buffer.data(), __FILE__, __LINE__);
376 BESDEBUG(MODULE, prolog << "Using CURLOPT_PROXYAUTH = " << getCurlAuthTypeName(proxyAuthType) << endl);
377
378 if (!proxyUser.empty()) {
379 res = curl_easy_setopt(ceh, CURLOPT_PROXYUSERNAME, proxyUser.data());
380 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXYUSERNAME", error_buffer.data(), __FILE__,
381 __LINE__);
382 BESDEBUG(MODULE, prolog << "CURLOPT_PROXYUSERNAME : " << proxyUser << endl);
383
384 if (!proxyPassword.empty()) {
385 res = curl_easy_setopt(ceh, CURLOPT_PROXYPASSWORD, proxyPassword.data());
386 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXYPASSWORD", error_buffer.data(), __FILE__,
387 __LINE__);
388 BESDEBUG(MODULE, prolog << "CURLOPT_PROXYPASSWORD: " << proxyPassword << endl);
389 }
390 } else if (!proxyUserPW.empty()) {
391 res = curl_easy_setopt(ceh, CURLOPT_PROXYUSERPWD, proxyUserPW.data());
392 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXYUSERPWD", error_buffer.data(), __FILE__,
393 __LINE__);
394 BESDEBUG(MODULE, prolog << "CURLOPT_PROXYUSERPWD : " << proxyUserPW << endl);
395 }
396 unset_error_buffer(ceh);
397 }
398 }
399 BESDEBUG(MODULE, prolog << "END. using_proxy: " << (using_proxy ? "true" : "false") << endl);
400 return using_proxy;
401}
402
403// This is used in only one place.
404static CURL *init(CURL *ceh, const string &target_url, const curl_slist *http_request_headers,
405 vector <string> *http_response_hdrs) {
406 vector<char> error_buffer(CURL_ERROR_SIZE, (char) 0);
407 CURLcode res;
408
409 if (!ceh)
410 throw BESInternalError("Could not initialize cURL easy handle.", __FILE__, __LINE__);
411
412 // SET Error Buffer (for use during this setup) ----------------------------------------------------------------
413 set_error_buffer(ceh, error_buffer.data());
414
415 // Target URL --------------------------------------------------------------------------------------------------
416 res = curl_easy_setopt(ceh, CURLOPT_URL, target_url.c_str());
417 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_URL", error_buffer.data(), __FILE__, __LINE__);
418
419 // Load in the default headers to send with a request. The empty Pragma
420 // headers overrides libcurl's default Pragma: no-cache header (which
421 // will disable caching by Squid, etc.).
422 // the empty Pragma never appears in the outgoing headers when this isn't present
423 // d_request_headers->push_back(string("Pragma: no-cache"));
424 // d_request_headers->push_back(string("Cache-Control: no-cache"));
425
426 if (http_request_headers) {
427 // Add the http_request_headers to the cURL handle.
428 res = curl_easy_setopt(ceh, CURLOPT_HTTPHEADER, http_request_headers);
429 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_HTTPHEADER", error_buffer.data(), __FILE__, __LINE__);
430 }
431
432
433 if (http_response_hdrs) {
434 res = curl_easy_setopt(ceh, CURLOPT_HEADERFUNCTION, save_http_response_headers);
435 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_HEADERFUNCTION", error_buffer.data(), __FILE__, __LINE__);
436
437 // Pass save_http_response_headers() a pointer to the vector<string> where the
438 // response headers may be stored. Callers can use the resp_hdrs
439 // value/result parameter to get the raw response header information .
440 res = curl_easy_setopt(ceh, CURLOPT_WRITEHEADER, http_response_hdrs);
441 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_WRITEHEADER", error_buffer.data(), __FILE__, __LINE__);
442 }
443
444 // Allow compressed responses. Sending an empty string enables all supported compression types.
445#ifndef CURLOPT_ACCEPT_ENCODING
446 res = curl_easy_setopt(ceh, CURLOPT_ENCODING, "");
447 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_ENCODING", error_buffer.data(), __FILE__, __LINE__);
448#else
449 res = curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
450 check_setopt_result(res, prolog, "CURLOPT_ACCEPT_ENCODING", error_buffer, __FILE__,__LINE__);
451#endif
452 // Disable Progress Meter
453 res = curl_easy_setopt(ceh, CURLOPT_NOPROGRESS, 1L);
454 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_NOPROGRESS", error_buffer.data(), __FILE__, __LINE__);
455
456 // Disable cURL signal handling
457 res = curl_easy_setopt(ceh, CURLOPT_NOSIGNAL, 1L);
458 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_NOSIGNAL", error_buffer.data(), __FILE__, __LINE__);
459
460
461 // - - - - - - - - - - - - - - - - - - - - - - - - - - - -
462 // Authentication config.
463 //
464
465 // We have to set FailOnError to false for any of the non-Basic
466 // authentication schemes to work. 07/28/03 jhrg
467 res = curl_easy_setopt(ceh, CURLOPT_FAILONERROR, 0L);
468 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_FAILONERROR", error_buffer.data(), __FILE__, __LINE__);
469
470
471 // CURLAUTH_ANY means libcurl will use Basic, Digest, GSS Negotiate, or NTLM,
472 // choosing the 'safest' one supported by the server.
473 // This requires curl 7.10.6 which is still in pre-release. 07/25/03 jhrg
474 res = curl_easy_setopt(ceh, CURLOPT_HTTPAUTH, (long) CURLAUTH_ANY);
475 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_HTTPAUTH", error_buffer.data(), __FILE__, __LINE__);
476
477
478 // CURLOPT_NETRC means to use the netrc file for credentials.
479 // CURL_NETRC_OPTIONAL Means that if the supplied URL contains a username
480 // and password to prefer that to using the content of the netrc file.
481 res = curl_easy_setopt(ceh, CURLOPT_NETRC, CURL_NETRC_OPTIONAL);
482 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_NETRC", error_buffer.data(), __FILE__, __LINE__);
483
484 // If the configuration specifies a particular .netrc credentials file, use it.
485 string netrc_file = get_netrc_filename();
486 if (!netrc_file.empty()) {
487 res = curl_easy_setopt(ceh, CURLOPT_NETRC_FILE, netrc_file.c_str());
488 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_NETRC_FILE", error_buffer.data(), __FILE__, __LINE__);
489
490 }
491 VERBOSE(prolog + " is using the netrc file '"
492 + (!netrc_file.empty() ? netrc_file : "~/.netrc") + "'");
493
494
495 // - - - - - - - - - - - - - - - - - - - - - - - - - - - -
496 // Cookies
497 //
498 res = curl_easy_setopt(ceh, CURLOPT_COOKIEFILE, curl::get_cookie_filename().c_str());
499 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_COOKIEFILE", error_buffer.data(), __FILE__, __LINE__);
500
501 res = curl_easy_setopt(ceh, CURLOPT_COOKIEJAR, curl::get_cookie_filename().c_str());
502 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_COOKIEJAR", error_buffer.data(), __FILE__, __LINE__);
503
504 // save_http_response_headers
505
506 // Follow 302 (redirect) responses
507 res = curl_easy_setopt(ceh, CURLOPT_FOLLOWLOCATION, 1L);
508 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_FOLLOWLOCATION", error_buffer.data(), __FILE__, __LINE__);
509
510 res = curl_easy_setopt(ceh, CURLOPT_MAXREDIRS, max_redirects());
511 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_MAXREDIRS", error_buffer.data(), __FILE__, __LINE__);
512
513 // Set the user agent to Hyrax's user agent value
514 res = curl_easy_setopt(ceh, CURLOPT_USERAGENT, hyrax_user_agent().c_str());
515 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_USERAGENT", error_buffer.data(), __FILE__, __LINE__);
516
517 if (curl_trace) {
518 BESDEBUG(MODULE, prolog << "Curl version: " << curl_version() << endl);
519 res = curl_easy_setopt(ceh, CURLOPT_VERBOSE, 1L);
520 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_VERBOSE", error_buffer.data(), __FILE__, __LINE__);
521 BESDEBUG(MODULE, prolog << "Curl in verbose mode." << endl);
522
523 res = curl_easy_setopt(ceh, CURLOPT_DEBUGFUNCTION, curl_debug);
524 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_DEBUGFUNCTION", error_buffer.data(), __FILE__, __LINE__);
525 BESDEBUG(MODULE, prolog << "Curl debugging function installed." << endl);
526 }
527
528 // We unset the error buffer here because we know that curl::configure_curl_handle_for_proxy() will use it's own.
529 unset_error_buffer(ceh);
530 // Configure the proxy for this url (if appropriate).
531 curl::configure_curl_handle_for_proxy(ceh, target_url);
532
533 BESDEBUG(MODULE, prolog << "curl: " << (void *) ceh << endl);
534 return ceh;
535}
536
549CURL *init(const string &target_url,
550 const curl_slist *http_request_headers,
551 vector <string> *http_response_hdrs) {
552 CURL *swanky_new_curl_easy_handle = curl_easy_init();
553 return init(swanky_new_curl_easy_handle, target_url, http_request_headers, http_response_hdrs);
554}
555
556
557string get_range_arg_string(const unsigned long long &offset, const unsigned long long &size) {
558 ostringstream range; // range-get needs a string arg for the range
559 range << offset << "-" << offset + size - 1;
560 BESDEBUG(MODULE, prolog << " range: " << range.str() << endl);
561 return range.str();
562}
563
574static curl_slist *
575sign_url_for_s3_if_possible(const string &url, curl_slist *request_headers) {
576 // If this is a URL that references an S3 bucket, and there are credentials for the URL,
577 // sign the URL.
578 if (CredentialsManager::theCM()->size() > 0) {
579 auto ac = CredentialsManager::theCM()->get(url);
580 if (ac && ac->is_s3_cred()) {
581 BESDEBUG(MODULE, prolog << "Located S3 credentials for url: " << url
582 << " Using request headers to hold AWS signature\n");
583 request_headers = sign_s3_url(url, ac, request_headers);
584 }
585 else {
586 if(ac){
587 BESDEBUG(MODULE, prolog << "Located credentials for url: " << url << "They are "
588 << (ac->is_s3_cred()?"":"NOT ") << "S3 credentials.\n");
589 }
590 else {
591 BESDEBUG(MODULE, prolog << "Unable to locate credentials for url: " << url << "\n");
592 }
593 }
594 }
595
596 return request_headers;
597}
598
/**
 * @brief Convenience overload that takes the URL as a shared_ptr<url>.
 *
 * Forwards url->str() to the string overload of sign_url_for_s3_if_possible().
 * The pointer is dereferenced without a null check.
 *
 * @param url The URL that is about to be requested.
 * @param request_headers The current request header list.
 * @return The header list returned by the string overload.
 */
static curl_slist *
sign_url_for_s3_if_possible(const shared_ptr <url> &url, curl_slist *request_headers) {
    return sign_url_for_s3_if_possible(url->str(), request_headers);
}
613
623static string get_effective_url(CURL *ceh, const string &requested_url) {
624 char *effective_url = nullptr;
625 CURLcode curl_code = curl_easy_getinfo(ceh, CURLINFO_EFFECTIVE_URL, &effective_url);
626 if (curl_code != CURLE_OK) {
627 stringstream msg;
628 msg << prolog << "Unable to determine CURLINFO_EFFECTIVE_URL! Requested URL: " << requested_url;
629 BESDEBUG(MODULE, msg.str() << endl);
630 throw BESInternalError(msg.str(), __FILE__, __LINE__);
631 }
632 return effective_url;
633}
634
// Example of a signed URL:
// https://<<host>>/<<path>>?A-userid=jhrg&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=...&
// X-Amz-Date=20230417T193403Z&X-Amz-Expires=3467&X-Amz-Security-Token=...
/**
 * @brief Shorten a URL so that AWS signature parameters are not shown in messages.
 *
 * Everything from the first '&' onwards is dropped. If what remains still contains
 * "X-Amz-", everything from the first '?' onwards is dropped as well.
 *
 * @param eff_url The URL to filter.
 * @return The shortened URL; the input unchanged if it holds neither '&' nor "X-Amz-".
 */
std::string filter_aws_url(const std::string &eff_url) {
    // It seems unlikely that the X-Amz prefix will be in the first part of the query string
    // and the first part will likely be useful for the error message, so keep everything up
    // to the first '&'.
    const std::string leading_part = eff_url.substr(0, eff_url.find('&'));

    // If the X-Amz prefix did end up in the first part, drop the whole query string.
    const bool has_aws_param = leading_part.find("X-Amz-") != std::string::npos;
    if (has_aws_param)
        return leading_part.substr(0, leading_part.find('?'));

    return leading_part;
}
661
672
673// TODO If these regexes are complex, they will take a significant amount of time to compile. Fix.
674// A better solution is to compile the regex once and store the compiled regex for future use. It's
675// the compilation that takes a long time. jhrg 11/3/22
676bool is_retryable(const string &target_url) {
677 BESDEBUG(MODULE, prolog << "BEGIN" << endl);
678 bool retryable = true;
679
680 vector<string> nr_regexs;
681 bool found;
682 TheBESKeys::TheKeys()->get_values(HTTP_NO_RETRY_URL_REGEX_KEY, nr_regexs, found);
683 if (found) {
684 for (const auto &nr_regex: nr_regexs) {
685 BESDEBUG(MODULE, prolog << "nr_regex: '" << nr_regex << "'" << endl);
686 BESRegex no_retry_regex(nr_regex.c_str(), (int) nr_regex.size());
687 size_t match_length = no_retry_regex.match(target_url.c_str(), (int) target_url.size(), 0);
688 if (match_length == target_url.size()) {
689 BESDEBUG(MODULE, prolog << "The url: '" << target_url << "' fully matched the "
690 << HTTP_NO_RETRY_URL_REGEX_KEY << ": '" << nr_regex << "'" << endl);
691 retryable = false;
692 break;
693 }
694
695 }
696 }
697
698 BESDEBUG(MODULE, prolog << "END retryable: " << (retryable ? "true" : "false") << endl);
699 return retryable;
700}
701
724static bool eval_curl_easy_perform_code(
725 const string &eff_req_url,
726 CURLcode curl_code,
727 const char *error_buffer,
728 const unsigned int attempt
729) {
730 if (curl_code == CURLE_SSL_CONNECT_ERROR) {
731 stringstream msg;
732 msg << prolog << "ERROR - cURL experienced a CURLE_SSL_CONNECT_ERROR error. Message: ";
733 msg << curl::error_message(curl_code, error_buffer) << ". ";
734 msg << "A retry may be possible for: " << filter_aws_url(eff_req_url) << " (attempt: " << attempt << ")."
735 << endl;
736 BESDEBUG(MODULE, msg.str());
737 ERROR_LOG(msg.str());
738 return false;
739 } else if (curl_code == CURLE_SSL_CACERT_BADFILE) {
740 stringstream msg;
741 msg << prolog << "ERROR - cURL experienced a CURLE_SSL_CACERT_BADFILE error. Message: ";
742 msg << curl::error_message(curl_code, error_buffer) << ". ";
743 msg << "A retry may be possible for: " << filter_aws_url(eff_req_url) << " (attempt: " << attempt << ")."
744 << endl;
745 BESDEBUG(MODULE, msg.str());
746 ERROR_LOG(msg.str());
747 return false;
748 } else if (curl_code == CURLE_GOT_NOTHING) {
749 // First we check to see if the response was empty. This is a cURL error, not an HTTP error
750 // so we have to handle it like this. And we do that because this is one of the failure modes
751 // we see in the AWS cloud and by trapping this and returning false we are able to be resilient and retry.
752 stringstream msg;
753 msg << prolog << "ERROR - cURL returned CURLE_GOT_NOTHING. Message: ";
754 msg << error_message(curl_code, error_buffer) << ". ";
755 msg << "A retry may be possible for: " << filter_aws_url(eff_req_url) << " (attempt: " << attempt << ")."
756 << endl;
757 BESDEBUG(MODULE, msg.str());
758 ERROR_LOG(msg.str());
759 return false;
760 } else if (curl_code != CURLE_OK) {
761 stringstream msg;
762 msg << "ERROR - Problem with data transfer. Message: " << curl::error_message(curl_code, error_buffer);
763 msg << " CURLINFO_EFFECTIVE_URL: " << filter_aws_url(eff_req_url);
764 BESDEBUG(MODULE, prolog << msg.str() << endl);
765 ERROR_LOG(msg.str());
766 return false;
767 }
768
769 return true;
770}
771
783static void
784process_http_code_helper(const long http_code, const string &requested_url, const string &last_accessed_url) {
785 stringstream msg;
786 if (http_code >= 400) {
787 msg << "ERROR - The HTTP GET request for the source URL: " << requested_url << " FAILED. ";
788 msg << "CURLINFO_EFFECTIVE_URL: " << filter_aws_url(last_accessed_url) << " ";
789 BESDEBUG(MODULE, prolog << msg.str() << endl);
790 }
791
792 msg << "The response from " << last_accessed_url << " (Originally: " << requested_url << ") ";
793 msg << "returned an HTTP code of " << http_code;
794 msg << " which means " << http_code_to_string(http_code) << " ";
795
796 switch (http_code) {
797 case 400: // Bad Request
798 case 401: // Unauthorized
799 case 402: // Payment Required
800 case 403: // Forbidden
801 case 404: // Not Found
802 case 408: // Request Timeout
803 {
804 // These issues are not considered retryable problems, so we throw immediately.
805 // Remove this redundant call to ERROR_LOG since the thrown exception is
806 // logged as an error. jhrg 1/24/25
807 // ERROR_LOG(msg.str());
808 throw http::HttpError(msg.str(),
809 CURLE_OK,
810 http_code,
811 requested_url,
812 last_accessed_url,
813 __FILE__, __LINE__);
814
815 }
816 case 422: // Unprocessable Entity
817 case 500: // Internal server error
818 case 502: // Bad Gateway
819 case 503: // Service Unavailable
820 case 504: // Gateway Timeout
821 {
822 // These problems might actually be retryable, so we check and then act accordingly.
823 if (!is_retryable(last_accessed_url)) {
824 msg << " The HTTP response code of this last accessed URL indicate that it should not be retried.";
825 ERROR_LOG(msg.str());
826 throw http::HttpError(msg.str(),
827 CURLE_OK,
828 http_code,
829 requested_url,
830 last_accessed_url,
831 __FILE__, __LINE__);
832 } else {
833 msg << " The HTTP response code of this last accessed URL indicate that it should be retried.";
834 BESDEBUG(MODULE, prolog << msg.str() << endl);
835 }
836 }
837 break;
838
839 default:
840 // ERROR_LOG(msg.str());
841 throw BESInternalError(msg.str(), __FILE__, __LINE__);
842 }
843}
844
845
852long get_http_code(CURL *ceh) {
853 long http_code = 0;
854 CURLcode curl_code = curl_easy_getinfo(ceh, CURLINFO_RESPONSE_CODE, &http_code);
855 if (curl_code != CURLE_OK) {
856 throw BESInternalError(prolog + "Error acquiring HTTP response code.", __FILE__, __LINE__);
857 }
858 BESDEBUG(MODULE, prolog << "http_code: " << http_code << "\n");
859 return http_code;
860}
861
862
900static bool eval_http_get_response(CURL *ceh, const string &requested_url, long &http_code) {
901 BESDEBUG(MODULE, prolog << "Requested URL: " << requested_url << endl);
902
903 http_code = get_http_code(ceh);
904
905 // Special case for file:// URLs. An HTTP Code is zero means success in that case. jhrg 4/20/23
906 if (requested_url.find(FILE_PROTOCOL) == 0 && http_code == 0)
907 return true;
908
909#ifndef NDEBUG
910 if (BESISDEBUG(MODULE)) { // BESISDEBUG is a macro that expands to false when NDEBUG is defined. jhrg 4/19/23
911 CURLcode curl_code;
912 long redirects;
913 curl_code = curl_easy_getinfo(ceh, CURLINFO_REDIRECT_COUNT, &redirects);
914 if (curl_code != CURLE_OK)
915 throw BESInternalError("Error acquiring CURLINFO_REDIRECT_COUNT.", __FILE__, __LINE__);
916 BESDEBUG(MODULE, prolog << "CURLINFO_REDIRECT_COUNT: " << redirects << endl);
917
918 char *redirect_url = nullptr;
919 curl_code = curl_easy_getinfo(ceh, CURLINFO_REDIRECT_URL, &redirect_url);
920 if (curl_code != CURLE_OK)
921 throw BESInternalError("Error acquiring CURLINFO_REDIRECT_URL.", __FILE__, __LINE__);
922
923 if (redirect_url)
924 BESDEBUG(MODULE, prolog << "CURLINFO_REDIRECT_URL: " << redirect_url << endl);
925 }
926#endif
927
928 // Newer Apache servers return 206 for range requests. jhrg 8/8/18
929 switch (http_code) {
930 case 0:
931 case 200: // OK
932 case 206: // Partial content - this is to be expected since we use range gets
933 // cases 201-205 are things we should probably reject, unless we add more
934 // comprehensive HTTP/S processing here. jhrg 8/8/18
935 return true;
936
937 default:
938 string last_accessed_url = get_effective_url(ceh, requested_url);
939 BESDEBUG(MODULE, prolog << "Last Accessed URL(CURLINFO_EFFECTIVE_URL): "
940 << filter_aws_url(last_accessed_url) << endl);
941
942 // process_http_code_helper() _only_ returns if the request can be retried, otherwise
943 // it throws an exception. Pass the unfiltered last_accessed_url because the
944 // query string params might be needed to determine if the URL should be retried.
945 // jhrg 4/20/23
946 process_http_code_helper(http_code, requested_url, last_accessed_url);
947 return false; // if we get here, retry the request
948 }
949}
950
951// Truncate the file that holds information read off the wire when the
952// library has to retry a request made to S3. The file will contain error
953// text from the failed attempt and that needs to be cleaned out before
954// the next attempt. jhrg 5/9/23
955static void truncate_file(int fd) {
956 auto status = ftruncate(fd, 0);
957 if (status == -1)
958 throw BESInternalError(string("Could not truncate the file before retrying request (") + strerror(errno) + ").",
959 __FILE__, __LINE__);
960
961 // Removing this call to lseek will cause tests for the retry code to fail, which demonstrates that
962 // this fixes the issue with retires without this call having corrupted data. jhrg 5/9/23
963 status = lseek(fd, 0, SEEK_SET);
964 if (-1 == status)
965 throw BESInternalError(string("Could not seek within the response file (") + strerror(errno) + ").",
966 __FILE__, __LINE__);
967}
968
969// used only in one place here. jhrg 3/8/23
992void super_easy_perform(CURL *c_handle) {
993 int fd = -1;
994 super_easy_perform(c_handle, fd);
995}
996
997static void super_easy_perform(CURL *c_handle, int fd) {
998 BESDEBUG(MODULE, prolog << "BEGIN\n");
999
1000 useconds_t retry_time = url_retry_time; // 0.25 seconds
1001 bool curl_success{false};
1002 bool http_success{false};
1003 long http_code{0};
1004 unsigned int attempts{0};
1005
1006 vector<char> error_buffer(CURL_ERROR_SIZE, (char) 0);
1007 set_error_buffer(c_handle, error_buffer.data());
1008
1009 string target_url = get_effective_url(c_handle, ""); // This is a trick to get the URL from the cURL handle.
1010 // We check the value of target_url to see if the URL was correctly set in the cURL handle.
1011 if (target_url.empty())
1012 throw BESInternalError("URL acquisition failed.", __FILE__, __LINE__);
1013
1014 // This either works or throws an exception after retry_limit attempts
1015 while (!curl_success || !http_success) {
1016 ++attempts;
1017 BESDEBUG(MODULE,
1018 prolog << "Requesting URL: " << filter_aws_url(target_url) << " attempt: " << attempts << endl);
1019
1020 CURLcode curl_code = curl_easy_perform(c_handle);
1021 curl_success = eval_curl_easy_perform_code(target_url, curl_code, error_buffer.data(), attempts);
1022 BESDEBUG(MODULE, prolog << "curl_success: " << (curl_success ? "true" : "false") << endl);
1023 if (curl_success) {
1024 // Nothing obvious went wrong with the curl_easy_perform() so now we check the HTTP stuff
1025 http_success = eval_http_get_response(c_handle, target_url, http_code);
1026 BESDEBUG(MODULE, prolog << "http_success: " << (http_success ? "true" : "false") << endl);
1027 }
1028 // If the curl_easy_perform failed, or if the http request failed, then
1029 // we keep trying until we have exceeded the retry_limit at which point we throw
1030 // an exception.
1031 if (!curl_success || !http_success) {
1032 string effective_url;
1033 try {
1034 effective_url = filter_aws_url(get_effective_url(c_handle, target_url));
1035 }
1036 catch (BESInternalError &bie) {
1037 effective_url = "Unable_To_Determine_CURLINFO_EFFECTIVE_URL: " + bie.get_message();
1038 }
1039 if (attempts == retry_limit) {
1040 stringstream msg;
1041 msg << prolog << "ERROR - Made " << retry_limit << " failed attempts to retrieve the URL ";
1042 msg << filter_aws_url(target_url) << " The retry limit has been exceeded. Giving up! ";
1043 msg << "CURLINFO_EFFECTIVE_URL: " << effective_url << " ";
1044 msg << "Returned HTTP_STATUS: " << http_code;
1045 throw HttpError(msg.str(),
1046 curl_code,
1047 http_code,
1048 target_url,
1049 effective_url,
1050 __FILE__, __LINE__);
1051 } else {
1052 INFO_LOG(prolog + "Problem with data transfer. Will retry (url: "
1053 + filter_aws_url(target_url) + " attempt: " + std::to_string(attempts) + "). "
1054 + "CURLINFO_EFFECTIVE_URL: " + effective_url + " "
1055 + "Returned HTTP_STATUS: " + std::to_string(http_code));
1056 usleep(retry_time);
1057 retry_time *= 2;
1058
1059 if (fd >= 0)
1060 truncate_file(fd);
1061 }
1062 }
1063 }
1064
1065 // Unset the buffer before it goes out of scope
1066 unset_error_buffer(c_handle);
1067
1068 BESDEBUG(MODULE, prolog << "cURL operations completed. fd: " << fd << "\n");
1069
1070 // rewind the file, if the descriptor is valid
1071 if (fd >= 0) {
1072 BESDEBUG(MODULE, prolog << "Rewinding fd(" << fd << ")\n");
1073 auto status = lseek(fd, 0, SEEK_SET);
1074 if (-1 == status)
1075 throw BESInternalError("Could not seek within the response file.", __FILE__, __LINE__);
1076 }
1077 BESDEBUG(MODULE, prolog << "END\n");
1078}
1079
1098void http_get_and_write_resource(const std::shared_ptr<http::url> &target_url, int fd,
1099 vector <string> *http_response_headers) {
1100
1101 vector<char> error_buffer(CURL_ERROR_SIZE, (char) 0);
1102 CURLcode res;
1103 CURL *ceh = nullptr;
1104 curl_slist *req_headers = nullptr;
1105
1106 BESDEBUG(MODULE, prolog << "BEGIN" << endl);
1107 // Before we do anything, make sure that the URL is OK to pursue.
1108 if (!http::AllowedHosts::theHosts()->is_allowed(target_url)) {
1109 string err = (string) "The specified URL " + target_url->str()
1110 + " does not match any of the accessible services in"
1111 + " the allowed hosts list.";
1112 BESDEBUG(MODULE, prolog << err << endl);
1113 throw BESSyntaxUserError(err, __FILE__, __LINE__);
1114 }
1115
1116 try {
1117 // Add the EDL authorization headers if the Information is in the BES Context Manager
1118 req_headers = add_edl_auth_headers(req_headers);
1119 // Add AWS credentials if they're available.
1120 req_headers = sign_url_for_s3_if_possible(target_url->str(), req_headers);
1121
1122 // OK! Make the cURL handle
1123 ceh = init(target_url->str(), req_headers, http_response_headers);
1124
1125 set_error_buffer(ceh, error_buffer.data());
1126
1127 res = curl_easy_setopt(ceh, CURLOPT_WRITEFUNCTION, writeToOpenFileDescriptor);
1128 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_WRITEFUNCTION", error_buffer.data(), __FILE__, __LINE__);
1129
1130 // since curl 7.9.7 CURLOPT_FILE is the same as CURLOPT_WRITEDATA.
1131 res = curl_easy_setopt(ceh, CURLOPT_FILE, &fd);
1132 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_FILE", error_buffer.data(), __FILE__, __LINE__);
1133
1134 // We do this because we know super_easy_perform() is going to set it.
1135 unset_error_buffer(ceh);
1136
1137 super_easy_perform(ceh, fd);
1138
1139 // Free the header list
1140 BESDEBUG(MODULE, prolog << "Cleanup request headers. Calling curl_slist_free_all()." << endl);
1141 curl_slist_free_all(req_headers);
1142
1143 if (ceh) {
1144 curl_easy_cleanup(ceh);
1145 BESDEBUG(MODULE, prolog << "Called curl_easy_cleanup()." << endl);
1146 }
1147
1148 }
1149 catch (...) {
1150 curl_slist_free_all(req_headers);
1151 if (ceh) {
1152 curl_easy_cleanup(ceh);
1153 }
1154 throw;
1155 }
1156
1157 BESDEBUG(MODULE, prolog << "END" << endl);
1158}
1159
1168string error_message(const CURLcode response_code, const char *error_buffer) {
1169 string msg;
1170 if (error_buffer) {
1171 msg = string("cURL_error_buffer: ") + error_buffer + ", ";
1172 }
1173 msg += string("cURL_message: ") + curl_easy_strerror(response_code) + " (code: "
1174 + to_string(response_code) + ")\n";
1175 return msg;
1176}
1177
1178
/**
 * @brief cURL write callback that appends the received bytes to a std::string.
 *
 * @param buffer The bytes delivered by libcurl.
 * @param size Size of one element.
 * @param nmemb Number of elements in buffer.
 * @param data Pointer to the std::string that accumulates the response.
 * @return The number of bytes consumed (size * nmemb).
 */
static size_t string_write_data(void *buffer, size_t size, size_t nmemb, void *data) {
    const size_t nbytes = size * nmemb;
    // Nothing to copy; also avoids handing a possibly null pointer to append().
    if (nbytes == 0)
        return 0;

    // append() grows the string and copies in one step, without writing through data().
    auto *sink = static_cast<std::string *>(data);
    sink->append(static_cast<const char *>(buffer), nbytes);
    return nbytes;
}
1187
1200void http_get(const string &target_url, string &buf) {
1201 BESDEBUG(MODULE, prolog << "BEGIN\n");
1202
1203 vector<char> error_buffer(CURL_ERROR_SIZE, (char) 0);
1204 CURL *ceh = nullptr;
1205 CURLcode res;
1206 curl_slist *request_headers = nullptr;
1207
1208 try {
1209 // Add the authorization headers
1210 request_headers = add_edl_auth_headers(request_headers);
1211
1212 request_headers = sign_url_for_s3_if_possible(target_url, request_headers);
1213
1214#ifdef DEVELOPER
1215 AccessCredentials *credentials = CredentialsManager::theCM()->get(target_url);
1216 if (credentials) {
1217 INFO_LOG(prolog + "Looking for EDL Token for URL: " + target_url );
1218 string edl_token = credentials->get("edl_token");
1219 if (!edl_token.empty()) {
1220 INFO_LOG(prolog + "Using EDL Token for URL: " + target_url + '\n');
1221 request_headers = curl::append_http_header(request_headers, "Authorization", edl_token);
1222 }
1223 }
1224#endif
1225
1226 ceh = curl::init(target_url, request_headers, nullptr);
1227 if (!ceh)
1228 throw BESInternalError(string("ERROR! Failed to acquire cURL Easy Handle! "), __FILE__, __LINE__);
1229
1230 // Error Buffer (for use during this setup) ----------------------------------------------------------------
1231 set_error_buffer(ceh, error_buffer.data());
1232
1233 // Pass all data to the 'write_data' function --------------------------------------------------------------
1234 res = curl_easy_setopt(ceh, CURLOPT_WRITEFUNCTION, string_write_data);
1235 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_WRITEFUNCTION", error_buffer.data(), __FILE__, __LINE__);
1236
1237 // Pass this to write_data as the fourth argument ----------------------------------------------------------
1238 res = curl_easy_setopt(ceh, CURLOPT_WRITEDATA, reinterpret_cast<void *>(&buf));
1239 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_WRITEDATA", error_buffer.data(), __FILE__, __LINE__);
1240
1241 // We do this because we know super_easy_perform() is going to set it.
1242 unset_error_buffer(ceh);
1243
1244 super_easy_perform(ceh);
1245
1246 // Free the header list
1247 BESDEBUG(MODULE, prolog << "Cleanup request headers. Calling curl_slist_free_all()." << endl);
1248 curl_slist_free_all(request_headers);
1249
1250 if (ceh) {
1251 curl_easy_cleanup(ceh);
1252 BESDEBUG(MODULE, prolog << "Called curl_easy_cleanup()." << endl);
1253 }
1254
1255 buf.push_back('\0'); // add a trailing null byte
1256 }
1257 catch (...) {
1258 curl_slist_free_all(request_headers);
1259 if (ceh) {
1260 curl_easy_cleanup(ceh);
1261 }
1262 throw;
1263 }
1264 BESDEBUG(MODULE, prolog << "END\n");
1265}
1266
1267// used only in one place here. jhrg 3/8/23
1268static string get_cookie_file_base() {
1269 return TheBESKeys::read_string_key(HTTP_COOKIES_FILE_KEY, HTTP_DEFAULT_COOKIES_FILE);
1270}
1271
1272// used here in init() and clear_cookies (which itself is never used) and in dmrpp_module
1273// jhrg 3/8/23
1274string get_cookie_filename() {
1275 string cookie_file_base = get_cookie_file_base();
1276 stringstream cf_with_pid;
1277 cf_with_pid << cookie_file_base << "-" << getpid();
1278 return cf_with_pid.str();
1279}
1280
1291string get_netrc_filename() {
1292 return TheBESKeys::read_string_key(HTTP_NETRC_FILE_KEY, "");
1293}
1294
1303void set_error_buffer(CURL *ceh, char *error_buffer) {
1304 CURLcode res;
1305 res = curl_easy_setopt(ceh, CURLOPT_ERRORBUFFER, error_buffer);
1306 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_ERRORBUFFER", error_buffer, __FILE__, __LINE__);
1307}
1308
1316string hyrax_user_agent() {
1317 string user_agent;
1318 bool found;
1319 TheBESKeys::TheKeys()->get_value(HTTP_USER_AGENT_KEY, user_agent, found);
1320 if (!found || user_agent.empty()) {
1321 user_agent = HTTP_DEFAULT_USER_AGENT;
1322 }
1323 BESDEBUG(MODULE, prolog << "User-Agent: " << user_agent << endl);
1324 return user_agent;
1325}
1326
1344void eval_curl_easy_setopt_result(CURLcode curl_code, const string &msg_base, const string &opt_name,
1345 const char *ebuf, const string &file, unsigned int line) {
1346 if (curl_code != CURLE_OK) {
1347 stringstream msg;
1348 msg << msg_base << "ERROR - cURL failed to set " << opt_name << " Message: "
1349 << curl::error_message(curl_code, ebuf);
1350 throw BESInternalError(msg.str(), file, line);
1351 }
1352}
1353
1354// Used here and in dmrpp_module. jhrg 3/8/23
1355unsigned long max_redirects() {
1357}
1358
1372curl_slist *append_http_header(curl_slist *slist, const string &header_name, const string &value) {
1373
1374 string full_header = header_name;
1375 full_header.append(": ").append(value);
1376
1377 BESDEBUG(MODULE, prolog << full_header << endl);
1378
1379 auto temp = curl_slist_append(slist, full_header.c_str());
1380 if (!temp) {
1381 stringstream msg;
1382 msg << prolog << "Encountered cURL Error setting the " << header_name << " header. full_header: "
1383 << full_header;
1384 throw BESInternalError(msg.str(), __FILE__, __LINE__);
1385 }
1386 return temp;
1387}
1388
1421curl_slist *add_edl_auth_headers(curl_slist *request_headers) {
1422 bool found;
1423 string s;
1424
1425 s = BESContextManager::TheManager()->get_context(EDL_UID_KEY, found);
1426 if (found && !s.empty()) {
1427 request_headers = append_http_header(request_headers, "User-Id", s);
1428 }
1429
1430 s = BESContextManager::TheManager()->get_context(EDL_AUTH_TOKEN_KEY, found);
1431 if (found && !s.empty()) {
1432 request_headers = append_http_header(request_headers, "Authorization", s);
1433 }
1434
1435 s = BESContextManager::TheManager()->get_context(EDL_ECHO_TOKEN_KEY, found);
1436 if (found && !s.empty()) {
1437 request_headers = append_http_header(request_headers, "Echo-Token", s);
1438 }
1439
1440 return request_headers;
1441}
1442
1463curl_slist *
1464sign_s3_url(const string &target_url, AccessCredentials *ac, curl_slist *req_headers) {
1465 const time_t request_time = time(nullptr);
1466 const auto url_obj = http::url(target_url); // parse the URL using the http::url object. jhrg 2/20/25
1467 const string auth_header = compute_awsv4_signature(url_obj.path(), url_obj.query(), url_obj.host(),
1468 request_time, ac->get(AccessCredentials::ID_KEY),
1469 ac->get(AccessCredentials::KEY_KEY),
1470 ac->get(AccessCredentials::REGION_KEY), "s3");
1471
1472 BESDEBUG(MODULE, prolog << "Authorization: " << auth_header << "\n");
1473 req_headers = append_http_header(req_headers, "Authorization", auth_header);
1474 req_headers = append_http_header(req_headers, "x-amz-content-sha256",
1475 "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855");
1476 req_headers = append_http_header(req_headers, "x-amz-date", AWSV4::ISO8601_date(request_time));
1477 INFO_LOG(prolog + "Signed S3 request for " + target_url);
1478
1479 return req_headers;
1480}
1481
1490curl_slist *
1491sign_s3_url(const shared_ptr <url> &target_url, AccessCredentials *ac, curl_slist *req_headers) {
1492 return sign_s3_url(target_url->str(), ac, req_headers);
1493}
1494
/**
 * @brief Test whether a URL carries the query parameters of an S3 signed URL.
 *
 * @param url The URL text to inspect.
 * @return true only if url contains all of "X-Amz-Algorithm=", "X-Amz-Credential=" and
 * "X-Amz-Signature=".
 */
bool is_url_signed_for_s3(const std::string &url) {
    static const char *const required_markers[] = {
            "X-Amz-Algorithm=",
            "X-Amz-Credential=",
            "X-Amz-Signature="
    };

    for (const char *marker : required_markers) {
        if (url.find(marker) == std::string::npos)
            return false;
    }
    return true;
}
1509
1519bool is_url_signed_for_s3(const std::shared_ptr<http::url> &target_url) {
1520 return is_url_signed_for_s3(target_url->str());
1521}
1522
1538static CURL *init_no_follow_redirects_handle(const string &target_url, const curl_slist *req_headers,
1539 vector <string> &resp_hdrs, string &response_body) {
1540
1541 vector<char> error_buffer(CURL_ERROR_SIZE, (char) 0);
1542 CURL *ceh = curl::init(target_url, req_headers, &resp_hdrs);
1543
1544 set_error_buffer(ceh, error_buffer.data());
1545
1546 // Pass all data to the 'write_data' function --------------------------------------------------------------
1547 CURLcode res = curl_easy_setopt(ceh, CURLOPT_WRITEFUNCTION, string_write_data);
1548 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_WRITEFUNCTION", error_buffer.data(), __FILE__, __LINE__);
1549
1550 // Pass this to write_data as the fourth argument ----------------------------------------------------------
1551 res = curl_easy_setopt(ceh, CURLOPT_WRITEDATA, reinterpret_cast<void *>(&response_body));
1552 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_WRITEDATA", error_buffer.data(), __FILE__, __LINE__);
1553
1554 // Pass save_raw_http_headers() a pointer to the vector<string> where the
1555 // response headers may be stored. Callers can use the resp_hdrs
1556 // value/result parameter to get the raw response header information .
1557 res = curl_easy_setopt(ceh, CURLOPT_WRITEHEADER, &resp_hdrs);
1558 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_WRITEHEADER", error_buffer.data(), __FILE__, __LINE__);
1559
1560 // DO NOT Follow 302/306 (redirect) responses
1561 res = curl_easy_setopt(ceh, CURLOPT_FOLLOWLOCATION, 0L);
1562 eval_curl_easy_setopt_result(res, prolog, "CURLOPT_FOLLOWLOCATION", error_buffer.data(), __FILE__, __LINE__);
1563
1564 unset_error_buffer(ceh);
1565 return ceh;
1566}
1567
1568
/**
 * @brief Append a human readable dump of an HTTP response to a message stream.
 *
 * @param http_code The HTTP status code of the response.
 * @param response_headers The response headers, written one per line.
 * @param response_body The response body, written between BEGIN/END marker lines.
 * @param msg The stream the report is appended to.
 */
void write_response_details(const long http_code,
                            const std::vector<std::string> &response_headers,
                            const std::string &response_body,
                            std::stringstream &msg) {
    // Status section
    msg << "# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --\n"
        << "HTTP Response Details\n"
        << "The remote service returned an HTTP code of: " << http_code << "\n";

    // Header section
    msg << "Response Headers -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --\n";
    for (auto header = response_headers.begin(); header != response_headers.end(); ++header) {
        msg << " " << *header << "\n";
    }

    // Body section
    msg << "# BEGIN Response Body -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --\n"
        << response_body << "\n"
        << "# END Response Body -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --\n";
}
1591
1603bool process_get_redirect_http_code(const long http_code,
1604 const vector <string> &response_headers,
1605 const string &response_body,
1606 const string &redirect_url_str,
1607 const string &origin_url_str,
1608 const unsigned int attempt,
1609 const unsigned int max_attempts) {
1610 bool success = false;
1611 switch (http_code) {
1612 case 301: // Moved Permanently
1613 case 302: // Found (fka Move Temporarily)
1614 case 303: // See Other
1615 case 307: // Temporary Redirect
1616 case 308: // Permanent Redirect
1617 {
1618 // Check for EDL redirect
1619 http::url rdu(redirect_url_str);
1620 if (rdu.host().find("urs.earthdata.nasa.gov") != string::npos) {
1621 if (attempt >= max_attempts) {
1622 stringstream msg;
1623 msg << prolog << "ERROR - I tried " << attempt << " times to access the url:\n";
1624 msg << " " << origin_url_str << "\n";
1625 msg << "It seems that the provided access credentials are either missing, invalid, or expired.\n";
1626 msg << "Here are the details from the most recent attempt:\n\n";
1627 write_response_details(http_code, response_headers, response_body, msg);
1628 throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
1629 }
1630 // EDL is not the redirect we were looking for...
1631 success = false;
1632 } else {
1633 success = true;
1634 }
1635 break;
1636 }
1637
1638 default: {
1639 if (attempt >= max_attempts) {
1640 // Everything else is bad.
1641 stringstream msg;
1642 msg << prolog << "ERROR - I tried " << attempt << " times to access:\n";
1643 msg << " " << origin_url_str << "\n";
1644 msg << "I was expecting to receive an HTTP redirect code and location header in the response. \n";
1645 msg << "Unfortunately this did not happen.\n";
1646 msg << "Here are the details of the most recent transaction:\n\n";
1647 write_response_details(http_code, response_headers, response_body, msg);
1648 throw HttpError(msg.str(),
1649 CURLE_OK,
1650 http_code,
1651 origin_url_str,
1652 redirect_url_str,
1653 response_headers,
1654 response_body,
1655 __FILE__, __LINE__);
1656 }
1657 success = false;
1658 break;
1659 }
1660 }
1661 return success;
1662}
1663
1672static bool gru_mk_attempt(const shared_ptr <url> &origin_url,
1673 const unsigned int attempt,
1674 const unsigned int max_attempts,
1675 shared_ptr <EffectiveUrl> &redirect_url) {
1676
1677 BESDEBUG(MODULE, prolog << " BEGIN This is attempt #" << attempt << " for " << origin_url->str() << "\n");
1678 bool http_success = false;
1679 bool curl_success = false;
1680 CURL *ceh = nullptr;
1681 vector<char> error_buffer(CURL_ERROR_SIZE, (char) 0);
1682 curl_slist *req_headers = nullptr;
1683
1684 vector<string> response_headers;
1685 string response_body;
1686 CURLcode curl_code;
1687 long http_code;
1688 string redirect_url_str;
1689
1690 // Add the EDL authorization headers if the Information is in the BES Context Manager
1691 req_headers = add_edl_auth_headers(req_headers);
1692 req_headers = sign_url_for_s3_if_possible(origin_url, req_headers);
1693
1694 // FIXME Hackery for DMR++ Ownership POC code - see dmrpp_module CurlHandlePool.cc
1695 // for more info. jhrg 5/24/24
1696 AccessCredentials *credentials = CredentialsManager::theCM()->get(origin_url);
1697 if (credentials) {
1698 INFO_LOG(prolog + "Looking for EDL Token for URL: " + origin_url->str() + '\n');
1699 string edl_token = credentials->get("edl_token");
1700 if (!edl_token.empty()) {
1701 INFO_LOG(prolog + "Using EDL Token for URL: " + origin_url->str() + '\n');
1702 req_headers = curl::append_http_header(req_headers, "Authorization", edl_token);
1703 }
1704 }
1705
1706 try {
1707
1708 // OK! Make the cURL handle
1709 ceh = init_no_follow_redirects_handle(
1710 origin_url->str(),
1711 req_headers,
1712 response_headers,
1713 response_body);
1714
1715#ifndef NDEBUG
1716 {
1717 BES_STOPWATCH_START(MODULE,prolog + "Retrieved HTTP response from origin_url: " + origin_url->str());
1718#endif
1719
1720 curl_code = curl_easy_perform(ceh);
1721#ifndef NDEBUG
1722 }
1723#endif
1724 curl_success = eval_curl_easy_perform_code(
1725 origin_url->str(), // In this situation we use the origin url because we did NOT follow a redirect
1726 curl_code,
1727 error_buffer.data(),
1728 attempt);
1729
1730 if (curl_success) {
1731 http_code = get_http_code(ceh);
1732 char *url = nullptr;
1733 curl_easy_getinfo(ceh, CURLINFO_REDIRECT_URL, &url);
1734 if (url) {
1735 redirect_url_str = url;
1736 }
1737 BESDEBUG(MODULE, prolog << "redirect_url_str: " << redirect_url_str << "\n");
1738 http_success = process_get_redirect_http_code(http_code,
1739 response_headers,
1740 response_body,
1741 redirect_url_str,
1742 origin_url->str(),
1743 attempt,
1744 max_attempts);
1745 if (http_success) {
1746 redirect_url = make_shared<http::EffectiveUrl>(redirect_url_str,
1747 response_headers,
1748 origin_url->is_trusted());
1749 }
1750 } else if (attempt >= max_attempts) {
1751 // Everything is bad now.
1752 stringstream msg;
1753 msg << prolog << "ERROR - I tried " << attempt << " times to access:\n";
1754 msg << " " << origin_url << "\n";
1755 msg << "I was expecting to receive an HTTP redirect code and location header in the response. \n";
1756 msg << "Unfortunately this did not happen.\n";
1757 msg << "This failure appears to be a problem with cURL.\n";
1758 msg << "The cURL message associated with the most recent failure is:\n";
1759 msg << " " << error_message(curl_code, error_buffer.data()) << "\n";
1760 throw BESInternalError(msg.str(), __FILE__, __LINE__);
1761 }
1762
1763 // Free the header list
1764 curl_slist_free_all(req_headers);
1765 // clean up cURL handle
1766 if (ceh) {
1767 curl_easy_cleanup(ceh);
1768 BESDEBUG(MODULE, prolog << "Called curl_easy_cleanup()." << "\n");
1769 }
1770
1771 }
1772 catch (...) {
1773 // Free the header list
1774 curl_slist_free_all(req_headers);
1775
1776 // clean up cURL handle
1777 if (ceh) {
1778 curl_easy_cleanup(ceh);
1779 BESDEBUG(MODULE, prolog << "Called curl_easy_cleanup()." << "\n");
1780 }
1781 throw;
1782 }
1783 BESDEBUG(MODULE, prolog << "curl_success: " << (curl_success ? "true" : "false") << "\n");
1784 BESDEBUG(MODULE, prolog << "http_success: " << (http_success ? "true" : "false") << "\n");
1785 BESDEBUG(MODULE, prolog << " END success: " << ((curl_success && http_success) ? "true" : "false") <<
1786 " on attempt #" << attempt << " for " << origin_url->str() << "\n");
1787
1788 return curl_success && http_success;
1789}
1790
1800std::shared_ptr<http::EffectiveUrl> get_redirect_url(const std::shared_ptr<http::url> &origin_url) {
1801
1802 BESDEBUG(MODULE, prolog << "BEGIN" << endl);
1803 // Before we do anything, make sure that the URL is OK to pursue.
1804 if (!http::AllowedHosts::theHosts()->is_allowed(origin_url)) {
1805 string err = (string) "The specified URL " + origin_url->str()
1806 + " does not match any of the accessible services in"
1807 + " the allowed hosts list.";
1808 BESDEBUG(MODULE, prolog << err << endl);
1809 throw BESSyntaxUserError(err, __FILE__, __LINE__);
1810 }
1811
1812 std::shared_ptr<http::EffectiveUrl> redirect_url;
1813
1814 unsigned int attempt = 0;
1815 bool success = false;
1816
1817 while (!success && (attempt < retry_limit)) {
1818 attempt++;
1819 success = gru_mk_attempt(origin_url, attempt, retry_limit, redirect_url);
1820 }
1821 // This is a failsafe test - the gru_mk_attempt)_ should detect the errors and throw an exception
1822 // if the attempt count exceeds the retry_limit, but if for some reason there's flaw in that
1823 // logic I add this check as well... ndp-12/01/23
1824 if (attempt >= retry_limit) {
1825 stringstream msg;
1826 msg << prolog << "ERROR: I tried " << attempt << " times to determine the redirect URL for the origin_url:\n";
1827 msg << " " << origin_url->str() << "\n";
1828 msg << "Oddly, I was unable to detect an error, but nonetheless I have made the maximum ";
1829 msg << "number of attempts and I must now give up...\n";
1830 throw BESInternalError(msg.str(), __FILE__, __LINE__);
1831 }
1832
1833 BESDEBUG(MODULE, prolog << "END redirect_url: " << redirect_url->str() << "\n");
1834 return redirect_url;
1835}
1836
1837
1838} /* namespace curl */
virtual std::string get_context(const std::string &name, bool &found)
retrieve the value of the specified context from the BES
std::string get_message() const
get the error message for this exception
Definition BESError.h:132
static RequestServiceTimer * TheTimer()
Return a pointer to a singleton timer instance. If an instance does not exist it will create and init...
void throw_if_timeout_expired(const std::string &message, const std::string &file, const int line)
Checks the RequestServiceTimer to determine if the time spent servicing the request at this point has...
void get_value(const std::string &s, std::string &val, bool &found)
Retrieve the value of a given key, if set.
static TheBESKeys * TheKeys()
Access to the singleton.
Definition TheBESKeys.cc:85
void get_values(const std::string &s, std::vector< std::string > &vals, bool &found)
Retrieve the values of a given key, if set.
static std::string read_string_key(const std::string &key, const std::string &default_value)
Read a string-valued key from the bes.conf file.
virtual std::string get(const std::string &key)
static CredentialsManager * theCM()
Returns the singleton instance of the CredentialsManager.
AccessCredentials * get(const std::shared_ptr< http::url > &url)
utility class for the HTTP catalog module
Definition TheBESKeys.h:51
size_t load_max_redirects_from_keys()
Definition HttpUtils.cc:178