bes Updated for version 3.21.1
The Backend Server (BES) is the lower two tiers of the Hyrax data server
NgapApi.cc
1// -*- mode: c++; c-basic-offset:4 -*-
2
3// This file is part of ngap_module, A C++ module that can be loaded in to
4// the OPeNDAP Back-End Server (BES) and is able to handle remote requests.
5
6// Copyright (c) 2020 OPeNDAP, Inc.
7// Author: Nathan Potter <ndp@opendap.org>
8//
9// This library is free software; you can redistribute it and/or
10// modify it under the terms of the GNU Lesser General Public
11// License as published by the Free Software Foundation; either
12// version 2.1 of the License, or (at your option) any later version.
13//
14// This library is distributed in the hope that it will be useful,
15// but WITHOUT ANY WARRANTY; without even the implied warranty of
16// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17// Lesser General Public License for more details.
18//
19// You should have received a copy of the GNU Lesser General Public
20// License along with this library; if not, write to the Free Software
21// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22//
23// You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
24
25#include "config.h"
26
27#include <sstream>
28#include <ctime>
29
30#include <curl/curl.h>
31#include "rapidjson/document.h"
32
33#include "BESNotFoundError.h"
34#include "BESSyntaxUserError.h"
35#include "BESInternalError.h"
36#include "BESDebug.h"
37#include "BESUtil.h"
38#include "TheBESKeys.h"
39#include "CurlUtils.h"
40#include "HttpError.h"
41
42#include "NgapApi.h"
43#include "NgapNames.h"
44
45using namespace std;
46
47#define prolog string("NgapApi::").append(__func__).append("() - ")
48
49namespace ngap {
50
51const unsigned int REFRESH_THRESHOLD = 3600; // An hour
52
59std::string NgapApi::get_cmr_search_endpoint_url() {
60 static string cmr_search_endpoint_url;
61 if (cmr_search_endpoint_url.empty()) {
62 string cmr_hostname = TheBESKeys::TheKeys()->read_string_key(NGAP_CMR_HOSTNAME_KEY, DEFAULT_CMR_ENDPOINT_URL);
63 string cmr_search_endpoint_path = TheBESKeys::TheKeys()->read_string_key(NGAP_CMR_SEARCH_ENDPOINT_PATH_KEY,
64 DEFAULT_CMR_SEARCH_ENDPOINT_PATH);
65 cmr_search_endpoint_url = BESUtil::assemblePath(cmr_hostname, cmr_search_endpoint_path);
66 }
67
68 return cmr_search_endpoint_url;
69}
70
78std::string NgapApi::build_cmr_query_url_old_rpath_format(const std::string &restified_path) {
79
80 // Make sure it starts with a '/' (see key strings above)
81 string r_path = (restified_path[0] != '/' ? "/" : "") + restified_path;
82
83 size_t provider_index = r_path.find(NGAP_PROVIDERS_KEY);
84 if (provider_index == string::npos) {
85 stringstream msg;
86 msg << prolog << "The specified path '" << r_path << "'";
87 msg << " does not contain the required path element '" << NGAP_PROVIDERS_KEY << "'";
88 throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
89 }
90 if (provider_index != 0) {
91 stringstream msg;
92 msg << prolog << "The specified path '" << r_path << "'";
93 msg << " has the path element '" << NGAP_PROVIDERS_KEY << "' located in the incorrect position (";
94 msg << provider_index << ") expected 0.";
95 throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
96 }
97 provider_index += string(NGAP_PROVIDERS_KEY).size();
98
99 bool use_collection_concept_id = false;
100 size_t collection_index = r_path.find(NGAP_COLLECTIONS_KEY);
101 if (collection_index == string::npos) {
102 size_t concepts_index = r_path.find(NGAP_CONCEPTS_KEY);
103 if (concepts_index == string::npos) {
104 stringstream msg;
105 msg << prolog << "The specified path '" << r_path << "'";
106 msg << " contains neither the '" << NGAP_COLLECTIONS_KEY << "'";
107 msg << " nor the '" << NGAP_CONCEPTS_KEY << "'";
108 msg << " key, one must be provided.";
109 throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
110 }
111 collection_index = concepts_index;
112 use_collection_concept_id = true;
113 }
114 if (collection_index <= provider_index + 1) { // The value of provider has to be at least 1 character
115 stringstream msg;
116 msg << prolog << "The specified path '" << r_path << "'";
117 msg << " has the path element '" << (use_collection_concept_id ? NGAP_CONCEPTS_KEY : NGAP_COLLECTIONS_KEY)
118 << "' located in the incorrect position (";
119 msg << collection_index << ") expected at least " << provider_index + 1;
120 throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
121 }
122 string provider = r_path.substr(provider_index, collection_index - provider_index);
123 collection_index += use_collection_concept_id ? string(NGAP_CONCEPTS_KEY).size() : string(
124 NGAP_COLLECTIONS_KEY).size();
125
126 size_t granule_index = r_path.find(NGAP_GRANULES_KEY);
127 if (granule_index == string::npos) {
128 stringstream msg;
129 msg << prolog << "The specified path '" << r_path << "'";
130 msg << " does not contain the required path element '" << NGAP_GRANULES_KEY << "'";
131 throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
132 }
133 if (granule_index <= collection_index + 1) { // The value of collection must have at least one character.
134 stringstream msg;
135 msg << prolog << "The specified path '" << r_path << "'";
136 msg << " has the path element '" << NGAP_GRANULES_KEY << "' located in the incorrect position (";
137 msg << granule_index << ") expected at least " << collection_index + 1;
138 throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
139 }
140 string collection = r_path.substr(collection_index, granule_index - collection_index);
141 granule_index += string(NGAP_GRANULES_KEY).size();
142
143 // The granule value is the path terminus so it's every thing after the key
144 string granule = r_path.substr(granule_index);
145
146 // Build the CMR query URL for the dataset
147 string cmr_url = get_cmr_search_endpoint_url() + "?";
148 {
149 // This easy handle is only created so we can use the curl_easy_escape() on the token values
150 CURL *ceh = curl_easy_init();
151 char *esc_url_content;
152
153 // Add provider
154 esc_url_content = curl_easy_escape(ceh, provider.c_str(), provider.size());
155 cmr_url += string(CMR_PROVIDER).append("=").append(esc_url_content).append("&");
156 curl_free(esc_url_content);
157
158 esc_url_content = curl_easy_escape(ceh, collection.c_str(), collection.size());
159 if (use_collection_concept_id) {
160 // Add collection_concept_id
161 cmr_url += string(CMR_COLLECTION_CONCEPT_ID).append("=").append(esc_url_content).append("&");
162 } else {
163 // Add entry_title
164 cmr_url += string(CMR_ENTRY_TITLE).append("=").append(esc_url_content).append("&");
165
166 }
167 curl_free(esc_url_content);
168
169 esc_url_content = curl_easy_escape(ceh, granule.c_str(), granule.size());
170 cmr_url += string(CMR_GRANULE_UR).append("=").append(esc_url_content);
171 curl_free(esc_url_content);
172
173 curl_easy_cleanup(ceh);
174 }
175
176 return cmr_url;
177}
178
195std::string NgapApi::build_cmr_query_url(const std::string &restified_path) {
196
197 // Make sure it starts with a '/' (see key strings above)
198 string r_path = (restified_path[0] != '/' ? "/" : "") + restified_path;
199
200 size_t provider_index = r_path.find(NGAP_PROVIDERS_KEY);
201 if (provider_index != string::npos) {
202 return build_cmr_query_url_old_rpath_format(restified_path);
203 }
204
205 size_t collections_key_index = r_path.find(NGAP_COLLECTIONS_KEY);
206 if (collections_key_index == string::npos) {
207 stringstream msg;
208 msg << prolog << "The specified path '" << r_path << "'";
209 msg << " contains neither the '" << NGAP_COLLECTIONS_KEY << "'";
210 msg << " nor the '" << NGAP_CONCEPTS_KEY << "'";
211 msg << " one must be provided.";
212 throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
213 }
214 if (collections_key_index != 0) { // The COLLECTIONS_KEY comes first
215 stringstream msg;
216 msg << prolog << "The specified path '" << r_path << "'";
217 msg << " has the path element '" << NGAP_COLLECTIONS_KEY << "' located in the incorrect position (";
218 msg << collections_key_index << ") expected at least " << provider_index + 1;
219 throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
220 }
221 // This is now the beginning of the collection_concept_id value.
222 size_t collections_index = collections_key_index + string(NGAP_COLLECTIONS_KEY).size();
223
224 size_t granules_key_index = r_path.find(NGAP_GRANULES_KEY);
225 if (granules_key_index == string::npos) {
226 stringstream msg;
227 msg << prolog << "The specified path '" << r_path << "'";
228 msg << " does not contain the required path element '" << NGAP_GRANULES_KEY << "'";
229 throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
230 }
231
232 // The collection key must precede the granules key in the path,
233 // and the collection name must have at least one character.
234 if (granules_key_index <= collections_index + 1) {
235 stringstream msg;
236 msg << prolog << "The specified path '" << r_path << "'";
237 msg << " has the path element '" << NGAP_GRANULES_KEY << "' located in the incorrect position (";
238 msg << granules_key_index << ") expected at least " << collections_index + 1;
239 throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
240 }
241 size_t granules_index = granules_key_index + string(NGAP_GRANULES_KEY).size();
242 // The granule_name value is the path terminus so it's every thing after the key
243 string granule_name = r_path.substr(granules_index);
244
245 // Now we need to work on the collections value to eliminate the optional parts.
246 // This is the entire collections string including any optional components.
247 string collection_name = r_path.substr(collections_index, granules_key_index - collections_index);
248
249 // Since there may be optional parameters we need to strip them off to get the collection_concept_id
250 // And, since we know that collection_concept_id will never contain a '/', and we know that the optional
251 // part is separated from the collection_concept_id by a '/' we look for that and of we find it we truncate
252 // the value at that spot.
253 string optional_part;
254 size_t slash_pos = collection_name.find('/');
255 if (slash_pos != string::npos) {
256 optional_part = collection_name.substr(slash_pos);
257 BESDEBUG(MODULE, prolog << "Found optional collections name component: " << optional_part << endl);
258 collection_name = collection_name.substr(0, slash_pos);
259 }
260 BESDEBUG(MODULE, prolog << "Found collection_name (aka collection_concept_id): " << collection_name << endl);
261
262 // Build the CMR query URL for the dataset
263 string cmr_url = get_cmr_search_endpoint_url() + "?";
264 {
265 // This easy handle is only created so we can use the curl_easy_escape() on the token values
266 CURL *ceh = curl_easy_init();
267 char *esc_url_content;
268
269 esc_url_content = curl_easy_escape(ceh, collection_name.c_str(), collection_name.size());
270 cmr_url += string(CMR_COLLECTION_CONCEPT_ID).append("=").append(esc_url_content).append("&");
271 curl_free(esc_url_content);
272
273 esc_url_content = curl_easy_escape(ceh, granule_name.c_str(), granule_name.size());
274 cmr_url += string(CMR_GRANULE_UR).append("=").append(esc_url_content);
275 curl_free(esc_url_content);
276
277 curl_easy_cleanup(ceh);
278 }
279 return cmr_url;
280}
281
298std::string NgapApi::find_get_data_url_in_granules_umm_json_v1_4(const std::string &rest_path,
299 rapidjson::Document &cmr_granule_response) {
300 const rapidjson::Value &val = cmr_granule_response["hits"];
301 int hits = val.GetInt();
302 if (hits < 1) {
303 throw BESNotFoundError(string("The specified path '") + rest_path
304 + "' does not identify a granule in CMR.", __FILE__, __LINE__);
305 }
306
307 rapidjson::Value &items = cmr_granule_response["items"];
308 if (!items.IsArray()) {
309 throw BESInternalError(string("ERROR! The CMR response did not contain the data URL information: ")
310 + rest_path, __FILE__, __LINE__);
311 } else {
312 // Search the items array for the first item that contains a RelatedUrls array
313 if (BESISDEBUG(MODULE)) {
314 stringstream ss;
315 const string RJ_TYPE_NAMES[] = {string("kNullType"), string("kFalseType"), string("kTrueType"),
316 string("kObjectType"), string("kArrayType"), string("kStringType"),
317 string("kNumberType")};
318 for (rapidjson::SizeType i = 0; i < items.Size(); i++) // Uses SizeType instead of size_t
319 ss << "items[" << i << "]: " << RJ_TYPE_NAMES[items[i].GetType()] << endl;
320 BESDEBUG(MODULE, prolog << "items size: " << items.Size() << endl << ss.str() << endl);
321 }
322
323 rapidjson::Value &items_obj = items[0];
324 auto mitr = items_obj.FindMember("umm");
325
326 rapidjson::Value &umm = mitr->value;
327 mitr = umm.FindMember("RelatedUrls");
328 if (mitr == umm.MemberEnd()) {
329 throw BESInternalError("Error! The umm/RelatedUrls object was not located!", __FILE__, __LINE__);
330 }
331
332 rapidjson::Value &related_urls = mitr->value;
333
334 if (!related_urls.IsArray()) {
335 throw BESNotFoundError("Error! The RelatedUrls object in the CMR response is not an array!", __FILE__,
336 __LINE__);
337 }
338
339 BESDEBUG(MODULE, prolog << " Found RelatedUrls array in CMR response." << endl);
340
341 string data_access_url;
342 for (rapidjson::SizeType i = 0; i < related_urls.Size() && data_access_url.empty(); i++) {
343 rapidjson::Value &obj = related_urls[i];
344 mitr = obj.FindMember("URL");
345 if (mitr == obj.MemberEnd()) {
346 stringstream err;
347 err << "Error! The umm/RelatedUrls[" << i << "] does not contain the URL object";
348 throw BESInternalError(err.str(), __FILE__, __LINE__);
349 }
350
351 const rapidjson::Value &r_url = mitr->value;
352
353 mitr = obj.FindMember("Type");
354 if (mitr == obj.MemberEnd()) {
355 stringstream err;
356 err << "Error! The umm/RelatedUrls[" << i << "] does not contain the Type object";
357 throw BESInternalError(err.str(), __FILE__, __LINE__);
358 }
359
360 const rapidjson::Value &r_type = mitr->value;
361
362 bool noSubtype = obj.FindMember("Subtype") == obj.MemberEnd();
363
364 BESDEBUG(MODULE, prolog << "RelatedUrl Object:" <<
365 " URL: '" << r_url.GetString() << "'" <<
366 " Type: '" << r_type.GetString() << "'" <<
367 " SubType: '" << (noSubtype ? "Absent" : "Present") << "'" << endl);
368
369 if ((r_type.GetString() == string(CMR_URL_TYPE_GET_DATA)) && noSubtype) {
370
371 // Because a member of RelatedUrls may contain a URL of Type GET DATA with the s3:// protocol
372 // as well as a Type GET DATA URL which uses https:// or http://
373 // Added test that the URL does not end in 'xml' to avoid the LPDAAC .cmr.xml records. jhrg 5/22/24
374 string candidate_url = r_url.GetString();
375
376 if ((candidate_url.rfind("https://", 0) == 0 || candidate_url.rfind("http://", 0) == 0)
377 && candidate_url.find(".xml", candidate_url.size()-5) == string::npos) {
378 data_access_url = candidate_url;
379 }
380 }
381 }
382
383 if (data_access_url.empty()) {
384 throw BESInternalError(string("ERROR! Failed to locate a data access URL for the path: ") + rest_path,
385 __FILE__, __LINE__);
386 }
387
388 return data_access_url;
389 }
390}
391
414string NgapApi::convert_ngap_resty_path_to_data_access_url(const std::string &restified_path) {
415 BESDEBUG(MODULE, prolog << "BEGIN" << endl);
416 string data_access_url;
417
418 string cmr_query_url = build_cmr_query_url(restified_path);
419
420 BESDEBUG(MODULE, prolog << "CMR Request URL: " << cmr_query_url << endl);
421
422 string cmr_json_string;
423 try {
424 curl::http_get(cmr_query_url, cmr_json_string);
425 }
426 catch (http::HttpError &http_error) {
427 string err_msg = prolog + "Hyrax encountered a Service Chaining Error while "
428 "attempting to retrieve a CMR record. " + http_error.get_message();
429 http_error.set_message(err_msg);
430 throw;
431 }
432
433 rapidjson::Document cmr_response;
434 cmr_response.Parse(cmr_json_string.c_str());
435 data_access_url = find_get_data_url_in_granules_umm_json_v1_4(restified_path, cmr_response);
436
437 BESDEBUG(MODULE, prolog << "END (data_access_url: " << data_access_url << ")" << endl);
438
439 return data_access_url;
440}
441
453 bool is_expired;
454 time_t now;
455 time(&now); /* get current time; same as: timer = time(NULL) */
456 BESDEBUG(MODULE, prolog << "now: " << now << endl);
457
458 time_t expires = now;
459 string cf_expires = signed_url.query_parameter_value(CLOUDFRONT_EXPIRES_HEADER_KEY);
460 string aws_expires = signed_url.query_parameter_value(AMS_EXPIRES_HEADER_KEY);
461 time_t ingest_time = signed_url.ingest_time();
462
463 // If both cf_expires and aws_expires are empty, this code returns true. jhrg 10/13/23
464 if (!cf_expires.empty()) { // CloudFront expires header?
465 expires = stoll(cf_expires);
466 BESDEBUG(MODULE, prolog << "Using " << CLOUDFRONT_EXPIRES_HEADER_KEY << ": " << expires << endl);
467 } else if (!aws_expires.empty()) {
468 // AWS Expires header?
469 //
470 // By default we'll use the time we made the URL object, ingest_time
471 time_t start_time = ingest_time;
472 // But if there's an AWS Date we'll parse that and compute the time
473 string aws_date = signed_url.query_parameter_value(AWS_DATE_HEADER_KEY);
474 if (!aws_date.empty()) {
475 string year = aws_date.substr(0, 4);
476 string month = aws_date.substr(4, 2);
477 string day = aws_date.substr(6, 2);
478 string hour = aws_date.substr(9, 2);
479 string minute = aws_date.substr(11, 2);
480 string second = aws_date.substr(13, 2);
481
482 BESDEBUG(MODULE, prolog << "date: " << aws_date <<
483 " year: " << year << " month: " << month << " day: " << day <<
484 " hour: " << hour << " minute: " << minute << " second: " << second << endl);
485
486 struct tm ti{}; // NB: Calling gmtime_r() is an initialization hack since some fields are not set here.
487 if (gmtime_r(&now, &ti) == nullptr)
488 throw BESInternalError("Could not get the current time, gmtime_r() failed!", __FILE__, __LINE__);
489 ti.tm_year = stoi(year) - 1900;
490 ti.tm_mon = stoi(month) - 1;
491 ti.tm_mday = stoi(day);
492 ti.tm_hour = stoi(hour);
493 ti.tm_min = stoi(minute);
494 ti.tm_sec = stoi(second);
495
496 BESDEBUG(MODULE, prolog << "ti.tm_year: " << ti.tm_year <<
497 " ti.tm_mon: " << ti.tm_mon <<
498 " ti.tm_mday: " << ti.tm_mday <<
499 " ti.tm_hour: " << ti.tm_hour <<
500 " ti.tm_min: " << ti.tm_min <<
501 " ti.tm_sec: " << ti.tm_sec << endl);
502
503 start_time = mktime(&ti);
504 BESDEBUG(MODULE, prolog << "AWS (computed) start_time: " << start_time << endl);
505 }
506
507 expires = start_time + stoll(aws_expires);
508 BESDEBUG(MODULE, prolog << "Using " << AMS_EXPIRES_HEADER_KEY << ": " << aws_expires <<
509 " (expires: " << expires << ")" << endl);
510 }
511
512 // If both cf_expires and aws_expires are empty, 'expires' == 'now' and 'remaining' is 0 so
513 // this code returns true. jhrg 10/13/23
514 time_t remaining = expires - now;
515 BESDEBUG(MODULE, prolog << "expires_time: " << expires <<
516 " remaining_time: " << remaining <<
517 " refresh_threshold: " << REFRESH_THRESHOLD << endl);
518
519 is_expired = remaining < REFRESH_THRESHOLD;
520 BESDEBUG(MODULE, prolog << "is_expired: " << (is_expired ? "true" : "false") << endl);
521
522 return is_expired;
523}
524
525} // namespace ngap
526
std::string get_message() const
get the error message for this exception
Definition BESError.h:132
void set_message(const std::string &msg)
set the error message for this exception
Definition BESError.h:108
exception thrown if internal error encountered
static std::string assemblePath(const std::string &firstPart, const std::string &secondPart, bool leadingSlash=false, bool trailingSlash=false)
Assemble path fragments making sure that they are separated by a single '/' character.
Definition BESUtil.cc:804
static TheBESKeys * TheKeys()
Access to the singleton.
Definition TheBESKeys.cc:85
static std::string read_string_key(const std::string &key, const std::string &default_value)
Read a string-valued key from the bes.conf file.
Parse a URL into the protocol, host, path and query parts.
Definition url_impl.h:44
virtual std::string query_parameter_value(const std::string &key) const
Get the value of a query string key.
Definition url_impl.cc:167
static bool signed_url_is_expired(const http::url &signed_url)
Has the signed S3 URL expired? If neither the CloudFront Expires header nor the AWS Expires header ar...
Definition NgapApi.cc:452
static std::string convert_ngap_resty_path_to_data_access_url(const std::string &restified_path)
Converts an NGAP restified granule path into a CMR metadata query for the granule.
Definition NgapApi.cc:414