bes Updated for version 3.21.1
The Backend Server (BES) is the lower two tiers of the Hyrax data server
NgapOwnedContainer.cc
1// NgapOwnedContainer.cc
2
3// -*- mode: c++; c-basic-offset:4 -*-
4
5// This file is part of ngap_module, A C++ module that can be loaded in to
6// the OPeNDAP Back-End Server (BES) and is able to handle remote requests.
7
8// Copyright (c) 2020, 2024 OPeNDAP, Inc.
9// Author: Nathan Potter <ndp@opendap.org>
10// James Gallagher <jgallagher@opendap.org>
11//
12// This library is free software; you can redistribute it and/or
13// modify it under the terms of the GNU Lesser General Public
14// License as published by the Free Software Foundation; either
15// version 2.1 of the License, or (at your option) any later version.
16//
17// This library is distributed in the hope that it will be useful,
18// but WITHOUT ANY WARRANTY; without even the implied warranty of
19// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20// Lesser General Public License for more details.
21//
22// You should have received a copy of the GNU Lesser General Public
23// License along with this library; if not, write to the Free Software
24// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25//
26// You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
27// Authors:
28// ndp Nathan Potter <ndp@opendap.org>
29// jhrg James Gallagher <jgallagher@opendap.org>
30
31#include "config.h"
32
33#include <sys/stat.h>
34#include <unistd.h>
35
36#include <sstream>
37#include <string>
38
39#include "BESStopWatch.h"
40#include "BESUtil.h"
41#include "CurlUtils.h"
42#include "BESContextManager.h"
43#include "HttpError.h"
44#include "BESLog.h"
45#include "TheBESKeys.h"
46#include "BESSyntaxUserError.h"
47#include "BESDebug.h"
48
49#include "NgapRequestHandler.h"
50#include "NgapOwnedContainer.h"
51#include "NgapApi.h"
52#include "NgapNames.h"
53
54#define prolog std::string("NgapOwnedContainer::").append(__func__).append("() - ")
55// CACHE_LOG is defined separately from INFO_LOG so that we can turn it off easily. jhrg 11/19/23
56#define CACHE_LOG(x) INFO_LOG(x)
57
58using namespace std;
59using namespace bes;
60
61namespace ngap {
62
63// This data source location currently (8/10/24) is a S3 bucket where the DMR++ files are stored
64// for the OPeNDAP-owned data used by the tests. jhrg 8/10/24
65std::string NgapOwnedContainer::d_data_source_location = "https://cloudydap.s3.us-east-1.amazonaws.com";
66bool NgapOwnedContainer::d_use_opendap_bucket = true;
67bool NgapOwnedContainer::d_inject_data_url = true;
68
80NgapOwnedContainer::NgapOwnedContainer(const string &sym_name, const string &real_name, const string &)
81 : BESContainer(sym_name, real_name, "owned-ngap"), d_ngap_path(real_name) {
82 NgapOwnedContainer::d_data_source_location
83 = TheBESKeys::read_string_key(DATA_SOURCE_LOCATION, NgapOwnedContainer::d_data_source_location);
84 NgapOwnedContainer::d_use_opendap_bucket
85 = TheBESKeys::read_bool_key(USE_OPENDAP_BUCKET, NgapOwnedContainer::d_use_opendap_bucket);
86 NgapOwnedContainer::d_inject_data_url
87 = TheBESKeys::read_bool_key(NGAP_INJECT_DATA_URL_KEY, NgapOwnedContainer::d_inject_data_url);
88}
89
96bool NgapOwnedContainer::file_to_string(int fd, string &content) {
97 // The file size is needed later; this doubles as a check that the file in open.
98 struct stat statbuf = {};
99 if (fstat(fd, &statbuf) < 0) {
100 ERROR_LOG("NgapOwnedContainer::file_to_string() - failed to get file descriptor status\n");
101 return false;
102 }
103
104 // read the data in 4k chunks
105 vector<char> buffer(4096);
106 ssize_t bytes_read;
107 while ((bytes_read = read(fd, buffer.data(), buffer.size())) > 0) {
108 content.append(buffer.data(), bytes_read);
109 }
110
111 // did we get it all
112 if (statbuf.st_size != content.size()) {
113 ERROR_LOG("NgapOwnedContainer::file_to_string() - failed to read all bytes from file cache\n");
114 return false;
115 }
116
117 return true;
118}
119
133string NgapOwnedContainer::build_data_url_to_daac_bucket(const string &rest_path) {
134 BES_MODULE_TIMING(prolog + rest_path);
135
136 bool found;
137 string uid = BESContextManager::TheManager()->get_context(EDL_UID_KEY, found);
138 BESDEBUG(MODULE, prolog << "EDL_UID_KEY(" << EDL_UID_KEY << "): " << uid << endl);
139
140 // If using the cache, look there. Note that the UID is part of the key to the cached data.
141 string url_key = rest_path + ':' + uid;
142 string data_url;
143 if (NgapRequestHandler::d_use_cmr_cache) {
144 if (NgapRequestHandler::d_cmr_mem_cache.get(url_key, data_url)) {
145 CACHE_LOG(prolog + "CMR Cache hit, translated URL: " + data_url + '\n');
146 return data_url;
147 } else {
148 CACHE_LOG(prolog + "CMR Cache miss, REST path: " + url_key + '\n');
149 }
150 }
151
152 // Not cached or not using the cache; ask CMR. Throws on lookup failure, HTTP failure. jhrg 1/24/25
154
155 // If using the CMR cache, cache the response.
156 if (NgapRequestHandler::d_use_cmr_cache) {
157 NgapRequestHandler::d_cmr_mem_cache.put(url_key, data_url);
158 CACHE_LOG(prolog + "CMR Cache put, translated URL: " + data_url + '\n');
159 }
160
161 return data_url;
162}
163
170string NgapOwnedContainer::build_dmrpp_url_to_owned_bucket(const string &rest_path, const string &data_source) {
171 // The PATH part of a URL to the NGAP/DMR++ is an 'NGAP REST path' that has the form:
172 // /collections/<ccid>/granules/<granule_id>. In our 'owned' S3 bucket, we use object
173 // names of the form: /<ccid>/<granule_id>.dmrpp.
174 BES_MODULE_TIMING(prolog + rest_path);
175
176 auto parts = BESUtil::split(rest_path, '/');
177 if (parts.size() != 4 || parts[0] != "collections" || parts[2] != "granules") {
178 throw BESSyntaxUserError("Invalid NGAP path: " + rest_path, __FILE__, __LINE__);
179 }
180
181 string dmrpp_name = parts[1] + '/' + parts[3] + ".dmrpp";
182
183 // http://<bucket_name>.s3.amazonaws.com/<object_key>
184 // Chane so the first part is read from a configuration file.
185 // That way it can be a file:// URL for testing, and later can be set
186 // in other ways. jhrg 5/1/24
187 string dmrpp_url_str = data_source + '/' + dmrpp_name;
188
189 return dmrpp_url_str;
190}
191
192bool NgapOwnedContainer::get_item_from_dmrpp_cache(string &dmrpp_string) const {
193
194 // Read the cache entry if it exists. jhrg 4/29/24
195 if (NgapRequestHandler::d_dmrpp_mem_cache.get(get_real_name(), dmrpp_string)) {
196 CACHE_LOG(prolog + "Memory Cache hit, DMR++: " + get_real_name() + '\n');
197 return true;
198 }
199 else {
200 CACHE_LOG(prolog + "Memory Cache miss, DMR++: " + get_real_name() + '\n');
201 }
202
203 // Before going over the network to get the DMR++, look in the FileCache.
204 // If found, put it in the memory cache and return it as a string.
205
206 FileCache::Item item;
207 if (NgapRequestHandler::d_dmrpp_file_cache.get(FileCache::hash_key(get_real_name()), item)) { // got it
208 // read data from the file into the string.
209 CACHE_LOG(prolog + "File Cache hit, DMR++: " + get_real_name() + '\n');
210 if (file_to_string(item.get_fd(), dmrpp_string)) {
211 // put it in the memory cache
212 NgapRequestHandler::d_dmrpp_mem_cache.put(get_real_name(), dmrpp_string);
213 CACHE_LOG(prolog + "Memory Cache put, DMR++: " + get_real_name() + '\n');
214 return true;
215 }
216 else {
217 ERROR_LOG(prolog + "Failed to read DMR++ from file cache\n");
218 return false;
219 }
220 }
221 else {
222 CACHE_LOG(prolog + "File Cache miss, DMR++: " + get_real_name() + '\n');
223 }
224
225 return false;
226}
227
228bool NgapOwnedContainer::put_item_in_dmrpp_cache(const std::string &dmrpp_string) const
229{
230 if (NgapRequestHandler::d_dmrpp_file_cache.put_data(FileCache::hash_key(get_real_name()), dmrpp_string)) {
231 CACHE_LOG(prolog + "File Cache put, DMR++: " + get_real_name() + '\n');
232 }
233 else {
234 // This might not be an error - put_data() records errors. jhrg 2/13/25
235 CACHE_LOG(prolog + "Failed to put DMR++ in file cache\n");
236 return false;
237 }
238
239 if (!NgapRequestHandler::d_dmrpp_file_cache.purge()) {
240 ERROR_LOG(prolog + "Call to FileCache::purge() failed\n");
241 }
242
243 NgapRequestHandler::d_dmrpp_mem_cache.put(get_real_name(), dmrpp_string);
244 CACHE_LOG(prolog + "Memory Cache put, DMR++: " + get_real_name() + '\n');
245
246 return true;
247}
248
261void NgapOwnedContainer::filter_response(const map <string, string, std::less<>> &content_filters, string &content) {
262 for (const auto &filter: content_filters) {
263 unsigned int replace_count = BESUtil::replace_all(content, filter.first, filter.second);
264 BESDEBUG(MODULE, prolog << "Replaced " << replace_count << " instance(s) of template(" << filter.first
265 << ") with " << filter.second << " in cached RemoteResource" << endl);
266 }
267}
268
275bool NgapOwnedContainer::get_daac_content_filters(const string &data_url, map<string, string, std::less<>> &content_filters) {
276 if (NgapOwnedContainer::d_inject_data_url) {
277 // data_url was get_real_name(). jhrg 8/9/24
278 const string missing_data_url_str = data_url + "_mvs.h5";
279 const string href = R"(href=")";
280 const string trusted_url_hack = R"(" dmrpp:trust="true")";
281 const string data_access_url_key = href + DATA_ACCESS_URL_KEY + "\"";
282 const string data_access_url_with_trusted_attr_str = href + data_url + trusted_url_hack;
283 const string missing_data_access_url_key = href + MISSING_DATA_ACCESS_URL_KEY + "\"";
284 const string missing_data_url_with_trusted_attr_str = href + missing_data_url_str + trusted_url_hack;
285
286 content_filters.clear();
287 content_filters.insert(pair<string, string>(data_access_url_key, data_access_url_with_trusted_attr_str));
288 content_filters.insert(pair<string, string>(missing_data_access_url_key, missing_data_url_with_trusted_attr_str));
289 return true;
290 }
291
292 return false;
293}
294
303bool NgapOwnedContainer::get_opendap_content_filters(map<string, string, std::less<>> &content_filters) {
304 if (NgapOwnedContainer::d_inject_data_url) { // Hmmm, this is a bit of a hack. jhrg 8/22/24
305
306 const string version_attribute = "dmrpp:version";
307 const string trusted_attribute = R"(dmrpp:trust="true" )";
308 // The 'trust' attribute is inserted _before_ the 'version' attribute. jhrg 8/22/24
309 const string trusted_and_version = trusted_attribute + version_attribute;
310
311 content_filters.clear();
312 content_filters.insert(pair<string, string>(version_attribute, trusted_and_version));
313
314 return true;
315 }
316
317 return false;
318}
319
326bool NgapOwnedContainer::dmrpp_read_from_opendap_bucket(string &dmrpp_string) const {
327 BES_MODULE_TIMING(prolog + get_real_name());
328 bool dmrpp_read = false;
329 try {
330 string dmrpp_url_str = build_dmrpp_url_to_owned_bucket(get_real_name(), get_data_source_location());
331 INFO_LOG(prolog + "Look in the OPeNDAP-bucket for the DMRpp for: " + dmrpp_url_str);
332 curl::http_get(dmrpp_url_str, dmrpp_string);
333 map <string, string, std::less<>> content_filters;
334 if (!get_opendap_content_filters(content_filters)) {
335 throw BESInternalError("Could not build opendap content filters for DMR++", __FILE__, __LINE__);
336 }
337 filter_response(content_filters, dmrpp_string);
338 INFO_LOG(prolog + "Found the DMRpp in the OPeNDAP-bucket for: " + dmrpp_url_str);
339 dmrpp_read = true;
340 }
341 catch (http::HttpError &http_error) {
342 // Assumption - when S3 returns a 404, the things is not there. jhrg 8/9/24
343 // But, sometimes AWS/S3 returns 400 for a missing object. jhrg 9/10/24
344 //
345 // for 400 and 500 errors, try the DAAC bucket.
346 // for a 404, do not log an error, just return false.
347 // for other errors, log the error and return false
348 switch (http_error.http_status()) {
349 case 400:
350 case 401:
351 case 403:
352 ERROR_LOG(prolog + "Looked in the OPeNDAP bucket for the DMRpp for: " + get_real_name()
353 + " but got HTTP Status: " + std::to_string(http_error.http_status()));
354 dmrpp_string.clear(); // ...because S3 puts an error message in the string. jhrg 8/9/24
355 dmrpp_read = false;
356 break;
357
358 case 404:
359 dmrpp_string.clear(); // ...because S3 puts an error message in the string. jhrg 8/9/24
360 dmrpp_read = false;
361 break;
362
363 default:
364 http_error.set_message(http_error.get_message()
365 + ". This error for a OPeNDAP-owned DMR++ could be from Hyrax or S3.");
366 throw;
367 }
368 }
369
370 return dmrpp_read;
371}
372
378void NgapOwnedContainer::dmrpp_read_from_daac_bucket(string &dmrpp_string) const {
379 BES_MODULE_TIMING(prolog + get_real_name());
380 // This code may ask CMR and will throw exceptions that mention CMR on error. jhrg 1/24/25
381 string data_url = build_data_url_to_daac_bucket(get_real_name());
382 string dmrpp_url_str = data_url + ".dmrpp"; // This is the URL to the DMR++ in the DAAC-owned bucket. jhrg 8/9/24
383 INFO_LOG(prolog + "Look in the DAAC-bucket for the DMRpp for: " + dmrpp_url_str);
384
385 try {
386 curl::http_get(dmrpp_url_str, dmrpp_string);
387 // filter the DMRPP from the DAAC's bucket to replace the template href with the data_url
388 map <string, string, std::less<>> content_filters;
389 if (!get_daac_content_filters(data_url, content_filters)) {
390 throw BESInternalError("Could not build content filters for DMR++", __FILE__, __LINE__);
391 }
392 filter_response(content_filters, dmrpp_string);
393 INFO_LOG(prolog + "Found the DMRpp in the DAAC-bucket for: " + dmrpp_url_str);
394 }
395 catch (http::HttpError &http_error) {
396 http_error.set_message(http_error.get_message() + "NgapOwnedContainer::dmrpp_read_from_daac_bucket() failed to read the DMR++ from S3.");
397 throw;
398 }
399}
400
416bool NgapOwnedContainer::get_dmrpp_from_cache_or_remote_source(string &dmrpp_string) const {
417 BES_MODULE_TIMING(prolog + get_real_name());
418
419 // If the DMR++ is cached, return it. NB: This cache holds OPeNDAP- and DAAC-owned DMR++ documents.
420 if (NgapRequestHandler::d_use_dmrpp_cache && get_item_from_dmrpp_cache(dmrpp_string)) {
421 return true;
422 }
423 else {
424 // Else, the DMR++ is neither in the memory cache nor the file cache.
425 // Read it from S3, etc., and filter it. Put it in the memory cache
426 bool dmrpp_read = false;
427
428 // If the server is set up to try the OPeNDAP bucket, look there first.
429 if (NgapOwnedContainer::d_use_opendap_bucket) {
430 // If we get the DMR++ from the OPeNDAP bucket, set dmrpp_read to true so
431 // we don't also try the DAAC bucket.
432 dmrpp_read = dmrpp_read_from_opendap_bucket(dmrpp_string);
433 }
434
435 // Try the DAAC bucket if either the OPeNDAP bucket is not used or the OPeNDAP bucket failed
436 if (!dmrpp_read) {
437 dmrpp_read_from_daac_bucket(dmrpp_string);
438 }
439 }
440
441 // if we get here, the DMR++ has been pulled over the network. Put it in both caches.
442 // The memory cache is for use by this process, the file cache for other processes/VMs
443 if (NgapRequestHandler::d_use_dmrpp_cache && !put_item_in_dmrpp_cache(dmrpp_string)) {
444 return false;
445 }
446
447 return true;
448}
449
461
462 string dmrpp_string;
463
464 // Get the DMR++ from the S3 bucket or the cache.
465 // get_dmrpp...() returns false for various caching errors, but throws if it cannot
466 // get the remote DMR++. jhrg 4/29/24
467 get_dmrpp_from_cache_or_remote_source(dmrpp_string);
468
469 set_attributes("as-string"); // This means access() returns a string. jhrg 10/19/23
470 // Originally, this was either hard-coded (as it is now) or was set using the 'extension'
471 // on the URL. But it's always a DMR++. jhrg 11/16/23
472 set_container_type("dmrpp");
473
474 return dmrpp_string;
475}
476
484void NgapOwnedContainer::dump(ostream &strm) const {
485 strm << BESIndent::LMarg << "NgapOwnedContainer::dump - (" << (void *) this << ")\n";
486 BESIndent::Indent();
487 BESContainer::dump(strm);
488 BESIndent::UnIndent();
489}
490
491} // namespace ngap
void set_container_type(const std::string &type)
set the type of data that this container represents, such as cedar or netcdf.
void set_attributes(const std::string &attrs)
set desired attributes for this container
void dump(std::ostream &strm) const override
dumps information about this object
std::string get_real_name() const
retrieve the real name for this container, such as a file name.
virtual std::string get_context(const std::string &name, bool &found)
retrieve the value of the specified context from the BES
std::string get_message() const
get the error message for this exception
Definition BESError.h:132
void set_message(const std::string &msg)
set the error message for this exception
Definition BESError.h:108
static std::vector< std::string > split(const std::string &s, char delim='/', bool skip_empty=true)
Splits the string s into the returned vector of tokens using the delimiter delim, skipping empty values when skip_empty is true.
Definition BESUtil.cc:1068
static unsigned int replace_all(std::string &s, std::string find_this, std::string replace_with_this)
Operates on the string 's' to replace every occurrence of the value of the string 'find_this' with the value of the string 'replace_with_this'.
Definition BESUtil.cc:924
static std::string hash_key(const std::string &key, bool log_it=false)
Return a SHA256 hash of the given key.
Definition FileCache.h:314
static bool read_bool_key(const std::string &key, bool default_value)
Read a boolean-valued key from the bes.conf file.
static std::string read_string_key(const std::string &key, const std::string &default_value)
Read a string-valued key from the bes.conf file.
static std::string convert_ngap_resty_path_to_data_access_url(const std::string &restified_path)
Converts an NGAP restified granule path into a CMR metadata query for the granule.
Definition NgapApi.cc:414
void dump(std::ostream &strm) const override
dumps information about this object
std::string access() override
Get the DMR++ from a remote source or a local cache.