bes Updated for version 3.21.1
The Backend Server (BES) is the lower two tiers of the Hyrax data server
url_impl.cc
1
2// -*- mode: c++; c-basic-offset:4 -*-
3
4// This file is part of the BES http package, part of the Hyrax data server.
5
6// Copyright (c) 2020 OPeNDAP, Inc.
7// Author: Nathan Potter <ndp@opendap.org>
8//
9// This library is free software; you can redistribute it and/or
10// modify it under the terms of the GNU Lesser General Public
11// License as published by the Free Software Foundation; either
12// version 2.1 of the License, or (at your option) any later version.
13//
14// This library is distributed in the hope that it will be useful,
15// but WITHOUT ANY WARRANTY; without even the implied warranty of
16// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17// Lesser General Public License for more details.
18//
19// You should have received a copy of the GNU Lesser General Public
20// License along with this library; if not, write to the Free Software
21// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22//
23// You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
24
25// Authors:
26// ndp Nathan Potter <ndp@opendap.org>
27
28#include "config.h"
29
30#include <string>
31#include <sstream>
32#include <map>
33#include <vector>
34#include <algorithm>
35#include <cctype>
36#include <functional>
37#include <ctime>
38
39#include "BESDebug.h"
40#include "BESUtil.h"
41#include "BESCatalogList.h"
42#include "HttpNames.h"
43
44#include "url_impl.h"
45
46using namespace std;
47using std::chrono::system_clock;
48
49#define MODULE HTTP_MODULE
50#define prolog string("url::").append(__func__).append("() - ")
51
52namespace http {
53
59void url::parse() {
60 const string protocol_end("://");
61 BESDEBUG(MODULE, prolog << "BEGIN (parsing: '" << d_source_url_str << "')" << endl);
62
63 // If the supplied string does not start with a protocol, we assume it must be a
64 // path relative the BES.Catalog.catalog.RootDirectory because that's the only
65 // thing we are going to allow, even when it starts with slash '/'. Basically
66 // we force it to be in the BES.Catalog.catalog.RootDirectory tree.
67 if(d_source_url_str.find(protocol_end) == string::npos){
68 // Since we want a valid path in the file system tree for data, we make it so by adding
69 // the file path that starts with the catalog root dir.
70 const BESCatalogList *bcl = BESCatalogList::TheCatalogList();
71 string default_catalog_name = bcl->default_catalog_name();
72 BESDEBUG(MODULE, prolog << "Searching for catalog: " << default_catalog_name << endl);
73 const BESCatalog *bcat = bcl->find_catalog(default_catalog_name);
74 if (bcat) {
75 BESDEBUG(MODULE, prolog << "Found catalog: " << bcat->get_catalog_name() << endl);
76 } else {
77 string msg = "OUCH! Unable to locate default catalog!";
78 BESDEBUG(MODULE, prolog << msg << endl);
79 throw BESInternalError(msg, __FILE__, __LINE__);
80 }
81 string catalog_root = bcat->get_root();
82 BESDEBUG(MODULE, prolog << "Catalog root: " << catalog_root << endl);
83
84 string file_path = BESUtil::pathConcat(catalog_root,d_source_url_str);
85 if(file_path[0] != '/')
86 file_path = "/" + file_path;
87 d_source_url_str = FILE_PROTOCOL + file_path;
88 }
89
90 const string parse_url_target(d_source_url_str);
91
92 auto prot_i = search(parse_url_target.cbegin(), parse_url_target.cend(),
93 protocol_end.begin(), protocol_end.end());
94
95 if (prot_i != parse_url_target.end())
96 advance(prot_i, protocol_end.size());
97
98 d_protocol.reserve(distance(parse_url_target.begin(), prot_i));
99 transform(parse_url_target.begin(), prot_i,
100 back_inserter(d_protocol),
101 [](int c) { return tolower(c); }); // protocol is icase
102 if (prot_i == parse_url_target.end())
103 return;
104
105 if (d_protocol == FILE_PROTOCOL) {
106 d_path = parse_url_target.substr(d_protocol.size());
107 BESDEBUG(MODULE, prolog << "FILE_PROTOCOL d_path: " << d_path << endl);
108 }
109 else if( d_protocol == HTTP_PROTOCOL || d_protocol == HTTPS_PROTOCOL){
110 // parse the host
111 const auto path_i = find(prot_i, parse_url_target.cend(), '/');
112 d_host.reserve(distance(prot_i, path_i));
113 transform(prot_i, path_i, back_inserter(d_host), [](int c) { return tolower(c); });
114 // parse the path
115 auto query_i = find(path_i, parse_url_target.cend(), '?');
116 d_path.assign(path_i, query_i);
117 // extract the query string
118 if (query_i != parse_url_target.cend())
119 ++query_i;
120 d_query.assign(query_i, parse_url_target.cend());
121
122 // parse the query string KVPs
123 if (!d_query.empty()) {
124 parse_query_string();
125 }
126 }
127 else {
128 stringstream msg;
129 msg << prolog << "Unsupported URL protocol " << d_protocol << " found in URL: " << d_source_url_str;
130 BESDEBUG(MODULE, msg.str() << endl);
131 throw BESInternalError(msg.str(), __FILE__, __LINE__);
132 }
133
134 BESDEBUG(MODULE, prolog << "END (parsing: '" << d_source_url_str << "')" << endl);
135}
136
140void url::parse_query_string() {
141 vector<string> records;
142 string delimiters = "&";
143 BESUtil::tokenize(d_query, records, delimiters);
144 for (const auto &kvp: records) {
145 size_t index = kvp.find('=');
146 if (index != string::npos) {
147 string key = kvp.substr(0, index);
148 string value = kvp.substr(index + 1);
149 BESDEBUG(MODULE, prolog << "key: " << key << " value: " << value << endl);
150
151 const auto &record_it = d_query_kvp.find(key);
152 if (record_it != d_query_kvp.end()) {
153 record_it->second.push_back(value);
154 } else {
155 vector<string> values{value};
156 d_query_kvp[key] = values;
157 }
158 }
159 }
160}
161
167string url::query_parameter_value(const string &key) const {
168 const auto &it = d_query_kvp.find(key);
169 if (it != d_query_kvp.end()) {
170 vector<string> values = it->second;
171 if (!it->second.empty()) {
172 return it->second[0];
173 }
174 }
175 return "";
176}
177
183size_t url::query_parameter_values_size(const std::string &key) const {
184 const auto &it = d_query_kvp.find(key);
185 if (it != d_query_kvp.end()) {
186 return it->second.size();
187 }
188 return 0;
189}
190
198const vector<string> &url::query_parameter_values(const std::string &key) const {
199 const auto &it = d_query_kvp.find(key);
200 if (it != d_query_kvp.end()) {
201 return it->second;
202 }
203 else {
204 throw BESInternalError(string("Key '") + key + "' not found in url::query_parameter_values().", __FILE__, __LINE__);
205 }
206}
207
213{
214 bool stale;
215 std::time_t now = system_clock::to_time_t(system_clock::now());
216
217 BESDEBUG(MODULE, prolog << "now: " << now << endl);
218 // We set the expiration time to the default, in case other avenues don't work out so well.
219 std::time_t expires_time = ingest_time() + HTTP_EFFECTIVE_URL_DEFAULT_EXPIRES_INTERVAL;
220
221 string cf_expires = query_parameter_value(CLOUDFRONT_EXPIRES_HEADER_KEY);
222 string aws_expires_str = query_parameter_value(AMS_EXPIRES_HEADER_KEY);
223
224 if (!cf_expires.empty()) { // CloudFront expires header?
225 std::istringstream(cf_expires) >> expires_time;
226 BESDEBUG(MODULE, prolog << "Using " << CLOUDFRONT_EXPIRES_HEADER_KEY << ": " << expires_time << endl);
227 }
228 else if (!aws_expires_str.empty()) {
229 long long aws_expires;
230 std::istringstream(aws_expires_str) >> aws_expires;
231 // AWS Expires header?
232 //
233 // By default, we'll use the time we made the URL object, ingest_time
234 std::time_t aws_start_time = ingest_time();
235
236 // But if there's an AWS Date we'll parse that and compute the time
237 string aws_date = query_parameter_value(AWS_DATE_HEADER_KEY);
238
239 if (!aws_date.empty()) {
240 // aws_date looks like: 20200624T175046Z
241 string year = aws_date.substr(0, 4);
242 string month = aws_date.substr(4, 2);
243 string day = aws_date.substr(6, 2);
244 string hour = aws_date.substr(9, 2);
245 string minute = aws_date.substr(11, 2);
246 string second = aws_date.substr(13, 2);
247
248 BESDEBUG(MODULE, prolog << "date: " << aws_date <<
249 " year: " << year << " month: " << month << " day: " << day <<
250 " hour: " << hour << " minute: " << minute << " second: " << second << endl);
251
252 std::time_t old_now;
253 time(&old_now); /* get current time; same as: timer = time(NULL) */
254 BESDEBUG(MODULE, prolog << "old_now: " << old_now << endl);
255 struct tm ti{};
256 gmtime_r(&old_now, &ti);
257 ti.tm_year = stoi(year) - 1900;
258 ti.tm_mon = stoi(month) - 1;
259 ti.tm_mday = stoi(day);
260 ti.tm_hour = stoi(hour);
261 ti.tm_min = stoi(minute);
262 ti.tm_sec = stoi(second);
263
264 BESDEBUG(MODULE, prolog << "ti.tm_year: " << ti.tm_year <<
265 " ti.tm_mon: " << ti.tm_mon <<
266 " ti.tm_mday: " << ti.tm_mday <<
267 " ti.tm_hour: " << ti.tm_hour <<
268 " ti.tm_min: " << ti.tm_min <<
269 " ti.tm_sec: " << ti.tm_sec << endl);
270
271 aws_start_time = mktime(&ti);
272 BESDEBUG(MODULE, prolog << "AWS start_time (computed): " << aws_start_time << endl);
273 }
274
275 expires_time = aws_start_time + aws_expires;
276 BESDEBUG(MODULE, prolog << "Using " << AMS_EXPIRES_HEADER_KEY << ": " << aws_expires <<
277 " (expires_time: " << expires_time << ")" << endl);
278 }
279
280 std::time_t remaining = expires_time - now;
281 BESDEBUG(MODULE, prolog << "expires_time: " << expires_time <<
282 " remaining: " << remaining <<
283 " threshold: " << HTTP_URL_REFRESH_THRESHOLD << endl);
284
285 stale = remaining < HTTP_URL_REFRESH_THRESHOLD;
286 BESDEBUG(MODULE, prolog << "stale: " << (stale ? "true" : "false") << endl);
287
288 return stale;
289}
290
295string url::dump(){
296 stringstream ss;
297 string indent = " ";
298
299 ss << "http::url [" << this << "] " << endl;
300 ss << indent << "d_source_url_str: " << d_source_url_str << endl;
301 ss << indent << "d_protocol: " << d_protocol << endl;
302 ss << indent << "d_host: " << d_host << endl;
303 ss << indent << "d_path: " << d_path << endl;
304 ss << indent << "d_query: " << d_query << endl;
305
306 string idt = indent+indent;
307 for(const auto &it: d_query_kvp) {
308 ss << indent << "d_query_kvp["<<it.first<<"]: " << endl;
309 int i = 0;
310 for(const auto &v: it.second) { // second is a vector<string>
311 ss << idt << "value[" << i << "]: " << v << endl;
312 i += 1;
313 }
314 }
315 ss << indent << "d_ingest_time: " << d_ingest_time.time_since_epoch().count() << endl;
316 return ss.str();
317}
318
319} // namespace http
virtual std::string default_catalog_name() const
The name of the default catalog.
virtual std::string get_root() const =0
virtual std::string get_catalog_name() const
Get the name for this catalog.
Definition BESCatalog.h:102
exception thrown if internal error encountered
static void tokenize(const std::string &str, std::vector< std::string > &tokens, const std::string &delimiters="/")
Definition BESUtil.cc:995
static std::string pathConcat(const std::string &firstPart, const std::string &secondPart, char separator='/')
Concatenate path fragments making sure that they are separated by a single '/' character.
Definition BESUtil.cc:754
virtual size_t query_parameter_values_size(const std::string &key) const
Return the number of query string values for a given key.
Definition url_impl.cc:183
virtual std::string query_parameter_value(const std::string &key) const
Get the value of a query string key.
Definition url_impl.cc:167
virtual std::string dump()
Definition url_impl.cc:295
virtual const std::vector< std::string > & query_parameter_values(const std::string &key) const
Get the vector of query string values for a given key.
Definition url_impl.cc:198
virtual bool is_expired()
Definition url_impl.cc:212
utility class for the HTTP catalog module
Definition TheBESKeys.h:51