bes Updated for version 3.21.1
The Backend Server (BES) is the lower two tiers of the Hyrax data server
HttpdDirScraper.cc
1// -*- mode: c++; c-basic-offset:4 -*-
2//
3// This file is part of httpd_catalog_module, A C++ module that can be loaded in to
4// the OPeNDAP Back-End Server (BES) and is able to handle remote requests.
5//
6// Copyright (c) 2018 OPeNDAP, Inc.
7// Author: Nathan Potter <ndp@opendap.org>
8//
9// This library is free software; you can redistribute it and/or
10// modify it under the terms of the GNU Lesser General Public
11// License as published by the Free Software Foundation; either
12// version 2.1 of the License, or (at your option) any later version.
13//
14// This library is distributed in the hope that it will be useful,
15// but WITHOUT ANY WARRANTY; without even the implied warranty of
16// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17// Lesser General Public License for more details.
18//
19// You should have received a copy of the GNU Lesser General Public
20// License along with this library; if not, write to the Free Software
21// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22//
23// You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
24
25#include <iostream>
26#include <fstream>
27#include <sstream>
28#include <stdlib.h> /* atol */
29#include <ctype.h> /* isalpha and isdigit */
30#include <time.h> /* mktime */
31
32#include <BESDebug.h>
33#include <BESUtil.h>
34#include <BESRegex.h>
35#include <BESCatalogList.h>
36#include <BESCatalogUtils.h>
37#include <CatalogItem.h>
38
39#include "RemoteResource.h"
40#include "HttpdCatalogNames.h"
41
42#include "HttpdDirScraper.h"
43
44using namespace std;
46
47#define prolog std::string("HttpdDirScraper::").append(__func__).append("() - ")
48
49namespace httpd_catalog {
50
51HttpdDirScraper::HttpdDirScraper()
52{
53 // There was probably a better way to make this association but this worked.
54 d_months.insert(pair<string, int>(string("jan"), 0));
55 d_months.insert(pair<string, int>(string("feb"), 1));
56 d_months.insert(pair<string, int>(string("mar"), 2));
57 d_months.insert(pair<string, int>(string("apr"), 3));
58 d_months.insert(pair<string, int>(string("may"), 4));
59 d_months.insert(pair<string, int>(string("jun"), 5));
60 d_months.insert(pair<string, int>(string("jul"), 6));
61 d_months.insert(pair<string, int>(string("aug"), 7));
62 d_months.insert(pair<string, int>(string("sep"), 8));
63 d_months.insert(pair<string, int>(string("oct"), 9));
64 d_months.insert(pair<string, int>(string("nov"), 10));
65 d_months.insert(pair<string, int>(string("dec"), 11));
66}
67
68/*
69 * @brief Converts an Apache httpd directory page "size" string (23K, 45M, 32G, etc)
70 * to an actual value, approximate though it may be.
71 */
72long HttpdDirScraper::get_size_val(const string size_str) const
73{
74 if(size_str.empty())
75 return 0;
76
77 char scale_c = *size_str.rbegin();
78 long scale = 1;
79
80 switch (scale_c) {
81 case 'K':
82 scale = 1e3;
83 break;
84 case 'M':
85 scale = 1e6;
86 break;
87 case 'G':
88 scale = 1e9;
89 break;
90 case 'T':
91 scale = 1e12;
92 break;
93 case 'P':
94 scale = 1e15;
95 break;
96 default:
97 scale = 1;
98 break;
99 }
100 BESDEBUG(MODULE, prolog << "scale: " << scale << endl);
101
102 string result = size_str;
103 if (isalpha(scale_c)) result = size_str.substr(0, size_str.size() - 1);
104
105 long size = atol(result.c_str());
106 BESDEBUG(MODULE, prolog << "raw size: " << size << endl);
107
108 size *= scale;
109 BESDEBUG(MODULE, prolog << "scaled size: " << size << endl);
110 return size;
111}
112
/**
 * @brief Renders the nine standard fields of a struct tm as text, one "name: value" pair per line.
 *
 * @param tms The broken-down time to print.
 * @return The formatted, newline-terminated listing.
 */
std::string show_tm_struct(const tm tms)
{
    std::ostringstream dump;
    dump << "tm_sec: " << tms.tm_sec << '\n'
         << "tm_min: " << tms.tm_min << '\n'
         << "tm_hour: " << tms.tm_hour << '\n'
         << "tm_mday: " << tms.tm_mday << '\n'
         << "tm_mon: " << tms.tm_mon << '\n'
         << "tm_year: " << tms.tm_year << '\n'
         << "tm_wday: " << tms.tm_wday << '\n'
         << "tm_yday: " << tms.tm_yday << '\n'
         << "tm_isdst: " << tms.tm_isdst << '\n';
    return dump.str();
}
130
/**
 * @brief Resets the nine standard fields of a struct tm.
 *
 * Every field is set to 0 except tm_mday, which is set to 1. Only these
 * standard fields are touched.
 *
 * @param tms The structure to reset.
 */
void zero_tm_struct(tm &tms)
{
    // Time of day
    tms.tm_sec = tms.tm_min = tms.tm_hour = 0;

    // Calendar date: day of month 1, month 0, year 0
    tms.tm_mday = 1;
    tms.tm_mon = tms.tm_year = 0;

    // Derived fields and the DST flag
    tms.tm_wday = tms.tm_yday = 0;
    tms.tm_isdst = 0;
}
146
147
148string HttpdDirScraper::httpd_time_to_iso_8601(const string httpd_time) const
149{
150 if(httpd_time.empty())
151 return httpd_time;
152
153 vector<string> tokens;
154 string delimiters = "- :";
155 BESUtil::tokenize(httpd_time, tokens, delimiters);
156
157 BESDEBUG(MODULE, prolog << "Found " << tokens.size() << " tokens." << endl);
158 vector<string>::iterator it = tokens.begin();
159 int i = 0;
160 if (BESDebug::IsSet(MODULE)) {
161 while (it != tokens.end()) {
162 BESDEBUG(MODULE, prolog << " token["<< i++ << "]: "<< *it << endl);
163 it++;
164 }
165 }
166
167 BESDEBUG(MODULE, prolog << "Second Field: "<< tokens[1] << endl);
168
169 const char *second_field = tokens[1].c_str();
170 bool is_alpha = true;
171 for(unsigned long i=0; is_alpha && i< tokens[1].size(); i++){
172 is_alpha = isalpha(second_field[i]);
173 }
174 time_t theTime;
175 if(is_alpha){
176 BESDEBUG(MODULE, prolog << "Detected Time Format A (\"DD-MM-YYY hh:mm\")" << endl);
177 theTime = parse_time_format_A(tokens);
178 }
179 else {
180 BESDEBUG(MODULE, prolog << "Detected Time Format B (\"YYYY-MM-DD hh:mm\")" << endl);
181 theTime = parse_time_format_B(tokens);
182 }
183 return BESUtil::get_time(theTime, false);
184
185}
186
192time_t HttpdDirScraper::parse_time_format_A(const vector<string> tokens) const
193{
194 // void BESUtil::tokenize(const string& str, vector<string>& tokens, const string& delimiters)
195 struct tm tm{};
196 // jhrg 2/2/24 zero_tm_struct(tm);
197
198 if (tokens.size() > 2) {
199 std::istringstream(tokens[0]) >> tm.tm_mday;
200 BESDEBUG(MODULE, prolog << " tm.tm_mday: "<< tm.tm_mday << endl);
201
202 pair<string, int> mnth = *d_months.find(BESUtil::lowercase(tokens[1]));
203 BESDEBUG(MODULE, prolog << " mnth.first: "<< mnth.first << endl);
204 BESDEBUG(MODULE, prolog << " mnth.second: "<< mnth.second << endl);
205 tm.tm_mon = mnth.second;
206 BESDEBUG(MODULE, prolog << " tm.tm_mon: "<< tm.tm_mon << endl);
207
208 std::istringstream(tokens[2]) >> tm.tm_year;
209 tm.tm_year -= 1900;
210 BESDEBUG(MODULE, prolog << " tm.tm_year: "<< tm.tm_year << endl);
211
212 if (tokens.size() > 4) {
213 std::istringstream(tokens[3]) >> tm.tm_hour;
214 BESDEBUG(MODULE, prolog << " tm.tm_hour: "<< tm.tm_hour << endl);
215 std::istringstream(tokens[4]) >> tm.tm_min;
216 BESDEBUG(MODULE, prolog << " tm.tm_min: "<< tm.tm_min << endl);
217 }
218 }
219
220 BESDEBUG(MODULE, prolog << "tm struct: " << endl << show_tm_struct(tm));
221
222 time_t theTime = mktime(&tm);
223 BESDEBUG(MODULE, prolog << "theTime: " << theTime << endl);
224 return theTime;
225}
226
232time_t HttpdDirScraper::parse_time_format_B(const vector<string> tokens) const
233{
234 // void BESUtil::tokenize(const string& str, vector<string>& tokens, const string& delimiters)
235 struct tm tm{};
236 if (tokens.size() > 2) {
237 std::istringstream(tokens[0]) >> tm.tm_year;
238 tm.tm_year -= 1900;
239 BESDEBUG(MODULE, prolog << " tm.tm_year: "<< tm.tm_year << endl);
240
241 std::istringstream(tokens[1]) >> tm.tm_mon;
242 BESDEBUG(MODULE, prolog << " tm.tm_mon: "<< tm.tm_mon << endl);
243
244 std::istringstream(tokens[2]) >> tm.tm_mday;
245 BESDEBUG(MODULE, prolog << " tm.tm_mday: "<< tm.tm_mday << endl);
246
247 if (tokens.size() > 4) {
248 std::istringstream(tokens[3]) >> tm.tm_hour;
249 BESDEBUG(MODULE, prolog << " tm.tm_hour: "<< tm.tm_hour << endl);
250 std::istringstream(tokens[4]) >> tm.tm_min;
251 BESDEBUG(MODULE, prolog << " tm.tm_min: "<< tm.tm_min << endl);
252 }
253 }
254
255 BESDEBUG(MODULE, prolog << "tm struct: " << endl << show_tm_struct(tm));
256
257 time_t theTime = mktime(&tm);
258 BESDEBUG(MODULE, prolog << "ISO-8601 Time: " << theTime << endl);
259 return theTime;
260}
261
278void HttpdDirScraper::createHttpdDirectoryPageMap(std::string url, std::map<std::string, bes::CatalogItem *> &items) const
279{
280 const BESCatalogUtils *cat_utils = BESCatalogList::TheCatalogList()->find_catalog(BES_DEFAULT_CATALOG)->get_catalog_utils();
281
282 // Go get the text from the remote resource
283 std::shared_ptr<http::url> url_ptr(new http::url(url));
284 http::RemoteResource rhr(url_ptr);
285 rhr.retrieve_resource();
286 stringstream buffer;
287
288 ifstream cache_file_is(rhr.get_filename().c_str());
289 if(!cache_file_is.is_open()){
290 string msg = prolog + "ERROR - Failed to open cache file: " + rhr.get_filename();
291 BESDEBUG(MODULE, msg << endl);
292 throw BESInternalError(msg ,__FILE__, __LINE__ );
293 }
294
295 buffer << cache_file_is.rdbuf();
296 string pageStr = buffer.str();
297 BESDEBUG(MODULE, prolog << "Page Content: " << endl << pageStr << endl);
298
299 // Does it look like an Apache httpd Index listing?
300 if(pageStr.find("<title>Index of ") == string::npos){
301 // Nope. Time to leave.
302 BESDEBUG(MODULE, prolog << "The url: " << url << " does not appear to reference an Apache httpd Index page." << endl);
303 return;
304 }
305
306 string aOpenStr = "<a ";
307 string aCloseStr = "</a>";
308 string hrefStr = "href=\"";
309 string tdOpenStr = "<td ";
310 string tdCloseStr = "</td>";
311
312 BESRegex hrefExcludeRegex("(^#.*$)|(^\\?C.*$)|(redirect\\/)|(^\\/$)|(^<img.*$)");
313 BESRegex nameExcludeRegex("^Parent Directory$");
314
315 bool done = false;
316 int next_start = 0;
317 while (!done) {
318 int aOpenIndex = pageStr.find(aOpenStr, next_start);
319 if (aOpenIndex < 0) {
320 done = true;
321 }
322 else {
323 int aCloseIndex = pageStr.find(aCloseStr, aOpenIndex + aOpenStr.size());
324 if (aCloseIndex < 0) {
325 done = true;
326 }
327 else {
328 int length;
329
330 // Locate the entire <a /> element
331 BESDEBUG(MODULE, prolog << "aOpenIndex: " << aOpenIndex << endl);
332 BESDEBUG(MODULE, prolog << "aCloseIndex: " << aCloseIndex << endl);
333 length = aCloseIndex + aCloseStr.size() - aOpenIndex;
334 string aElemStr = pageStr.substr(aOpenIndex, length);
335 BESDEBUG(MODULE, prolog << "Processing link: " << aElemStr << endl);
336
337 // Find the link text
338 int start = aElemStr.find(">") + 1;
339 int end = aElemStr.find("<", start);
340 length = end - start;
341 string linkText = aElemStr.substr(start, length);
342 BESDEBUG(MODULE, prolog << "Link Text: " << linkText << endl);
343
344 // Locate the href attribute
345 start = aElemStr.find(hrefStr) + hrefStr.size();
346 end = aElemStr.find("\"", start);
347 length = end - start;
348 string href = aElemStr.substr(start, length);
349 BESDEBUG(MODULE, prolog << "href: " << href << endl);
350
351 // attempt to get time string
352 string time_str;
353 int start_pos = getNextElementText(pageStr, "td", aCloseIndex + aCloseStr.size(), time_str);
354 BESDEBUG(MODULE, prolog << "time_str: '" << time_str << "'" << endl);
355
356 // attempt to get size string
357 string size_str;
358 start_pos = getNextElementText(pageStr, "td", start_pos, size_str);
359 BESDEBUG(MODULE, prolog << "size_str: '" << size_str << "'" << endl);
360
361 if ((linkText.find("<img") != string::npos) || !(linkText.size()) || (linkText.find("<<<") != string::npos)
362 || (linkText.find(">>>") != string::npos)) {
363 BESDEBUG(MODULE, prolog << "SKIPPING(image|copy|<<<|>>>): " << aElemStr << endl);
364 }
365 else {
366 if (href.size() == 0 || (((href.find("http://") == 0) || (href.find("https://") == 0)) && !(href.find(url) == 0))) {
367 // SKIPPING
368 BESDEBUG(MODULE, prolog << "SKIPPING(null or remote): " << href << endl);
369 }
370 else if (hrefExcludeRegex.match(href.c_str(), href.size(), 0) > 0) {
371 // SKIPPING
372 BESDEBUG(MODULE, prolog << "SKIPPING(hrefExcludeRegex) - href: '" << href << "'"<< endl);
373 }
374 else if (nameExcludeRegex.match(linkText.c_str(), linkText.size(), 0) > 0) {
375 // SKIPPING
376 BESDEBUG(MODULE, prolog << "SKIPPING(nameExcludeRegex) - name: '" << linkText << "'" << endl);
377 }
378 else if (BESUtil::endsWith(href, "/")) {
379 string node_name = href.substr(0, href.size() - 1);
380 // it's a directory aka a node
381 BESDEBUG(MODULE, prolog << "NODE: " << node_name << endl);
382 bes::CatalogItem *childNode = new bes::CatalogItem();
383 childNode->set_type(CatalogItem::node);
384 childNode->set_name(node_name);
385 childNode->set_is_data(false);
386 string iso_8601_time = httpd_time_to_iso_8601(time_str);
387 childNode->set_lmt(iso_8601_time);
388 // FIXME: For nodes the size should be the number of children, but how without crawling?
389 long size = get_size_val(size_str);
390 childNode->set_size(size);
391
392 items.insert(pair<std::string, bes::CatalogItem *>(node_name, childNode));
393 }
394 else {
395 // It's a file aka a leaf
396 BESDEBUG(MODULE, prolog << "LEAF: " << href << endl);
397 CatalogItem *leafItem = new CatalogItem();
398 leafItem->set_type(CatalogItem::leaf);
399 leafItem->set_name(href);
400 leafItem->set_is_data(cat_utils->is_data(href));
401 string iso_8601_time = httpd_time_to_iso_8601(time_str);
402 leafItem->set_lmt(iso_8601_time);
403 long size = get_size_val(size_str);
404 leafItem->set_size(size);
405
406 items.insert(pair<std::string, bes::CatalogItem *>(href, leafItem));
407 }
408 }
409 }
410 next_start = aCloseIndex + aCloseStr.size();
411 }
412 }
413}
414
427int HttpdDirScraper::getNextElementText(const string &page_str, const string element_name, int startIndex, string &resultText, bool trim) const
428{
429 string e_open_str = "<" + element_name + " ";
430 string e_close_str = "</" + element_name + ">";
431
432 // Locate the next "element_name" element
433 int start = page_str.find(e_open_str, startIndex);
434 int end = page_str.find(e_close_str, start + e_open_str.size());
435 if(start<0 || end<0 || end<start){
436 resultText="";
437 return startIndex;
438 }
439
440 int length = end + e_close_str.size() - start;
441 string element_str = page_str.substr(start, length);
442
443 // Find the text
444 start = element_str.find(">") + 1;
445 end = element_str.find("<", start);
446 length = end - start;
447 resultText = element_str.substr(start, length);
448
449 if (trim) BESUtil::removeLeadingAndTrailingBlanks(resultText);
450
451 BESDEBUG(MODULE, prolog << "resultText: '" << resultText << "'" << endl);
452 return startIndex + element_str.size();
453}
454
455/*
456 * @brief Returns the catalog node represented by the httpd directory page returned
457 * by dereferencing the passed url.
458 * @param url The url of the Apache httpd directory to process.
459 * @param path The path prefix that associates the location of this generated CatalogNode with it's
460 * correct position in the local service path.
461 */
462bes::CatalogNode *HttpdDirScraper::get_node(const string &url, const string &path) const
463{
464 BESDEBUG(MODULE, prolog << "Processing url: '" << url << "'"<< endl);
465 bes::CatalogNode *node = new bes::CatalogNode(path);
466
467 if (BESUtil::endsWith(url, "/")) {
468 // This always means the URL points to a node when coming from httpd
469 map<string, bes::CatalogItem *> items;
470 createHttpdDirectoryPageMap(url, items);
471
472 BESDEBUG(MODULE, prolog << "Found " << items.size() << " items." << endl);
473 map<string, bes::CatalogItem *>::iterator it;
474 it = items.begin();
475 while (it != items.end()) {
476 bes::CatalogItem *item = it->second;
477 BESDEBUG(MODULE, prolog << "Adding item: '" << item->get_name() << "'"<< endl);
478 if (item->get_type() == CatalogItem::node)
479 node->add_node(item);
480 else
481 node->add_leaf(item);
482 it++;
483 }
484 }
485 else {
486 // It's a leaf aka "item" response.
487 const BESCatalogUtils *cat_utils = BESCatalogList::TheCatalogList()->find_catalog(BES_DEFAULT_CATALOG)->get_catalog_utils();
488 std::vector<std::string> url_parts = BESUtil::split(url, '/', true);
489 string leaf_name = url_parts.back();
490
491 CatalogItem *item = new CatalogItem();
492 item->set_type(CatalogItem::leaf);
493 item->set_name(leaf_name);
494 item->set_is_data(cat_utils->is_data(leaf_name));
495
496 // FIXME: Find the Last Modified date? Head??
497 item->set_lmt(BESUtil::get_time(true));
498
499 // FIXME: Determine size of this thing? Do we "HEAD" all the leaves?
500 item->set_size(1);
501
502 node->set_leaf(item);
503 }
504 return node;
505}
506
#if 0

// Disabled earlier version of get_node(), excluded from the build. It calls a
// three-argument createHttpdDirectoryPageMap() that fills two sets of names
// (nodes and leaves) and stamps every item with the current time.
bes::CatalogNode *HttpdDirScraper::get_node(const string &url, const string &path) const
{
    BESDEBUG(MODULE, prolog << "Processing url: '" << url << "'"<< endl);
    bes::CatalogNode *node = new bes::CatalogNode(path);

    if (!BESUtil::endsWith(url, "/")) {
        // A single leaf: name it after the last path segment of the url.
        std::vector<std::string> url_parts = BESUtil::split(url,'/',true);
        string leaf_name = url_parts.back();

        CatalogItem *item = new CatalogItem();
        item->set_type(CatalogItem::leaf);
        item->set_name(leaf_name);
        // FIXME: Find the Last Modified date?
        item->set_lmt(BESUtil::get_time(true));

        // FIXME: Determine size of this thing? Do we "HEAD" all the leaves?
        item->set_size(1);

        node->set_leaf(item);
        return node;
    }

    set<string> pageNodes;
    set<string> pageLeaves;
    createHttpdDirectoryPageMap(url, pageNodes, pageLeaves);

    BESDEBUG(MODULE, prolog << "Found " << pageNodes.size() << " nodes." << endl);
    BESDEBUG(MODULE, prolog << "Found " << pageLeaves.size() << " leaves." << endl);

    // Child directories
    for (set<string>::iterator n = pageNodes.begin(); n != pageNodes.end(); n++) {
        string pageNode = *n;
        if (BESUtil::endsWith(pageNode, "/")) pageNode = pageNode.substr(0, pageNode.size() - 1);

        bes::CatalogItem *childNode = new bes::CatalogItem();
        childNode->set_type(CatalogItem::node);
        childNode->set_name(pageNode);
        childNode->set_is_data(false);

        // FIXME: Figure out the LMT if we can... HEAD?
        childNode->set_lmt(BESUtil::get_time(true));

        // FIXME: For nodes the size should be the number of children, but how without crawling?
        childNode->set_size(0);

        node->add_node(childNode);
    }

    // Files
    for (set<string>::iterator l = pageLeaves.begin(); l != pageLeaves.end(); l++) {
        string leaf = *l;
        CatalogItem *leafItem = new CatalogItem();
        leafItem->set_type(CatalogItem::leaf);
        leafItem->set_name(leaf);

        // FIXME: wrangle up the Typematch and see if we think this thing is data or not.
        leafItem->set_is_data(false);

        // FIXME: Find the Last Modified date?
        leafItem->set_lmt(BESUtil::get_time(true));

        // FIXME: Determine size of this thing? Do we "HEAD" all the leaves?
        leafItem->set_size(1);

        node->add_leaf(leafItem);
    }

    return node;
}
#endif
586
587}
588 // namespace httpd_catalog
589
bool is_data(const std::string &item) const
Is there a handler that can process this item?
virtual BESCatalogUtils * get_catalog_utils() const
Get a pointer to the utilities, customized for this catalog.
Definition BESCatalog.h:112
static bool IsSet(const std::string &flagName)
see if the debug context flagName is set to true
Definition BESDebug.h:145
static std::vector< std::string > split(const std::string &s, char delim='/', bool skip_empty=true)
Splits the string s into the returned vector of tokens using the delimiter delim, skipping empty values when skip_empty is true.
Definition BESUtil.cc:1068
static bool endsWith(std::string const &fullString, std::string const &ending)
Definition BESUtil.cc:837
static void tokenize(const std::string &str, std::vector< std::string > &tokens, const std::string &delimiters="/")
Definition BESUtil.cc:995
static std::string lowercase(const std::string &s)
Definition BESUtil.cc:257
static void removeLeadingAndTrailingBlanks(std::string &key)
Definition BESUtil.cc:448
static std::string get_time(bool use_local_time=false)
Definition BESUtil.cc:1017
void set_lmt(const std::string &lmt)
Set the LMT for this item.
std::string get_name() const
The name of this item in the node.
void set_size(size_t s)
Set the size of the item.
void set_is_data(bool id)
Is this item data that the BES should interpret?
item_type get_type() const
Get the type of this item (unknown, node or leaf)
void set_type(item_type t)
Set the type for this item.
void set_name(const std::string &n)
Set the name of the item.