35#include <BESCatalogList.h>
36#include <BESCatalogUtils.h>
37#include <CatalogItem.h>
39#include "RemoteResource.h"
40#include "HttpdCatalogNames.h"
42#include "HttpdDirScraper.h"
// Message prefix used by the debug/error output in this file. It evaluates to
// "HttpdDirScraper::<function>() - ", where __func__ is the name of the function
// in which the macro is used. A new std::string is built on every use.
47#define prolog std::string("HttpdDirScraper::").append(__func__).append("() - ")
// Opens the httpd_catalog namespace; the matching closing brace is not visible in this listing.
49namespace httpd_catalog {
// Constructor. Populates d_months with the twelve lowercase, three-letter English
// month abbreviations, each mapped to a zero-based month number
// ("jan" -> 0 through "dec" -> 11).
// NOTE(review): the zero-based numbering matches the 0-11 range of tm::tm_mon;
// presumably parse_time_format_A looks month names up in this map -- the lookup
// itself (and whether it lowercases its key first) is not visible in this listing.
// The braces delimiting the body are not shown in this listing.
51HttpdDirScraper::HttpdDirScraper()
54 d_months.insert(pair<string, int>(
string(
"jan"), 0));
55 d_months.insert(pair<string, int>(
string(
"feb"), 1));
56 d_months.insert(pair<string, int>(
string(
"mar"), 2));
57 d_months.insert(pair<string, int>(
string(
"apr"), 3));
58 d_months.insert(pair<string, int>(
string(
"may"), 4));
59 d_months.insert(pair<string, int>(
string(
"jun"), 5));
60 d_months.insert(pair<string, int>(
string(
"jul"), 6));
61 d_months.insert(pair<string, int>(
string(
"aug"), 7));
62 d_months.insert(pair<string, int>(
string(
"sep"), 8));
63 d_months.insert(pair<string, int>(
string(
"oct"), 9));
64 d_months.insert(pair<string, int>(
string(
"nov"), 10));
65 d_months.insert(pair<string, int>(
string(
"dec"), 11));
// Converts a size string into a long.
//
// size_str: the size text; it is taken by value and not modified. The last
//           character is treated as a possible scale suffix.
// Returns:  presumably the parsed number multiplied by the scale -- the scale
//           computation, the multiplication and the return statement are on
//           lines that are not visible in this listing.
72long HttpdDirScraper::get_size_val(
const string size_str)
const
// The final character of the string is the candidate scale suffix.
// NOTE(review): dereferencing rbegin() is undefined for an empty size_str unless
// an emptiness guard exists on a line not shown here -- confirm.
77 char scale_c = *size_str.rbegin();
// 'scale' is declared and computed from scale_c on lines not visible here.
100 BESDEBUG(MODULE, prolog <<
"scale: " << scale << endl);
// When the last character is a letter it is removed so that only the numeric
// part remains.
// NOTE(review): isalpha() is called with a plain char; a negative char value is
// undefined behaviour for <cctype> functions.
102 string result = size_str;
103 if (isalpha(scale_c)) result = size_str.substr(0, size_str.size() - 1);
// atol() parses the leading integer only: text with no leading digits yields 0
// and any fractional part (e.g. "1.5") is discarded.
105 long size = atol(result.c_str());
106 BESDEBUG(MODULE, prolog <<
"raw size: " << size << endl);
// Presumably 'size' has been multiplied by 'scale' just before this message
// (that statement is not visible in this listing).
109 BESDEBUG(MODULE, prolog <<
"scaled size: " << size << endl);
// Debugging helper: writes every field of a struct tm into the stream 'ss',
// one "name: value" pair per line.
//
// tms:     the time structure to describe (passed by value).
// Returns: a string -- presumably the accumulated contents of 'ss'; the
//          declaration of 'ss' and the return statement are not visible here.
116string show_tm_struct(
const tm tms)
119 ss <<
"tm_sec: " << tms.tm_sec << endl;
120 ss <<
"tm_min: " << tms.tm_min << endl;
121 ss <<
"tm_hour: " << tms.tm_hour << endl;
122 ss <<
"tm_mday: " << tms.tm_mday << endl;
123 ss <<
"tm_mon: " << tms.tm_mon << endl;
124 ss <<
"tm_year: " << tms.tm_year << endl;
125 ss <<
"tm_wday: " << tms.tm_wday << endl;
126 ss <<
"tm_yday: " << tms.tm_yday << endl;
127 ss <<
"tm_isdst: " << tms.tm_isdst << endl;
// Takes a struct tm by non-const reference, so it can modify the caller's object.
// NOTE(review): judging by the name this presumably resets every field of 'tms'
// to zero; the body is not visible in this listing -- confirm.
134void zero_tm_struct(tm &tms)
// Converts the date/time text taken from an httpd directory listing into a string.
//
// httpd_time: the raw time text (passed by value).
// Returns:    a string -- judging by the function name an ISO-8601 timestamp;
//             the formatting and return statements are not visible in this listing.
//
// Two layouts are told apart by looking at the second token of the input:
//   Format A - the second token is entirely alphabetic (a month name)
//   Format B - otherwise (a numeric month)
148string HttpdDirScraper::httpd_time_to_iso_8601(
const string httpd_time)
const
// Empty input is special-cased; the action taken is on a line not shown here.
150 if(httpd_time.empty())
// The input is split on '-', ' ' and ':' (the tokenizing call itself is not
// visible in this listing).
153 vector<string> tokens;
154 string delimiters =
"- :";
157 BESDEBUG(MODULE, prolog <<
"Found " << tokens.size() <<
" tokens." << endl);
// Debug dump of every token; 'i' is declared, and 'it' advanced, on lines not shown.
158 vector<string>::iterator it = tokens.begin();
161 while (it != tokens.end()) {
162 BESDEBUG(MODULE, prolog <<
" token["<< i++ <<
"]: "<< *it << endl);
// NOTE(review): tokens[1] is read without a visible check that at least two
// tokens exist -- confirm a guard is present on a line not shown.
167 BESDEBUG(MODULE, prolog <<
"Second Field: "<< tokens[1] << endl);
// is_alpha stays true only if every character of the second token is alphabetic;
// the loop stops at the first non-alphabetic character. An empty second token
// leaves is_alpha == true.
169 const char *second_field = tokens[1].c_str();
170 bool is_alpha =
true;
171 for(
unsigned long i=0; is_alpha && i< tokens[1].size(); i++){
172 is_alpha = isalpha(second_field[i]);
// Format A branch (the branch condition is not visible; presumably 'is_alpha').
// NOTE(review): the debug text says "YYY"; it is probably meant to read "YYYY".
176 BESDEBUG(MODULE, prolog <<
"Detected Time Format A (\"DD-MM-YYY hh:mm\")" << endl);
177 theTime = parse_time_format_A(tokens);
// Format B branch.
180 BESDEBUG(MODULE, prolog <<
"Detected Time Format B (\"YYYY-MM-DD hh:mm\")" << endl);
181 theTime = parse_time_format_B(tokens);
// Builds a time_t from tokens laid out as: day, month, year [, hour, minute].
//
// tokens:  the already-split date/time fields (passed by value, so the whole
//          vector is copied on each call).
// Returns: presumably 'theTime', the result of mktime(); the return statement is
//          not visible in this listing.
192time_t HttpdDirScraper::parse_time_format_A(
const vector<string> tokens)
const
// Date part: needs at least three tokens. 'tm' is declared (and presumably
// zeroed) on lines not shown here.
198 if (tokens.size() > 2) {
199 std::istringstream(tokens[0]) >> tm.tm_mday;
200 BESDEBUG(MODULE, prolog <<
" tm.tm_mday: "<< tm.tm_mday << endl);
// 'mnth' is a (name, number) pair obtained on a line not shown -- presumably a
// d_months lookup of tokens[1]; its .second becomes the month number.
203 BESDEBUG(MODULE, prolog <<
" mnth.first: "<< mnth.first << endl);
204 BESDEBUG(MODULE, prolog <<
" mnth.second: "<< mnth.second << endl);
205 tm.tm_mon = mnth.second;
206 BESDEBUG(MODULE, prolog <<
" tm.tm_mon: "<< tm.tm_mon << endl);
// NOTE(review): struct tm counts years from 1900; an adjustment of the parsed
// value may happen on a line missing from this listing -- confirm.
208 std::istringstream(tokens[2]) >> tm.tm_year;
210 BESDEBUG(MODULE, prolog <<
" tm.tm_year: "<< tm.tm_year << endl);
// Time part is optional: hour and minute are read only when five or more
// tokens are present.
212 if (tokens.size() > 4) {
213 std::istringstream(tokens[3]) >> tm.tm_hour;
214 BESDEBUG(MODULE, prolog <<
" tm.tm_hour: "<< tm.tm_hour << endl);
215 std::istringstream(tokens[4]) >> tm.tm_min;
216 BESDEBUG(MODULE, prolog <<
" tm.tm_min: "<< tm.tm_min << endl);
220 BESDEBUG(MODULE, prolog <<
"tm struct: " << endl << show_tm_struct(tm));
// mktime() interprets the structure as local time.
222 time_t theTime = mktime(&tm);
223 BESDEBUG(MODULE, prolog <<
"theTime: " << theTime << endl);
// Builds a time_t from tokens laid out as: year, month, day [, hour, minute],
// with every field numeric.
//
// tokens:  the already-split date/time fields (passed by value, so the whole
//          vector is copied on each call).
// Returns: presumably 'theTime', the result of mktime(); the return statement is
//          not visible in this listing.
232time_t HttpdDirScraper::parse_time_format_B(
const vector<string> tokens)
const
// Date part: needs at least three tokens. 'tm' is declared on lines not shown here.
236 if (tokens.size() > 2) {
// NOTE(review): struct tm expects years since 1900 and months in 0-11; any
// adjustment of the parsed year and month would be on lines missing from this
// listing -- confirm.
237 std::istringstream(tokens[0]) >> tm.tm_year;
239 BESDEBUG(MODULE, prolog <<
" tm.tm_year: "<< tm.tm_year << endl);
241 std::istringstream(tokens[1]) >> tm.tm_mon;
242 BESDEBUG(MODULE, prolog <<
" tm.tm_mon: "<< tm.tm_mon << endl);
244 std::istringstream(tokens[2]) >> tm.tm_mday;
245 BESDEBUG(MODULE, prolog <<
" tm.tm_mday: "<< tm.tm_mday << endl);
// Time part is optional: hour and minute are read only when five or more
// tokens are present.
247 if (tokens.size() > 4) {
248 std::istringstream(tokens[3]) >> tm.tm_hour;
249 BESDEBUG(MODULE, prolog <<
" tm.tm_hour: "<< tm.tm_hour << endl);
250 std::istringstream(tokens[4]) >> tm.tm_min;
251 BESDEBUG(MODULE, prolog <<
" tm.tm_min: "<< tm.tm_min << endl);
255 BESDEBUG(MODULE, prolog <<
"tm struct: " << endl << show_tm_struct(tm));
// mktime() interprets the structure as local time. The debug label below says
// "ISO-8601 Time" but the value printed is the raw time_t.
257 time_t theTime = mktime(&tm);
258 BESDEBUG(MODULE, prolog <<
"ISO-8601 Time: " << theTime << endl);
// Retrieves the page at 'url', checks that it looks like an httpd index page and
// walks its <a ...>...</a> elements, adding one bes::CatalogItem per accepted
// link to 'items'.
//
// url:   address of the directory page (passed by value).
// items: output map; node entries are keyed by the href minus its last character,
//        leaf entries by the href itself. The items are allocated with 'new' here
//        and stored as raw pointers, so whoever consumes the map must take
//        ownership of them.
// Throws BESInternalError when the locally cached copy of the page cannot be opened.
//
// Several statements (the loop header, branch conditions, declarations of
// 'buffer', 'next_start', 'length', 'time_str', 'size_str', and the setters that
// use 'size' and 'cat_utils') are on lines that are not visible in this listing.
278void HttpdDirScraper::createHttpdDirectoryPageMap(std::string url, std::map<std::string, bes::CatalogItem *> &items)
const
// Utilities of the default BES catalog; their use is not visible in this listing.
280 const BESCatalogUtils *cat_utils = BESCatalogList::TheCatalogList()->find_catalog(BES_DEFAULT_CATALOG)->
get_catalog_utils();
// Retrieve the remote page; it is then read back from the file named by
// rhr.get_filename().
283 std::shared_ptr<http::url> url_ptr(
new http::url(url));
284 http::RemoteResource rhr(url_ptr);
285 rhr.retrieve_resource();
288 ifstream cache_file_is(rhr.get_filename().c_str());
289 if(!cache_file_is.is_open()){
290 string msg = prolog +
"ERROR - Failed to open cache file: " + rhr.get_filename();
291 BESDEBUG(MODULE, msg << endl);
292 throw BESInternalError(msg ,__FILE__, __LINE__ );
// Read the whole file into one string.
295 buffer << cache_file_is.rdbuf();
296 string pageStr = buffer.str();
297 BESDEBUG(MODULE, prolog <<
"Page Content: " << endl << pageStr << endl);
// A page without "<title>Index of " is not treated as an httpd index page
// (the action taken after the message is not visible; presumably an early return).
300 if(pageStr.find(
"<title>Index of ") == string::npos){
302 BESDEBUG(MODULE, prolog <<
"The url: " << url <<
" does not appear to reference an Apache httpd Index page." << endl);
// Markers used while scanning the HTML. tdOpenStr and tdCloseStr are not used
// on any line visible in this listing.
306 string aOpenStr =
"<a ";
307 string aCloseStr =
"</a>";
308 string hrefStr =
"href=\"";
309 string tdOpenStr =
"<td ";
310 string tdCloseStr =
"</td>";
// hrefs rejected: fragment links ("#..."), column-sort links ("?C..."), anything
// containing "redirect/", a bare "/", and hrefs that start with "<img".
312 BESRegex hrefExcludeRegex(
"(^#.*$)|(^\\?C.*$)|(redirect\\/)|(^\\/$)|(^<img.*$)");
// Link text rejected: exactly "Parent Directory".
313 BESRegex nameExcludeRegex(
"^Parent Directory$");
// Locate the next anchor element.
// NOTE(review): std::string::find returns size_t; storing it in an int relies on
// npos converting to a negative value for the "< 0" tests below.
318 int aOpenIndex = pageStr.find(aOpenStr, next_start);
319 if (aOpenIndex < 0) {
323 int aCloseIndex = pageStr.find(aCloseStr, aOpenIndex + aOpenStr.size());
324 if (aCloseIndex < 0) {
331 BESDEBUG(MODULE, prolog <<
"aOpenIndex: " << aOpenIndex << endl);
332 BESDEBUG(MODULE, prolog <<
"aCloseIndex: " << aCloseIndex << endl);
// aElemStr is the complete element, from "<a " through "</a>".
333 length = aCloseIndex + aCloseStr.size() - aOpenIndex;
334 string aElemStr = pageStr.substr(aOpenIndex, length);
335 BESDEBUG(MODULE, prolog <<
"Processing link: " << aElemStr << endl);
// Link text: everything between the first '>' and the next '<'.
338 int start = aElemStr.find(
">") + 1;
339 int end = aElemStr.find(
"<", start);
340 length = end - start;
341 string linkText = aElemStr.substr(start, length);
342 BESDEBUG(MODULE, prolog <<
"Link Text: " << linkText << endl);
// href value: the text between href=" and the next double quote.
// NOTE(review): there is no visible check that "href=\"" was actually found.
345 start = aElemStr.find(hrefStr) + hrefStr.size();
346 end = aElemStr.find(
"\"", start);
347 length = end - start;
348 string href = aElemStr.substr(start, length);
349 BESDEBUG(MODULE, prolog <<
"href: " << href << endl);
// The text of the next two "<td " elements after the anchor is taken as the
// time and the size of the entry, in that order.
353 int start_pos = getNextElementText(pageStr,
"td", aCloseIndex + aCloseStr.size(), time_str);
354 BESDEBUG(MODULE, prolog <<
"time_str: '" << time_str <<
"'" << endl);
358 start_pos = getNextElementText(pageStr,
"td", start_pos, size_str);
359 BESDEBUG(MODULE, prolog <<
"size_str: '" << size_str <<
"'" << endl);
// Skip links whose text contains an image tag, is empty, or contains "<<<" or ">>>".
361 if ((linkText.find(
"<img") != string::npos) || !(linkText.size()) || (linkText.find(
"<<<") != string::npos)
362 || (linkText.find(
">>>") != string::npos)) {
363 BESDEBUG(MODULE, prolog <<
"SKIPPING(image|copy|<<<|>>>): " << aElemStr << endl);
// Skip empty hrefs and absolute http(s) hrefs that do not begin with 'url'.
366 if (href.size() == 0 || (((href.find(
"http://") == 0) || (href.find(
"https://") == 0)) && !(href.find(url) == 0))) {
368 BESDEBUG(MODULE, prolog <<
"SKIPPING(null or remote): " << href << endl);
370 else if (hrefExcludeRegex.match(href.c_str(), href.size(), 0) > 0) {
372 BESDEBUG(MODULE, prolog <<
"SKIPPING(hrefExcludeRegex) - href: '" << href <<
"'"<< endl);
374 else if (nameExcludeRegex.match(linkText.c_str(), linkText.size(), 0) > 0) {
376 BESDEBUG(MODULE, prolog <<
"SKIPPING(nameExcludeRegex) - name: '" << linkText <<
"'" << endl);
// Node (directory) entry. The branch condition is not visible; presumably the
// href ends with '/', which is why its last character is dropped for the name.
379 string node_name = href.substr(0, href.size() - 1);
381 BESDEBUG(MODULE, prolog <<
"NODE: " << node_name << endl);
382 bes::CatalogItem *childNode =
new bes::CatalogItem();
383 childNode->
set_type(CatalogItem::node);
386 string iso_8601_time = httpd_time_to_iso_8601(time_str);
387 childNode->
set_lmt(iso_8601_time);
389 long size = get_size_val(size_str);
// NOTE(review): map::insert does nothing when the key already exists; in that
// case the item allocated above would not be stored anywhere.
392 items.insert(pair<std::string, bes::CatalogItem *>(node_name, childNode));
// Leaf (file) entry, keyed by the unmodified href.
396 BESDEBUG(MODULE, prolog <<
"LEAF: " << href << endl);
397 CatalogItem *leafItem =
new CatalogItem();
398 leafItem->
set_type(CatalogItem::leaf);
401 string iso_8601_time = httpd_time_to_iso_8601(time_str);
402 leafItem->
set_lmt(iso_8601_time);
403 long size = get_size_val(size_str);
406 items.insert(pair<std::string, bes::CatalogItem *>(href, leafItem));
// Continue scanning just past the "</a>" that was processed.
410 next_start = aCloseIndex + aCloseStr.size();
// Finds the next "<element_name ...>...</element_name>" element in page_str at or
// after startIndex and stores its leading text in resultText.
//
// page_str:     the text to search.
// element_name: tag name without angle brackets. The opening pattern built below
//               ends with a space, so only tags written with at least one
//               attribute ("<td align=...>") match; a bare "<td>" does not.
// startIndex:   position in page_str at which the search begins.
// resultText:   output; the text between the element's first '>' and the next '<'.
// trim:         not used on any line visible in this listing -- presumably
//               controls whitespace trimming of resultText.
// Returns:      startIndex plus the length of the matched element (see the note
//               on the return line). The value returned when nothing is found is
//               not visible in this listing.
427int HttpdDirScraper::getNextElementText(
const string &page_str,
const string element_name,
int startIndex,
string &resultText,
bool trim)
const
429 string e_open_str =
"<" + element_name +
" ";
430 string e_close_str =
"</" + element_name +
">";
// NOTE(review): find() returns size_t; the "< 0" tests rely on npos converting
// to a negative int. The second find is evaluated even when the first one failed.
433 int start = page_str.find(e_open_str, startIndex);
434 int end = page_str.find(e_close_str, start + e_open_str.size());
// Not-found / malformed case; the action taken is on lines not shown here.
435 if(start<0 || end<0 || end<start){
// element_str is the whole element, opening tag through closing tag.
440 int length = end + e_close_str.size() - start;
441 string element_str = page_str.substr(start, length);
// Text content: from just after the first '>' up to the next '<'.
444 start = element_str.find(
">") + 1;
445 end = element_str.find(
"<", start);
446 length = end - start;
447 resultText = element_str.substr(start, length);
451 BESDEBUG(MODULE, prolog <<
"resultText: '" << resultText <<
"'" << endl);
// NOTE(review): the result is measured from startIndex, not from the position
// where the element was actually found, so when the element starts after
// startIndex the returned index falls short of the element's end -- confirm
// that this is intended.
452 return startIndex + element_str.size();
// Builds a bes::CatalogNode for 'url'.
//
// url:  address to describe.
// path: passed to the CatalogNode constructor.
// Returns: a CatalogNode allocated with 'new' -- presumably 'node'; the return
//          statement is not visible in this listing. The caller would own it.
//
// Two cases are visible: the page is scraped and every item found is added to the
// node, or the url is treated as a single leaf. The condition choosing between
// them is on a line not shown here.
462bes::CatalogNode *HttpdDirScraper::get_node(
const string &url,
const string &path)
const
464 BESDEBUG(MODULE, prolog <<
"Processing url: '" << url <<
"'"<< endl);
// NOTE(review): 'node' is a raw owning pointer; createHttpdDirectoryPageMap below
// can throw BESInternalError, and no visible code releases 'node' in that case.
465 bes::CatalogNode *node =
new bes::CatalogNode(path);
// Directory case: collect the page's items, then hand each one to the node as
// either a child node or a leaf, according to its type.
469 map<string, bes::CatalogItem *> items;
470 createHttpdDirectoryPageMap(url, items);
472 BESDEBUG(MODULE, prolog <<
"Found " << items.size() <<
" items." << endl);
// 'it' is presumably set to items.begin() and advanced on lines not shown here.
473 map<string, bes::CatalogItem *>::iterator it;
475 while (it != items.end()) {
476 bes::CatalogItem *item = it->second;
477 BESDEBUG(MODULE, prolog <<
"Adding item: '" << item->
get_name() <<
"'"<< endl);
478 if (item->
get_type() == CatalogItem::node)
479 node->add_node(item);
481 node->add_leaf(item);
// Single-item case: the last '/'-separated component of the url is the leaf name.
487 const BESCatalogUtils *cat_utils = BESCatalogList::TheCatalogList()->find_catalog(BES_DEFAULT_CATALOG)->
get_catalog_utils();
488 std::vector<std::string> url_parts =
BESUtil::split(url,
'/',
true);
// NOTE(review): back() on an empty vector is undefined; confirm that 'url' can
// never split into zero parts.
489 string leaf_name = url_parts.back();
// The item's properties are set on lines not shown here before it becomes the
// node's leaf.
491 CatalogItem *item =
new CatalogItem();
502 node->set_leaf(item);
// Second definition of get_node with the same signature as the one before it.
// NOTE(review): two definitions with identical signatures cannot both be compiled;
// presumably this one is disabled by preprocessor lines not visible in this
// listing -- confirm. It calls a three-argument createHttpdDirectoryPageMap that
// is not defined on any visible line.
//
// url:  address to describe.
// path: passed to the CatalogNode constructor.
// Returns: a CatalogNode allocated with 'new' -- presumably 'node'; the return
//          statement is not visible in this listing.
509bes::CatalogNode *HttpdDirScraper::get_node(
const string &url,
const string &path)
const
511 BESDEBUG(MODULE, prolog <<
"Processing url: '" << url <<
"'"<< endl);
512 bes::CatalogNode *node =
new bes::CatalogNode(path);
// Directory case: node names and leaf names are collected into separate sets.
516 set<string> pageNodes;
517 set<string> pageLeaves;
518 createHttpdDirectoryPageMap(url, pageNodes, pageLeaves);
520 BESDEBUG(MODULE, prolog <<
"Found " << pageNodes.size() <<
" nodes." << endl);
521 BESDEBUG(MODULE, prolog <<
"Found " << pageLeaves.size() <<
" leaves." << endl);
523 set<string>::iterator it;
// One CatalogItem of type 'node' per entry, with a trailing "/" removed from the
// name first. The remaining setters and the iterator increment are not visible.
525 it = pageNodes.begin();
526 while (it != pageNodes.end()) {
527 string pageNode = *it;
528 if (
BESUtil::endsWith(pageNode,
"/")) pageNode = pageNode.substr(0, pageNode.size() - 1);
530 bes::CatalogItem *childNode =
new bes::CatalogItem();
531 childNode->
set_type(CatalogItem::node);
542 node->add_node(childNode);
// One CatalogItem of type 'leaf' per entry.
546 it = pageLeaves.begin();
547 while (it != pageLeaves.end()) {
549 CatalogItem *leafItem =
new CatalogItem();
550 leafItem->
set_type(CatalogItem::leaf);
562 node->add_leaf(leafItem);
// Single-item case: the last '/'-separated component of the url is the leaf name.
// NOTE(review): back() on an empty vector is undefined -- confirm 'url' cannot
// split into zero parts.
567 std::vector<std::string> url_parts =
BESUtil::split(url,
'/',
true);
568 string leaf_name = url_parts.back();
570 CatalogItem *item =
new CatalogItem();
579 node->set_leaf(item);
bool is_data(const std::string &item) const
Is there a handler that can process this item?
virtual BESCatalogUtils * get_catalog_utils() const
Get a pointer to the utilities, customized for this catalog.
static bool IsSet(const std::string &flagName)
see if the debug context flagName is set to true
static std::vector< std::string > split(const std::string &s, char delim='/', bool skip_empty=true)
Splits the string s into the return vector of tokens using the delimiter delim and skipping empty values.
static bool endsWith(std::string const &fullString, std::string const &ending)
static void tokenize(const std::string &str, std::vector< std::string > &tokens, const std::string &delimiters="/")
static std::string lowercase(const std::string &s)
static void removeLeadingAndTrailingBlanks(std::string &key)
static std::string get_time(bool use_local_time=false)
void set_lmt(const std::string &lmt)
Set the LMT for this item.
std::string get_name() const
The name of this item in the node.
void set_size(size_t s)
Set the size of the item.
void set_is_data(bool id)
Is this item data that the BES should interpret?
item_type get_type() const
Get the type of this item (unknown, node or leaf)
void set_type(item_type t)
Set the type for this item.
void set_name(const std::string &n)
Set the name of the item.