bes Updated for version 3.21.1
The Backend Server (BES) is the lower two tiers of the Hyrax data server
FileCache.h
1// FileCache.h
2
3// This file was originally part of bes, A C++ back-end server
4// implementation framework for the OPeNDAP Data Access Protocol.
5// Copied to libdap. This is used to cache responses built from
6// functional CE expressions.
7
8// Copyright (c) 2023 OPeNDAP, Inc
9// Author: James Gallagher <jgallagher@opendap.org>
10//
11// This library is free software; you can redistribute it and/or
12// modify it under the terms of the GNU Lesser General Public
13// License as published by the Free Software Foundation; either
14// version 2.1 of the License, or (at your option) any later version.
15//
16// This library is distributed in the hope that it will be useful,
17// but WITHOUT ANY WARRANTY; without even the implied warranty of
18// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19// Lesser General Public License for more details.
20//
21// You should have received a copy of the GNU Lesser General Public
22// License along with this library; if not, write to the Free Software
23// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24//
25// You can contact University Corporation for Atmospheric Research at
26// 3080 Center Green Drive, Boulder, CO 80301
27
28#ifndef FileCache_h_
29#define FileCache_h_ 1
30
31#include <utility>
32#include <vector>
33#include <algorithm>
34#include <map>
35#include <mutex>
36#include <sstream>
37#include <iomanip>
38
39#include <cstring>
40
41#include <fcntl.h>
42#include <unistd.h>
43#include <dirent.h>
44#include <sys/file.h>
45#include <sys/stat.h>
46#include <sys/time.h>
47
48#include <openssl/sha.h>
49
50#include "BESUtil.h"
51#include "BESLog.h"
52
53// Make all the error log messages uniform in one small way. This is a macro
54// so that we can switch to exceptions if that seems necessary. jhrg 11/06/23
55#define ERROR(msg) ERROR_LOG("FileCache: " + std::string(msg))
56#define INFO(msg) INFO_LOG("FileCache: " + std::string(msg))
57
58// If this is non-zero, then the access time of a file is updated when an
59// Item successfully locks it (see Item::lock_the_item()). This is a hack to
60// get around the fact that the access time is not always updated by simple read operations.
61// If this is set to zero, the cache may become, in effect, a FIFO and not
62// an LRU cache. jhrg 11/03/23
63#define FORCE_ACCESS_TIME_UPDATE 1
64
/// Return the text strerror() reports for the current value of errno, or
/// "unknown error" if strerror() hands back a null pointer.
static inline std::string get_errno() {
    if (const char *text = strerror(errno))
        return text;
    return "unknown error";
}
69
// Number of bytes in one megabyte (2^20); used as the upper bound for the copy buffer in put().
const unsigned long long MEGABYTE = 1024ULL * 1024ULL;
71
122class FileCache {
123 // pathname of the cache directory
124 std::string d_cache_dir;
125
128 unsigned long long d_max_cache_size_in_bytes = 0;
129
130 // When we purge, how much should we throw away. Set in the ctor to 80% of the max size.
131 unsigned long long d_purge_size = 0;
132
133 // File descriptor of the open file that tracks the size of the cache
134 int d_cache_info_fd = -1;
135
136 const std::string CACHE_INFO_FILE_NAME = "cache_info";
137
/**
 * Return a printable name for a flock() lock type, for use in log messages.
 *
 * The LOCK_EX bit is tested rather than comparing for equality, so that a
 * lock type combined with LOCK_NB (e.g. LOCK_EX | LOCK_NB) is still reported
 * as "Exclusive".
 *
 * @param lock_type A flock() operation value, possibly OR'd with LOCK_NB.
 * @return "Exclusive" if the LOCK_EX bit is set, otherwise "Shared".
 */
static std::string get_lock_type_string(int lock_type) {
    return (lock_type & LOCK_EX) ? "Exclusive" : "Shared";
}
141
143 class CacheLock {
144 private:
145 int d_fd = -1;
146
147 public:
148 CacheLock() = default;
149 CacheLock(const CacheLock &) = delete;
150 explicit CacheLock(int fd) : d_fd(fd) {}
151 CacheLock &operator=(const CacheLock &) = delete;
152 ~CacheLock() {
153 if (flock(d_fd, LOCK_UN) < 0)
154 ERROR("Could not unlock the FileCache.");
155 }
156
163 bool lock_the_cache(int lock_type, const std::string &msg = "") const {
164 if (d_fd < 0) {
165 ERROR("Call to CacheLock::lock_the_cache with uninitialized lock object.");
166 return false;
167 }
168 if (flock(d_fd, lock_type) < 0) {
169 if (msg.empty())
170 ERROR(msg + get_lock_type_string(lock_type) + get_errno());
171 else
172 ERROR(msg + get_errno() );
173 return false;
174 }
175 return true;
176 }
177 };
178
179 // These private methods assume they are called on a locked instance of the cache.
180
187 int create_key(const std::string &key) {
188 std::string key_file_name = BESUtil::pathConcat(d_cache_dir, key);
189 int fd;
190 if ((fd = open(key_file_name.c_str(), O_CREAT | O_EXCL | O_RDWR, 0666)) < 0) {
191 if (errno == EEXIST) {
192 INFO_LOG("Could not create the key/file; it already exists: " + key + " " + get_errno() );
193 return -1;
194 }
195 else {
196 ERROR("Error creating key/file: " + key + " " + get_errno());
197 return -1;
198 }
199 }
200
201 return fd;
202 }
203
207 bool files_in_cache(std::vector<std::string> &files) const {
208 // When we move the C++-17, we can use std::filesystem to do this. jhrg 10/24/23
209 DIR *dir;
210 const struct dirent *ent;
211 if ((dir = opendir (d_cache_dir.c_str())) != nullptr) {
212 /* print all the files and directories within directory */
213 while ((ent = readdir (dir)) != nullptr) {
214 // Skip the '.' and '..' files and the cache info file
215 // TODO For large caches, this could be slow. Instead, build the list
216 // and then use three operations to remove these three files. jhrg 12/27/24
217 if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0
218 || strcmp(ent->d_name, CACHE_INFO_FILE_NAME.c_str()) == 0)
219 continue;
220 files.emplace_back(BESUtil::pathConcat(d_cache_dir, ent->d_name));
221 }
222 closedir (dir);
223 }
224 else {
225 ERROR("Could not open the cache directory (" + d_cache_dir + ").");
226 return false;
227 }
228
229 return true;
230 }
231
233 bool invariant() const {
234 if (d_cache_info_fd < 0)
235 return false;
236 return true;
237 }
238
/**
 * Return the size, in bytes, of the file behind an open descriptor.
 *
 * @param fd An open file descriptor.
 * @return The st_size reported by fstat(), or 0 if fstat() fails.
 */
static unsigned long long get_file_size(int fd) {
    struct stat info = {};
    const bool have_info = (fstat(fd, &info) == 0);
    return have_info ? static_cast<unsigned long long>(info.st_size) : 0ULL;
}
246
250 bool open_cache_info() {
251 if (d_cache_dir.empty())
252 return false;
253 // If O_CREAT and O_EXCL are used together and the file already exists, then open()
254 // fails with the error EEXIST. In that case, try to open the file using simple RDWR.
255 if ((d_cache_info_fd = open(BESUtil::pathConcat(d_cache_dir, CACHE_INFO_FILE_NAME).c_str(), O_RDWR | O_CREAT | O_EXCL, 0666)) >= 0) {
256 unsigned long long size = 0;
257 if (write(d_cache_info_fd, &size, sizeof(size)) != sizeof(size))
258 return false;
259 }
260 else if ((d_cache_info_fd = open(BESUtil::pathConcat(d_cache_dir, CACHE_INFO_FILE_NAME).c_str(), O_RDWR, 0666)) < 0) {
261 return false;
262 }
263 return true;
264 }
265
268 unsigned long long get_cache_info_size() const
269 {
270 if (d_cache_info_fd == -1) {
271 ERROR("Cache info file not open.");
272 return 0;
273 }
274 if (lseek(d_cache_info_fd, 0, SEEK_SET) == -1) {
275 ERROR("Could not seek to the beginning of the cache info file.");
276 return 0;
277 }
278 unsigned long long size = 0;
279 if (read(d_cache_info_fd, &size, sizeof(size)) != sizeof(size)) {
280 ERROR("Could not read the cache info file.");
281 return 0;
282 }
283 return size;
284 }
285
288 bool update_cache_info_size(unsigned long long size) const {
289 if (d_cache_info_fd == -1) {
290 ERROR("Cache info file not open.");
291 return false;
292 }
293 if (lseek(d_cache_info_fd, 0, SEEK_SET) == -1) {
294 ERROR("Could not seek to the beginning of the cache info file.");
295 return false;
296 }
297 if (write(d_cache_info_fd, &size, sizeof(size)) != sizeof(size)) {
298 ERROR("Could not write to the cache info file.");
299 return false;
300 }
301 return true;
302 }
303
304 friend class FileCacheTest;
305
306public:
314 static std::string hash_key(const std::string &key, bool log_it = false) {
315 unsigned char md[SHA256_DIGEST_LENGTH];
316 SHA256(reinterpret_cast<const unsigned char *>(key.c_str()), key.size(), md);
317 std::stringstream hex_stream;
318 for (auto b: md) {
319 hex_stream << std::hex << std::setw(2) << std::setfill('0') << (int)b;
320 }
321 if (log_it)
322 INFO_LOG(":hash_key() " + key + " -> " + hex_stream.str());
323 return {hex_stream.str()};
324 }
325
327 class Item {
328 int d_fd = -1;
329
330 public:
331 Item() = default;
332 Item(const Item &) = delete;
333 explicit Item(int fd) : d_fd(fd) { }
334 Item &operator=(const Item &) = delete;
335 virtual ~Item() {
336 if (d_fd != -1) {
337 close(d_fd); // Also releases any locks
338 d_fd = -1;
339 }
340 }
341
342 int get_fd() const {
343 return d_fd;
344 }
345 void set_fd(int fd) {
346 d_fd = fd;
347 }
348
349 bool lock_the_item(int lock_type, const std::string &msg = "") const {
350 if (d_fd < 0) {
351 ERROR("Call to Item::lock_the_item() with uninitialized item file descriptor.");
352 return false;
353 }
354 if (flock(d_fd, lock_type) < 0) {
355 if (msg.empty())
356 ERROR("Could not get " + get_lock_type_string(lock_type) + " lock: " + get_errno() );
357 else
358 ERROR(msg + ": " + get_errno());
359 return false;
360 }
361
362#if FORCE_ACCESS_TIME_UPDATE
363 futimes(d_fd, nullptr);
364#endif
365 return true;
366 }
367 };
368
376 class PutItem : public Item {
377 FileCache &d_fc;
378
379 public:
380 PutItem() = delete;
381 explicit PutItem(FileCache &fc) : d_fc(fc) {}
382 PutItem(const PutItem &) = delete;
383 const PutItem &operator=(const PutItem &) = delete;
384 ~PutItem() override {
385 // Locking the cache before calling update_cache_info_size() is necessary. jhrg 1/1/25
386 CacheLock lock(d_fc.d_cache_info_fd);
387 if (!lock.lock_the_cache(LOCK_EX, "locking the cache in ~PutItem() for descriptor: " + std::to_string(get_fd())))
388 return;
389 if (!d_fc.update_cache_info_size(d_fc.get_cache_info_size() + get_file_size(get_fd()))) {
390 ERROR("Could not update the cache info file while unlocking a put item: " + get_errno() );
391 }
392 }
393 };
394
395 FileCache() = default;
396 FileCache(const FileCache &) = delete;
397 FileCache &operator=(const FileCache &rhs) = delete;
398
399 virtual ~FileCache() {
400 if (d_cache_info_fd != -1) {
401 close(d_cache_info_fd);
402 }
403 }
404
413 virtual bool initialize(const std::string &cache_dir, long long size, long long purge_size) {
414 if (size < 0 || purge_size < 0) {
415 ERROR_LOG("FileCache::initialize() - size and purge_size must be >= 0\n");
416 return false;
417 }
418
419 struct stat sb = {};
420 if (stat(cache_dir.c_str(), &sb) != 0) {
421 BESUtil::mkdir_p(cache_dir, 0775);
422 if (stat(cache_dir.c_str(), &sb) != 0) {
423 ERROR_LOG("FileCache::initialize() - could not stat the cache directory: " + cache_dir);
424 return false;
425 }
426 }
427
428 d_cache_dir = cache_dir;
429
430 if (!open_cache_info()) {
431 ERROR_LOG("FileCache::initialize() - could not open the cache info file: " + cache_dir);
432 return false;
433 }
434
435 d_max_cache_size_in_bytes = (unsigned long long)size;
436 d_purge_size = (unsigned long long)purge_size;
437 return true;
438 }
439
450 bool put(const std::string &key, const std::string &file_name) {
451 // Lock the cache. Ensure the cache is unlocked no matter how we exit
452 CacheLock lock(d_cache_info_fd);
453 if (!lock.lock_the_cache(LOCK_EX, "locking the cache in put(1) for: " + key))
454 return false;
455
456 // Create the new cache entry
457 int fd = create_key(key);
458 if (fd == -1)
459 return false;
460
461 // The Item instance will take care of closing the file.
462 Item fdl(fd);
463
464 // Lock the file for writing; released when the file descriptor is closed.
465 if (!fdl.lock_the_item(LOCK_EX, "locking the just created key/file in put(1): " + key))
466 return false;
467
468 // Copy the contents of file_name to the new file
469 int fd2;
470 if ((fd2 = open(file_name.c_str(), O_RDONLY)) < 0) {
471 ERROR("Error reading from source file: " + file_name + " " + get_errno());
472 return false;
473 }
474
475 Item fdl2(fd2); // The 'source' file is not locked; the Item ensures it is closed.
476
477 // Here we might use st_blocks and st_blksize if that will speed up the transfer.
478 // This is likely to matter only for large files (where large means...?). jhrg 11/02/23
479 std::vector<char> buf(std::min(MEGABYTE, get_file_size(fd2)));
480 ssize_t n;
481 while ((n = read(fd2, buf.data(), buf.size())) > 0) {
482 if (write(fd, buf.data(), n) != n) {
483 ERROR("Error writing to destination file: " + key + " " + get_errno());
484 return false;
485 }
486 }
487
488 // NB: The cache_info file ws locked on entry to this method.
489 if (!update_cache_info_size(get_cache_info_size() + get_file_size(fd)))
490 return false;
491
492 // The fd_wrapper instances will take care of closing (and thus unlocking) the files.
493 return true;
494 }
495
496 bool put_data(const std::string &key, const std::string &data) {
497 // Lock the cache. Ensure the cache is unlocked no matter how we exit
498 CacheLock lock(d_cache_info_fd);
499 if (!lock.lock_the_cache(LOCK_EX, "locking the cache in put_data() for: " + key))
500 return false;
501
502 // Create the new cache entry
503 int fd = create_key(key);
504 if (fd == -1)
505 return false;
506
507 // The Item instance will take care of closing the file.
508 Item fdl(fd);
509
510 // Lock the file for writing; released when the file descriptor is closed.
511 if (!fdl.lock_the_item(LOCK_EX, "locking the just created key/file in put_data(): " + key))
512 return false;
513
514 // Here we might use st_blocks and st_blksize if that will speed up the transfer.
515 // This is likely to matter only for large files (where large means...?). jhrg 11/02/23
516
517 if (write(fd, data.c_str(), data.size()) != data.size()) {
518 ERROR("Error writing to data to cache file: " + key + " " + get_errno());
519 return false;
520 }
521
522 // NB: The cache_info file ws locked on entry to this method.
523 if (!update_cache_info_size(get_cache_info_size() + get_file_size(fd)))
524 return false;
525
526 // The fd_wrapper instances will take care of closing (and thus unlocking) the files.
527 return true;
528 }
529
544 bool put(const std::string &key, PutItem &item) {
545 // Lock the cache. Ensure the cache is unlocked no matter how we exit
546 CacheLock lock(d_cache_info_fd);
547 if (!lock.lock_the_cache(LOCK_EX, "locking the cache in put(2) for: " + key))
548 return false;
549
550 // Create the new cache entry
551 int fd = create_key(key);
552 if (fd == -1)
553 return false;
554
555 // The Item instance will take care of closing the file.
556 item.set_fd(fd);
557
558 // Lock the file for writing; released when the file descriptor is closed.
559 if (!item.lock_the_item(LOCK_EX, "locking the just created key/file in put(2): " + key))
560 return false;
561
562 // The Item instances will take care of closing (and thus unlocking) the files.
563 return true;
564 }
565
574 bool get(const std::string &key, Item &item, int lock_type = LOCK_SH | LOCK_NB) {
575 // Lock the cache. Ensure the cache is unlocked no matter how we exit
576 CacheLock lock(d_cache_info_fd);
577 if (!lock.lock_the_cache(LOCK_EX, "locking the cache in get() for: " + key))
578 return false;
579
580 // open the file
581 std::string key_file_name = BESUtil::pathConcat(d_cache_dir, key);
582 int fd = open(key_file_name.c_str(), O_RDONLY, 0666);
583 if (fd < 0) {
584 if (errno == ENOENT)
585 return false;
586 else {
587 ERROR("Error opening the cache item in get for: " + key + " " + get_errno());
588 return false;
589 }
590 }
591
592 item.set_fd(fd);
593 if (!item.lock_the_item(lock_type, "locking the item in get() for: " + key))
594 return false;
595
596 // Here's where we should update the info about the item in the cache_info file
597
598 return true;
599 }
600
611 bool del(const std::string &key, int lock_type = LOCK_EX | LOCK_NB) {
612 // Lock the cache. Ensure the cache is unlocked no matter how we exit
613 CacheLock lock(d_cache_info_fd);
614 if (!lock.lock_the_cache(LOCK_EX, "locking the cache in del()."))
615 return false;
616
617 std::string key_file_name = BESUtil::pathConcat(d_cache_dir, key);
618 int fd = open(key_file_name.c_str(), O_WRONLY, 0666);
619 if (fd < 0) {
620 ERROR("Error opening the cache item in del() for: " + key + " " + get_errno());
621 return false;
622 }
623
624 Item item(fd);
625 if (!item.lock_the_item(lock_type, "locking the cache item in del() for: " + key))
626 return false;
627
628 auto file_size = get_file_size(fd);
629
630 if (remove(key_file_name.c_str()) != 0) {
631 ERROR("Error removing " + key + " from cache directory (" + d_cache_dir + ") - " + get_errno());
632 return false;
633 }
634
635 if (!update_cache_info_size(get_cache_info_size() - file_size))
636 return false;
637
638 return true;
639 }
640
646 bool clear() const {
647 // Lock the cache. Ensure the cache is unlocked no matter how we exit
648 CacheLock lock(d_cache_info_fd);
649 if (!lock.lock_the_cache(LOCK_EX, "locking the cache in clear()."))
650 return false;
651
652 std::vector<std::string> files;
653 if (!files_in_cache(files)) {
654 return false;
655 }
656
657 for (const auto &file: files) {
658 if (remove(file.c_str()) != 0) {
659 ERROR("Error removing " + file + " from cache directory (" + d_cache_dir + ") - " + get_errno());
660 return false;
661 }
662 }
663
664 return true;
665 }
666
678 bool purge() {
679 // Lock the cache. Ensure the cache is unlocked no matter how we exit
680 CacheLock lock(d_cache_info_fd);
681 if (!lock.lock_the_cache(LOCK_EX, "locking the cache in purge()."))
682 return false;
683
684 uint64_t ci_size = get_cache_info_size();
685 if (ci_size < d_max_cache_size_in_bytes)
686 return true;
687
688 struct item_info {
689 std::string d_name;
690 off_t d_size;
691 item_info(std::string name, off_t size) :d_name(std::move(name)), d_size(size) {}
692 };
693
694 // sorted by access time, with the oldest time first
695 std::multimap<unsigned long, struct item_info, std::less<>> items;
696 uint64_t total_size = 0; // for a sanity check. jhrg 11/03/23
697
698 std::vector<std::string> files;
699 if (!files_in_cache(files))
700 return false;
701
702 for (const auto &file: files) {
703 struct stat sb = {};
704
705 if (stat(file.c_str(), &sb) < 0) {
706 ERROR("Error getting info on " + file + " in purge() - " + get_errno());
707 return false;
708 }
709
710 items.insert(std::pair<unsigned long, item_info>(sb.st_atime, item_info(file, sb.st_size)));
711 total_size += sb.st_size; // sanity check; remove some day? jhrg 11/03/23
712 }
713
714 if (ci_size != total_size) {
715 ERROR("Error cache_info and the measured size of items differ by " + std::to_string(total_size) + " bytes.");
716 }
717
718 // choose which files to remove - since the 'items' map orders the things by time, use that ordering
719 uint64_t removed_bytes = 0;
720 for (const auto &item: items) {
721 if (removed_bytes > d_purge_size)
722 break;
723
724 // Get a non-blocking but exclusive lock on the item before deleting. If the code
725 // cannot get that lock, move on to the next item. jhrg 11/06/23
726 int fd = open(item.second.d_name.c_str(), O_WRONLY, 0666);
727 if (fd < 0) {
728 ERROR("Error opening the cache item in purge() for: " + item.second.d_name + " " + get_errno());
729 return false;
730 }
731 Item item_lock(fd); // The Item dtor is called on every loop iteration according to Google. jhrg 11/03/23
732 if (!item_lock.lock_the_item(LOCK_EX | LOCK_NB, "locking the cache item in purge() for: " + item.second.d_name))
733 continue;
734
735 if (remove(item.second.d_name.c_str()) != 0) {
736 ERROR("Error removing " + item.second.d_name + " from cache directory in purge() - " + get_errno());
737 // but keep going; this is a soft error
738 }
739 else {
740 // but only count the bytes if they are actually removed
741 removed_bytes += item.second.d_size;
742 }
743 }
744
745 // update the cache info file
746 if (!update_cache_info_size(ci_size - removed_bytes)) {
747 ERROR("Error updating the cache_info size in purge() - " + get_errno());
748 return false;
749 }
750
751 return true;
752 }
753};
754
755#endif // FileCache_h_
static int mkdir_p(const std::string &path, mode_t mode)
Definition BESUtil.cc:1254
static std::string pathConcat(const std::string &firstPart, const std::string &secondPart, char separator='/')
Concatenate path fragments making sure that they are separated by a single '/' character.
Definition BESUtil.cc:754
Manage the state of an open file descriptor for a cached item.
Definition FileCache.h:327
bool get(const std::string &key, Item &item, int lock_type=LOCK_SH|LOCK_NB)
Get a locked (shared) item for a file in the cache.
Definition FileCache.h:574
bool purge()
Purge the least recently used items from the cache. The purge() method for FileCache is public....
Definition FileCache.h:678
bool put(const std::string &key, PutItem &item)
Put an item in the cache. Put an item in the cache by returning an open and locked file descriptor to th...
Definition FileCache.h:544
static std::string hash_key(const std::string &key, bool log_it=false)
Return a SHA256 hash of the given key.
Definition FileCache.h:314
bool del(const std::string &key, int lock_type=LOCK_EX|LOCK_NB)
Remove the item at the given key Remove the key/item. Updates the size recorded in cache_info....
Definition FileCache.h:611
virtual bool initialize(const std::string &cache_dir, long long size, long long purge_size)
Initialize the cache.
Definition FileCache.h:413
bool clear() const
Remove all files from the cache. Zero the cache info file.
Definition FileCache.h:646
bool put(const std::string &key, const std::string &file_name)
Put an item in the cache Put the contents of a file in the cache, referenced by the given key....
Definition FileCache.h:450