libdap Updated for version 3.21.1
libdap4 is an implementation of OPeNDAP's DAP protocol.
escaping.cc
Go to the documentation of this file.
1
2// -*- mode: c++; c-basic-offset:4 -*-
3
4// This file is part of libdap, A C++ implementation of the OPeNDAP Data
5// Access Protocol.
6
7// Copyright (c) 2002,2003 OPeNDAP, Inc.
8// Author: James Gallagher <jgallagher@opendap.org>
9//
10// This library is free software; you can redistribute it and/or
11// modify it under the terms of the GNU Lesser General Public
12// License as published by the Free Software Foundation; either
13// version 2.1 of the License, or (at your option) any later version.
14//
15// This library is distributed in the hope that it will be useful,
16// but WITHOUT ANY WARRANTY; without even the implied warranty of
17// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18// Lesser General Public License for more details.
19//
20// You should have received a copy of the GNU Lesser General Public
21// License along with this library; if not, write to the Free Software
22// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23//
24// You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
25
26// Copyright (c) 1996, California Institute of Technology.
27// ALL RIGHTS RESERVED. U.S. Government Sponsorship acknowledged.
28//
29// Please read the full copyright notice in the file COPYRIGHT_URI
30// in this directory.
31//
32// Author: Todd Karakashian, NASA/Jet Propulsion Laboratory
33// Todd.K.Karakashian@jpl.nasa.gov
34//
35// $RCSfile: escaping.cc,v $ - Miscellaneous routines for OPeNDAP HDF server
36//
37// These two routines are for escaping/unescaping strings that are identifiers
38// in DAP2
39// id2www() -- escape (using WWW hex codes) non-allowable characters in a
40// DAP2 identifier
41// www2id() -- given a WWW hexcode escaped identifier, restore it
42//
43// These two routines are for escaping/unescaping strings storing attribute
44// values. They use traditional octal escapes (\nnn) because they are
45// intended to be viewed by a user
46// escattr() -- escape (using traditional octal backslash) non-allowable
47// characters in the value of a DAP2 attribute
48// unescattr() -- given an octally escaped string, restore it
49//
50// These are routines used by the above, not intended to be called directly:
51//
52// hexstring()
53// unhexstring()
54// octstring()
55// unoctstring()
56//
57// -Todd
58
59#include "config.h"
60
61#include <ctype.h>
62
63#include <iomanip>
64#include <sstream>
65#include <string>
66
67#include "Error.h"
68#include "GNURegex.h"
69#include "InternalErr.h"
70// #define DODS_DEBUG
71#include "debug.h"
72
73using namespace std;
74
75namespace libdap {
76
77// The next four functions were originally defined static, but I removed that
78// to make testing them (see generalUtilTest.cc) easier to write. 5/7/2001
79// jhrg
80
// Render a single byte as lowercase hexadecimal, zero padded to two digits
// (for example 10 -> "0a").
std::string hexstring(unsigned char val) {
    std::ostringstream out;
    out << std::setfill('0') << std::setw(2) << std::hex;
    // Widen to an integer type so the stream prints a number, not a character.
    out << static_cast<unsigned int>(val);
    return out.str();
}
87
// Read `s` as a hexadecimal number and return a string holding the character
// with that code. The result is built from a NUL-terminated buffer, so a value
// that converts to the NUL character yields an empty string.
std::string unhexstring(std::string s) {
    std::istringstream input(s);
    int code;
    input >> std::hex >> code;

    const char buffer[2] = {static_cast<char>(code), '\0'};
    return std::string(buffer);
}
97
// Render a single byte as octal, zero padded to three digits
// (for example 10 -> "012").
std::string octstring(unsigned char val) {
    std::ostringstream out;
    out << std::setfill('0') << std::setw(3) << std::oct;
    // Widen to an integer type so the stream prints a number, not a character.
    out << static_cast<unsigned int>(val);
    return out.str();
}
104
105string unoctstring(string s) {
106 int val;
107
108 istringstream ss(s);
109 ss >> oct >> val;
110
111 DBG(cerr << "unoctstring: " << val << endl);
112
113 char tmp_str[2];
114 tmp_str[0] = static_cast<char>(val);
115 tmp_str[1] = '\0';
116 return string(tmp_str);
117}
118
143string id2www(string in, const string &allowable) {
144 string::size_type i = 0;
145 DBG(cerr << "Input string: [" << in << "]" << endl);
146 while ((i = in.find_first_not_of(allowable, i)) != string::npos) {
147 DBG(cerr << "Found escapee: [" << in[i] << "]");
148 in.replace(i, 1, "%" + hexstring(in[i]));
149 DBGN(cerr << " now the string is: " << in << endl);
150 i += 3; // i++;
151 }
152
153 return in;
154}
155
166string id2www_ce(string in, const string &allowable) { return id2www(in, allowable); }
167
202string www2id(const string &in, const string &escape, const string &except) {
203 string::size_type i = 0;
204 string res = in;
205 while ((i = res.find_first_of(escape, i)) != string::npos) {
206 if (except.find(res.substr(i, 3)) != string::npos) {
207 i += 3;
208 continue;
209 }
210 res.replace(i, 3, unhexstring(res.substr(i + 1, 2)));
211 ++i;
212 }
213
214 return res;
215}
216
217static string entity(char c) {
218 switch (c) {
219 case '>':
220 return "&gt;";
221 case '<':
222 return "&lt;";
223 case '&':
224 return "&amp;";
225 case '\'':
226 return "&apos;";
227 case '\"':
228 return "&quot;";
229 default:
230 throw InternalErr(__FILE__, __LINE__, "Unrecognized character.");
231 }
232}
233
// Parse `octal_digits` as an octal number and return the same value written
// in lowercase hexadecimal, zero padded to at least two digits.
// NOTE(review): the earlier comment assumed exactly two octal digits in the
// input, but the disabled code in id2xml() passes three -- confirm the
// intended width before relying on a two digit result.
std::string octal_to_hex(const std::string &octal_digits) {
    std::istringstream input(octal_digits);
    int value;
    input >> std::oct >> value;

    std::ostringstream output;
    output << std::setfill('0') << std::setw(2) << std::hex;
    output << value;
    return output.str();
}
246
253string id2xml(string in, const string &not_allowed) {
254 string::size_type i = 0;
255
256 while ((i = in.find_first_of(not_allowed, i)) != string::npos) {
257 in.replace(i, 1, entity(in[i]));
258 ++i;
259 }
260#if 0
261 // Removed the encoding of octal escapes. This function is used by
262 // AttrTable to encode the stuff that is the value of the <value>
263 // element in the DDX. The problem is that some of the values are not
264 // valid UTF-8 and that makes a XML parser gag.; ticket 1512.
265 // jhrg 3/19/10
266
267 // OK, now scan for octal escape sequences like \\012 (where the '\'
268 // is itself escaped). This type of attribute value comes from the netCDF
269 // handler and maybe others. Assumption: The '\' will always appear as
270 // in its escaped form: '\\'. NB: Both backslashes must be escaped in the
271 // C++ string.
272 string octal_escape = "\\\\";
273 i = 0;
274 string::size_type length = in.length();
275 while ((i = in.find(octal_escape, i)) != string::npos) {
276 // Get the three octal digits following the '\\0'
277 string::size_type j = i + 2;
278 if (j + 1 >= length) // Check that we're not past the end
279 break;
280 string octal_digits = in.substr(j, 3);
281 // convert to a &#xdd; XML escape
282 string hex_escape = string("&#x");
283 hex_escape.append(octal_to_hex(octal_digits));
284 hex_escape.append(string(";"));
285
286 // replace the octal escape with an XML/hex escape
287 in.replace(i, 5, hex_escape);
288
289 // increment i
290 i += 6;
291 }
292#endif
293 return in;
294}
295
// Decode the five XML entities &gt; &lt; &amp; &apos; &quot; in `in` and
// return the decoded copy.
//
// The string is scanned once, left to right. Each entity is replaced by its
// character and the scan continues after that character, so text produced by
// a replacement is never decoded a second time. This keeps the function the
// inverse of id2xml(): "&amp;amp;" decodes to "&amp;" rather than "&", and
// "&amp;lt;" decodes to "&lt;" rather than "<". An '&' that does not start one
// of the five entities is left unchanged.
std::string xml2id(std::string in) {
    std::string::size_type i = 0;

    while ((i = in.find('&', i)) != std::string::npos) {
        if (in.compare(i, 4, "&gt;") == 0)
            in.replace(i, 4, ">");
        else if (in.compare(i, 4, "&lt;") == 0)
            in.replace(i, 4, "<");
        else if (in.compare(i, 5, "&amp;") == 0)
            in.replace(i, 5, "&");
        else if (in.compare(i, 6, "&apos;") == 0)
            in.replace(i, 6, "'");
        else if (in.compare(i, 6, "&quot;") == 0)
            in.replace(i, 6, "\"");

        // Move past the decoded character (or the unrecognized '&').
        ++i;
    }

    return in;
}
325
// Replace every '%' together with the (up to) two characters that follow it
// by a single underscore and return the result.
std::string esc2underscore(std::string s) {
    for (std::string::size_type at = s.find('%'); at != std::string::npos; at = s.find('%', at + 1)) {
        // replace() clamps the count, so a '%' near the end of the string is
        // handled without reading past it.
        s.replace(at, 3, "_");
    }

    return s;
}
338
342string escattr(string s) {
343 const string printable =
344 " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789~`!@#$%^&*()_-+={[}]|\\:;<,>.?/'\"\n\t\r";
345 const string ESC = "\\";
346 const string DOUBLE_ESC = ESC + ESC;
347 const string QUOTE = "\"";
348 const string ESCQUOTE = ESC + QUOTE;
349
350 // escape \ with a second backslash
351 string::size_type ind = 0;
352 while ((ind = s.find(ESC, ind)) != string::npos) {
353 s.replace(ind, 1, DOUBLE_ESC);
354 ind += DOUBLE_ESC.length();
355 }
356
357 // escape non-printing characters with octal escape
358 ind = 0;
359 while ((ind = s.find_first_not_of(printable, ind)) != string::npos)
360 s.replace(ind, 1, ESC + octstring(s[ind]));
361
362 // escape " with backslash
363 ind = 0;
364 while ((ind = s.find(QUOTE, ind)) != string::npos) {
365 s.replace(ind, 1, ESCQUOTE);
366 ind += ESCQUOTE.length();
367 }
368
369 return s;
370}
371
375string escattr_xml(string s) {
376 const string printable =
377 " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789~`!@#$%^&*()_-+={[}]|\\:;<,>.?/'\"\n\t\r";
378
379 const string ESC = "\\";
380 const char null_char = '\0';
381
382 string::size_type ind = 0;
383
384 // Unlike escaping the special characters for DAS, we don't need to handle quote or double quote.
385 // However, we would like to treate NULL as a printable character. Otherwise, a '\000'
386 // will be added to many Nullterm string. Note: we have to search the '\0' character like
387 // the code below. Put the '\0' inside the string "printable" still doesn't recongize the '\0' as a printable
388 // character. KY 2022-08-22
389 while ((ind = s.find_first_not_of(printable, ind)) != string::npos) {
390 if (s[ind] != null_char)
391 s.replace(ind, 1, ESC + octstring(s[ind]));
392 else
393 ind++;
394 }
395
396 return s;
397}
398
407string unescattr(string s) {
408 const Regex octal("\\\\[0-3][0-7][0-7]"); // matches 4 characters
409 const Regex esc_quote("\\\\\""); // matches 3 characters
410 const Regex esc_esc("\\\\\\\\"); // matches 2 characters
411 const string ESC = "\\";
412 const string QUOTE = "\"";
413 int matchlen;
414 unsigned int index;
415
416 DBG(cerr << "0XX" << s << "XXX" << endl);
417 // unescape any escaped backslashes
418 index = esc_esc.search(s.c_str(), s.length(), matchlen, 0);
419 while (index < s.length()) {
420 DBG(cerr << "1aXX" << s << "XXX index: " << index << endl);
421 s.replace(index, 2, ESC);
422 DBG(cerr << "1bXX" << s << "XXX index: " << index << endl);
423 index = esc_esc.search(s.c_str(), s.length(), matchlen, 0);
424 }
425
426 // unescape any escaped double quote characters
427 index = esc_quote.search(s.c_str(), s.length(), matchlen, 0);
428 while (index < s.length()) {
429 s.replace(index, 2, QUOTE);
430 DBG(cerr << "2XX" << s << "XXX index: " << index << endl);
431 index = esc_quote.search(s.c_str(), s.length(), matchlen, 0);
432 }
433
434 // unescape octal characters
435 index = octal.search(s.c_str(), s.length(), matchlen, 0);
436 while (index < s.length()) {
437 s.replace(index, 4, unoctstring(s.substr(index + 1, 3)));
438 DBG(cerr << "3XX" << s << "XXX index: " << index << endl);
439 index = octal.search(s.c_str(), s.length(), matchlen, 0);
440 }
441
442 DBG(cerr << "4XX" << s << "XXX" << endl);
443 return s;
444}
445
// Make `msg` a double-quoted string: add a leading and a trailing double quote
// when missing, then put a backslash in front of every interior double quote
// that is not already preceded by one. Returns the quoted message.
//
// Empty input and input consisting of a single double quote are handled
// explicitly (both yield `""`): the closing quote is added whenever the
// message is shorter than two characters, so the opening quote is never
// mistaken for the closing one and the scan never starts past the end.
std::string munge_error_message(std::string msg) {
    // First, add enclosing quotes if needed.
    if (msg.empty() || msg[0] != '"')
        msg.insert(msg.begin(), '"');
    if (msg.size() < 2 || msg[msg.size() - 1] != '"')
        msg += '"';

    // Now escape any interior double quotes that aren't escaped. The first and
    // last characters are the enclosing quotes and are left alone.
    for (std::string::size_type i = 1; i + 1 < msg.size(); ++i) {
        if (msg[i] == '"' && msg[i - 1] != '\\') {
            msg.insert(i, 1, '\\');
            ++i; // step over the quote that was just escaped
        }
    }

    return msg;
}
461
// Return `source` with a backslash inserted in front of every double quote.
std::string escape_double_quotes(std::string source) {
    const std::string escaped_quote = "\\\""; // a backslash and a double quote

    // After each replacement, resume two characters on so the quote that was
    // just escaped is not found again.
    for (std::string::size_type pos = source.find('\"', 0); pos != std::string::npos;
         pos = source.find('\"', pos + 2)) {
        source.replace(pos, 1, escaped_quote);
    }

    return source;
}
475
// Return `source` with every backslash + double quote pair reduced to a plain
// double quote.
std::string unescape_double_quotes(std::string source) {
    const std::string escaped_quote = "\\\""; // a backslash and a double quote

    // After each replacement, resume just past the restored quote.
    for (std::string::size_type pos = source.find(escaped_quote, 0); pos != std::string::npos;
         pos = source.find(escaped_quote, pos + 1)) {
        source.replace(pos, 2, "\"");
    }

    return source;
}
490
491} // namespace libdap
Regular expression matching.
Definition GNURegex.h:54
int search(const char *s, int len, int &matchlen, int pos=0) const
How much of the string does the pattern match.
Definition GNURegex.cc:197
#define DBGN(x)
Definition debug.h:59
#define DBG(x)
Definition debug.h:58
top level DAP object to house generic methods
Definition AISConnect.cc:30
string esc2underscore(string s)
Definition escaping.cc:331
string escattr(string s)
Definition escaping.cc:342
string escattr_xml(string s)
Definition escaping.cc:375
string www2id(const string &in, const string &escape, const string &except)
Definition escaping.cc:202
string unescape_double_quotes(string source)
Definition escaping.cc:481
string hexstring(unsigned char val)
Definition escaping.cc:81
string octstring(unsigned char val)
Definition escaping.cc:98
string xml2id(string in)
Definition escaping.cc:301
string octal_to_hex(const string &octal_digits)
Definition escaping.cc:236
string id2xml(string in, const string &not_allowed)
Definition escaping.cc:253
string unescattr(string s)
Definition escaping.cc:407
string munge_error_message(string msg)
Definition escaping.cc:446
string unhexstring(string s)
Definition escaping.cc:88
string escape_double_quotes(string source)
Definition escaping.cc:466
string id2www_ce(string in, const string &allowable)
Definition escaping.cc:166
string unoctstring(string s)
Definition escaping.cc:105
string id2www(string in, const string &allowable)
Definition escaping.cc:143