libdap  Updated for version 3.20.6
libdap4 is an implementation of OPeNDAP's DAP protocol.
escaping.cc
1 
2 // -*- mode: c++; c-basic-offset:4 -*-
3 
4 // This file is part of libdap, A C++ implementation of the OPeNDAP Data
5 // Access Protocol.
6 
7 // Copyright (c) 2002,2003 OPeNDAP, Inc.
8 // Author: James Gallagher <jgallagher@opendap.org>
9 //
10 // This library is free software; you can redistribute it and/or
11 // modify it under the terms of the GNU Lesser General Public
12 // License as published by the Free Software Foundation; either
13 // version 2.1 of the License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 //
24 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
25 
26 // Copyright (c) 1996, California Institute of Technology.
27 // ALL RIGHTS RESERVED. U.S. Government Sponsorship acknowledged.
28 //
29 // Please read the full copyright notice in the file COPYRIGHT_URI
30 // in this directory.
31 //
32 // Author: Todd Karakashian, NASA/Jet Propulsion Laboratory
33 // Todd.K.Karakashian@jpl.nasa.gov
34 //
35 // $RCSfile: escaping.cc,v $ - Miscellaneous routines for OPeNDAP HDF server
36 //
37 // These two routines are for escaping/unescaping strings that are identifiers
38 // in DAP2
39 // id2www() -- escape (using WWW hex codes) non-allowable characters in a
40 // DAP2 identifier
41 // www2id() -- given a WWW hex-code escaped identifier, restore it
42 //
43 // These two routines are for escaping/unescaping strings storing attribute
44 // values. They use traditional octal escapes (\nnn) because they are
45 // intended to be viewed by a user
46 // escattr() -- escape (using traditional octal backslash) non-allowable
47 // characters in the value of a DAP2 attribute
48 // unescattr() -- given an octally escaped string, restore it
49 //
50 // These are routines used by the above, not intended to be called directly:
51 //
52 // hexstring()
53 // unhexstring()
54 // octstring()
55 // unoctstring()
56 //
57 // -Todd
58 
59 #include "config.h"
60 
61 #include <ctype.h>
62 
63 #include <iomanip>
64 #include <string>
65 #include <sstream>
66 
67 #include "GNURegex.h"
68 #include "Error.h"
69 #include "InternalErr.h"
70 //#define DODS_DEBUG
71 #include "debug.h"
72 
73 using namespace std;
74 
75 namespace libdap {
76 
77 // The next four functions were originally defined static, but I removed that
78 // to make testing them (see generalUtilTest.cc) easier to write. 5/7/2001
79 // jhrg
80 
/**
 * @brief Format one byte as hexadecimal text.
 *
 * The value is widened to an unsigned int before printing so that it is
 * written as a number rather than as a character.
 *
 * @param val The byte to format.
 * @return The hexadecimal digits of \c val, left-padded with '0' to a
 * width of two characters.
 */
string
hexstring(unsigned char val)
{
    ostringstream out;
    out.setf(ios::hex, ios::basefield);
    out.fill('0');
    out.width(2);
    out << static_cast<unsigned int>(val);

    return out.str();
}
89 
/**
 * @brief Convert hexadecimal digits to the single character they encode.
 *
 * @param s Text holding a hexadecimal number (e.g. "41").
 * @return A one-character string holding the parsed value narrowed to a
 * char. The empty string is returned when \c s cannot be parsed or when
 * the resulting character is NUL.
 */
string
unhexstring(string s)
{
    int val = 0;
    istringstream ss(s);
    // Do not trust 'val' after a failed extraction; treat it as "no value".
    if (!(ss >> hex >> val))
        val = 0;

    // A NUL result yields an empty string, exactly as the former
    // C-string based construction did.
    const char c = static_cast<char>(val);
    return (c == '\0') ? string() : string(1, c);
}
101 
/**
 * @brief Format one byte as three octal digits.
 *
 * An unsigned char holds at most 0377, so three digits are always
 * sufficient; smaller values are left-padded with '0'.
 *
 * @param val The byte to format.
 * @return Exactly three octal digits representing \c val.
 */
string
octstring(unsigned char val)
{
    const unsigned int number = static_cast<unsigned int>(val);

    string digits(3, '0');
    digits[0] = static_cast<char>('0' + ((number >> 6) & 7u));
    digits[1] = static_cast<char>('0' + ((number >> 3) & 7u));
    digits[2] = static_cast<char>('0' + (number & 7u));

    return digits;
}
111 
112 string
113 unoctstring(string s)
114 {
115  int val;
116 
117  istringstream ss(s);
118  ss >> oct >> val;
119 
120  DBG(cerr << "unoctstring: " << val << endl);
121 
122  char tmp_str[2];
123  tmp_str[0] = static_cast<char>(val);
124  tmp_str[1] = '\0';
125  return string(tmp_str);
126 }
127 
152 string
153 id2www(string in, const string &allowable)
154 {
155  string::size_type i = 0;
156  DBG(cerr<<"Input string: [" << in << "]" << endl);
157  while ((i = in.find_first_not_of(allowable, i)) != string::npos) {
158  DBG(cerr<<"Found escapee: [" << in[i] << "]");
159  in.replace(i, 1, "%" + hexstring(in[i]));
160  DBGN(cerr<<" now the string is: " << in << endl);
161  i += 3;//i++;
162  }
163 
164  return in;
165 }
166 
177 string
178 id2www_ce(string in, const string &allowable)
179 {
180  return id2www(in, allowable);
181 
182 
183 }
184 
219 string
220 www2id(const string &in, const string &escape, const string &except)
221 {
222  string::size_type i = 0;
223  string res = in;
224  while ((i = res.find_first_of(escape, i)) != string::npos) {
225  if (except.find(res.substr(i, 3)) != string::npos) {
226  i += 3;
227  continue;
228  }
229  res.replace(i, 3, unhexstring(res.substr(i + 1, 2)));
230  ++i;
231  }
232 
233  return res;
234 }
235 
236 static string
237 entity(char c)
238 {
239  switch (c) {
240  case '>': return "&gt;";
241  case '<': return "&lt;";
242  case '&': return "&amp;";
243  case '\'': return "&apos;";
244  case '\"': return "&quot;";
245  default:
246  throw InternalErr(__FILE__, __LINE__, "Unrecognized character.");
247  }
248 }
249 
/**
 * @brief Re-express an octal number as hexadecimal digits.
 *
 * @param octal_digits Text holding an octal number (e.g. "012").
 * @return The value as hexadecimal digits, left-padded with '0' to a
 * width of two; "00" when \c octal_digits cannot be parsed.
 *
 * @note Values up to 0377 fit in two hex digits; larger values produce
 * more than two.
 */
string
octal_to_hex(const string &octal_digits)
{
    int val = 0;

    istringstream ss(octal_digits);
    // Do not trust 'val' after a failed extraction; treat it as zero.
    if (!(ss >> oct >> val))
        val = 0;

    ostringstream ds;
    ds << hex << setw(2) << setfill('0') << val;
    return ds.str();
}
264 
271 string
272 id2xml(string in, const string &not_allowed)
273 {
274  string::size_type i = 0;
275 
276  while ((i = in.find_first_of(not_allowed, i)) != string::npos) {
277  in.replace(i, 1, entity(in[i]));
278  ++i;
279  }
280 #if 0
281  // Removed the encoding of octal escapes. This function is used by
282  // AttrTable to encode the stuff that is the value of the <value>
283  // element in the DDX. The problem is that some of the values are not
284  // valid UTF-8 and that makes a XML parser gag.; ticket 1512.
285  // jhrg 3/19/10
286 
287  // OK, now scan for octal escape sequences like \\012 (where the '\'
288  // is itself escaped). This type of attribute value comes from the netCDF
289  // handler and maybe others. Assumption: The '\' will always appear as
290  // in its escaped form: '\\'. NB: Both backslashes must be escaped in the
291  // C++ string.
292  string octal_escape = "\\\\";
293  i = 0;
294  string::size_type length = in.length();
295  while ((i = in.find(octal_escape, i)) != string::npos) {
296  // Get the three octal digits following the '\\0'
297  string::size_type j = i + 2;
298  if (j + 1 >= length) // Check that we're not past the end
299  break;
300  string octal_digits = in.substr(j, 3);
301  // convert to a &#xdd; XML escape
302  string hex_escape = string("&#x");
303  hex_escape.append(octal_to_hex(octal_digits));
304  hex_escape.append(string(";"));
305 
306  // replace the octal escape with an XML/hex escape
307  in.replace(i, 5, hex_escape);
308 
309  // increment i
310  i += 6;
311  }
312 #endif
313  return in;
314 }
315 
/**
 * @brief Replace the five predefined XML entities with their characters.
 *
 * Decodes "&gt;", "&lt;", "&amp;", "&apos;" and "&quot;". The string is
 * scanned once from left to right and decoded text is never examined
 * again, so "&amp;amp;" yields "&amp;" rather than "&". Any other '&' is
 * left untouched.
 *
 * @param in The string to decode (taken by value and modified).
 * @return The decoded string.
 */
string
xml2id(string in)
{
    struct Entity {
        const char *name;
        string::size_type length;
        char replacement;
    };
    static const Entity entities[] = {
        { "&gt;", 4, '>' },
        { "&lt;", 4, '<' },
        { "&amp;", 5, '&' },
        { "&apos;", 6, '\'' },
        { "&quot;", 6, '"' }
    };
    const string::size_type entity_count = sizeof(entities) / sizeof(entities[0]);

    string::size_type i = 0;
    while ((i = in.find('&', i)) != string::npos) {
        for (string::size_type e = 0; e < entity_count; ++e) {
            if (in.compare(i, entities[e].length, entities[e].name) == 0) {
                in.replace(i, entities[e].length, 1, entities[e].replacement);
                break;
            }
        }
        // Step past either the decoded character or an unrecognized '&'.
        ++i;
    }

    return in;
}
347 
/**
 * @brief Replace each '%' escape with a single underscore.
 *
 * Every '%' together with the (up to) two characters that follow it is
 * replaced by "_".
 *
 * @param s The string to process (taken by value and modified).
 * @return The processed string.
 */
string
esc2underscore(string s)
{
    // The replacement never contains '%', so continuing just past it
    // finds the same positions as searching again from the start.
    for (string::size_type at = s.find('%');
         at != string::npos;
         at = s.find('%', at + 1)) {
        s.replace(at, 3, "_");
    }

    return s;
}
362 
363 
367 string
368 escattr(string s)
369 {
370  const string printable = " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789~`!@#$%^&*()_-+={[}]|\\:;<,>.?/'\"";
371  const string ESC = "\\";
372  const string DOUBLE_ESC = ESC + ESC;
373  const string QUOTE = "\"";
374  const string ESCQUOTE = ESC + QUOTE;
375 
376  // escape \ with a second backslash
377  string::size_type ind = 0;
378  while ((ind = s.find(ESC, ind)) != s.npos) {
379  s.replace(ind, 1, DOUBLE_ESC);
380  ind += DOUBLE_ESC.length();
381  }
382 
383  // escape non-printing characters with octal escape
384  ind = 0;
385  while ((ind = s.find_first_not_of(printable, ind)) != s.npos)
386  s.replace(ind, 1, ESC + octstring(s[ind]));
387 
388  // escape " with backslash
389  ind = 0;
390  while ((ind = s.find(QUOTE, ind)) != s.npos) {
391  s.replace(ind, 1, ESCQUOTE);
392  ind += ESCQUOTE.length();
393  }
394 
395  return s;
396 }
397 
406 string
407 unescattr(string s)
408 {
409  Regex octal("\\\\[0-3][0-7][0-7]"); // matches 4 characters
410  Regex esc_quote("\\\\\""); // matches 3 characters
411  Regex esc_esc("\\\\\\\\"); // matches 2 characters
412  const string ESC = "\\";
413  const string QUOTE = "\"";
414  int matchlen;
415  unsigned int index;
416 
417  DBG(cerr << "0XX" << s << "XXX" << endl);
418  // unescape any escaped backslashes
419  index = esc_esc.search(s.c_str(), s.length(), matchlen, 0);
420  while (index < s.length()) {
421  DBG(cerr << "1aXX" << s << "XXX index: " << index << endl);
422  s.replace(index, 2, ESC);
423  DBG(cerr << "1bXX" << s << "XXX index: " << index << endl);
424  index = esc_esc.search(s.c_str(), s.length(), matchlen, 0);
425  }
426 
427  // unescape any escaped double quote characters
428  index = esc_quote.search(s.c_str(), s.length(), matchlen, 0);
429  while (index < s.length()) {
430  s.replace(index, 2, QUOTE);
431  DBG(cerr << "2XX" << s << "XXX index: " << index << endl);
432  index = esc_quote.search(s.c_str(), s.length(), matchlen, 0);
433  }
434 
435  // unescape octal characters
436  index = octal.search(s.c_str(), s.length(), matchlen, 0);
437  while (index < s.length()) {
438  s.replace(index, 4, unoctstring(s.substr(index + 1, 3)));
439  DBG(cerr << "3XX" << s << "XXX index: " << index << endl);
440  index = octal.search(s.c_str(), s.length(), matchlen, 0);
441  }
442 
443  DBG(cerr << "4XX" << s << "XXX" << endl);
444  return s;
445 }
446 
/**
 * @brief Wrap a message in double quotes and escape interior quotes.
 *
 * A leading and/or trailing double quote is added when missing. Then
 * every double quote strictly between the first and last character that
 * is not already preceded by a backslash gets one inserted before it.
 *
 * An empty message, or a message consisting of a single double quote,
 * yields a pair of double quotes.
 *
 * @param msg The message text (taken by value and modified).
 * @return The quoted and escaped message; always at least two characters.
 */
string
munge_error_message(string msg)
{
    // First, add enclosing quotes if needed. The size checks make sure an
    // opening and a closing quote are two distinct characters.
    if (msg.empty() || msg[0] != '"')
        msg.insert(msg.begin(), '"');
    if (msg.size() < 2 || msg[msg.size() - 1] != '"')
        msg += "\"";

    // Now escape any internal double quotes that aren't escaped.
    for (string::size_type i = 1; i + 1 < msg.size(); ++i) {
        if (msg[i] == '"' && msg[i - 1] != '\\') {
            msg.insert(i, 1, '\\');
            ++i; // step onto the quote that just moved one place right
        }
    }

    return msg;
}
464 
/**
 * @brief Put a backslash in front of every double quote.
 *
 * No check is made for quotes that already have a backslash before them;
 * they receive another one.
 *
 * @param source The string to escape (taken by value and modified).
 * @return The escaped string.
 */
string
escape_double_quotes(string source)
{
    // After inserting the backslash the quote sits at pos + 1, so the
    // next search starts at pos + 2.
    for (string::size_type pos = source.find('\"', 0);
         pos != string::npos;
         pos = source.find('\"', pos + 2)) {
        source.insert(pos, 1, '\\');
    }

    return source;
}
480 
/**
 * @brief Remove the backslash from every backslash + double quote pair.
 *
 * @param source The string to unescape (taken by value and modified).
 * @return The string with each backslash + double quote replaced by a
 * double quote.
 */
string
unescape_double_quotes(string source)
{
    string::size_type idx = 0;
    while((idx = source.find("\\\"", idx)) != string::npos) {
        source.replace(idx, 2, "\""); // just the double quote
        ++idx;
    }

    return source;
}
497 
498 } // namespace libdap
499 
string id2www_ce(string in, const string &allowable)
Definition: escaping.cc:178
string id2xml(string in, const string &not_allowed)
Definition: escaping.cc:272
int search(const char *s, int len, int &matchlen, int pos=0)
How much of the string does the pattern match.
Definition: GNURegex.cc:147
STL namespace.
string escape_double_quotes(string source)
Definition: escaping.cc:470
top level DAP object to house generic methods
Definition: AISConnect.cc:30
A class for software fault reporting.
Definition: InternalErr.h:64
string xml2id(string in)
Definition: escaping.cc:322
string www2id(const string &in, const string &escape, const string &except)
Definition: escaping.cc:220
string esc2underscore(string s)
Definition: escaping.cc:354
string unescattr(string s)
Definition: escaping.cc:407
string id2www(string in, const string &allowable)
Definition: escaping.cc:153
string unescape_double_quotes(string source)
Definition: escaping.cc:487
string escattr(string s)
Definition: escaping.cc:368