0.9.8.10
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
TokenizerTools.cc
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2007-2015 Hypertable, Inc.
3  *
4  * This file is part of Hypertable.
5  *
6  * Hypertable is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; version 3 of the
9  * License, or any later version.
10  *
11  * Hypertable is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19  * 02110-1301, USA.
20  */
21 
27 
28 #include <Common/Compat.h>
29 
30 #include "TokenizerTools.h"
31 
32 #include <Common/Logger.h>
33 
34 #include <cctype>
35 #include <cerrno>
36 #include <climits>
37 #include <cstdlib>
38 #include <map>
39 #include <stack>
40 
41 using namespace std;
42 
43 namespace Hypertable { namespace ClusterDefinitionFile { namespace TokenizerTools {
44 
/// Checks if character is a valid bash identifier start character.
/// A character qualifies if it is alphabetic (as reported by
/// <code>isalpha</code>) or an underscore.
/// @param c Character to check
/// @return <i>true</i> if <code>c</code> may start an identifier,
/// <i>false</i> otherwise
bool is_identifier_start_character(char c) {
  // The <cctype> classifiers are undefined for negative arguments other than
  // EOF, so a plain (possibly signed) char is converted to unsigned char first.
  return std::isalpha(static_cast<unsigned char>(c)) || c == '_';
}
48 
50  return is_identifier_start_character(c) || isdigit(c);
51 }
52 
53 bool is_valid_identifier(const string &name) {
54  const char *ptr = name.c_str();
55  if (*ptr == 0 || !is_identifier_start_character(*ptr))
56  return false;
57  for (ptr=ptr+1; *ptr; ptr++) {
58  if (!is_identifier_character(*ptr))
59  return false;
60  }
61  return true;
62 }
63 
/// Checks if string is an ASCII number.
/// The string is parsed with <code>strtol</code> in base 10, so optional
/// leading whitespace and a sign are accepted.  The whole string must be
/// consumed and the value must fit in a <code>long</code>.
/// @param str String to check
/// @return <i>true</i> if <code>str</code> holds a base-10 integer,
/// <i>false</i> otherwise (including for the empty string)
bool is_number(const std::string &str) {
  const char *begin = str.c_str();
  char *end = nullptr;

  errno = 0;
  const long val = std::strtol(begin, &end, 10);

  // Out-of-range values, or any other conversion error, are not numbers.
  if ((errno == ERANGE && (val == LONG_MAX || val == LONG_MIN))
      || (errno != 0 && val == 0))
    return false;

  // strtol leaves end at the start of the input when it converted nothing.
  // Without this check an empty string would pass the trailing-text test
  // below, because *end would be the terminating NUL.
  if (end == nullptr || end == begin)
    return false;

  // Anything left over after the digits means the string is not a number.
  return *end == '\0';
}
79 
80 bool find_token(const string &token, const char *base, const char *end, size_t *offsetp) {
81  char quote_char = 0;
82  const char *ptr = base;
83  HT_ASSERT(!token.empty());
84  end -= token.length() - 1;
85  while (ptr < end) {
86  if (quote_char == 0) {
87  if (*ptr == '"' || *ptr == '\'' || *ptr == '`')
88  quote_char = *ptr;
89  else if (*ptr == '#') { // skip comments
90  while (ptr < end && *ptr != '\n')
91  ptr++;
92  if (ptr == end)
93  return false;
94  }
95  else if (strncmp(ptr, token.c_str(), token.length()) == 0) {
96  *offsetp = ptr-base;
97  return true;
98  }
99  }
100  else {
101  if (*ptr == quote_char && *(ptr-1) != '\\')
102  quote_char = 0;
103  }
104  ptr++;
105  }
106  return false;
107 }
108 
109 bool find_next_token(const char *base, size_t *offsetp, size_t *lengthp) {
110  char quote_char = 0;
111  const char *ptr = base;
112  while (*ptr) {
113  if (quote_char == 0) {
114  if (*ptr == '"' || *ptr == '\'' || *ptr == '`')
115  quote_char = *ptr;
116  else if (*ptr == '#') { // skip comments
117  while (*ptr && *ptr != '\n')
118  ptr++;
119  if (*ptr == 0)
120  continue;
121  }
122  else if (is_identifier_character(*ptr)) {
123  *offsetp = ptr-base;
124  base = ptr++;
125  while (*ptr && is_identifier_character(*ptr))
126  ptr++;
127  *lengthp = ptr - base;
128  return true;
129  }
130  }
131  else {
132  if (*ptr == quote_char && *(ptr-1) != '\\')
133  quote_char = 0;
134  }
135  ptr++;
136  }
137  return false;
138 }
139 
140 
141 
142 bool find_end_char(const char *base, const char **endp, size_t *linep) {
143  stack<char> scope;
144 
145  HT_ASSERT(*base == '"' || *base == '\'' || *base == '`' || *base == '{');
146 
147  scope.push(*base);
148  const char *ptr = base+1;
149 
150  while (*ptr) {
151 
152  if (scope.top() == '"') {
153  if (*ptr == '"' && *(ptr-1) != '\\') {
154  scope.pop();
155  if (scope.empty())
156  break;
157  }
158  }
159  else if (scope.top() == '\'') {
160  if (*ptr == '\'' && *(ptr-1) != '\\') {
161  scope.pop();
162  if (scope.empty())
163  break;
164  }
165  }
166  else if (scope.top() == '`') {
167  if (*ptr == '`') {
168  scope.pop();
169  if (scope.empty())
170  break;
171  }
172  }
173  else {
174  HT_ASSERT(scope.top() == '{');
175  if (*ptr == '#') {
176  // skip comments
177  while (*ptr && *ptr != '\n')
178  ptr++;
179  if (*ptr == 0)
180  break;
181  }
182  else if (*ptr == '}') {
183  scope.pop();
184  if (scope.empty())
185  break;
186  }
187  else if (*ptr == '"' || *ptr == '\'' || *ptr == '`')
188  scope.push(*ptr);
189  else if (*ptr == '{') {
190  if (*(ptr-1) == '$') {
191  while (*ptr && *ptr != '}')
192  ptr++;
193  if (*ptr == 0)
194  break;
195  ptr++;
196  }
197  else
198  scope.push(*ptr);
199  }
200  }
201  if (*ptr == '\n' && linep)
202  (*linep)++;
203  ptr++;
204  }
205 
206  if (*ptr == 0)
207  return false;
208 
209  *endp = ptr;
210  return true;
211 }
212 
213 bool skip_control_flow_statement(const char **basep) {
214  map<string, string> control_flow_token_map;
215  control_flow_token_map["if"] = "fi";
216  control_flow_token_map["for"] = "done";
217  control_flow_token_map["until"] = "done";
218  control_flow_token_map["while"] = "done";
219  control_flow_token_map["case"] = "esac";
220  const char *ptr = *basep;
221 
222  while (*ptr && isspace(*ptr))
223  ptr++;
224 
225  size_t offset;
226  size_t length;
227  if (!find_next_token(ptr, &offset, &length)) {
228  *basep = ptr;
229  return false;
230  }
231 
232  string token(ptr+offset, length);
233  auto iter = control_flow_token_map.find(token);
234  if (iter == control_flow_token_map.end()) {
235  *basep = ptr;
236  return false;
237  }
238 
239  stack<string> scope;
240  scope.push(control_flow_token_map[token]);
241 
242  ptr = ptr+offset+length;
243  while (*ptr && find_next_token(ptr, &offset, &length)) {
244  token = string(ptr+offset, length);
245  if (token.compare(scope.top()) == 0) {
246  scope.pop();
247  if (scope.empty()) {
248  *basep = ptr+offset+length;
249  return true;
250  }
251  }
252  else if (control_flow_token_map.find(token) != control_flow_token_map.end())
253  scope.push(control_flow_token_map[token]);
254  ptr += (offset+length);
255  }
256 
257  *basep = ptr;
258  return false;
259 }
260 
261 
/// Counts number of newlines in text.
/// @param base Start of the text
/// @param end One past the last character of the text
/// @return Number of newline characters in [<code>base</code>,
/// <code>end</code>)
size_t count_newlines(const char *base, const char *end) {
  size_t newlines = 0;
  const char *cursor = base;
  while (cursor < end) {
    newlines += (*cursor == '\n') ? 1 : 0;
    ++cursor;
  }
  return newlines;
}
270 
/// Skips to next newline character in text.
/// @param endp In: address of pointer to NUL-terminated text.  Out: points
/// at the first newline at or after the starting position, or at the
/// terminating NUL if there is none
/// @return <i>true</i> if a newline was found, <i>false</i> if the end of
/// the text was reached
bool skip_to_newline(const char **endp) {
  const char *cursor = *endp;
  for (; *cursor != '\0'; ++cursor) {
    if (*cursor == '\n')
      break;
  }
  *endp = cursor;
  return *cursor == '\n';
}
278 
279 bool substitute_variables(const string &input, string &output,
280  map<string, string> &vmap) {
281  bool ret {};
282  string translated_text;
283  const char *base = input.c_str();
284  const char *ptr = strchr(base, '$');
285  while (ptr) {
286  if (ptr > base && *(ptr-1) == '\\') {
287  ptr++;
288  translated_text.append(base, ptr-base);
289  base = ptr;
290  ptr = strchr(base, '$');
291  continue;
292  }
293  translated_text.append(base, ptr-base);
294  base = ptr;
295  if (base[1] == '{') {
296  if ((ptr = strchr(base, '}')) != nullptr) {
297  string variable(base+2, (ptr-base)-2);
298  auto iter = vmap.find(variable);
299  if (iter != vmap.end()) {
300  translated_text.append(vmap[variable]);
301  ret = true;
302  }
303  else
304  translated_text.append(base, (ptr-base)+1);
305  base = ptr + 1;
306  }
307  else
308  base++;
309  }
310  else {
311  ptr = base + 1;
312  while (*ptr && is_identifier_character(*ptr))
313  ptr++;
314  string variable(base+1, (ptr-base)-1);
315  auto iter = vmap.find(variable);
316  if (iter != vmap.end()) {
317  translated_text.append(vmap[variable]);
318  ret = true;
319  }
320  else
321  translated_text.append(base, ptr-base);
322  base = ptr;
323  }
324  ptr = strchr(base, '$');
325  }
326  translated_text.append(base);
327  output.clear();
328  output.append(translated_text);
329  return ret;
330 }
331 
332 
333 }}}
bool find_token(const string &token, const char *base, const char *end, size_t *offsetp)
Finds a string token in a block of code.
bool skip_to_newline(const char **endp)
Skips to next newline character in text.
bool skip_control_flow_statement(const char **basep)
Skips over bash control flow statement.
bool substitute_variables(const string &input, string &output, map< string, string > &vmap)
Does variable substitution in a block of text.
Po::typed_value< String > * str(String *v=0)
Definition: Properties.h:166
STL namespace.
bool find_end_char(const char *base, const char **endp, size_t *linep)
Skips to end of block or quoted string in code.
size_t count_newlines(const char *base, const char *end)
Counts number of newlines in text.
bool is_valid_identifier(const string &name)
Checks if name is a valid bash identifier.
#define HT_ASSERT(_e_)
Definition: Logger.h:396
Logging routines and macros.
Compatibility Macros for C/C++.
bool find_next_token(const char *base, size_t *offsetp, size_t *lengthp)
Finds next bash identifier token in a block of text.
Hypertable definitions
bool is_number(const string &str)
Checks if string is an ASCII number.
Declarations for TokenizerTools.
bool is_identifier_character(char c)
Checks if character is valid bash identifier character.
bool is_identifier_start_character(char c)
Checks if character is valid bash identifier start character.