Doxygen
Loading...
Searching...
No Matches
xml.l
Go to the documentation of this file.
1/******************************************************************************
2 *
3 * Copyright (C) 1997-2020 by Dimitri van Heesch.
4 *
5 * Permission to use, copy, modify, and distribute this software and its
6 * documentation under the terms of the GNU General Public License is hereby
7 * granted. No representations are made about the suitability of this software
8 * for any purpose. It is provided "as is" without express or implied warranty.
9 * See the GNU General Public License for more details.
10 *
11 * Documents produced by Doxygen are derivative works derived from the
12 * input used in their production; they are not affected by this license.
13 *
14 */
15/******************************************************************************
16 * Minimal flex based parser for XML
17 ******************************************************************************/
18
19%option never-interactive
20%option prefix="xmlYY"
21%option reentrant
22%option extra-type="struct xmlYY_state *"
23%option 8bit noyywrap
24%top{
25#include <stdint.h>
26}
27
28%{
29
30#include <ctype.h>
31#include <vector>
32#include <stdio.h>
33#include "xml.h"
34
35#define YY_NEVER_INTERACTIVE 1
36#define YY_NO_INPUT 1
37#define YY_NO_UNISTD_H 1
38
40{
41 std::string fileName;
42 int lineNr = 1;
43 const char * inputString = nullptr; //!< the code fragment as text
44 int inputPosition = 0; //!< read offset during parsing
45 std::string name;
46 bool isEnd = false;
47 bool selfClose = false;
48 std::string data;
49 std::string attrValue;
50 std::string attrName;
56 std::string encoding;
57 std::vector<std::string> xpath;
58 std::function<XMLParser::Transcode> transcodeFunc;
59};
60
61#if USE_STATE2STRING
62static const char *stateToString(int state);
63#endif
64
65static int yyread(yyscan_t yyscanner,char *buf,int max_size);
66static void initElement(yyscan_t yyscanner);
67static void addCharacters(yyscan_t yyscanner);
68static void addElement(yyscan_t yyscanner);
69static void addAttribute(yyscan_t yyscanner);
70static void countLines(yyscan_t yyscanner, const char *txt,yy_size_t len);
71static void reportError(yyscan_t yyscanner, const std::string &msg);
72static std::string processData(yyscan_t yyscanner,const char *txt,yy_size_t len);
73
74#undef YY_INPUT
75#define YY_INPUT(buf,result,max_size) result=yyread(yyscanner,buf,max_size);
76
Event handlers that can be installed by the client and are called while parsing an XML document.
Definition xml.h:27
std::unordered_map< std::string, std::string > Attributes
Definition xml.h:29
static const char * stateToString(int state)
yyguts_t * yyscan_t
Definition code.l:24
void msg(const char *fmt,...)
Definition message.cpp:98
std::string name
Definition xml.l:45
std::string encoding
Definition xml.l:56
XMLHandlers handlers
Definition xml.l:52
char stringChar
Definition xml.l:55
XMLHandlers::Attributes attrs
Definition xml.l:51
bool isEnd
Definition xml.l:46
bool selfClose
Definition xml.l:47
std::string attrValue
Definition xml.l:49
std::string attrName
Definition xml.l:50
int cdataContext
Definition xml.l:53
std::vector< std::string > xpath
Definition xml.l:57
std::string fileName
Definition xml.l:41
int inputPosition
read offset during parsing
Definition xml.l:44
int lineNr
Definition xml.l:42
std::string data
Definition xml.l:48
int commentContext
Definition xml.l:54
const char * inputString
the code fragment as text
Definition xml.l:43
std::function< XMLParser::Transcode > transcodeFunc
Definition xml.l:58
static int yyread(yyscan_t yyscanner, char *buf, int max_size)
Definition xml.l:230
static void addCharacters(yyscan_t yyscanner)
Definition xml.l:329
static void reportError(yyscan_t yyscanner, const std::string &msg)
Definition xml.l:361
static void addAttribute(yyscan_t yyscanner)
Definition xml.l:350
static void addElement(yyscan_t yyscanner)
Definition xml.l:286
static void countLines(yyscan_t yyscanner, const char *txt, yy_size_t len)
Definition xml.l:245
static std::string processData(yyscan_t yyscanner, const char *txt, yy_size_t len)
Definition xml.l:380
static void initElement(yyscan_t yyscanner)
Definition xml.l:254
77%}
78
79NL (\r\n|\r|\n)
80SP [ \t\r\n]+
81OPEN {SP}?"<"
82OPENSPECIAL {SP}?"<?"
83CLOSE ">"{NL}?
84CLOSESPECIAL "?>"{NL}?
85NAMESTART [:A-Za-z\200-\377_]
86NAMECHAR [:A-Za-z\200-\377_0-9.-]
87NAME {NAMESTART}{NAMECHAR}*
88ESC "&#"[0-9]+";"|"&#x"[0-9a-fA-F]+";"
89COLON ":"
90PCDATA [^<]+
91COMMENT {OPEN}"!--"
92COMMENTEND "--"{CLOSE}
93STRING \"([^"&]|{ESC})*\"|\'([^'&]|{ESC})*\'
94DOCTYPE {SP}?"<!DOCTYPE"{SP}
95CDATA {SP}?"<![CDATA["
96ENDCDATA "]]>"
97
98%option noyywrap
99
100%s Initial
101%s Content
102%s CDataSection
103%s Element
104%s Attributes
105%s AttributeValue
106%s AttrValueStr
107%s Prolog
108%s Comment
109
111
112<Initial>{
113 {SP} { countLines(yyscanner,yytext,yyleng); }
static int countLines(yyscan_t yyscanner)
Definition code.l:3471
114 {DOCTYPE} { countLines(yyscanner,yytext,yyleng); }
115 {OPENSPECIAL} { countLines(yyscanner,yytext,yyleng); BEGIN(Prolog); }
116 {OPEN} { countLines(yyscanner,yytext,yyleng);
117 initElement(yyscanner);
118 BEGIN(Element); }
119 {COMMENT} { yyextra->commentContext = YY_START;
120 BEGIN(Comment);
121 }
122}
123<Content>{
124 {CDATA} { countLines(yyscanner,yytext,yyleng);
125 yyextra->cdataContext = YY_START;
126 BEGIN(CDataSection);
127 }
128 {PCDATA} { yyextra->data += processData(yyscanner,yytext,yyleng); }
129 {OPEN} { countLines(yyscanner,yytext,yyleng);
130 addCharacters(yyscanner);
131 initElement(yyscanner);
132 BEGIN(Element);
133 }
134 {COMMENT} { yyextra->commentContext = YY_START;
135 countLines(yyscanner,yytext,yyleng);
136 BEGIN(Comment);
137 }
138}
139<Element>{
140 "/" { yyextra->isEnd = true; }
141 {NAME} { yyextra->name = yytext;
142 BEGIN(Attributes); }
143 {CLOSE} { addElement(yyscanner);
144 countLines(yyscanner,yytext,yyleng);
145 yyextra->data = "";
146 BEGIN(Content);
147 }
148 {SP} { countLines(yyscanner,yytext,yyleng); }
149}
150<Attributes>{
151 "/" { yyextra->selfClose = true; }
152 {NAME} { yyextra->attrName = yytext; }
153 "=" { BEGIN(AttributeValue); }
154 {CLOSE} { addElement(yyscanner);
155 countLines(yyscanner,yytext,yyleng);
156 yyextra->data = "";
157 BEGIN(Content);
158 }
159 {SP} { countLines(yyscanner,yytext,yyleng); }
160}
161<AttributeValue>{
162 {SP} { countLines(yyscanner,yytext,yyleng); }
163 ['"] { yyextra->stringChar = *yytext;
164 yyextra->attrValue = "";
165 BEGIN(AttrValueStr);
166 }
167 . { std::string msg = std::string("Missing attribute value. Unexpected character `")+yytext+"` found";
168 reportError(yyscanner,msg);
169 unput(*yytext);
170 BEGIN(Attributes);
171 }
172}
173<AttrValueStr>{
174 [^'"\n]+ { yyextra->attrValue += processData(yyscanner,yytext,yyleng); }
175 ['"] { if (*yytext==yyextra->stringChar)
176 {
177 addAttribute(yyscanner);
178 BEGIN(Attributes);
179 }
180 else
181 {
182 yyextra->attrValue += processData(yyscanner,yytext,yyleng);
183 }
184 }
185 \n { yyextra->lineNr++; yyextra->attrValue+=' '; }
186}
187<CDataSection>{
188 {ENDCDATA} { BEGIN(yyextra->cdataContext); }
189 [^]\n]+ { yyextra->data += yytext; }
190 \n { yyextra->data += yytext;
191 yyextra->lineNr++;
192 }
193 . { yyextra->data += yytext; }
194}
195<Prolog>{
196 "encoding"\s*=\s*\"[^\"]*\" {
197 std::string encoding=yytext;
198 size_t i=encoding.find('"');
199 encoding=encoding.substr(i+1,yyleng-i-2);
200 if (encoding!="UTF-8") // need to transcode to UTF-8
201 {
202 yyextra->encoding=encoding;
203 }
204 }
205 {CLOSESPECIAL} { countLines(yyscanner,yytext,yyleng);
206 BEGIN(Initial);
207 }
208 \n { yyextra->lineNr++; }
209 . { }
210}
211<Comment>{
212 {COMMENTEND} { countLines(yyscanner,yytext,yyleng);
213 BEGIN(yyextra->commentContext);
214 }
215 [^\n-]+ { }
216 \n { yyextra->lineNr++; }
217 . { }
218}
219\n { yyextra->lineNr++; }
220. { std::string msg = "Unexpected character `";
221 msg+=yytext;
222 msg+="` found";
223 reportError(yyscanner,msg);
224 }
225
226%%
227
228//----------------------------------------------------------------------------------------
229
230static int yyread(yyscan_t yyscanner,char *buf,int max_size)
231{
232 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
233 int inputPosition = yyextra->inputPosition;
234 const char *s = yyextra->inputString + inputPosition;
235 int c=0;
236 while( c < max_size && *s)
237 {
238 *buf++ = *s++;
239 c++;
240 }
241 yyextra->inputPosition += c;
242 return c;
243}
244
245static void countLines(yyscan_t yyscanner, const char *txt,yy_size_t len)
246{
247 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
248 for (yy_size_t i=0;i<len;i++)
249 {
250 if (txt[i]=='\n') yyextra->lineNr++;
251 }
252}
253
254static void initElement(yyscan_t yyscanner)
255{
256 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
257 yyextra->isEnd = false; // true => </tag>
258 yyextra->selfClose = false; // true => <tag/>
259 yyextra->name = "";
260 yyextra->attrs.clear();
261}
262
263static void checkAndUpdatePath(yyscan_t yyscanner)
264{
265 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
266 if (yyextra->xpath.empty())
267 {
268 std::string msg = "found closing tag '"+yyextra->name+"' without matching opening tag";
269 reportError(yyscanner,msg);
270 }
271 else
272 {
273 std::string expectedTagName = yyextra->xpath.back();
274 if (expectedTagName!=yyextra->name)
275 {
276 std::string msg = "Found closing tag '"+yyextra->name+"' that does not match the opening tag '"+expectedTagName+"' at the same level";
277 reportError(yyscanner,msg);
278 }
279 else // matching end tag
280 {
281 yyextra->xpath.pop_back();
282 }
283 }
284}
285
286static void addElement(yyscan_t yyscanner)
287{
288 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
289 if (!yyextra->isEnd)
290 {
291 yyextra->xpath.push_back(yyextra->name);
292 if (yyextra->handlers.startElement)
293 {
294 yyextra->handlers.startElement(yyextra->name,yyextra->attrs);
295 }
296 if (yy_flex_debug)
297 {
298 fprintf(stderr,"%d: startElement(%s,attr=[",yyextra->lineNr,yyextra->name.data());
299 for (auto attr : yyextra->attrs)
300 {
301 fprintf(stderr,"%s='%s' ",attr.first.c_str(),attr.second.c_str());
302 }
303 fprintf(stderr,"])\n");
304 }
305 }
306 if (yyextra->isEnd || yyextra->selfClose)
307 {
308 if (yy_flex_debug)
309 {
310 fprintf(stderr,"%d: endElement(%s)\n",yyextra->lineNr,yyextra->name.data());
311 }
312 checkAndUpdatePath(yyscanner);
313 if (yyextra->handlers.endElement)
314 {
315 yyextra->handlers.endElement(yyextra->name);
316 }
317 }
318}
319
/** Returns a copy of \a str without leading and trailing whitespace.
 *
 *  Whitespace is whatever isspace() reports for the current C locale.
 *  An empty or all-whitespace string yields an empty result.
 */
static std::string trimSpaces(const std::string &str)
{
  size_t first = 0;
  size_t last  = str.length(); // one past the last character to keep
  // isspace() must not be called with a negative value, which a plain char can
  // be for bytes >= 0x80 (e.g. UTF-8 sequences), hence the cast to unsigned char
  while (first<last && isspace(static_cast<unsigned char>(str[first])))  first++;
  while (last>first && isspace(static_cast<unsigned char>(str[last-1]))) last--;
  return str.substr(first,last-first);
}
328
329static void addCharacters(yyscan_t yyscanner)
330{
331 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
332 std::string data = trimSpaces(yyextra->data);
333 if (!yyextra->encoding.empty() && !yyextra->transcodeFunc(data,yyextra->encoding.c_str()))
334 {
335 reportError(yyscanner,"failed to transcode string '"+data+"' from encoding '"+yyextra->encoding+"' to UTF-8");
336 }
337 if (yyextra->handlers.characters)
338 {
339 yyextra->handlers.characters(data);
340 }
341 if (!data.empty())
342 {
343 if (yy_flex_debug)
344 {
345 fprintf(stderr,"characters(%s)\n",data.c_str());
346 }
347 }
348}
349
350static void addAttribute(yyscan_t yyscanner)
351{
352 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
353 std::string val = yyextra->attrValue;
354 if (!yyextra->encoding.empty() && !yyextra->transcodeFunc(val,yyextra->encoding.c_str()))
355 {
356 reportError(yyscanner,"failed to transcode string '"+val+"' from encoding '"+yyextra->encoding+"' to UTF-8");
357 }
358 yyextra->attrs.insert(std::make_pair(yyextra->attrName,val));
359}
360
361static void reportError(yyscan_t yyscanner,const std::string &msg)
362{
363 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
364 if (yy_flex_debug)
365 {
366 fprintf(stderr,"%s:%d: Error '%s'\n",yyextra->fileName.c_str(),yyextra->lineNr,msg.c_str());
367 }
368 if (yyextra->handlers.error)
369 {
370 yyextra->handlers.error(yyextra->fileName,yyextra->lineNr,msg);
371 }
372}
373
374static const char *entities_enc[] = { "amp", "quot", "gt", "lt", "apos" };
375static const char entities_dec[] = { '&', '"', '>', '<', '\'' };
376static const int num_entities = 5;
377
378// replace character entities such as &amp; in txt and return the string where entities
379// are replaced
380static std::string processData(yyscan_t yyscanner,const char *txt,yy_size_t len)
381{
382 std::string result;
383 result.reserve(len);
384 for (yy_size_t i=0; i<len; i++)
385 {
386 char c = txt[i];
387 if (c=='&')
388 {
389 const int maxEntityLen = 10;
390 char entity[maxEntityLen+1];
391 entity[maxEntityLen]='\0';
392 for (yy_size_t j=0; j<maxEntityLen && i+j+1<len; j++)
393 {
394 if (txt[i+j+1]!=';')
395 {
396 entity[j]=txt[i+j+1];
397 }
398 else
399 {
400 entity[j]=0;
401 break;
402 }
403 }
404 bool found=false;
405 for (int e=0; !found && e<num_entities; e++)
406 {
407 if (strcmp(entity,entities_enc[e])==0)
408 {
409 result+=entities_dec[e];
410 i+=strlen(entities_enc[e])+1;
411 found=true;
412 }
413 }
414 if (!found)
415 {
416 std::string msg = std::string("Invalid character entity '&") + entity + ";' found\n";
417 reportError(yyscanner,msg);
418 }
419 }
420 else
421 {
422 result+=c;
423 }
424 }
425 return result;
426}
427
428//--------------------------------------------------------------
429
435
437{
438 xmlYYlex_init_extra(&p->xmlYY_extra,&p->yyscanner);
439 p->xmlYY_extra.handlers = handlers;
440}
441
443{
444 xmlYYlex_destroy(p->yyscanner);
445}
446
447void XMLParser::parse(const char *fileName,
448 const char *inputStr,
449 bool debugEnabled,
450 std::function<void()> debugStart,
451 std::function<void()> debugEnd,
452 std::function<Transcode> transcodeFunc)
453{
454 yyscan_t yyscanner = p->yyscanner;
455 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
456
457#ifdef FLEX_DEBUG
458 xmlYYset_debug(debugEnabled?1:0,p->yyscanner);
459#endif
460
461 if (inputStr==nullptr || inputStr[0]=='\0') return; // empty input
462
463 debugStart();
464
465 BEGIN(Initial);
466 yyextra->fileName = fileName;
467 yyextra->lineNr = 1;
468 yyextra->inputString = inputStr;
469 yyextra->inputPosition = 0;
470 yyextra->transcodeFunc = transcodeFunc;
471
472 if (static_cast<unsigned char>(inputStr[0])==0xEF &&
473 static_cast<unsigned char>(inputStr[1])==0xBB &&
474 static_cast<unsigned char>(inputStr[2])==0xBF)
475 {
476 yyextra->inputPosition = 3; // remove UTF-8 BOM
477 }
478
479 xmlYYrestart( 0, yyscanner );
480
481 if (yyextra->handlers.startDocument)
482 {
483 yyextra->handlers.startDocument();
484 }
485 xmlYYlex(yyscanner);
486 if (yyextra->handlers.endDocument)
487 {
488 yyextra->handlers.endDocument();
489 }
490
491 if (!yyextra->xpath.empty())
492 {
493 std::string tagName = yyextra->xpath.back();
494 std::string msg = "End of file reached while expecting closing tag '"+tagName+"'";
495 reportError(yyscanner,msg);
496 }
497
498 debugEnd();
499}
500
502{
503 struct yyguts_t *yyg = (struct yyguts_t*)p->yyscanner;
504 return yyextra->lineNr;
505}
506
507std::string XMLParser::fileName() const
508{
509 struct yyguts_t *yyg = (struct yyguts_t*)p->yyscanner;
510 return yyextra->fileName;
511}
512
513#if USE_STATE2STRING
514#include "xml.l.h"
515#endif
int lineNr() const override
Definition xml.l:501
std::unique_ptr< Private > p
Definition xml.h:109
std::string fileName() const override
Definition xml.l:507
XMLParser(const XMLHandlers &handlers)
Definition xml.l:436
void parse(const char *fileName, const char *inputString, bool debugEnabled, std::function< void()> debugStart, std::function< void()> debugEnd, std::function< Transcode > transcoder=[](std::string &s, const char *){ return true;})
Definition xml.l:447
~XMLParser() override
Definition xml.l:442
static int yyread(yyscan_t yyscanner, char *buf, int max_size)
Definition code.l:3980
struct xmlYY_state xmlYY_extra
Definition xml.l:433
yyscan_t yyscanner
Definition xml.l:432
bool found
Definition util.cpp:984
static std::string trimSpaces(const std::string &str)
Definition xml.l:320
static const int num_entities
Definition xml.l:376
static const char entities_dec[]
Definition xml.l:375
static void checkAndUpdatePath(yyscan_t yyscanner)
Definition xml.l:263
static const char * entities_enc[]
Definition xml.l:374