Doxygen
Loading...
Searching...
No Matches
xml.l
Go to the documentation of this file.
-1/******************************************************************************
2 *
3 * Copyright (C) 1997-2020 by Dimitri van Heesch.
4 *
5 * Permission to use, copy, modify, and distribute this software and its
6 * documentation under the terms of the GNU General Public License is hereby
7 * granted. No representations are made about the suitability of this software
8 * for any purpose. It is provided "as is" without express or implied warranty.
9 * See the GNU General Public License for more details.
10 *
11 * Documents produced by Doxygen are derivative works derived from the
12 * input used in their production; they are not affected by this license.
13 *
14 */
13/******************************************************************************
14 * Minimal flex based parser for XML
15 ******************************************************************************/
16
17%option never-interactive
18%option prefix="xmlYY"
19%option reentrant
20%option extra-type="struct xmlYY_state *"
21%option 8bit noyywrap
22%top{
23#include <stdint.h>
24}
25
26%{
27
28#include <ctype.h>
29#include <vector>
30#include <stdio.h>
31#include "xml.h"
32
33#define YY_NEVER_INTERACTIVE 1
34#define YY_NO_INPUT 1
35#define YY_NO_UNISTD_H 1
38{
39 std::string fileName;
40 int lineNr = 1;
41 const char * inputString = nullptr; //!< the code fragment as text
42 int inputPosition = 0; //!< read offset during parsing
43 std::string name;
44 bool isEnd = false;
45 bool selfClose = false;
46 std::string data;
47 std::string attrValue;
48 std::string attrName;
54 std::string encoding;
55 std::vector<std::string> xpath;
56 std::function<XMLParser::Transcode> transcodeFunc;
57};
59#if USE_STATE2STRING
60static const char *stateToString(int state);
61#endif
62
63static int yyread(yyscan_t yyscanner,char *buf,int max_size);
64static void initElement(yyscan_t yyscanner);
65static void addCharacters(yyscan_t yyscanner);
66static void addElement(yyscan_t yyscanner);
67static void addAttribute(yyscan_t yyscanner);
68static void countLines(yyscan_t yyscanner, const char *txt,yy_size_t len);
69static void reportError(yyscan_t yyscanner, const std::string &msg);
70static std::string processData(yyscan_t yyscanner,const char *txt,yy_size_t len);
71
72#undef YY_INPUT
73#define YY_INPUT(buf,result,max_size) result=yyread(yyscanner,buf,max_size);
74
Event handlers that can be installed by the client and are called while parsing an XML document.
Definition xml.h:27
std::unordered_map< std::string, std::string > Attributes
Definition xml.h:29
static const char * stateToString(int state)
yyguts_t * yyscan_t
Definition code.l:24
#define msg(fmt,...)
Definition message.h:94
std::string name
Definition xml.l:45
std::string encoding
Definition xml.l:56
XMLHandlers handlers
Definition xml.l:52
char stringChar
Definition xml.l:55
XMLHandlers::Attributes attrs
Definition xml.l:51
bool isEnd
Definition xml.l:46
bool selfClose
Definition xml.l:47
std::string attrValue
Definition xml.l:49
std::string attrName
Definition xml.l:50
int cdataContext
Definition xml.l:53
std::vector< std::string > xpath
Definition xml.l:57
std::string fileName
Definition xml.l:41
int inputPosition
read offset during parsing
Definition xml.l:44
int lineNr
Definition xml.l:42
std::string data
Definition xml.l:48
int commentContext
Definition xml.l:54
const char * inputString
the code fragment as text
Definition xml.l:43
std::function< XMLParser::Transcode > transcodeFunc
Definition xml.l:58
static int yyread(yyscan_t yyscanner, char *buf, int max_size)
Definition xml.l:230
static void addCharacters(yyscan_t yyscanner)
Definition xml.l:329
static void reportError(yyscan_t yyscanner, const std::string &msg)
Definition xml.l:361
static void addAttribute(yyscan_t yyscanner)
Definition xml.l:350
static void addElement(yyscan_t yyscanner)
Definition xml.l:286
static void countLines(yyscan_t yyscanner, const char *txt, yy_size_t len)
Definition xml.l:245
static std::string processData(yyscan_t yyscanner, const char *txt, yy_size_t len)
Definition xml.l:380
static void initElement(yyscan_t yyscanner)
Definition xml.l:254
75%}
76
77NL (\r\n|\r|\n)
78SP [ \t\r\n]+
79OPEN {SP}?"<"
80OPENSPECIAL {SP}?"<?"
81CLOSE ">"{NL}?
82CLOSESPECIAL "?>"{NL}?
83NAMESTART [:A-Za-z\200-\377_]
84NAMECHAR [:A-Za-z\200-\377_0-9.-]
85NAME {NAMESTART}{NAMECHAR}*
86ESC "&#"[0-9]+";"|"&#x"[0-9a-fA-F]+";"
87COLON ":"
88PCDATA [^<]+
89COMMENT {OPEN}"!--"
90COMMENTEND "--"{CLOSE}
91STRING \"([^"&]|{ESC})*\"|\'([^'&]|{ESC})*\'
92DOCTYPE {SP}?"<!DOCTYPE"{SP}
93CDATA {SP}?"<![CDATA["
94ENDCDATA "]]>"
95
96%option noyywrap
97
98%s Initial
99%s Content
100%s CDataSection
101%s Element
102%s Attributes
103%s AttributeValue
104%s AttrValueStr
105%s Prolog
106%s Comment
107
108%%
109
<Initial>{
  {SP}                  { // whitespace before the first construct only affects the line count
                          countLines(yyscanner,yytext,yyleng);
                        }
  {DOCTYPE}             { countLines(yyscanner,yytext,yyleng); }
  {OPENSPECIAL}         { // start of a "<?...?>" section
                          countLines(yyscanner,yytext,yyleng);
                          BEGIN(Prolog);
                        }
  {OPEN}                { // start of the first element tag
                          countLines(yyscanner,yytext,yyleng);
                          initElement(yyscanner);
                          BEGIN(Element);
                        }
  {COMMENT}             { // remember the state to return to when the comment ends
                          yyextra->commentContext = YY_START;
                          // {COMMENT} may include leading whitespace (via {OPEN}), so
                          // newlines in the match have to be counted here as well
                          countLines(yyscanner,yytext,yyleng);
                          BEGIN(Comment);
                        }
}
121<Content>{
122 {CDATA} { countLines(yyscanner,yytext,yyleng);
123 yyextra->cdataContext = YY_START;
124 BEGIN(CDataSection);
125 }
126 {PCDATA} { yyextra->data += processData(yyscanner,yytext,yyleng); }
127 {OPEN} { countLines(yyscanner,yytext,yyleng);
128 addCharacters(yyscanner);
129 initElement(yyscanner);
130 BEGIN(Element);
131 }
132 {COMMENT} { yyextra->commentContext = YY_START;
133 countLines(yyscanner,yytext,yyleng);
134 BEGIN(Comment);
135 }
136}
137<Element>{
138 "/" { yyextra->isEnd = true; }
139 {NAME} { yyextra->name = yytext;
140 BEGIN(Attributes); }
141 {CLOSE} { addElement(yyscanner);
142 countLines(yyscanner,yytext,yyleng);
143 yyextra->data = "";
144 BEGIN(Content);
145 }
146 {SP} { countLines(yyscanner,yytext,yyleng); }
147}
148<Attributes>{
149 "/" { yyextra->selfClose = true; }
150 {NAME} { yyextra->attrName = yytext; }
151 "=" { BEGIN(AttributeValue); }
152 {CLOSE} { addElement(yyscanner);
153 countLines(yyscanner,yytext,yyleng);
154 yyextra->data = "";
155 BEGIN(Content);
156 }
157 {SP} { countLines(yyscanner,yytext,yyleng); }
158}
159<AttributeValue>{
160 {SP} { countLines(yyscanner,yytext,yyleng); }
161 ['"] { yyextra->stringChar = *yytext;
162 yyextra->attrValue = "";
163 BEGIN(AttrValueStr);
164 }
165 . { std::string msg = std::string("Missing attribute value. Unexpected character `")+yytext+"` found";
166 reportError(yyscanner,msg);
167 unput(*yytext);
168 BEGIN(Attributes);
169 }
170}
171<AttrValueStr>{
172 [^'"\n]+ { yyextra->attrValue += processData(yyscanner,yytext,yyleng); }
173 ['"] { if (*yytext==yyextra->stringChar)
174 {
175 addAttribute(yyscanner);
176 BEGIN(Attributes);
177 }
178 else
179 {
180 yyextra->attrValue += processData(yyscanner,yytext,yyleng);
181 }
182 }
183 \n { yyextra->lineNr++; yyextra->attrValue+=' '; }
184}
185<CDataSection>{
186 {ENDCDATA} { BEGIN(yyextra->cdataContext); }
187 [^]\n]+ { yyextra->data += yytext; }
188 \n { yyextra->data += yytext;
189 yyextra->lineNr++;
190 }
191 . { yyextra->data += yytext; }
192}
<Prolog>{
  "encoding"{SP}?"="{SP}?(\"[^\"]*\"|\'[^\']*\') {
                          // Note: flex has no \s escape (it would match a literal 's'),
                          // so optional whitespace around '=' is expressed with {SP}.
                          // Both quoting styles are accepted for the value.
                          countLines(yyscanner,yytext,yyleng);
                          const std::string decl(yytext,yyleng);
                          const char quote  = decl.back();      // closing quote, same char as the opening one
                          const size_t open = decl.find(quote); // position of the opening quote
                          const std::string encoding = decl.substr(open+1,decl.size()-open-2);
                          if (encoding!="UTF-8") // need to transcode to UTF-8
                          {
                            yyextra->encoding=encoding;
                          }
                        }
  {CLOSESPECIAL}        { // end of the "<?...?>" section
                          countLines(yyscanner,yytext,yyleng);
                          BEGIN(Initial);
                        }
  \n                    { yyextra->lineNr++; }
  .                     { }
}
209<Comment>{
210 {COMMENTEND} { countLines(yyscanner,yytext,yyleng);
211 BEGIN(yyextra->commentContext);
212 }
213 [^\n-]+ { }
214 \n { yyextra->lineNr++; }
215 . { }
216}
217\n { yyextra->lineNr++; }
218. { std::string msg = "Unexpected character `";
219 msg+=yytext;
220 msg+="` found";
221 reportError(yyscanner,msg);
222 }
223
224%%
225
226//----------------------------------------------------------------------------------------
227
228static int yyread(yyscan_t yyscanner,char *buf,int max_size)
229{
230 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
231 int inputPosition = yyextra->inputPosition;
232 const char *s = yyextra->inputString + inputPosition;
233 int c=0;
234 while( c < max_size && *s)
235 {
236 *buf++ = *s++;
237 c++;
238 }
239 yyextra->inputPosition += c;
240 return c;
241}
242
243static void countLines(yyscan_t yyscanner, const char *txt,yy_size_t len)
244{
245 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
246 for (yy_size_t i=0;i<len;i++)
247 {
248 if (txt[i]=='\n') yyextra->lineNr++;
249 }
250}
251
252static void initElement(yyscan_t yyscanner)
253{
254 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
255 yyextra->isEnd = false; // true => </tag>
256 yyextra->selfClose = false; // true => <tag/>
257 yyextra->name = "";
258 yyextra->attrs.clear();
259}
260
261static void checkAndUpdatePath(yyscan_t yyscanner)
262{
263 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
264 if (yyextra->xpath.empty())
265 {
266 std::string msg = "found closing tag '"+yyextra->name+"' without matching opening tag";
267 reportError(yyscanner,msg);
268 }
269 else
270 {
271 std::string expectedTagName = yyextra->xpath.back();
272 if (expectedTagName!=yyextra->name)
273 {
274 std::string msg = "Found closing tag '"+yyextra->name+"' that does not match the opening tag '"+expectedTagName+"' at the same level";
275 reportError(yyscanner,msg);
276 }
277 else // matching end tag
278 {
279 yyextra->xpath.pop_back();
280 }
281 }
282}
283
284static void addElement(yyscan_t yyscanner)
285{
286 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
287 if (!yyextra->isEnd)
288 {
289 yyextra->xpath.push_back(yyextra->name);
290 if (yyextra->handlers.startElement)
291 {
292 yyextra->handlers.startElement(yyextra->name,yyextra->attrs);
293 }
294 if (yy_flex_debug)
295 {
296 fprintf(stderr,"%d: startElement(%s,attr=[",yyextra->lineNr,yyextra->name.data());
297 for (auto attr : yyextra->attrs)
298 {
299 fprintf(stderr,"%s='%s' ",attr.first.c_str(),attr.second.c_str());
300 }
301 fprintf(stderr,"])\n");
302 }
303 }
304 if (yyextra->isEnd || yyextra->selfClose)
305 {
306 if (yy_flex_debug)
307 {
308 fprintf(stderr,"%d: endElement(%s)\n",yyextra->lineNr,yyextra->name.data());
309 }
310 checkAndUpdatePath(yyscanner);
311 if (yyextra->handlers.endElement)
312 {
313 yyextra->handlers.endElement(yyextra->name);
314 }
315 }
316}
317
/** Returns a copy of \a str with leading and trailing whitespace removed.
 *
 *  Whitespace is whatever isspace() reports for the current C locale. Each byte
 *  is converted to unsigned char first, because passing a negative char value
 *  (any byte >= 0x80 where char is signed) to isspace() is undefined behavior.
 *
 *  @returns the trimmed string; empty if \a str is empty or all whitespace.
 */
static std::string trimSpaces(const std::string &str)
{
  const auto isSpace = [](char c) { return isspace(static_cast<unsigned char>(c))!=0; };
  size_t start = 0;
  size_t end   = str.length(); // one past the last character to keep
  while (start<end && isSpace(str[start])) start++;
  while (end>start && isSpace(str[end-1])) end--;
  return str.substr(start,end-start);
}
326
327static void addCharacters(yyscan_t yyscanner)
328{
329 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
330 std::string data = trimSpaces(yyextra->data);
331 if (!yyextra->encoding.empty() && !yyextra->transcodeFunc(data,yyextra->encoding.c_str()))
332 {
333 reportError(yyscanner,"failed to transcode string '"+data+"' from encoding '"+yyextra->encoding+"' to UTF-8");
334 }
335 if (yyextra->handlers.characters)
336 {
337 yyextra->handlers.characters(data);
338 }
339 if (!data.empty())
340 {
341 if (yy_flex_debug)
342 {
343 fprintf(stderr,"characters(%s)\n",data.c_str());
344 }
345 }
346}
347
348static void addAttribute(yyscan_t yyscanner)
349{
350 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
351 std::string val = yyextra->attrValue;
352 if (!yyextra->encoding.empty() && !yyextra->transcodeFunc(val,yyextra->encoding.c_str()))
353 {
354 reportError(yyscanner,"failed to transcode string '"+val+"' from encoding '"+yyextra->encoding+"' to UTF-8");
355 }
356 yyextra->attrs.insert(std::make_pair(yyextra->attrName,val));
357}
358
359static void reportError(yyscan_t yyscanner,const std::string &msg)
360{
361 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
362 if (yy_flex_debug)
363 {
364 fprintf(stderr,"%s:%d: Error '%s'\n",yyextra->fileName.c_str(),yyextra->lineNr,msg.c_str());
365 }
366 if (yyextra->handlers.error)
367 {
368 yyextra->handlers.error(yyextra->fileName,yyextra->lineNr,msg);
369 }
370}
371
372static const char *entities_enc[] = { "amp", "quot", "gt", "lt", "apos" };
373static const char entities_dec[] = { '&', '"', '>', '<', '\'' };
374static const int num_entities = 5;
376// replace character entities such as &amp; in txt and return the string where entities
377// are replaced
378static std::string processData(yyscan_t yyscanner,const char *txt,yy_size_t len)
379{
380 std::string result;
381 result.reserve(len);
382 for (yy_size_t i=0; i<len; i++)
383 {
384 char c = txt[i];
385 if (c=='&')
386 {
387 const int maxEntityLen = 10;
388 char entity[maxEntityLen+1];
389 entity[maxEntityLen]='\0';
390 for (yy_size_t j=0; j<maxEntityLen && i+j+1<len; j++)
391 {
392 if (txt[i+j+1]!=';')
393 {
394 entity[j]=txt[i+j+1];
395 }
396 else
397 {
398 entity[j]=0;
399 break;
400 }
401 }
402 bool found=false;
403 for (int e=0; !found && e<num_entities; e++)
404 {
405 if (strcmp(entity,entities_enc[e])==0)
406 {
407 result+=entities_dec[e];
408 i+=strlen(entities_enc[e])+1;
409 found=true;
410 }
411 }
412 if (!found)
413 {
414 std::string msg = std::string("Invalid character entity '&") + entity + ";' found\n";
415 reportError(yyscanner,msg);
416 }
417 }
418 else
419 {
420 result+=c;
421 }
422 }
423 return result;
424}
425
426//--------------------------------------------------------------
427
429{
435{
436 xmlYYlex_init_extra(&p->xmlYY_extra,&p->yyscanner);
437 p->xmlYY_extra.handlers = handlers;
438}
439
441{
442 xmlYYlex_destroy(p->yyscanner);
443}
444
445void XMLParser::parse(const char *fileName,
446 const char *inputStr,
447 bool debugEnabled,
448 std::function<void()> debugStart,
449 std::function<void()> debugEnd,
450 std::function<Transcode> transcodeFunc)
451{
452 yyscan_t yyscanner = p->yyscanner;
453 struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
454
455#ifdef FLEX_DEBUG
456 xmlYYset_debug(debugEnabled?1:0,p->yyscanner);
457#endif
458
459 if (inputStr==nullptr || inputStr[0]=='\0') return; // empty input
460
461 debugStart();
462
463 BEGIN(Initial);
464 yyextra->fileName = fileName;
465 yyextra->lineNr = 1;
466 yyextra->inputString = inputStr;
467 yyextra->inputPosition = 0;
468 yyextra->transcodeFunc = transcodeFunc;
469
470 if (static_cast<unsigned char>(inputStr[0])==0xEF &&
471 static_cast<unsigned char>(inputStr[1])==0xBB &&
472 static_cast<unsigned char>(inputStr[2])==0xBF)
473 {
474 yyextra->inputPosition = 3; // remove UTF-8 BOM
475 }
476
477 xmlYYrestart( 0, yyscanner );
478
479 if (yyextra->handlers.startDocument)
480 {
481 yyextra->handlers.startDocument();
482 }
483 xmlYYlex(yyscanner);
484 if (yyextra->handlers.endDocument)
485 {
486 yyextra->handlers.endDocument();
487 }
488
489 if (!yyextra->xpath.empty())
490 {
491 std::string tagName = yyextra->xpath.back();
492 std::string msg = "End of file reached while expecting closing tag '"+tagName+"'";
493 reportError(yyscanner,msg);
494 }
495
496 debugEnd();
497}
498
499int XMLParser::lineNr() const
500{
501 struct yyguts_t *yyg = (struct yyguts_t*)p->yyscanner;
502 return yyextra->lineNr;
503}
504
505std::string XMLParser::fileName() const
506{
507 struct yyguts_t *yyg = (struct yyguts_t*)p->yyscanner;
508 return yyextra->fileName;
509}
510
511#if USE_STATE2STRING
512#include "xml.l.h"
513#endif
int lineNr() const override
Definition xml.l:501
std::unique_ptr< Private > p
Definition xml.h:109
std::string fileName() const override
Definition xml.l:507
XMLParser(const XMLHandlers &handlers)
Definition xml.l:436
void parse(const char *fileName, const char *inputString, bool debugEnabled, std::function< void()> debugStart, std::function< void()> debugEnd, std::function< Transcode > transcoder=[](std::string &s, const char *){ return true;})
Definition xml.l:447
~XMLParser() override
Definition xml.l:442
static int yyread(yyscan_t yyscanner, char *buf, int max_size)
Definition code.l:3971
struct xmlYY_state xmlYY_extra
Definition xml.l:433
yyscan_t yyscanner
Definition xml.l:432
static std::string trimSpaces(const std::string &str)
Definition xml.l:320
static const int num_entities
Definition xml.l:376
static const char entities_dec[]
Definition xml.l:375
static void checkAndUpdatePath(yyscan_t yyscanner)
Definition xml.l:263
static const char * entities_enc[]
Definition xml.l:374