Doxygen
Loading...
Searching...
No Matches
searchindex.cpp
Go to the documentation of this file.
1/******************************************************************************
2 *
3 * Copyright (C) 1997-2020 by Dimitri van Heesch.
4 *
5 * Permission to use, copy, modify, and distribute this software and its
6 * documentation under the terms of the GNU General Public License is hereby
7 * granted. No representations are made about the suitability of this software
8 * for any purpose. It is provided "as is" without express or implied warranty.
9 * See the GNU General Public License for more details.
10 *
11 * Documents produced by Doxygen are derivative works derived from the
12 * input used in their production; they are not affected by this license.
13 *
14 */
15
16
17#include <ctype.h>
18#include <assert.h>
19#include <mutex>
20#include <map>
21#include <unordered_map>
22
23#include "searchindex.h"
24
25#include "config.h"
26#include "util.h"
27#include "doxygen.h"
28#include "language.h"
29#include "pagedef.h"
30#include "growbuf.h"
31#include "message.h"
32#include "groupdef.h"
33#include "filedef.h"
34#include "portable.h"
35
36
// Layout of the generated search index file.
// All multi-byte values are stored in big endian format.
//
//   header     : the 4 characters 'D','O','X','S'
//   index      : 256*256 entries of 4 bytes each; an entry holds the file offset
//                of the word list for that index, or 0 if the list is empty
//   word lists : for each non-empty index entry a list of words, each stored as
//                a \0 terminated string followed by a 4 byte offset to the
//                statistics of the word; each list ends with a single \0 byte
//   padding    : zero bytes to align the statistics at a 4 byte boundary
//   statistics : for each word the number of urls (4 bytes), followed by
//                8 bytes for each url containing the word
//                (4 byte offset to the url entry + 4 byte frequency counter)
//   urls       : for each url two \0 terminated strings: the name and the url

//! Number of entries in the index table: one for each combination of two leading bytes of a word.
constexpr size_t numIndexEntries = 256*256;

//! Locked by the setCurrentDoc() and addWord() methods of both search indexers in this file.
static std::mutex g_searchIndexMutex;
51
52//--------------------------------------------------------------------
53
54void SearchIndex::IndexWord::addUrlIndex(int idx,bool hiPriority)
55{
56 //printf("IndexWord::addUrlIndex(%d,%d)\n",idx,hiPriority);
57 auto it = m_urls.find(idx);
58 if (it==m_urls.end())
59 {
60 //printf("URLInfo::URLInfo(%d)\n",idx);
61 it = m_urls.emplace(idx,URLInfo(idx,0)).first;
62 }
63 it->second.freq+=2;
64 if (hiPriority) it->second.freq|=1; // mark as high priority document
65}
66
67//--------------------------------------------------------------------
68
73
74void SearchIndex::setCurrentDoc(const Definition *ctx,const QCString &anchor,bool isSourceFile)
75{
76 if (ctx==nullptr) return;
77 std::lock_guard<std::mutex> lock(g_searchIndexMutex);
78 assert(!isSourceFile || ctx->definitionType()==Definition::TypeFile);
79 //printf("SearchIndex::setCurrentDoc(%s,%s,%s)\n",name,baseName,anchor);
80 QCString url=isSourceFile ? (toFileDef(ctx))->getSourceFileBase() : ctx->getOutputFileBase();
81 url+=Config_getString(HTML_FILE_EXTENSION);
82 QCString baseUrl = url;
83 if (!anchor.isEmpty()) url+=QCString("#")+anchor;
84 if (!isSourceFile) baseUrl=url;
85 QCString name=ctx->qualifiedName();
87 {
88 const MemberDef *md = toMemberDef(ctx);
90 theTranslator->trSubprogram(TRUE,TRUE) :
91 theTranslator->trMember(TRUE,TRUE))+" ");
92 }
93 else // compound type
94 {
95 SrcLangExt lang = ctx->getLanguage();
97 if (sep!="::")
98 {
99 name = substitute(name,"::",sep);
100 }
101 switch (ctx->definitionType())
102 {
104 {
105 const PageDef *pd = toPageDef(ctx);
106 if (pd->hasTitle())
107 {
108 name = theTranslator->trPage(TRUE,TRUE)+" "+pd->title();
109 }
110 else
111 {
112 name = theTranslator->trPage(TRUE,TRUE)+" "+pd->name();
113 }
114 }
115 break;
117 {
118 const ClassDef *cd = toClassDef(ctx);
119 name.prepend(cd->compoundTypeString()+" ");
120 }
121 break;
123 {
124 if (lang==SrcLangExt::Java || lang==SrcLangExt::CSharp)
125 {
126 name = theTranslator->trPackage(name);
127 }
128 else if (lang==SrcLangExt::Fortran)
129 {
130 name.prepend(theTranslator->trModule(TRUE,TRUE)+" ");
131 }
132 else
133 {
134 name.prepend(theTranslator->trNamespace(TRUE,TRUE)+" ");
135 }
136 }
137 break;
139 {
140 const GroupDef *gd = toGroupDef(ctx);
141 if (!gd->groupTitle().isEmpty())
142 {
143 name = theTranslator->trGroup(TRUE,TRUE)+" "+gd->groupTitle();
144 }
145 else
146 {
147 name.prepend(theTranslator->trGroup(TRUE,TRUE)+" ");
148 }
149 }
150 break;
152 {
153 name.prepend(theTranslator->trModule(TRUE,TRUE)+" ");
154 }
155 break;
156 default:
157 break;
158 }
159 }
160
161 auto it = m_url2IdMap.find(baseUrl.str());
162 if (it == m_url2IdMap.end()) // new entry
163 {
165 m_url2IdMap.emplace(baseUrl.str(),m_urlIndex);
166 m_urls.emplace(m_urlIndex,URL(name,url));
167 }
168 else // existing entry
169 {
170 m_urlIndex=it->second;
171 m_urls.emplace(it->second,URL(name,url));
172 }
173}
174
175static int charsToIndex(const QCString &word)
176{
177 if (word.length()<2) return -1;
178
179 // Fast string hashing algorithm
180 //register uint16_t h=0;
181 //const char *k = word;
182 //uint16_t mask=0xfc00;
183 //while ( *k )
184 //{
185 // h = (h&mask)^(h<<6)^(*k++);
186 //}
187 //return h;
188
189 // Simple hashing that allows for substring searching
190 uint32_t c1=static_cast<uint8_t>(word[0]);
191 uint32_t c2=static_cast<uint8_t>(word[1]);
192 return c1*256+c2;
193}
194
195void SearchIndex::addWordRec(const QCString &word,bool hiPriority,bool recurse)
196{
197 if (word.isEmpty()) return;
198 QCString wStr = QCString(word).lower();
199 //printf("SearchIndex::addWord(%s,%d) wStr=%s\n",word,hiPriority,qPrint(wStr));
200 int idx=charsToIndex(wStr);
201 if (idx<0 || idx>=static_cast<int>(m_index.size())) return;
202 auto it = m_words.find(wStr.str());
203 if (it==m_words.end())
204 {
205 //fprintf(stderr,"addWord(%s) at index %d\n",word,idx);
206 m_index[idx].emplace_back(wStr);
207 it = m_words.emplace( wStr.str(), static_cast<int>(m_index[idx].size())-1 ).first;
208 }
209 m_index[idx][it->second].addUrlIndex(m_urlIndex,hiPriority);
210 bool found=FALSE;
211 if (!recurse) // the first time we check if we can strip the prefix
212 {
213 int i=getPrefixIndex(word);
214 if (i>0)
215 {
216 addWordRec(word.data()+i,hiPriority,TRUE);
217 found=TRUE;
218 }
219 }
220 if (!found) // no prefix stripped
221 {
222 int i=0;
223 while (word[i]!=0 &&
224 !((word[i]=='_' || word[i]==':' || (word[i]>='a' && word[i]<='z')) && // [_a-z:]
225 (word[i+1]>='A' && word[i+1]<='Z'))) // [A-Z]
226 {
227 i++;
228 }
229 if (word[i]!=0 && i>=1)
230 {
231 addWordRec(word.data()+i+1,hiPriority,TRUE);
232 }
233 }
234}
235
236void SearchIndex::addWord(const QCString &word,bool hiPriority)
237{
238 std::lock_guard<std::mutex> lock(g_searchIndexMutex);
239 addWordRec(word,hiPriority,FALSE);
240}
241
/*! Writes the lower 32 bits of \a index to stream \a f as 4 bytes,
 *  most significant byte first (big endian).
 */
static void writeInt(std::ostream &f,size_t index)
{
  for (int shift=24; shift>=0; shift-=8)
  {
    f.put(static_cast<char>((index>>shift)&0xff));
  }
}
249
250static void writeString(std::ostream &f,const QCString &s)
251{
252 size_t l = s.length();
253 for (size_t i=0;i<l;i++) f.put(s[i]);
254 f.put(0);
255}
256
257void SearchIndex::write(const QCString &fileName)
258{
259 size_t size=4; // for the header
260 size+=4*numIndexEntries; // for the index
261 size_t wordsOffset = size;
262 // first pass: compute the size of the wordlist
263 for (size_t i=0;i<numIndexEntries;i++)
264 {
265 const auto &wlist = m_index[i];
266 if (!wlist.empty())
267 {
268 for (const auto &iw : wlist)
269 {
270 size_t ws = iw.word().length()+1;
271 size+=ws+4; // word + url info list offset
272 }
273 size+=1; // zero list terminator
274 }
275 }
276
277 // second pass: compute the offsets in the index
278 size_t indexOffsets[numIndexEntries];
279 size_t offset=wordsOffset;
280 for (size_t i=0;i<numIndexEntries;i++)
281 {
282 const auto &wlist = m_index[i];
283 if (!wlist.empty())
284 {
285 indexOffsets[i]=offset;
286 for (const auto &iw : wlist)
287 {
288 offset+= iw.word().length()+1;
289 offset+=4; // word + offset to url info array
290 }
291 offset+=1; // zero list terminator
292 }
293 else
294 {
295 indexOffsets[i]=0;
296 }
297 }
298 size_t padding = size;
299 size = (size+3)&~3; // round up to 4 byte boundary
300 padding = size - padding;
301
302 std::vector<size_t> wordStatOffsets(m_words.size());
303
304 int count=0;
305
306 // third pass: compute offset to stats info for each word
307 for (size_t i=0;i<numIndexEntries;i++)
308 {
309 const auto &wlist = m_index[i];
310 if (!wlist.empty())
311 {
312 for (const auto &iw : wlist)
313 {
314 //printf("wordStatOffsets[%d]=%d\n",count,size);
315 wordStatOffsets[count++] = size;
316 size+=4 + iw.urls().size() * 8; // count + (url_index,freq) per url
317 }
318 }
319 }
320 std::vector<size_t> urlOffsets(m_urls.size());
321 for (const auto &udi : m_urls)
322 {
323 urlOffsets[udi.first]=size;
324 size+=udi.second.name.length()+1+
325 udi.second.url.length()+1;
326 }
327
328 //printf("Total size %x bytes (word=%x stats=%x urls=%x)\n",size,wordsOffset,statsOffset,urlsOffset);
329 std::ofstream f = Portable::openOutputStream(fileName);
330 if (f.is_open())
331 {
332 // write header
333 f.put('D'); f.put('O'); f.put('X'); f.put('S');
334 // write index
335 for (size_t i=0;i<numIndexEntries;i++)
336 {
337 writeInt(f,indexOffsets[i]);
338 }
339 // write word lists
340 count=0;
341 for (size_t i=0;i<numIndexEntries;i++)
342 {
343 const auto &wlist = m_index[i];
344 if (!wlist.empty())
345 {
346 for (const auto &iw : wlist)
347 {
348 writeString(f,iw.word());
349 writeInt(f,wordStatOffsets[count++]);
350 }
351 f.put(0);
352 }
353 }
354 // write extra padding bytes
355 for (size_t i=0;i<padding;i++) f.put(0);
356 // write word statistics
357 for (size_t i=0;i<numIndexEntries;i++)
358 {
359 const auto &wlist = m_index[i];
360 if (!wlist.empty())
361 {
362 for (const auto &iw : wlist)
363 {
364 size_t numUrls = iw.urls().size();
365 writeInt(f,numUrls);
366 for (const auto &ui : iw.urls())
367 {
368 writeInt(f,urlOffsets[ui.second.urlIdx]);
369 writeInt(f,ui.second.freq);
370 }
371 }
372 }
373 }
374 // write urls
375 for (const auto &udi : m_urls)
376 {
377 writeString(f,udi.second.name);
378 writeString(f,udi.second.url);
379 }
380 }
381
382}
383
384//---------------------------------------------------------------------------
385// the following part is for writing an external search index
386
390
392{
393 if (ctx && ctx->definitionType()==Definition::TypeMember)
394 {
395 const MemberDef *md = toMemberDef(ctx);
396 if (md->isFunction())
397 return "function";
398 else if (md->isSlot())
399 return "slot";
400 else if (md->isSignal())
401 return "signal";
402 else if (md->isVariable())
403 return "variable";
404 else if (md->isTypedef())
405 return "typedef";
406 else if (md->isEnumerate())
407 return "enum";
408 else if (md->isEnumValue())
409 return "enumvalue";
410 else if (md->isProperty())
411 return "property";
412 else if (md->isEvent())
413 return "event";
414 else if (md->isRelated() || md->isForeign())
415 return "related";
416 else if (md->isFriend())
417 return "friend";
418 else if (md->isDefine())
419 return "define";
420 }
421 else if (ctx)
422 {
423 switch(ctx->definitionType())
424 {
426 return (toClassDef(ctx))->compoundTypeString();
428 return "file";
430 return "namespace";
432 return "concept";
434 return "group";
436 return "package";
438 return "page";
440 return "dir";
442 return "module";
443 default:
444 break;
445 }
446 }
447 return "unknown";
448}
449
450void SearchIndexExternal::setCurrentDoc(const Definition *ctx,const QCString &anchor,bool isSourceFile)
451{
452 std::lock_guard<std::mutex> lock(g_searchIndexMutex);
453 QCString extId = stripPath(Config_getString(EXTERNAL_SEARCH_ID));
454 QCString url = isSourceFile ? (toFileDef(ctx))->getSourceFileBase() : ctx->getOutputFileBase();
456 if (!anchor.isEmpty()) url+=QCString("#")+anchor;
457 QCString key = extId+";"+url;
458
459 auto it = m_docEntries.find(key.str());
460 if (it == m_docEntries.end())
461 {
463 e.type = isSourceFile ? QCString("source") : definitionToName(ctx);
464 e.name = ctx->qualifiedName();
466 {
467 e.args = (toMemberDef(ctx))->argsString();
468 }
469 else if (ctx->definitionType()==Definition::TypeGroup)
470 {
471 const GroupDef *gd = toGroupDef(ctx);
472 if (!gd->groupTitle().isEmpty())
473 {
474 e.name = filterTitle(gd->groupTitle());
475 }
476 }
477 else if (ctx->definitionType()==Definition::TypePage)
478 {
479 const PageDef *pd = toPageDef(ctx);
480 if (pd->hasTitle())
481 {
482 e.name = filterTitle(pd->title());
483 }
484 }
485 e.extId = extId;
486 e.url = url;
487 it = m_docEntries.emplace(key.str(),e).first;
488 //printf("searchIndexExt %s : %s\n",qPrint(e->name),qPrint(e->url));
489 }
490 m_current = &it->second;
491}
492
493void SearchIndexExternal::addWord(const QCString &word,bool hiPriority)
494{
495 std::lock_guard<std::mutex> lock(g_searchIndexMutex);
496 if (word.isEmpty() || !isId(word[0]) || m_current==nullptr) return;
497 GrowBuf *pText = hiPriority ? &m_current->importantText : &m_current->normalText;
498 if (pText->getPos()>0) pText->addChar(' ');
499 pText->addStr(word);
500 //printf("addWord %s\n",word);
501}
502
504{
505 std::ofstream t = Portable::openOutputStream(fileName);
506 if (t.is_open())
507 {
508 t << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
509 t << "<add>\n";
510 for (auto &[name,doc] : m_docEntries)
511 {
512 doc.normalText.addChar(0); // make sure buffer ends with a 0 terminator
513 doc.importantText.addChar(0); // make sure buffer ends with a 0 terminator
514 t << " <doc>\n";
515 t << " <field name=\"type\">" << doc.type << "</field>\n";
516 t << " <field name=\"name\">" << convertToXML(doc.name) << "</field>\n";
517 if (!doc.args.isEmpty())
518 {
519 t << " <field name=\"args\">" << convertToXML(doc.args) << "</field>\n";
520 }
521 if (!doc.extId.isEmpty())
522 {
523 t << " <field name=\"tag\">" << convertToXML(doc.extId) << "</field>\n";
524 }
525 t << " <field name=\"url\">" << convertToXML(doc.url) << "</field>\n";
526 t << " <field name=\"keywords\">" << convertToXML(doc.importantText.get()) << "</field>\n";
527 t << " <field name=\"text\">" << convertToXML(doc.normalText.get()) << "</field>\n";
528 t << " </doc>\n";
529 }
530 t << "</add>\n";
531 }
532 else
533 {
534 err("Failed to open file {} for writing!\n",fileName);
535 }
536}
537
538//---------------------------------------------------------------------------------------------
539
541{
542 bool searchEngine = Config_getBool(SEARCHENGINE);
543 bool serverBasedSearch = Config_getBool(SERVER_BASED_SEARCH);
544 bool externalSearch = Config_getBool(EXTERNAL_SEARCH);
545 if (searchEngine && serverBasedSearch)
546 {
548 }
549}
550
An abstract class representing a compound symbol.
Definition classdef.h:104
virtual QCString compoundTypeString() const =0
Returns the type of compound as a string.
The common base class of all entity definitions found in the sources.
Definition definition.h:76
virtual SrcLangExt getLanguage() const =0
Returns the programming language this definition was written in.
virtual DefType definitionType() const =0
virtual QCString qualifiedName() const =0
virtual QCString getOutputFileBase() const =0
virtual const QCString & name() const =0
static SearchIndexIntf searchIndex
Definition doxygen.h:124
A model of a group of symbols.
Definition groupdef.h:52
virtual QCString groupTitle() const =0
Class representing a string buffer optimized for growing.
Definition growbuf.h:28
size_t getPos() const
Definition growbuf.h:116
void addChar(char c)
Definition growbuf.h:69
void addStr(const QCString &s)
Definition growbuf.h:72
A model of a class/file/namespace member symbol.
Definition memberdef.h:48
virtual bool isSignal() const =0
virtual bool isFriend() const =0
virtual bool isForeign() const =0
virtual bool isRelated() const =0
virtual bool isTypedef() const =0
virtual bool isSlot() const =0
virtual bool isEvent() const =0
virtual bool isFunction() const =0
virtual bool isDefine() const =0
virtual bool isEnumerate() const =0
virtual bool isVariable() const =0
virtual bool isEnumValue() const =0
virtual bool isProperty() const =0
A model of a page symbol.
Definition pagedef.h:26
virtual bool hasTitle() const =0
virtual QCString title() const =0
This is an alternative implementation of QCString.
Definition qcstring.h:101
QCString & prepend(const char *s)
Definition qcstring.h:407
size_t length() const
Returns the length of the string, not counting the 0-terminator.
Definition qcstring.h:153
QCString lower() const
Definition qcstring.h:234
bool isEmpty() const
Returns TRUE iff the string is empty.
Definition qcstring.h:150
const std::string & str() const
Definition qcstring.h:537
void addUrlIndex(int, bool)
void addWord(const QCString &word, bool hiPriority)
SearchDocEntry * m_current
void write(const QCString &file)
std::map< std::string, SearchDocEntry > m_docEntries
void setCurrentDoc(const Definition *ctx, const QCString &anchor, bool isSourceFile)
std::map< int, URL > m_urls
void addWordRec(const QCString &word, bool hiPrio, bool recurse)
void addWord(const QCString &word, bool hiPriority)
std::vector< std::vector< IndexWord > > m_index
std::unordered_map< std::string, int > m_words
std::unordered_map< std::string, int > m_url2IdMap
void write(const QCString &file)
void setCurrentDoc(const Definition *ctx, const QCString &anchor, bool isSourceFile)
ClassDef * toClassDef(Definition *d)
#define Config_getBool(name)
Definition config.h:33
#define Config_getString(name)
Definition config.h:32
FileDef * toFileDef(Definition *d)
Definition filedef.cpp:1894
GroupDef * toGroupDef(Definition *d)
Translator * theTranslator
Definition language.cpp:71
MemberDef * toMemberDef(Definition *d)
#define err(fmt,...)
Definition message.h:127
std::ofstream openOutputStream(const QCString &name, bool append=false)
Definition portable.cpp:665
PageDef * toPageDef(Definition *d)
Definition pagedef.cpp:467
Portable versions of functions that are platform dependent.
QCString substitute(const QCString &s, const QCString &src, const QCString &dst)
substitute all occurrences of src in s by dst
Definition qcstring.cpp:477
#define TRUE
Definition qcstring.h:37
#define FALSE
Definition qcstring.h:34
static void writeString(std::ostream &f, const QCString &s)
static std::mutex g_searchIndexMutex
static void writeInt(std::ostream &f, size_t index)
static int charsToIndex(const QCString &word)
const size_t numIndexEntries
static QCString definitionToName(const Definition *ctx)
Web server based search engine.
void initSearchIndexer()
void finalizeSearchIndexer()
QCString type
QCString extId
QCString name
QCString args
QCString url
SrcLangExt
Language as given by extension.
Definition types.h:42
@ CSharp
Definition types.h:46
@ Fortran
Definition types.h:53
QCString stripPath(const QCString &s)
Definition util.cpp:5388
std::string_view word
Definition util.cpp:980
bool found
Definition util.cpp:984
QCString filterTitle(const QCString &title)
Definition util.cpp:6017
QCString convertToXML(const QCString &s, bool keepEntities)
Definition util.cpp:4352
QCString getLanguageSpecificSeparator(SrcLangExt lang, bool classScope)
Returns the scope separator to use given the programming language lang.
Definition util.cpp:6326
int getPrefixIndex(const QCString &name)
Definition util.cpp:3676
void addHtmlExtensionIfMissing(QCString &fName)
Definition util.cpp:5339
A bunch of utility functions.
bool isId(int c)
Definition util.h:206