#include <cstdint>
#include <sstream>
#include "utf8.h"
#include "caseconvert.h"
#include "textstream.h"

Include dependency graph for utf8.cpp:

Functions
uint8_t	getUTF8CharNumBytes (char c)
	Returns the number of bytes making up a single UTF8 character given the first byte in the sequence.
static uint32_t	decode_utf8 (const char *data, int numBytes) noexcept
static uint32_t	convertUTF8CharToUnicode (const char *s, size_t bytesLeft, int &len)
std::string	getUTF8CharAt (const std::string &input, size_t pos)
	Returns the UTF8 character found at byte position pos in the input string.
uint32_t	getUnicodeForUTF8CharAt (const std::string &input, size_t pos)
	Returns the 32bit Unicode value matching character at byte position pos in the UTF8 encoded input.
static char	asciiToLower (uint32_t code)
static char	asciiToUpper (uint32_t code)
static std::string	caseConvert (const std::string &input, char(*asciiConversionFunc)(uint32_t code), const char *(*conversionFunc)(uint32_t code))
std::string	convertUTF8ToLower (const std::string &input)
	Converts the input string into a lower case version, also taking into account non-ASCII characters that have a lower case variant.
std::string	convertUTF8ToUpper (const std::string &input)
	Converts the input string into an upper case version, also taking into account non-ASCII characters that have an upper case variant.
const char *	writeUTF8Char (TextStream &t, const char *s)
	Writes the UTF8 character pointed to by s to stream t and returns a pointer to the next character.
bool	lastUTF8CharIsMultibyte (const std::string &input)
	Returns true iff the last character in input is a multibyte character.
bool	isUTF8CharUpperCase (const std::string &input, size_t pos)
	Returns true iff the input string at byte position pos holds an upper case character.
int	isUTF8NonBreakableSpace (const char *input)
	Check if the first character pointed at by input is a non-breakable whitespace character.
bool	isUTF8PunctuationCharacter (uint32_t unicode)
	Check if the given Unicode character represents a punctuation character.

Function Documentation

◆ asciiToLower()

char asciiToLower ( uint32_t code )

inline static

Definition at line 142 of file utf8.cpp.

{
  // Values outside 'A'..'Z' have no ASCII lower case form and are only narrowed to char.
  if (code<'A' || code>'Z')
  {
    return static_cast<char>(code);
  }
  // Shift the capital letter by the fixed distance between the two ASCII alphabets.
  return static_cast<char>(code+'a'-'A');
}

◆ asciiToUpper()

char asciiToUpper ( uint32_t code )

inline static

Definition at line 147 of file utf8.cpp.

{
  // Values outside 'a'..'z' have no ASCII upper case form and are only narrowed to char.
  if (code<'a' || code>'z')
  {
    return static_cast<char>(code);
  }
  // Shift the small letter by the fixed distance between the two ASCII alphabets.
  return static_cast<char>(code+'A'-'a');
}

Referenced by convertUTF8ToUpper().

◆ caseConvert()

std::string caseConvert	(	const std::string &	input,
		char(*	asciiConversionFunc )(uint32_t code),
		const char *(*	conversionFunc )(uint32_t code) )

inline static

Definition at line 152 of file utf8.cpp.

{
  // Walks the input one UTF8 character at a time and builds the converted copy.
  // ASCII code points go through asciiConversionFunc; all others through
  // conversionFunc, which yields a replacement sequence or nullptr when the
  // character has no other-case variant.
  std::string converted;
  converted.reserve(input.length()); // sized for the all-ASCII case
  const char *cursor = input.c_str();
  size_t remaining = input.length();
  int charLen = 0;
  for (;;)
  {
    const uint32_t unicode = convertUTF8CharToUnicode(cursor,remaining,charLen);
    if (unicode==0) break; // nothing left to decode, or a zero code point was read
    if (unicode<128) // single byte ASCII character
    {
      converted+=asciiConversionFunc(unicode);
    }
    else // multi byte character
    {
      const char *replacement = conversionFunc(unicode);
      if (replacement!=nullptr)
      {
        converted.append(replacement); // use the converted sequence
      }
      else
      {
        converted.append(cursor,charLen); // no case variant: copy the bytes unchanged
      }
    }
    cursor+=charLen;
    remaining-=charLen;
  }
  return converted;
}

References convertUTF8CharToUnicode().

Referenced by convertUTF8ToLower(), and convertUTF8ToUpper().

◆ convertUTF8CharToUnicode()

uint32_t convertUTF8CharToUnicode	(	const char *	s,
		size_t	bytesLeft,
		int &	len )

inline static

Definition at line 69 of file utf8.cpp.

{
  // Decodes the UTF8 character starting at s, looking at no more than bytesLeft
  // bytes. len receives the number of bytes consumed (0 when there is no input).
  if (s==nullptr || bytesLeft==0)
  {
    len=0;
    return 0;
  }
  const unsigned char lead = static_cast<unsigned char>(*s);
  len=1;
  if (lead<128) // ASCII case
  {
    return lead;
  }
  // A multi byte form is only accepted when enough bytes remain for it; the
  // candidates are tried from the longest form down to the shortest.
  if      (bytesLeft>=6 && (lead&0xFEu)==0xFCu) len=6; // 1111110X six bytes
  else if (bytesLeft>=5 && (lead&0xFCu)==0xF8u) len=5; // 111110XX five bytes
  else if (bytesLeft>=4 && (lead&0xF8u)==0xF0u) len=4; // 11110XXX four bytes
  else if (bytesLeft>=3 && (lead&0xF0u)==0xE0u) len=3; // 1110XXXX three bytes
  else if (bytesLeft>=2 && (lead&0xE0u)==0xC0u) len=2; // 110XXXXX two bytes
  else
  {
    return lead; // no usable multi byte form: hand back the raw byte, len stays 1
  }
  return decode_utf8(s,len);
}

References decode_utf8().

Referenced by caseConvert(), getUnicodeForUTF8CharAt(), and isUTF8CharUpperCase().

◆ convertUTF8ToLower()

std::string convertUTF8ToLower ( const std::string & input )

Converts the input string into a lower case version, also taking into account non-ASCII characters that have a lower case variant.

Definition at line 187 of file utf8.cpp.

{
  // Delegates to caseConvert: asciiToLower is applied to code points below 128,
  // convertUnicodeToLower to all other code points.
  return caseConvert(input,asciiToLower,convertUnicodeToLower);
}

References asciiToLower(), caseConvert(), and convertUnicodeToLower().

Referenced by SearchIndexInfo::add(), Index::addClassMemberNameToIndex(), Index::addFileMemberNameToIndex(), Index::addModuleMemberNameToIndex(), Index::addNamespaceMemberNameToIndex(), AnchorGenerator::generate(), QCString::lower(), FileNameFn::searchKey(), SearchTerm::termEncoded(), and HtmlGenerator::writeLabel().

◆ convertUTF8ToUpper()

std::string convertUTF8ToUpper ( const std::string & input )

Converts the input string into an upper case version, also taking into account non-ASCII characters that have an upper case variant.

Definition at line 192 of file utf8.cpp.

{
  // Delegates to caseConvert: asciiToUpper is applied to code points below 128,
  // convertUnicodeToUpper to all other code points.
  return caseConvert(input,asciiToUpper,convertUnicodeToUpper);
}

References asciiToUpper(), caseConvert(), and convertUnicodeToUpper().

Referenced by Translator::createNoun(), QCString::upper(), and writeAlphabeticalClassList().

◆ decode_utf8()

uint32_t decode_utf8	(	const char *	data,
		int	numBytes )

inline static noexcept

Decodes the given UTF8 data to a Unicode code point, given the number of bytes the character is made of.

Definition at line 55 of file utf8.cpp.

{
  const uint32_t lead = static_cast<unsigned char>(*data);
  if (numBytes<=1)
  {
    return lead; // a single byte is returned as is
  }
  // Drop the length marker bits of the first byte, keeping only its payload.
  uint32_t codePoint = lead & (0x7F >> numBytes);
  // Each following byte contributes its low six bits.
  for (int idx=1; idx<numBytes; ++idx)
  {
    const uint32_t payload = static_cast<unsigned char>(data[idx]) & 0x3F;
    codePoint = (codePoint<<6) | payload;
  }
  return codePoint;
}

Referenced by convertUTF8CharToUnicode().

◆ getUnicodeForUTF8CharAt()

uint32_t getUnicodeForUTF8CharAt	(	const std::string &	input,
		size_t	pos )

Returns the 32bit Unicode value matching character at byte position pos in the UTF8 encoded input.

Definition at line 135 of file utf8.cpp.

{
  // Isolate the character first; an empty result decodes to 0 below.
  const std::string utf8Char = getUTF8CharAt(input,pos);
  int consumed=0; // filled in by the decoder, not needed by this function
  return convertUTF8CharToUnicode(utf8Char.c_str(),utf8Char.length(),consumed);
}

References convertUTF8CharToUnicode(), and getUTF8CharAt().

Referenced by AnchorGenerator::generate().

◆ getUTF8CharAt()

std::string getUTF8CharAt	(	const std::string &	input,
		size_t	pos )

Returns the UTF8 character found at byte position pos in the input string.

The resulting string can be a multi byte sequence.

Definition at line 127 of file utf8.cpp.

{
  const size_t total = input.length();
  if (pos>=total)
  {
    return std::string(); // position is past the end of the input
  }
  // The first byte announces how many bytes the character occupies.
  const int charBytes = getUTF8CharNumBytes(input[pos]);
  if (pos+charBytes>total)
  {
    return std::string(); // the announced sequence does not fit in the input
  }
  return input.substr(pos,charBytes);
}

References getUTF8CharNumBytes().

Referenced by SearchIndexInfo::add(), Index::addClassMemberNameToIndex(), Index::addFileMemberNameToIndex(), Index::addModuleMemberNameToIndex(), Index::addNamespaceMemberNameToIndex(), Translator::createNoun(), AnchorGenerator::generate(), getUnicodeForUTF8CharAt(), and writeAlphabeticalClassList().

◆ getUTF8CharNumBytes()

uint8_t getUTF8CharNumBytes ( char c )

Returns the number of bytes making up a single UTF8 character given the first byte in the sequence.

Definition at line 23 of file utf8.cpp.

{
  const unsigned char lead = static_cast<unsigned char>(c);
  if (lead<0x80u)          return 1; // 0xxx.xxxx: single byte character
  if ((lead&0xE0u)==0xC0u) return 2; // 110x.xxxx: 2 byte character
  if ((lead&0xF0u)==0xE0u) return 3; // 1110.xxxx: 3 byte character
  if ((lead&0xF8u)==0xF0u) return 4; // 1111.0xxx: 4 byte character
  if ((lead&0xFCu)==0xF8u) return 5; // 1111.10xx: 5 byte character
  if ((lead&0xFEu)==0xFCu) return 6; // 1111.110x: 6 byte character
  return 1; // no lead byte pattern matched: counted as one byte
}

Referenced by detab(), escapeCharsInString(), AnchorGenerator::generate(), getUTF8CharAt(), nextUTF8CharPosition(), updateColumnCount(), and writeUTF8Char().

◆ isUTF8CharUpperCase()

bool isUTF8CharUpperCase	(	const std::string &	input,
		size_t	pos )

Returns true iff the input string at byte position pos holds an upper case character.

Definition at line 218 of file utf8.cpp.

{
  const size_t total = input.length();
  if (pos>=total)
  {
    return false; // nothing to inspect at this position
  }
  // decode the UTF8 character starting at pos into its unicode value
  int consumed=0;
  const uint32_t unicode = convertUTF8CharToUnicode(input.c_str()+pos,total-pos,consumed);
  // a character counts as upper case when a lower case conversion exists for it
  const bool hasLowerVariant = convertUnicodeToLower(unicode)!=nullptr;
  return hasLowerVariant;
}

References convertUnicodeToLower(), and convertUTF8CharToUnicode().

Referenced by DefinitionImpl::_setBriefDescription().

◆ isUTF8NonBreakableSpace()

int isUTF8NonBreakableSpace ( const char * input )

Check if the first character pointed at by input is a non-breakable whitespace character.

Returns the byte size of the character if there is a match or 0 if not.

Definition at line 228 of file utf8.cpp.

{
  // The character is matched as the two byte sequence 0xC2 0xA0; the second
  // byte is only examined once the first one matched.
  if (static_cast<unsigned char>(input[0])!=0xC2) return 0;
  if (static_cast<unsigned char>(input[1])!=0xA0) return 0;
  return 2; // size in bytes of the matched character
}

Referenced by detab().

◆ isUTF8PunctuationCharacter()

bool isUTF8PunctuationCharacter ( uint32_t unicode )

Check if the given Unicode character represents a punctuation character.

Definition at line 234 of file utf8.cpp.

{
  // Thin forwarder to isPunctuationCharacter(), which does the classification.
  return isPunctuationCharacter(unicode);
}

References isPunctuationCharacter().

Referenced by AnchorGenerator::generate().

◆ lastUTF8CharIsMultibyte()

bool lastUTF8CharIsMultibyte ( const std::string & input )

Returns true iff the last character in input is a multibyte character.

Definition at line 212 of file utf8.cpp.

{
  if (input.empty())
  {
    return false;
  }
  // The final byte belongs to a multibyte UTF8 character when its two highest
  // bits are 1 and 0 (pattern 10xx.xxxx).
  const unsigned char lastByte = static_cast<unsigned char>(input.back());
  return (lastByte&0xC0)==0x80;
}

Referenced by DefinitionImpl::_setBriefDescription().

◆ writeUTF8Char()

const char * writeUTF8Char	(	TextStream &	t,
		const char *	s )

Writes the UTF8 character pointed to by s to stream t and returns a pointer to the next character.

Definition at line 197 of file utf8.cpp.

{
  if (s==nullptr) return nullptr;
  // Number of bytes the first byte announces for this character.
  const uint8_t expected = getUTF8CharNumBytes(*s);
  // Count the bytes actually present, stopping early at a zero terminator
  // (premature end of string due to an invalid UTF8 character).
  uint8_t avail = 0;
  while (avail<expected && s[avail]!=0)
  {
    ++avail;
  }
  t.write(s,avail);
  return s+avail;
}

References getUTF8CharNumBytes(), and TextStream::write().

Referenced by HtmlCodeGenerator::codify(), ManCodeGenerator::codify(), RTFCodeGenerator::codify(), HtmlDocVisitor::operator()(), HtmlDocVisitor::writeObfuscatedMailAddress(), and writeXMLCodeString().

Functions

Function Documentation

◆ asciiToLower()

◆ asciiToUpper()

◆ caseConvert()

◆ convertUTF8CharToUnicode()

◆ convertUTF8ToLower()

◆ convertUTF8ToUpper()

◆ decode_utf8()

◆ getUnicodeForUTF8CharAt()

◆ getUTF8CharAt()

◆ getUTF8CharNumBytes()

◆ isUTF8CharUpperCase()

◆ isUTF8NonBreakableSpace()

◆ isUTF8PunctuationCharacter()

◆ lastUTF8CharIsMultibyte()

◆ writeUTF8Char()