26 unsigned char uc =
static_cast<unsigned char>(c);
29 if ((uc&0xE0u)==0xC0u)
33 if ((uc&0xF0u)==0xE0u)
37 if ((uc&0xF8u)==0xF0u)
41 if ((uc&0xFCu)==0xF8u)
45 if ((uc&0xFEu)==0xFCu)
55static inline uint32_t
decode_utf8(
const char* data ,
int numBytes )
noexcept
57 uint32_t cp =
static_cast<unsigned char>(*data);
60 cp &= 0x7F >> numBytes;
61 for (
int i=1 ; i<numBytes ; i++)
63 cp = (cp<<6) | (static_cast<unsigned char>(data[i])&0x3F);
71 if (s==
nullptr || bytesLeft==0)
76 unsigned char uc =
static_cast<unsigned char>(*s);
85 if ((uc&0xFEu)==0xFCu)
92 if ((uc&0xFCu)==0xF8u)
99 if ((uc&0xF8u)==0xF0u)
106 if ((uc&0xF0u)==0xE0u)
113 if ((uc&0xE0u)==0xC0u)
129 if (input.length()<=pos)
return std::string();
131 if (input.length()<pos+numBytes)
return std::string();
132 return input.substr(pos,numBytes);
144 return code>=
'A' && code<='Z' ? static_cast<char>(code+
'a'-
'A') :
static_cast<char>(code);
149 return code>=
'a' && code<='z' ? static_cast<char>(code+
'A'-
'a') :
static_cast<char>(code);
153 char (*asciiConversionFunc)(uint32_t code),
154 const char *(*conversionFunc)(uint32_t code))
158 result.reserve(input.length());
160 size_t bytesLeft = input.length();
161 const char *p = input.c_str();
166 char c = asciiConversionFunc(code);
171 const char *conv = conversionFunc(code);
174 result.append(p,len);
199 if (s==
nullptr)
return nullptr;
201 for (uint8_t i=0;i<len;i++)
215 return !input.empty() && (
static_cast<unsigned char>(input[input.length()-1])&0xC0)==0x80;
220 if (input.length()<=pos)
return false;
230 return (
static_cast<unsigned char>(input[0])==0xC2 &&
231 static_cast<unsigned char>(input[1])==0xA0) ? 2 : 0;
bool isPunctuationCharacter(uint32_t code)
const char * convertUnicodeToUpper(uint32_t code)
const char * convertUnicodeToLower(uint32_t code)
Text streaming class that buffers data.
void write(const char *buf, size_t len)
Adds an array of characters to the stream.
static char asciiToLower(char in)
std::string convertUTF8ToUpper(const std::string &input)
Converts the input string into an upper case version, also taking into account non-ASCII characters th...
static char asciiToLower(uint32_t code)
bool isUTF8CharUpperCase(const std::string &input, size_t pos)
Returns true iff the input string at byte position pos holds an upper case character.
bool isUTF8PunctuationCharacter(uint32_t unicode)
Check if the given Unicode character represents a punctuation character.
static uint32_t decode_utf8(const char *data, int numBytes) noexcept
static uint32_t convertUTF8CharToUnicode(const char *s, size_t bytesLeft, int &len)
static char asciiToUpper(uint32_t code)
static std::string caseConvert(const std::string &input, char(*asciiConversionFunc)(uint32_t code), const char *(*conversionFunc)(uint32_t code))
uint32_t getUnicodeForUTF8CharAt(const std::string &input, size_t pos)
Returns the 32-bit Unicode value matching the character at byte position pos in the UTF8 encoded input.
bool lastUTF8CharIsMultibyte(const std::string &input)
Returns true iff the last character in input is a multibyte character.
int isUTF8NonBreakableSpace(const char *input)
Check if the first character pointed at by input is a non-breakable whitespace character.
std::string convertUTF8ToLower(const std::string &input)
Converts the input string into a lower case version, also taking into account non-ASCII characters th...
uint8_t getUTF8CharNumBytes(char c)
Returns the number of bytes making up a single UTF8 character given the first byte in the sequence.
std::string getUTF8CharAt(const std::string &input, size_t pos)
Returns the UTF8 character found at byte position pos in the input string.
const char * writeUTF8Char(TextStream &t, const char *s)
Writes the UTF8 character pointed to by s to stream t and returns a pointer to the next character.
Various UTF8 related helper functions.