Doxygen
Loading...
Searching...
No Matches
utf8.h
Go to the documentation of this file.
1/******************************************************************************
2 *
3 * Copyright (C) 1997-2021 by Dimitri van Heesch.
4 *
5 * Permission to use, copy, modify, and distribute this software and its
6 * documentation under the terms of the GNU General Public License is hereby
7 * granted. No representations are made about the suitability of this software
8 * for any purpose. It is provided "as is" without express or implied warranty.
9 * See the GNU General Public License for more details.
10 *
11 * Documents produced by Doxygen are derivative works derived from the
12 * input used in their production; they are not affected by this license.
13 *
14 */
15
16#ifndef UTF8_H
17#define UTF8_H
18
19#include <cstdint>
20#include <string>
21
22class TextStream;
23
24/** @file
25 * @brief Various UTF8 related helper functions.
26 *
27 * See https://en.wikipedia.org/wiki/UTF-8 for details on UTF8 encoding.
28 */
29
30
31/** Converts the input string into a lower case version, also taking into account
32 * non-ASCII characters that have a lower case variant.
33 */
34std::string convertUTF8ToLower(const std::string &input);
35
36/** Converts the input string into an upper case version, also taking into account
37 * non-ASCII characters that have an upper case variant.
38 */
39std::string convertUTF8ToUpper(const std::string &input);
40
41/** Returns the UTF8 character found at byte position pos in the input string.
42 * The resulting string can be a multi byte sequence.
43 */
44std::string getUTF8CharAt(const std::string &input,size_t pos);
45
46/** Returns the 32bit Unicode value matching the character at byte position pos in
47 * the UTF8 encoded input.
48 */
49uint32_t getUnicodeForUTF8CharAt(const std::string &input,size_t pos);
50
51/** Returns the number of bytes making up a single UTF8 character given the first byte
52 * in the sequence.
53 */
54uint8_t getUTF8CharNumBytes(char firstByte);
55
56/** Writes the UTF8 character pointed to by s to stream t and returns a pointer
57 * to the next character.
58 */
59const char *writeUTF8Char(TextStream &t,const char *s);
60
61/** Returns true iff the last character in input is a multibyte character. */
62bool lastUTF8CharIsMultibyte(const std::string &input);
63
64/** Returns true iff the input string at byte position pos holds an upper case character. */
65bool isUTF8CharUpperCase(const std::string &input,size_t pos);
66
67/** Check if the first character pointed at by input is a non-breakable whitespace character.
68 * Returns the byte size of the character if there is a match, or 0 if not.
69 */
70int isUTF8NonBreakableSpace(const char *input);
71
72/** Check if the given Unicode character represents a punctuation character */
73bool isUTF8PunctuationCharacter(uint32_t unicode);
74
75#endif
Text streaming class that buffers data.
Definition textstream.h:36
std::string convertUTF8ToUpper(const std::string &input)
Converts the input string into an upper case version, also taking into account non-ASCII characters th...
Definition utf8.cpp:192
bool isUTF8CharUpperCase(const std::string &input, size_t pos)
Returns true iff the input string at byte position pos holds an upper case character.
Definition utf8.cpp:218
bool isUTF8PunctuationCharacter(uint32_t unicode)
Check if the given Unicode character represents a punctuation character.
Definition utf8.cpp:234
uint32_t getUnicodeForUTF8CharAt(const std::string &input, size_t pos)
Returns the 32bit Unicode value matching character at byte position pos in the UTF8 encoded input.
Definition utf8.cpp:135
bool lastUTF8CharIsMultibyte(const std::string &input)
Returns true iff the last character in input is a multibyte character.
Definition utf8.cpp:212
int isUTF8NonBreakableSpace(const char *input)
Check if the first character pointed at by input is a non-breakable whitespace character.
Definition utf8.cpp:228
std::string convertUTF8ToLower(const std::string &input)
Converts the input string into a lower case version, also taking into account non-ASCII characters th...
Definition utf8.cpp:187
std::string getUTF8CharAt(const std::string &input, size_t pos)
Returns the UTF8 character found at byte position pos in the input string.
Definition utf8.cpp:127
uint8_t getUTF8CharNumBytes(char firstByte)
Returns the number of bytes making up a single UTF8 character given the first byte in the sequence.
Definition utf8.cpp:23
const char * writeUTF8Char(TextStream &t, const char *s)
Writes the UTF8 character pointed to by s to stream t and returns a pointer to the next character.
Definition utf8.cpp:197