![]() |
GDevelop Core
Core library for developing platforms and tools compatible with GDevelop.
|
#include <stdlib.h>
#include <sys/types.h>
#include <stdbool.h>
#include <inttypes.h>
#include <limits.h>
Go to the source code of this file.
Classes | |
struct | utf8proc_property_struct |
Macros | |
#define | UTF8PROC_DLLEXPORT GD_CORE_API |
#define | SSIZE_MAX ((size_t)SIZE_MAX/2) |
API version | |
The utf8proc API version MAJOR.MINOR.PATCH, following semantic-versioning rules (http://semver.org) based on API compatibility. This is also returned at runtime by utf8proc_version; however, the runtime version may append a string like "-dev" to the version number for prerelease versions.
| |
#define | UTF8PROC_VERSION_MAJOR 1 |
#define | UTF8PROC_VERSION_MINOR 3 |
#define | UTF8PROC_VERSION_PATCH 0 |
Error codes | |
Error codes being returned by almost all functions. | |
#define | UTF8PROC_ERROR_NOMEM -1 |
#define | UTF8PROC_ERROR_OVERFLOW -2 |
#define | UTF8PROC_ERROR_INVALIDUTF8 -3 |
#define | UTF8PROC_ERROR_NOTASSIGNED -4 |
#define | UTF8PROC_ERROR_INVALIDOPTS -5 |
Typedefs | |
typedef int8_t | utf8proc_int8_t |
typedef uint8_t | utf8proc_uint8_t |
typedef int16_t | utf8proc_int16_t |
typedef uint16_t | utf8proc_uint16_t |
typedef int32_t | utf8proc_int32_t |
typedef uint32_t | utf8proc_uint32_t |
typedef size_t | utf8proc_size_t |
typedef ssize_t | utf8proc_ssize_t |
typedef bool | utf8proc_bool |
typedef utf8proc_int16_t | utf8proc_propval_t |
typedef struct utf8proc_property_struct | utf8proc_property_t |
Functions | |
UTF8PROC_DLLEXPORT const char * | utf8proc_version (void) |
UTF8PROC_DLLEXPORT const char * | utf8proc_errmsg (utf8proc_ssize_t errcode) |
UTF8PROC_DLLEXPORT utf8proc_ssize_t | utf8proc_iterate (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref) |
UTF8PROC_DLLEXPORT utf8proc_bool | utf8proc_codepoint_valid (utf8proc_int32_t codepoint) |
UTF8PROC_DLLEXPORT utf8proc_ssize_t | utf8proc_encode_char (utf8proc_int32_t codepoint, utf8proc_uint8_t *dst) |
UTF8PROC_DLLEXPORT const utf8proc_property_t * | utf8proc_get_property (utf8proc_int32_t codepoint) |
UTF8PROC_DLLEXPORT utf8proc_ssize_t | utf8proc_decompose_char (utf8proc_int32_t codepoint, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) |
UTF8PROC_DLLEXPORT utf8proc_ssize_t | utf8proc_decompose (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options) |
UTF8PROC_DLLEXPORT utf8proc_ssize_t | utf8proc_reencode (utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) |
UTF8PROC_DLLEXPORT utf8proc_bool | utf8proc_grapheme_break (utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2) |
UTF8PROC_DLLEXPORT utf8proc_int32_t | utf8proc_tolower (utf8proc_int32_t c) |
UTF8PROC_DLLEXPORT utf8proc_int32_t | utf8proc_toupper (utf8proc_int32_t c) |
UTF8PROC_DLLEXPORT int | utf8proc_charwidth (utf8proc_int32_t codepoint) |
UTF8PROC_DLLEXPORT utf8proc_category_t | utf8proc_category (utf8proc_int32_t codepoint) |
UTF8PROC_DLLEXPORT const char * | utf8proc_category_string (utf8proc_int32_t codepoint) |
UTF8PROC_DLLEXPORT utf8proc_ssize_t | utf8proc_map (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options) |
Unicode normalization | |
Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC normalized version of the null-terminated string | |
UTF8PROC_DLLEXPORT utf8proc_uint8_t * | utf8proc_NFD (const utf8proc_uint8_t *str) |
UTF8PROC_DLLEXPORT utf8proc_uint8_t * | utf8proc_NFC (const utf8proc_uint8_t *str) |
UTF8PROC_DLLEXPORT utf8proc_uint8_t * | utf8proc_NFKD (const utf8proc_uint8_t *str) |
UTF8PROC_DLLEXPORT utf8proc_uint8_t * | utf8proc_NFKC (const utf8proc_uint8_t *str) |
Variables | |
UTF8PROC_DLLEXPORT const utf8proc_int8_t | utf8proc_utf8class [256] |
#define UTF8PROC_ERROR_INVALIDOPTS -5 |
Invalid options have been used.
#define UTF8PROC_ERROR_INVALIDUTF8 -3 |
The given string is not a legal UTF-8 string.
#define UTF8PROC_ERROR_NOMEM -1 |
Memory could not be allocated.
#define UTF8PROC_ERROR_NOTASSIGNED -4 |
The UTF8PROC_REJECTNA flag was set and an unassigned codepoint was found.
#define UTF8PROC_ERROR_OVERFLOW -2 |
The given string is too long to be processed.
#define UTF8PROC_VERSION_MAJOR 1 |
The MAJOR version number (increased when backwards API compatibility is broken).
#define UTF8PROC_VERSION_MINOR 3 |
The MINOR version number (increased when new functionality is added in a backwards-compatible manner).
#define UTF8PROC_VERSION_PATCH 0 |
The PATCH version (increased for fixes that do not change the API).
typedef struct utf8proc_property_struct utf8proc_property_t |
Struct containing information about a codepoint.
typedef utf8proc_int16_t utf8proc_propval_t |
Holds the value of a property.
Bidirectional character classes.
Boundclass property.
enum utf8proc_category_t |
Unicode categories.
Decomposition type.
enum utf8proc_option_t |
Option flags used by several functions in the library.
Enumerator | |
---|---|
UTF8PROC_NULLTERM | The given UTF-8 input is NULL terminated. |
UTF8PROC_STABLE | Unicode Versioning Stability has to be respected. |
UTF8PROC_COMPAT | Compatibility decomposition (i.e. formatting information is lost). |
UTF8PROC_COMPOSE | Return a result with composed characters. |
UTF8PROC_DECOMPOSE | Return a result with decomposed characters. |
UTF8PROC_IGNORE | Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE. |
UTF8PROC_REJECTNA | Return an error, if the input contains unassigned codepoints. |
UTF8PROC_NLF2LS | Indicating that NLF-sequences (LF, CRLF, CR, NEL) are representing a line break, and should be converted to the codepoint for line separation (LS). |
UTF8PROC_NLF2PS | Indicating that NLF-sequences are representing a paragraph break, and should be converted to the codepoint for paragraph separation (PS). |
UTF8PROC_NLF2LF | Indicating that the meaning of NLF-sequences is unknown. |
UTF8PROC_STRIPCC | Strips and/or converts control characters. NLF-sequences are transformed into space, except if one of the NLF2LS/PS/LF options is given. HorizontalTab (HT) and FormFeed (FF) are treated as a NLF-sequence in this case. All other control characters are simply removed. |
UTF8PROC_CASEFOLD | Performs unicode case folding, to be able to do a case-insensitive string comparison. |
UTF8PROC_CHARBOUND | Inserts 0xFF bytes at the beginning of each sequence which is representing a single grapheme cluster (see UAX#29). |
UTF8PROC_LUMP | Lumps certain characters together. E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-". See lump.md for details. If NLF2LF is set, this includes a transformation of paragraph and line separators to ASCII line-feed (LF). |
UTF8PROC_STRIPMARK | Strips all character markings. This includes non-spacing, spacing and enclosing (i.e. accents).
|
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category | ( | utf8proc_int32_t | codepoint | ) |
Return the Unicode category for the codepoint (one of the utf8proc_category_t constants.)
UTF8PROC_DLLEXPORT const char* utf8proc_category_string | ( | utf8proc_int32_t | codepoint | ) |
Return the two-letter (nul-terminated) Unicode category string for the codepoint (e.g. "Lu"
or "Co"
).
UTF8PROC_DLLEXPORT int utf8proc_charwidth | ( | utf8proc_int32_t | codepoint | ) |
Given a codepoint, return a character width analogous to wcwidth(codepoint)
, except that a width of 0 is returned for non-printable codepoints instead of -1 as in wcwidth
.
isprint
or iscntrl
), use utf8proc_category. UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid | ( | utf8proc_int32_t | codepoint | ) |
Check if a codepoint is valid (regardless of whether it has been assigned a value by the current Unicode standard).
codepoint
is valid and otherwise return 0. UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose | ( | const utf8proc_uint8_t * | str, |
utf8proc_ssize_t | strlen, | ||
utf8proc_int32_t * | buffer, | ||
utf8proc_ssize_t | bufsize, | ||
utf8proc_option_t | options | ||
) |
The same as utf8proc_decompose_char, but acts on a whole UTF-8 string and orders the decomposed sequences correctly.
If the UTF8PROC_NULLTERM flag in options
is set, processing will be stopped when a NULL byte is encountered, otherwise strlen
bytes are processed. The result (in the form of 32-bit unicode codepoints) is written into the buffer being pointed to by buffer
(which must contain at least bufsize
entries). In case of success, the number of codepoints written is returned; in case of an error, a negative error code is returned (utf8proc_errmsg).
If the number of written codepoints would be bigger than bufsize
, the required buffer size is returned, while the buffer will be overwritten with undefined data.
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char | ( | utf8proc_int32_t | codepoint, |
utf8proc_int32_t * | dst, | ||
utf8proc_ssize_t | bufsize, | ||
utf8proc_option_t | options, | ||
int * | last_boundclass | ||
) |
Decompose a codepoint into an array of codepoints.
codepoint | the codepoint. |
dst | the destination buffer. |
bufsize | the size of the destination buffer. |
options | one or more of the following flags:
|
last_boundclass | Pointer to an integer variable containing the previous codepoint's boundary class if the UTF8PROC_CHARBOUND option is used. Otherwise, this parameter is ignored. |
bufsize
, the required buffer size is returned, while the buffer will be overwritten with undefined data. UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char | ( | utf8proc_int32_t | codepoint, |
utf8proc_uint8_t * | dst | ||
) |
Encodes the codepoint as an UTF-8 string in the byte array pointed to by dst
. This array must be at least 4 bytes long.
In case of success the number of bytes written is returned, and otherwise 0 is returned.
This function does not check whether codepoint
is valid Unicode.
UTF8PROC_DLLEXPORT const char* utf8proc_errmsg | ( | utf8proc_ssize_t | errcode | ) |
Returns an informative error string for the given utf8proc error code (e.g. the error codes returned by utf8proc_map).
UTF8PROC_DLLEXPORT const utf8proc_property_t* utf8proc_get_property | ( | utf8proc_int32_t | codepoint | ) |
Look up the properties for a given codepoint.
codepoint | The Unicode codepoint. |
category
is 0 (UTF8PROC_CATEGORY_CN). UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break | ( | utf8proc_int32_t | codepoint1, |
utf8proc_int32_t | codepoint2 | ||
) |
Given a pair of consecutive codepoints, return whether a grapheme break is permitted between them (as defined by the extended grapheme clusters in UAX#29).
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate | ( | const utf8proc_uint8_t * | str, |
utf8proc_ssize_t | strlen, | ||
utf8proc_int32_t * | codepoint_ref | ||
) |
Reads a single codepoint from the UTF-8 sequence being pointed to by str
. The maximum number of bytes read is strlen
, unless strlen
is negative (in which case up to 4 bytes are read).
If a valid codepoint could be read, it is stored in the variable pointed to by codepoint_ref
, otherwise that variable will be set to -1. In case of success, the number of bytes read is returned; otherwise, a negative error code is returned.
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map | ( | const utf8proc_uint8_t * | str, |
utf8proc_ssize_t | strlen, | ||
utf8proc_uint8_t ** | dstptr, | ||
utf8proc_option_t | options | ||
) |
Maps the given UTF-8 string pointed to by str
to a new UTF-8 string, allocated dynamically by malloc
and returned via dstptr
.
If the UTF8PROC_NULLTERM flag in the options
field is set, the length is determined by a NULL terminator, otherwise the parameter strlen
is evaluated to determine the string length, but in any case the result will be NULL terminated (though it might contain NULL characters within the string if str
contained NULL characters). Other flags in the options
field are passed to the functions defined above, and regarded as described.
In case of success the length of the new string is returned, otherwise a negative error code is returned.
malloc
, and should therefore be deallocated with free
. UTF8PROC_DLLEXPORT utf8proc_uint8_t* utf8proc_NFC | ( | const utf8proc_uint8_t * | str | ) |
NFC normalization (UTF8PROC_COMPOSE).
UTF8PROC_DLLEXPORT utf8proc_uint8_t* utf8proc_NFD | ( | const utf8proc_uint8_t * | str | ) |
NFD normalization (UTF8PROC_DECOMPOSE).
UTF8PROC_DLLEXPORT utf8proc_uint8_t* utf8proc_NFKC | ( | const utf8proc_uint8_t * | str | ) |
NFKC normalization (UTF8PROC_COMPOSE and UTF8PROC_COMPAT).
UTF8PROC_DLLEXPORT utf8proc_uint8_t* utf8proc_NFKD | ( | const utf8proc_uint8_t * | str | ) |
NFKD normalization (UTF8PROC_DECOMPOSE and UTF8PROC_COMPAT).
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode | ( | utf8proc_int32_t * | buffer, |
utf8proc_ssize_t | length, | ||
utf8proc_option_t | options | ||
) |
Reencodes the sequence of length
codepoints pointed to by buffer
as UTF-8 data in-place (i.e., the result is also stored in buffer
).
buffer | the (native-endian UTF-32) unicode codepoints to re-encode. |
length | the length (in codepoints) of the buffer. |
options | a bitwise or (| ) of one or more of the following flags:
|
buffer
must exceed the amount of the input data by one byte, and the entries of the array pointed to by buffer
have to be in the range 0x0000
to 0x10FFFF
. Otherwise, the program might crash! UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower | ( | utf8proc_int32_t | c | ) |
Given a codepoint c
, return the codepoint of the corresponding lower-case character, if any; otherwise (if there is no lower-case variant, or if c
is not a valid codepoint) return c
.
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper | ( | utf8proc_int32_t | c | ) |
Given a codepoint c
, return the codepoint of the corresponding upper-case character, if any; otherwise (if there is no upper-case variant, or if c
is not a valid codepoint) return c
.
UTF8PROC_DLLEXPORT const char* utf8proc_version | ( | void | ) |
Returns the utf8proc API version as a string MAJOR.MINOR.PATCH (http://semver.org format), possibly with a "-dev" suffix for development versions.
|
extern |
Array containing the byte lengths of a UTF-8 encoded codepoint based on the first byte.