|
libunibreak 7.0
|
Implementation of the line breaking algorithm as described in Unicode Standard Annex 14. More...
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "eastasianwidthdef.h"
#include "linebreak.h"
#include "linebreakdef.h"
Macros | |
| #define | UB_LB25_OPT_HACK 1 |
| #define | LINEBREAK_UNDEFINED -1 |
| Special value used internally to indicate an undefined break result. | |
| #define | INVALID_POS ((size_t)-1) |
| Special value used internally to mark an invalid position. | |
| #define | ENDS_WITH(str, suffix) |
Enumerations | |
| enum | BreakAction { DIR_BRK , IND_BRK , CMI_BRK , CMP_BRK , PRH_BRK } |
| Enumeration of break actions. More... | |
Functions | |
| void | init_linebreak (void) |
| Does nothing. | |
| void | lb_init_break_context (struct LineBreakContext *lbpCtx, utf32_t ch, const char *lang) |
| Initializes line breaking context for a given language. | |
| int | lb_process_next_char (struct LineBreakContext *lbpCtx, utf32_t ch) |
| Updates the LineBreakContext for the next codepoint and returns the detected break. | |
| enum LineBreakClass | lb_get_char_class (const struct LineBreakContext *lbpCtx, utf32_t ch) |
| Gets the line breaking class of a character for a line breaking context. | |
| size_t | set_linebreaks (const void *s, size_t len, const char *lang, enum BreakOutputType outputType, char *brks, get_next_char_t get_next_char) |
| Sets the line breaking information for a generic input string. | |
| void | set_linebreaks_utf8 (const utf8_t *s, size_t len, const char *lang, char *brks) |
| Sets the line breaking information for a UTF-8 input string. | |
| size_t | set_linebreaks_utf8_per_code_point (const utf8_t *s, size_t len, const char *lang, char *brks) |
| Sets the line breaking information for a UTF-8 input string. | |
| void | set_linebreaks_utf16 (const utf16_t *s, size_t len, const char *lang, char *brks) |
| Sets the line breaking information for a UTF-16 input string. | |
| size_t | set_linebreaks_utf16_per_code_point (const utf16_t *s, size_t len, const char *lang, char *brks) |
| Sets the line breaking information for a UTF-16 input string. | |
| void | set_linebreaks_utf32 (const utf32_t *s, size_t len, const char *lang, char *brks) |
| Sets the line breaking information for a UTF-32 input string. | |
| int | is_line_breakable (utf32_t char1, utf32_t char2, const char *lang) |
| Tells whether a line break can occur between two Unicode characters. | |
Implementation of the line breaking algorithm as described in Unicode Standard Annex 14.
| #define ENDS_WITH(str, suffix) |
| #define INVALID_POS ((size_t)-1) |
Special value used internally to mark an invalid position.
| #define LINEBREAK_UNDEFINED -1 |
Special value used internally to indicate an undefined break result.
| #define UB_LB25_OPT_HACK 1 |
| enum BreakAction |
| void init_linebreak(void) |
Does nothing.
This is kept for binary compatibility.
| int is_line_breakable(utf32_t char1, utf32_t char2, const char *lang) |
Tells whether a line break can occur between two Unicode characters.
This is a wrapper function that exposes a simple interface. Generally speaking, it is better to use set_linebreaks_utf32 instead, since this function cannot correctly process complicated cases involving combining marks, spaces, etc.
| char1 | the first Unicode character |
| char2 | the second Unicode character |
| lang | language of the input |
| enum LineBreakClass lb_get_char_class(const struct LineBreakContext *lbpCtx, utf32_t ch) |
Gets the line breaking class of a character for a line breaking context.
This function will check the language-specific data first, and then the default data if there is no language-specific property available for the character.
| lbpCtx | pointer to the line breaking context |
| ch | character to check |
Returns the line breaking class of the character if found; LBP_XX otherwise.
| void lb_init_break_context(struct LineBreakContext *lbpCtx, utf32_t ch, const char *lang) |
Initializes line breaking context for a given language.
| [in,out] | lbpCtx | pointer to the line breaking context |
| [in] | ch | the first character to process |
| [in] | lang | language of the input |
| int lb_process_next_char(struct LineBreakContext *lbpCtx, utf32_t ch) |
Updates the LineBreakContext for the next codepoint and returns the detected break.
This function is deprecated because it cannot support the fixups required by LB25 tailoring (and by some more recent rules). See the implementation of set_linebreaks for the fixup logic.
| [in,out] | lbpCtx | pointer to the line breaking context |
| [in] | ch | Unicode codepoint |
| size_t set_linebreaks(const void *s, size_t len, const char *lang, enum BreakOutputType outputType, char *brks, get_next_char_t get_next_char) |
Sets the line breaking information for a generic input string.
Currently, this implementation has customization for the following ISO 639-1 language codes (for lang):
In addition, the suffix "-strict" may be appended to the language code to request strict (as opposed to normal) line-breaking behaviour. See the Conditional Japanese Starter section of UAX #14 for more details.
| [in] | s | input string |
| [in] | len | length of the input |
| [in] | lang | language of the input |
| [in] | outputType | output per code-unit or per code-point |
| [out] | brks | pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR |
| [in] | get_next_char | function to get the next UTF-32 character |
| void set_linebreaks_utf16(const utf16_t *s, size_t len, const char *lang, char *brks) |
Sets the line breaking information for a UTF-16 input string.
| [in] | s | input UTF-16 string |
| [in] | len | length of the input |
| [in] | lang | language of the input |
| [out] | brks | pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR |
| size_t set_linebreaks_utf16_per_code_point(const utf16_t *s, size_t len, const char *lang, char *brks) |
Sets the line breaking information for a UTF-16 input string.
| [in] | s | input UTF-16 string |
| [in] | len | length of the input |
| [in] | lang | language of the input |
| [out] | brks | pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK |
| void set_linebreaks_utf32(const utf32_t *s, size_t len, const char *lang, char *brks) |
Sets the line breaking information for a UTF-32 input string.
| [in] | s | input UTF-32 string |
| [in] | len | length of the input |
| [in] | lang | language of the input |
| [out] | brks | pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR |
| void set_linebreaks_utf8(const utf8_t *s, size_t len, const char *lang, char *brks) |
Sets the line breaking information for a UTF-8 input string.
| [in] | s | input UTF-8 string |
| [in] | len | length of the input |
| [in] | lang | language of the input |
| [out] | brks | pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR |
| size_t set_linebreaks_utf8_per_code_point(const utf8_t *s, size_t len, const char *lang, char *brks) |
Sets the line breaking information for a UTF-8 input string.
| [in] | s | input UTF-8 string |
| [in] | len | length of the input |
| [in] | lang | language of the input |
| [out] | brks | pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK |