usual/utf8.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62

/** @file
 * Low-level UTF8 handling.
 */

/*
 * Copyright (c) 2009  Marko Kreen
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#ifndef _USUAL_UTF8_H_
#define _USUAL_UTF8_H_

#include <usual/base.h>

/**
 * Parse Unicode codepoint from UTF8 stream.
 *
 * On invalid UTF8 sequence returns negative byte value and
 * inreases src_p by one.
 *
 * @param src_p Location of data pointer.  Will be incremented in-place.
 * @param srcend  Pointer to end of data.
 * @return UNOCODE codepoint or negative byte value on error.
 */
int  utf8_get_char(const char **src_p, const char *srcend);

/**
 * Write Unicode codepoint as UTF8 sequence.
 *
 * Skips invalid Unicode values without error.
 *
 * @param c       Unicode codepoint.
 * @param dst_p   Location of dest pointer, will be increased in-place.
 * @param dstend  Pointer to end of buffer.
 * @return false if not room, true otherwise.
 */
bool utf8_put_char(unsigned int c, char **dst_p, const char *dstend);

/** Return UTF8 seq length based on unicode codepoint */
int utf8_char_size(unsigned int c);

/** Return UTF8 seq length based on first byte */
int utf8_seq_size(unsigned char c);

/** Return sequence length if all bytes are valid, 0 otherwise. */
int utf8_validate_seq(const char *src, const char *srcend);

bool utf8_validate_string(const char *src, const char *end);

#endif