-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy pathutf8.cpp
More file actions
150 lines (139 loc) · 4.99 KB
/
utf8.cpp
File metadata and controls
150 lines (139 loc) · 4.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#include <algorithm>
#include <string>
#include <iostream>
#include "utf8.h"
namespace Url
{
/**
 * Decode a single UTF-8 encoded code point from the range [it, end).
 *
 * @param it   Position of the first byte of the sequence. It is advanced past
 *             every byte that was consumed; when a bad start byte or a bad
 *             continuation byte is found, it is left just after that byte.
 * @param end  One past the last byte that may be read.
 * @return The decoded code point.
 * @throws std::invalid_argument if the range is empty, the start byte is not
 *         a valid lead byte, the sequence ends early, a continuation byte is
 *         malformed, the encoding is overlong, or the decoded value is
 *         greater than MAX_CODEPOINT.
 *
 * NOTE(review): surrogate code points (U+D800-U+DFFF) are still accepted, the
 * same way writeCodepoint still encodes them. RFC 3629 forbids them; confirm
 * whether any caller relies on this before tightening both functions.
 */
Utf8::codepoint_t Utf8::readCodepoint(
    std::string::const_iterator& it, const std::string::const_iterator& end)
{
    // Dereferencing `end` is undefined behaviour, so refuse an empty range.
    if (it == end)
    {
        throw std::invalid_argument("Cannot read a code point from an empty range.");
    }

    Utf8::char_t current = static_cast<Utf8::char_t>(*it++);
    if (!(current & 0x80))
    {
        // High bit clear: a single-byte sequence that stands for itself.
        return current;
    }

    // Number of additional bytes needed
    unsigned int bytes = 0;
    // The accumulated value
    Utf8::codepoint_t result = 0;
    // Smallest value that legitimately needs a sequence of this length;
    // anything below it is an overlong encoding.
    Utf8::codepoint_t minimum = 0;

    if (current < 0xC0)
    {
        // 10xxxxxx is a continuation byte and cannot start a sequence
        throw std::invalid_argument("Low UTF-8 start byte");
    }
    else if (current < 0xE0)
    {
        // 110xxxxx: one additional byte, two bytes total, use 5 bits
        bytes = 1;
        result = current & 0x1F;
        minimum = 0x80;
    }
    else if (current < 0xF0)
    {
        // 1110xxxx: two additional bytes, three bytes total, use 4 bits
        bytes = 2;
        result = current & 0x0F;
        minimum = 0x800;
    }
    else if (current < 0xF8)
    {
        // 11110xxx: three additional bytes, four bytes total, use 3 bits
        bytes = 3;
        result = current & 0x07;
        minimum = 0x10000;
    }
    else
    {
        throw std::invalid_argument("High UTF-8 start byte");
    }

    for (; bytes > 0; --bytes)
    {
        if (it == end)
        {
            throw std::invalid_argument("UTF-8 sequence terminated early.");
        }

        current = static_cast<unsigned char>(*it++);

        // Ensure the first two bits are 10
        if ((current & 0xC0) != 0x80)
        {
            throw std::invalid_argument("Invalid continuation byte");
        }

        // Each continuation byte contributes its low six bits
        result = (result << 6) | (current & 0x3F);
    }

    // Reject values that fit in a shorter sequence (e.g. C0 AF for '/'):
    // accepting them lets input slip past byte-level filters.
    if (result < minimum)
    {
        throw std::invalid_argument("Overlong UTF-8 encoding");
    }

    // A four-byte sequence can carry more than writeCodepoint will encode;
    // keep the decoder consistent with the encoder.
    if (result > MAX_CODEPOINT)
    {
        throw std::invalid_argument("Code point too high.");
    }

    return result;
}
/**
 * Append the UTF-8 encoding of a code point to a string.
 *
 * @param str    String that receives the encoded bytes.
 * @param value  Code point to encode.
 * @return A reference to `str`.
 * @throws std::invalid_argument if `value` is greater than MAX_CODEPOINT;
 *         nothing is appended in that case.
 */
std::string& Utf8::writeCodepoint(std::string& str, Utf8::codepoint_t value)
{
    if (value > MAX_CODEPOINT)
    {
        throw std::invalid_argument("Code point too high.");
    }

    if (value <= 0x007F)
    {
        // Seven bits or fewer: the byte is the code point itself.
        str.push_back(static_cast<char>(value));
        return str;
    }

    // Lead byte: a length marker in the high bits, followed by the most
    // significant payload bits of the code point.
    char lead = 0;
    // How many 10xxxxxx continuation bytes follow the lead byte.
    unsigned int trailing = 0;

    if (value > 0xFFFF)
    {
        // 11110xxx + 3 continuation bytes (21 bits): lead carries bits 20-18.
        lead = static_cast<char>(0xF0 | ((value >> 18) & 0x07));
        trailing = 3;
    }
    else if (value > 0x07FF)
    {
        // 1110xxxx + 2 continuation bytes (16 bits): lead carries bits 15-12.
        lead = static_cast<char>(0xE0 | ((value >> 12) & 0x0F));
        trailing = 2;
    }
    else
    {
        // 110xxxxx + 1 continuation byte (11 bits): lead carries bits 10-6.
        lead = static_cast<char>(0xC0 | ((value >> 6) & 0x1F));
        trailing = 1;
    }
    str.push_back(lead);

    // Emit the remaining payload six bits at a time, most significant group
    // first, each group tagged with the 10 continuation prefix.
    while (trailing > 0)
    {
        --trailing;
        str.push_back(static_cast<char>(0x80 | ((value >> (6 * trailing)) & 0x3F)));
    }

    return str;
}
};