00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027 #pragma once
00028
00029 #include <iterator>
00030
namespace drizzled
{
namespace utf8
{

// Implementation details of the UTF-8 validation routines. Everything in
// this namespace is a building block for the public functions that follow it.
namespace internal
{

// UTF-16 surrogate range boundaries. Code points inside
// [LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX] are rejected by
// is_code_point_valid() below.
const uint16_t LEAD_SURROGATE_MIN = 0xd800u;
const uint16_t LEAD_SURROGATE_MAX = 0xdbffu;
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;

// NOTE(review): the two offsets below are not used anywhere in this file;
// presumably they serve UTF-16 <-> code point conversion code elsewhere.
const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10);
const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

// Largest code point accepted by is_code_point_valid().
const uint32_t CODE_POINT_MAX = 0x0010ffffu;

// Returns the low 8 bits of `oc`. Used so that octets read through a signed
// char iterator compare correctly against unsigned constants.
template<typename octet_type>
inline uint8_t mask8(octet_type oc)
{
  return static_cast<uint8_t>(0xff & oc);
}

// Returns the low 16 bits of `oc`.
template<typename u16_type>
inline uint16_t mask16(u16_type oc)
{
  return static_cast<uint16_t>(0xffff & oc);
}

// True when `oc` has the bit pattern 10xxxxxx, i.e. it is a continuation
// (trail) octet of a multi-octet sequence.
template<typename octet_type>
inline bool is_trail(octet_type oc)
{
  return ((mask8(oc) >> 6) == 0x2);
}

// True when `cp` lies in [LEAD_SURROGATE_MIN, LEAD_SURROGATE_MAX].
template <typename u16>
inline bool is_lead_surrogate(u16 cp)
{
  return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
}

// True when `cp` lies in [TRAIL_SURROGATE_MIN, TRAIL_SURROGATE_MAX].
template <typename u16>
inline bool is_trail_surrogate(u16 cp)
{
  return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
}

// True when `cp` lies anywhere in the surrogate range (lead or trail).
template <typename u16>
inline bool is_surrogate(u16 cp)
{
  return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
}

// True when `cp` is accepted as a code point: not above CODE_POINT_MAX, not
// a surrogate, and neither 0xfffe nor 0xffff (both are explicitly rejected).
template <typename u32>
inline bool is_code_point_valid(u32 cp)
{
  return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
}

// Returns the number of octets (1-4) announced by the lead octet `*lead_it`,
// or 0 when the octet is not a valid lead octet.
// `lead_it` is dereferenced unconditionally, so it must be dereferenceable.
template <typename octet_iterator>
inline typename std::iterator_traits<octet_iterator>::difference_type
sequence_length(octet_iterator lead_it)
{
  const uint8_t lead = mask8(*lead_it);
  if (lead < 0x80)
    return 1;                      // 0xxxxxxx
  if ((lead >> 5) == 0x6)
    return 2;                      // 110xxxxx
  if ((lead >> 4) == 0xe)
    return 3;                      // 1110xxxx
  if ((lead >> 3) == 0x1e)
    return 4;                      // 11110xxx
  return 0;
}

// True when `cp` was encoded with more octets (`length`) than its value
// requires. Code points of 0x10000 and above are never reported as overlong.
template <typename octet_difference_type>
inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
{
  if (cp < 0x80)
    return length != 1;
  if (cp < 0x800)
    return length != 2;
  if (cp < 0x10000)
    return length != 3;
  return false;
}

// Result codes produced by the decoding and validation helpers below.
enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

// Advances `it` by one octet and checks that it now refers to a trail octet.
// Returns NOT_ENOUGH_ROOM when the advance reaches `end`,
// INCOMPLETE_SEQUENCE when the new octet is not a trail octet, and UTF8_OK
// otherwise. `it` stays advanced in every case.
template <typename octet_iterator>
inline utf_error advance_to_trail(octet_iterator& it, octet_iterator end)
{
  if (++it == end)
    return NOT_ENOUGH_ROOM;
  if (!is_trail(*it))
    return INCOMPLETE_SEQUENCE;
  return UTF8_OK;
}

// Reads a one-octet sequence at `it`.
// On success stores the octet value through `code_point` (when non-null) and
// returns UTF8_OK; `it` is not advanced. Returns NOT_ENOUGH_ROOM when
// `it == end`.
template <typename octet_iterator>
utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
{
  if (it == end)
    return NOT_ENOUGH_ROOM;

  if (code_point)
    *code_point = mask8(*it);
  return UTF8_OK;
}

// Decodes a two-octet sequence starting at `it`.
// On success stores the decoded value through `code_point` (when non-null),
// leaves `it` on the LAST octet of the sequence and returns UTF8_OK.
// On failure `it` is left wherever decoding stopped and the result is
// NOT_ENOUGH_ROOM (range exhausted) or INCOMPLETE_SEQUENCE (non-trail octet).
template <typename octet_iterator>
utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
{
  if (it == end)
    return NOT_ENOUGH_ROOM;

  uint32_t cp = mask8(*it);

  const utf_error status = advance_to_trail(it, end);
  if (status != UTF8_OK)
    return status;
  cp = ((cp << 6) & 0x7ff) + (mask8(*it) & 0x3f);

  if (code_point)
    *code_point = cp;
  return UTF8_OK;
}

// Decodes a three-octet sequence starting at `it`.
// Same contract as get_sequence_2().
template <typename octet_iterator>
utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
{
  if (it == end)
    return NOT_ENOUGH_ROOM;

  uint32_t cp = mask8(*it);

  utf_error status = advance_to_trail(it, end);
  if (status != UTF8_OK)
    return status;
  cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);

  status = advance_to_trail(it, end);
  if (status != UTF8_OK)
    return status;
  cp += mask8(*it) & 0x3f;

  if (code_point)
    *code_point = cp;
  return UTF8_OK;
}

// Decodes a four-octet sequence starting at `it`.
// Same contract as get_sequence_2().
template <typename octet_iterator>
utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
{
  if (it == end)
    return NOT_ENOUGH_ROOM;

  uint32_t cp = mask8(*it);

  utf_error status = advance_to_trail(it, end);
  if (status != UTF8_OK)
    return status;
  cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);

  status = advance_to_trail(it, end);
  if (status != UTF8_OK)
    return status;
  cp += (mask8(*it) << 6) & 0xfff;

  status = advance_to_trail(it, end);
  if (status != UTF8_OK)
    return status;
  cp += mask8(*it) & 0x3f;

  if (code_point)
    *code_point = cp;
  return UTF8_OK;
}

// Validates and decodes the sequence that starts at `it`.
//
// On success: stores the decoded value through `code_point` (when non-null),
// advances `it` to the first octet after the sequence, returns UTF8_OK.
// On failure: restores `it` to its value on entry, leaves `*code_point`
// untouched and returns the reason:
//   NOT_ENOUGH_ROOM      the range is empty or ends inside the sequence
//   INVALID_LEAD         the first octet cannot start a sequence
//   INCOMPLETE_SEQUENCE  an expected trail octet is missing
//   OVERLONG_SEQUENCE    the value was encoded with too many octets
//   INVALID_CODE_POINT   the value is rejected by is_code_point_valid()
//
// `it` is copied so that it can be restored on failure.
template <typename octet_iterator>
utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
{
  // sequence_length() dereferences the iterator, so an empty range has to be
  // rejected before it is called.
  if (it == end)
    return NOT_ENOUGH_ROOM;

  // Remember the starting position so it can be restored on failure.
  octet_iterator original_it = it;

  uint32_t cp = 0;

  // Determine the sequence length from the lead octet.
  typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
  const octet_difference_type length = sequence_length(it);
  if (length == 0)
    return INVALID_LEAD;

  // Decode the octets; on success `it` rests on the last octet.
  utf_error err = UTF8_OK;
  switch (length) {
    case 1:
      err = get_sequence_1(it, end, &cp);
      break;
    case 2:
      err = get_sequence_2(it, end, &cp);
      break;
    case 3:
      err = get_sequence_3(it, end, &cp);
      break;
    case 4:
      err = get_sequence_4(it, end, &cp);
      break;
  }

  if (err == UTF8_OK) {
    if (!is_code_point_valid(cp))
      err = INVALID_CODE_POINT;
    else if (is_overlong_sequence(cp, length))
      err = OVERLONG_SEQUENCE;
    else {
      // Fully valid: publish the value and step past the last octet.
      if (code_point)
        *code_point = cp;
      ++it;
      return UTF8_OK;
    }
  }

  // Failure: put the iterator back where the caller had it.
  it = original_it;
  return err;
}

// Convenience overload of validate_next() for callers that do not need the
// decoded value.
template <typename octet_iterator>
inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
  return validate_next(it, end, static_cast<uint32_t*>(0));
}

} // namespace internal

// The three octets of the UTF-8 byte order mark.
const uint8_t bom[] = {0xef, 0xbb, 0xbf};

// Returns an iterator to the first octet of the first sequence in
// [start, end) that fails validation, or `end` when every sequence is valid.
template <typename octet_iterator>
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
{
  octet_iterator result = start;
  while (result != end) {
    // validate_next() advances `result` on success and leaves it at the
    // start of the offending sequence on failure.
    if (internal::validate_next(result, end) != internal::UTF8_OK)
      return result;
  }
  return result;
}

// True when [start, end) consists only of valid sequences (an empty range
// is valid).
template <typename octet_iterator>
inline bool is_valid(octet_iterator start, octet_iterator end)
{
  return (find_invalid(start, end) == end);
}

// True when [it, end) begins with the three BOM octets. Never reads past
// `end`.
template <typename octet_iterator>
inline bool starts_with_bom (octet_iterator it, octet_iterator end)
{
  return (
    ((it != end) && (internal::mask8(*it++)) == bom[0]) &&
    ((it != end) && (internal::mask8(*it++)) == bom[1]) &&
    ((it != end) && (internal::mask8(*it))   == bom[2])
  );
}

// True when the octets at `it` are the three BOM octets.
// There is no end-of-range check: up to three octets are read, so the caller
// must guarantee they are readable. Use starts_with_bom() for a bounded
// check.
template <typename octet_iterator>
inline bool is_bom (octet_iterator it)
{
  return (
    (internal::mask8(*it++)) == bom[0] &&
    (internal::mask8(*it++)) == bom[1] &&
    (internal::mask8(*it))   == bom[2]
  );
}

} // namespace utf8
} // namespace drizzled
00350
00351
00352