00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029 #pragma once
00030
00031 #include <drizzled/utf8/core.h>
00032 #include <stdexcept>
00033
00034 namespace drizzled
00035 {
00036 namespace utf8
00037 {
00038
/// Common base class for the exception types declared in this header, so a
/// caller can handle all of them with a single catch clause.
class exception : public std::exception
{
};
00041
00042
00043 class invalid_code_point : public exception {
00044 uint32_t cp;
00045 public:
00046 invalid_code_point(uint32_t cp_in) : cp(cp_in) {}
00047 virtual const char* what() const throw() { return "Invalid code point"; }
00048 uint32_t code_point() const {return cp;}
00049 };
00050
00051 class invalid_utf8 : public exception {
00052 uint8_t u8;
00053 public:
00054 invalid_utf8 (uint8_t u) : u8(u) {}
00055 virtual const char* what() const throw() { return "Invalid UTF-8"; }
00056 uint8_t utf8_octet() const {return u8;}
00057 };
00058
00059 class invalid_utf16 : public exception {
00060 uint16_t u16;
00061 public:
00062 invalid_utf16 (uint16_t u) : u16(u) {}
00063 virtual const char* what() const throw() { return "Invalid UTF-16"; }
00064 uint16_t utf16_word() const {return u16;}
00065 };
00066
00067 class not_enough_room : public exception {
00068 public:
00069 virtual const char* what() const throw() { return "Not enough space"; }
00070 };
00071
00073
00074 template <typename octet_iterator, typename output_iterator>
00075 output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
00076 {
00077 while (start != end) {
00078 octet_iterator sequence_start = start;
00079 internal::utf_error err_code = internal::validate_next(start, end);
00080 switch (err_code) {
00081 case internal::UTF8_OK :
00082 for (octet_iterator it = sequence_start; it != start; ++it)
00083 *out++ = *it;
00084 break;
00085 case internal::NOT_ENOUGH_ROOM:
00086 throw not_enough_room();
00087 case internal::INVALID_LEAD:
00088 append (replacement, out);
00089 ++start;
00090 break;
00091 case internal::INCOMPLETE_SEQUENCE:
00092 case internal::OVERLONG_SEQUENCE:
00093 case internal::INVALID_CODE_POINT:
00094 append (replacement, out);
00095 ++start;
00096
00097 while (internal::is_trail(*start) && start != end)
00098 ++start;
00099 break;
00100 }
00101 }
00102 return out;
00103 }
00104
00105 template <typename octet_iterator, typename output_iterator>
00106 inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
00107 {
00108 static const uint32_t replacement_marker = internal::mask16(0xfffd);
00109 return replace_invalid(start, end, out, replacement_marker);
00110 }
00111
00112 template <typename octet_iterator>
00113 octet_iterator append(uint32_t cp, octet_iterator result)
00114 {
00115 if (!internal::is_code_point_valid(cp))
00116 throw invalid_code_point(cp);
00117
00118 if (cp < 0x80)
00119 *(result++) = static_cast<uint8_t>(cp);
00120 else if (cp < 0x800) {
00121 *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
00122 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
00123 }
00124 else if (cp < 0x10000) {
00125 *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
00126 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
00127 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
00128 }
00129 else {
00130 *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
00131 *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
00132 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
00133 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
00134 }
00135 return result;
00136 }
00137
00138 template <typename octet_iterator>
00139 uint32_t next(octet_iterator& it, octet_iterator end)
00140 {
00141 uint32_t cp = 0;
00142 internal::utf_error err_code = internal::validate_next(it, end, &cp);
00143 switch (err_code) {
00144 case internal::UTF8_OK :
00145 break;
00146 case internal::NOT_ENOUGH_ROOM :
00147 throw not_enough_room();
00148 case internal::INVALID_LEAD :
00149 case internal::INCOMPLETE_SEQUENCE :
00150 case internal::OVERLONG_SEQUENCE :
00151 throw invalid_utf8(*it);
00152 case internal::INVALID_CODE_POINT :
00153 throw invalid_code_point(cp);
00154 }
00155 return cp;
00156 }
00157
/// Decodes the code point at `it` without moving the caller's iterator: `it`
/// is taken by value, so next() only advances the local copy.
///
/// @return the code point that next() decodes at this position
/// @throws whatever next() throws for the same input
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it, octet_iterator end)
{
    const uint32_t code_point = next(it, end);
    return code_point;
}
00163
00164 template <typename octet_iterator>
00165 uint32_t prior(octet_iterator& it, octet_iterator start)
00166 {
00167 octet_iterator end = it;
00168 while (internal::is_trail(*(--it)))
00169 if (it < start)
00170 throw invalid_utf8(*it);
00171 octet_iterator temp = it;
00172 return next(temp, end);
00173 }
00174
00176 template <typename octet_iterator>
00177 uint32_t previous(octet_iterator& it, octet_iterator pass_start)
00178 {
00179 octet_iterator end = it;
00180 while (internal::is_trail(*(--it)))
00181 if (it == pass_start)
00182 throw invalid_utf8(*it);
00183 octet_iterator temp = it;
00184 return next(temp, end);
00185 }
00186
/// Calls next() on `it` `n` times, discarding the decoded values.
///
/// @param it   iterator to move; updated in place
/// @param n    number of next() calls; nothing happens when it is not positive
/// @param end  one past the last octet of the input
/// @throws whatever next() throws
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
    distance_type done = 0;
    while (done < n) {
        next(it, end);
        ++done;
    }
}
00193
/// Counts how many next() calls it takes to move `first` up to `last`.
///
/// The loop condition uses `first < last`, so the iterator type must support
/// that comparison.
///
/// @return the number of code points decoded
/// @throws whatever next() throws
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typename std::iterator_traits<octet_iterator>::difference_type count = 0;
    while (first < last) {
        next(first, last);
        ++count;
    }
    return count;
}
00203
00204 template <typename u16bit_iterator, typename octet_iterator>
00205 octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
00206 {
00207 while (start != end) {
00208 uint32_t cp = internal::mask16(*start++);
00209
00210 if (internal::is_lead_surrogate(cp)) {
00211 if (start != end) {
00212 uint32_t trail_surrogate = internal::mask16(*start++);
00213 if (internal::is_trail_surrogate(trail_surrogate))
00214 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
00215 else
00216 throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
00217 }
00218 else
00219 throw invalid_utf16(static_cast<uint16_t>(cp));
00220
00221 }
00222
00223 else if (internal::is_trail_surrogate(cp))
00224 throw invalid_utf16(static_cast<uint16_t>(cp));
00225
00226 result = append(cp, result);
00227 }
00228 return result;
00229 }
00230
00231 template <typename u16bit_iterator, typename octet_iterator>
00232 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
00233 {
00234 while (start != end) {
00235 uint32_t cp = next(start, end);
00236 if (cp > 0xffff) {
00237 *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
00238 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
00239 }
00240 else
00241 *result++ = static_cast<uint16_t>(cp);
00242 }
00243 return result;
00244 }
00245
/// Encodes each 32-bit value in [start, end) as UTF-8 through `result`.
///
/// @return the output iterator positioned after the last octet written
/// @throws whatever append() throws
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start)
        result = append(*start, result);

    return result;
}
00254
/// Decodes the UTF-8 octets in [start, end) and writes one code point per
/// decoded sequence through `result`.
///
/// @return the output iterator positioned after the last code point written
/// @throws whatever next() throws
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
    while (start != end) {
        const uint32_t cp = next(start, end);
        *result = cp;
        ++result;
    }

    return result;
}
00263
00264
/// Bidirectional iterator adaptor that walks a range of UTF-8 octets one code
/// point at a time. Dereferencing decodes with next(); increment and decrement
/// use next() and prior(), so they can throw whatever those functions throw.
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
    octet_iterator it;           // current position in the octet range
    octet_iterator range_start;  // first octet of the range
    octet_iterator range_end;    // one past the last octet of the range
public:
    /// Value-initialises all three positions so that default-constructed
    /// iterators over pointer types hold null rather than indeterminate
    /// values and can be copied and compared safely.
    iterator () : it(), range_start(), range_end() {}

    /// @param octet_it        initial position
    /// @param range_start_in  first octet of the range
    /// @param range_end_in    one past the last octet of the range
    /// @throws std::out_of_range when `octet_it` lies outside
    ///         [range_start_in, range_end_in]
    explicit iterator (const octet_iterator& octet_it,
                       const octet_iterator& range_start_in,
                       const octet_iterator& range_end_in) :
        it(octet_it), range_start(range_start_in), range_end(range_end_in)
    {
        if (it < range_start || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }

    /// The underlying octet position.
    octet_iterator base () const { return it; }

    /// Decodes the code point at the current position without moving.
    uint32_t operator * () const
    {
        octet_iterator temp = it;
        return next(temp, range_end);
    }

    /// @throws std::logic_error when the two iterators were created over
    ///         different ranges
    bool operator == (const iterator& rhs) const
    {
        if (range_start != rhs.range_start || range_end != rhs.range_end)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return (it == rhs.it);
    }

    bool operator != (const iterator& rhs) const
    {
        return !(operator == (rhs));
    }

    /// Pre-increment: steps over one code point.
    iterator& operator ++ ()
    {
        next(it, range_end);
        return *this;
    }

    /// Post-increment: steps over one code point, returns the old position.
    iterator operator ++ (int)
    {
        iterator temp = *this;
        next(it, range_end);
        return temp;
    }

    /// Pre-decrement: steps back over one code point.
    iterator& operator -- ()
    {
        prior(it, range_start);
        return *this;
    }

    /// Post-decrement: steps back over one code point, returns the old position.
    iterator operator -- (int)
    {
        iterator temp = *this;
        prior(it, range_start);
        return temp;
    }
};
00320
00321 }
00322 }
00323
00324
00325