Drizzled Public API Documentation

checked.h
00001 // Copyright 2006 Nemanja Trifunovic
00002 
00003 /*
00004 Permission is hereby granted, free of charge, to any person or organization
00005 obtaining a copy of the software and accompanying documentation covered by
00006 this license (the "Software") to use, reproduce, display, distribute,
00007 execute, and transmit the Software, and to prepare derivative works of the
00008 Software, and to permit third-parties to whom the Software is furnished to
00009 do so, all subject to the following:
00010 
00011 The copyright notices in the Software and this entire statement, including
00012 the above license grant, this restriction and the following disclaimer,
00013 must be included in all copies of the Software, in whole or in part, and
00014 all derivative works of the Software, unless such copies or derivative
00015 works are solely in the form of machine-executable object code generated by
00016 a source language processor.
00017 
00018 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00019 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00020 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
00021 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
00022 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
00023 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
00024 DEALINGS IN THE SOFTWARE.
00025 */
00026 
00027 
00028  
00029 #pragma once
00030  
00031 #include <drizzled/utf8/core.h>
00032 #include <stdexcept>
00033  
00034 namespace drizzled
00035 {
00036 namespace utf8
00037 {
/// Root of the exception hierarchy used by this UTF-8 library.
/// It adds nothing to std::exception; it exists so that library errors
/// can be caught as one family.
class exception : public std::exception
{
};
00041 
00042     // Exceptions that may be thrown from the library functions.
00043     class invalid_code_point : public exception {
00044         uint32_t cp;
00045     public:
00046         invalid_code_point(uint32_t cp_in) : cp(cp_in) {}
00047         virtual const char* what() const throw() { return "Invalid code point"; }
00048         uint32_t code_point() const {return cp;}
00049     };
00050 
00051     class invalid_utf8 : public exception {
00052         uint8_t u8;
00053     public:
00054         invalid_utf8 (uint8_t u) : u8(u) {}
00055         virtual const char* what() const throw() { return "Invalid UTF-8"; }
00056         uint8_t utf8_octet() const {return u8;}
00057     };
00058 
00059     class invalid_utf16 : public exception {
00060         uint16_t u16;
00061     public:
00062         invalid_utf16 (uint16_t u) : u16(u) {}
00063         virtual const char* what() const throw() { return "Invalid UTF-16"; }
00064         uint16_t utf16_word() const {return u16;}
00065     };
00066 
00067     class not_enough_room : public exception {
00068     public:
00069         virtual const char* what() const throw() { return "Not enough space"; }
00070     };
00071 
00073 
00074     template <typename octet_iterator, typename output_iterator>
00075     output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
00076     {
00077         while (start != end) {
00078             octet_iterator sequence_start = start;
00079             internal::utf_error err_code = internal::validate_next(start, end);
00080             switch (err_code) {
00081                 case internal::UTF8_OK :
00082                     for (octet_iterator it = sequence_start; it != start; ++it)
00083                         *out++ = *it;
00084                     break;
00085                 case internal::NOT_ENOUGH_ROOM:
00086                     throw not_enough_room();
00087                 case internal::INVALID_LEAD:
00088                     append (replacement, out);
00089                     ++start;
00090                     break;
00091                 case internal::INCOMPLETE_SEQUENCE:
00092                 case internal::OVERLONG_SEQUENCE:
00093                 case internal::INVALID_CODE_POINT:
00094                     append (replacement, out);
00095                     ++start;
00096                     // just one replacement mark for the sequence
00097                     while (internal::is_trail(*start) && start != end)
00098                         ++start;
00099                     break;
00100             }
00101         }
00102         return out;
00103     }
00104 
00105     template <typename octet_iterator, typename output_iterator>
00106     inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
00107     {
00108         static const uint32_t replacement_marker = internal::mask16(0xfffd);
00109         return replace_invalid(start, end, out, replacement_marker);
00110     }
00111 
00112     template <typename octet_iterator>
00113     octet_iterator append(uint32_t cp, octet_iterator result)
00114     {
00115         if (!internal::is_code_point_valid(cp))
00116             throw invalid_code_point(cp);
00117 
00118         if (cp < 0x80)                        // one octet
00119             *(result++) = static_cast<uint8_t>(cp);
00120         else if (cp < 0x800) {                // two octets
00121             *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
00122             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
00123         }
00124         else if (cp < 0x10000) {              // three octets
00125             *(result++) = static_cast<uint8_t>((cp >> 12)           | 0xe0);
00126             *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
00127             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
00128         }
00129         else {      // four octets
00130             *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
00131             *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)  | 0x80);
00132             *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
00133             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
00134         }
00135         return result;
00136     }
00137 
00138     template <typename octet_iterator>
00139     uint32_t next(octet_iterator& it, octet_iterator end)
00140     {
00141         uint32_t cp = 0;
00142         internal::utf_error err_code = internal::validate_next(it, end, &cp);
00143         switch (err_code) {
00144             case internal::UTF8_OK :
00145                 break;
00146             case internal::NOT_ENOUGH_ROOM :
00147                 throw not_enough_room();
00148             case internal::INVALID_LEAD :
00149             case internal::INCOMPLETE_SEQUENCE :
00150             case internal::OVERLONG_SEQUENCE :
00151                 throw invalid_utf8(*it);
00152             case internal::INVALID_CODE_POINT :
00153                 throw invalid_code_point(cp);
00154         }
00155         return cp;
00156     }
00157 
/// Returns the code point at `it` without moving the caller's iterator.
///
/// @return the value produced by next(); any exception it throws propagates
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it, octet_iterator end)
{
    // `it` is a by-value copy, so only the local copy is handed to next().
    const uint32_t code_point = next(it, end);
    return code_point;
}
00163 
00164     template <typename octet_iterator>
00165     uint32_t prior(octet_iterator& it, octet_iterator start)
00166     {
00167         octet_iterator end = it;
00168         while (internal::is_trail(*(--it)))
00169             if (it < start)
00170                 throw invalid_utf8(*it); // error - no lead byte in the sequence
00171         octet_iterator temp = it;
00172         return next(temp, end);
00173     }
00174 
00176     template <typename octet_iterator>
00177     uint32_t previous(octet_iterator& it, octet_iterator pass_start)
00178     {
00179         octet_iterator end = it;
00180         while (internal::is_trail(*(--it)))
00181             if (it == pass_start)
00182                 throw invalid_utf8(*it); // error - no lead byte in the sequence
00183         octet_iterator temp = it;
00184         return next(temp, end);
00185     }
00186 
/// Moves `it` forward by `n` code points by calling next() `n` times.
///
/// @param it   position to advance
/// @param n    number of code points to skip; nothing happens unless n > 0
/// @param end  one past the last octet of the input
/// @throws whatever next() throws
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
    distance_type steps_taken = 0;
    while (steps_taken < n) {
        next(it, end);
        ++steps_taken;
    }
}
00193 
/// Counts the code points in [first, last) by decoding them one by one.
///
/// @param first  first octet of the range
/// @param last   one past the last octet of the range
/// @return number of successful next() calls needed to reach `last`
/// @throws whatever next() throws
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;

    count_type code_points = 0;
    // The loop is bounded with operator<, as in the original.
    while (first < last) {
        next(first, last);
        ++code_points;
    }
    return code_points;
}
00203 
00204     template <typename u16bit_iterator, typename octet_iterator>
00205     octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
00206     {
00207         while (start != end) {
00208             uint32_t cp = internal::mask16(*start++);
00209             // Take care of surrogate pairs first
00210             if (internal::is_lead_surrogate(cp)) {
00211                 if (start != end) {
00212                     uint32_t trail_surrogate = internal::mask16(*start++);
00213                     if (internal::is_trail_surrogate(trail_surrogate))
00214                         cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
00215                     else
00216                         throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
00217                 }
00218                 else
00219                     throw invalid_utf16(static_cast<uint16_t>(cp));
00220 
00221             }
00222             // Lone trail surrogate
00223             else if (internal::is_trail_surrogate(cp))
00224                 throw invalid_utf16(static_cast<uint16_t>(cp));
00225 
00226             result = append(cp, result);
00227         }
00228         return result;
00229     }
00230 
00231     template <typename u16bit_iterator, typename octet_iterator>
00232     u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
00233     {
00234         while (start != end) {
00235             uint32_t cp = next(start, end);
00236             if (cp > 0xffff) { //make a surrogate pair
00237                 *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
00238                 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
00239             }
00240             else
00241                 *result++ = static_cast<uint16_t>(cp);
00242         }
00243         return result;
00244     }
00245 
/// Converts the range [start, end) of 32-bit code points to UTF-8 written
/// through `result`.
///
/// @return the output iterator after the last octet written
/// @throws whatever append() throws
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start)
        result = append(*start, result);

    return result;
}
00254 
/// Converts the UTF-8 range [start, end) to code points written through
/// `result`, one element per decoded code point.
///
/// @return the output iterator after the last element written
/// @throws whatever next() throws
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
    while (start != end) {
        const uint32_t code_point = next(start, end);
        *result = code_point;
        ++result;
    }

    return result;
}
00263 
/// Bidirectional iterator adaptor that walks an octet range one code point
/// at a time. Dereferencing decodes with next(); ++ and -- move with next()
/// and prior(), so any exception those functions throw propagates.
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
  octet_iterator it;           // current position
  octet_iterator range_start;  // lower bound handed to prior()
  octet_iterator range_end;    // upper bound handed to next()
  public:
  /// Value-initializes all three positions so that a default-constructed
  /// iterator holds well-defined values (null for raw pointers) and two of
  /// them can be compared without reading indeterminate memory.
  iterator () : it(), range_start(), range_end() {}

  /// @param octet_it        starting position
  /// @param range_start_in  first octet of the underlying range
  /// @param range_end_in    one past the last octet of the underlying range
  /// @throws std::out_of_range if octet_it lies outside [range_start_in, range_end_in]
  explicit iterator (const octet_iterator& octet_it,
                     const octet_iterator& range_start_in,
                     const octet_iterator& range_end_in) :
           it(octet_it), range_start(range_start_in), range_end(range_end_in)
  {
      if (it < range_start || it > range_end)
          throw std::out_of_range("Invalid utf-8 iterator position");
  }
  // the default "big three" are OK

  /// The underlying octet position.
  octet_iterator base () const { return it; }

  /// Decodes the code point at the current position without moving.
  uint32_t operator * () const
  {
      octet_iterator temp = it;
      return next(temp, range_end);
  }

  /// @throws std::logic_error if the two iterators were built over different ranges
  bool operator == (const iterator& rhs) const
  {
      if (range_start != rhs.range_start || range_end != rhs.range_end)
          throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
      return (it == rhs.it);
  }
  bool operator != (const iterator& rhs) const
  {
      return !(operator == (rhs));
  }

  /// Steps forward by one code point.
  iterator& operator ++ ()
  {
      next(it, range_end);
      return *this;
  }
  iterator operator ++ (int)
  {
      iterator temp = *this;
      next(it, range_end);
      return temp;
  }

  /// Steps back by one code point.
  iterator& operator -- ()
  {
      prior(it, range_start);
      return *this;
  }
  iterator operator -- (int)
  {
      iterator temp = *this;
      prior(it, range_start);
      return temp;
  }
}; // class iterator
00320 
00321 } // namespace utf8
00322 } // namespace drizzled
00323 
00324 
00325