Drizzled Public API Documentation

CSXML.h
00001 /* Copyright (C) 2010 PrimeBase Technologies GmbH, Germany
00002  *
00003  * PrimeBase Media Stream for MySQL
00004  *
00005  * This program is free software; you can redistribute it and/or modify
00006  * it under the terms of the GNU General Public License as published by
00007  * the Free Software Foundation; either version 2 of the License, or
00008  * (at your option) any later version.
00009  *
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  *
00015  * You should have received a copy of the GNU General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
00018  *
00019  * Paul McCullagh (H&G2JCtL)
00020  *
00021  * 2010-01-12
00022  *
00023  * CORE SYSTEM:
00024  * XML Parsing
00025  *
00026  */
00027 
00028 #include <inttypes.h>
00029 #include <wchar.h>
00030 #include <stdio.h>
00031 #pragma once
00032 #ifndef __CSXML_H__
00033 #define __CSXML_H__
00034 /* Error codes, end-of-input sentinel and fixed buffer sizes used by the XML classes in this header. */
00035 #define CS_XML_ERR_OUT_OF_MEMORY    -1
00036 #define CS_XML_ERR_CHAR_TOO_LARGE   -2
00037 
00038 #define CS_XML_EOF_CHAR         WCHAR_MAX   /* End-of-input marker, see CSXML::getChar(). */
00039 
00040 #define CS_MAX_XML_NAME_SIZE      48    /* Size of the charset, pr_tag, pr_name, pr_value and entity buffers. */
00041 #define CS_XML_ERR_MSG_SIZE       128   /* Size of the err_message buffer. */
00042 
00043 /* pxml.h 23.3.01 Paul McCullagh */
00044 /* Parse XML */
00045 /* Entities understood by XML:
00046    &gt;   (>)
00047    &lt;   (<)
00048    &amp;  (&)
00049    &apos; (')
00050    &quot; (")
00051 
00052    Processing Instructions    <? ... ?>
00053    CDATA Sections       <![CDATA[ ... ]]>
00054    Document Type Definition   <!DOCTYPE ... [ ...markup... ] >
00055    Conditional Sections     <![ ... [ ...markup... ]]>
00056  */
00057 /* Parser state codes. NOTE(review): presumably the values held in CSXMLParser::state - confirm against the implementation. */
00058 #define XML_BEFORE_CDATA        0   /* XXX */
00059 #define XML_IN_CDATA          1   /* XXX */
00060 
00061 #define XML_LT              2   /* < */
00062 #define XML_LT_BANG           3   /* <! */
00063 #define XML_LT_BANG_DASH        4   /* <!- */
00064 #define XML_LT_BANG_SQR         5   /* <![ */
00065 #define XML_LT_BANG_SQR_IN_NAME     6   /* NOTE(review): presumably inside the name that follows <![ - confirm */
00066 #define XML_LT_BANG_SQR_AFTER_NAME    7   /* NOTE(review): presumably after the name that follows <![ - confirm */
00067 
00068 #define XML_IN_TAG_NAME         8   /* abc */
00069 
00070 #define XML_BEFORE_ATTR         9   /* ' ' */
00071 #define XML_IN_ATTR           10    /* xyz */
00072 
00073 #define XML_BEFORE_EQUAL        11    /* ' ' */
00074 #define XML_AFTER_EQUAL         12    /* ' ' */
00075 
00076 #define XML_QUOTE_BEFORE_VALUE      13    /* " or ' */
00077 #define XML_IN_VALUE          14    /* ... */
00078 #define XML_QUOTE_AFTER_VALUE     15    /* " or ' */
00079 
00080 #define XML_SLASH           16    /* / */
00081 #define XML_QMARK           17    /* ? */
00082 #define XML_SQR             18    /* ] */
00083 
00084 #define XML_IN_COMMENT          19    /* <!--... */
00085 #define XML_IN_COMMENT_DASH       20    /* - */
00086 #define XML_IN_COMMENT_DASH_DASH    21    /* -- */
00087 #define XML_IN_COMMENT_3_DASH     22    /* --- */
00088 
00089 #define XML_IN_CDATA_TAG        23    /* <![CDATA[... */
00090 #define XML_IN_CDATA_TAG_SQR      24    /* ] */
00091 #define XML_IN_CDATA_TAG_SQR_SQR    25    /* ]] */
00092 #define XML_IN_CDATA_TAG_3_SQR      26    /* ]]] */
00093 /* Sizes of the CSXMLParser work areas, helper macros and step codes. */
00094 #define PARSE_BUFFER_SIZE       20    /* Number of wchar_t elements in CSXMLParser::buffer. */
00095 #define PARSE_STACK_SIZE        200   /* Number of entries in the CSXMLParser::end_type stack. */
00096 /* END_TAG_TYPE(x): end-tag type recorded for the current nesting level of parser x; yields XML_OP_1_END_UNKNOWN_TAG when nesting is outside 1..PARSE_STACK_SIZE, so end_type[] is never indexed out of range. */
00097 #define END_TAG_TYPE(x)         ((x)->nesting >= 1 && (x)->nesting <= PARSE_STACK_SIZE ? (x)->end_type[(x)->nesting-1] : XML_OP_1_END_UNKNOWN_TAG)
00098 
00099 #define TO_LONG_CHAR(ch)        ((unsigned char) (ch))   /* Converts ch to unsigned char so it cannot be sign-extended when widened. */
00100 /* Step codes. NOTE(review): presumably the values held in CSXMLParser::step - confirm against the implementation. */
00101 #define XML_STEP_NONE         0
00102 #define XML_STEP_TAG          1
00103 #define XML_STEP_ATTR         2
00104 #define XML_STEP_VALUE          3
00105 #define XML_STEP_NESTED         4
00106 
00107 class CSXMLParser {   /* Character-at-a-time XML parser state; the non-inline methods are only declared in this header. */
00108   public:
00109   CSXMLParser() :
00110     state(0),
00111     quote(0),
00112     step(0),
00113     type(0),
00114     count(0),
00115     nesting(0) {
00116   }   /* Zeroes the scalar members; buffer[] and end_type[] are not set here. */
00117   virtual ~CSXMLParser() { }
00118   /* NOTE(review): parseChar() presumably returns a combination of the XML_OP_1_*, XML_DATA_* and XML_OP_2_* codes - confirm. */
00119   int32_t parseChar(wchar_t ch);
00120   void setDataType(int32_t t) { type = t; }   /* Overwrites the recorded data type. */
00121   int32_t getDataLen() { return count; }   /* Returns count. */
00122   wchar_t *getDataPtr() { return buffer; }   /* Returns a pointer to the internal buffer (not a copy). */
00123 
00124   private:
00125   /* Internal information: */
00126   int32_t     state;
00127   int32_t     quote;
00128   int32_t     step;
00129 
00130   /* Data: output is always in the buffer: */
00131   int32_t     type;             /* Type of data in the buffer. */
00132   int32_t     count;              /* Size of the buffer.  */
00133   wchar_t     buffer[PARSE_BUFFER_SIZE];    /* Contains data to be added. */
00134 
00135   /* Signals: tag start and end: */
00136   int32_t     nesting;            /* Tag nesting depth. */
00137   uint8_t     end_type[PARSE_STACK_SIZE];   /* Stack of tag types */
00138   /* Private helpers, defined outside this header. */
00139   bool match_string(const char *ch);
00140   void increment_nesting(wchar_t ch);
00141 };
00142 /* Instruction encoding: bits 0-3 hold operation 1, bits 4-7 the data kind, bits 8-11 operation 2, bit 12 the error flag. */
00143 #define XML_OP_1_MASK         0x0000000F
00144 #define XML_ERROR           0x00001000
00145 /* Operation 1 codes. */
00146 #define XML_OP_1_NOOP         0x00000000
00147 #define XML_OP_1_END_TAG        0x00000001    /* < ... >   */
00148 #define XML_OP_1_END_CLOSE_TAG      0x00000002    /* </ ... >  */
00149 #define XML_OP_1_END_EMPTY_TAG      0x00000003    /* < ... />  */
00150 #define XML_OP_1_END_PI_TAG       0x00000004    /* <? ... ?> */
00151 #define XML_OP_1_END_ENTITY_TAG     0x00000005    /* <! ... >  */
00152 #define XML_OP_1_END_BRACKET_TAG    0x00000006    /* <![ ... ]> */
00153 #define XML_OP_1_END_UNKNOWN_TAG    0x00000007    /* <_ ... > */
00154 #define XML_OP_1_START_CDATA_TAG    0x00000008    /* <![CDATA[ ... */
00155 #define XML_OP_1_START_COMMENT      0x00000009    /* <!-- ... */
00156 #define XML_OP_1_START_TAG        0x0000000A    /* <... */
00157 #define XML_OP_1_ADD_ATTR       0x0000000B
00158 #define XML_OP_1_END_CDATA        0x0000000C
00159 #define XML_OP_1_END_CDATA_TAG      0x0000000D    /* ... ]]> */
00160 #define XML_OP_1_END_COMMENT      0x0000000E    /* ... --> */
00161 
00162 #define XML_DATA_MASK         0x000000F0
00163 /* Data kind codes. */
00164 #define XML_NO_DATA           0x00000000
00165 #define XML_DATA_TAG          0x00000010
00166 #define XML_DATA_ATTR         0x00000020
00167 #define XML_DATA_CDATA          0x00000030
00168 #define XML_DATA_CDATA_TAG        0x00000040
00169 #define XML_COMMENT           0x00000050
00170 #define XML_DATA_VALUE          0x00000060
00171 
00172 #define XML_OP_2_MASK         0x00000F00
00173 /* Operation 2 codes: each equals the operation 1 code of the same name shifted left by 8 bits. */
00174 #define XML_OP_2_NOOP         0x00000000
00175 #define XML_OP_2_END_TAG        0x00000100
00176 #define XML_OP_2_END_CLOSE_TAG      0x00000200
00177 #define XML_OP_2_END_EMPTY_TAG      0x00000300
00178 #define XML_OP_2_END_PI_TAG       0x00000400
00179 #define XML_OP_2_END_ENTITY_TAG     0x00000500
00180 #define XML_OP_2_END_BRACKET_TAG    0x00000600
00181 #define XML_OP_2_END_UNKNOWN_TAG    0x00000700
00182 #define XML_OP_2_START_CDATA_TAG    0x00000800
00183 #define XML_OP_2_START_COMMENT      0x00000900
00184 /* Composite instruction values. In each name the lower-case parts are operations and an upper-case *_CH part is the data kind. */
00185 #define XML_noop            (XML_OP_2_NOOP|XML_NO_DATA)
00186 
00187 #define XML_CDATA_CH          (XML_DATA_CDATA)
00188 #define XML_end_cdata_TAG_CH      (XML_OP_1_END_CDATA|XML_DATA_TAG)
00189 #define XML_start_tag_TAG_CH      (XML_OP_1_START_TAG|XML_DATA_TAG)
00190 #define XML_add_attr_TAG_CH       (XML_OP_1_ADD_ATTR|XML_DATA_TAG)
00191 #define XML_TAG_CH            (XML_DATA_TAG)
00192 #define XML_start_tag_ATTR_CH     (XML_OP_1_START_TAG|XML_DATA_ATTR)
00193 #define XML_add_attr_ATTR_CH      (XML_OP_1_ADD_ATTR|XML_DATA_ATTR)
00194 #define XML_ATTR_CH           (XML_DATA_ATTR)
00195 #define XML_start_tag_VALUE_CH      (XML_OP_1_START_TAG|XML_DATA_VALUE)
00196 #define XML_add_attr_VALUE_CH     (XML_OP_1_ADD_ATTR|XML_DATA_VALUE)
00197 #define XML_VALUE_CH          (XML_DATA_VALUE)
00198 #define XML_start_tag_end_tag(x)    (XML_OP_1_START_TAG|((x) << 8))   /* x is an XML_OP_1_END_* code, moved into the operation 2 field. */
00199 #define XML_add_attr_end_tag(x)     (XML_OP_1_ADD_ATTR|((x) << 8))   /* x is an XML_OP_1_END_* code, moved into the operation 2 field. */
00200 #define XML_end_tag(x)          (x)   /* The end-tag code alone, as operation 1. */
00201 #define XML_start_tag_end_empty_tag   XML_start_tag_end_tag(XML_OP_1_END_EMPTY_TAG)
00202 #define XML_add_attr_end_empty_tag    XML_add_attr_end_tag(XML_OP_1_END_EMPTY_TAG)
00203 #define XML_end_empty_tag       XML_end_tag(XML_OP_1_END_EMPTY_TAG)
00204 #define XML_start_tag_end_pi_tag    XML_start_tag_end_tag(XML_OP_1_END_PI_TAG)
00205 #define XML_add_attr_end_pi_tag     XML_add_attr_end_tag(XML_OP_1_END_PI_TAG)
00206 #define XML_end_pi_tag          XML_end_tag(XML_OP_1_END_PI_TAG)
00207 /* CDATA tag combinations. */
00208 #define XML_end_cdata_start_cdata_tag (XML_OP_1_END_CDATA|XML_OP_2_START_CDATA_TAG)
00209 #define XML_start_tag_start_cdata_tag (XML_OP_1_START_TAG|XML_OP_2_START_CDATA_TAG)
00210 #define XML_add_attr_start_cdata_tag  (XML_OP_1_ADD_ATTR|XML_OP_2_START_CDATA_TAG)
00211 #define XML_start_cdata_tag       (XML_OP_1_START_CDATA_TAG)
00212 #define XML_CDATA_TAG_CH        (XML_DATA_CDATA_TAG)
00213 #define XML_end_cdata_tag       (XML_OP_1_END_CDATA_TAG)
00214 /* Comment combinations. */
00215 #define XML_end_cdata_start_comment   (XML_OP_1_END_CDATA|XML_OP_2_START_COMMENT)
00216 #define XML_start_tag_start_comment   (XML_OP_1_START_TAG|XML_OP_2_START_COMMENT)
00217 #define XML_add_attr_start_comment    (XML_OP_1_ADD_ATTR|XML_OP_2_START_COMMENT)
00218 #define XML_start_comment       (XML_OP_1_START_COMMENT)
00219 #define XML_COMMENT_CH          (XML_COMMENT)
00220 #define XML_end_comment         (XML_OP_1_END_COMMENT)
00221 
00222 /* Standard charsets are ISO-8859-1, US-ASCII or UNICODE. None
00223  * require conversion!
00224  */
00225 #define CHARSET_STANDARD        0
00226 #define CHARSET_UTF_8         1   /* NOTE(review): presumably selects multi-byte UTF-8 decoding - confirm */
00227 #define CHARSET_TO_CONVERT_8_BIT    2   /* NOTE(review): presumably selects mapping through CSXMLProcessor::conversion_table - confirm */
00228 
00229 class CSXMLProcessor : public CSXMLParser {
00230   public:
00231   CSXMLProcessor() :
00232     err_no(0),
00233     ip(false),
00234     tlength(0),
00235     nlength(0),
00236     vlength(0),
00237     utf8_count(0),
00238     utf8_length(0),
00239     elength(0) {
00240     err_message[0] = 0;
00241     charset[0] = 0;
00242     pr_tag[0] = 0;
00243     pr_name[0] = 0;
00244     pr_value[0] = 0;
00245     utf8_buffer[0] = 0;
00246     entity[0] = 0;
00247   }
00248   virtual ~CSXMLProcessor() { }
00249 
00250   /* This function processes a UNICODE character from an XML
00251    * document returns parsing instructions (operations).
00252    * Each instruction can consist of up to 3 operations. The
00253    * operations must be executed in the following order:
00254    * - Operation 1
00255    * - Data operation, record one of the following:
00256    *   - part of a tag name
00257    *   - part of an attribute name
00258    *   - part of an attribute value
00259    *   - part of CDATA
00260    * - Operation 2
00261    * Output for the data operation (if any) is placed in the buffer
00262    * in the state structure. The input state structure must be zeroed
00263    * before processing begins. Input characters may be 1 byte or
00264    * 2 byte. Output is always 2-byte UNICODE.
00265    */
00266   int32_t processChar(wchar_t ch);
00267 
00268   bool getError(int32_t *err, char **msg);
00269   void setError(int32_t err, char *msg);
00270   void printError(char *prefix);
00271 
00272   private:
00273   int32_t     err_no;
00274   char      err_message[CS_XML_ERR_MSG_SIZE];
00275 
00276   private:
00277   /* When this function is called, use the name of the charset.
00278    * to build the conversion table which maps characters in the
00279    * range 128 to 255 to the unicode eqivalent.
00280    */
00281   virtual bool buildConversionTable();
00282 
00283   int32_t     charset_type;
00284   char      charset[CS_MAX_XML_NAME_SIZE];
00285   wchar_t     conversion_table[128];
00286 
00287   bool      ip;
00288   size_t      tlength;
00289   char      pr_tag[CS_MAX_XML_NAME_SIZE];
00290   size_t      nlength;
00291   char      pr_name[CS_MAX_XML_NAME_SIZE];
00292   size_t      vlength;
00293   char      pr_value[CS_MAX_XML_NAME_SIZE];
00294 
00295   int32_t     utf8_count;
00296   int32_t     utf8_length;
00297   uint32_t    utf8_buffer[6];
00298 
00299   int32_t     elength;
00300   char      entity[CS_MAX_XML_NAME_SIZE];
00301 
00302   int32_t capture_initializer(wchar_t ch);
00303   int32_t entity_translator(wchar_t ch);
00304   int32_t charset_transformer(wchar_t ch);
00305   void appendWCharToString(char *dstr, size_t *dlen, size_t dsize, wchar_t *schars, size_t slen);
00306 };
00307 
00308 /* path is a / separated list of nodes to date. */
00309 /* Name and path are given in lower-case!!! */
00310 
00311 #define XML_KEEP_EMPTY_CDATA  1   /* NOTE(review): presumably a bit for the flags argument of CSXML::parseXML() - confirm */
00312 
00313 class CSXMLString {   /* String holder with public pointer, length and size fields; all methods are defined outside this header. */
00314   public:
00315   CSXMLString() : stringPtr(NULL), stringLen(0), stringSize(0) {}   /* Starts with a null pointer and zero length and size. */
00316   virtual ~CSXMLString() { }   /* Empty body: stringPtr is not released here. */
00317 
00318   public:
00319   bool addChar(char ch, CSXMLProcessor *xml);
00320   bool addChars(size_t size, wchar_t *buffer, bool to_lower, CSXMLProcessor *xml);
00321   bool addString(const char *string, CSXMLProcessor *xml);
00322   void setEmpty();
00323   void setNull();
00324   char *lastComponent();
00325   char *findTrailingComponent(const char *comp);
00326   void truncate(char *ptr);
00327   /* NOTE(review): stringLen is presumably the used length and stringSize the allocated capacity - confirm. */
00328   char      *stringPtr;
00329   size_t      stringLen;
00330   size_t      stringSize;
00331 };
00332 
00333 class CSXML : public CSXMLProcessor {   /* Abstract parser driver: subclasses supply the character source and the node callbacks. */
00334   public:
00335   bool parseXML(int32_t flags);
00336 
00337   private:
00338   /*
00339    * Return CS_XML_EOF_CHAR when there are no more characters.
00340    * NOTE(review): the return type is bool, so the character is presumably stored through ch - confirm. */
00341   virtual bool getChar(wchar_t *ch) = 0;
00342 
00343   /*
00344    * These methods are called as the input data
00345    * is parsed.
00346    */
00347   virtual bool openNode(char *path, char *value) = 0;
00348   virtual bool closeNode(char *path) = 0;
00349   virtual bool addAttribute(char *path, char *name, char *value) = 0;
00350 
00351   private:
00352   uint32_t    flags;   /* No constructor in this class sets this member. */
00353 
00354   CSXMLString   xml_path;
00355   CSXMLString   xml_name;
00356   CSXMLString   xml_value;
00357   /* Private helpers; defined outside this header. */
00358   int32_t nodeType(char *name);
00359   bool internalCloseNode(const char *name, bool single);
00360   bool internalOpenNode(const char *name);
00361 };
00362 
00363 class CSXMLPrint : public CSXML {   /* Overrides the three node callbacks of CSXML; getChar() is not overridden, so the class stays abstract. */
00364   private:
00365   virtual bool openNode(char *path, char *value);   /* NOTE(review): presumably prints the node - confirm */
00366   virtual bool closeNode(char *path);
00367   virtual bool addAttribute(char *path, char *name, char *value);
00368 };
00369 
00370 class CSXMLBuffer : public CSXMLPrint {   /* Parser whose input is character data held in memory. */
00371   public:
00372   bool parseString(const char *data, int32_t flags);
00373   bool parseData(const char *data, size_t len, int32_t flags);
00374 
00375   private:
00376   virtual bool getChar(wchar_t *ch);   /* Overrides the pure virtual CSXML::getChar(). */
00377 
00378   private:
00379   const char    *charData;   /* No constructor in this class sets these members; NOTE(review): presumably assigned by parseString()/parseData() - confirm. */
00380   size_t      dataLen;
00381   size_t      dataPos;
00382 };
00383 
00384 class CSXMLFile : public CSXMLPrint {   /* Parser whose input is a named file. */
00385   public:
00386   bool parseFile(char *file_name, int32_t flags);
00387 
00388   private:
00389   virtual bool getChar(wchar_t *ch);   /* Overrides the pure virtual CSXML::getChar(). */
00390 
00391   private:
00392   char      *fileName;   /* No constructor in this class sets these members; NOTE(review): presumably assigned by parseFile() - confirm. */
00393   FILE      *file;
00394 };
00395 
00396 #endif