Source for gnu.xml.pipeline.XIncludeFilter

   1: /* XIncludeFilter.java -- 
   2:    Copyright (C) 2001,2002 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: package gnu.xml.pipeline;
  39: 
  40: import java.io.IOException;
  41: import java.io.InputStream;
  42: import java.io.InputStreamReader;
  43: import java.net.URL; 
  44: import java.net.URLConnection; 
  45: import java.util.Hashtable;
  46: import java.util.Stack;
  47: import java.util.Vector;
  48: 
  49: import org.xml.sax.Attributes;
  50: import org.xml.sax.ErrorHandler;
  51: import org.xml.sax.InputSource;
  52: import org.xml.sax.Locator;
  53: import org.xml.sax.SAXException;
  54: import org.xml.sax.SAXParseException;
  55: import org.xml.sax.XMLReader;
  56: import org.xml.sax.helpers.XMLReaderFactory;
  57: 
  58: import gnu.xml.util.Resolver;
  59: 
  60: 
  61: 
  62: /**
  63:  * Filter to process an XPointer-free subset of
  64:  * <a href="http://www.w3.org/TR/xinclude">XInclude</a>, supporting its
  65:  * use as a kind of replacement for parsed general entities.
  66:  * XInclude works much like the <code>#include</code> of C/C++ but
  67:  * works for XML documents as well as unparsed text files.
  68:  * Restrictions from the 17-Sept-2002 CR draft of XInclude are as follows:
  69:  *
  70:  * <ul>
  71:  *
  72:  * <li> URIs must not include fragment identifiers.
  73:  * The CR specifies support for XPointer <em>element()</em> fragment IDs,
  74:  * which is not currently implemented here.
  75:  *
  76:  * <li> <em>xi:fallback</em> handling of resource errors is not
  77:  * currently supported.
  78:  *
  79:  * <li> DTDs are not supported in included files, since the SAX DTD events
  80:  * must have completely preceded any included file. 
  81:  * The CR explicitly allows the DTD related portions of the infoset to
  82:  * grow as an effect of including XML documents.
  83:  *
  84:  * <li> <em>xml:base</em> fixup isn't done.
  85:  *
  86:  * </ul>
  87:  *
  88:  * <p> XML documents that are included will normally be processed using
  89:  * the default SAX namespace rules, meaning that prefix information may
  90:  * be discarded.  This may be changed with {@link #setSavingPrefixes
  91:  * setSavingPrefixes()}.  <em>You are strongly advised to do this.</em>
  92:  *
  93:  * <p> Note that XInclude allows highly incompatible implementations, which
  94:  * are specialized to handle application-specific infoset extensions.  Some
  95:  * such implementations can be implemented by subclassing this one, but
  96:  * they may only be substituted in applications at "user option".
  97:  *
  98:  * <p>TBD: "IURI" handling.
  99:  *
 100:  * @author David Brownell
 101:  */
 102: public class XIncludeFilter extends EventFilter implements Locator
 103: {
 104:     private Hashtable        extEntities = new Hashtable (5, 5);
 105:     private int            ignoreCount;
 106:     private Stack        uris = new Stack ();
 107:     private Locator        locator;
 108:     private Vector        inclusions = new Vector (5, 5);
 109:     private boolean        savingPrefixes;
 110: 
 111:     /**
 112:      */
 113:     public XIncludeFilter (EventConsumer next)
 114:     throws SAXException
 115:     {
 116:     super (next);
 117:     setContentHandler (this);
 118:     // DTDHandler callbacks pass straight through
 119:     setProperty (DECL_HANDLER, this);
 120:     setProperty (LEXICAL_HANDLER, this);
 121:     }
 122: 
 123:     private void fatal (SAXParseException e) throws SAXException
 124:     {
 125:     ErrorHandler        eh;
 126:     
 127:     eh = getErrorHandler ();
 128:     if (eh != null)
 129:         eh.fatalError (e);
 130:     throw e;
 131:     }
 132: 
 133:     /**
 134:      * Passes "this" down the filter chain as a proxy locator.
 135:      */
 136:     public void setDocumentLocator (Locator locator)
 137:     {
 138:     this.locator = locator;
 139:     super.setDocumentLocator (this);
 140:     }
 141: 
 142:     /** Used for proxy locator; do not call directly. */
 143:     public String getSystemId ()
 144:     { return (locator == null) ? null : locator.getSystemId (); }
 145:     /** Used for proxy locator; do not call directly. */
 146:     public String getPublicId ()
 147:     { return (locator == null) ? null : locator.getPublicId (); }
 148:     /** Used for proxy locator; do not call directly. */
 149:     public int getLineNumber ()
 150:     { return (locator == null) ? -1 : locator.getLineNumber (); }
 151:     /** Used for proxy locator; do not call directly. */
 152:     public int getColumnNumber ()
 153:     { return (locator == null) ? -1 : locator.getColumnNumber (); }
 154: 
 155:     /**
 156:      * Assigns the flag controlling the setting of the SAX2
 157:      * <em>namespace-prefixes</em> flag.
 158:      */
 159:     public void setSavingPrefixes (boolean flag)
 160:     { savingPrefixes = flag; }
 161: 
 162:     /**
 163:      * Returns the flag controlling the setting of the SAX2
 164:      * <em>namespace-prefixes</em> flag when parsing included documents.
 165:      * The default value is the SAX2 default (false), which discards
 166:      * information that can be useful.
 167:      */
 168:     public boolean isSavingPrefixes ()
 169:     { return savingPrefixes; }
 170: 
 171:     //
 172:     // Two mechanisms are interacting here.
 173:     // 
 174:     //    - XML Base implies a stack of base URIs, updated both by
 175:     //      "real entity" boundaries and element boundaries.
 176:     //
 177:     //    - Active "Real Entities" (for document and general entities,
 178:     //      and by xincluded files) are tracked to prevent circular
 179:     //      inclusions.
 180:     //
 181:     private String addMarker (String uri)
 182:     throws SAXException
 183:     {
 184:     if (locator != null && locator.getSystemId () != null)
 185:         uri = locator.getSystemId ();
 186: 
 187:     // guard against InputSource objects without system IDs
 188:     if (uri == null)
 189:         fatal (new SAXParseException ("Entity URI is unknown", locator));
 190: 
 191:     try {
 192:         URL    url = new URL (uri);
 193: 
 194:         uri = url.toString ();
 195:         if (inclusions.contains (uri))
 196:         fatal (new SAXParseException (
 197:             "XInclude, circular inclusion", locator));
 198:         inclusions.addElement (uri);
 199:         uris.push (url);
 200:     } catch (IOException e) {
 201:         // guard against illegal relative URIs (Xerces)
 202:         fatal (new SAXParseException ("parser bug: relative URI",
 203:         locator, e));
 204:     }
 205:     return uri;
 206:     }
 207: 
 208:     private void pop (String uri)
 209:     {
 210:     inclusions.removeElement (uri);
 211:     uris.pop ();
 212:     }
 213: 
 214:     //
 215:     // Document entity boundaries get both treatments.
 216:     //
 217:     public void startDocument () throws SAXException
 218:     {
 219:     ignoreCount = 0;
 220:     addMarker (null);
 221:     super.startDocument ();
 222:     }
 223: 
 224:     public void endDocument () throws SAXException
 225:     {
 226:     inclusions.setSize (0);
 227:     extEntities.clear ();
 228:     uris.setSize (0);
 229:     super.endDocument ();
 230:     }
 231: 
 232:     //
 233:     // External general entity boundaries get both treatments.
 234:     //
 235:     public void externalEntityDecl (String name,
 236:         String publicId, String systemId)
 237:     throws SAXException
 238:     {
 239:     if (name.charAt (0) == '%')
 240:         return;
 241:     try {
 242:         URL    url = new URL (locator.getSystemId ());
 243:         systemId = new URL (url, systemId).toString ();
 244:     } catch (IOException e) {
 245:         // what could we do?
 246:     }
 247:     extEntities.put (name, systemId);
 248:     }
 249: 
 250:     public void startEntity (String name)
 251:     throws SAXException
 252:     {
 253:     if (ignoreCount != 0) {
 254:         ignoreCount++;
 255:         return;
 256:     }
 257: 
 258:     String    uri = (String) extEntities.get (name);
 259:     if (uri != null)
 260:         addMarker (uri);
 261:     super.startEntity (name);
 262:     }
 263: 
 264:     public void endEntity (String name)
 265:     throws SAXException
 266:     {
 267:     if (ignoreCount != 0) {
 268:         if (--ignoreCount != 0)
 269:         return;
 270:     }
 271: 
 272:     String    uri = (String) extEntities.get (name);
 273: 
 274:     if (uri != null)
 275:         pop (uri);
 276:     super.endEntity (name);
 277:     }
 278:     
 279:     //
 280:     // element boundaries only affect the base URI stack,
 281:     // unless they're XInclude elements.
 282:     //
 283:     public void
 284:     startElement (String uri, String localName, String qName, Attributes atts)
 285:     throws SAXException
 286:     {
 287:     if (ignoreCount != 0) {
 288:         ignoreCount++;
 289:         return;
 290:     }
 291: 
 292:     URL    baseURI = (URL) uris.peek ();
 293:     String    base;
 294: 
 295:     base = atts.getValue ("http://www.w3.org/XML/1998/namespace", "base");
 296:     if (base == null)
 297:         uris.push (baseURI);
 298:     else {
 299:         URL        url;
 300: 
 301:         if (base.indexOf ('#') != -1)
 302:         fatal (new SAXParseException (
 303:             "xml:base with fragment: " + base,
 304:             locator));
 305: 
 306:         try {
 307:         baseURI = new URL (baseURI, base);
 308:         uris.push (baseURI);
 309:         } catch (Exception e) {
 310:         fatal (new SAXParseException (
 311:             "xml:base with illegal uri: " + base,
 312:             locator, e));
 313:         }
 314:     }
 315: 
 316:     if (!"http://www.w3.org/2001/XInclude".equals (uri)) {
 317:         super.startElement (uri, localName, qName, atts);
 318:         return;
 319:     }
 320: 
 321:     if ("include".equals (localName)) {
 322:         String    href = atts.getValue ("href");
 323:         String    parse = atts.getValue ("parse");
 324:         String    encoding = atts.getValue ("encoding");
 325:         URL        url = (URL) uris.peek ();
 326:         SAXParseException    x = null;
 327: 
 328:         if (href == null)
 329:         fatal (new SAXParseException (
 330:             "XInclude missing href",
 331:             locator));
 332:         if (href.indexOf ('#') != -1)
 333:         fatal (new SAXParseException (
 334:             "XInclude with fragment: " + href,
 335:             locator));
 336: 
 337:         if (parse == null || "xml".equals (parse))
 338:         x = xinclude (url, href);
 339:         else if ("text".equals (parse))
 340:         x = readText (url, href, encoding);
 341:         else
 342:         fatal (new SAXParseException (
 343:             "unknown XInclude parsing mode: " + parse,
 344:             locator));
 345:         if (x == null) {
 346:         // strip out all child content
 347:         ignoreCount++;
 348:         return;
 349:         }
 350: 
 351:         // FIXME the 17-Sept-2002 CR of XInclude says we "must"
 352:         // use xi:fallback elements to handle resource errors,
 353:         // if they exist.
 354:         fatal (x);
 355: 
 356:     } else if ("fallback".equals (localName)) {
 357:         fatal (new SAXParseException (
 358:         "illegal top level XInclude 'fallback' element",
 359:         locator));
 360:     } else {
 361:         ErrorHandler    eh = getErrorHandler ();
 362: 
 363:         // CR doesn't say this is an error
 364:         if (eh != null)
 365:         eh.warning (new SAXParseException (
 366:             "unrecognized toplevel XInclude element: " + localName,
 367:             locator));
 368:         super.startElement (uri, localName, qName, atts);
 369:     }
 370:     }
 371: 
 372:     public void endElement (String uri, String localName, String qName)
 373:     throws SAXException
 374:     {
 375:     if (ignoreCount != 0) {
 376:         if (--ignoreCount != 0)
 377:         return;
 378:     }
 379: 
 380:     uris.pop ();
 381:     if (!("http://www.w3.org/2001/XInclude".equals (uri)
 382:         && "include".equals (localName)))
 383:         super.endElement (uri, localName, qName);
 384:     }
 385: 
 386:     //
 387:     // ignore all content within non-empty xi:include elements
 388:     //
 389:     public void characters (char ch [], int start, int length)
 390:     throws SAXException
 391:     {
 392:     if (ignoreCount == 0)
 393:         super.characters (ch, start, length);
 394:     }
 395: 
 396:     public void processingInstruction (String target, String value)
 397:     throws SAXException
 398:     {
 399:     if (ignoreCount == 0)
 400:         super.processingInstruction (target, value);
 401:     }
 402: 
 403:     public void ignorableWhitespace (char ch [], int start, int length)
 404:     throws SAXException
 405:     {
 406:     if (ignoreCount == 0)
 407:         super.ignorableWhitespace (ch, start, length);
 408:     }
 409: 
 410:     public void comment (char ch [], int start, int length)
 411:     throws SAXException
 412:     {
 413:     if (ignoreCount == 0)
 414:         super.comment (ch, start, length);
 415:     }
 416: 
 417:     public void startCDATA () throws SAXException
 418:     {
 419:     if (ignoreCount == 0)
 420:         super.startCDATA ();
 421:     }
 422: 
 423:     public void endCDATA () throws SAXException
 424:     {
 425:     if (ignoreCount == 0)
 426:         super.endCDATA ();
 427:     }
 428: 
 429:     public void startPrefixMapping (String prefix, String uri)
 430:     throws SAXException
 431:     {
 432:     if (ignoreCount == 0)
 433:         super.startPrefixMapping (prefix, uri);
 434:     }
 435: 
 436:     public void endPrefixMapping (String prefix) throws SAXException
 437:     {
 438:     if (ignoreCount == 0)
 439:         super.endPrefixMapping (prefix);
 440:     }
 441: 
 442:     public void skippedEntity (String name) throws SAXException
 443:     {
 444:     if (ignoreCount == 0)
 445:         super.skippedEntity (name);
 446:     }
 447: 
 448:     // JDK 1.1 seems to need it to be done this way, sigh
 449:     void setLocator (Locator l) { locator = l; }
 450:     Locator getLocator () { return locator; }
 451:     
 452: 
 453:     //
 454:     // for XIncluded entities, manage the current locator and
 455:     // filter out events that would be incorrect to report
 456:     //
 457:     private class Scrubber extends EventFilter
 458:     {
 459:     Scrubber (EventFilter f)
 460:     throws SAXException
 461:     {
 462:         // delegation passes to next in chain
 463:         super (f);
 464: 
 465:         // process all content events
 466:         super.setContentHandler (this);
 467:         super.setProperty (LEXICAL_HANDLER, this);
 468: 
 469:         // drop all DTD events
 470:         super.setDTDHandler (null);
 471:         super.setProperty (DECL_HANDLER, null);
 472:     }
 473: 
 474:     // maintain proxy locator
 475:     // only one startDocument()/endDocument() pair per event stream
 476:     public void setDocumentLocator (Locator l)
 477:         { setLocator (l); }
 478:     public void startDocument ()
 479:         { }
 480:     public void endDocument ()
 481:         { }
 482:     
 483:     private void reject (String message) throws SAXException
 484:         { fatal (new SAXParseException (message, getLocator ())); }
 485:     
 486:     // only the DTD from the "base document" gets reported
 487:     public void startDTD (String root, String publicId, String systemId)
 488:     throws SAXException
 489:         { reject ("XIncluded DTD: " + systemId); }
 490:     public void endDTD ()
 491:     throws SAXException
 492:         { reject ("XIncluded DTD"); }
 493:     // ... so this should never happen
 494:     public void skippedEntity (String name) throws SAXException
 495:         { reject ("XInclude skipped entity: " + name); }
 496: 
 497:     // since we rejected DTDs, only builtin entities can be reported
 498:     }
 499: 
 500:     // <xi:include parse='xml' ...>
 501:     // relative to the base URI passed
 502:     private SAXParseException xinclude (URL url, String href)
 503:     throws SAXException
 504:     {
 505:     XMLReader    helper;
 506:     Scrubber    scrubber;
 507:     Locator        savedLocator = locator;
 508: 
 509:     // start with a parser acting just like our input
 510:     // modulo DTD-ish stuff (validation flag, entity resolver)
 511:     helper = XMLReaderFactory.createXMLReader ();
 512:     helper.setErrorHandler (getErrorHandler ());
 513:     helper.setFeature (FEATURE_URI + "namespace-prefixes", true);
 514: 
 515:     // Set up the proxy locator and event filter.
 516:     scrubber = new Scrubber (this);
 517:     locator = null;
 518:     bind (helper, scrubber);
 519: 
 520:     // Merge the included document, except its DTD
 521:     try {
 522:         url = new URL (url, href);
 523:         href = url.toString ();
 524: 
 525:         if (inclusions.contains (href))
 526:         fatal (new SAXParseException (
 527:             "XInclude, circular inclusion", locator));
 528: 
 529:         inclusions.addElement (href);
 530:         uris.push (url);
 531:         helper.parse (new InputSource (href));
 532:         return null;
 533:     } catch (java.io.IOException e) {
 534:         return new SAXParseException (href, locator, e);
 535:     } finally {
 536:         pop (href);
 537:         locator = savedLocator;
 538:     }
 539:     }
 540: 
 541:     // <xi:include parse='text' ...>
 542:     // relative to the base URI passed
 543:     private SAXParseException readText (URL url, String href, String encoding)
 544:     throws SAXException
 545:     {
 546:     InputStream    in = null;
 547: 
 548:     try {
 549:         URLConnection    conn;
 550:         InputStreamReader    reader;
 551:         char        buf [] = new char [4096];
 552:         int            count;
 553: 
 554:         url = new URL (url, href);
 555:         conn = url.openConnection ();
 556:         in = conn.getInputStream ();
 557:         if (encoding == null)
 558:         encoding = Resolver.getEncoding (conn.getContentType ());
 559:         if (encoding == null) {
 560:         ErrorHandler    eh = getErrorHandler ();
 561:         if (eh != null)
 562:             eh.warning (new SAXParseException (
 563:             "guessing text encoding for URL: " + url,
 564:             locator));
 565:         reader = new InputStreamReader (in);
 566:         } else
 567:         reader = new InputStreamReader (in, encoding);
 568: 
 569:         while ((count = reader.read (buf, 0, buf.length)) != -1)
 570:         super.characters (buf, 0, count);
 571:         in.close ();
 572:         return null;
 573:     } catch (IOException e) {
 574:         return new SAXParseException (
 575:         "can't XInclude text",
 576:         locator, e);
 577:     }
 578:     }
 579: }