Class HTML5::HTMLParser
In: lib/feed_tools/vendor/html5/lib/html5/html5parser.rb
Parent: Object
Phase XmlElementPhase InTablePhase RootElementPhase InHeadPhase AfterHeadPhase AfterFramesetPhase XmlRootPhase InitialPhase InTableBodyPhase InFramesetPhase InColumnGroupPhase InCaptionPhase TrailingEndPhase InSelectPhase BeforeHeadPhase InCellPhase InBodyPhase AfterBodyPhase InRowPhase Exception SerializeError EOF AssertionError ParseError HTMLSanitizer HTMLTokenizer XhmlRootPhase XMLParser XHTMLParser HTMLParser String EncodingBytes HTMLSerializer XHTMLSerializer TreeWalkers::Base NonRecursiveTreeWalker TreeWalker TreeWalker Base TreeWalker Element DocumentFragment Node CommentNode DocumentType TextNode Document Base::Node Node Node Base::TreeBuilder TreeBuilder TreeBuilder TreeBuilder Element DocumentFragment CommentNode DocumentType TextNode Document Element DocumentFragment CommentNode DocumentType TextNode Document Enumerable TestData Base OptionalTagFilter InjectMetaCharset WhitespaceFilter HTMLSanitizeFilter HTMLSanitizeModule SimpleDelegator HTMLInputStream EncodingParser ContentAttrParser Node TreeBuilder lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb lib/feed_tools/vendor/html5/lib/html5/constants.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb 
lib/feed_tools/vendor/html5/lib/html5/inputstream.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb Hpricot TokenConstructor lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb REXML lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb SimpleTree TreeWalkers HTMLSanitizeModule lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb Hpricot lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb REXML lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb Base lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb SimpleTree TreeBuilders lib/feed_tools/vendor/html5/tests/preamble.rb TestSupport Sniffer lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb lib/feed_tools/vendor/html5/lib/html5/filters/base.rb lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb Filters HTML5 dot/m_66_0.png

HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML.

Methods

Attributes

errors  [R] 
first_start_tag  [RW] 
inner_html  [RW] 
insert_from_table  [RW] 
last_phase  [RW] 
phase  [RW] 
phases  [R] 
tokenizer  [R] 
tree  [R] 

Public Class methods

Options:

:strict - raise an exception when a parse error is encountered.

:tree - a treebuilder class controlling the type of tree that will be returned. Built-in treebuilders can be accessed through HTML5::TreeBuilders[treeType].

[Source]

    # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 41
# Build a parser, instantiate its tree builder and create one phase object
# per entry in @@phases.
#
# options - Hash; every entry is copied onto the instance variable of the
#           same name (for example :strict and :tree, or the
#           :lowercase_attr_name / :lowercase_element_name flags that are
#           later forwarded to the tokenizer). Values not supplied keep the
#           defaults assigned below.
def initialize(options = {})
  @strict = false
  @errors = []

  @tokenizer = HTMLTokenizer
  @tree = TreeBuilders::REXML::TreeBuilder

  options.each { |name, value| instance_variable_set("@#{name}", value) }

  # Default the lowercase flags only when the caller did not supply them.
  # instance_variable_defined? replaces instance_variables.include?("@..."):
  # on Ruby 1.9+ instance_variables returns Symbols, so the String lookup
  # never matched and a caller-supplied value was always reset to nil.
  @lowercase_attr_name    = nil unless instance_variable_defined?("@lowercase_attr_name")
  @lowercase_element_name = nil unless instance_variable_defined?("@lowercase_element_name")

  # Up to here @tree holds a builder class; swap it for an instance.
  @tree = @tree.new

  # Map each phase name to a phase object, e.g. inBody => HTML5::InBodyPhase
  # (first character upper-cased, 'Phase' appended).
  @phases = @@phases.inject({}) do |phases, phase_name|
    phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
    phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
    phases
  end
end

[Source]

    # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 23
# Convenience entry point: build a parser from +options+ and parse +stream+
# as a whole document.
#
# stream  - passed through unchanged to the instance-level #parse.
# options - constructor options plus an optional :encoding entry. :encoding
#           is taken out and passed to #parse; everything else goes to +new+.
#
# The options hash is copied before :encoding is removed, so the caller's
# hash is left untouched and can be reused for further calls.
def self.parse(stream, options = {})
  parser_options = options.dup
  encoding = parser_options.delete(:encoding)
  new(parser_options).parse(stream, encoding)
end

[Source]

    # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 28
# Convenience entry point: build a parser from +options+ and parse +stream+
# as a fragment.
#
# stream  - passed through unchanged to the instance-level #parse_fragment.
# options - constructor options plus two optional entries that are taken
#           out before the rest is handed to +new+:
#           :container - container element name, 'div' when absent or nil
#           :encoding  - forwarded to #parse_fragment
#
# The options hash is copied before those entries are removed, so the
# caller's hash is left untouched and can be reused for further calls.
def self.parse_fragment(stream, options = {})
  parser_options = options.dup
  container = parser_options.delete(:container) || 'div'
  encoding = parser_options.delete(:encoding)
  new(parser_options).parse_fragment(stream, container, encoding)
end

Public Instance methods

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 245
# Identity pass-through: returns +string+ exactly as given.
# NOTE(review): looks like a placeholder hook for message translation - confirm.
def _(string)
  string
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 61
 # Shared driver behind #parse and #parse_fragment: resets parser state,
 # builds a tokenizer over +stream+ and feeds every token to the current
 # phase, then signals end of input.
 #
 # stream     - handed to the tokenizer constructor
 # inner_html - truthy when parsing a fragment
 # encoding   - forwarded to the tokenizer as :encoding
 # container  - fragment container element name (only read when
 #              +inner_html+ is truthy)
 #
 # Returns whatever the final phase's process_eof returns.
 def _parse(stream, inner_html, encoding, container = 'div')
   @tree.reset
   @first_start_tag = false
   @errors = []

   # @tokenizer may still hold the instance from a previous run; go back to
   # its class before building a fresh one.
   @tokenizer = @tokenizer.class unless Class === @tokenizer
   @tokenizer = @tokenizer.new(
     stream,
     :encoding => encoding,
     :parseMeta => !inner_html,
     :lowercase_attr_name => @lowercase_attr_name,
     :lowercase_element_name => @lowercase_element_name
   )

   if inner_html
     @inner_html = container.downcase

     # The container element decides how the tokenizer treats the content.
     @tokenizer.content_model_flag =
       case @inner_html
       when 'title', 'textarea'
         :RCDATA
       when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
         :CDATA
       when 'plaintext'
         :PLAINTEXT
       else
         :PCDATA
       end

     @phase = @phases[:rootElement]
     @phase.insert_html_element
     reset_insertion_mode
   else
     @inner_html = false
     @phase = @phases[:initial]
   end

   # We only seem to have InBodyPhase testcases where the following is
   # relevant ... need others too
   @last_phase = nil

   # XXX Temporary: lets the parser work with the iterable tokenizer
   # without further changes.
   @tokenizer.each do |raw_token|
     token = normalize_token(raw_token)
     handler = 'process%s' % token[:type]

     # Arguments for the phase handler, or nil for any other token type.
     handler_args =
       case token[:type]
       when :Characters, :SpaceCharacters, :Comment
         [token[:data]]
       when :StartTag
         [token[:name], token[:data]]
       when :EndTag
         [token[:name]]
       when :Doctype
         [token[:name], token[:publicId], token[:systemId], token[:correct]]
       end

     if handler_args
       @phase.send(handler, *handler_args)
     else
       # Any other token type is recorded as a parse error.
       parse_error(token[:data], token[:datavars])
     end
   end

   # When the loop finishes it's EOF
   @phase.process_eof
 end

HTML5 specific normalizations to the token stream

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 157
# Apply HTML5-specific normalizations to a single token and return it.
# The token hash is modified in place.
#
# * :EmptyTag tokens become :StartTag tokens; a parse error is recorded
#   when the element is not listed in VOID_ELEMENTS.
# * :StartTag tokens get a lower-cased name, and a non-empty attribute list
#   is converted to a Hash with lower-cased names (an empty list is left
#   as it is).
# * :EndTag tokens get a lower-cased name; a parse error is recorded when
#   they carry attributes.
def normalize_token(token)
  if token[:type] == :EmptyTag
    # When a solidus (/) is encountered within a tag name what happens
    # depends on whether the current tag name matches that of a void
    # element.  If it matches a void element atheists did the wrong
    # thing and if it doesn't it's wrong for everyone.
    #
    # The name is lower-cased for this check because it is only normalized
    # further down; comparing the raw name would flag <BR/> as an error.
    # NOTE(review): assumes VOID_ELEMENTS holds lower-case names - confirm.
    unless VOID_ELEMENTS.include?(token[:name].downcase)
      parse_error("incorrectly-placed-solidus")
    end

    token[:type] = :StartTag
  end

  if token[:type] == :StartTag
    token[:name] = token[:name].downcase

    # Remove duplicate attributes and convert the list of pairs to a Hash,
    # so that [["x", "y"], ["x", "z"]] becomes {"x" => "y"}. Walking the
    # list in reverse lets the first occurrence overwrite later ones.
    # Pairs are inserted one at a time instead of Hash[*data.flatten]:
    # flatten is recursive and would break on an Array-valued attribute.
    unless token[:data].empty?
      token[:data] = token[:data].reverse.inject({}) do |attrs, (attr, value)|
        attrs[attr.downcase] = value
        attrs
      end
    end

  elsif token[:type] == :EndTag
    parse_error("attributes-in-end-tag") unless token[:data].empty?
    token[:name] = token[:name].downcase
  end

  token
end

Parse an HTML document into a well-formed tree

stream - a filelike object or string containing the HTML to be parsed

The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element)

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 129
# Parse +stream+ as a complete document and return the tree builder's
# document.
#
# stream   - input handed on to _parse
# encoding - optional encoding handed on to _parse (nil when omitted)
def parse(stream, encoding = nil)
  as_fragment = false
  _parse(stream, as_fragment, encoding)
  @tree.get_document
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 150
# Record a parse error and, in strict mode, raise.
#
# code - error identifier (defaults to 'XXX-undefined-error')
# data - extra details for the error; nil is stored as an empty Hash so
#        that every recorded entry has the same shape
#
# Appends [tokenizer stream position, code, data] to @errors.
# Raises ParseError, with the code as its message, when @strict is set.
def parse_error(code = 'XXX-undefined-error', data = {})
  # XXX The idea is to make data mandatory.
  # Callers may pass nil explicitly (a token without :datavars), which
  # bypasses the default above.
  data ||= {}
  @errors.push([@tokenizer.stream.position, code, data])
  raise ParseError, code.to_s if @strict
end

container - name of the element whose inner_html property is being set; if nil, defaults to 'div'

stream - a filelike object or string containing the HTML to be parsed

The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element)

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 145
# Parse +stream+ as a fragment and return the tree builder's fragment.
#
# stream    - input handed on to _parse
# container - name of the element the fragment is parsed inside; nil is
#             treated as 'div'
# encoding  - optional encoding handed on to _parse (nil when omitted)
def parse_fragment(stream, container='div', encoding=nil)
  # An explicit nil would otherwise reach container.downcase in _parse and
  # raise NoMethodError instead of falling back to 'div'.
  _parse(stream, true, encoding, container || 'div')
  @tree.get_fragment
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 207
# Choose the current phase by walking the stack of open elements from the
# most recently opened element back to the first one.
#
# The name of this method is mostly historical. (It's also used in the
# specification.)
#
# The walk stops at the first element that selects a phase. When no
# element does, @phase is left unchanged.
def reset_insertion_mode
  reached_first = false

  @tree.open_elements.reverse.each do |element|
    tag = element.name

    if element == @tree.open_elements.first
      reached_first = true
      # For the first open element the fragment container name is used
      # instead, unless the element is a table cell.
      # XXX
      # assert @inner_html
      tag = @inner_html unless %w[td th].include?(tag)
    end

    # 'select', 'colgroup', 'head' and 'frameset' should only show up
    # here in the inner_html case.
    # XXX
    # assert @inner_html

    if @@new_modes.has_key?(tag)
      phase_key = @@new_modes[tag]
    elsif tag == 'html'
      phase_key = @tree.head_pointer.nil? ? :beforeHead : :afterHead
    elsif reached_first
      phase_key = :inBody
    else
      # This element does not decide the phase; look at the next one.
      next
    end

    @phase = @phases[phase_key]
    break
  end
end

[Validate]