Class | HTML5::HTMLParser |
In: |
lib/feed_tools/vendor/html5/lib/html5/html5parser.rb
|
Parent: | Object |
HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML.
errors | [R] | |
first_start_tag | [RW] | |
inner_html | [RW] | |
insert_from_table | [RW] | |
last_phase | [RW] | |
phase | [RW] | |
phases | [R] | |
tokenizer | [R] | |
tree | [R] |
Options:
:strict - raise an exception when a parse error is encountered.
:tree - a treebuilder class controlling the type of tree that will be returned. Built-in treebuilders can be accessed through HTML5::TreeBuilders[treeType].
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 41
#
# Builds a parser with default settings, then applies the caller's options.
#
# options - Hash; every entry is stored in the instance variable of the same
#           name (so :strict sets @strict, :tree sets @tree, and so on).
#
# After the options are applied, @tree is replaced by an instance of itself
# and one phase object is created for every name listed in @@phases.
def initialize(options = {})
  @strict = false
  @errors = []

  @tokenizer = HTMLTokenizer
  @tree = TreeBuilders::REXML::TreeBuilder

  options.each { |name, value| instance_variable_set("@#{name}", value) }

  # Only default these when the caller did not supply them.
  # instance_variables returns Symbols on Ruby 1.9+, so the former
  # include?("@...") String comparison never matched there and the
  # caller-supplied values were always reset to nil.
  @lowercase_attr_name = nil unless instance_variable_defined?("@lowercase_attr_name")
  @lowercase_element_name = nil unless instance_variable_defined?("@lowercase_element_name")

  @tree = @tree.new

  # "inBody" -> HTML5::InBodyPhase, stored under the key :inBody.
  @phases = @@phases.inject({}) do |phases, phase_name|
    phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
    phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
    phases
  end
end
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 23
#
# Convenience entry point: builds a parser and parses +stream+ as a document.
#
# stream  - passed through to the instance-level parse
# options - Hash of parser options; :encoding is pulled out and passed to
#           parse, everything else goes to the constructor
#
# Returns whatever the instance-level parse returns.
def self.parse(stream, options = {})
  # Work on a copy so the caller's hash is not stripped of :encoding.
  options = options.dup
  encoding = options.delete(:encoding)
  new(options).parse(stream, encoding)
end
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 28
#
# Convenience entry point: builds a parser and parses +stream+ as a fragment.
#
# stream  - passed through to the instance-level parse_fragment
# options - Hash of parser options; :container (default 'div') and :encoding
#           are pulled out and passed to parse_fragment, everything else goes
#           to the constructor
#
# Returns whatever the instance-level parse_fragment returns.
def self.parse_fragment(stream, options = {})
  # Work on a copy so the caller's hash keeps :container and :encoding.
  options = options.dup
  container = options.delete(:container) || 'div'
  encoding = options.delete(:encoding)
  new(options).parse_fragment(stream, container, encoding)
end
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 245
#
# Identity method: hands back +string+ exactly as it was given.
def _(string)
  string
end
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 61
#
# Shared driver used for both whole-document and fragment parsing: resets
# state, builds a tokenizer over +stream+, and feeds every token to the
# current phase.
#
# stream     - input handed to the tokenizer
# inner_html - truthy when parsing a fragment, falsy for a whole document
# encoding   - passed to the tokenizer as :encoding
# container  - element name giving the fragment context; only read when
#              inner_html is truthy
#
# Returns the result of the final phase's process_eof.
def _parse(stream, inner_html, encoding, container = 'div')
  @tree.reset
  @first_start_tag = false
  @errors = []

  # If @tokenizer currently holds an instance rather than a class, step back
  # to its class so that a fresh tokenizer is constructed below.
  @tokenizer = @tokenizer.class unless @tokenizer.is_a?(Class)
  @tokenizer = @tokenizer.new(stream,
                              :encoding => encoding,
                              :parseMeta => !inner_html,
                              :lowercase_attr_name => @lowercase_attr_name,
                              :lowercase_element_name => @lowercase_element_name)

  if inner_html
    @inner_html = container.downcase

    # The container element decides how the tokenizer treats its content.
    @tokenizer.content_model_flag =
      case @inner_html
      when 'title', 'textarea'
        :RCDATA
      when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
        :CDATA
      when 'plaintext'
        :PLAINTEXT
      else
        :PCDATA
      end

    @phase = @phases[:rootElement]
    @phase.insert_html_element
    reset_insertion_mode
  else
    @inner_html = false
    @phase = @phases[:initial]
  end

  # We only seem to have InBodyPhase testcases where the following is
  # relevant ... need others too
  @last_phase = nil

  # XXX Temporary: lets the parser work with the iterable tokenizer without
  # further changes.
  @tokenizer.each do |raw_token|
    token = normalize_token(raw_token)
    handler = "process#{token[:type]}"

    case token[:type]
    when :Characters, :SpaceCharacters, :Comment
      @phase.send(handler, token[:data])
    when :StartTag
      @phase.send(handler, token[:name], token[:data])
    when :EndTag
      @phase.send(handler, token[:name])
    when :Doctype
      @phase.send(handler, token[:name], token[:publicId],
                  token[:systemId], token[:correct])
    else
      # Any other token type is recorded as a parse error.
      parse_error(token[:data], token[:datavars])
    end
  end

  # The tokenizer is exhausted, so this is EOF.
  @phase.process_eof
end
HTML5-specific normalizations to the token stream
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 157
#
# Applies HTML5-specific normalizations to one token, modifying it in place.
#
# token - Hash with at least :type; tag tokens also carry :name and :data
#         (for a start tag, :data is a list of [attribute, value] pairs)
#
# * :EmptyTag becomes :StartTag; a parse error is recorded when the element
#   is not listed in VOID_ELEMENTS.
# * :StartTag names and attribute names are lower-cased, and a non-empty
#   attribute list is turned into a Hash in which the first occurrence of a
#   duplicated attribute wins.
# * :EndTag names are lower-cased; a parse error is recorded when the end
#   tag carries attributes.
#
# Returns the same token object.
def normalize_token(token)
  if token[:type] == :EmptyTag
    # A self-closing solidus is only acceptable on void elements. The name
    # is lower-cased for this check because it is only normalized further
    # down; comparing the raw name flagged "<BR/>" but not "<br/>".
    unless VOID_ELEMENTS.include?(token[:name].downcase)
      parse_error("incorrectly-placed-solidus")
    end

    token[:type] = :StartTag
  end

  if token[:type] == :StartTag
    token[:name] = token[:name].downcase

    # Collapse [["x", "y"], ["x", "z"]] into {"x" => "y"}: walking the list
    # backwards lets the earliest occurrence overwrite later duplicates.
    unless token[:data].empty?
      attributes = {}
      token[:data].reverse_each { |attr, value| attributes[attr.downcase] = value }
      token[:data] = attributes
    end

  elsif token[:type] == :EndTag
    parse_error("attributes-in-end-tag") unless token[:data].empty?
    token[:name] = token[:name].downcase
  end

  token
end
Parse an HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element)
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 129
#
# Parses +stream+ as a complete document.
#
# stream   - input to parse
# encoding - optional encoding, forwarded to _parse (nil when omitted)
#
# Returns the tree builder's document.
def parse(stream, encoding = nil)
  # false: this is not an inner_html (fragment) parse.
  _parse(stream, false, encoding)

  document = @tree.get_document
  document
end
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 150
#
# Records a parse error and, when @strict is set, raises ParseError.
#
# code - error identifier
# data - extra details stored with the error
#        (XXX the idea is to make data mandatory)
#
# Each entry appended to @errors is [stream position, code, data].
def parse_error(code = 'XXX-undefined-error', data = {})
  position = @tokenizer.stream.position
  @errors << [position, code, data]

  raise ParseError if @strict
end
container - name of the element whose inner_html property we're setting; if set to nil, defaults to 'div'
stream - a filelike object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element)
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 145
#
# Parses +stream+ as a fragment inside a +container+ element.
#
# stream    - input to parse
# container - name of the element providing the fragment context; nil is
#             treated as 'div'
# encoding  - optional encoding, forwarded to _parse (nil when omitted)
#
# Returns the tree builder's fragment.
def parse_fragment(stream, container = 'div', encoding = nil)
  # _parse calls container.downcase, so an explicit nil must be replaced by
  # the default here rather than passed through.
  _parse(stream, true, encoding, container || 'div')
  @tree.get_fragment
end
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 207
#
# Chooses @phase from the stack of open elements, scanning from the most
# recently opened element back towards the first one.
#
# The name of this method is mostly historical. (It's also used in the
# specification.)
def reset_insertion_mode
  @tree.open_elements.reverse.each do |element|
    name = element.name
    bottom = (element == @tree.open_elements.first)

    # At the first open element the fragment's container name stands in for
    # the element's own name, except for table cells.
    # XXX assert @inner_html
    name = @inner_html if bottom && !['td', 'th'].include?(name)

    # 'select', 'colgroup', 'head' and 'frameset' should only show up here
    # in the inner_html case.
    # XXX assert @inner_html

    if @@new_modes.has_key?(name)
      @phase = @phases[@@new_modes[name]]
      break
    end

    if name == 'html'
      @phase = @phases[@tree.head_pointer.nil? ? :beforeHead : :afterHead]
      break
    end

    if bottom
      @phase = @phases[:inBody]
      break
    end
  end
end