Class HTML5::HTMLInputStream
In: lib/feed_tools/vendor/html5/lib/html5/inputstream.rb
Parent: Object
Phase XmlElementPhase InTablePhase RootElementPhase InHeadPhase AfterHeadPhase AfterFramesetPhase XmlRootPhase InTableBodyPhase InFramesetPhase InColumnGroupPhase InitialPhase InCaptionPhase TrailingEndPhase InSelectPhase BeforeHeadPhase AfterBodyPhase InCellPhase InBodyPhase InRowPhase XhmlRootPhase Exception SerializeError EOF AssertionError ParseError HTMLSanitizer HTMLTokenizer XMLParser XHTMLParser HTMLParser String EncodingBytes HTMLSerializer XHTMLSerializer TreeWalkers::Base NonRecursiveTreeWalker TreeWalker TreeWalker Base TreeWalker Element DocumentFragment Node CommentNode DocumentType TextNode Document Base::Node Node Node Base::TreeBuilder TreeBuilder TreeBuilder TreeBuilder Element DocumentFragment CommentNode DocumentType TextNode Document Element DocumentFragment CommentNode DocumentType TextNode Document Base InjectMetaCharset OptionalTagFilter WhitespaceFilter HTMLSanitizeFilter HTMLSanitizeModule Enumerable TestData SimpleDelegator HTMLInputStream EncodingParser ContentAttrParser Node TreeBuilder lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb lib/feed_tools/vendor/html5/lib/html5/constants.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb 
lib/feed_tools/vendor/html5/lib/html5/inputstream.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb Hpricot TokenConstructor lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb REXML lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb SimpleTree TreeWalkers HTMLSanitizeModule lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb Hpricot lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb Base lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb REXML lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb SimpleTree TreeBuilders lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb lib/feed_tools/vendor/html5/lib/html5/filters/base.rb lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb Filters Sniffer lib/feed_tools/vendor/html5/tests/preamble.rb TestSupport HTML5 dot/m_75_0.png

This class takes care of character encoding and removing or replacing incorrect byte-sequences and also provides column and line tracking.

Methods

Attributes

char_encoding  [RW] 
errors  [RW] 
queue  [RW] 

Public Class methods

Initialises the HTMLInputStream.

HTMLInputStream.new(source, options = {}) -> Normalized stream from source for use by the HTML5Lib.

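source can be either a file-like object (anything that responds to read) or a string holding the document content; a string is wrapped in a StringIO rather than opened as a filename.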

The optional :encoding option must be a string that names the encoding. If it is specified and valid, that encoding will be used regardless of any BOM or later declaration (such as in a meta element); otherwise the encoding is detected from the stream.

:parse_meta option - whether to look for a <meta> element containing encoding information (defaults to true)

[Source]

    # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 29
# Builds a normalized input stream over +source+.
#
# +source+ is handed to open_stream, which accepts anything responding to
# +read+ or a string. Every key in +options+ is copied onto the instance
# variable of the same name, so :encoding, :parse_meta and :chardet override
# the defaults assigned below.
def initialize(source, options = {})
  # Defaults, overridable through +options+
  @encoding   = nil
  @parse_meta = true
  @chardet    = true
  options.each do |key, setting|
    instance_variable_set("@#{key}", setting)
  end

  # Underlying byte source
  @raw_stream = open_stream(source)

  # Number of bytes to inspect when looking for a meta element that
  # declares the encoding
  @NUM_BYTES_META = 512
  # Number of bytes handed to chardet per read while guessing the encoding
  @NUM_BYTES_CHARDET = 256
  # Number of bytes pulled from the raw stream per content read
  @NUM_BYTES_BUFFER = 1024
  # Encoding used when no other information can be found
  @DEFAULT_ENCODING = 'windows-1252'

  # An explicit, valid "transport level" encoding wins; otherwise the
  # encoding is detected from the stream itself
  explicit = !@encoding.nil? && HTML5.is_valid_encoding(@encoding)
  @char_encoding = explicit ? @encoding : detect_encoding

  # Prime the buffer with the first block of the stream
  @buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
  case @char_encoding
  when 'windows-1252'
    # Bytes are translated one at a time by #char
    @win1252 = true
  when 'utf-8'
    # Already in the target encoding; nothing to convert
  else
    require 'iconv'
    begin
      # Read the rest of the stream so the whole document is converted
      # to utf-8 in one go
      @buffer << @raw_stream.read unless @raw_stream.eof?
      @buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
    rescue
      # Conversion failed: fall back to treating the bytes as windows-1252
      @win1252 = true
    end
  end

  @queue  = []
  @errors = []

  # Read offset into @buffer, plus line/column bookkeeping
  @tell = 0
  @line = 0
  @col  = 0
  @line_lengths = []
end

Public Instance methods

Read one character from the stream or queue if available. Return EOF when EOF is reached.

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 252
# Reads one character from the pushed-back queue if it is non-empty,
# otherwise from the stream buffer. Returns the character as a string, or
# the symbol :EOF once the buffer is exhausted.
#
# Line endings (CR and CR LF) are normalized to "\n"; @line, @col and
# @line_lengths are updated for ASCII characters. A null byte is recorded
# in @errors and replaced by U+FFFD, as is any invalid utf-8 sequence.
# When @win1252 is set, bytes 0x80..0xFF are translated to utf-8.
#
# NOTE: the byte dispatch relies on @buffer[i] yielding an Integer byte
# value (Ruby 1.8 String#[] semantics).
def char
  # Characters pushed back via unget / chars_until are served first
  return @queue.shift unless @queue.empty?

  if @tell + 3 > @buffer.length && !@raw_stream.eof?
    # Fewer than three bytes of lookahead remain: drop what has been
    # consumed and append the next block
    @buffer = @buffer[@tell..-1] + @raw_stream.read(@NUM_BYTES_BUFFER)
    @tell = 0
  end

  c = @buffer[@tell]
  @tell += 1

  case c
  when 0x01..0x7F
    if c == 0x0D
      # normalize newlines: swallow the LF of a CR LF pair
      @tell += 1 if @buffer[@tell] == 0x0A
      c = 0x0A
    end

    # update position in stream
    if c == 0x0a
      @line_lengths << @col
      @line += 1
      @col = 0
    else
      @col += 1
    end

    c.chr

  when 0x80..0xBF
    if !@win1252
      [0xFFFD].pack('U') # stray continuation byte: invalid utf-8
    elsif c <= 0x9f
      [ENTITIES_WINDOWS1252[c-0x80]].pack('U')
    else
      # convert to utf-8; built from raw bytes so no string-encoding
      # mismatch can occur when concatenating
      [0xC2, c].pack('C*')
    end

  when 0xC0..0xFF
    # An unset @win1252 is simply nil, so it can be tested directly; the
    # former instance_variables.include?("@win1252") guard is always false
    # on Rubies that list instance variables as symbols.
    if @win1252
      [0xC3, c - 64].pack('C*') # convert to utf-8
    # from http://www.w3.org/International/questions/qa-forms-utf-8.en.php
    # (the n flag makes the pattern byte-oriented)
    elsif @buffer[@tell - 1..@tell + 3] =~ /^
          ( [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
          |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
          | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
          |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
          |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
          | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
          |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
          )/xn
      sequence = $1
      # the lead byte was already consumed above; skip the rest
      @tell += sequence.length - 1
      sequence
    else
      [0xFFFD].pack('U') # invalid utf-8
    end

  when 0x00
    @errors.push("null-character")
    [0xFFFD].pack('U') # null characters are invalid

  else
    # reading past the end of the buffer yields nil
    :EOF
  end
end

Returns a string of characters from the stream up to but not including any character in characters or EOF. characters can be any container that responds to include?. If opposite is true the test is inverted: characters are collected for as long as they are contained in characters.

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 325
# Collects characters from the stream until one that is contained in
# +characters+ (or, when +opposite+ is true, one that is NOT contained in
# it) or :EOF is met. The stopping character is pushed back onto the front
# of the queue; :EOF is not. Returns the collected characters joined into
# one string.
def chars_until(characters, opposite=false)
  collected = []
  current = char

  until current == :EOF
    # Stop as soon as membership no longer matches what we are skipping over
    break if characters.include?(current) != opposite
    collected << current
    current = char
  end

  # Return the character we stopped on to the head of the queue
  @queue.unshift(current) unless current == :EOF
  collected.join('')
end

Attempts to detect a BOM at the start of the stream. If an encoding can be determined from the BOM, returns the name of the encoding; otherwise returns nil.

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 147
# Looks for a byte order mark in the first four bytes of the raw stream.
#
# Returns the encoding name implied by the BOM ('utf-8', 'utf-16le',
# 'utf-16be', 'utf-32le' or 'utf-32be') or nil when there is none. When
# at least one byte could be read, the stream is repositioned through
# #seek: just past the BOM if one was found, otherwise back to where the
# read started. Returns nil without repositioning if nothing could be read.
def detect_bom
  # Checked in order. UTF-32 must come before UTF-16 because the UTF-32LE
  # mark begins with the UTF-16LE mark. Signatures are byte values so the
  # comparison does not depend on string encodings.
  signatures = [
    [[0xEF, 0xBB, 0xBF],       'utf-8'],
    [[0xFF, 0xFE, 0x00, 0x00], 'utf-32le'],
    [[0x00, 0x00, 0xFE, 0xFF], 'utf-32be'],
    [[0xFF, 0xFE],             'utf-16le'],
    [[0xFE, 0xFF],             'utf-16be']
  ]

  head = @raw_stream.read(4)
  return nil unless head

  lead = head.unpack('C*')
  mark, encoding = signatures.find do |bytes, _name|
    lead[0, bytes.length] == bytes
  end

  # Skip exactly the bytes of the mark that matched (0 when none did)
  seek(head, mark ? mark.length : 0)

  encoding
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 94
 # Works out the character encoding of the raw stream. Sources are tried
 # in order: a byte order mark, a meta element (only when @parse_meta is
 # set), the optional chardet gem (only when @chardet is set and the gem
 # can be loaded), and finally @DEFAULT_ENCODING. 'iso-8859-1' in any
 # letter case is reported as 'windows-1252'. Returns the encoding name.
 def detect_encoding
   # First look for a BOM.
   # This will also read past the BOM if present.
   encoding = detect_bom

   # If there is no BOM, look for meta elements with encoding information
   if encoding.nil? and @parse_meta
     encoding = detect_encoding_meta
   end

   # Guess with chardet, if available
   if encoding.nil? and @chardet
     begin
       require 'rubygems'
       require 'UniversalDetector' # gem install chardet
       buffers = []
       detector = UniversalDetector::Detector.instance
       detector.reset
       until @raw_stream.eof?
         buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
         break if !buffer or buffer.empty?
         buffers << buffer
         detector.feed(buffer)
         break if detector.instance_eval {@done}
         # The detector's last-character state is turned back into a string
         # when it holds an integer. Integer is tested instead of Fixnum
         # because the Fixnum constant no longer exists on current Rubies.
         detector.instance_eval {
           @_mLastChar = @_mLastChar.chr if Integer === @_mLastChar
         }
       end
       detector.close
       encoding = detector.result['encoding']
       # Hand everything that was fed to the detector back to the stream
       seek(buffers*'', 0)
     rescue LoadError
       # chardet is not installed; carry on without a guess
     end
   end

   # If all else fails use the default encoding
   if encoding.nil?
     encoding = @DEFAULT_ENCODING
   end

   # Substitute for equivalent encoding
   if 'iso-8859-1' == encoding.downcase
     encoding = 'windows-1252'
   end

   encoding
 end

Report the encoding declared by the meta element

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 228
# Reads up to @NUM_BYTES_META bytes from the raw stream, hands them to an
# EncodingParser, pushes the bytes back via #seek, and returns whatever
# encoding the parser reports.
def detect_encoding_meta
  # read returns nil on an exhausted stream; use an empty string so the
  # parser and seek always receive a string
  buffer = @raw_stream.read(@NUM_BYTES_META) || ''
  parser = EncodingParser.new(buffer)
  seek(buffer, 0)
  return parser.get_encoding
end

Produces a file object from source.

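source can be either a file-like object (anything that responds to read) or a string; a string is wrapped in a StringIO and treated as the content itself, not as a filename.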

[Source]

    # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 84
# Returns an object that can be read from: +source+ itself when it already
# responds to +read+, otherwise a StringIO wrapped around +source+.
def open_stream(source)
  # Anything IO-like is used as-is
  return source if source.respond_to?(:read)

  # Everything else is treated as string content
  StringIO.new(source)
end

Returns (line, col) of the current position in the stream.

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 236
# Returns [line, col] for the current position in the stream, with the
# line number 1-based. Characters waiting in the push-back queue are
# subtracted from the tracked position. Raises RuntimeError if a queued
# newline is reached while the running column is not 0.
def position
  row = @line
  column = @col

  # Walk the queue from back to front, undoing each queued character
  @queue.reverse_each do |pending|
    if pending == "\n"
      row -= 1
      raise RuntimeError.new("col=#{column}") if column != 0
      # Stepping back over a newline lands at the end of the previous line
      column = @line_lengths[row]
    else
      column -= 1
    end
  end

  [row + 1, column]
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 180
# Repositions the raw stream after +buffer+ has been read from it, so that
# the next read continues with the bytes of +buffer+ from offset +n+ on.
#
# Three strategies are tried in order:
# 1. if the stream responds to +unget+, push buffer[n..-1] back;
# 2. if it responds to +seek+, call seek(n)
#    (NOTE(review): this is only right if +buffer+ began at stream
#    position 0 -- confirm against callers);
# 3. otherwise wrap the stream in a SimpleDelegator that keeps pushed-back
#    data in front of the underlying stream, and push buffer[n..-1] back.
def seek(buffer, n)
  if @raw_stream.respond_to?(:unget)
    @raw_stream.unget(buffer[n..-1])
    return
  end

  if @raw_stream.respond_to?(:seek)
    begin
      @raw_stream.seek(n)
      return
    rescue Errno::ESPIPE
      # the stream claims to seek but cannot; fall through to the wrapper
    end
  end

  require 'delegate'
  @raw_stream = SimpleDelegator.new(@raw_stream)

  class << @raw_stream
    # Serves pushed-back data first, then the underlying stream.
    # +chars+ of -1 means "everything that is left".
    def read(chars=-1)
      if chars == -1 or chars > @data.length
        # All pending data is consumed; top up from the underlying stream
        result = @data
        @data = ''
        return result if __getobj__.eof?
        return result + __getobj__.read.to_s if chars == -1
        return result + __getobj__.read(chars-result.length).to_s
      elsif @data.empty?
        return __getobj__.read(chars)
      else
        # Take the FIRST +chars+ pending bytes (starting at index 0)
        result = @data[0...chars]
        @data = @data[chars..-1]
        return result
      end
    end

    # Pushes +data+ back so that it is returned before anything that is
    # still pending: it was read earlier, so it belongs in front.
    def unget(data)
      @data = (data || '') + (@data || '')
    end
  end

  @raw_stream.unget(buffer[n .. -1])
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 340
# Pushes +characters+ back onto the front of the queue so that #char
# returns them again before reading further. +characters+ may be a single
# character string or an array of characters; :EOF is ignored.
def unget(characters)
  return if characters == :EOF

  # Strings do not respond to to_a on every Ruby version; where they do
  # not, the string is queued as one entry.
  pending = characters.respond_to?(:to_a) ? characters.to_a : [characters]
  @queue.unshift(*pending)
end

[Validate]