Class | HTML5::EncodingParser |
In: |
lib/feed_tools/vendor/html5/lib/html5/inputstream.rb
|
Parent: | Object |
Mini parser for detecting character encoding from meta elements
data - the data to work on for encoding detection (converted to a string with to_s)
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 412
#
# Prepare the parser for a fresh detection run.
#
# data - the input to inspect; it is converted with +to_s+ and wrapped in
#        an EncodingBytes instance. No encoding is known yet, so
#        @encoding starts out as nil.
def initialize(data)
  @encoding = nil
  @data = EncodingBytes.new(data.to_s)
end
Return a [name, value] pair for the next attribute in the stream if one is found, or nil otherwise
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 514
#
# Read the next attribute starting at the current position of @data.
#
# Leading whitespace and '/' bytes are skipped first. ASCII uppercase
# letters in both the name and the value are lowercased; every other byte
# is copied through unchanged.
#
# Returns nil when the first significant byte is '<' (the position is
# moved back one byte so the '<' gets reprocessed) or '>'.
# Otherwise returns a two-element Array [name, value]; value is '' when
# the attribute has no value.
#
# NOTE(review): the "Step N" comments presumably refer to the numbered
# steps of the HTML5 "get an attribute" algorithm -- confirm against the
# spec revision this code was ported from.
def get_attribute
  @data.skip(SPACE_CHARACTERS + ['/'])

  if @data.current_byte == '<'
    @data.position -= 1
    return nil
  elsif @data.current_byte == '>'
    return nil
  end

  attr_name = []
  attr_value = []
  space_found = false
  # Step 5: attribute name
  while true
    # An '=' only terminates the name once at least one name byte has been
    # collected. An Array is always truthy in Ruby, so emptiness has to be
    # tested explicitly; a leading '=' becomes part of the name instead.
    if @data.current_byte == '=' && !attr_name.empty?
      break
    elsif SPACE_CHARACTERS.include?(@data.current_byte)
      space_found = true
      break
    elsif ['/', '<', '>'].include?(@data.current_byte)
      # Name ended without a value.
      return [attr_name.join(''), '']
    elsif ASCII_UPPERCASE.include?(@data.current_byte)
      attr_name.push(@data.current_byte.downcase)
    else
      attr_name.push(@data.current_byte)
    end
    # Step 6
    @data.position += 1
  end

  # Step 7
  if space_found
    @data.skip
    # Step 8: whitespace not followed by '=' means the attribute has no
    # value; step back so the current byte is seen again by the caller.
    unless @data.current_byte == '='
      @data.position -= 1
      return [attr_name.join(''), '']
    end
  end

  # XXX need to advance position in both spaces and value case
  # Step 9: move past the '='
  @data.position += 1
  # Step 10
  @data.skip

  # Step 11
  if ["'", '"'].include?(@data.current_byte)
    # 11.1: quoted value -- read up to the matching quote character
    quote_char = @data.current_byte
    while true
      @data.position += 1
      # 11.3
      if @data.current_byte == quote_char
        @data.position += 1
        return [attr_name.join(''), attr_value.join('')]
      # 11.4
      elsif ASCII_UPPERCASE.include?(@data.current_byte)
        attr_value.push(@data.current_byte.downcase)
      # 11.5
      else
        attr_value.push(@data.current_byte)
      end
    end
  elsif ['>', '<'].include?(@data.current_byte)
    # '=' immediately followed by the end of the tag: empty value
    return [attr_name.join(''), '']
  elsif ASCII_UPPERCASE.include?(@data.current_byte)
    attr_value.push(@data.current_byte.downcase)
  else
    attr_value.push(@data.current_byte)
  end

  # Unquoted value: read until whitespace or an angle bracket
  while true
    @data.position += 1
    if (SPACE_CHARACTERS + ['>', '<']).include?(@data.current_byte)
      return [attr_name.join(''), attr_value.join('')]
    elsif ASCII_UPPERCASE.include?(@data.current_byte)
      attr_value.push(@data.current_byte.downcase)
    else
      attr_value.push(@data.current_byte)
    end
  end
end
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 426
#
# Walk through @data and, at each position, look for the first entry of
# @@method_dispatch whose byte sequence matches there. The handler method
# named by that entry is invoked; a falsy result from the handler stops
# the scan.
#
# Returns the detected encoding with surrounding whitespace stripped, or
# nil when none was found.
def get_encoding
  @data.each do |_byte|
    matched = @@method_dispatch.find do |(prefix, _handler)|
      @data.match_bytes(prefix, true)
    end

    # Positions where nothing matches simply continue the scan.
    next if matched.nil?

    _prefix, handler = matched
    break unless send(handler)
  end

  @encoding = @encoding.strip unless @encoding.nil?
  @encoding
end
Skip over comments
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 442
#
# Skip over a comment by jumping to its closing '-->' marker.
#
# Returns whatever @data.jump_to returns.
# NOTE(review): presumably used by get_encoding as the "keep parsing"
# flag -- confirm against the dispatch table.
def handle_comment
  @data.jump_to('-->')
end
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 446
#
# Inspect the attributes of a meta element for an encoding declaration.
#
# A 'charset' attribute supplies the candidate encoding directly; a
# 'content' attribute is handed to ContentAttrParser to extract one. The
# first candidate accepted by HTML5.is_valid_encoding is stored in
# @encoding.
#
# Returns false once an encoding has been stored, true otherwise
# (including when "<meta" is not followed by a space character).
def handle_meta
  # "<meta" not followed by a space is not a meta element: keep going.
  return true unless SPACE_CHARACTERS.include?(@data.current_byte)

  # Examine attributes one at a time until none are left.
  until (pair = get_attribute).nil?
    name, value = pair

    if name == 'charset'
      candidate = value
    elsif name == 'content'
      candidate = ContentAttrParser.new(EncodingBytes.new(value)).parse
    else
      next
    end

    next unless HTML5.is_valid_encoding(candidate)

    @encoding = candidate
    return false
  end

  true
end
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 508
#
# Skip ahead to the next '>' in the data.
#
# Returns whatever @data.jump_to returns.
def handle_other
  @data.jump_to('>')
end
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 478
#
# Advance the position by one byte, then process the rest as a possible
# end tag.
#
# Returns the result of handle_possible_tag(true).
def handle_possible_end_tag
  @data.position = @data.position + 1
  handle_possible_tag(true)
end
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 474
#
# Process the bytes at the current position as a possible start tag.
#
# Returns the result of handle_possible_tag(false).
def handle_possible_start_tag
  handle_possible_tag(false)
end
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 483
#
# Shared handling for possible start and end tags.
#
# end_tag - true when called for a possible end tag, false for a possible
#           start tag.
#
# Always returns true.
def handle_possible_tag(end_tag)
  if ASCII_LETTERS.include?(@data.current_byte)
    @data.find_next(SPACE_CHARACTERS + ['<', '>'])

    if @data.current_byte == '<'
      # Go back to the first step of the overall "two step" algorithm so
      # that the '<' byte is reprocessed.
      @data.position -= 1
    else
      # Read and discard every attribute of the tag.
      attribute = get_attribute
      attribute = get_attribute until attribute.nil?
    end
  elsif end_tag
    # Not an ASCII letter after the end-tag opener: step back and treat
    # the fragment the way handle_other does. In the start-tag case the
    # fragment is simply ignored.
    @data.position -= 1
    handle_other
  end

  true
end