Class HTML5::HTMLTokenizer
In: lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb
Parent: Object
Phase XmlElementPhase InTablePhase RootElementPhase InHeadPhase AfterHeadPhase AfterFramesetPhase XmlRootPhase InitialPhase InTableBodyPhase InFramesetPhase InColumnGroupPhase InCaptionPhase TrailingEndPhase InSelectPhase BeforeHeadPhase InCellPhase InBodyPhase AfterBodyPhase InRowPhase Exception SerializeError EOF AssertionError ParseError HTMLSanitizer HTMLTokenizer XhmlRootPhase XMLParser XHTMLParser HTMLParser String EncodingBytes HTMLSerializer XHTMLSerializer TreeWalkers::Base NonRecursiveTreeWalker TreeWalker TreeWalker Base TreeWalker Element DocumentFragment Node CommentNode DocumentType TextNode Document Base::Node Node Node Base::TreeBuilder TreeBuilder TreeBuilder TreeBuilder Element DocumentFragment CommentNode DocumentType TextNode Document Element DocumentFragment CommentNode DocumentType TextNode Document Enumerable TestData Base OptionalTagFilter InjectMetaCharset WhitespaceFilter HTMLSanitizeFilter HTMLSanitizeModule SimpleDelegator HTMLInputStream EncodingParser ContentAttrParser Node TreeBuilder lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb lib/feed_tools/vendor/html5/lib/html5/constants.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb 
lib/feed_tools/vendor/html5/lib/html5/inputstream.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb Hpricot TokenConstructor lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb REXML lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb SimpleTree TreeWalkers HTMLSanitizeModule lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb Hpricot lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb REXML lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb Base lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb SimpleTree TreeBuilders lib/feed_tools/vendor/html5/tests/preamble.rb TestSupport Sniffer lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb lib/feed_tools/vendor/html5/lib/html5/filters/base.rb lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb Filters HTML5 dot/m_66_0.png

This class takes care of tokenizing HTML.

  • @current_token Holds the token that is currently being processed.
  • @state Holds the symbol naming the state method to invoke next (e.g. :data_state).
  • @states Holds a mapping between states and methods that implement the state.
  • @stream Points to HTMLInputStream object.

Methods

Attributes

content_model_flag  [RW] 
current_token  [RW] 
stream  [R] 

Public Class methods

Creates a new tokenizer for the given stream. Recognized options include :lowercase_element_name and :lowercase_attr_name, both of which default to true.

[Source]

    # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 26
# Build a tokenizer around +stream+.
#
# options[:lowercase_element_name] and options[:lowercase_attr_name]
# both default to true; pass +false+ explicitly to disable lowercasing.
def initialize(stream, options = {})
  @stream = HTMLInputStream.new(stream, options)

  # Initial tokenizer state.
  @content_model_flag = :PCDATA
  @state              = :data_state
  @escapeFlag         = false
  @lastFourChars      = []

  @current_token = nil   # token currently being built
  @token_queue   = []    # tokens waiting to be processed

  @lowercase_element_name = options[:lowercase_element_name] != false
  @lowercase_attr_name    = options[:lowercase_attr_name]    != false
end

Public Instance methods

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 491
# Between an attribute name and whatever follows it: "=", the end of the
# tag, or the start of another attribute.
def after_attribute_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    @stream.chars_until(SPACE_CHARACTERS, true)
    return true
  end
  case data
  when "="
    @state = :before_attribute_value_state
  when ">"
    emit_current_token
  when :EOF
    @token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
    emit_current_token
  when "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    # Letters and any other character alike begin a fresh attribute name.
    @current_token[:data].push([data, ""])
    @state = :attribute_name_state
  end
  true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 762
# After the DOCTYPE name: either finish the token at ">", or look ahead
# for the case-insensitive keywords "PUBLIC" / "SYSTEM" that introduce a
# public or system identifier.
def after_doctype_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    # Skip whitespace between the name and what follows.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @current_token[:correct] = false
    @stream.unget(data)
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @token_queue << @current_token
    @state = :data_state
  else
    char_stack = [data]
    # Read five more characters; together with +data+ they may spell
    # "public" or "system". (Consistency fix: use @stream like every
    # other state method, not the bare `stream` reader.)
    5.times { char_stack << @stream.char }
    token = char_stack.join('').tr(ASCII_UPPERCASE, ASCII_LOWERCASE)
    if token == "public" and !char_stack.include?(:EOF)
      @state = :before_doctype_public_identifier_state
    elsif token == "system" and !char_stack.include?(:EOF)
      @state = :before_doctype_system_identifier_state
    else
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", "datavars" => {"data" => data}}
      @state = :bogus_doctype_state
    end
  end
  return true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 849
# After the quoted public identifier: expect a system identifier, ">",
# or EOF.
def after_doctype_public_identifier_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    # Skip whitespace.
  elsif data == "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  elsif data == "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # BUG FIX: this branch handles an unexpected character, not EOF, so
    # report "unexpected-char-in-doctype" (the EOF branch above already
    # reports "eof-in-doctype").
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  return true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 929
# After the quoted system identifier: only ">" (or EOF) may follow.
def after_doctype_system_identifier_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    # Skip whitespace.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # BUG FIX: this branch handles an unexpected character, not EOF, so
    # report "unexpected-char-in-doctype" (the EOF branch above already
    # reports "eof-in-doctype").
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  return true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 444
444:     def attribute_name_state
      # Accumulates the current attribute's name (the [name, value] pair at
      # @current_token[:data][-1]). +leavingThisState+ stays false while more
      # name characters may follow; +emitToken+ marks branches that must emit
      # the tag token once the name bookkeeping below has run.
445:       data = @stream.char
446:       leavingThisState = true
447:       emitToken = false
448:       if data == "="
449:         @state = :before_attribute_value_state
450:       elsif data == :EOF
451:         @token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
452:         @state = :data_state
453:         emitToken = true
454:       elsif ASCII_LETTERS.include? data
        # Consume the whole run of letters in one call rather than one
        # character per state invocation.
455:         @current_token[:data][-1][0] += data + @stream.chars_until(ASCII_LETTERS, true)
456:         leavingThisState = false
457:       elsif data == ">"
458:         # XXX If we emit here the attributes are converted to a dict
459:         # without being checked and when the code below runs we error
460:         # because data is a dict not a list
461:         emitToken = true
462:       elsif SPACE_CHARACTERS.include? data
463:         @state = :after_attribute_name_state
464:       elsif data == "/"
465:         process_solidus_in_tag
466:         @state = :before_attribute_name_state
467:       else
468:         @current_token[:data][-1][0] += data
469:         leavingThisState = false
470:       end
471: 
472:       if leavingThisState
473:         # Attributes are not dropped at this stage. That happens when the
474:         # start tag token is emitted so values can still be safely appended
475:         # to attributes, but we do want to report the parse error in time.
476:         if @lowercase_attr_name
477:             @current_token[:data][-1][0] = @current_token[:data].last.first.downcase
478:         end
        # Report (at most once) if the just-finished name duplicates an
        # earlier attribute on this tag.
479:         @current_token[:data][0...-1].each {|name,value|
480:           if @current_token[:data].last.first == name
481:             @token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
482:             break # don't report an error more than once
483:           end
484:         }
485:         # XXX Fix for above XXX
486:         emit_current_token if emitToken
487:       end
488:       return true
489:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 538
# Inside a double-quoted attribute value: collect characters up to the
# closing quote, expanding entities as they appear.
def attribute_value_double_quoted_state
  data = @stream.char
  case data
  when "\""
    @state = :before_attribute_name_state
  when "&"
    process_entity_in_attribute
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
    emit_current_token
  else
    # Append this character plus everything up to the next quote or
    # entity start to the current attribute's value.
    @current_token[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
  end
  true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 553
# Inside a single-quoted attribute value: collect characters up to the
# closing quote, expanding entities as they appear.
def attribute_value_single_quoted_state
  data = @stream.char
  case data
  when "'"
    @state = :before_attribute_name_state
  when "&"
    process_entity_in_attribute
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
    emit_current_token
  else
    @current_token[:data][-1][1] += data + @stream.chars_until(["'", "&"])
  end
  true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 569
# Inside an unquoted attribute value: runs until whitespace or ">".
def attribute_value_unquoted_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    @state = :before_attribute_name_state
    return true
  end
  case data
  when "&"
    process_entity_in_attribute
  when ">"
    emit_current_token
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
    emit_current_token
  else
    @current_token[:data][-1][1] += data + @stream.chars_until(["&", ">", "<"] + SPACE_CHARACTERS)
  end
  true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 423
# Just before an attribute name (after the tag name or a previous
# attribute): skip whitespace and start collecting the next name.
def before_attribute_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    @stream.chars_until(SPACE_CHARACTERS, true)
    return true
  end
  case data
  when :EOF
    @token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
    emit_current_token
  when ">"
    emit_current_token
  when "/"
    process_solidus_in_tag
  else
    # Letters and any other character alike begin a new attribute name.
    @current_token[:data].push([data, ""])
    @state = :attribute_name_state
  end
  true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 515
# After "=": decide whether the attribute value is double-quoted,
# single-quoted, or unquoted.
def before_attribute_value_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    @stream.chars_until(SPACE_CHARACTERS, true)
    return true
  end
  case data
  when "\""
    @state = :attribute_value_double_quoted_state
  when "&"
    # An entity may legally start an unquoted value; push the "&" back
    # so the unquoted-value state consumes it.
    @state = :attribute_value_unquoted_state
    @stream.unget(data)
  when "'"
    @state = :attribute_value_single_quoted_state
  when ">"
    emit_current_token
  when :EOF
    @token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
    emit_current_token
  else
    @current_token[:data][-1][1] += data
    @state = :attribute_value_unquoted_state
  end
  true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 723
# Skips whitespace after "DOCTYPE" and begins collecting the name.
def before_doctype_name_state
  data = @stream.char
  return true if SPACE_CHARACTERS.include?(data)
  case data
  when ">"
    @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:name] = data
    @state = :doctype_name_state
  end
  true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 791
# After the PUBLIC keyword: expect a quoted public identifier.
def before_doctype_public_identifier_state
  data = @stream.char
  return true if SPACE_CHARACTERS.include?(data)
  case data
  when "\""
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_double_quoted_state
  when "'"
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_single_quoted_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 873
# After the SYSTEM keyword: expect a quoted system identifier.
def before_doctype_system_identifier_state
  data = @stream.char
  return true if SPACE_CHARACTERS.include?(data)
  case data
  when "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  when "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 586
# A "bogus comment": everything up to the next ">" becomes a comment
# token (chars_until stops at ">" or :EOF on its own).
def bogus_comment_state
  @token_queue << {:type => :Comment, :data => @stream.chars_until(">")}
  # Swallow the terminating ">" (or the :EOF).
  @stream.char
  @state = :data_state
  true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 947
# Skip the remainder of a malformed DOCTYPE. The token is always marked
# incorrect; it is emitted when ">" or EOF is reached.
# (Cleanup: the EOF branch previously set :correct = false a second
# time, redundantly — it is already set unconditionally below.)
def bogus_doctype_state
  data = @stream.char
  @current_token[:correct] = false
  if data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    # Put the EOF marker back so the data state sees it and terminates.
    @stream.unget(data)
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @token_queue << @current_token
    @state = :data_state
  end
  return true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 343
343:     def close_tag_open_state
      # Reached after "</" has been consumed. In RCDATA/CDATA mode a close
      # tag is only honoured when its name matches the last emitted start
      # tag; otherwise the "</" is emitted as literal character data.
344:       if (@content_model_flag == :RCDATA or @content_model_flag == :CDATA)
345:         if @current_token
346:           char_stack = []
347: 
348:           # So far we know that "</" has been consumed. We now need to know
349:           # whether the next few characters match the name of last emitted
350:           # start tag which also happens to be the current_token. We also need
351:           # to have the character directly after the characters that could
352:           # match the start tag name.
353:           (@current_token[:name].length + 1).times do
354:             char_stack.push(@stream.char)
355:             # Make sure we don't get hit by :EOF
356:             break if char_stack[-1] == :EOF
357:           end
358: 
359:           # Since this is just for checking. We put the characters back on
360:           # the stack.
361:           @stream.unget(char_stack)
362:         end
363: 
        # NOTE: char_stack is only assigned when @current_token is truthy;
        # the `@current_token and ...` guard below short-circuits before
        # reading it otherwise.
364:         if @current_token and
365:           @current_token[:name].downcase == 
366:           char_stack[0...-1].join('').downcase and
367:           (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? char_stack[-1]
368:           # Because the characters are correct we can safely switch to
369:           # PCDATA mode now. This also means we don't have to do it when
370:           # emitting the end tag token.
371:           @content_model_flag = :PCDATA
372:         else
373:           @token_queue << {:type => :Characters, :data => "</"}
374:           @state = :data_state
375: 
376:           # Need to return here since we don't want the rest of the
377:           # method to be walked through.
378:           return true
379:         end
380:       end
381: 
      # PCDATA path (or a matched RCDATA/CDATA close tag): parse the end
      # tag proper.
382:       data = @stream.char
383:       if data == :EOF
384:         @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
385:         @token_queue << {:type => :Characters, :data => "</"}
386:         @state = :data_state
387:       elsif ASCII_LETTERS.include? data
388:         @current_token = {:type => :EndTag, :name => data, :data => []}
389:         @state = :tag_name_state
390:       elsif data == ">"
391:         @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
392:         @state = :data_state
393:       else
394:         # XXX data can be _'_...
395:         @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
396:         @stream.unget(data)
397:         @state = :bogus_comment_state
398:       end
399: 
400:       return true
401:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 671
# Seen one "-" inside a comment: a second "-" may end it.
def comment_end_dash_state
  data = @stream.char
  if data == "-"
    @state = :comment_end_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
    @token_queue << @current_token
    @state = :data_state
  else
    # Not the end after all: the "-" we saw belongs to the comment text.
    @current_token[:data] += "-" + data + @stream.chars_until("-")
    # Consume the following "-" (or :EOF) so a "--" sequence moves us
    # straight to the comment-end state without a spurious ParseError.
    @stream.char
  end
  true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 690
# Seen "--" inside a comment: ">" closes it; anything else is an error.
def comment_end_state
  data = @stream.char
  case data
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when "-"
    @token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
    @current_token[:data] += data
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
    @token_queue << @current_token
    @state = :data_state
  else
    # The "--" was ordinary comment text after all.
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
    @current_token[:data] += "--" + data
    @state = :comment_state
  end
  true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 638
# Seen "<!--" then "-": a second "-" may already end the comment.
def comment_start_dash_state
  data = @stream.char
  case data
  when "-"
    @state = :comment_end_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:data] += '-' + data + @stream.chars_until("-")
    @state = :comment_state
  end
  true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 619
# Just after "<!--": start collecting the comment body.
def comment_start_state
  data = @stream.char
  case data
  when "-"
    @state = :comment_start_dash_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:data] += data + @stream.chars_until("-")
    @state = :comment_state
  end
  true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 657
# Inside a comment body: accumulate text until a "-" (possible end) or
# EOF.
def comment_state
  data = @stream.char
  if data == "-"
    @state = :comment_end_dash_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:data] += data + @stream.chars_until("-")
  end
  true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 137
137:     def consume_entity(from_attribute=false)
      # Consume a character or numeric entity reference that begins right
      # after a "&". Returns the replacement character/string, or nil when
      # no entity could be parsed (consumed characters are then ungot).
138:       char = nil
139:       char_stack = [@stream.char]
140:       if SPACE_CHARACTERS.include?(char_stack[0]) or [:EOF, '<', '&'].include?(char_stack[0])
        # A bare "&" followed by whitespace, EOF, "<" or another "&" is
        # literal text, not an entity: put the character back.
141:         @stream.unget(char_stack)
142:       elsif char_stack[0] == '#'
143:         # We might have a number entity here.
144:         char_stack += [@stream.char, @stream.char]
145:         if char_stack[0 .. 1].include? :EOF
146:           # If we reach the end of the file put everything up to :EOF
147:           # back in the queue
148:           char_stack = char_stack[0...char_stack.index(:EOF)]
149:           @stream.unget(char_stack)
150:           @token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
151:         else
152:           if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
153:             # Hexadecimal entity detected.
154:             @stream.unget(char_stack[2])
155:             char = consume_number_entity(true)
156:           elsif DIGITS.include? char_stack[1]
157:             # Decimal entity detected.
158:             @stream.unget(char_stack[1..-1])
159:             char = consume_number_entity(false)
160:           else
161:             # No number entity detected.
162:             @stream.unget(char_stack)
163:             @token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
164:           end
165:         end
166:       else
167:         # At this point in the process might have named entity. Entities
168:         # are stored in the global variable "entities".
169:         #
170:         # Consume characters and compare to these to a substring of the
171:         # entity names in the list until the substring no longer matches.
172:         filteredEntityList = ENTITIES.keys
        # NOTE(review): e[0].chr is a Ruby 1.8 idiom (String#[] returns a
        # Fixnum there); on 1.9+ e[0] is already a one-character String,
        # for which .chr is a harmless no-op on single characters.
173:         filteredEntityList.reject! {|e| e[0].chr != char_stack[0]}
174:         entityName = nil
175: 
176:         # Try to find the longest entity the string will match to take care
177:         # of &noti for instance.
178:         while char_stack.last != :EOF
179:           name = char_stack.join('')
180:           if filteredEntityList.any? {|e| e[0...name.length] == name}
181:             filteredEntityList.reject! {|e| e[0...name.length] != name}
182:             char_stack.push(@stream.char)
183:           else
184:             break
185:           end
186: 
        # NOTE(review): three different spellings of "last char is a
        # semicolon" appear below (== ';' here, != ?; at 198, != ";" at
        # 202). On Ruby 1.8 `entityName[-1] == ';'` compares a Fixnum to
        # a String and is always false — confirm the intended Ruby
        # version before touching this.
187:           if ENTITIES.include? name
188:             entityName = name
189:             break if entityName[-1] == ';'
190:           end
191:         end
192: 
193:         if entityName != nil
194:           char = ENTITIES[entityName]
195: 
196:           # Check whether or not the last character returned can be
197:           # discarded or needs to be put back.
198:           if entityName[-1] != ?;
199:             @token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
200:           end
201: 
202:           if entityName[-1] != ";" and from_attribute and
203:              (ASCII_LETTERS.include?(char_stack[entityName.length]) or
204:               DIGITS.include?(char_stack[entityName.length]))
205:             @stream.unget(char_stack)
206:             char = '&'
207:           else
208:             @stream.unget(char_stack[entityName.length..-1])
209:           end
210:         else
211:           @token_queue << {:type => :ParseError, :data => "expected-named-entity"}
212:           @stream.unget(char_stack)
213:         end
214:       end
215:       return char
216:     end

This function returns either U+FFFD or the character corresponding to the decimal or hexadecimal representation. It also discards the trailing ";" if present; if the ";" is missing, a :ParseError token is pushed onto the token queue.

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 84
 84:     def consume_number_entity(isHex)
 85: 
 86:       # XXX More need to be done here. For instance, #13 should prolly be
 87:       # converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
 88:       # such. Thoughts on this appreciated.
 89:       allowed = DIGITS
 90:       radix = 10
 91:       if isHex
 92:         allowed = HEX_DIGITS
 93:         radix = 16
 94:       end
 95: 
 96:       char_stack = []
 97: 
 98:       # Consume all the characters that are in range while making sure we
 99:       # don't hit an EOF.
100:       c = @stream.char
101:       while allowed.include?(c) and c != :EOF
102:         char_stack.push(c)
103:         c = @stream.char
104:       end
105: 
106:       # Convert the set of characters consumed to an int.
107:       charAsInt = char_stack.join('').to_i(radix)
108: 
109:       if charAsInt == 13
110:         @token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
111:         charAsInt = 10
112:       elsif (128..159).include? charAsInt
113:         # If the integer is between 127 and 160 (so 128 and bigger and 159
114:         # and smaller) we need to do the "windows trick".
115:         @token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}
116: 
117:         charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
118:       end
119: 
120:       if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343)
121:         char = [charAsInt].pack('U')
122:       else
123:         char = [0xFFFD].pack('U')
124:         @token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => charAsInt}}
125:       end
126: 
127:       # Discard the ; if present. Otherwise, put it back on the queue and
128:       # invoke parse_error on parser.
129:       if c != ";"
130:         @token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
131:         @stream.unget(c)
132:       end
133: 
134:       return char
135:     end

XXX AT Perhaps we should have Hixie run some evaluation on billions of documents to figure out what the order of the various if and elsif statements should be.

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 249
    # Default tokenizer state: emits character data and dispatches to the
    # entity, tag-open and comment-escape machinery depending on the
    # current content model (:PCDATA, :RCDATA, :CDATA or :PLAINTEXT).
    # Returns false only on EOF, which ends tokenization.
    def data_state
      data = @stream.char

      # Track the last four characters seen so the CDATA/RCDATA escape
      # logic below can recognise the "<!--" and "-->" sequences.
      if @content_model_flag == :CDATA or @content_model_flag == :RCDATA
        @lastFourChars << data
        @lastFourChars.shift if @lastFourChars.length > 4
      end

      if data == "&" and [:PCDATA,:RCDATA].include?(@content_model_flag) and !@escapeFlag
          @state = :entity_data_state
      elsif data == "-" && [:CDATA, :RCDATA].include?(@content_model_flag) && !@escapeFlag && @lastFourChars.join('') == "<!--"
          # "<!--" completed inside CDATA/RCDATA: enter the escaped section.
          @escapeFlag = true
          @token_queue << {:type => :Characters, :data => data}
      elsif data == "<" and !@escapeFlag and
        [:PCDATA,:CDATA,:RCDATA].include?(@content_model_flag)
          @state = :tag_open_state
      elsif data == ">" and @escapeFlag and 
        [:CDATA,:RCDATA].include?(@content_model_flag) and
        @lastFourChars[1..-1].join('') == "-->"
          # "-->" completed: leave the escaped section.
          @escapeFlag = false
          @token_queue << {:type => :Characters, :data => data}

      elsif data == :EOF
        # Tokenization ends.
        return false

      elsif SPACE_CHARACTERS.include? data
        # Directly after emitting a token you switch back to the "data
        # state". At that point SPACE_CHARACTERS are important so they are
        # emitted separately.
        # XXX need to check if we don't need a special "spaces" flag on
        # characters.
        @token_queue << {:type => :SpaceCharacters, :data => data + @stream.chars_until(SPACE_CHARACTERS, true)}
      else
        # Plain character data: slurp everything up to the next character
        # that could begin one of the constructs handled above.
        @token_queue << {:type => :Characters, :data => data + @stream.chars_until(%w[& < > -])}
      end
      return true
    end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 743
743:     def doctype_name_state
744:       data = @stream.char
745:       if SPACE_CHARACTERS.include? data
746:         @state = :after_doctype_name_state
747:       elsif data == ">"
748:         @token_queue << @current_token
749:         @state = :data_state
750:       elsif data == :EOF
751:         @token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
752:         @current_token[:correct] = false
753:         @token_queue << @current_token
754:         @state = :data_state
755:       else
756:         @current_token[:name] += data
757:       end
758: 
759:       return true
760:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 819
819:     def doctype_public_identifier_double_quoted_state
820:       data = @stream.char
821:       if data == "\""
822:         @state = :after_doctype_public_identifier_state
823:       elsif data == :EOF
824:         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
825:         @current_token[:correct] = false
826:         @token_queue << @current_token
827:         @state = :data_state
828:       else
829:         @current_token[:publicId] += data
830:       end
831:       return true
832:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 834
834:     def doctype_public_identifier_single_quoted_state
835:       data = @stream.char
836:       if data == "'"
837:         @state = :after_doctype_public_identifier_state
838:       elsif data == :EOF
839:         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
840:         @current_token[:correct] = false
841:         @token_queue << @current_token
842:         @state = :data_state
843:       else
844:         @current_token[:publicId] += data
845:       end
846:       return true
847:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 711
711:     def doctype_state
712:       data = @stream.char
713:       if SPACE_CHARACTERS.include? data
714:         @state = :before_doctype_name_state
715:       else
716:         @token_queue << {:type => :ParseError, :data => "need-space-after-doctype"}
717:         @stream.unget(data)
718:         @state = :before_doctype_name_state
719:       end
720:       return true
721:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 899
899:     def doctype_system_identifier_double_quoted_state
900:       data = @stream.char
901:       if data == "\""
902:         @state = :after_doctype_system_identifier_state
903:       elsif data == :EOF
904:         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
905:         @current_token[:correct] = false
906:         @token_queue << @current_token
907:         @state = :data_state
908:       else
909:         @current_token[:systemId] += data
910:       end
911:       return true
912:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 914
914:     def doctype_system_identifier_single_quoted_state
915:       data = @stream.char
916:       if data == "'"
917:         @state = :after_doctype_system_identifier_state
918:       elsif data == :EOF
919:         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
920:         @current_token[:correct] = false
921:         @token_queue << @current_token
922:         @state = :data_state
923:       else
924:         @current_token[:systemId] += data
925:       end
926:       return true
927:     end

This is where the magic happens.

We do our usual processing through the states, and when we have a token to return we yield it, which pauses processing until the next token is requested.

[Source]

    # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 49
49:     def each
50:       @token_queue = []
51:       # Start processing. When EOF is reached @state will return false
52:       # instead of true and the loop will terminate.
53:       while send @state
54:         yield :type => :ParseError, :data => @stream.errors.shift until @stream.errors.empty?
55:         yield @token_queue.shift until @token_queue.empty?
56:       end
57:     end

This method is a generic handler for emitting the tags. It also sets the state to "data" because that's what's needed after a token has been emitted.

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 231
231:     def emit_current_token
232:       # Add token to the queue to be yielded
233:       token = @current_token
234:       if [:StartTag, :EndTag, :EmptyTag].include?(token[:type])
235:         if @lowercase_element_name
236:           token[:name] = token[:name].downcase
237:         end
238:         @token_queue << token
239:         @state = :data_state
240:       end
241:       
242:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 288
288:     def entity_data_state
289:       entity = consume_entity
290:       if entity
291:         @token_queue << {:type => :Characters, :data => entity}
292:       else
293:         @token_queue << {:type => :Characters, :data => "&"}
294:       end
295:       @state = :data_state
296:       return true
297:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 599
599:     def markup_declaration_open_state
600:       char_stack = [@stream.char, @stream.char]
601:       if char_stack == ["-", "-"]
602:         @current_token = {:type => :Comment, :data => ""}
603:         @state = :comment_start_state
604:       else
605:         5.times { char_stack.push(@stream.char) }
606:         # Put in explicit :EOF check
607:         if !char_stack.include?(:EOF) && char_stack.join("").upcase == "DOCTYPE"
608:           @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
609:           @state = :doctype_state
610:         else
611:           @token_queue << {:type => :ParseError, :data => "expected-dashes-or-doctype"}
612:           @stream.unget(char_stack)
613:           @state = :bogus_comment_state
614:         end
615:       end
616:       return true
617:     end

This method replaces the need for "entityInAttributeValueState".

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 219
219:     def process_entity_in_attribute
220:       entity = consume_entity()
221:       if entity
222:         @current_token[:data][-1][1] += entity
223:       else
224:         @current_token[:data][-1][1] += "&"
225:       end
226:     end

If the next character is a '>', convert the current_token into an EmptyTag.

[Source]

    # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 64
64:     def process_solidus_in_tag
65: 
66:       # We need to consume another character to make sure it's a ">"
67:       data = @stream.char
68: 
69:       if @current_token[:type] == :StartTag and data == ">"
70:         @current_token[:type] = :EmptyTag
71:       else
72:         @token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
73:       end
74: 
75:       # The character we just consumed need to be put back on the stack so it
76:       # doesn't get lost...
77:       @stream.unget(data)
78:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 403
403:     def tag_name_state
404:       data = @stream.char
405:       if SPACE_CHARACTERS.include? data
406:         @state = :before_attribute_name_state
407:       elsif data == :EOF
408:         @token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
409:         emit_current_token
410:       elsif ASCII_LETTERS.include? data
411:         @current_token[:name] += data + @stream.chars_until(ASCII_LETTERS, true)
412:       elsif data == ">"
413:         emit_current_token
414:       elsif data == "/"
415:         process_solidus_in_tag
416:         @state = :before_attribute_name_state
417:       else
418:         @current_token[:name] += data
419:       end
420:       return true
421:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 299
    # Handles the character following "<".  In PCDATA this dispatches to
    # markup declarations ("!"), close tags ("/"), tag names (ASCII
    # letters) and several error recoveries; in RCDATA/CDATA only "</" is
    # meaningful and any other "<" is emitted as literal character data.
    def tag_open_state
      data = @stream.char
      if @content_model_flag == :PCDATA
        if data == "!"
          @state = :markup_declaration_open_state
        elsif data == "/"
          @state = :close_tag_open_state
        elsif data != :EOF and ASCII_LETTERS.include? data
          # Start of a tag name; the rest is collected in tag_name_state.
          @current_token = {:type => :StartTag, :name => data, :data => []}
          @state = :tag_name_state
        elsif data == ">"
          # XXX In theory it could be something besides a tag name. But
          # do we really care?
          @token_queue << {:type => :ParseError, :data =>       "expected-tag-name-but-got-right-bracket"}
          @token_queue << {:type => :Characters, :data => "<>"}
          @state = :data_state
        elsif data == "?"
          # XXX In theory it could be something besides a tag name. But
          # do we really care?
          @token_queue.push({:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"})
          @stream.unget(data)
          @state = :bogus_comment_state
        else
          # XXX
          @token_queue << {:type => :ParseError, :data => "expected-tag-name"}
          @token_queue << {:type => :Characters, :data => "<"}
          @stream.unget(data)
          @state = :data_state
        end
      else
        # We know the content model flag is set to either RCDATA or CDATA
        # now because this state can never be entered with the PLAINTEXT
        # flag.
        if data == "/"
          @state = :close_tag_open_state
        else
          # A "<" that does not open a close tag is literal text here.
          @token_queue << {:type => :Characters, :data => "<"}
          @stream.unget(data)
          @state = :data_state
        end
      end
      return true
    end

[Validate]