Class HTML5::HTMLTokenizer
In: lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb
Parent: Object

This class takes care of tokenizing HTML.

  • @current_token Holds the token that is currently being processed.
  • @state Holds the name (a Symbol, initially :data_state) of the state method to be invoked next.
  • @states Holds a mapping between states and methods that implement the state.
  • @stream Points to HTMLInputStream object.

Methods

Attributes

content_model_flag  [RW] 
current_token  [RW] 
stream  [R] 

Public Class methods

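Creates a tokenizer for the given stream, which is wrapped in an HTMLInputStream together with the options. Element and attribute names are lowercased unless :lowercase_element_name or :lowercase_attr_name is explicitly set to false. (XXX this documentation still needs to be completed.)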

[Source]

    # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 26
26:     def initialize(stream, options = {})
27:       @stream = HTMLInputStream.new(stream, options)
28: 
29:       # Setup the initial tokenizer state
30:       @content_model_flag = :PCDATA
31:       @state              = :data_state
32:       @escapeFlag         = false
33:       @lastFourChars      = []
34: 
35:       # The current token being created
36:       @current_token = nil
37: 
38:       # Tokens to be processed.
39:       @token_queue             = []
40:       @lowercase_element_name = options[:lowercase_element_name] != false
41:       @lowercase_attr_name    = options[:lowercase_attr_name]    != false
42:     end

Public Instance methods

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 491
491:     def after_attribute_name_state
492:       data = @stream.char
493:       if SPACE_CHARACTERS.include? data
494:         @stream.chars_until(SPACE_CHARACTERS, true)
495:       elsif data == "="
496:         @state = :before_attribute_value_state
497:       elsif data == ">"
498:         emit_current_token
499:       elsif data == :EOF
500:         @token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
501:         emit_current_token
502:       elsif ASCII_LETTERS.include? data
503:         @current_token[:data].push([data, ""])
504:         @state = :attribute_name_state
505:       elsif data == "/"
506:         process_solidus_in_tag
507:         @state = :before_attribute_name_state
508:       else
509:         @current_token[:data].push([data, ""])
510:         @state = :attribute_name_state
511:       end
512:       return true
513:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 762
762:     def after_doctype_name_state
763:       data = @stream.char
764:       if SPACE_CHARACTERS.include? data
765:       elsif data == ">"
766:         @token_queue << @current_token
767:         @state = :data_state
768:       elsif data == :EOF
769:         @current_token[:correct] = false
770:         @stream.unget(data)
771:         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
772:         @token_queue << @current_token
773:         @state = :data_state
774:       else
775:         char_stack = [data]  
776:         5.times { char_stack << stream.char }
777:         token = char_stack.join('').tr(ASCII_UPPERCASE,ASCII_LOWERCASE)
778:         if token == "public" and !char_stack.include?(:EOF)
779:           @state = :before_doctype_public_identifier_state
780:         elsif token == "system" and !char_stack.include?(:EOF)
781:           @state = :before_doctype_system_identifier_state
782:         else
783:           @stream.unget(char_stack)
784:           @token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", "datavars" => {"data" => data}}
785:           @state = :bogus_doctype_state
786:         end
787:       end
788:       return true
789:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 849
849:     def after_doctype_public_identifier_state
850:       data = @stream.char
851:       if SPACE_CHARACTERS.include?(data)
852:       elsif data == "\""
853:         @current_token[:systemId] = ""
854:         @state = :doctype_system_identifier_double_quoted_state
855:       elsif data == "'"
856:         @current_token[:systemId] = ""
857:         @state = :doctype_system_identifier_single_quoted_state
858:       elsif data == ">"
859:         @token_queue << @current_token
860:         @state = :data_state
861:       elsif data == :EOF
862:         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
863:         @current_token[:correct] = false
864:         @token_queue << @current_token
865:         @state = :data_state
866:       else
867:         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
868:         @state = :bogus_doctype_state
869:       end
870:       return true
871:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 929
929:     def after_doctype_system_identifier_state
930:       data = @stream.char
931:       if SPACE_CHARACTERS.include?(data)
932:       elsif data == ">"
933:         @token_queue << @current_token
934:         @state = :data_state
935:       elsif data == :EOF
936:         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
937:         @current_token[:correct] = false
938:         @token_queue << @current_token
939:         @state = :data_state
940:       else
941:         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
942:         @state = :bogus_doctype_state
943:       end
944:       return true
945:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 444
444:     def attribute_name_state
445:       data = @stream.char
446:       leavingThisState = true
447:       emitToken = false
448:       if data == "="
449:         @state = :before_attribute_value_state
450:       elsif data == :EOF
451:         @token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
452:         @state = :data_state
453:         emitToken = true
454:       elsif ASCII_LETTERS.include? data
455:         @current_token[:data][-1][0] += data + @stream.chars_until(ASCII_LETTERS, true)
456:         leavingThisState = false
457:       elsif data == ">"
458:         # XXX If we emit here the attributes are converted to a dict
459:         # without being checked and when the code below runs we error
460:         # because data is a dict not a list
461:         emitToken = true
462:       elsif SPACE_CHARACTERS.include? data
463:         @state = :after_attribute_name_state
464:       elsif data == "/"
465:         process_solidus_in_tag
466:         @state = :before_attribute_name_state
467:       else
468:         @current_token[:data][-1][0] += data
469:         leavingThisState = false
470:       end
471: 
472:       if leavingThisState
473:         # Attributes are not dropped at this stage. That happens when the
474:         # start tag token is emitted so values can still be safely appended
475:         # to attributes, but we do want to report the parse error in time.
476:         if @lowercase_attr_name
477:             @current_token[:data][-1][0] = @current_token[:data].last.first.downcase
478:         end
479:         @current_token[:data][0...-1].each {|name,value|
480:           if @current_token[:data].last.first == name
481:             @token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
482:             break # don't report an error more than once
483:           end
484:         }
485:         # XXX Fix for above XXX
486:         emit_current_token if emitToken
487:       end
488:       return true
489:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 538
538:     def attribute_value_double_quoted_state
539:       data = @stream.char
540:       if data == "\""
541:         @state = :before_attribute_name_state
542:       elsif data == "&"
543:         process_entity_in_attribute
544:       elsif data == :EOF
545:         @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
546:         emit_current_token
547:       else
548:         @current_token[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
549:       end
550:       return true
551:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 553
553:     def attribute_value_single_quoted_state
554:       data = @stream.char
555:       if data == "'"
556:         @state = :before_attribute_name_state
557:       elsif data == "&"
558:         process_entity_in_attribute
559:       elsif data == :EOF
560:         @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
561:         emit_current_token
562:       else
563:         @current_token[:data][-1][1] += data +\
564:           @stream.chars_until(["'", "&"])
565:       end
566:       return true
567:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 569
569:     def attribute_value_unquoted_state
570:       data = @stream.char
571:       if SPACE_CHARACTERS.include? data
572:         @state = :before_attribute_name_state
573:       elsif data == "&"
574:         process_entity_in_attribute
575:       elsif data == ">"
576:         emit_current_token
577:       elsif data == :EOF
578:         @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
579:         emit_current_token
580:       else
581:         @current_token[:data][-1][1] += data +  @stream.chars_until(["&", ">","<"] + SPACE_CHARACTERS)
582:       end
583:       return true
584:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 423
423:     def before_attribute_name_state
424:       data = @stream.char
425:       if SPACE_CHARACTERS.include? data
426:         @stream.chars_until(SPACE_CHARACTERS, true)
427:       elsif data == :EOF
428:         @token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
429:         emit_current_token
430:       elsif ASCII_LETTERS.include? data
431:         @current_token[:data].push([data, ""])
432:         @state = :attribute_name_state
433:       elsif data == ">"
434:         emit_current_token
435:       elsif data == "/"
436:         process_solidus_in_tag
437:       else
438:         @current_token[:data].push([data, ""])
439:         @state = :attribute_name_state
440:       end
441:       return true
442:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 515
515:     def before_attribute_value_state
516:       data = @stream.char
517:       if SPACE_CHARACTERS.include? data
518:         @stream.chars_until(SPACE_CHARACTERS, true)
519:       elsif data == "\""
520:         @state = :attribute_value_double_quoted_state
521:       elsif data == "&"
522:         @state = :attribute_value_unquoted_state
523:         @stream.unget(data);
524:       elsif data == "'"
525:         @state = :attribute_value_single_quoted_state
526:       elsif data == ">"
527:         emit_current_token
528:       elsif data == :EOF
529:         @token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
530:         emit_current_token
531:       else
532:         @current_token[:data][-1][1] += data
533:         @state = :attribute_value_unquoted_state
534:       end
535:       return true
536:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 723
723:     def before_doctype_name_state
724:       data = @stream.char
725:       if SPACE_CHARACTERS.include? data
726:       elsif data == ">"
727:         @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
728:         @current_token[:correct] = false
729:         @token_queue << @current_token
730:         @state = :data_state
731:       elsif data == :EOF
732:         @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
733:         @current_token[:correct] = false
734:         @token_queue << @current_token
735:         @state = :data_state
736:       else
737:         @current_token[:name] = data
738:         @state = :doctype_name_state
739:       end
740:       return true
741:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 791
791:     def before_doctype_public_identifier_state
792:       data = @stream.char
793: 
794:       if SPACE_CHARACTERS.include?(data)
795:       elsif data == "\""
796:         @current_token[:publicId] = ""
797:         @state = :doctype_public_identifier_double_quoted_state
798:       elsif data == "'"
799:         @current_token[:publicId] = ""
800:         @state = :doctype_public_identifier_single_quoted_state
801:       elsif data == ">"
802:         @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
803:         @current_token[:correct] = false
804:         @token_queue << @current_token
805:         @state = :data_state
806:       elsif data == :EOF
807:         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
808:         @current_token[:correct] = false
809:         @token_queue << @current_token
810:         @state = :data_state
811:       else
812:         @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
813:         @state = :bogus_doctype_state
814:       end
815: 
816:       return true
817:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 873
873:     def before_doctype_system_identifier_state
874:       data = @stream.char
875:       if SPACE_CHARACTERS.include?(data)
876:       elsif data == "\""
877:         @current_token[:systemId] = ""
878:         @state = :doctype_system_identifier_double_quoted_state
879:       elsif data == "'"
880:         @current_token[:systemId] = ""
881:         @state = :doctype_system_identifier_single_quoted_state
882:       elsif data == ">"
883:         @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
884:         @current_token[:correct] = false
885:         @token_queue << @current_token
886:         @state = :data_state
887:       elsif data == :EOF
888:         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
889:         @current_token[:correct] = false
890:         @token_queue << @current_token
891:         @state = :data_state
892:       else
893:         @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
894:         @state = :bogus_doctype_state
895:       end
896:       return true
897:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 586
586:     def bogus_comment_state
587:       # Make a new comment token and give it as value all the characters
588:       # until the first > or :EOF (chars_until checks for :EOF automatically)
589:       # and emit it.
590:       @token_queue << {:type => :Comment, :data => @stream.chars_until((">"))}
591: 
592:       # Eat the character directly after the bogus comment which is either a
593:       # ">" or an :EOF.
594:       @stream.char
595:       @state = :data_state
596:       return true
597:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 947
947:     def bogus_doctype_state
948:       data = @stream.char
949:       @current_token[:correct] = false
950:       if data == ">"
951:         @token_queue << @current_token
952:         @state = :data_state
953:       elsif data == :EOF
954:         # XXX EMIT
955:         @stream.unget(data)
956:         @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
957:         @current_token[:correct] = false
958:         @token_queue << @current_token
959:         @state = :data_state
960:       end
961:       return true
962:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 343
343:     def close_tag_open_state
344:       if (@content_model_flag == :RCDATA or @content_model_flag == :CDATA)
345:         if @current_token
346:           char_stack = []
347: 
348:           # So far we know that "</" has been consumed. We now need to know
349:           # whether the next few characters match the name of last emitted
350:           # start tag which also happens to be the current_token. We also need
351:           # to have the character directly after the characters that could
352:           # match the start tag name.
353:           (@current_token[:name].length + 1).times do
354:             char_stack.push(@stream.char)
355:             # Make sure we don't get hit by :EOF
356:             break if char_stack[-1] == :EOF
357:           end
358: 
359:           # Since this is just for checking. We put the characters back on
360:           # the stack.
361:           @stream.unget(char_stack)
362:         end
363: 
364:         if @current_token and
365:           @current_token[:name].downcase == 
366:           char_stack[0...-1].join('').downcase and
367:           (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? char_stack[-1]
368:           # Because the characters are correct we can safely switch to
369:           # PCDATA mode now. This also means we don't have to do it when
370:           # emitting the end tag token.
371:           @content_model_flag = :PCDATA
372:         else
373:           @token_queue << {:type => :Characters, :data => "</"}
374:           @state = :data_state
375: 
376:           # Need to return here since we don't want the rest of the
377:           # method to be walked through.
378:           return true
379:         end
380:       end
381: 
382:       data = @stream.char
383:       if data == :EOF
384:         @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
385:         @token_queue << {:type => :Characters, :data => "</"}
386:         @state = :data_state
387:       elsif ASCII_LETTERS.include? data
388:         @current_token = {:type => :EndTag, :name => data, :data => []}
389:         @state = :tag_name_state
390:       elsif data == ">"
391:         @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
392:         @state = :data_state
393:       else
394:         # XXX data can be _'_...
395:         @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
396:         @stream.unget(data)
397:         @state = :bogus_comment_state
398:       end
399: 
400:       return true
401:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 671
671:     def comment_end_dash_state
672:       data = @stream.char
673:       if data == "-"
674:         @state = :comment_end_state
675:       elsif data == :EOF
676:         @token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
677:         @token_queue << @current_token
678:         @state = :data_state
679:       else
680:         @current_token[:data] += "-" + data +\
681:           @stream.chars_until("-")
682:         # Consume the next character which is either a "-" or an :EOF as
683:         # well so if there's a "-" directly after the "-" we go nicely to
684:         # the "comment end state" without emitting a ParseError there.
685:         @stream.char
686:       end
687:       return true
688:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 690
690:     def comment_end_state
691:       data = @stream.char
692:       if data == ">"
693:         @token_queue << @current_token
694:         @state = :data_state
695:       elsif data == "-"
696:         @token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
697:         @current_token[:data] += data
698:       elsif data == :EOF
699:         @token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
700:         @token_queue << @current_token
701:         @state = :data_state
702:       else
703:         # XXX
704:         @token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
705:         @current_token[:data] += "--" + data
706:         @state = :comment_state
707:       end
708:       return true
709:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 638
638:     def comment_start_dash_state
639:         data = @stream.char
640:         if data == "-"
641:             @state = :comment_end_state
642:         elsif data == ">"
643:             @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
644:             @token_queue << @current_token
645:             @state = :data_state
646:         elsif data == :EOF
647:             @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
648:             @token_queue << @current_token
649:             @state = :data_state
650:         else
651:             @current_token[:data] += '-' + data + @stream.chars_until("-")
652:             @state = :comment_state
653:         end
654:         return true
655:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 619
619:     def comment_start_state
620:         data = @stream.char
621:         if data == "-"
622:             @state = :comment_start_dash_state
623:         elsif data == ">"
624:             @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
625:             @token_queue << @current_token
626:             @state = :data_state
627:         elsif data == :EOF
628:             @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
629:             @token_queue << @current_token
630:             @state = :data_state
631:         else
632:             @current_token[:data] += data + @stream.chars_until("-")
633:             @state = :comment_state
634:         end
635:         return true
636:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 657
657:     def comment_state
658:       data = @stream.char
659:       if data == "-"
660:         @state = :comment_end_dash_state
661:       elsif data == :EOF
662:         @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
663:         @token_queue << @current_token
664:         @state = :data_state
665:       else
666:         @current_token[:data] += data + @stream.chars_until("-")
667:       end
668:       return true
669:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 137
137:     def consume_entity(from_attribute=false)
138:       char = nil
139:       char_stack = [@stream.char]
140:       if SPACE_CHARACTERS.include?(char_stack[0]) or [:EOF, '<', '&'].include?(char_stack[0])
141:         @stream.unget(char_stack)
142:       elsif char_stack[0] == '#'
143:         # We might have a number entity here.
144:         char_stack += [@stream.char, @stream.char]
145:         if char_stack[0 .. 1].include? :EOF
146:           # If we reach the end of the file put everything up to :EOF
147:           # back in the queue
148:           char_stack = char_stack[0...char_stack.index(:EOF)]
149:           @stream.unget(char_stack)
150:           @token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
151:         else
152:           if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
153:             # Hexadecimal entity detected.
154:             @stream.unget(char_stack[2])
155:             char = consume_number_entity(true)
156:           elsif DIGITS.include? char_stack[1]
157:             # Decimal entity detected.
158:             @stream.unget(char_stack[1..-1])
159:             char = consume_number_entity(false)
160:           else
161:             # No number entity detected.
162:             @stream.unget(char_stack)
163:             @token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
164:           end
165:         end
166:       else
167:         # At this point in the process might have named entity. Entities
168:         # are stored in the global variable "entities".
169:         #
170:         # Consume characters and compare to these to a substring of the
171:         # entity names in the list until the substring no longer matches.
172:         filteredEntityList = ENTITIES.keys
173:         filteredEntityList.reject! {|e| e[0].chr != char_stack[0]}
174:         entityName = nil
175: 
176:         # Try to find the longest entity the string will match to take care
177:         # of &noti for instance.
178:         while char_stack.last != :EOF
179:           name = char_stack.join('')
180:           if filteredEntityList.any? {|e| e[0...name.length] == name}
181:             filteredEntityList.reject! {|e| e[0...name.length] != name}
182:             char_stack.push(@stream.char)
183:           else
184:             break
185:           end
186: 
187:           if ENTITIES.include? name
188:             entityName = name
189:             break if entityName[-1] == ';'
190:           end
191:         end
192: 
193:         if entityName != nil
194:           char = ENTITIES[entityName]
195: 
196:           # Check whether or not the last character returned can be
197:           # discarded or needs to be put back.
198:           if entityName[-1] != ?;
199:             @token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
200:           end
201: 
202:           if entityName[-1] != ";" and from_attribute and
203:              (ASCII_LETTERS.include?(char_stack[entityName.length]) or
204:               DIGITS.include?(char_stack[entityName.length]))
205:             @stream.unget(char_stack)
206:             char = '&'
207:           else
208:             @stream.unget(char_stack[entityName.length..-1])
209:           end
210:         else
211:           @token_queue << {:type => :ParseError, :data => "expected-named-entity"}
212:           @stream.unget(char_stack)
213:         end
214:       end
215:       return char
216:     end

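This function returns either U+FFFD or the character based on the decimal or hexadecimal representation. It also discards ";" if present. If it is not present, a "numeric-entity-without-semicolon" ParseError token is appended to @token_queue and the character that was read instead is put back on the stream.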

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 84
 84:     def consume_number_entity(isHex)
 85: 
 86:       # XXX More need to be done here. For instance, #13 should prolly be
 87:       # converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
 88:       # such. Thoughts on this appreciated.
 89:       allowed = DIGITS
 90:       radix = 10
 91:       if isHex
 92:         allowed = HEX_DIGITS
 93:         radix = 16
 94:       end
 95: 
 96:       char_stack = []
 97: 
 98:       # Consume all the characters that are in range while making sure we
 99:       # don't hit an EOF.
100:       c = @stream.char
101:       while allowed.include?(c) and c != :EOF
102:         char_stack.push(c)
103:         c = @stream.char
104:       end
105: 
106:       # Convert the set of characters consumed to an int.
107:       charAsInt = char_stack.join('').to_i(radix)
108: 
109:       if charAsInt == 13
110:         @token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
111:         charAsInt = 10
112:       elsif (128..159).include? charAsInt
113:         # If the integer is between 127 and 160 (so 128 and bigger and 159
114:         # and smaller) we need to do the "windows trick".
115:         @token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}
116: 
117:         charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
118:       end
119: 
120:       if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343)
121:         char = [charAsInt].pack('U')
122:       else
123:         char = [0xFFFD].pack('U')
124:         @token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => charAsInt}}
125:       end
126: 
127:       # Discard the ; if present. Otherwise, put it back on the queue and
128:       # invoke parse_error on parser.
129:       if c != ";"
130:         @token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
131:         @stream.unget(c)
132:       end
133: 
134:       return char
135:     end

XXX AT Perhaps we should have Hixie run some evaluation on billions of documents to figure out what the order of the various if and elsif statements should be.

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 249
249:     def data_state
250:       data = @stream.char
251: 
252:       if @content_model_flag == :CDATA or @content_model_flag == :RCDATA
253:         @lastFourChars << data
254:         @lastFourChars.shift if @lastFourChars.length > 4
255:       end
256: 
257:       if data == "&" and [:PCDATA,:RCDATA].include?(@content_model_flag) and !@escapeFlag
258:           @state = :entity_data_state
259:       elsif data == "-" && [:CDATA, :RCDATA].include?(@content_model_flag) && !@escapeFlag && @lastFourChars.join('') == "<!--"
260:           @escapeFlag = true
261:           @token_queue << {:type => :Characters, :data => data}
262:       elsif data == "<" and !@escapeFlag and
263:         [:PCDATA,:CDATA,:RCDATA].include?(@content_model_flag)
264:           @state = :tag_open_state
265:       elsif data == ">" and @escapeFlag and 
266:         [:CDATA,:RCDATA].include?(@content_model_flag) and
267:         @lastFourChars[1..-1].join('') == "-->"
268:           @escapeFlag = false
269:           @token_queue << {:type => :Characters, :data => data}
270: 
271:       elsif data == :EOF
272:         # Tokenization ends.
273:         return false
274: 
275:       elsif SPACE_CHARACTERS.include? data
276:         # Directly after emitting a token you switch back to the "data
277:         # state". At that point SPACE_CHARACTERS are important so they are
278:         # emitted separately.
279:         # XXX need to check if we don't need a special "spaces" flag on
280:         # characters.
281:         @token_queue << {:type => :SpaceCharacters, :data => data + @stream.chars_until(SPACE_CHARACTERS, true)}
282:       else
283:         @token_queue << {:type => :Characters, :data => data + @stream.chars_until(%w[& < > -])}
284:       end
285:       return true
286:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 743
743:     def doctype_name_state
744:       data = @stream.char
745:       if SPACE_CHARACTERS.include? data
746:         @state = :after_doctype_name_state
747:       elsif data == ">"
748:         @token_queue << @current_token
749:         @state = :data_state
750:       elsif data == :EOF
751:         @token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
752:         @current_token[:correct] = false
753:         @token_queue << @current_token
754:         @state = :data_state
755:       else
756:         @current_token[:name] += data
757:       end
758: 
759:       return true
760:     end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 819
819:     def doctype_public_identifier_double_quoted_state
820:       data = @stream.char
821:       if data ==