Class HTML5::EncodingParser
In: lib/feed_tools/vendor/html5/lib/html5/inputstream.rb
Parent: Object

Mini parser for detecting character encoding from meta elements

Methods

Public Class methods

data - the data to work on for encoding detection (converted with to_s before parsing)

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 412
# Set up a parser over +data+.
#
# data - the input to scan; it is converted with to_s and wrapped in an
#        EncodingBytes instance.
def initialize(data)
  # No encoding has been detected yet.
  @encoding = nil
  @data = EncodingBytes.new(data.to_s)
end

Public Instance methods

Return a [name, value] pair for the next attribute in the stream if one is found, or nil otherwise.

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 514
# Return a [name, value] pair for the next attribute in the stream, or nil
# when the current byte (after skipping whitespace and '/') is '<' or '>'.
#
# Names and values are collected byte by byte; bytes listed in
# ASCII_UPPERCASE are downcased. The value is '' when the attribute has
# none. When the current byte is '<', the position is stepped back by one
# before nil is returned.
#
# NOTE(review): the scanning loops are deliberately `while true` rather than
# Kernel#loop, because Kernel#loop rescues StopIteration; how @data behaves
# at end of input is not visible here -- confirm before changing them.
def get_attribute
  @data.skip(SPACE_CHARACTERS + ['/'])

  if @data.current_byte == '<'
    @data.position -= 1
    return nil
  elsif @data.current_byte == '>'
    return nil
  end

  # Downcase a byte only when it is an ASCII uppercase letter.
  lowercase = lambda do |byte|
    ASCII_UPPERCASE.include?(byte) ? byte.downcase : byte
  end

  attr_name = []
  attr_value = []
  space_found = false
  # Step 5: attribute name
  while true
    byte = @data.current_byte
    # An '=' only terminates the name once the name is non-empty. An Array
    # is always truthy in Ruby, so emptiness has to be tested explicitly;
    # a leading '=' becomes part of the name instead.
    if byte == '=' && !attr_name.empty?
      break
    elsif SPACE_CHARACTERS.include?(byte)
      space_found = true
      break
    elsif ['/', '<', '>'].include?(byte)
      return [attr_name.join(''), '']
    end
    attr_name.push(lowercase.call(byte))
    # Step 6
    @data.position += 1
  end
  # Step 7
  if space_found
    @data.skip
    # Step 8
    unless @data.current_byte == '='
      @data.position -= 1
      return [attr_name.join(''), '']
    end
  end
  # XXX need to advance position in both spaces and value case
  # Step 9: the current byte is '=' here; step over it.
  @data.position += 1
  # Step 10
  @data.skip
  # Step 11
  first_byte = @data.current_byte
  if ["'", '"'].include?(first_byte)
    # 11.1: quoted value -- read up to the matching quote character.
    quote_char = first_byte
    while true
      @data.position += 1
      byte = @data.current_byte
      # 11.3: closing quote; leave the position just past it.
      if byte == quote_char
        @data.position += 1
        return [attr_name.join(''), attr_value.join('')]
      end
      # 11.4 / 11.5
      attr_value.push(lowercase.call(byte))
    end
  elsif ['>', '<'].include?(first_byte)
    return [attr_name.join(''), '']
  end

  # Unquoted value: the first byte is already known to belong to it.
  attr_value.push(lowercase.call(first_byte))
  value_terminators = SPACE_CHARACTERS + ['>', '<']
  while true
    @data.position += 1
    byte = @data.current_byte
    if value_terminators.include?(byte)
      return [attr_name.join(''), attr_value.join('')]
    end
    attr_value.push(lowercase.call(byte))
  end
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 426
# Scan the data for an encoding declaration and return it.
#
# At each position visited by @data.each, the entries of @@method_dispatch
# are tried in order; the handler of the first entry whose key matches at
# the current position (via @data.match_bytes) is invoked. Scanning stops
# as soon as a handler returns a falsy value.
#
# Returns @encoding with surrounding whitespace stripped, or nil when no
# encoding was set.
def get_encoding
  @data.each do |_byte|
    matched = @@method_dispatch.find do |(prefix, _handler)|
      @data.match_bytes(prefix, true)
    end
    # No matching entry means "keep parsing"; otherwise the handler decides.
    break if matched && !send(matched[1])
  end
  @encoding = @encoding.strip unless @encoding.nil?
  @encoding
end

Skip over comments

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 442
# Skip over a comment by jumping to '-->'.
#
# Returns whatever @data.jump_to returns; get_encoding treats a falsy
# result as "stop parsing".
def handle_comment
  @data.jump_to('-->')
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 446
# Inspect the attributes of a meta element for an encoding declaration.
#
# Returns true when parsing should continue (the bytes were not followed by
# a space character, or the attributes ran out without a valid encoding),
# and false once a valid encoding has been stored in @encoding.
def handle_meta
  # "<meta" not followed by a space: not a meta element we care about.
  return true unless SPACE_CHARACTERS.include?(@data.current_byte)

  # Walk the attributes until get_attribute reports that there are no more.
  until (pair = get_attribute).nil?
    name, value = pair
    candidate =
      case name
      when 'charset'
        value
      when 'content'
        ContentAttrParser.new(EncodingBytes.new(value)).parse
      else
        next
      end

    next unless HTML5.is_valid_encoding(candidate)

    @encoding = candidate
    return false
  end
  true
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 508
# Skip the rest of the current construct by jumping to '>'.
#
# Returns whatever @data.jump_to returns.
def handle_other
  @data.jump_to('>')
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 478
# Handle a possible end tag: advance the position by one byte (presumably
# past the '/' -- confirm against the dispatch table), then defer to
# handle_possible_tag with end_tag set to true.
def handle_possible_end_tag
  @data.position += 1
  handle_possible_tag(true)
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 474
# Handle a possible start tag by deferring to handle_possible_tag with
# end_tag set to false.
def handle_possible_start_tag
  handle_possible_tag(false)
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 483
# Skip over a tag that is not of interest for encoding detection.
#
# end_tag - true when called for a possible end tag, false for a possible
#           start tag.
#
# Always returns true, so the caller keeps parsing.
def handle_possible_tag(end_tag)
  if ASCII_LETTERS.include?(@data.current_byte)
    @data.find_next(SPACE_CHARACTERS + ['<', '>'])

    if @data.current_byte == '<'
      # Return to the first step of the overall "two step" algorithm,
      # reprocessing the '<' byte.
      @data.position -= 1
    else
      # Read (and discard) all attributes.
      nil until get_attribute.nil?
    end
  elsif end_tag
    # The next byte is not an ASCII letter: for a possible end tag, step
    # back one byte and treat the fragment as handle_other does. For a
    # possible start tag the fragment is simply ignored.
    @data.position -= 1
    handle_other
  end
  true
end

[Validate]