Class HTML5::HTMLParser
In: lib/feed_tools/vendor/html5/lib/html5/html5parser.rb
Parent: Object

HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML.

Methods

Attributes

errors  [R] 
first_start_tag  [RW] 
inner_html  [RW] 
insert_from_table  [RW] 
last_phase  [RW] 
phase  [RW] 
phases  [R] 
tokenizer  [R] 
tree  [R] 

Public Class methods

Options: :strict - raise an exception when a parse error is encountered. :tree - a treebuilder class controlling the type of tree that will be returned. Built-in treebuilders can be accessed through HTML5::TreeBuilders[treeType].

[Source]

    # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 41
# Builds a parser and instantiates one phase object per entry in @@phases.
#
# options - Hash of settings. Every key/value pair is copied verbatim into
#           the instance variable of the same name, so any of the defaults
#           assigned below (@strict, @tokenizer, @tree, ...) can be
#           overridden. The accompanying documentation names two keys:
#           :strict - raise an exception when a parse error is encountered
#           :tree   - treebuilder class to instantiate
def initialize(options = {})
  @strict = false
  @errors = []

  @tokenizer = HTMLTokenizer
  @tree = TreeBuilders::REXML::TreeBuilder

  options.each { |name, value| instance_variable_set("@#{name}", value) }

  # Default the casing flags only when the caller did not supply them.
  # instance_variable_defined? replaces instance_variables.include?("@..."):
  # instance_variables returns Symbols on Ruby 1.9+, so the String lookup
  # never matched and a caller-supplied value was always reset to nil.
  @lowercase_attr_name    = nil unless instance_variable_defined?("@lowercase_attr_name")
  @lowercase_element_name = nil unless instance_variable_defined?("@lowercase_element_name")

  # @tree holds a class up to this point; replace it with an instance.
  @tree = @tree.new

  # Map each phase name to an instance of HTML5::<Name>Phase, where <Name>
  # is the phase name with its first character upcased.
  @phases = @@phases.inject({}) do |phases, phase_name|
    phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
    phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
    phases
  end
end

[Source]

    # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 23
# Convenience entry point: builds a parser from +options+ and parses
# +stream+ as a complete document.
#
# stream  - handed through to the instance method #parse.
# options - Hash passed to the constructor. The :encoding entry, if present,
#           is taken out first and handed to #parse as the encoding.
#
# Returns whatever the instance method #parse returns.
def self.parse(stream, options = {})
  # Work on a copy so the caller's hash keeps its :encoding entry.
  options = options.dup
  encoding = options.delete(:encoding)
  new(options).parse(stream, encoding)
end

[Source]

    # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 28
# Convenience entry point: builds a parser from +options+ and parses
# +stream+ as a fragment.
#
# stream  - handed through to the instance method #parse_fragment.
# options - Hash passed to the constructor. Two entries are taken out first:
#           :container - container element name ('div' when absent or nil)
#           :encoding  - handed to #parse_fragment as the encoding
#
# Returns whatever the instance method #parse_fragment returns.
def self.parse_fragment(stream, options = {})
  # Work on a copy so the caller's hash keeps its entries.
  options = options.dup
  container = options.delete(:container) || 'div'
  encoding = options.delete(:encoding)
  new(options).parse_fragment(stream, container, encoding)
end

Public Instance methods

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 245
# Identity helper: hands +string+ back untouched.
# NOTE(review): the name suggests a gettext-style translation hook -- confirm.
def _(string)
  string
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 61
 # Shared driver behind #parse and #parse_fragment: resets parser state,
 # builds a tokenizer over +stream+, feeds every token to the current phase
 # and finally signals EOF.
 #
 # stream     - input handed to the tokenizer.
 # inner_html - truthy for a fragment parse, falsy for a full document.
 # encoding   - forwarded to the tokenizer as :encoding.
 # container  - container element name; only read when inner_html is truthy.
 #
 # Returns the result of the final phase's process_eof.
 def _parse(stream, inner_html, encoding, container = 'div')
   @tree.reset
   @first_start_tag = false
   @errors = []

   # On a repeated parse @tokenizer holds the previous tokenizer instance;
   # fall back to its class so a fresh one can be built.
   @tokenizer = @tokenizer.class unless Class === @tokenizer
   @tokenizer = @tokenizer.new(stream,
     :encoding => encoding,
     :parseMeta => !inner_html,
     :lowercase_attr_name => @lowercase_attr_name,
     :lowercase_element_name => @lowercase_element_name)

   if inner_html
     @inner_html = container.downcase

     # The container element decides how the tokenizer treats its content.
     @tokenizer.content_model_flag =
       case @inner_html
       when 'title', 'textarea'
         :RCDATA
       when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
         :CDATA
       when 'plaintext'
         :PLAINTEXT
       else
         # content_model_flag already is PCDATA
         :PCDATA
       end

     @phase = @phases[:rootElement]
     @phase.insert_html_element
     reset_insertion_mode
   else
     @inner_html = false
     @phase = @phases[:initial]
   end

   # We only seem to have InBodyPhase testcases where the following is
   # relevant ... need others too
   @last_phase = nil

   # XXX This is temporary for the moment so there isn't any other
   # changes needed for the parser to work with the iterable tokenizer
   @tokenizer.each do |raw_token|
     token = normalize_token(raw_token)
     type = token[:type]
     handler = "process#{type}"

     # @phase is re-read for every token (it is an accessor and is also
     # reassigned by reset_insertion_mode).
     case type
     when :Characters, :SpaceCharacters, :Comment
       @phase.send(handler, token[:data])
     when :StartTag
       @phase.send(handler, token[:name], token[:data])
     when :EndTag
       @phase.send(handler, token[:name])
     when :Doctype
       @phase.send(handler, token[:name], token[:publicId],
         token[:systemId], token[:correct])
     else
       # Any other token type is reported as a parse error.
       parse_error(token[:data], token[:datavars])
     end
   end

   # When the loop finishes it's EOF
   @phase.process_eof
 end

HTML5-specific normalizations applied to the token stream.

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 157
# Applies HTML5-specific normalizations to a single tokenizer token.
#
# token - Hash with a :type entry; tag tokens also carry :name and :data
#         (for tags, :data is a list of [attribute, value] pairs).
#
# * :EmptyTag tokens become :StartTag tokens; "incorrectly-placed-solidus"
#   is reported when the element is not in VOID_ELEMENTS.
# * Start and end tag names are lowercased.
# * A non-empty start-tag attribute list is folded into a Hash keyed by
#   lowercased attribute name; the first occurrence of a name wins. An
#   empty list is left as it is.
# * Attributes on an end tag are reported as "attributes-in-end-tag".
#
# Returns the same token Hash, modified in place.
def normalize_token(token)
  if token[:type] == :EmptyTag
    # A trailing solidus is only acceptable on void elements. The name has
    # not been lowercased yet at this point, so compare a lowercased copy;
    # otherwise <BR/> would be reported as an error.
    # NOTE(review): assumes VOID_ELEMENTS holds lowercase names -- confirm.
    unless VOID_ELEMENTS.include?(token[:name].downcase)
      parse_error("incorrectly-placed-solidus")
    end

    token[:type] = :StartTag
  end

  case token[:type]
  when :StartTag
    token[:name] = token[:name].downcase

    unless token[:data].empty?
      # Later pairs overwrite earlier ones when the Hash is built, so the
      # list is reversed first to let the first occurrence of a name win.
      # Hash[pairs] keeps each value intact (no recursive flatten, no splat
      # of every attribute into the argument list).
      pairs = token[:data].reverse.map { |attr, value| [attr.downcase, value] }
      token[:data] = Hash[pairs]
    end
  when :EndTag
    parse_error("attributes-in-end-tag") unless token[:data].empty?
    token[:name] = token[:name].downcase
  end

  token
end

Parse an HTML document into a well-formed tree.

stream - a filelike object or string containing the HTML to be parsed

The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element)

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 129
# Parses +stream+ as a complete document.
#
# stream   - input handed to _parse.
# encoding - optional encoding, forwarded to _parse (nil when omitted).
#
# Returns the tree builder's document (@tree.get_document).
def parse(stream, encoding=nil)
  # false => this is not an inner_html (fragment) parse
  _parse(stream, false, encoding)
  document = @tree.get_document
  document
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 150
# Records a parse error and, when the parser is strict, raises.
#
# code - error identifier (defaults to 'XXX-undefined-error').
# data - extra data stored with the error (defaults to an empty Hash).
#
# Appends [stream position, code, data] to @errors.
# Raises ParseError when @strict is truthy.
def parse_error(code = 'XXX-undefined-error', data = {})
  # XXX The idea is to make data mandatory.
  position = @tokenizer.stream.position
  @errors << [position, code, data]
  raise ParseError if @strict
end

container - name of the element whose inner_html property is being set; if set to nil, defaults to 'div'

stream - a filelike object or string containing the HTML to be parsed

The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element)

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 145
# Parses +stream+ as a fragment, as if it were the inner_html of a
# +container+ element.
#
# stream    - input handed to _parse.
# container - name of the container element; nil is treated as 'div'.
# encoding  - optional encoding, forwarded to _parse (nil when omitted).
#
# Returns the tree builder's fragment (@tree.get_fragment).
def parse_fragment(stream, container='div', encoding=nil)
  # An explicit nil bypasses the parameter default and would otherwise reach
  # container.downcase inside _parse.
  container ||= 'div'
  _parse(stream, true, encoding, container)
  @tree.get_fragment
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 207
# Chooses @phase by walking the open-element stack from the most recently
# opened element back to the first one.
#
# The name of this method is mostly historical. (It's also used in the
# specification.)
def reset_insertion_mode
  reached_root = false

  @tree.open_elements.reverse.each do |element|
    name = element.name

    if element == @tree.open_elements.first
      reached_root = true
      # XXX assert @inner_html
      # For the first open element, anything other than td/th is looked up
      # under the fragment container name instead.
      name = @inner_html unless ['td', 'th'].include?(name)
    end

    # XXX 'select', 'colgroup', 'head' and 'frameset' should only be seen
    # here in the inner_html case (assert @inner_html).

    if @@new_modes.has_key?(name)
      @phase = @phases[@@new_modes[name]]
      break
    end

    if name == 'html'
      @phase = @phases[@tree.head_pointer.nil? ? :beforeHead : :afterHead]
      break
    end

    if reached_root
      @phase = @phases[:inBody]
      break
    end

    # Nothing matched for this element: continue with the next one.
  end
end

[Validate]