Module HTML5::HTMLSanitizeModule
In: lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb
Phase XmlElementPhase InTablePhase RootElementPhase AfterHeadPhase InHeadPhase AfterFramesetPhase XmlRootPhase InitialPhase InFramesetPhase InColumnGroupPhase InTableBodyPhase InCaptionPhase BeforeHeadPhase TrailingEndPhase InSelectPhase InCellPhase AfterBodyPhase InBodyPhase InRowPhase Exception SerializeError EOF AssertionError ParseError HTMLSanitizer HTMLTokenizer XhmlRootPhase String EncodingBytes XMLParser XHTMLParser HTMLParser HTMLSerializer XHTMLSerializer TreeWalkers::Base NonRecursiveTreeWalker TreeWalker TreeWalker Base TreeWalker Element DocumentFragment Node CommentNode DocumentType TextNode Document Base::Node Node Node Base::TreeBuilder TreeBuilder TreeBuilder TreeBuilder Element DocumentFragment CommentNode DocumentType TextNode Document Element DocumentFragment CommentNode DocumentType TextNode Document Base OptionalTagFilter InjectMetaCharset WhitespaceFilter HTMLSanitizeFilter HTMLSanitizeModule Enumerable TestData SimpleDelegator HTMLInputStream EncodingParser ContentAttrParser Node TreeBuilder lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb lib/feed_tools/vendor/html5/lib/html5/constants.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb 
lib/feed_tools/vendor/html5/lib/html5/inputstream.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb Hpricot TokenConstructor lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb REXML lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb SimpleTree TreeWalkers HTMLSanitizeModule lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb Hpricot lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb Base lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb REXML lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb SimpleTree TreeBuilders lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb lib/feed_tools/vendor/html5/lib/html5/filters/base.rb lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb Filters Sniffer lib/feed_tools/vendor/html5/tests/preamble.rb TestSupport HTML5 dot/m_75_0.png

This module provides sanitization of XHTML+MathML+SVG and of inline style attributes.

It can be applied either at the Tokenizer stage:

      HTMLParser.parse(html, :tokenizer => HTMLSanitizer)

or, if you already have a parse tree (in this example, a REXML tree), at the Serializer stage:

    tokens = TreeWalkers.get_tree_walker('rexml').new(tree)
    HTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
       :sanitize => true})

Methods

Constants

# Element names that sanitize_token lets through (via ALLOWED_ELEMENTS).
ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br button caption center cite code col colgroup dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label legend li map menu ol optgroup option p pre q s samp select small span strike strong sub sup table tbody td textarea tfoot th thead tr tt u ul var]
# MathML element names, also part of ALLOWED_ELEMENTS.
MATHML_ELEMENTS = %w[maction math merror mfrac mi mmultiscripts mn mo mover mpadded mphantom mprescripts mroot mrow mspace msqrt mstyle msub msubsup msup mtable mtd mtext mtr munder munderover none]
# SVG element names, also part of ALLOWED_ELEMENTS.
SVG_ELEMENTS = %w[a animate animateColor animateMotion animateTransform circle defs desc ellipse font-face font-face-name font-face-src g glyph hkern image linearGradient line marker metadata missing-glyph mpath path polygon polyline radialGradient rect set stop svg switch text title tspan use]
# Attribute names that sanitize_token keeps on allowed elements (via ALLOWED_ATTRIBUTES).
ACCEPTABLE_ATTRIBUTES = %w[abbr accept accept-charset accesskey action align alt axis border cellpadding cellspacing char charoff charset checked cite class clear cols colspan color compact coords datetime dir disabled enctype for frame headers height href hreflang hspace id ismap label lang longdesc maxlength media method multiple name nohref noshade nowrap prompt readonly rel rev rows rowspan rules scope selected shape size span src start style summary tabindex target title type usemap valign value vspace width xml:lang]
# MathML attribute names, also part of ALLOWED_ATTRIBUTES (each name listed once).
MATHML_ATTRIBUTES = %w[actiontype align columnalign columnlines columnspacing columnspan depth display displaystyle equalcolumns equalrows fence fontstyle fontweight frame height linethickness lspace mathbackground mathcolor mathvariant maxsize minsize other rowalign rowlines rowspacing rowspan rspace scriptlevel selection separator stretchy width xlink:href xlink:show xlink:type xmlns xmlns:xlink]
# SVG attribute names, also part of ALLOWED_ATTRIBUTES.
SVG_ATTRIBUTES = %w[accent-height accumulate additive alphabetic arabic-form ascent attributeName attributeType baseProfile bbox begin by calcMode cap-height class color color-rendering content cx cy d dx dy descent display dur end fill fill-rule font-family font-size font-stretch font-style font-variant font-weight from fx fy g1 g2 glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id ideographic k keyPoints keySplines keyTimes lang marker-end marker-mid marker-start markerHeight markerUnits markerWidth mathematical max min name offset opacity orient origin overline-position overline-thickness panose-1 path pathLength points preserveAspectRatio r refX refY repeatCount repeatDur requiredExtensions requiredFeatures restart rotate rx ry slope stemh stemv stop-color stop-opacity strikethrough-position strikethrough-thickness stroke stroke-dasharray stroke-dashoffset stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity stroke-width systemLanguage target text-anchor to transform type u1 u2 underline-position underline-thickness unicode unicode-range units-per-em values version viewBox visibility width widths x x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns xmlns:xlink y y1 y2 zoomAndPan]
# Attributes whose values sanitize_token checks against ALLOWED_PROTOCOLS.
ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href xml:base]
# CSS property names that sanitize_css passes through unchanged (via ALLOWED_CSS_PROPERTIES).
ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color border-bottom-color border-collapse border-color border-left-color border-right-color border-top-color clear color cursor direction display elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space width]
# Keywords sanitize_css accepts inside background/border/margin/padding values (via ALLOWED_CSS_KEYWORDS).
ACCEPTABLE_CSS_KEYWORDS = %w[auto aqua black block blue bold both bottom brown center collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal nowrap olive pointer purple red right solid silver teal top transparent underline white yellow]
# SVG presentation properties sanitize_css passes through (via ALLOWED_SVG_PROPERTIES).
ACCEPTABLE_SVG_PROPERTIES = %w[fill fill-opacity fill-rule stroke stroke-width stroke-linecap stroke-linejoin stroke-opacity]
# URI schemes sanitize_token accepts in ATTR_VAL_IS_URI attributes (via ALLOWED_PROTOCOLS).
ACCEPTABLE_PROTOCOLS = %w[ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]

# subclasses may define their own versions of these constants
ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS

Public Instance methods

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb, line 151
# Sanitizes the value of an inline +style+ attribute.
#
# Any +url(...)+ reference is removed, the whole value is validated against
# two whitelist patterns ("the gauntlet"), and then only those declarations
# are kept whose property is listed in the receiver class's
# ALLOWED_CSS_PROPERTIES or ALLOWED_SVG_PROPERTIES, or which are
# background/border/margin/padding declarations made up solely of allowed
# keywords, hex colours, rgb() values or short numeric lengths.
#
# style:: the raw style value; it is converted with +to_s+, so +nil+ is
#         treated as an empty string.
#
# Returns the cleaned declarations joined by single spaces, or an empty
# string when the value fails validation.
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet
  # Both checks are anchored with \A / \z. In Ruby, ^ and $ match at every
  # line boundary, so with them a value whose first line is harmless would
  # pass validation and smuggle arbitrary content on its following lines.
  return '' unless style =~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
  # Trailing whitespace (e.g. a final newline) stays acceptable.
  return '' unless style =~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|$))*\s*\z/

  # Looked up through self.class so that classes defining their own
  # versions of these constants are honoured.
  allowed_properties = self.class.const_get("ALLOWED_CSS_PROPERTIES")
  allowed_keywords = self.class.const_get("ALLOWED_CSS_KEYWORDS")
  allowed_svg_properties = self.class.const_get("ALLOWED_SVG_PROPERTIES")

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
    next if val.empty?
    prop = prop.downcase
    if allowed_properties.include?(prop)
      clean << "#{prop}: #{val};"
    elsif %w[background border margin padding].include?(prop.split('-')[0])
      # Shorthand families: every whitespace-separated part of the value
      # must be an allowed keyword or look like a colour / short length.
      rejected = val.split.any? do |keyword|
        !allowed_keywords.include?(keyword) &&
          keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
      end
      clean << "#{prop}: #{val};" unless rejected
    elsif allowed_svg_properties.include?(prop)
      clean << "#{prop}: #{val};"
    end
  end

  clean.join(' ')
end

[Source]

     # File lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb, line 110
# Sanitizes a single tokenizer token, modifying and returning it.
#
# * Start/end/empty tags whose name is in the receiver class's
#   ALLOWED_ELEMENTS are kept. Their attributes are reduced to those in
#   ALLOWED_ATTRIBUTES, URI-valued attributes (ATTR_VAL_IS_URI) with a
#   scheme outside ALLOWED_PROTOCOLS are dropped, and a +style+ attribute
#   is passed through sanitize_css.
# * Tags with any other name are turned into a :Characters token holding
#   the tag's markup as text.
# * :Comment tokens have their data emptied.
# * Every other token is returned untouched.
#
# token:: a Hash with a :type key and, for tags, :name and optionally
#         :data (a list of [name, value] attribute pairs).
#
# Returns the same token object.
def sanitize_token(token)
  case token[:type]
  when :StartTag, :EndTag, :EmptyTag
    if self.class.const_get("ALLOWED_ELEMENTS").include?(token[:name])
      if token.has_key? :data
        allowed_attributes = self.class.const_get("ALLOWED_ATTRIBUTES")
        allowed_protocols = self.class.const_get("ALLOWED_PROTOCOLS")

        attrs = Hash[*token[:data].flatten]
        attrs.delete_if { |attr, _value| !allowed_attributes.include?(attr) }
        ATTR_VAL_IS_URI.each do |attr|
          # Work on a binary copy: the pattern below addresses raw bytes
          # (\302[\200-\240] is the UTF-8 encoding of U+0080..U+00A0), and
          # without the /n flag the literal is not a valid regexp under a
          # UTF-8 source encoding.
          raw = CGI.unescapeHTML(attrs[attr].to_s).dup
          raw.force_encoding('BINARY') if raw.respond_to?(:force_encoding)
          # Strip backticks, control characters, whitespace and
          # no-break spaces that could be used to disguise a scheme.
          val_unescaped = raw.gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/n, '').downcase
          if val_unescaped =~ /\A[a-z0-9][-+.a-z0-9]*:/ &&
             !allowed_protocols.include?(val_unescaped.split(':')[0])
            attrs.delete attr
          end
        end
        attrs['style'] = sanitize_css(attrs['style']) if attrs['style']
        token[:data] = attrs.map { |k, v| [k, v] }
      end
      return token
    else
      # Disallowed element: re-serialize the tag as plain character data.
      if token[:type] == :EndTag
        token[:data] = "</#{token[:name]}>"
      elsif token[:data]
        # to_s keeps a nil attribute value from raising inside escapeHTML.
        attrs = token[:data].map { |k, v| " #{k}=\"#{CGI.escapeHTML(v.to_s)}\"" }.join('')
        token[:data] = "<#{token[:name]}#{attrs}>"
      else
        token[:data] = "<#{token[:name]}>"
      end
      # Put the slash just before the closing '>' for empty tags.
      token[:data].insert(-2, '/') if token[:type] == :EmptyTag
      token[:type] = :Characters
      token.delete(:name)
      return token
    end
  when :Comment
    token[:data] = ""
    return token
  else
    return token
  end
end

[Validate]