Module | FeedTools::HtmlHelper |
In: |
lib/feed_tools/helpers/html_helper.rb
|
Methods for escaping, cleaning up, and normalizing HTML content
TIDY_OPTIONS | = | [ :add_xml_decl, :add_xml_space, :alt_text, :assume_xml_procins, :bare, :clean, :css_prefix, :decorate_inferred_ul, :doctype, :drop_empty_paras, :drop_font_tags, :drop_proprietary_attributes, :enclose_block_text, :enclose_text, :escape_cdata, :fix_backslash, :fix_bad_comments, :fix_uri, :hide_comments, :hide_endtags, :indent_cdata, :input_xml, :join_classes, :join_styles, :literal_attributes, :logical_emphasis, :lower_literals, :merge_divs, :ncr, :new_blocklevel_tags, :new_empty_tags, :new_inline_tags, :new_pre_tags, :numeric_entities, :output_html, :output_xhtml, :output_xml, :preserve_entities, :quote_ampersand, :quote_marks, :quote_nbsp, :repeated_attributes, :replace_color, :show_body_only, :uppercase_attributes, :uppercase_tags, :word_2000, :accessibility_check, :show_errors, :show_warnings, :break_before_br, :indent, :indent_attributes, :indent_spaces, :markup, :punctuation_wrap, :split, :tab_size, :vertical_space, :wrap, :wrap_asp, :wrap_attributes, :wrap_jste, :wrap_php, :wrap_script_literals, :wrap_sections, :ascii_chars, :char_encoding, :input_encoding, :language, :newline, :output_bom, :output_encoding, :error_file, :force_output, :gnu_emacs, :gnu_emacs_file, :keep_time, :output_file, :quiet, :slide_style, :tidy_mark, :write_back |
Removes all HTML tags from the HTML-formatted text and unescapes any escaped entities.
# File lib/feed_tools/helpers/html_helper.rb, line 93
#
# Converts HTML-formatted text to plain text: tags are stripped with
# strip_html_tags, entities are decoded with unescape_entities, and curly
# quotes are replaced with their straight ASCII equivalents.
#
# html:: the HTML string to convert, or nil.
#
# Returns the plain-text string, or nil when html is nil. The helpers are
# handed a copy, so the caller's string is never modified.
def self.convert_html_to_plain_text(html)
  return nil if html.nil?
  # The helpers may edit their argument in place; give them a private copy.
  plain_text = FeedTools::HtmlHelper.strip_html_tags(html.dup)
  plain_text = FeedTools::HtmlHelper.unescape_entities(plain_text)
  # Alternation (rather than a character class) keeps the multibyte quote
  # characters intact regardless of regexp encoding mode.
  plain_text = plain_text.gsub(/‘|’/, "'")
  plain_text = plain_text.gsub(/“|”/, "\"")
  return plain_text
end
Escapes all html entities
# File lib/feed_tools/helpers/html_helper.rb, line 56
#
# Escapes the HTML special characters in a string, including both kinds of
# quote characters.
#
# html:: the string to escape, or nil.
#
# Returns a new escaped string, or nil when html is nil.
def self.escape_entities(html)
  return nil if html.nil?
  escaped_html = CGI.escapeHTML(html)
  # CGI.escapeHTML may already have escaped the quotes, in which case these
  # substitutions are no-ops. Non-bang gsub is used because gsub! returns
  # nil when nothing was replaced.
  escaped_html = escaped_html.gsub(/'/, "&#39;")
  escaped_html = escaped_html.gsub(/"/, "&quot;")
  return escaped_html
end
Given a block of html, locates feed links with a given mime type.
# File lib/feed_tools/helpers/html_helper.rb, line 563
#
# Given a block of html, locates the href of the first
# <link rel="alternate"> element whose type matches the given mime type.
#
# html::      the HTML document text to search (not modified).
# mime_type:: the mime type to look for; compared case-insensitively.
#
# Link elements are taken from <head> when the document has one, otherwise
# from the direct children of <html>. Matching candidates are checked
# first, then their descendants.
#
# Returns the href string of the first match, or nil when none is found.
def self.extract_link_by_mime_type(html, mime_type)
  require 'feed_tools/helpers/xml_helper'

  # HACK: Prevent the parser from freaking out if it sees this:
  html = html.gsub(/<!'/, "&lt;!'")

  # This is technically very, very wrong.  But it saves oodles of
  # clock cycles, and probably works 99.999% of the time.
  html.gsub!(/<body.*?>(.|\n)*?<\/body>/, "<body></body>")
  html.gsub!(/<script.*?>(.|\n)*?<\/script>/, "")
  html.gsub!(/<noscript.*?>(.|\n)*?<\/noscript>/, "")
  html.gsub!(/<!--(.|\n)*?-->/, "")

  html = FeedTools::HtmlHelper.tidy_html(html)

  document = HTML5::HTMLParser.parse(html)
  root_node = document.root
  return nil unless root_node.respond_to?(:children)

  # Locate the <html> element: either the root itself or one of the root's
  # element children. An <html> element without children is ignored.
  html_node = nil
  if root_node.name.downcase == "html" && root_node.children.size > 0
    html_node = root_node
  else
    html_node = root_node.children.find do |node|
      node.kind_of?(REXML::Element) &&
        node.name.downcase == "html" && node.children.size > 0
    end
  end
  return nil if html_node.nil?

  # Prefer the links inside <head>; fall back to links directly under <html>.
  head_node = html_node.children.find do |node|
    node.kind_of?(REXML::Element) && node.name.downcase == "head"
  end
  link_nodes = (head_node || html_node).children.select do |node|
    node.kind_of?(REXML::Element) && node.name.downcase == "link"
  end

  wanted_type = mime_type.downcase
  find_href = lambda do |nodes|
    elements = nodes.select { |node| node.kind_of?(REXML::Element) }
    # First pass: a direct match among the nodes at this level.
    elements.each do |link|
      if link.attributes['type'].to_s.strip.downcase == wanted_type &&
          link.attributes['rel'].to_s.strip.downcase == "alternate"
        href = link.attributes['href']
        # `next` would only leave the block; `return` leaves the lambda.
        return href unless href.blank?
      end
    end
    # Second pass: recurse, propagating the first hit back to the caller.
    found = nil
    elements.each do |link|
      found = find_href.call(link.children)
      break unless found.nil?
    end
    found
  end
  return find_href.call(link_nodes)
end
Returns a string containing normalized xhtml from within a REXML node.
# File lib/feed_tools/helpers/html_helper.rb, line 407
#
# Returns a string containing normalized xhtml from within a REXML node.
#
# rexml_node:: the node whose children should be serialized.
#
# Works on a deep clone of the node. Elements and attributes in the xhtml
# namespace have their expanded name set to the bare name (and their
# default xmlns declaration removed). Un-prefixed elements in any other
# non-Atom namespace are given an explicit prefix: the last matching key
# in FEED_TOOLS_NAMESPACES, or a generated "unknownXXXXX" one.
#
# Returns the stripped, concatenated serialization of the clone's children.
def self.extract_xhtml(rexml_node)
  rexml_node_dup = rexml_node.deep_clone
  namespace_hash = FEED_TOOLS_NAMESPACES.dup
  normalize_namespaced_xhtml = lambda do |node, node_dup|
    if node.kind_of? REXML::Element
      node_namespace = node.namespace
      if node_namespace != namespace_hash['atom10'] &&
          node_namespace != namespace_hash['atom03']
        # Massive hack, relies on REXML not changing.
        # Snapshot both attribute lists up front: deleting 'xmlns' from the
        # clone inside the loop would otherwise shift its indices and pair
        # later attributes with the wrong counterpart.
        attributes = node.attributes.values
        attributes_dup = node_dup.attributes.values
        attributes.each_with_index do |attribute, index|
          attribute_dup = attributes_dup[index]
          if attribute.namespace == namespace_hash['xhtml']
            attribute_dup.instance_variable_set(
              "@expanded_name", attribute.name)
          end
          if node_namespace == namespace_hash['xhtml'] &&
              attribute.name == 'xmlns'
            node_dup.attributes.delete('xmlns')
          end
        end
        if node_namespace == namespace_hash['xhtml']
          node_dup.instance_variable_set("@expanded_name", node.name)
        end
        if !node_namespace.blank? && node.prefix.blank? &&
            node_namespace != namespace_hash['xhtml']
          # The last matching known prefix wins.
          prefix = nil
          namespace_hash.keys.each do |known_prefix|
            if namespace_hash[known_prefix] == node_namespace
              prefix = known_prefix
            end
          end
          if prefix.nil?
            # Digest::SHA1.new takes no argument on current Ruby versions;
            # hexdigest yields the hex string the original relied on.
            prefix = "unknown" +
              Digest::SHA1.hexdigest(node_namespace)[0..4]
            namespace_hash[prefix] = node_namespace
          end
          node_dup.instance_variable_set("@expanded_name",
            "#{prefix}:#{node.name}")
          node_dup.instance_variable_set("@prefix", prefix)
          node_dup.add_namespace(prefix, node_namespace)
        end
      end
    end
    # Walk the original and the clone in lock-step.
    children = node.children
    children_dup = node_dup.children
    children.each_with_index do |child, index|
      if child.kind_of? REXML::Element
        normalize_namespaced_xhtml.call(child, children_dup[index])
      end
    end
  end
  normalize_namespaced_xhtml.call(rexml_node, rexml_node_dup)
  buffer = ""
  rexml_node_dup.each_child do |child|
    if child.kind_of? REXML::Comment
      buffer << "<!--" + child.to_s + "-->"
    else
      buffer << child.to_s
    end
  end
  return buffer.strip
end
Returns true if the type string provided indicates that something is html or xhtml content.
# File lib/feed_tools/helpers/html_helper.rb, line 310
#
# Tells whether a content type string denotes html or xhtml content.
#
# type:: the type string to test (compared exactly, case-sensitively).
#
# Returns true for "html", "xhtml", "text/html" and
# "application/xhtml+xml"; false for anything else.
def self.html_type?(type)
  case type
  when "html", "xhtml", "text/html", "application/xhtml+xml"
    true
  else
    false
  end
end
Indents a text selection by a specified number of spaces.
# File lib/feed_tools/helpers/html_helper.rb, line 256
#
# Prefixes every line of a text selection with the given number of spaces.
#
# text::   the text to indent; it is split on "\n".
# spaces:: how many spaces to put in front of each line.
#
# Returns a new string in which every line ends with "\n".
def self.indent(text, spaces)
  text.split("\n").inject("") do |buffer, line|
    buffer << " " * spaces << line << "\n"
  end
end
Returns true if the type string provided indicates that something is only html (not xhtml) content.
# File lib/feed_tools/helpers/html_helper.rb, line 321
#
# Tells whether a content type string denotes html content specifically
# (the xhtml types are not accepted here).
#
# type:: the type string to test (compared exactly, case-sensitively).
#
# Returns true for "html" and "text/html"; false for anything else.
def self.only_html_type?(type)
  case type
  when "html", "text/html"
    true
  else
    false
  end
end
Given a REXML node, returns its content, normalized as HTML.
# File lib/feed_tools/helpers/html_helper.rb, line 475
#
# Given a REXML node, returns its content, normalized as HTML.
#
# content_node::     the node holding the text construct, or nil.
# feed_type::        the feed type; "atom" makes a missing type attribute
#                    default to "text".
# feed_version::     accepted for interface compatibility; not used here.
# base_uri_sources:: additional base URIs for resolving relative links.
#
# The node's type, mode and encoding attributes select how the raw content
# is decoded (CDATA, base64, xhtml, or entity-escaped markup). Text content
# is then entity-escaped, relative URIs are resolved, and the result is run
# through tidy_html.
#
# Returns the processed, stripped content, or nil when content_node is nil
# or the content ends up blank.
def self.process_text_construct(content_node, feed_type, feed_version,
    base_uri_sources=[])
  return nil if content_node.nil?

  type = FeedTools::XmlHelper.try_xpaths(content_node, "@type",
    :select_result_value => true)
  mode = FeedTools::XmlHelper.try_xpaths(content_node, "@mode",
    :select_result_value => true)
  encoding = FeedTools::XmlHelper.try_xpaths(content_node, "@encoding",
    :select_result_value => true)

  if type.nil?
    atom_namespaces = [
      FEED_TOOLS_NAMESPACES['atom10'],
      FEED_TOOLS_NAMESPACES['atom03']
    ]
    if atom_namespaces.include?(content_node.namespace) ||
        atom_namespaces.include?(content_node.root.namespace) ||
        feed_type == "atom"
      type = "text"
    end
  end

  is_text = (type == "text" || mode == "text" ||
    type == "text/plain" || mode == "text/plain")

  # Note that we're checking for misuse of type, mode and encoding here
  content = nil
  if content_node.cdatas.size > 0
    content = content_node.cdatas.first.to_s.strip
  elsif type == "base64" || mode == "base64" ||
      encoding == "base64"
    content = Base64.decode64(content_node.inner_xml.strip)
  elsif type == "xhtml" || mode == "xhtml" ||
      type == "xml" || mode == "xml" ||
      type == "application/xhtml+xml" ||
      content_node.namespace == FEED_TOOLS_NAMESPACES['xhtml']
    content = FeedTools::HtmlHelper.extract_xhtml(content_node)
  else
    # Escaped html, plain text and unrecognized types are all decoded the
    # same way.
    content = FeedTools::HtmlHelper.unescape_entities(
      content_node.inner_xml.strip)
  end

  # Plain text has to be re-escaped before it can be treated as markup.
  content = FeedTools::HtmlHelper.escape_entities(content) if is_text

  unless content.nil?
    content = FeedTools::HtmlHelper.resolve_relative_uris(content,
      [content_node.base_uri] | base_uri_sources)
    content = FeedTools::HtmlHelper.tidy_html(content)
  end
  if FeedTools.configurations[:tab_spaces] != nil
    spaces = FeedTools.configurations[:tab_spaces].to_i
    content.gsub!("\t", " " * spaces) unless content.blank?
  end
  # The stripped value must be assigned back; a bare `content.strip`
  # discards its result.
  content = content.strip unless content.blank?
  content = nil if content.blank?
  return content
end
Resolves all relative uris in a block of html.
# File lib/feed_tools/helpers/html_helper.rb, line 329
#
# Resolves all relative uris in a block of html.
#
# html::             the HTML fragment text, or nil (not modified).
# base_uri_sources:: base URIs to resolve against, tried after each
#                    element's own base_uri.
#
# The fragment is parsed with HTML5::HTMLParser (through the sanitizing
# tokenizer when the :sanitization_enabled configuration is set), every
# URI-bearing attribute listed below is resolved and normalized, and the
# fragments are serialized again.
#
# Returns the re-serialized HTML string, or nil when html is nil.
def self.resolve_relative_uris(html, base_uri_sources=[])
  return nil if html.nil?

  relative_uri_attributes = [
    ["a", "href"],
    ["applet", "codebase"],
    ["area", "href"],
    ["blockquote", "cite"],
    ["body", "background"],
    ["del", "cite"],
    ["form", "action"],
    ["frame", "longdesc"],
    ["frame", "src"],
    ["iframe", "longdesc"],
    ["iframe", "src"],
    ["head", "profile"],
    ["img", "longdesc"],
    ["img", "src"],
    ["img", "usemap"],
    ["input", "src"],
    ["input", "usemap"],
    ["ins", "cite"],
    ["link", "href"],
    ["object", "classid"],
    ["object", "codebase"],
    ["object", "data"],
    ["object", "usemap"],
    ["q", "cite"],
    ["script", "src"]
  ]

  # HACK: Prevent the parser from freaking out if it sees this.
  # Non-bang gsub so the caller's string is left untouched.
  html = html.gsub(/<!'/, "&lt;!'")

  if FeedTools.configurations[:sanitization_enabled]
    fragments = HTML5::HTMLParser.parse_fragment(
      html, :tokenizer => HTML5::HTMLSanitizer, :encoding => 'UTF-8')
  else
    fragments = HTML5::HTMLParser.parse_fragment(html)
  end
  resolve_node = lambda do |html_node|
    if html_node.kind_of? REXML::Element
      relative_uri_attributes.each do |element_name, attribute_name|
        next unless html_node.name.downcase == element_name
        attribute = html_node.attribute(attribute_name)
        next if attribute.nil?
        href = attribute.value
        href = FeedTools::UriHelper.resolve_relative_uri(
          href, [html_node.base_uri] | base_uri_sources)
        href = FeedTools::UriHelper.normalize_url(href)
        # The attribute's internal state is overwritten directly; all
        # three variables are set so every accessor reports the new value.
        html_node.attribute(attribute_name).instance_variable_set(
          "@value", href)
        html_node.attribute(attribute_name).instance_variable_set(
          "@unnormalized", href)
        html_node.attribute(attribute_name).instance_variable_set(
          "@normalized", href)
        if html_node.attribute(attribute_name).value != href
          warn("Failed to update href to resolved value.")
        end
      end
    end
    if html_node.respond_to? :children
      html_node.children.each do |child|
        resolve_node.call(child)
      end
    end
    html_node
  end
  fragments.each do |fragment|
    resolve_node.call(fragment)
  end
  return fragments.map { |fragment| fragment.to_s }.join("")
end
Removes all html tags from the html formatted text, but leaves escaped entities alone.
# File lib/feed_tools/helpers/html_helper.rb, line 84
#
# Removes all html tags from the html formatted text, but leaves escaped
# entities alone.
#
# html:: the HTML string to strip, or nil.
#
# Returns a new string with every <...> / </...> sequence removed, or nil
# when html is nil. The argument itself is not modified, so frozen strings
# are accepted.
def self.strip_html_tags(html)
  return nil if html.nil?
  return html.gsub(/<\/?[^>]+>/, "")
end
Strips semantically empty div wrapper elements
# File lib/feed_tools/helpers/html_helper.rb, line 545
#
# Strips a semantically empty <div> wrapper element.
#
# xhtml:: the markup to unwrap, or nil.
#
# When the markup parses to a document whose only child is a <div>
# element, the div's inner xml is returned, stripped. In every other case,
# including markup that fails to parse, the stripped input is returned.
# nil is returned for nil, and blank input is returned as-is.
def self.strip_wrapper_element(xhtml)
  return nil if xhtml.nil?
  return xhtml if xhtml.blank?
  begin
    doc = REXML::Document.new(xhtml.to_s.strip)
    if doc.children.size == 1
      child = doc.children[0]
      if child.kind_of?(REXML::Element) && child.name.downcase == "div"
        return child.inner_xml.strip
      end
    end
    return xhtml.to_s.strip
  rescue StandardError
    # Best effort: unparseable markup is handed back unchanged. Only
    # StandardError is rescued so that interrupts and exit requests still
    # propagate.
    return xhtml.to_s.strip
  end
end
Returns true if the type string provided indicates that something is plain text content.
# File lib/feed_tools/helpers/html_helper.rb, line 301
#
# Tells whether a content type string denotes plain text content.
#
# type:: the type string to test (compared exactly, case-sensitively).
#
# Returns true for "text" and "text/plain"; false for anything else.
def self.text_type?(type)
  case type
  when "text", "text/plain"
    true
  else
    false
  end
end
Returns true if the html tidy module can be used.
Obviously, you need the tidy gem installed in order to run with html tidy features turned on.
This method does a fairly complicated, and probably unnecessarily desperate, search for the libtidy library. If you want this thing to execute fast, the best thing to do is to set Tidy.path ahead of time. If Tidy.path is set, this method doesn't do much. If it's not set, it will do its darnedest to find the libtidy library. If you set the LIBTIDYPATH environment variable to the location of the libtidy library, it should be able to find it.
Once the library is located, this method will run much faster.
# File lib/feed_tools/helpers/html_helper.rb, line 119
#
# Returns true if the html tidy module can be used.
#
# The :tidy_enabled configuration value, when explicitly false, always
# wins. Otherwise the 'tidy' library is required and, unless Tidy.path is
# already set, a list of candidate locations is probed for libtidy. The
# LIBTIDYPATH environment variable, when set, is probed first. A positive
# result is cached in @tidy_enabled; a negative one is re-checked on every
# call.
def self.tidy_enabled?
  # This is an override variable to keep tidy from being used even if it
  # is available.
  if FeedTools.configurations[:tidy_enabled] == false
    return false
  end
  if @tidy_enabled.nil? || @tidy_enabled == false
    @tidy_enabled = false
    begin
      require 'tidy'
      if Tidy.path.nil?
        # *Shrug*, just brute force it, I guess.  There's a lot of places
        # this thing might be hiding in, depending on platform and general
        # sanity of the person who installed the thing.  Most of these are
        # probably unlikely, but it's not like checking unlikely locations
        # hurts.  Much.  Especially if you actually find it.
        libtidy_locations = [
          '/usr/local/lib/libtidy.dylib',
          '/opt/local/lib/libtidy.dylib',
          '/usr/lib/libtidy.dylib',
          '/usr/local/lib/tidylib.dylib',
          '/opt/local/lib/tidylib.dylib',
          '/usr/lib/tidylib.dylib',
          '/usr/local/lib/tidy.dylib',
          '/opt/local/lib/tidy.dylib',
          '/usr/lib/tidy.dylib',
          '/usr/local/lib/libtidy.so',
          '/opt/local/lib/libtidy.so',
          '/usr/lib/libtidy.so',
          '/usr/local/lib/tidylib.so',
          '/opt/local/lib/tidylib.so',
          '/usr/lib/tidylib.so',
          '/usr/local/lib/tidy.so',
          '/opt/local/lib/tidy.so',
          '/usr/lib/tidy.so',
          'C:\Program Files\Tidy\tidy.dll',
          'C:\Tidy\tidy.dll',
          'C:\Ruby\bin\tidy.dll',
          'C:\Ruby\tidy.dll',
          '/usr/local/lib',
          '/opt/local/lib',
          '/usr/lib'
        ]
        # We just made this thing up, but if someone sets it, we'll
        # go ahead and check it -- before anything else, so an explicit
        # setting beats the guesses and skips the directory scans.
        unless ENV['LIBTIDYPATH'].nil?
          libtidy_locations = [ENV['LIBTIDYPATH']] + libtidy_locations
        end
        libtidy_locations.each do |path|
          next unless File.exist?(path)
          case File.ftype(path)
          when "file", "link"
            Tidy.path = path
            @tidy_enabled = true
            break
          when "directory"
            # Ok, now perhaps we're getting a bit more desperate.
            # Scan the directory tree in-process rather than shelling out,
            # so the path is never interpreted by a shell.
            # If there's more than one, grab the first one and
            # hope for the best, and if it doesn't work, then blame the
            # user for not specifying more accurately.
            tidy_path = Dir.glob(File.join(path, '**', '*tidy*')).find do |candidate|
              candidate =~ /\.(so|dylib)\z/
            end
            unless tidy_path.nil?
              Tidy.path = tidy_path
              @tidy_enabled = true
              break
            end
          end
        end
      else
        @tidy_enabled = true
      end
    rescue LoadError
      # Tidy not installed, disable features that rely on tidy.
      @tidy_enabled = false
    end
  end
  return @tidy_enabled
end
Tidies up the HTML
# File lib/feed_tools/helpers/html_helper.rb, line 206
#
# Tidies up the html.
#
# html::    the markup to clean, or nil (not modified).
# options:: tidy options overriding the defaults below; the keys are
#           validated against TIDY_OPTIONS.
#
# When tidy is available (see tidy_enabled?) the markup is run through
# Tidy; markup that does not look like a complete document is treated as a
# fragment and only the body is returned. When tidy is unavailable the
# markup is returned as given. If tidy produces a blank result for
# non-blank input, the stripped input is returned instead.
#
# Returns the cleaned markup, or nil when html is nil.
def self.tidy_html(html, options = {})
  return nil if html.nil?
  FeedTools::GenericHelper.validate_options(TIDY_OPTIONS, options.keys)

  options = {
    :add_xml_decl => false,
    :char_encoding => "utf8",
    :doctype => "omit",
    :indent => false,
    :logical_emphasis => true,
    :markup => true,
    :show_warnings => false,
    :wrap => 0
  }.merge(options)

  if FeedTools::HtmlHelper.tidy_enabled?
    is_fragment = true
    # Non-bang gsub: the caller's string must not be edited in place.
    html = html.gsub(/<!'/, "&lt;!'")
    stripped = html.strip
    if (stripped =~ /<html>(.|\n)*<body>/) != nil ||
        (stripped =~ /<\/body>(.|\n)*<\/html>$/) != nil
      is_fragment = false
    end
    if (stripped =~ /<\?xml(.|\n)*\?>/) != nil
      is_fragment = false
    end

    options[:show_body_only] = true if is_fragment

    # Tidy sucks?
    # TODO: find the correct set of tidy options to set so
    # that *ugly* hacks like this aren't necessary.
    html = html.gsub(/\302\240/, "\240")

    tidy_html = Tidy.open(options) do |tidy|
      tidy.clean(html)
    end
    tidy_html.strip!
  else
    tidy_html = html
  end

  if tidy_html.blank? && !html.blank?
    tidy_html = html.strip
  end

  return tidy_html
end
Unescapes all html entities
# File lib/feed_tools/helpers/html_helper.rb, line 65
#
# Unescapes all html entities.
#
# html:: the string to unescape, or nil (not modified).
#
# Numeric character references (decimal and lowercase hexadecimal, with
# optional leading zeros) are converted to UTF-8 characters, then the
# named entities handled by CGI.unescapeHTML plus &apos; and &quot; are
# decoded.
#
# Returns a new unescaped string, or nil when html is nil.
def self.unescape_entities(html)
  return nil if html.nil?
  unescaped_html = html.dup
  # Numerically encoded ampersands are turned into &amp; first so they are
  # decoded exactly once: "&#38;lt;" must become "&lt;", not "<".
  unescaped_html.gsub!(/&#x26;/, "&amp;")
  unescaped_html.gsub!(/&#38;/, "&amp;")
  unescaped_html.gsub!(/&#0*((?:\d+)|(?:x[a-f0-9]+));/) do
    code = $1
    # Integer() needs the "0x" prefix to parse hexadecimal.
    code = "0#{code}" if code[0, 1] == "x"
    [Integer(code)].pack('U*')
  end
  unescaped_html = CGI.unescapeHTML(unescaped_html)
  unescaped_html = unescaped_html.gsub(/&apos;/, "'")
  unescaped_html = unescaped_html.gsub(/&quot;/, "\"")
  return unescaped_html
end
Unindents a text selection by a specified number of spaces.
# File lib/feed_tools/helpers/html_helper.rb, line 267
#
# Removes up to the given number of leading spaces from every line of a
# text selection.
#
# text::   the text to unindent; it is split on "\n".
# spaces:: the maximum number of leading spaces to drop from each line.
#
# Returns a new string in which every line ends with "\n".
def self.unindent(text, spaces)
  text.split("\n").inject("") do |buffer, line|
    # Count how many leading spaces may be dropped from this line.
    removable = 0
    removable += 1 while removable < spaces && line[removable, 1] == " "
    buffer << line[removable..-1] << "\n"
  end
end
Returns true if the type string provided indicates that something is xml or xhtml content.
# File lib/feed_tools/helpers/html_helper.rb, line 285
#
# Tells whether a content type string denotes xml or xhtml content.
#
# type:: the type string to test, or nil.
#
# Returns true for "xml", "xhtml", "application/xhtml+xml" and for any
# other non-nil type whose last three characters are "xml"; false
# otherwise.
def self.xml_type?(type)
  return true if %w(xml xhtml application/xhtml+xml).include?(type)
  !type.nil? && type[-3..-1] == "xml"
end