The Mechanize library is used for automating interaction with a website. It can follow links and submit forms. Form fields can be populated and submitted. A history of visited URLs is maintained and can be queried.
require 'rubygems'
require 'mechanize'
require 'logger'

# Create an agent whose logger writes to mech.log.
agent = Mechanize.new do |a|
  a.log = Logger.new("mech.log")
end
agent.user_agent_alias = 'Mac Safari'

# Fetch the start page and pick out the form named "f".
page = agent.get("http://www.google.com/")
search_form = page.form_with(:name => "f")

# Fill in the "q" field, submit the form, and print the body of the result.
query_field = search_form.field_with(:name => "q")
query_field.value = "Hello"
search_results = agent.submit(search_form)
puts search_results.body
VERSION | = | '1.0.0' | The version of Mechanize you are using. | |
AGENT_ALIASES | = | { 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6', 'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; de-at) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10', 'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6', 'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401', 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624', 'Linux Firefox' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.1) Gecko/20100122 firefox/3.6.1', 'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)', 'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3', 'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)" | User Agent aliases |
redirect_ok | -> | follow_redirect? |
ca_file | [RW] | |
cert | [RW] | |
conditional_requests | [RW] | |
cookie_jar | [RW] | |
follow_meta_refresh | [RW] | |
gzip_enabled | [RW] | |
history | [R] | |
history_added | [RW] | |
html_parser | [RW] | The HTML parser to be used when parsing documents |
html_parser | [RW] | |
keep_alive | [RW] | |
keep_alive_time | [RW] | |
key | [RW] | |
log | [RW] | |
open_timeout | [RW] | |
pass | [RW] | |
pluggable_parser | [R] | |
proxy_addr | [R] | Proxy settings |
proxy_pass | [R] | |
proxy_port | [R] | |
proxy_user | [R] | |
read_timeout | [RW] | |
redirect_ok | [RW] | |
redirection_limit | [RW] | |
request_headers | [RW] | A hash of custom request headers |
scheme_handlers | [RW] | |
user_agent | [RW] | |
verify_callback | [RW] | |
watch_for_set | [RW] |
# File lib/mechanize.rb, line 109
# Subclass hook. Copies this class's html_parser and log onto +child+ for
# whichever of the two +child+ does not already have set, then continues
# the normal +inherited+ chain via +super+.
def inherited(child)
  child.html_parser = html_parser unless child.html_parser
  child.log = log unless child.log
  super
end
# File lib/mechanize.rb, line 116
# Sets every agent setting to its default value. When a block is given the
# freshly configured agent is yielded to it as the last step, so the caller
# can override any of the defaults.
def initialize
  # --- attr_accessors ---
  @cookie_jar    = CookieJar.new
  @log           = nil
  @open_timeout  = nil
  @read_timeout  = nil
  @user_agent    = AGENT_ALIASES['Mechanize']
  @watch_for_set = nil
  @history_added = nil
  @ca_file       = nil # OpenSSL server certificate file

  # Callback for OpenSSL errors while verifying the server certificate
  # chain; can be used for debugging or to ignore errors by always
  # returning _true_.
  @verify_callback = nil
  @cert            = nil  # OpenSSL Certificate
  @key             = nil  # OpenSSL Private Key
  @pass            = nil  # OpenSSL Password
  @redirect_ok     = true # Should we follow redirects?
  @gzip_enabled    = true

  # --- attr_readers ---
  @history          = Mechanize::History.new
  @pluggable_parser = PluggableParser.new

  # --- Authentication state ---
  @user            = nil # Auth User
  @password        = nil # Auth Password
  @digest          = nil # DigestAuth Digest
  @auth_hash       = {}  # Keep track of urls for sending auth
  @request_headers = {}  # A hash of request headers to be used

  # --- Proxy settings ---
  @proxy_addr = nil
  @proxy_pass = nil
  @proxy_port = nil
  @proxy_user = nil

  @conditional_requests = true

  @follow_meta_refresh = false
  @redirection_limit   = 20

  # --- Connection cache & keep-alive ---
  @connection_cache = {}
  @keep_alive_time  = 300
  @keep_alive       = true

  # Unknown schemes get (and memoize) a handler that raises when invoked.
  @scheme_handlers = Hash.new do |handlers, scheme|
    handlers[scheme] = lambda { |link, page|
      raise UnsupportedSchemeError.new(scheme)
    }
  end
  # The supported schemes all share one handler that returns the link as is.
  pass_through = lambda { |link, page| link }
  %w[http https relative file].each do |scheme|
    @scheme_handlers[scheme] = pass_through
  end

  @pre_connect_hook  = Chain::PreConnectHook.new
  @post_connect_hook = Chain::PostConnectHook.new

  @html_parser = self.class.html_parser

  yield self if block_given?
end
Sets the user and password to be used for authentication.
# File lib/mechanize.rb, line 213
# Remembers +user+ and +password+ in @user and @password.
# Returns +password+.
def auth(user, password)
  @user, @password = user, password
  password
end
Clicks the Mechanize::Link object passed in and returns the page fetched.
# File lib/mechanize.rb, line 311
# Follows +link+ with a GET request and returns whatever +get+ returns.
#
# The target is +link.href+ when the object responds to +href+; otherwise
# it is looked up as link['href'], falling back to link['src'].
# The referer is +link.page+, or the current page when that is unavailable.
def click(link)
  # Any StandardError raised while asking for the page means "no referer".
  referer = begin
    link.page
  rescue StandardError
    nil
  end

  href =
    if link.respond_to?(:href)
      link.href
    else
      link['href'] || link['src']
    end

  get(:url => href, :referer => (referer || current_page))
end
DELETE to url with query_params, and setting options:
delete('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/mechanize.rb, line 280
# Issues the request through +head+ with the :verb option forced to
# :delete, records the resulting page in the history and returns it.
def delete(url, query_params = {}, options = {})
  delete_options = options.merge(:verb => :delete)
  page = head(url, query_params, delete_options)
  add_to_history(page)
  page
end
Fetches the URL passed in and returns a page.
# File lib/mechanize.rb, line 220
# Fetches a page, adds it to the history, yields it to the optional block
# and returns it.
#
# +options+ is either the URL itself or a Hash with the keys :url
# (required), :params, :referer, :headers and :verb. In the non-Hash form
# +parameters+ and +referer+ are taken from the positional arguments.
#
# Raises ArgumentError when a Hash is given without :url.
def get(options, parameters = [], referer = nil)
  verb = :get
  headers = nil

  if options.is_a?(Hash)
    url = options[:url]
    raise ArgumentError, "url must be specified" unless url
    parameters = options[:params] || []
    referer    = options[:referer]
    headers    = options[:headers]
    verb       = options[:verb] || verb
  else
    url = options
    # Legacy call style: a second argument that cannot be iterated is
    # treated as the referer rather than as parameters.
    unless parameters.respond_to?(:each) # FIXME: Remove this in 0.8.0
      referer = parameters
      parameters = []
    end
  end

  # No referer given: URLs starting with "http" get a blank page, anything
  # else prefers the current page and falls back to a blank page.
  unless referer
    referer = current_page unless url.to_s =~ /^http/
    referer ||= Page.new(nil, {'content-type'=>'text/html'})
  end

  # FIXME: Huge hack so that using a URI as a referer works.  I need to
  # refactor everything to pass around URIs but still support
  # Mechanize::Page#base
  unless referer.is_a?(Mechanize::File)
    referer_uri = referer.is_a?(String) ? URI.parse(referer) : referer
    referer = Page.new(referer_uri, {'content-type' => 'text/html'})
  end

  # fetch the page
  page = fetch_page(
    :uri     => url,
    :referer => referer,
    :headers => headers || {},
    :verb    => verb,
    :params  => parameters
  )
  add_to_history(page)
  yield page if block_given?
  page
end
Fetch a file and return the contents of the file.
# File lib/mechanize.rb, line 305
# Fetches +url+ with +get+ and returns the +body+ of the result.
def get_file(url)
  fetched = get(url)
  fetched.body
end
HEAD to url with query_params, and setting options:
head('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
# File lib/mechanize.rb, line 291
# Fetches +url+ with verb :head unless +options+ overrides it.
#
# +query_params+ become the request :params, and any key in +options+
# replaces the matching default (:uri, :headers, :params, :verb).
# Yields the fetched page to the optional block and returns it.
def head(url, query_params = {}, options = {})
  defaults = {
    :uri     => url,
    :headers => {},
    :params  => query_params,
    :verb    => :head
  }
  # fetch the page
  page = fetch_page(defaults.merge(options))
  yield page if block_given?
  page
end
Posts to the given URL with the request entity. The request entity is specified by either a string, or a list of key-value pairs represented by a hash or an array of arrays.
Examples:
agent.post('http://example.com/', "foo" => "bar") agent.post('http://example.com/', [ ["foo", "bar"] ]) agent.post('http://example.com/', "<message>hello</message>", 'Content-Type' => 'application/xml')
# File lib/mechanize.rb, line 334
# POSTs to +url+.
#
# A String +query+ is sent as the request entity via +request_with_entity+.
# Any other +query+ is iterated as key/value pairs that are loaded into a
# throw-away form and sent through +post_form+. IO values become file
# uploads and switch the form to multipart encoding.
def post(url, query={}, headers={})
  return request_with_entity(:post, url, query, :headers => headers) if query.is_a?(String)

  # Create a fake form: a plain Hash that also answers #search with [].
  node = {}
  def node.search(*args); []; end
  node['method']  = 'POST'
  node['enctype'] = 'application/x-www-form-urlencoded'

  form = Form.new(node)
  query.each do |key, value|
    if value.is_a?(IO)
      form.enctype = 'multipart/form-data'
      upload = Form::FileUpload.new({'name' => key.to_s}, ::File.basename(value.path))
      upload.file_data = value.read
      form.file_uploads << upload
    else
      form.fields << Form::Field.new({'name' => key.to_s}, value)
    end
  end
  post_form(url, form, headers)
end
# File lib/mechanize.rb, line 191
# Returns the +hooks+ collection of the post-connect hook object.
def post_connect_hooks
  hook_chain = @post_connect_hook
  hook_chain.hooks
end
PUT to url with entity, and setting options:
put('http://tenderlovemaking.com/', 'new content', :headers => {'Content-Type' => 'text/plain'})
# File lib/mechanize.rb, line 271
# Sends +entity+ to +url+ with the :put verb by delegating to
# +request_with_entity+. +options+ is passed through unchanged.
def put(url, entity, options = {})
  verb = :put
  request_with_entity(verb, url, entity, options)
end
# File lib/mechanize.rb, line 382
# Sends +entity+ as the request body to +url+ using +verb+, records the
# resulting page in the history and returns it.
#
# +options+ may override :uri, :referer and :headers. The default referer
# is the current page, or a blank HTML page when there is none. The headers
# default to an 'application/octet-stream' Content-Type and a
# Content-Length derived from +entity+; entries in options[:headers] take
# precedence over both.
def request_with_entity(verb, url, entity, options={})
  cur_page = current_page || Page.new( nil, {'content-type'=>'text/html'})

  options = {
    :uri     => url,
    :referer => cur_page,
    :headers => {},
  }.update(options)

  # Content-Length is a byte count. String#size counts characters on
  # Ruby 1.9+, which under-reports multibyte bodies, so prefer #bytesize
  # and fall back to #size for entities that do not provide it.
  entity_length = entity.respond_to?(:bytesize) ? entity.bytesize : entity.size

  headers = {
    'Content-Type'   => 'application/octet-stream',
    'Content-Length' => entity_length.to_s,
  }.update(options[:headers])

  options.update({
    :verb    => verb,
    :params  => [entity],
    :headers => headers,
  })

  page = fetch_page(options)
  add_to_history(page)
  page
end
Sets the proxy address, port, user, and password. addr should be a host, with no "http://" prefix.
# File lib/mechanize.rb, line 197
# Stores the proxy host, port and optional credentials in the matching
# @proxy_* instance variables.
# Returns the four values as an array: [addr, port, user, pass].
def set_proxy(addr, port, user = nil, pass = nil)
  @proxy_addr = addr
  @proxy_port = port
  @proxy_user = user
  @proxy_pass = pass
  [addr, port, user, pass]
end
Submit a form with an optional button. Without a button:
page = agent.get('http://example.com') agent.submit(page.forms.first)
With a button:
agent.submit(page.forms.first, page.forms.first.buttons.first)
# File lib/mechanize.rb, line 366
# Submits +form+, first adding +button+ to the form's query when one is
# given.
#
# POST forms go through +post_form+. GET forms go through +get+ with the
# form's built query as parameters, the form's page as referer, and the
# action URL stripped of its trailing query string. Any other form method
# raises a RuntimeError.
def submit(form, button=nil, headers={})
  form.add_button_to_query(button) if button

  http_method = form.method.upcase
  if http_method == 'POST'
    post_form(form.action, form, headers)
  elsif http_method == 'GET'
    action_without_query = form.action.gsub(/\?[^\?]*$/, '')
    get(
      :url     => action_without_query,
      :params  => form.build_query,
      :headers => headers,
      :referer => form.page
    )
  else
    raise "unsupported method: #{form.method.upcase}"
  end
end
Runs given block, then resets the page history as it was before. self is given as a parameter to the block. Returns the value of the block.
# File lib/mechanize.rb, line 427
# Yields +self+ to the block and returns the block's value. A copy of the
# history taken before the block runs is put back afterwards, whether the
# block returns normally or raises.
def transact
  snapshot = @history.dup
  begin
    yield(self)
  ensure
    @history = snapshot
  end
end
Returns whether or not a url has been visited
# File lib/mechanize.rb, line 413
# True when +visited_page+ finds an entry for +url+, false when it
# returns nil.
def visited?(url)
  match = visited_page(url)
  !match.nil?
end
# File lib/mechanize.rb, line 638
# Pushes +page+ onto the history together with the resolved form of its
# URI, then invokes the +history_added+ callback with the page when one
# is set.
def add_to_history(page)
  resolved_uri = resolve(page.uri)
  @history.push(page, resolved_uri)

  callback = history_added
  callback.call(page) if callback
end
uri is an absolute URI
# File lib/mechanize.rb, line 470
# Performs one HTTP exchange and returns the resulting page.
#
# +params+ is merged over these defaults: :request, :response, :connection
# (all nil), :referer (the current page), :uri (nil), :verb (:get),
# :agent (self), :redirects (0), :params ([]) and :headers ({}).
#
# Steps:
# 1. The before-connect chain fills in the options (resolved URI, request
#    object, connection, ...).
# 2. With conditional requests enabled, If-Modified-Since is added when the
#    URI is in the history with a Last-Modified header.
# 3. The request is sent while the connection's lock is held. EOFError,
#    ECONNRESET and EPIPE are retried up to two times on a restarted
#    connection.
# 4. The after-connect chain builds the page.
# 5. Depending on the outcome the page is returned, a meta/header refresh
#    or HTTP redirect is followed, or the request is repeated once with
#    credentials after a 401.
#
# Raises RedirectLimitReachedError when following a redirect or refresh
# would exceed +redirection_limit+, StandardError for an unparsable Refresh
# header, and ResponseCodeError for any response that is not handled.
def fetch_page(params)
  options = {
    :request    => nil,
    :response   => nil,
    :connection => nil,
    :referer    => current_page(),
    :uri        => nil,
    :verb       => :get,
    :agent      => self,
    :redirects  => 0,
    :params     => [],
    :headers    => {},
  }.merge(params)

  before_connect = Chain.new([
    Chain::URIResolver.new(@scheme_handlers),
    Chain::ParameterResolver.new,
    Chain::RequestResolver.new,
    Chain::ConnectionResolver.new(
      @connection_cache,
      @keep_alive,
      @proxy_addr,
      @proxy_port,
      @proxy_user,
      @proxy_pass
    ),
    Chain::SSLResolver.new(@ca_file, @verify_callback, @cert, @key, @pass),
    Chain::AuthHeaders.new(@auth_hash, @user, @password, @digest),
    Chain::HeaderResolver.new(
      @keep_alive,
      @keep_alive_time,
      @cookie_jar,
      @user_agent,
      @gzip_enabled,
      @request_headers
    ),
    Chain::CustomHeaders.new,
    @pre_connect_hook,
  ])
  before_connect.handle(options)

  uri           = options[:uri]
  request       = options[:request]
  cur_page      = options[:referer]
  request_data  = options[:params]
  redirects     = options[:redirects]
  http_obj      = options[:connection]

  # Add If-Modified-Since if page is in history
  if( (page = visited_page(uri)) && page.response['Last-Modified'] )
    request['If-Modified-Since'] = page.response['Last-Modified']
  end if(@conditional_requests)

  http_obj.mu_lock
  begin
    # Specify timeouts if given
    http_obj.open_timeout = @open_timeout if @open_timeout
    http_obj.read_timeout = @read_timeout if @read_timeout
    http_obj.start unless http_obj.started?

    # Log specified headers for the request
    log.info("#{ request.class }: #{ request.path }") if log
    request.each_header do |k, v|
      log.debug("request-header: #{ k } => #{ v }")
    end if log

    # Send the request
    attempts = 0
    begin
      response = http_obj.request(request, *request_data) { |r|
        connection_chain = Chain.new([
          Chain::ResponseReader.new(r),
          Chain::BodyDecodingHandler.new,
        ])
        connection_chain.handle(options)
      }
    rescue EOFError, Errno::ECONNRESET, Errno::EPIPE => x
      log.error("Rescuing EOF error") if log
      http_obj.finish
      raise x if attempts >= 2
      request.body = nil
      http_obj.start
      attempts += 1
      retry
    end

    after_connect = Chain.new([
      @post_connect_hook,
      Chain::ResponseBodyParser.new(@pluggable_parser, @watch_for_set),
      Chain::ResponseHeaderHandler.new(@cookie_jar, @connection_cache),
    ])
    after_connect.handle(options)
  ensure
    # Release the lock on every exit path. Previously an exception raised
    # while the lock was held left the connection locked.
    http_obj.mu_unlock
  end

  res_klass      = options[:res_klass]
  response_body  = options[:response_body]
  page           = options[:page]

  log.info("status: #{ page.code }") if log

  if follow_meta_refresh
    redirect_uri  = nil
    referer       = page
    if (page.respond_to?(:meta) && (redirect = page.meta.first))
      # Meta-tag refreshes count against the redirection limit as well;
      # otherwise a page that refreshes to itself would recurse forever.
      if redirects + 1 > redirection_limit
        raise RedirectLimitReachedError.new(page, redirects)
      end
      redirect_uri = redirect.uri.to_s
      sleep redirect.node['delay'].to_f
      referer = Page.new(nil, {'content-type'=>'text/html'})
    elsif refresh = response['refresh']
      delay, redirect_uri = Page::Meta.parse(refresh, uri)
      raise StandardError, "Invalid refresh http header" unless delay
      if redirects + 1 > redirection_limit
        raise RedirectLimitReachedError.new(page, redirects)
      end
      sleep delay.to_f
    end
    if redirect_uri
      @history.push(page, page.uri)
      return fetch_page(
        :uri        => redirect_uri,
        :referer    => referer,
        :params     => [],
        :verb       => :get,
        :redirects  => redirects + 1
      )
    end
  end

  return page if res_klass <= Net::HTTPSuccess

  if res_klass == Net::HTTPNotModified
    log.debug("Got cached page") if log
    return visited_page(uri) || page
  elsif res_klass <= Net::HTTPRedirection
    return page unless follow_redirect?
    log.info("follow redirect to: #{ response['Location'] }") if log
    from_uri  = page.uri
    raise RedirectLimitReachedError.new(page, redirects) if redirects + 1 > redirection_limit
    # A redirected HEAD stays a HEAD; every other verb becomes a GET.
    redirect_verb = options[:verb] == :head ? :head : :get
    page = fetch_page(  :uri => response['Location'].to_s,
                        :referer => page,
                        :params  => [],
                        :verb => redirect_verb,
                        :redirects => redirects + 1
                     )
    @history.push(page, from_uri)
    return page
  elsif res_klass <= Net::HTTPUnauthorized
    raise ResponseCodeError.new(page) unless @user || @password
    # A host already present in @auth_hash has been tried before: give up.
    raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
    if response['www-authenticate'] =~ /Digest/i
      @auth_hash[uri.host] = :digest
      if response['server'] =~ /Microsoft-IIS/
        @auth_hash[uri.host] = :iis_digest
      end
      @digest = response['www-authenticate']
    else
      @auth_hash[uri.host] = :basic
    end
    return fetch_page(  :uri      => uri,
                        :referer  => cur_page,
                        :verb     => request.method.downcase.to_sym,
                        :params   => request_data,
                        :headers  => options[:headers]
                     )
  end

  raise ResponseCodeError.new(page), "Unhandled response", caller
end
# File lib/mechanize.rb, line 448
# POSTs +form+ to +url+, records the resulting page in the history and
# returns it.
#
# The referer is the form's page, else the current page, else a blank HTML
# page. Content-Type comes from the form's enctype and Content-Length from
# the form's request data; entries in +headers+ override both.
def post_form(url, form, headers = {})
  cur_page = form.page || current_page ||
                  Page.new( nil, {'content-type'=>'text/html'})

  request_data = form.request_data

  log.debug("query: #{ request_data.inspect }") if log

  # Content-Length is a byte count. String#size counts characters on
  # Ruby 1.9+, which under-reports multibyte bodies, so prefer #bytesize
  # and fall back to #size where it is not available.
  content_length =
    request_data.respond_to?(:bytesize) ? request_data.bytesize : request_data.size

  # fetch the page
  page = fetch_page(  :uri      => url,
                      :referer  => cur_page,
                      :verb     => :post,
                      :params   => [request_data],
                      :headers  => {
                        'Content-Type'    => form.enctype,
                        'Content-Length'  => content_length.to_s,
                      }.merge(headers))
  add_to_history(page)
  page
end