1. 参考

2. 安装

pip install selenium

3. 样例

import unittest

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

class PythonOrgSearch(unittest.TestCase):
    """Smoke test: search python.org for "pycon" in a real Firefox session."""

    def setUp(self):
        """Start a fresh Firefox browser for each test."""
        self.driver = webdriver.Firefox()

    def test_search_in_python_org(self):
        """Submit a search from the home page and check that results come back."""
        driver = self.driver
        driver.get("http://www.python.org")     # visit url
        self.assertIn("Python", driver.title)   # confirm the expected page loaded
        # find_element(By.NAME, ...) replaces find_element_by_name(), which
        # was removed in Selenium 4.3.
        elem = driver.find_element(By.NAME, "q")
        elem.send_keys("pycon")         # type the query
        elem.send_keys(Keys.RETURN)     # press Enter to submit it
        # assertNotIn gives a readable failure message and, unlike a bare
        # assert, is not stripped when Python runs with -O.
        self.assertNotIn("No results found.", driver.page_source)

    def tearDown(self):
        """Shut the browser down after each test."""
        # quit() ends the whole WebDriver session; close() would only close
        # the current window and can leave the driver process running.
        self.driver.quit()

# Run the tests when this file is executed directly.
if __name__ == "__main__":
    unittest.main()

4. 常用功能

4.1. 获取页面渲染后的源代码

获取页面渲染后的源代码, 可以做到网页的动态爬取。

page_source 属性输出页面源码

# Requires: from bs4 import BeautifulSoup
driver.get('http://xxx.xxx.xxx')
# page_source is the page source as currently held by the browser, so it can
# be handed straight to an HTML parser.
soup = BeautifulSoup(driver.page_source, 'lxml')

4.2. 执行js脚本

# Scroll to the bottom of the page
driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')

4.3. 退出

  • driver.close()

    close the current window (tab) only

  • driver.quit()

    close every window and end the WebDriver session (shuts the browser down)

5. driver的优化

优化表示可以指定各种参数运行,比如不加载图片,使用代理等。

5.1. phantomjs使用代理,指定UA,设置超时:

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Work on a copy so the shared DesiredCapabilities.PHANTOMJS dict is not modified.
dcap = DesiredCapabilities.PHANTOMJS.copy()
dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0"
dcap["phantomjs.page.settings.resourceTimeout"] = 1000   # milliseconds
dcap["phantomjs.page.settings.loadImages"] = False       # do not download images
dcap["phantomjs.page.settings.disk-cache"] = True
# Custom Cookie header. The key has to be assigned: merely looking it up
# raises KeyError because it is not present in the default capabilities.
dcap["phantomjs.page.customHeaders.Cookie"] = "name=value"   # placeholder, replace with the real cookie string
# Send traffic through a local SOCKS5 proxy.
service_args = ['--proxy=127.0.0.1:1080', '--proxy-type=socks5']
driver = webdriver.PhantomJS('../path_to/phantomjs', service_args=service_args, desired_capabilities=dcap)

# Custom request headers.
# NOTE: add these entries to dcap before webdriver.PhantomJS(...) is created
# with it; capabilities are only sent when the session starts.
headers = {
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',  # changing the UA this way works too
    'Connection': 'keep-alive',
    'Referer': 'http://www.baidu.com/',
}

# items() works on both Python 2 and Python 3 (iteritems() is Python 2 only).
for key, value in headers.items():
    dcap['phantomjs.page.customHeaders.{}'.format(key)] = value
# Explicit assignment after the loop: this value overrides the 'User-Agent' entry above.
dcap['phantomjs.page.customHeaders.User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'

# phantom 命令行禁用图片
--load-images=false

其中service_args有很多可选参数,详细参数请参考:http://phantomjs.org/api/command-line.html

  • 方式二
from selenium import webdriver
from selenium.webdriver.common.proxy import ProxyType

# Use the proxy-enabled DesiredCapabilities to open a new session id. As far as
# I can tell, this is like clearing the browser cache and then visiting the
# URL again through the proxy.
driver = webdriver.PhantomJS(PATH_PHANTOMJS)
# Proxy is exported by the selenium.webdriver package; a driver instance has
# no Proxy attribute.
proxy = webdriver.Proxy()
proxy.proxy_type = ProxyType.MANUAL
proxy.http_proxy = '1.9.171.51:800'
# Add the proxy settings to webdriver.DesiredCapabilities.PHANTOMJS
proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
driver.get('url')

5.2. Firefox Driver添加代理, 禁用图片,flash等

  • 方式1
profile = webdriver.FirefoxProfile()
profile.set_preference("network.proxy.type", 1)             # 1 = manual proxy configuration
profile.set_preference("network.proxy.http", "127.0.0.1")
profile.set_preference("network.proxy.http_port", 1080)

# Disable heavy content. These go on the same profile object and must be set
# before the driver is created, otherwise the running browser never sees them.
profile.set_preference('permissions.default.stylesheet', 2)  # disable CSS
profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)  # disable Flash (boolean pref, not the string 'false')
profile.set_preference('permissions.default.image', 2)   # disable images

profile.update_preferences()
driver = webdriver.Firefox(firefox_profile=profile)

如果是socks5代理,变更其中两行

# Python API name is set_preference (setPreference is the Java binding).
profile.set_preference("network.proxy.socks", "190.x.y.z")   # SOCKS proxy host (placeholder)
profile.set_preference("network.proxy.socks_port", 1080)     # SOCKS proxy port (placeholder, must be an int)
  • 方式2
from selenium import webdriver
from selenium.webdriver.common.proxy import *

# One proxy address shared by the HTTP, FTP and SSL settings.
myProxy = "86.111.144.194:3128"

# Assemble the raw proxy settings step by step, then wrap them in a Proxy.
proxy_settings = {'proxyType': ProxyType.MANUAL}
proxy_settings.update(dict.fromkeys(('httpProxy', 'ftpProxy', 'sslProxy'), myProxy))
proxy_settings['noProxy'] = ''
proxy = Proxy(proxy_settings)

# Start Firefox with the proxy and allow up to 30 seconds per page load.
driver = webdriver.Firefox(proxy=proxy)
driver.set_page_load_timeout(30)
driver.get('http://whatismyip.com')

6. 其他

  • input like human

    • send_keys(string)

    • click()

    • clear()

  • find element

    • find_element_by_name() vs find_elements_by_name()

      find_elements_by_name(): return list

    • find_elements()

      from selenium.webdriver.common.by import By
      webdriver.find_elements(By.NAME, <string>)[0].send_keys(<string>)
      webdriver.find_elements(By.XPATH, "//input[@value='<string>']")[0].click()
      
  • The Keys class provides the keys on the keyboard, such as RETURN, F1, ALT, etc.

  • supported browsers

Currently supported WebDriver implementations are Firefox, Chrome, Ie and Remote

  • common functions

1.获取当前页面的Url函数

方法:current_url

2.获取元素坐标

方法:location

解释:首先查找到你要获取坐标的元素,然后读取它的location属性

实例:

driver.find_element_by_xpath("//*[@id='tablechart']/tbody/tr[14]/td[9]").location

3.表单的提交

方法:submit

解释:查找到表单(form)直接调用submit即可

实例:

driver.find_element_by_id("form1").submit()

4.获取CSS的属性值

方法:value_of_css_property(css_name)
解释:参数是CSS属性名(例如 font-size),不是选择器
实例: driver.find_element_by_css_selector("input.btn").value_of_css_property("font-size")

5.获取元素的属性值
方法:get_attribute(element_name)
实例: driver.find_element_by_id("sellaiyuan").get_attribute("sellaiyuan")

6.判断元素是否被选中
方法:is_selected()
实例: driver.find_element_by_id("form1").is_selected()

7.返回元素的大小
方法:size
实例: driver.find_element_by_id("iptPassword").size
返回值:{'width': 250, 'height': 30}

8.判断元素是否显示
方法:is_displayed()
实例: driver.find_element_by_id("iptPassword").is_displayed()

9.判断元素是否可用
方法:is_enabled()
实例: driver.find_element_by_id("iptPassword").is_enabled()

10.获取元素的文本值
方法:text
实例: driver.find_element_by_id("iptUsername").text

11.元素赋值
方法:send_keys(*values)
实例: driver.find_element_by_id("iptUsername").send_keys('admin')
注意:如果输入的是中文,在 Python 2 中需要给字符串加上 unicode 前缀 u,eg. driver.find_element_by_id("iptUsername").send_keys(u'青春')

12.返回元素的tagName
方法:tag_name
实例: driver.find_element_by_id("iptUsername").tag_name

13.删除浏览器所有的cookies
方法:delete_all_cookies()
实例: driver.delete_all_cookies()

14.删除指定的cookie
方法:delete_cookie(name)
实例: driver.delete_cookie("my_cookie_name")

15.关闭当前浏览器窗口
方法:close()
实例: driver.close()

16.关闭浏览器并且退出驱动程序
方法:quit()
实例: driver.quit()

17.返回上一页
方法:back()
实例: driver.back()

18.设置等待超时
方法:implicitly_wait(wait_time)
实例: driver.implicitly_wait(30)

19.浏览器窗口最大化
方法:maximize_window()
实例: driver.maximize_window()

20.查看浏览器的名字
方法:name
实例: driver.name

Xvfb

It is an X11 server that performs all graphical operations in memory, not showing any screen output

Only a network layer is necessary.

7. install

System Requirements:

sudo apt-get install xvfb   # (or similar)
pip install xvfbwrapper

8. example

import unittest

from selenium import webdriver
from xvfbwrapper import Xvfb


class TestPages(unittest.TestCase):
    """Browser test that runs Firefox inside an Xvfb virtual display."""

    def setUp(self):
        """Start the virtual display and the browser, registering cleanup for both."""
        display = Xvfb(width=1280, height=720)
        self.xvfb = display
        # addCleanup callbacks run last-in-first-out, so the browser registered
        # below quits before the display is stopped.
        self.addCleanup(display.stop)
        display.start()

        firefox = webdriver.Firefox()
        self.browser = firefox
        self.addCleanup(firefox.quit)

    def testUbuntuHomepage(self):
        """The Ubuntu home page title should contain 'Ubuntu'."""
        page = self.browser
        page.get('http://www.ubuntu.com')
        self.assertIn('Ubuntu', page.title)

# Run the tests with verbose output when this file is executed directly.
if __name__ == '__main__':
    unittest.main(verbosity=2)

results matching ""

    No results matching ""