使用Selenium进行网页自动化操作
环境准备与浏览器初始化
安装Selenium库:
pip3 install selenium
创建浏览器实例,支持多种主流浏览器:
from selenium import webdriver
# Create the browser instance that every later call goes through. Chrome is
# the active choice; the commented lines are drop-in alternatives.
driver = webdriver.Chrome()
# driver = webdriver.Firefox()
# driver = webdriver.Edge()
# NOTE(review): PhantomJS support was dropped in Selenium 4 - prefer a headless
# Chrome/Firefox instead; confirm against the installed Selenium version.
# driver = webdriver.PhantomJS()
# driver = webdriver.Safari()
页面加载与内容获取
打开目标网址并获取页面源码:
from selenium import webdriver

# Open the target page and dump its rendered HTML source.
driver = webdriver.Chrome()
try:
    driver.get('https://www.taobao.com')
    print(driver.page_source)
finally:
    # quit() ends the whole WebDriver session (all windows plus the driver
    # process); close() would only close the current window, and would be
    # skipped entirely if get() raised.
    driver.quit()
元素定位方法
通过不同方式查找单个或多个元素:
- 单个元素:使用 find_element(By.XX, 'value')(旧式的 find_element_by_* 方法已在 Selenium 4.3 中移除,仅适用于旧版本)
- 多个元素:使用 find_elements(By.XX, 'value')(旧式的 find_elements_by_* 方法同样已被移除)
示例:通过ID、CSS选择器、XPath定位输入框:
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.taobao.com')

# Locate the same search box with three different strategies. The legacy
# find_element_by_id / _by_css_selector / _by_xpath helpers were removed in
# Selenium 4.3 and raise AttributeError there, so each lookup goes through
# find_element(By.<strategy>, value), which works on Selenium 3 and 4 alike.
input_elem1 = driver.find_element(By.ID, 'q')
input_elem2 = driver.find_element(By.CSS_SELECTOR, '#q')
input_elem3 = driver.find_element(By.XPATH, '//*[@id="q"]')

# Recommended form: strategy and value are both explicit.
input_elem = driver.find_element(By.ID, 'q')
print(input_elem)
多元素查找与处理
获取指定类名容器下的所有列表项(find_elements 返回列表,未匹配到时为空列表):
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.taobao.com')

# find_elements returns a list (empty when nothing matches), so the loop
# below simply does nothing if the selector finds no items.
items = driver.find_elements(By.CSS_SELECTOR, '.service-bd li')
for item in items:
    # The loop body must be indented; without it the snippet does not parse.
    print(item.text)
交互操作与行为模拟
在输入框中输入内容并点击搜索按钮:
import time

from selenium import webdriver
# By is required by the find_element calls below; without this import the
# snippet fails with NameError when run on its own.
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.taobao.com')

# Type a query, pause briefly, then replace it with a different query.
search_input = driver.find_element(By.ID, 'q')
search_input.send_keys('iPhone')
time.sleep(1)
search_input.clear()
search_input.send_keys('iPad')

# Submit the search by clicking the search button.
submit_button = driver.find_element(By.CLASS_NAME, 'btn-search')
submit_button.click()
拖拽动作实现
使用ActionChains完成元素拖拽:
from selenium import webdriver
from selenium.webdriver import ActionChains
# By is required by the find_element calls below; without this import the
# snippet fails with NameError when run on its own.
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
driver.get(url)

# The draggable/droppable elements are looked up inside the 'iframeResult'
# frame, so switch the driver's context into it first.
driver.switch_to.frame('iframeResult')
source = driver.find_element(By.CSS_SELECTOR, '#draggable')
target = driver.find_element(By.CSS_SELECTOR, '#droppable')

# Actions are queued on the chain and only executed by perform().
actions = ActionChains(driver)
actions.drag_and_drop(source, target).perform()
执行JavaScript脚本
滚动页面到底部并弹出提示:
from selenium import webdriver

# JavaScript snippets executed inside the loaded page, in this order:
# first scroll the window to the bottom of the document, then raise an alert.
page_scripts = (
    'window.scrollTo(0, document.body.scrollHeight)',
    'alert("已到达底部")',
)

driver = webdriver.Chrome()
driver.get('https://www.zhihu.com/explore')
for script in page_scripts:
    driver.execute_script(script)
获取元素属性与信息
提取元素的类名、文本内容、位置、尺寸等:
from selenium import webdriver
# By is required by the find_element call below; without this import the
# snippet fails with NameError when run on its own.
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.zhihu.com/explore')

logo = driver.find_element(By.ID, 'zh-top-link-logo')
print(logo.get_attribute('class'))  # value of the element's class attribute
print(logo.text)                    # text content of the element
print(logo.id)                      # WebDriver's internal element id (not the HTML id attribute)
print(logo.location)                # position of the element on the page
print(logo.tag_name)                # tag name
print(logo.size)                    # width and height
处理嵌套框架(iframe)
切换至iframe并访问内部元素:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
# By is required by the find_element calls below; without this import the
# snippet fails with NameError when run on its own.
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

# Inside the frame, lookups only see the frame's own document, so an element
# that lives in the parent page raises NoSuchElementException here.
driver.switch_to.frame('iframeResult')
try:
    logo = driver.find_element(By.CLASS_NAME, 'logo')
except NoSuchElementException:
    print('未找到logo元素')

# Switch back to the parent document, then repeat the same lookup there.
driver.switch_to.parent_frame()
main_logo = driver.find_element(By.CLASS_NAME, 'logo')
print(main_logo.text)
等待机制设置
隐式等待:全局设置,查找元素时若未立即找到,会在设定的最长时间内反复尝试,超时后才抛出异常:
from selenium import webdriver
# By is required by the find_element call below; without this import the
# snippet fails with NameError when run on its own.
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
# Applies to every subsequent element lookup on this driver: a lookup keeps
# retrying for up to 10 seconds before raising NoSuchElementException.
driver.implicitly_wait(10)
driver.get('https://www.zhihu.com/explore')
element = driver.find_element(By.CLASS_NAME, 'zu-top-add-question')
显式等待:精准控制特定条件满足后继续执行:
# webdriver is required to create the driver below; without this import the
# snippet fails with NameError when run on its own.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
driver.get('https://www.taobao.com/')

# One wait object with a 10-second budget per until() call; until() returns
# the located element once the condition holds, otherwise raises
# TimeoutException.
wait = WebDriverWait(driver, 10)
search_input = wait.until(EC.presence_of_element_located((By.ID, 'q')))
submit_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
导航控制与历史管理
前进与后退页面:
import time

from selenium import webdriver

driver = webdriver.Chrome()
try:
    # Visit three pages in sequence to build up a browsing history.
    driver.get('https://www.baidu.com')
    driver.get('https://www.taobao.com')
    driver.get('https://www.python.org')

    driver.back()      # go back one page in history
    time.sleep(1)
    driver.forward()   # go forward again
finally:
    # quit() ends the whole WebDriver session (all windows plus the driver
    # process); close() would only close the current window, and would be
    # skipped entirely if any navigation call raised.
    driver.quit()
Cookie管理
读取、添加和删除Cookies:
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.zhihu.com/explore')

# 1) Cookies present right after the page has loaded.
initial_cookies = driver.get_cookies()
print(initial_cookies)

# 2) Add one cookie, then list the cookies again.
extra_cookie = {'name': 'test_cookie', 'domain': 'www.zhihu.com', 'value': 'test_value'}
driver.add_cookie(extra_cookie)
cookies_after_add = driver.get_cookies()
print(cookies_after_add)

# 3) Delete every cookie, then list once more.
driver.delete_all_cookies()
cookies_after_delete = driver.get_cookies()
print(cookies_after_delete)
窗口句柄管理
打开新标签页并切换操作:
import time
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.baidu.com')

# Ask the page to open a new, empty tab, then show every handle the driver
# now knows about.
driver.execute_script('window.open()')
all_handles = driver.window_handles
print(all_handles)

# Move to the second handle and load a page there.
second_tab = driver.window_handles[1]
driver.switch_to.window(second_tab)
driver.get('https://www.taobao.com')

# Return to the first handle and load a different page there.
first_tab = driver.window_handles[0]
driver.switch_to.window(first_tab)
driver.get('https://python.org')
异常捕获与健壮性处理
避免因页面加载超时或元素不存在导致程序崩溃,并确保最终关闭浏览器:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
# By is required by the find_element call below; without this import the
# snippet fails with NameError when run on its own.
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
try:
    # Page load: report a timeout instead of aborting the script.
    try:
        driver.get('https://www.baidu.com')
    except TimeoutException:
        print('页面加载超时')

    # Element lookup: report a missing element instead of aborting.
    try:
        element = driver.find_element(By.ID, 'nonexistent_id')
    except NoSuchElementException:
        print('指定元素未找到')
finally:
    # The outer finally guarantees the session is shut down even if the page
    # load fails with something other than TimeoutException.
    driver.quit()