使用Selenium与Requests构建网页爬虫及代理池实践
处理含反调试机制的页面抓取
某些网站为防止自动化工具访问,会加入反调试逻辑。例如页面通过debugger语句阻塞开发者工具的使用,或禁用右键菜单。这类防护可通过浏览器开发者工具中的"停用所有断点"功能绕过,也可通过修改关键JavaScript代码实现。
常见做法是在加载页面后,定位包含debugger的脚本位置,并将其替换为空函数。例如在Chrome控制台执行:
// Neutralize `debugger` traps that a page generates at runtime.
//
// `debugger` is a statement (a reserved word), not a property of `window`,
// so assigning to `window.debugger` has no effect on it. Anti-debugging
// scripts often build the trap dynamically, e.g.
// `(function () {}).constructor("debugger")()`. Hooking
// `Function.prototype.constructor` lets us hand back a no-op for exactly
// those calls while leaving every other dynamically built function intact.
//
// Limitation: `debugger` statements written literally in a script file are
// not affected by this hook; use the DevTools "Deactivate breakpoints"
// toggle (or a local override of the script) for those.
(function () {
    var originalConstructor = Function.prototype.constructor;

    Function.prototype.constructor = function () {
        var args = Array.prototype.slice.call(arguments);
        // The function body is always the last argument of Function(...).
        var body = args.length ? String(args[args.length - 1]) : "";

        if (/\bdebugger\b/.test(body)) {
            // Swallow the trap: return a function that does nothing.
            return function () {};
        }
        return originalConstructor.apply(this, args);
    };
})();
绕过反调试后,即可结合Selenium操作页面元素;若页面存在嵌套的iframe结构,必须先切换到对应的frame上下文,才能定位其中的元素。
Selenium操作带frame的页面
目标网址使用了iframe嵌套内容,需先定位并切换至对应frame才能操作内部元素。以下示例展示如何获取城市监测数据:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
# Target page and the locators used on it.
MONITOR_URL = "https://szzdjc.cnemc.cn:8070/GJZ/Business/Publish/Main.html"
CITY_XPATH = '//span[@class="city-name"]'
DETECT_TIME_XPATH = '//td[4][@class="detect-time"]'


def scrape_city_monitoring(step_timeout=15, render_timeout=45):
    """Open the monitoring page, pick the first region and print city rows.

    The content lives inside the iframe with id ``MF``, so the driver has to
    switch into that frame before any inner element can be located.

    Args:
        step_timeout: Seconds to wait for each navigation element (frame,
            area trigger, region link) before giving up.
        render_timeout: Upper bound, in seconds, for the dynamic data to
            appear after a region has been selected.

    Raises:
        selenium.common.exceptions.TimeoutException: If the frame, the area
            trigger or the region link does not become available in time.
    """
    # Imported here so the script's top-level import lines stay untouched.
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    browser = webdriver.Chrome()
    try:
        browser.get(MONITOR_URL)
        wait = WebDriverWait(browser, step_timeout)

        # Wait for the MF iframe and switch into it in one step.
        wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID, "MF")))

        # Open the area selector.
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#ddm_Area span"))
        ).click()

        # Click the first region link once it is interactable.
        wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//a[contains(@href, 'javascript')]")
            )
        ).click()

        # Wait for the data to render, returning as soon as a city node
        # shows up instead of always sleeping for the full budget.
        try:
            WebDriverWait(browser, render_timeout).until(
                EC.presence_of_element_located((By.XPATH, CITY_XPATH))
            )
        except TimeoutException:
            # Fall through: the scroll below may still trigger lazy loading,
            # which matches what the fixed sleep used to do.
            pass

        # Scroll to the bottom to trigger lazy loading.
        browser.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);"
        )
        time.sleep(2)

        # Extract city names and detection times.
        city_nodes = browser.find_elements(By.XPATH, CITY_XPATH)
        time_nodes = browser.find_elements(By.XPATH, DETECT_TIME_XPATH)
        for city, timestamp in zip(city_nodes, time_nodes):
            print(f"城市: {city.text}, 检测时间: {timestamp.text}")

        # Return to the top-level document context.
        browser.switch_to.default_content()
    finally:
        # Always release the browser, even when a step above fails.
        browser.quit()


if __name__ == "__main__":
    scrape_city_monitoring()
基于公开代理构建可用IP池
为避免频繁请求导致IP被封禁,常采用代理服务器中转流量。以下介绍从免费代理平台采集并验证有效代理的方法。
利用Selenium抓取89ip代理并测试
通过模拟浏览器行为访问89ip.cn,提取每页列出的IP与端口,并使用requests进行连通性测试。
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random
import requests
# Endpoint used to check whether a proxy can relay a request.
HTTPBIN_URL = "http://httpbin.org/ip"


def build_proxy_map(proxy_str):
    """Return the ``proxies`` mapping that routes http and https via *proxy_str*.

    Args:
        proxy_str: Proxy address in ``ip:port`` form.
    """
    endpoint = f"http://{proxy_str}"
    return {"http": endpoint, "https": endpoint}


def pair_candidates(ip_texts, port_texts):
    """Combine scraped cell texts into ``ip:port`` strings.

    Rows with an empty IP or a non-numeric port are dropped so that they are
    never handed to ``requests`` as a malformed proxy URL.
    """
    cleaned = (
        (ip.strip(), port.strip()) for ip, port in zip(ip_texts, port_texts)
    )
    return [f"{ip}:{port}" for ip, port in cleaned if ip and port.isdigit()]


def proxy_is_alive(proxy_str, timeout=5):
    """Return True when a request through *proxy_str* answers with HTTP 200."""
    try:
        # The probe URL is plain http, so certificate verification settings
        # have no effect here and are omitted.
        response = requests.get(
            HTTPBIN_URL,
            proxies=build_proxy_map(proxy_str),
            timeout=timeout,
        )
    except (requests.RequestException, ValueError):
        # Dead or malformed proxy: an expected outcome for free proxy lists.
        return False
    return response.status_code == 200


def scrape_89ip_pages(driver, first_page=1, last_page=2):
    """Collect ``ip:port`` candidates from the 89ip listing pages."""
    candidates = []
    for page_num in range(first_page, last_page + 1):
        driver.get(f"https://www.89ip.cn/index_{page_num}.html")
        time.sleep(3)
        ip_elements = driver.find_elements(By.XPATH, "//tbody/tr/td[1]")
        port_elements = driver.find_elements(By.XPATH, "//tbody/tr/td[2]")
        # Read the text now, while the page (and the driver) is still alive.
        candidates.extend(
            pair_candidates(
                [elem.text for elem in ip_elements],
                [elem.text for elem in port_elements],
            )
        )
    return candidates


def run_89ip_selenium():
    """Scrape 89ip with a headless browser, test each proxy and report."""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")  # optional: run without a window
    driver = webdriver.Chrome(options=chrome_options)
    try:
        candidates = scrape_89ip_pages(driver)
    finally:
        # Close the browser before the slow connectivity tests start and
        # make sure it is released even if scraping fails.
        driver.quit()

    valid_proxies = [proxy for proxy in candidates if proxy_is_alive(proxy)]
    selected_proxy = random.choice(valid_proxies) if valid_proxies else None

    print(f"共收集到 {len(valid_proxies)} 个有效代理")
    if selected_proxy:
        print(f"随机选用代理: {selected_proxy}")
    else:
        print("未找到可用代理")


if __name__ == "__main__":
    run_89ip_selenium()
使用Requests直接解析89ip代理列表
相比Selenium,直接使用requests配合lxml可提升效率,尤其适用于静态页面。
import requests
from lxml import html
import random
import time
# Browser-like User-Agent sent with the listing requests.
LISTING_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}
# Endpoint used to check whether a proxy can relay a request.
PROBE_URL = "http://httpbin.org/ip"


def proxy_settings(address, protocol="http"):
    """Return the ``proxies`` mapping that sends http and https via *address*.

    Args:
        address: Proxy address in ``ip:port`` form.
        protocol: Scheme used to talk to the proxy itself.
    """
    target = f"{protocol}://{address}"
    return {"http": target, "https": target}


def clean_candidates(raw_ips, raw_ports):
    """Strip scraped cell texts and join them into ``ip:port`` strings.

    Rows with an empty IP or a non-numeric port are skipped.
    """
    stripped = (
        (ip.strip(), port.strip()) for ip, port in zip(raw_ips, raw_ports)
    )
    return [f"{ip}:{port}" for ip, port in stripped if ip and port.isdigit()]


def fetch_listing(page, timeout=10):
    """Download one 89ip listing page and return its proxy candidates.

    Raises:
        requests.RequestException: If the page cannot be fetched or answers
            with an HTTP error status.
    """
    url = f"https://www.89ip.cn/index_{page}.html"
    # Without a timeout, requests would wait indefinitely on a stalled server.
    res = requests.get(url, headers=LISTING_HEADERS, timeout=timeout)
    res.raise_for_status()
    tree = html.fromstring(res.content)
    return clean_candidates(
        tree.xpath('//tbody/tr/td[1]/text()'),
        tree.xpath('//tbody/tr/td[2]/text()'),
    )


def check_proxy(address, timeout=5):
    """Return True when a request through *address* answers with HTTP 200."""
    try:
        check = requests.get(
            PROBE_URL, proxies=proxy_settings(address), timeout=timeout
        )
    except (requests.RequestException, ValueError):
        # Dead or malformed proxy: an expected outcome for free proxy lists.
        return False
    return check.status_code == 200


def run_89ip_requests():
    """Scrape 89ip with requests + lxml, test each proxy and report."""
    valid_pool = []
    for page in range(1, 3):
        try:
            candidates = fetch_listing(page)
        except requests.RequestException as err:
            # One broken listing page should not abort the whole run.
            print(f"page {page} could not be fetched: {err}")
            candidates = []
        valid_pool.extend(addr for addr in candidates if check_proxy(addr))
        time.sleep(1)  # throttle requests to the listing site

    chosen = random.choice(valid_pool) if valid_pool else "无可用代理"
    print(f"有效代理总数: {len(valid_pool)}")
    print(f"选定代理: {chosen}")


if __name__ == "__main__":
    run_89ip_requests()
从快代理平台提取HTTPS代理
快代理提供HTTP和HTTPS两种类型代理,需根据协议类型配置请求参数。requests的proxies字典以目标URL的协议(http或https)作为键,因此测试地址的协议必须与所配置的键一致,否则请求不会经过代理,验证结果也就失去意义。
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random
import requests
# Column locators of the kuaidaili free-proxy table.
KDL_PROTOCOL_XPATH = '//*[@id="table__free-proxy"]/div/table/tbody/tr/td[4]'
KDL_IP_XPATH = '//*[@id="table__free-proxy"]/div/table/tbody/tr/td[1]'
KDL_PORT_XPATH = '//*[@id="table__free-proxy"]/div/table/tbody/tr/td[2]'


def probe_plan(scheme, host, port):
    """Describe how to test one listed proxy.

    ``requests`` picks a proxy by the scheme of the *target* URL, so the
    probe URL must use the same scheme as the key in the ``proxies`` mapping.
    Probing ``http://...`` with only an ``"https"`` key would bypass the
    proxy entirely and report every HTTPS entry as working.

    Args:
        scheme: Protocol text from the listing (case and padding ignored).
        host: Proxy IP or host name.
        port: Proxy port as text.

    Returns:
        ``(full_proxy, test_url, proxies)`` for http/https entries, or
        ``None`` when the row is incomplete or the protocol is unsupported.
    """
    scheme = scheme.strip().lower()
    host = host.strip()
    port = port.strip()
    if scheme not in ("http", "https") or not host or not port.isdigit():
        return None
    # NOTE(review): the proxy URL keeps the listed scheme, as the original
    # script did. If these proxies speak plain HTTP and only tunnel HTTPS,
    # the value should be http://host:port instead -- confirm.
    full_proxy = f"{scheme}://{host}:{port}"
    test_url = f"{scheme}://httpbin.org/ip"
    return full_proxy, test_url, {scheme: full_proxy}


def scrape_kuaidaili_rows(browser, first_page=1, last_page=2):
    """Return ``(protocol, ip, port)`` text tuples from the listing pages."""
    rows = []
    for p in range(first_page, last_page + 1):
        browser.get(f"https://www.kuaidaili.com/free/inha/{p}/")
        time.sleep(3)
        protocols = browser.find_elements(By.XPATH, KDL_PROTOCOL_XPATH)
        ips = browser.find_elements(By.XPATH, KDL_IP_XPATH)
        ports = browser.find_elements(By.XPATH, KDL_PORT_XPATH)
        # Read the text now, while the page (and the driver) is still alive.
        rows.extend(
            (proto.text, ip_elem.text, port_elem.text)
            for proto, ip_elem, port_elem in zip(protocols, ips, ports)
        )
    return rows


def kuaidaili_proxy_works(test_url, proxies, timeout=5):
    """Return True when *test_url* answers with HTTP 200 through *proxies*."""
    try:
        # verify=False is kept from the original script.
        resp = requests.get(
            test_url, proxies=proxies, timeout=timeout, verify=False
        )
    except (requests.RequestException, ValueError):
        # Dead or malformed proxy: an expected outcome for free proxy lists.
        return False
    return resp.status_code == 200


def run_kuaidaili():
    """Scrape kuaidaili with a headless browser, test proxies and report."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    browser = webdriver.Chrome(options=options)
    try:
        rows = scrape_kuaidaili_rows(browser)
    finally:
        # Release the browser even if scraping fails, and before the slow
        # connectivity tests start.
        browser.quit()

    working_proxies = []
    for proto, host, port in rows:
        plan = probe_plan(proto, host, port)
        if plan is None:
            continue
        full_proxy, test_url, proxies = plan
        if kuaidaili_proxy_works(test_url, proxies):
            working_proxies.append(full_proxy)

    final_proxy = random.choice(working_proxies) if working_proxies else "无有效代理"
    print(f"成功验证代理数: {len(working_proxies)}")
    print(f"最终使用的代理: {final_proxy}")


if __name__ == "__main__":
    run_kuaidaili()
搜狗微信搜索多页内容抓取
针对分页内容,可通过循环点击"下一页"按钮持续采集信息。
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
def crawl_sogou_weixin(keyword="爬虫", max_pages=10, items_per_page=10):
    """Search Sogou Weixin for *keyword* and print titles page by page.

    Args:
        keyword: Text typed into the search box.
        max_pages: Maximum number of result pages to walk through.
        items_per_page: Number of result slots probed on every page.
    """
    # Imported here so the script's top-level import lines stay untouched.
    from selenium.common.exceptions import WebDriverException

    browser = webdriver.Chrome()
    try:
        browser.get("https://weixin.sogou.com/pcindex/")
        search_input = browser.find_element(By.ID, "query")
        search_input.send_keys(keyword)
        time.sleep(1)
        submit_btn = browser.find_element(By.XPATH, '//input[@type="submit"]')
        submit_btn.click()
        time.sleep(5)

        for page_index in range(max_pages):
            print(f"正在抓取第 {page_index + 1} 页数据...")

            # Result entries carry a zero-based index in their element ids.
            for item_idx in range(items_per_page):
                try:
                    title_el = browser.find_element(
                        By.ID, f"sogou_vr_11002601_title_{item_idx}"
                    )
                    author_el = browser.find_element(
                        By.XPATH,
                        f'//li[@id="sogou_vr_11002601_box_{item_idx}"]//span[@node-type="media-name"]',
                    )
                    print(f"标题: {title_el.text}, 公众号: {author_el.text}")
                except WebDriverException as e:
                    # A missing slot is reported and skipped; the remaining
                    # entries are still read.
                    print(f"读取第{item_idx+1}条失败: {e}")

            try:
                next_page = browser.find_element(By.ID, "sogou_next")
                next_page.click()
                time.sleep(5)
            except WebDriverException as error:
                # No usable "next page" control: stop paging.
                print("无法继续翻页:", error)
                break
    finally:
        # Always release the browser, even when a step above fails.
        browser.quit()


if __name__ == "__main__":
    crawl_sogou_weixin()
