最近需要获取某些信息,但每次打开页面都要滑动一次验证码,实在太烦了,所以研究了一下破解方案。
使用的技术
- python3
- opencv
- pymysql
- selenium
效果
核心思路
1) 使用selenium加载网站,定位到验证码的iframe
2) 获取背景图和缺口图(滑块图),使用opencv对灰度图像做模板匹配,分析出缺口位置。
result = cv2.matchTemplate(target, template, cv2.TM_CCOEFF_NORMED)
x, y = np.unravel_index(result.argmax(), result.shape)
3) 控制验证码的滑块到指定位置。
4) 使用代理ip:同一个ip访问次数太多时,验证码就算滑中了,也会被判定为不正确。
具体代码
```python
#!/usr/bin/env python
# encoding: utf-8
import random
import time
import os
import cv2
import ssl
import pymysql
import urllib.request
import numpy as np
from PIL import Image
from selenium.webdriver import ActionChains
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
class Login(object):
    """Solve a slider captcha with Python + Selenium + OpenCV (cv2).

    A headless Chrome session loads a page, the two captcha images found in
    the page's iframe are downloaded, OpenCV template matching locates the
    slider piece in the background, and the slider handle is dragged there.
    """

    def __init__(self, proxyIP=''):
        """Start a headless Chrome session.

        :param proxyIP: optional proxy address passed to Chrome's
            ``--proxy-server`` switch; an empty value means "no proxy".
        """
        # In a real application the account and password could be set here.
        self.url = None
        self.option = ChromeOptions()
        # Without a proxy IP the simulated captcha may not be accepted.
        if proxyIP:
            self.option.add_argument("--proxy-server=" + proxyIP)
        self.option.add_argument('--headless')
        self.option.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.driver = Chrome(options=self.option)

    def seturl(self, url):
        """Store the URL that :meth:`main` will load."""
        self.url = url

    @staticmethod
    def get_postion(otemp, oblk):
        """Locate the slider piece inside the captcha background image.

        Both images are read as grayscale, the background is inverted, and
        ``cv2.matchTemplate`` (``TM_CCOEFF_NORMED``) is run on them.

        :param otemp: path of the background image.
        :param oblk: path of the slider-piece image (used as the template).
        :return: ``(row, col)`` index of the best match in the result map.
            ``np.unravel_index`` yields row first, so the horizontal pixel
            offset is element ``[1]`` of the returned tuple.
        :raises FileNotFoundError: if either image cannot be read.
        """
        target = cv2.imread(otemp, cv2.IMREAD_GRAYSCALE)
        template = cv2.imread(oblk, cv2.IMREAD_GRAYSCALE)
        if target is None or template is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise FileNotFoundError(
                "could not read captcha image(s): %r, %r" % (otemp, oblk)
            )
        # The background is inverted before matching, as the original
        # approach did. The matching now happens in memory: no lossy JPEG
        # round trip and no temp.jpg / targ.jpg left in the working directory.
        target = 255 - target
        result = cv2.matchTemplate(target, template, cv2.TM_CCOEFF_NORMED)
        row, col = np.unravel_index(result.argmax(), result.shape)
        return row, col

    @staticmethod
    def get_track(distance):
        """Return the drag track for ``distance``.

        :param distance: horizontal distance to move; truncated with ``int``.
        :return: a one-element list holding the whole distance, i.e. the
            slider is moved in a single step.
        """
        return [int(distance)]

    @staticmethod
    def urllib_download(imgurl, imgsavepath):
        """Download ``imgurl`` and save it to ``imgsavepath``."""
        from urllib.request import urlretrieve
        urlretrieve(imgurl, imgsavepath)

    def quit2(self):
        """Close the current window and focus the last remaining one, if any."""
        self.driver.execute_script("window.close();")
        handles = self.driver.window_handles
        # Indexing [-1] on an empty handle list would raise IndexError when
        # the window just closed was the only one.
        if handles:
            self.driver.switch_to.window(handles[-1])

    def quit(self):
        """Shut down the browser session."""
        self.driver.quit()

    def login_main(self, driver):
        """Find the captcha in the page's iframe and drag the slider to the gap.

        :param driver: the Selenium driver whose current page shows the captcha.
        """
        # Function-scope import, like urllib_download above; the
        # find_element_by_* helpers no longer exist in current Selenium.
        from selenium.webdriver.common.by import By

        driver.switch_to.default_content()
        # The slider captcha lives inside the page's iframe.
        driver.switch_to.frame(driver.find_element(By.TAG_NAME, "iframe"))

        # Large background image.
        background = driver.find_element(By.XPATH, '//img[@id="slideBkg"]')
        web_image_width = background.size['width']
        bk_block_x = background.location['x']
        background_url = background.get_attribute('src')

        # Small slider piece.
        piece = driver.find_element(By.XPATH, '//img[@id="slideBlock"]')
        slide_block_x = piece.location['x']
        piece_url = piece.get_attribute('src')

        # Draggable slider handle.
        slid_ing = driver.find_element(By.XPATH, '//div[@id="slide_bar_head"]')

        # Download both images for analysis.
        os.makedirs('./image/', exist_ok=True)
        self.urllib_download(background_url, './image/bkBlock.png')
        self.urllib_download(piece_url, './image/slideBlock.png')
        time.sleep(0.5)

        with Image.open('./image/bkBlock.png') as img_bkblock:
            bkblock_real_width = img_bkblock.size[0]
        # Ratio between the downloaded image width and its on-page width.
        width_scale = float(bkblock_real_width) / float(web_image_width)

        # OpenCV analysis: element [1] of the result is the horizontal offset
        # in image pixels; convert it to page pixels.
        position = self.get_postion('./image/bkBlock.png', './image/slideBlock.png')
        real_position = position[1] / width_scale
        # Subtract the piece's existing offset from the background's left edge.
        real_position = real_position - (slide_block_x - bk_block_x)

        track_list = self.get_track(real_position)
        # Press and hold the left mouse button on the handle.
        ActionChains(driver).click_and_hold(on_element=slid_ing).perform()
        time.sleep(0.2)
        # Drag the handle along the track.
        for track in track_list:
            ActionChains(driver).move_by_offset(xoffset=track, yoffset=0).perform()
        # Release the mouse button.
        ActionChains(driver).release(on_element=slid_ing).perform()

    def main(self):
        """Load ``self.url``, solve the captcha and return the profile text.

        :return: text of the ``profile-card-user-info-description`` element,
            or ``None`` when it cannot be read (the caller may then retry
            the captcha).
        """
        from selenium.webdriver.common.by import By

        driver = self.driver
        driver.maximize_window()
        print(self.url)
        driver.get(self.url)
        time.sleep(4)
        self.login_main(driver)
        driver.switch_to.default_content()  # back to the main page
        time.sleep(4)
        try:
            info = driver.find_element(By.CLASS_NAME, 'profile-card-user-info-description')
            return info.text
        except Exception:
            # Best effort: the element is missing when the captcha was not
            # accepted. (The original `reutrn` typo raised NameError here.)
            return None
def getproxyIP(proxyurl="xx"):
    """Fetch a proxy address from a proxy-provider endpoint.

    :param proxyurl: URL whose response body is the proxy address. The
        default ``"xx"`` is a placeholder; substitute a real provider URL
        (free proxy services can be used).
    :return: the response body decoded with ``unicode_escape``, with
        surrounding whitespace removed.
    """
    # The context manager closes the HTTP response instead of leaking it.
    with urllib.request.urlopen(proxyurl) as response:
        raw = response.read()
    # Strip whitespace: a trailing newline would otherwise be concatenated
    # into Chrome's --proxy-server argument by Login.__init__.
    return raw.decode("unicode_escape").strip()
if __name__ == '__main__':
    # SECURITY NOTE(review): this disables HTTPS certificate verification for
    # every urllib request in the process. Kept as in the original; confirm
    # that it is really needed.
    ssl._create_default_https_context = ssl._create_unverified_context

    # Rows in `spider` supply the user ids used to build the URLs to crawl.
    # NOTE(review): connection credentials are hard-coded; consider loading
    # them from configuration.
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='phpshow', charset='utf8mb4')
    try:
        cursor = conn.cursor()
        while True:
            cursor.execute("select UserId from spider where crawl='1' and intro is Null limit 10")
            res = cursor.fetchall()
            # An empty result compares unequal to False, so the original
            # `res==False` test never ended the loop; test emptiness instead.
            if not res:
                break
            proxyIP = getproxyIP()
            print(proxyIP)
            # TODO: handle failures of the proxy lookup / browser start-up.
            login = Login(proxyIP)
            try:
                for val in res:
                    print(val[0])
                    try:
                        time.sleep(1)
                        # URL pattern; adjust manually for the target site.
                        url = "https://xx.com/search/author?keyword=" + str(val[0]) + "&page=1"
                        login.seturl(url)
                        text = login.main()
                        print(text)
                        if text:
                            print("update")
                            # "xx" is a placeholder table name. Parameterized
                            # so quotes in the scraped text cannot break or
                            # inject into the statement.
                            update_sql = "update xx set intro=%s where UserId = %s"
                            params = (text, val[0])
                            print(update_sql, params)
                            with conn.cursor() as cursor2:
                                cursor2.execute(update_sql, params)
                            conn.commit()
                    except Exception as e:
                        # Keep going with the next row. The window is left
                        # open on purpose: Login.quit2() closes the only
                        # window and indexes window_handles[-1], which fails
                        # when no window remains and would abort the batch.
                        print("error")
                        print(repr(e))
            finally:
                # Always release the browser, even if the batch aborted.
                login.quit()
            print("---------------------------")
    finally:
        conn.close()
```