模擬登陸:
爬取基於某些使用者的使用者資訊
點選登陸按鈕之後發起post請求
post請求中會攜帶登陸之前錄入的相關登陸資訊(使用者名稱,密碼,驗證碼。。。)
Cookie
http/https協定特性:無狀態。
沒有請求到對應頁面資料的原因:
發起第二次基於個人主頁的頁面請求的時候,伺服器端並不知道本次請求是基於登陸狀態下的請求。
Cookie:用來讓伺服器端記錄使用者端的相關狀態
手動處理:
通過抓包工具獲取Cookie值,將該值封裝到headers中
自動處理:
session對談物件:
操作步驟:
https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx
Classcjy
import requests
from hashlib import md5
class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition endpoints.

    The constructor prepares the form fields and headers that are sent with
    every request; ``PostPic`` uploads an image and ``ReportError`` reports a
    previously uploaded image by its ID.
    """

    def __init__(self, username, password, soft_id, timeout=30):
        """Store the account data and build the shared request fields.

        Args:
            username: Account name, sent as the ``user`` form field.
            password: Plain-text password; only its MD5 hex digest is kept
                and sent (as the ``pass2`` form field).
            soft_id: Software ID, sent as the ``softid`` form field.
            timeout: Seconds passed to ``requests`` as the request timeout.
                Without one, a stalled server would block the caller forever.
        """
        self.username = username
        # Hash the UTF-8 bytes so non-ASCII passwords are handled as well.
        self.password = md5(password.encode('utf-8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        # Form fields merged into the payload of every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload an image and return the decoded JSON response.

        Args:
            im: Image content as bytes.
            codetype: Captcha type code; see
                http://www.chaojiying.com/price.html for the list.

        Returns:
            The response body parsed with ``Response.json()``.
        """
        params = {
            'codetype': codetype,
        }
        # Applied last, so the account fields win over any same-named key.
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()

    def ReportError(self, im_id):
        """Report an uploaded image by ID and return the decoded JSON response.

        Args:
            im_id: ID of the image being reported as wrongly recognized.

        Returns:
            The response body parsed with ``Response.json()``.
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()
主程式
import requests
from lxml import etree
from Classcjy import Chaojiying_Client
def getCodeText(imgPath):
    """Send the image at ``imgPath`` to Chaojiying and return the recognized text.

    Args:
        imgPath: Path of a local image file (on Windows the path may need
            doubled separators).

    Returns:
        The ``'pic_str'`` entry of the JSON response from ``PostPic``.

    Raises:
        KeyError: If the response contains no ``'pic_str'`` entry.
    """
    # The third argument is the software ID; generate your own in the
    # Chaojiying user center and substitute it here.
    # NOTE(review): the username literal ends with a space - confirm whether
    # that is intentional before relying on these credentials.
    chaojiying = Chaojiying_Client('azb123 ', 'azb123', '914332')
    # Use a context manager so the file handle is closed deterministically.
    with open(imgPath, 'rb') as fp:
        im = fp.read()
    # 1902 is the captcha type code passed to the service.
    return chaojiying.PostPic(im, 1902)['pic_str']
if __name__ == '__main__':
    # Download the captcha shown on the login page and print the text that
    # the recognition service returns for it.
    login_page_url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }

    # Parse the login page so the captcha <img> can be located by its id.
    login_response = requests.get(url=login_page_url, headers=request_headers)
    dom = etree.HTML(login_response.text)
    captcha_src = tree_src = dom.xpath('//*[@id="imgCode"]/@src')[0]
    captcha_url = 'https://so.gushiwen.cn/' + captcha_src

    # Save the image locally because getCodeText reads it from a file path.
    captcha_bytes = requests.get(url=captcha_url, headers=request_headers).content
    with open('./Code.jpg', 'wb') as image_file:
        image_file.write(captcha_bytes)

    print(getCodeText('./Code.jpg'))
import requests
from lxml import etree
if __name__ == '__main__':
    # Log in through a Session so that cookies set during the login flow are
    # recorded and sent again on the later request for the personal page.
    session = requests.Session()
    login_url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }

    # Fetch the login page and build an etree object from it.
    page_text = session.get(url=login_url, headers=headers).text
    tree = etree.HTML(page_text)

    # Extract the captcha image address and the dynamic hidden form fields.
    img_url = 'https://so.gushiwen.cn/' + tree.xpath('//*[@id="imgCode"]/@src')[0]
    viewstate = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
    viewstategenerator = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]

    # Save the captcha image locally; it is fetched through the same session
    # that later submits the login form.
    img_data = session.get(url=img_url, headers=headers).content
    with open('./Code.jpg', 'wb') as fp:
        fp.write(img_data)

    # Ask the user to type in the captcha text.
    img_code = input('請輸入驗證碼:')
    data = {
        '__VIEWSTATE': viewstate,
        '__VIEWSTATEGENERATOR': viewstategenerator,
        'from': 'http://so.gushiwen.cn/user/collect.aspx',
        'email': '註冊郵箱',
        'pwd': '密碼',
        'code': img_code,
        'denglu': '登入',
    }

    # Simulate the login with a POST request. The response itself is not
    # used, so it is no longer bound to a name.
    session.post(url=login_url, headers=headers, data=data)

    # Request the personal page with the same session and save it to disk.
    detail_url = 'https://so.gushiwen.cn/user/collectbei.aspx?sort=t'
    detail_text = session.get(url=detail_url, headers=headers).text
    with open('index.html', 'w', encoding='utf-8') as fp:
        fp.write(detail_text)
效果圖