Notes on Cracking a Site's AES Encryption While Scraping

The site this time is manhuafen, and the target is (the original) Karakai Jouzu no Takagi-san. Ah, this slice-of-life stuff is about all that keeps my interest in the opposite sex alive, damn it.

First, I need to scrape the URL of every chapter. The manhuafen comic page lists all of them, so that's the address to start from:

list_url = []

def getlink(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # every chapter link sits under the #chapter-list-1 node
    each_url = soup.select('#chapter-list-1 > li > a')
    for each_link in each_url:
        href = each_link.get("href")
        list_url.append("https://www.manhuafen.com" + str(href))

Walk the DOM node tree to collect every chapter URL, then drop them into the KFC family buck... no, into the list, for the crawl loop later.
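A quick usage sketch, using the same comic index URL the full script at the end uses:

getlink("https://www.manhuafen.com/comic/2265/")
print(len(list_url))   # number of chapters found
print(list_url[0])     # full URL of the first chapter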

Step two is iterating over that list and scraping each chapter page. The rendered page clearly loads image URLs, but they are nowhere to be found in the DOM tree I fetch.

After adjusting the request headers several times with no luck, I started reading the site's JS, and realized the addresses are handed over after all; they're just encrypted, damn it.

For example, the image data for chapter 1 is:

ebFZ94Vok0+P2wszDe/R6K7J79pWZV7Jhmoh9vqriOSBjqfii7WMi3EHyOnilq47Ari4oa1DEBWNam33hc5J8JqS1LGS5xn0/7sG3xLEAhdj3Mv6PPzT60o0iKPHitBuFKA0qw72ZyhJG1vyPCjIgOhOCFZf3xuIXtzX+qggd2x89sTodqI5Kv1RWx/987kI0jNlSe6YEXUMWutk/4GF3dRnafwMxs2LjgBnHwqpFRRqe0mK0ZIMogBddlvNBCX1npt5DMlvglxOCc28usgKWJWNLS9ailCU5HX7ohNljt0LvcW4zBP/Z/lnQO7yWpsc4Y8d7QzlTcE7aWnGcwATc9xegzA2Lnvk/8GzhCKRUJWPUoE33NGHaBE62uv4z3k+y37NYPTI73txbFt5TbGkn0Mk5UtcAu0vtY+Eu+HfIfh34h1E2G7vjX/LgZ25VqXWUznAWBEt0Md58EBiiAfDk5xQvSDbYzHJaphaxtNfMzekv4AXfpPFNFyemyU/FDHwvcP6F0QyCab/sg75LiGiQKTSMmeXt+L1wSROkkNAT39Z1bG0FtgJUPlQ1j654VcBqeLdnHmKC9+E/zCV2VY2/lIOUP5pwCNzLYKMP6dDpD9vD6oNNx4W0jsp2TVFdIAYLZo0cxpIyOgGtC/3ylwS3pISFkgOXyoXxZgc/aZKDtoOFSHMBJW4vR5qsj/LQSCv+dg4qFRVZVgB2+exlTGnJovt3Tg2Ktu1Q1u3VK2eJyuALsnJ3a1x7XpZ+s/KLE4CMStCpvOKmPc+rLzuSIRJ15MmkWkqfWSioU8utb0rPmHJna1QG3KjCuzoN3bZ6Ot4PsGdG74YK0TNb2h4fipNdEDC/8GJTw0B2hH49u+d6jq5clhpduiTCegrTC+GeKa+OJGXJwfrfzPIUCOC8lSf430Y/VpAGlf+z5yw/rFBKMuluBRxW3VKXSDpMNAVST5/S4YSodU6Q3k5wbF28n3aOR4Ez65FJsa7356JL5tpFj3mxrsN5gP7FoG/vQpBhSAEB3HAc6glXZbUp0kahE9s0hLOlYgoLQIJopp3vCrmDNcWuKqexvko2C/8uY9Jk+h/TQpSE9LlbJYOM5BI97zzFq74tRNIvTnrPzOuSs5wtQ5wLcamIoFa274sAUywhkKHmPUUpT6Nde40xmx0Pot2jNNaYakIVSAOwkYSzCIzu9jluTVJVRnKljJRJhLpYK3dysYFrRIlNOpe2/z5KilZ0bgqfL6Je2VKUCl3XTkJSajepAwVLajNzCtI6ohlYA2FcBi4mTax723/Csic0/ODYMa8oLnvgUOyYIubjAck8TidrmMK/AfnNUH5kXbggFiDKKCln9WrXCD0KlOwuvPD6RtlVa+/lDu81yCPCcqv36E=

What scene haven't I seen before?

Well, this one I genuinely hadn't. At that point I had no idea what encryption scheme it was.
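In hindsight, a quick property check already points at a block cipher: the blob decodes cleanly from base64 to a byte length that is a multiple of 16, AES's block size. A minimal sketch of that check (my own, not from the site's code):

import base64

def looks_like_block_cipher(blob: str) -> bool:
    """True if blob decodes from base64 to a multiple of 16 bytes."""
    raw = base64.b64decode(blob)
    return len(raw) % 16 == 0

# Pasting in the full chapter-1 blob from above returns True:
# looks_like_block_cipher("ebFZ94Vok0+P...")  # -> True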

Then I found the site's decryption function in decrypt.js:

function decrypt20180904(chapterId, chapterName, comicId, comicName) {
    var key = CryptoJS.enc.Utf8.parse("123456781234567G"); // 16-character key
    var iv = CryptoJS.enc.Utf8.parse('ABCDEF1G34123412');
    var decrypt = CryptoJS.AES.decrypt(chapterImages, key, { iv: iv, mode: CryptoJS.mode.CBC, padding: CryptoJS.pad.Pkcs7 });
    console.log(decrypt);
    var decryptedStr = decrypt.toString(CryptoJS.enc.Utf8);
    chapterImages = JSON.parse(decryptedStr.toString());
    SinMH.initChapter(chapterId, chapterName, comicId, comicName);
    SinTheme.initChapter(chapterId, chapterName, comicId, comicName);
}

Clearly, this is AES in CBC mode with PKCS7 padding. Note that CryptoJS.enc.Utf8.parse simply turns a string into its raw UTF-8 bytes, so the key and IV carry straight over to another language. To confirm, I fed the ciphertext above through this function, and the result: okay, I'm feeling much better now.

["https://mhimg.eshanyao.com/ManHuaKu/s/shanchangzhuonongrendeyuangaomutongxue/1/201942391.jpg", "https://mhimg.eshanyao.com/ManHuaKu/s/shanchangzhuonongrendeyuangaomutongxue/1/201942392.jpg", "https://mhimg.eshanyao.com/ManHuaKu/s/shanchangzhuonongrendeyuangaomutongxue/1/201942393.jpg", "https://mhimg.eshanyao.com/ManHuaKu/s/shanchangzhuonongrendeyuangaomutongxue/1/201942394.jpg", "https://mhimg.eshanyao.com/ManHuaKu/s/shanchangzhuonongrendeyuangaomutongxue/1/201942395.jpg", "https://mhimg.eshanyao.com/ManHuaKu/s/shanchangzhuonongrendeyuangaomutongxue/1/201942396.jpg", "https://mhimg.eshanyao.com/ManHuaKu/s/shanchangzhuonongrendeyuangaomutongxue/1/201942397.jpg", "https://mhimg.eshanyao.com/ManHuaKu/s/shanchangzhuonongrendeyuangaomutongxue/1/201942398.jpg", "https://mhimg.eshanyao.com/ManHuaKu/s/shanchangzhuonongrendeyuangaomutongxue/1/201942399.jpg", "https://mhimg.eshanyao.com/ManHuaKu/s/shanchangzhuonongrendeyuangaomutongxue/1/lkdwbu5cqzc42390.jpg"]

These are all image URLs, and they can be accessed directly. Next, I need to reimplement this decryption in Python:

def decrypt(text):
    text = base64.b64decode(text)
    key = '123456781234567G'.encode('utf-8')
    iv = 'ABCDEF1G34123412'.encode('utf-8')
    mode = AES.MODE_CBC
    cryptos = AES.new(key, mode, iv)
    plain_text = cryptos.decrypt(text)
    print(plain_text)
    # the PKCS7 padding bytes land after the closing bracket, so cut
    # at the first "]" before eval'ing the list of URLs
    return eval(plain_text.decode('utf-8').split("]")[0] + "]")
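The split("]") trick works because the URLs themselves contain no "]", but a cleaner route is to strip the padding properly and parse with json instead of eval. A sketch of an equivalent version, assuming pycryptodome (which provides Crypto.Util.Padding):

import base64
import json

from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad

def decrypt_clean(text):
    raw = base64.b64decode(text)
    key = b'123456781234567G'
    iv = b'ABCDEF1G34123412'
    cipher = AES.new(key, AES.MODE_CBC, iv)
    # unpad() removes the PKCS7 padding the site's JS applies,
    # leaving exactly the JSON array of image URLs
    plain = unpad(cipher.decrypt(raw), AES.block_size)
    return json.loads(plain.decode('utf-8'))

As a bonus, json.loads unescapes the \/ sequences in the raw JSON, which is exactly what the replace('\\', '') in the download step below compensates for.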

Once the image list is out, the last step is downloading. The download function:

def download(image, count):
    try:
        if not os.path.exists(file_path + str(count)):
            os.mkdir(file_path + str(count))
            print("ok")
        else:
            print("no")
        for img in range(1, len(image) + 1):
            # eval() leaves the JSON's \/ escapes intact, so strip the backslashes
            url = image[img - 1].replace('\\', '')
            urllib.request.urlretrieve(url, file_path + str(count) + "/" + str(img) + ".jpg")
    except IOError as e:
        print("IOError")

Here's the complete script:

#!/usr/bin/env python
# coding=utf-8

import requests
from bs4 import BeautifulSoup
from Crypto.Cipher import AES
import js2xml
import base64
import os
import urllib.request

file_path = "D:/其他/comic/"

list_url = []
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'Host': 'www.manhuafen.com',
    "Cookie": "__guid=218441119.3187460153474006000.1573616071796.2761; UM_distinctid=16e62d3bf3d272-05a9ce4887ca7-3c604504-1fa400-16e62d3bf3e1fd; CNZZDATA1277232052=127552348-1573615977-https%253A%252F%252Fm.manhuafen.com%252F%7C1585621691; _csrf=NZOaev8erSnfAvzDztJ7_Pt8kJjhJle2; monitor_count=4"
}

def download(image, count):
    try:
        if not os.path.exists(file_path + str(count)):
            os.mkdir(file_path + str(count))
            print("ok")
        else:
            print("no")
        for img in range(1, len(image) + 1):
            # eval() leaves the JSON's \/ escapes intact, so strip the backslashes
            url = image[img - 1].replace('\\', '')
            urllib.request.urlretrieve(url, file_path + str(count) + "/" + str(img) + ".jpg")
    except IOError as e:
        print("IOError")

def decrypt(text):
    text = base64.b64decode(text)
    key = '123456781234567G'.encode('utf-8')
    iv = 'ABCDEF1G34123412'.encode('utf-8')
    mode = AES.MODE_CBC
    cryptos = AES.new(key, mode, iv)
    plain_text = cryptos.decrypt(text)
    print(plain_text)
    # cut at the closing "]" to drop the PKCS7 padding bytes
    return eval(plain_text.decode('utf-8').split("]")[0] + "]")


def getlink(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    each_url = soup.select('#chapter-list-1 > li > a')
    for each_link in each_url:
        href = each_link.get("href")
        list_url.append("https://www.manhuafen.com" + str(href))

def getpagelink(url):
    header = dict(headers)  # copy, so the shared headers dict isn't mutated
    header.update({"Referer": url})
    wb_data = requests.get(url, headers=header)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # the ciphertext sits in an inline <script>; parse it with js2xml
    # and pull the first var (chapterImages) out of the syntax tree
    each_url = soup.select('script')[2].string
    src_text = js2xml.parse(each_url, encoding='utf-8', debug=False)
    src_tree = js2xml.pretty_print(src_text)
    src_tree = BeautifulSoup(src_tree, 'lxml')
    print(str(src_tree.select("var")[0].text))
    image = decrypt(src_tree.select("var")[0].text)
    count = soup.select(".head_title > h2")[0].getText()
    print(count)
    print(image)
    download(image, count)

if __name__ == '__main__':
    url = "https://www.manhuafen.com/comic/2265/"
    getlink(url)
    for page_url in list_url[:80]:
        getpagelink(page_url)

Summary

Alright, done. The last time I ran into encryption was while scraping my university's course-schedule system; there I just used selenium, which amounts to driving Chrome directly with pre-scripted actions. The upside is that absolutely everything loads as in a real browser; the downside is that it makes the scraper drastically slower. Had I gone that route here, the 3 minutes for 129 chapters would probably have become 20.

PS: still waiting for the new term to start; it's already almost half over. Rough :-(

PS to the PS: whenever I write a blog post I end up wanting to tack on PSes (a Gintama reference).

PS to the PS to the PS: ah, Takagi-san is so sweet, but a shut-in otaku like me will never see a spring of his own.