Preface

A few days ago I learned how to scrape content that never appears in a page's HTML source by requesting the JSON behind it, and Zhihu is exactly that kind of site. Since I've been short of good images for blog covers lately, I decided to solve that problem once and for all this way. While looking into it, I found two ways to fetch the images.

Method 1

import requests
from bs4 import BeautifulSoup
import re
import os


def Askurl(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'}
    # The User-Agent header is required; without it the request fails
    response = requests.get(url=url, headers=headers)
    # print(response.text)
    return response

piclink = []

def Getpic(html):
    # .*? matches any number of characters, non-greedily; re.S lets . span newlines
    Findurl = re.compile('<noscript>.*?data-original="(.*?)".*?</noscript>', re.S)
    soup = BeautifulSoup(html, "html.parser")
    # Select all the <figure> nodes
    items = soup.select('figure')
    for index in items:
        try:
            index = str(index)
            link = re.findall(Findurl, index)[0]
            piclink.append(link)
        except IndexError:
            # Handles figures with no image link -- important, the data can be empty
            print("An image is missing")

def download(path):
    for i in range(0, len(piclink)):
        # .content returns raw bytes, suited to images and video,
        # as opposed to .text, which returns decoded text
        img_bytes = Askurl(piclink[i]).content
        if not os.path.exists(path):
            os.makedirs(path)

        with open(path + "image" + str(i) + ".jpg", "wb+") as f:
            f.write(img_bytes)

        print(str(i + 1) + "/" + str(len(piclink)))
    print("Images downloaded successfully")


def main():
    url = "https://zhuanlan.zhihu.com/p/100765620"
    path = "Images/"
    Getpic(Askurl(url).text)
    download(path)

if __name__ == "__main__":
    main()

This method fetches the entire page source directly with requests and parses the image links out of the HTML.
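On these pages the full-resolution URL sits in a <noscript> fallback inside each <figure>, which is what the regex above targets. Here is a minimal sketch of that extraction, run on a made-up snippet (the image host path and hash are hypothetical, for illustration only):

import re

# Hypothetical <figure> markup, modeled on what the column page rendered;
# the <noscript> fallback carries the full-size URL in data-original
sample = ('<figure><noscript><img src="https://pic1.zhimg.com/abc123_b.jpg" '
          'data-original="https://pic1.zhimg.com/abc123_r.jpg"></noscript></figure>')

Findurl = re.compile('<noscript>.*?data-original="(.*?)".*?</noscript>', re.S)
print(re.findall(Findurl, sample))  # ['https://pic1.zhimg.com/abc123_r.jpg']

The non-greedy (.*?) stops at the first closing quote, so each figure yields exactly one link.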

Method 2

from bs4 import BeautifulSoup
import requests
import json
import os

def fetchUrl(url):
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
    }

    r = requests.get(url, headers=header)
    r.encoding = "utf-8"
    return r

def parseJson(jsonStr):
    # This function pulls the individual fields out of the JSON payload
    jsonObj = json.loads(jsonStr)  # parse the JSON
    # 'data' is a top-level key of the response; 'author' is likewise a key of
    # each item, and 'name' a sub-key of 'author' -- compare with the raw JSON
    data = jsonObj['data']
    # datalist = []
    for item in data:
        name = item["author"]["name"]
        print("Scraping the answer by", name)
        Download(item["content"])
        # headline = item["author"]["headline"]
        # dateTime = time.strftime("%Y-%m-%d", time.localtime(item['updated_time']))
        # comment_count = item['comment_count']
        # voteup_count = item['voteup_count']
        # content = parseHtml(item["content"])
        # datalist.append([name, headline, dateTime, comment_count, voteup_count, content])
    # Return the answers so the caller can iterate over and count them
    return data

def Download(html):

    bsObj = BeautifulSoup(html, "html.parser")
    images = bsObj.find_all("noscript")

    if len(images) == 0:
        print("This answer contains no images")
    else:
        print("This answer contains", len(images), "images, downloading...")
        for item in images:
            link = item.img['data-original']
            print(link)
            downloadImage(link, "Images/")
        print("Images downloaded")

    return bsObj.text

def downloadImage(url, path):

    img_bytes = fetchUrl(url).content
    # url:      https://pic3.zhimg.com/c7ad985268e7144b588d7bf94eedb487_r.jpg?source=1940ef5c
    # filename: c7ad985268e7144b588d7bf94eedb487_r.jpg
    filename = url.split("?")[0].split("/")[-1]

    # Create the folder automatically if it does not exist yet
    if not os.path.exists(path):
        os.makedirs(path)

    with open(path + filename, "wb+") as f:
        f.write(img_bytes)

# def saveData(data, filename):
#
#     dataframe = pd.DataFrame(data)
#     dataframe.to_csv(filename, mode='a', index=False, sep=',', header=False, encoding="utf_8_sig")

if __name__ == "__main__":

    # Output filename (only used by the commented-out saveData above)
    filename = "data.csv"
    qid = 316039999
    offset = 0
    totalNum = 50

    while offset < totalNum:
        url = "https://www.zhihu.com/api/v4/questions/{0}/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit=5&offset={1}&platform=desktop&sort_by=default".format(qid, offset)
        # The request URL carries the page parameters; format() substitutes {0}
        # and {1}. offset: the pagination offset, qid: the Zhihu question id
        html = fetchUrl(url).text

        for data in parseJson(html):
            offset += 1
            print("Scraped", offset, "answers so far, out of", totalNum)
        print("---" * 20)

This second method fetches the content that the page loads via JSON, paging through the answers with the offset parameter.
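Instead of hard-coding totalNum, the loop could stop when the API itself signals the last page. A minimal sketch, assuming the response carries a paging object with an is_end flag (as Zhihu's v4 API did at the time of writing); url_template stands for the same long API URL used above, and fetchUrl and Download are the functions from the script:

import json

def crawl_all(qid, url_template):
    # url_template: the API URL from above, with {0}/{1} placeholders
    offset = 0
    while True:
        jsonObj = json.loads(fetchUrl(url_template.format(qid, offset)).text)
        for item in jsonObj['data']:
            Download(item["content"])  # save the images in this answer
            offset += 1
        print("Fetched", offset, "answers so far")
        # 'paging.is_end' is an assumption about the response shape;
        # adjust it if the API differs
        if jsonObj['paging']['is_end']:
            break

This way the crawler fetches every answer to the question and exits cleanly, rather than guessing the total in advance.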