Preface
A few days ago I learned how to scrape content that never appears in the page source because it is loaded via JSON, and Zhihu is exactly that kind of site. Since I have been short of good cover images for the blog lately, this seemed like a way to solve that problem once and for all. While reading up on it, I found two ways to fetch the images.
Method 1
```python
import requests
from bs4 import BeautifulSoup
import re
import os


def Askurl(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'}
    response = requests.get(url=url, headers=headers)
    return response


piclink = []


def Getpic(html):
    # Zhihu keeps the full-size image URL in the data-original attribute
    # inside each <noscript> fallback tag.
    Findurl = re.compile('<noscript>.*?data-original="(.*?)".*?</noscript>', re.S)
    soup = BeautifulSoup(html, "html.parser")
    items = soup.select('figure')
    for index in items:
        try:
            index = str(index)
            link = re.findall(Findurl, index)[0]
            piclink.append(link)
        except IndexError:
            print("A figure without an image link was skipped")


def download(path):
    for i in range(len(piclink)):
        data = Askurl(piclink[i]).content
        if not os.path.exists(path):
            os.makedirs(path)
        with open(path + "image" + str(i) + ".jpg", "wb+") as f:
            f.write(data)
        print(str(i + 1) + "/" + str(len(piclink)))
    print("Images downloaded")


def main():
    url = "https://zhuanlan.zhihu.com/p/100765620"
    path = "Images/"
    Getpic(Askurl(url).text)
    download(path)


if __name__ == "__main__":
    main()
```
This method fetches the entire page source directly with requests and then pulls the image links out of the `<noscript>` fallbacks in the HTML.
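Zhihu changes its markup from time to time, so if the regex stops matching, it helps to look at what is actually inside the `<noscript>` fallbacks first. A minimal sketch (reusing the example article URL from above) that dumps the first few fallback tags for inspection:

```python
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
html = requests.get("https://zhuanlan.zhihu.com/p/100765620", headers=headers).text

soup = BeautifulSoup(html, "html.parser")
# Print the first three <noscript> fallbacks so the attribute layout
# (src, data-original, data-actualsrc, ...) can be checked by eye.
for tag in soup.find_all("noscript")[:3]:
    print(tag)
```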
Method 2
```python
from bs4 import BeautifulSoup
import requests
import json
import os


def fetchUrl(url):
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
    }
    r = requests.get(url, headers=header)
    r.encoding = "utf-8"
    return r


def parseJson(jsonStr):
    # Each element of "data" is one answer; the answer body is HTML
    # stored in its "content" field. Yield each answer so the caller
    # can count how many have been processed.
    jsonObj = json.loads(jsonStr)
    for item in jsonObj['data']:
        name = item["author"]["name"]
        print("Scraping the answer by", name)
        Download(item["content"])
        yield item


def Download(html):
    bsObj = BeautifulSoup(html, "html.parser")
    # As in Method 1, the full-size image URL lives in the <noscript> fallback.
    images = bsObj.find_all("noscript")
    if len(images) == 0:
        print("No images in this answer")
    else:
        print("This answer contains", len(images), "images, downloading...")
        for item in images:
            link = item.img['data-original']
            print(link)
            downloadImage(link, "Images/")
        print("Images downloaded")
    return bsObj.text


def downloadImage(url, path):
    data = fetchUrl(url).content
    # Drop the query string and keep the server-side file name.
    filename = url.split("?")[0].split("/")[-1]
    if not os.path.exists(path):
        os.makedirs(path)
    with open(path + filename, "wb+") as f:
        f.write(data)


if __name__ == "__main__":
    qid = 316039999
    offset = 0
    totalNum = 50  # assumes the question has at least this many answers
    while offset < totalNum:
        url = "https://www.zhihu.com/api/v4/questions/{0}/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit=5&offset={1}&platform=desktop&sort_by=default".format(qid, offset)
        html = fetchUrl(url).text
        for data in parseJson(html):
            offset += 1
            print("Scraped", offset, "of", totalNum, "answers")
        print("---" * 20)
```
This method fetches the content that the page loads via JSON: answers are rendered dynamically, so the code calls the v4 answers API directly and parses the `content` field of each answer. The long `include` parameter is just a URL-encoded list of the answer fields the API should return; `content` is the one that matters here.
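One rough edge: the hard-coded `totalNum = 50` loops forever if the question has fewer answers. The API response also carries a `paging` object next to `data`; assuming it exposes the usual `is_end` flag (an assumption about the current v4 endpoint, and the `include` parameter below is trimmed to just `content` for brevity, also an assumption), the loop can stop by itself. A sketch reusing `fetchUrl` and `Download` from Method 2:

```python
import json

def crawlAnswers(qid, limit=5):
    # Page through the answers API until the server reports the end.
    # Assumes the response JSON carries a paging.is_end flag next to
    # "data"; this is an assumption about the current v4 endpoint.
    offset = 0
    while True:
        url = ("https://www.zhihu.com/api/v4/questions/{0}/answers"
               "?include=data%5B%2A%5D.content"
               "&limit={1}&offset={2}&platform=desktop&sort_by=default"
               ).format(qid, limit, offset)
        jsonObj = json.loads(fetchUrl(url).text)
        for item in jsonObj["data"]:
            Download(item["content"])
        if jsonObj["paging"]["is_end"]:  # no more answers to fetch
            break
        offset += limit

crawlAnswers(316039999)
```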