1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
| import requests from bs4 import BeautifulSoup import pandas as pd import os import time import json
def fetchUrl(url, kw, page, timeout=10):
    """POST one search query to *url* and return the decoded JSON reply.

    Args:
        url: Address of the search endpoint.
        kw: Keyword sent as the ``key`` field of the request body.
        page: Page number sent as the ``page`` field of the request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as decoded by ``Response.json()``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    # NOTE(review): the User-Agent and Cookie values are reproduced verbatim;
    # their unusual spacing and the hard-coded session cookie should be
    # confirmed against what the server actually accepts.
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Content-Type": "application/json;charset=UTF-8",
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 92.0.4515.107Safari / 537.36Edg / 92.0.902.55",
        "Referer": "http://search.people.cn/",
        "Cookie": "__jsluid_h = 42549df2735923361d7d5862a7faf64f;sso_c = 0;sfr = 1",
    }

    # Request body; only "key" and "page" vary between calls.
    payloads = {
        "endTime": 0,
        "hasContent": True,
        "hasTitle": True,
        "isFuzzy": True,
        "key": kw,
        "limit": 10,
        "page": page,
        "sortType": 2,
        "startTime": 0,
        "type": 0,
    }

    # The body is sent as a JSON string to match the Content-Type header.
    r = requests.post(
        url,
        headers=headers,
        data=json.dumps(payloads),
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    r.raise_for_status()
    return r.json()
def parseJson(jsonObj):
    """Yield one CSV-ready row for each search record in *jsonObj*.

    Args:
        jsonObj: Decoded search reply; records are read from
            ``jsonObj["data"]["records"]``. A missing or null ``data`` /
            ``records`` entry is treated as "no records".

    Yields:
        A single-row nested list
        ``[[id, title, displayTime, belongsName, content, url]]``. The nested
        shape lets each yielded value be turned into a one-row table as is.
    """
    # Tolerate replies without a record list instead of raising an opaque
    # TypeError/KeyError.
    records = (jsonObj.get("data") or {}).get("records") or []

    for item in records:
        pid = item["id"]
        belongsName = item["belongsName"]
        # Title and content are passed through an HTML parser and reduced
        # to their text.
        content = BeautifulSoup(item["content"], "html.parser").text
        # displayTime is divided by 1000 before conversion (presumably epoch
        # milliseconds -- confirm) and formatted in the machine's local time.
        displayTime = time.strftime(
            "%Y-%m-%d %H:%M:%S",
            time.localtime(item["displayTime"] / 1000),
        )
        title = BeautifulSoup(item["title"], "html.parser").text
        url = item["url"]
        yield [[pid, title, displayTime, belongsName, content, url]]
def saveFile(path, filename, data):
    """Append *data* as rows to ``<path>/<filename>.csv``.

    Args:
        path: Directory that holds the CSV file; created if it does not
            exist. A trailing separator is optional. An empty string means
            the current working directory.
        filename: File name without the ``.csv`` extension.
        data: Row data accepted by ``pandas.DataFrame`` (e.g. a list of
            lists, one inner list per row).
    """
    if path:
        os.makedirs(path, exist_ok=True)

    # Join instead of concatenating so the file lands inside *path* even when
    # the caller omits the trailing separator.
    target = os.path.join(path, filename + ".csv")

    dataframe = pd.DataFrame(data)
    # Append without index or header; utf_8_sig is kept as the file encoding.
    dataframe.to_csv(
        target,
        encoding="utf_8_sig",
        mode="a",
        index=False,
        sep=",",
        header=False,
    )
def main():
    """Crawl search result pages for one keyword and append them to a CSV file.

    Pages ``start`` through ``end`` (inclusive) are fetched one after another.
    Every record is appended to ``./data/<keyword>.csv``.
    """
    start = 1
    end = 3
    kw = "春节"

    out_dir = "./data/"
    # Loop-invariant, so defined once rather than on every page.
    url = "http://search.people.cn/search-platform/front/search"
    csv_path = os.path.join(out_dir, kw + ".csv")

    headline = [["文章id", "标题", "发表时间", "版面", "摘要", "链接"]]
    # The file is opened in append mode, so write the header only when the
    # file is new or empty; otherwise each run would add another header row.
    if not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0:
        saveFile(out_dir, kw, headline)

    for page in range(start, end + 1):
        html = fetchUrl(url, kw, page)
        for data in parseJson(html):
            saveFile(out_dir, kw, data)
        print("第{}页爬取完成".format(page))

    print("爬虫执行完毕!数据已保存至以下路径中,请查看!")
    # Show the real output location rather than cwd plus a hard-coded
    # Windows-style "\data" suffix.
    print(os.path.abspath(csv_path))


if __name__ == "__main__":
    main()
|