Preface

This was a fun little exercise: scrape the news ranking from People's Daily Online (人民网), then write it into a Markdown file formatted with the blog's front matter and layout, so the result can be published directly as a blog post. Nothing about it is difficult; it was just practice.

Straight to the source code:

# -*- coding: utf-8 -*-
# @time: 2021/12/31 12:44
import re

import requests
from bs4 import BeautifulSoup

def Askurl(url):
    # Fetch the page; the ranking page is served as GBK, so decode the raw bytes explicitly.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.55"
    }
    req = requests.get(url=url, headers=head)
    response = req.content.decode("gbk")
    return response

datalist = []  # collected [title, link] pairs

def Gtedata(url):
    Findtitle = re.compile(r'.html">(.*?)</a>')
    Findlink = re.compile(r'<a class="anavy" href="(.*?)">')

    html = Askurl(url)
    # print(html)
    soup = BeautifulSoup(html, "html.parser")
    # The ranking list sits in the <table width="712"> element.
    index = soup.find_all('table', width="712")
    fg = str(index[0]).split('</tr>')
    fg = fg[1:]             # drop the chunk before the first row
    fg = fg[:len(fg) - 1]   # drop the trailing chunk after the last row; both are useless markup
    for item in fg:
        # print(item)
        data = []
        item = str(item)
        if re.findall(Findtitle, item):
            title = re.findall(Findtitle, item)[0]
            # Replace non-breaking spaces and '|' so titles do not break the Markdown table.
            title = str(title).replace('\xa0', ' ').replace('|', '--')
            data.append(title)

            slink = re.findall(Findlink, item)[0]
            link = str(slink)
            data.append(link)

            datalist.append(data)

    # Debug output:
    # for index in datalist:
    #     print(index)

def wirtexx():
    tit = []
    link = []
    for item in datalist:
        tit.append(item[0])
        link.append(item[1])
    # Build one Markdown table row per article.
    zh = []
    for i in range(len(tit)):
        a = '| ' + str(tit[i]) + ' | ' + str(link[i]) + ' |'
        zh.append(a)

    # Write the blog's front matter, then the table.
    f = open("rmw.md", "a", encoding='utf-8')
    f.write('''---
title: 新闻
date: 2022-01-05 20:21:09
tags:
- 新闻
- python
categories: python
cover: https://cdn.jsdelivr.net/gh/ELIXsion/pictureck@master/pictureck/炮姐.28izrer0hq4g.webp
top_img: https://cdn.jsdelivr.net/gh/ELIXsion/pictureck@master/pictureck/炮姐.28izrer0hq4g.webp

---

''')
    f.write('| 标题 | 链接 |\n' + '| ---- | ---- |\n')
    for item in zh:
        f.write(item + '\n')
    f.close()

    # After appending, open the file again and print its contents as a check.
    f = open("rmw.md", "r", encoding='utf-8')
    print(f.read())
    f.close()

def main():
    baseurl = "http://news.people.com.cn/GB/28053/index.html"
    Gtedata(baseurl)
    wirtexx()

if __name__ == "__main__":
    main()
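
The regex-on-split-HTML parsing above works, but the same extraction can be done with BeautifulSoup selectors directly. Below is a minimal sketch, assuming the ranking still sits in the `<table width="712">` element and each entry is an `<a class="anavy">` link (both taken from the script above, not re-verified against the live page); the helper name get_ranking is only for illustration.

# A sketch of the same extraction with BeautifulSoup selectors instead of regex.
# The width="712" table and the "anavy" link class are assumptions carried over
# from the script above; adjust them if the page markup changes.
import requests
from bs4 import BeautifulSoup

def get_ranking(url):
    head = {"User-Agent": "Mozilla/5.0"}
    html = requests.get(url, headers=head).content.decode("gbk", errors="replace")
    soup = BeautifulSoup(html, "html.parser")
    rows = []
    table = soup.find("table", width="712")
    if table is None:
        return rows
    for a in table.find_all("a", class_="anavy"):
        # Clean the title the same way as above so it stays table-safe.
        title = a.get_text(strip=True).replace("\xa0", " ").replace("|", "--")
        rows.append([title, a.get("href", "")])
    return rows

# Example usage:
# for title, link in get_ranking("http://news.people.com.cn/GB/28053/index.html"):
#     print(f"| {title} | {link} |")

Selecting the anchors directly keeps each title paired with its link and avoids splitting raw HTML on `</tr>`, so small changes in the row markup are less likely to break the scrape.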