Preface

Just a bit of fun: the script scrapes the news ranking from People's Daily Online (人民网), writes it into an .md file, and formats it in my blog's style so it can be published directly as a post. Nothing difficult overall; treat it as practice.

Straight to the source code:
```python
import re

import requests
from bs4 import BeautifulSoup

# Rows scraped from the ranking page: [title, link] pairs.
datalist = []


def Askurl(url):
    """Fetch the page and decode it; the site serves GBK-encoded HTML."""
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.55"
    }
    req = requests.get(url=url, headers=head)
    response = req.content.decode("gbk")
    return response


def Gtedata(url):
    """Parse the ranking table and collect [title, link] pairs into datalist."""
    Findtitle = re.compile(r'.html">(.*?)</a>')               # title text inside the link
    Findlink = re.compile(r'<a class="anavy" href="(.*?)">')  # href of the ranking link
    html = Askurl(url)
    soup = BeautifulSoup(html, "html.parser")
    index = soup.find_all('table', width="712")               # the ranking table on the page
    fg = str(index[0]).split('</tr>')                         # split the table into rows
    fg = fg[1:-1]                                             # drop the header row and the trailing fragment
    for item in fg:
        data = []
        item = str(item)
        if re.findall(Findtitle, item):
            title = re.findall(Findtitle, item)[0]
            # Replace non-breaking spaces and '|' so the title is safe inside a Markdown table cell.
            title = str(title).replace('\xa0', ' ').replace('|', '--')
            data.append(title)

            slink = re.findall(Findlink, item)[0]
            link = str(slink)
            data.append(link)

            datalist.append(data)


def wirtexx():
    """Write the scraped rows into rmw.md with Hexo-style front matter and a Markdown table."""
    tit = []
    link = []
    for item in datalist:
        tit.append(item[0])
        link.append(item[1])
    zh = []
    for i in range(0, len(tit) - 1):   # note: the last scraped row is skipped
        zh.append('| ' + str(tit[i]) + ' | ' + str(link[i]) + ' |')

    f = open("rmw.md", "a", encoding='utf-8')
    # Front matter for the blog post (title/date/tags/categories/cover images).
    f.write('''---
title: 新闻
date: 2022-01-05 20:21:09
tags:
  - 新闻
  - python
categories: python
cover: https://cdn.jsdelivr.net/gh/ELIXsion/pictureck@master/pictureck/炮姐.28izrer0hq4g.webp
top_img: https://cdn.jsdelivr.net/gh/ELIXsion/pictureck@master/pictureck/炮姐.28izrer0hq4g.webp
---

''')
    # Table header: 标题 = title, 链接 = link.
    f.write('| 标题 | 链接 |\n| ---- | ---- |\n')
    for item in zh:
        f.write(item + '\n')
    f.close()

    # Print the generated file so the result can be checked at a glance.
    f = open("rmw.md", "r", encoding='utf-8')
    print(f.read())
    f.close()


def main():
    baseurl = "http://news.people.com.cn/GB/28053/index.html"
    Gtedata(baseurl)
    wirtexx()


if __name__ == "__main__":
    main()
```
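For reference, a successful run should leave an `rmw.md` that looks roughly like this (the two ranking rows below are placeholders, not real scrape results):

```
---
title: 新闻
date: 2022-01-05 20:21:09
...(rest of the front matter as written by the script)...
---

| 标题 | 链接 |
| ---- | ---- |
| <headline 1> | <link 1> |
| <headline 2> | <link 2> |
```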
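If the regex-and-split parsing ever breaks, the same rows can be pulled with BeautifulSoup alone. This is a minimal sketch rather than part of the original script: it assumes the same page structure the regexes above rely on (a width-712 table whose ranking links carry `class="anavy"`), and the `get_news_rows` name is mine.

```python
import requests
from bs4 import BeautifulSoup


def get_news_rows(url):
    """Sketch: collect (title, link) pairs via CSS selectors instead of regex/split."""
    head = {"User-Agent": "Mozilla/5.0"}
    html = requests.get(url, headers=head).content.decode("gbk")
    soup = BeautifulSoup(html, "html.parser")
    rows = []
    # Assumes the ranking links sit inside the width-712 table and use class="anavy".
    for a in soup.select('table[width="712"] a.anavy'):
        title = a.get_text(strip=True).replace('\xa0', ' ').replace('|', '--')
        rows.append((title, a.get('href', '')))
    return rows


if __name__ == "__main__":
    for title, link in get_news_rows("http://news.people.com.cn/GB/28053/index.html"):
        print('| ' + title + ' | ' + link + ' |')
```

The output lines here use the same `| title | link |` shape as the table rows the main script writes, so either approach can feed `wirtexx()`.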