Preface

I dabbled in Python a while back and was impressed by how concise and readable the language is. I also wrote a few small crawler programs, but after a year away I have forgotten almost all of it, so I want to use this blog to record my second pass through the material and my own understanding of it, both to reinforce what I learn and to have something to look back on. Since I am just getting back into it there will probably be plenty of mistakes, but isn't programming all about improving one bug at a time?

Practice project (1): NGA game rankings

Following the Douban scraper I wrote before, I first built a program to scrape 51job, but after a few runs it stopped returning any data, most likely because it was flagged by their anti-scraping measures. I don't have the bandwidth to dig into that right now, so I switched to scraping NGA instead.

The scraping workflow is roughly: fetch the page, parse it, extract the information, and save it.
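In code, the whole flow boils down to a few calls (a minimal sketch; Askurl, Gtedata and Savedata are the functions defined below, and http://game.nga.cn/all is the ranking page used in the full code):

def run():
    baseurl = "http://game.nga.cn/all"    # NGA game ranking page
    Gtedata(baseurl)                      # fetch and parse each page, extract rows into datalist
    Savedata("NGAgame.xls")               # write the collected rows to an Excel file

run()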

Fetch and parse:

def Askurl(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.55"
    }
    req = requests.get(url=url, headers=head)
    response = req.content.decode("gbk")  # NGA pages are GBK-encoded, so decode manually
    return response

The main idea here is to imitate a normal browser visit by sending browser headers along with the request and receiving the page source in return; response holds that source.
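A quick way to check that the request works is to call the function directly and peek at the returned HTML (a minimal sketch; the URL is the NGA ranking page used later in this post):

html = Askurl("http://game.nga.cn/all")
print(len(html))     # length of the returned source
print(html[:200])    # first 200 characters of the page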

Extract the information

datalist = []  # global list that collects one row per game

def Gtedata(baseurl):
    # regular expressions that define what to extract from the page source
    Findimg = re.compile(r'style="background-image: url(.*?);"')
    Findtitle = re.compile(r'<h5><a href=.* target="_blank">(.*?)</a></h5>')
    Findnum = re.compile(r'<div class="num">(.*?)</div>')
    Findtime = re.compile(r'<div class="time">(.*?)<font>')

    for i in range(1, 10):
        if i == 1:
            url = baseurl
        else:
            url = baseurl + '?page=' + str(i)  # build the URL for every page of the ranking
        html = Askurl(url)
        # print(html)
        soup = BeautifulSoup(html, "html.parser")

        for item in soup.find_all('div', class_="blockItem"):
            data = []
            item = str(item)

            title = re.findall(Findtitle, item)[0]
            data.append(title)

            num = re.findall(Findnum, item)[0]
            data.append(num)

            time = re.findall(Findtime, item)[0]
            data.append(time)

            img = re.findall(Findimg, item)[0]
            img = img.strip('(').strip(')')  # drop the surrounding parentheses captured by the regex
            data.append(img)

            datalist.append(data)
    datalist.sort(key=lambda x: x[1], reverse=True)
    # key=lambda x: x[1]: sort by the element at index 1 (the score); without it the sort uses index 0
    # reverse=True: descending order; the default is ascending

    # quick output check
    # for index in datalist:
    #     print(index)

This step looks complicated, but it is really just using regular expressions to pull the information out of the page source we fetched; the (.*?) group marks the target information.
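As a tiny illustration of how the non-greedy capture group works (the HTML snippet below is made up for demonstration, not taken from NGA):

import re

snippet = '<div class="num">9.2</div><div class="num">8.7</div>'
Findnum = re.compile(r'<div class="num">(.*?)</div>')
print(re.findall(Findnum, snippet))   # ['9.2', '8.7'] -- (.*?) stops at the first </div>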

After extraction, each piece of information is appended to the list data, and each data row is then collected into datalist.
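One caveat: the score captured by Findnum is a string, so the sort above compares text, and string comparison can order numbers incorrectly (for example "10" sorts before "9"). A safer variant, if that ever matters, is to convert the key to a number:

# Assumes each row keeps its score at index 1 as a numeric string such as "9.2".
datalist.sort(key=lambda x: float(x[1]), reverse=True)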

Save

def Savedata(path):
    work = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = work.add_sheet("NGAgame", cell_overwrite_ok=True)
    col = ('名称', '评分', '发售日期', '封面图片')  # column headers: title, score, release date, cover image
    for i in range(0, 4):
        sheet.write(0, i, col[i])  # the first two arguments of write are the row and column indices
    for j in range(0, len(datalist)):
        data = datalist[j]
        for k in range(0, 4):
            sheet.write(j + 1, k, data[k])
    work.save(path)
    print("save completed")

This step uses xlwt to write the data into an Excel spreadsheet.
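Usage is just a call with the output path once datalist has been filled (note that xlwt only produces the legacy .xls format; writing .xlsx would need a library such as openpyxl, which is beyond this post):

Gtedata("http://game.nga.cn/all")
Savedata("NGAgame.xls")   # keep the .xls extension, since xlwt writes the old Excel format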

In the same way, the data can also be saved to a database. The code below is the SQLite version from my earlier Douban scraper, so the movie table has eight columns:

def init_database(path):
    conn = sqlite3.connect(path)
    c = conn.cursor()
    sql = '''
        create table movie(
            id integer primary key autoincrement,
            mlink text,
            ilink text,
            cname varchar,
            ename varchar,
            ins text,
            score numeric,
            number numeric,
            info text
        )
    '''
    c.execute(sql)
    conn.commit()
    conn.close()

def Savedate2(path):
    init_database(path)
    conn = sqlite3.connect(path)
    cur = conn.cursor()
    for item in datalist:
        for data in range(len(item)):
            if data != 5 and data != 6:  # indices 5 and 6 (score, vote count) are numeric and stay unquoted
                item[data] = '"' + item[data] + '"'  # quote every text field so the SQL literal is valid
        sql = '''
            insert into movie(
                mlink,ilink,cname,ename,ins,score,number,info
            )
            values(%s)
        ''' % ",".join(item)  # join the fields of item with ',' into one string, matching the form VALUES(...) expects
        cur.execute(sql)
    conn.commit()
    conn.close()
    print("OVER")

Result

Full code

# -*- coding: utf-8 -*-
# @time: 2021/12/31 12:44

from bs4 import BeautifulSoup
import re
import urllib.parse, urllib.request, urllib.error
import requests
import xlwt
import sqlite3  # for SQLite database operations

def Askurl(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.55"
    }
    req = requests.get(url=url, headers=head)
    response = req.content.decode("gbk")  # NGA pages are GBK-encoded
    return response

datalist = []

def Gtedata(baseurl):
    Findimg = re.compile(r'style="background-image: url(.*?);"')
    Findtitle = re.compile(r'<h5><a href=.* target="_blank">(.*?)</a></h5>')
    Findnum = re.compile(r'<div class="num">(.*?)</div>')
    Findtime = re.compile(r'<div class="time">(.*?)<font>')

    for i in range(1, 10):
        if i == 1:
            url = baseurl
        else:
            url = baseurl + '?page=' + str(i)
        html = Askurl(url)
        # print(html)
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="blockItem"):
            data = []
            item = str(item)

            title = re.findall(Findtitle, item)[0]
            data.append(title)

            num = re.findall(Findnum, item)[0]
            data.append(num)

            time = re.findall(Findtime, item)[0]
            data.append(time)

            img = re.findall(Findimg, item)[0]
            img = img.strip('(').strip(')')
            data.append(img)

            datalist.append(data)
    datalist.sort(key=lambda x: x[1], reverse=True)
    # key=lambda x: x[1]: sort by the element at index 1; without it the sort uses index 0
    # reverse=True: descending order; the default is ascending

    # quick output check
    # for index in datalist:
    #     print(index)

def Savedata(path):
    work = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = work.add_sheet("NGAgame", cell_overwrite_ok=True)
    col = ('名称', '评分', '发售日期', '封面图片')
    for i in range(0, 4):
        sheet.write(0, i, col[i])  # the first two arguments of write are the row and column indices
    for j in range(0, len(datalist)):
        data = datalist[j]
        for k in range(0, 4):
            sheet.write(j + 1, k, data[k])
    work.save(path)
    print("save completed")

def main():
    baseurl = "http://game.nga.cn/all"
    savepath = "NGAgame.xls"
    Gtedata(baseurl)
    Savedata(savepath)

if __name__ == "__main__":
    main()
# -*- coding: utf-8 -*-
# @time: 2021/8/7 21:31

from bs4 import BeautifulSoup
import re
import urllib.parse, urllib.request, urllib.error
import requests
import xlwt
import sqlite3  # for SQLite database operations

def Askurl(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.55"
    }
    res = requests.get(url=url, headers=head)
    response = res.content.decode("utf-8")  # Douban pages are UTF-8 encoded
    return response

datalist = []

def Getdate(url):
    # regular expressions that define the extraction rules
    Findlink = re.compile(r'<a href="(.*?)">')
    Findimg = re.compile(r'<img.*src="(.*?)"', re.S)
    Findtitle = re.compile(r'<span class="title">(.*)</span>')
    Findbd = re.compile(r'<p class="">(.*?)</p>', re.S)
    Findrating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
    Findnum = re.compile(r'<span>(\d*)人评价</span>')
    Findinq = re.compile(r'<span class="inq">(.*)</span>', re.S)

    for i in range(0, 10):
        html = Askurl(url + str(i * 25))
        soup = BeautifulSoup(html, "html.parser")  # first argument is the content to parse, second the parser
        for item in soup.find_all('div', class_="item"):  # div tags whose class attribute is "item"
            data = []
            item = str(item)

            link = re.findall(Findlink, item)[0]
            data.append(link)

            img = re.findall(Findimg, item)[0]
            data.append(img)

            titles = re.findall(Findtitle, item)
            if len(titles) == 2:
                data.append(titles[0])
                etitle = titles[1].replace('/', ' ')
                etitle = re.sub(r'\s', ' ', etitle)
                data.append(etitle)
            else:
                data.append(titles[0])
                data.append(" ")

            bd = re.findall(Findbd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", str(bd))
            bd = re.sub(r'\s', ' ', str(bd))
            data.append(bd.strip())  # strip() removes leading/trailing whitespace (or the given characters)

            rating = re.findall(Findrating, item)[0]
            data.append(rating)

            num = re.findall(Findnum, item)[0]
            data.append(num)

            inq = re.findall(Findinq, item)
            if len(inq) != 0:
                inq = inq[0].replace('/', " ")
                data.append(inq)
            else:
                data.append(" ")
            datalist.append(data)
    # for index in datalist:
    #     print(index)

def Savedate(path):
    work = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = work.add_sheet("豆瓣250", cell_overwrite_ok=True)
    # column headers: movie link, image link, Chinese title, English title, details, score, number of ratings, synopsis
    col = ('电影链接', '图片链接', '中文名', '英文名', '详情', '评分', '评价人数', '简介')
    for i in range(0, 8):
        sheet.write(0, i, col[i])  # the first two arguments of write are the row and column indices
    for j in range(0, 250):
        print("%d/250" % (j + 1))
        data = datalist[j]
        for k in range(0, 8):
            sheet.write(j + 1, k, data[k])
    work.save(path)
    print("OK")

def Savedate2(path):
    init_database(path)
    conn = sqlite3.connect(path)
    cur = conn.cursor()
    for item in datalist:
        for data in range(len(item)):
            if data != 5 and data != 6:  # indices 5 and 6 (score, vote count) are numeric and stay unquoted
                item[data] = '"' + item[data] + '"'
        sql = '''
            insert into movie(
                mlink,ilink,cname,ename,ins,score,number,info
            )
            values(%s)
        ''' % ",".join(item)
        cur.execute(sql)
    conn.commit()
    conn.close()
    print("OVER")

def init_database(path):
    conn = sqlite3.connect(path)
    c = conn.cursor()
    sql = '''
        create table movie(
            id integer primary key autoincrement,
            mlink text,
            ilink text,
            cname varchar,
            ename varchar,
            ins text,
            score numeric,
            number numeric,
            info text
        )
    '''
    c.execute(sql)
    conn.commit()
    conn.close()

def main():
    baseurl = "https://movie.douban.com/top250?start="
    savepath = "结果.xls"
    datapath = "movie.db"
    Getdate(baseurl)
    Savedate(savepath)
    Savedate2(datapath)

if __name__ == "__main__":
    main()