1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138
|
from bs4 import BeautifulSoup import re import urllib.parse,urllib.request,urllib.error import xlwt import sqlite3
def Askurl(url):
    """Fetch *url* and return the decoded HTML body as a string.

    Bug fixed: the original called ``requests.get`` although ``requests`` was
    never imported (NameError on first call).  This version uses the
    ``urllib.request`` module the file already imports.

    :param url: fully-qualified URL of one Douban list page.
    :return: page body decoded to ``str``.
    :raises urllib.error.URLError: on network/HTTP failure.
    """
    head = {
        # Desktop-browser UA so Douban serves the normal page.  The original
        # string contained stray spaces ("Mozilla / 5.0(...)") that made it an
        # invalid UA token sequence.
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.55"),
    }
    request = urllib.request.Request(url, headers=head)
    with urllib.request.urlopen(request) as response:
        body = response.read()
    # NOTE(review): Douban pages declare UTF-8; the original decoded as gbk,
    # which garbles UTF-8 content.  errors="replace" keeps one bad byte from
    # aborting the whole crawl — confirm against a live page.
    return body.decode("utf-8", errors="replace")
# Module-level accumulator: one 8-field record (strings) per movie.
# Filled by Getdate, read by Savedate/Savedate2.
datalist=[]
def Getdate(url, pages=10):
    """Scrape the Douban Top-250 list and append one record per movie to the
    module-level ``datalist``.

    Each record is a list of 8 strings:
    [detail link, poster link, Chinese title, English title,
     staff/year details, rating, vote count, one-line blurb].

    :param url: base list URL ending in ``?start=``; the page offset is appended.
    :param pages: number of 25-film pages to fetch (default 10 → 250 films);
        added as a backward-compatible parameter instead of the hard-coded 10.
    """
    # Extraction patterns, compiled once.  Raw strings fix the invalid-escape
    # warnings the original non-raw patterns (e.g. '\s') produced.
    find_link = re.compile(r'<a href="(.*?)">')
    find_img = re.compile(r'<img.*src="(.*?)"', re.S)
    find_title = re.compile(r'<span class="title">(.*)</span>')
    find_bd = re.compile(r'<p class="">(.*?)</p>', re.S)
    find_rating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
    find_num = re.compile(r'<span>(\d*)人评价</span>')
    find_inq = re.compile(r'<span class="inq">(.*)</span>', re.S)

    for page in range(pages):
        html = Askurl(url + str(page * 25))
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            data.append(find_link.findall(item)[0])    # detail-page link
            data.append(find_img.findall(item)[0])     # poster image link

            titles = find_title.findall(item)
            if len(titles) == 2:
                data.append(titles[0])                 # Chinese title
                etitle = titles[1].replace('/', ' ')
                data.append(re.sub(r'\s', ' ', etitle))  # English title
            else:
                data.append(titles[0])
                data.append(" ")                       # no English title

            bd = find_bd.findall(item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", str(bd))
            bd = re.sub(r'\s', ' ', str(bd))
            data.append(bd.strip())                    # staff / year details

            data.append(find_rating.findall(item)[0])  # rating
            data.append(find_num.findall(item)[0])     # vote count

            inq = find_inq.findall(item)
            # Some films have no blurb; keep the record width constant.
            data.append(inq[0].replace('/', " ") if inq else " ")

            datalist.append(data)
def Savedate(path):
    """Write every record in ``datalist`` to an .xls workbook at *path*.

    Bug fixed: the original iterated a hard-coded ``range(0, 250)`` and raised
    IndexError whenever the crawl produced fewer than 250 records; this
    version iterates whatever ``datalist`` actually holds.

    :param path: destination .xls file path.
    """
    work = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = work.add_sheet("豆瓣250", cell_overwrite_ok=True)
    col = ('电影链接', '图片链接', '中文名', '英文名', '详情', '评分', '评价人数', '简介')
    for i, header in enumerate(col):
        sheet.write(0, i, header)               # header row
    total = len(datalist)
    for j, data in enumerate(datalist):
        print("%d/%d" % (j + 1, total))         # simple progress indicator
        for k in range(len(col)):
            sheet.write(j + 1, k, data[k])      # +1: row 0 is the header
    work.save(path)
    print("OK")
def Savedate2(path):
    """Insert every record in ``datalist`` into the ``movie`` table at *path*.

    Bugs fixed: the original guarded its quoting with ``data!=4 or data!=5``,
    which is always true, and built the INSERT by string concatenation — any
    quote character inside a scraped field broke the statement, and the loop
    mutated ``datalist`` in place.  This version uses a parameterized
    ``executemany``, which escapes values correctly and leaves the data intact.

    :param path: SQLite database file path (created/initialised if needed).
    """
    init_database(path)
    conn = sqlite3.connect(path)
    try:
        sql = ("insert into movie"
               "(mlink, ilink, cname, ename, ins, score, number, info)"
               " values(?, ?, ?, ?, ?, ?, ?, ?)")
        conn.executemany(sql, datalist)
        conn.commit()
    finally:
        conn.close()
    print("OVER")
def init_database(path):
    """Create the ``movie`` table in the SQLite database at *path*.

    Bug fixed: the original used a plain CREATE TABLE, so every run after the
    first raised ``sqlite3.OperationalError: table movie already exists``.
    ``IF NOT EXISTS`` makes initialisation idempotent.  The connection is
    closed even if execution fails.

    :param path: SQLite database file path.
    """
    conn = sqlite3.connect(path)
    try:
        conn.execute('''
            create table if not exists movie(
                id integer primary key autoincrement,
                mlink text,
                ilink text,
                cname varchar,
                ename varchar,
                ins text,
                score numeric,
                number numeric,
                info text
            )
        ''')
        conn.commit()
    finally:
        conn.close()
def main():
    """Crawl the Douban Top-250 list, then persist it as .xls and SQLite."""
    Getdate("https://movie.douban.com/top250?start=")
    Savedate("结果.xls")
    Savedate2("movie.db")
# Run the crawl only when executed as a script, not on import.
if __name__ == "__main__": main()
|