Preface

This was part of a big-data practical training course: crawl the data, clean it, analyze it, and finally present it as a web page. Most of the overall work feels like front-end; this part covers the crawling, done in Python with the Scrapy framework. It is quite different from the crawlers I wrote before and feels far more modular, but for a beginner (me) it is not exactly friendly. So far I only have a basic grasp of the framework and have used it for simple crawls, so the only way forward is to dig in step by step.

Project Structure

spiders

The spider code itself. It handles resolving and following links and extracting the data (regex parsing and so on), and is very similar to the stand-alone crawler scripts I wrote before; the most obvious difference is that it does not output results directly but yields the data downstream for further processing.

movie

Parses the main listing pages and collects summary information for each movie.

import re
import json

import scrapy

from tttest.items import TttestItem, MovieItem


class MovieSpider(scrapy.Spider):
    name = 'movie'
    allowed_domains = ['movie.douban.com']
    start_urls = ['http://movie.douban.com/']

    def parse(self, response):
        # Page through Douban's JSON search API, 20 movies per request.
        for page in range(122, 123):
            print(page)
            url = ('https://movie.douban.com/j/new_search_subjects'
                   '?sort=U&range=0,10&tags=%E7%94%B5%E5%BD%B1&start=' + str(page * 20))
            yield scrapy.Request(url=url, callback=self.parse_list)

    def parse_list(self, response):
        # The endpoint returns JSON, so no HTML parsing is needed here.
        info = json.loads(response.text)

        for movie in info["data"]:
            item = TttestItem()
            item["db_id"] = movie["id"]
            item["title"] = movie["title"]
            item["score"] = movie["rate"]
            item["directedBy"] = movie['directors'][0]

            yield item
            # Follow the detail page for the remaining fields.
            yield scrapy.Request(url=movie["url"], callback=self.parse_info)

    def parse_info(self, response):
        info = MovieItem()
        info['directedBy'] = response.xpath('//*[@rel="v:directedBy"]/text()').extract_first()
        infotext = "".join(response.xpath('//div[@id="info"]').extract())
        info_title = response.xpath('//div[@id="info"]/span').extract()
        info['db_id'] = response.url.split('/')[-2]
        # Default values, in case a field is missing on this particular page
        # (otherwise the assignments below would hit an unbound variable).
        lang = kind = country = actor = date = time = writer = ''
        for i in range(0, len(info_title)):
            if "语言" in info_title[i]:
                lang = "".join(re.findall(r'<span class="pl">语言:</span>(.*?)<br>', infotext, re.S)).strip()
            if "类型" in info_title[i]:
                kind = re.findall(r'<span class="pl">类型:</span>(.*?)<br>', infotext, re.S)[0].replace('<span property="v:genre">', '').replace('</span>', '')
            if "制片国家" in info_title[i]:
                country = "".join(re.findall(r'<span class="pl">制片国家/地区:</span>(.*?)<br>', infotext, re.S)).strip()
            if "主演" in info_title[i]:
                actor = "/".join(re.findall(r'rel="v:starring">(.*?)</a>', infotext, re.S))
            if "上映日期" in info_title[i]:
                date = "".join(re.findall(r'<span class="pl">上映日期:</span> <span property="v:initialReleaseDate" content="(.*?)">', infotext, re.S))
            if "片长" in info_title[i]:
                time = "".join(re.findall(r'片长:</span> <span property="v:runtime" content="(.*?)">', infotext, re.S))
            if "编剧" in info_title[i]:
                writer = "/".join(re.findall(r'<a href="/celebrity/.*?/">(.*?)</a>', infotext, re.S))

        info['lang'] = lang
        info['kind'] = kind
        info['country'] = country
        info['actor'] = actor
        info['date'] = date
        info['time'] = time
        info['writer'] = writer
        yield info
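
For reference, parse_list depends on the shape of the JSON that the new_search_subjects endpoint returned when this was written: an object whose data array holds one entry per movie. A sketch of a single entry, with invented placeholder values, looks roughly like this:

# Rough shape of one entry in info["data"]; the keys are the ones the
# spider reads, the values below are invented placeholders.
sample_entry = {
    "id": "1234567",
    "title": "某部电影",
    "rate": "8.5",
    "directors": ["某导演"],
    "url": "https://movie.douban.com/subject/1234567/",
}
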
douban250

Parses the sub-pages it reaches and digs out further details; the content gathered by the two spiders partly overlaps.

import re

import scrapy

from tttest.items import TttestItem, MovieItem


class Douban250Spider(scrapy.Spider):
    name = 'douban250'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        # Build the paged list URLs (25 movies per page).
        for page in (0, 1):
            url = 'top250?start=' + str(page * 25)
            url = response.urljoin(url)
            yield scrapy.Request(url=url, callback=self.parse_list)

    def parser_info(self, response):
        # Scrape the "info" block on the detail page (only the language here).
        info = MovieItem()
        info['directedBy'] = response.xpath('//*[@rel="v:directedBy"]/text()').extract_first()
        infotext = "".join(response.xpath('//div[@id="info"]').extract())
        info_title = response.xpath('//div[@id="info"]/span').extract()
        info['db_id'] = response.url.split('/')[-2]
        lang = ''  # default, in case the page has no language row
        for i in range(0, len(info_title)):
            if "语言" in info_title[i]:
                lang = "".join(re.findall(r'<span class="pl">语言:</span>(.*?)<br>', infotext, re.S)).strip()
        info['lang'] = lang
        yield info

    def parse_list(self, response):
        li_list = response.xpath('//*[@id="content"]/div/div[1]/ol/li')
        for li in li_list:
            url = li.xpath('div/div[1]/a/@href').extract_first()
            item = TttestItem()
            item["title"] = li.xpath('div/div[2]/div[1]/a/span[1]/text()').extract_first()
            item["score"] = li.xpath('div/div[2]/div[2]/div/span[2]/text()').extract_first()
            item["db_id"] = url.split('/')[-2]
            # Collapse all whitespace in the director/cast line and the genre line.
            item["author"] = "".join(li.xpath('div/div[2]/div[2]/p[1]/text()[1]').extract_first().split())
            item["kind"] = "".join(li.xpath('div/div[2]/div[2]/p[1]/text()[2]').extract_first().split())

            yield scrapy.Request(url=url, callback=self.parser_info)

            yield item
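
The parse method above only asks for the first two pages. If the whole Top 250 is wanted, one option is simply range(10); another is to follow the page's own "next" link so the offsets never need hard-coding. The sketch below is a self-contained illustration of that pattern with response.follow; the spider name and the grid_view/title/next class names are my assumptions about the page markup, not something taken from the project.

# A minimal, self-contained sketch of link-following pagination.
import scrapy


class Top250PagingSpider(scrapy.Spider):
    name = 'top250_paging'  # hypothetical helper spider, not part of the project
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        # Assumed selectors: the list is an <ol class="grid_view">, each title
        # sits in a <span class="title">.
        for li in response.xpath('//ol[@class="grid_view"]/li'):
            yield {'title': li.xpath('.//span[@class="title"][1]/text()').extract_first()}
        # Assumed selector: the pagination "next" link lives in <span class="next">.
        next_href = response.xpath('//span[@class="next"]/a/@href').extract_first()
        if next_href:
            # response.follow resolves the relative href against the current URL.
            yield response.follow(next_href, callback=self.parse)
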
settings/middlewares

This part dresses the crawler's requests up as ordinary browser traffic to lower the chance of being detected and banned. Only basic configuration is done here; settings.py is shown first, followed by middlewares.py.

# Scrapy settings for tttest project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'tttest'

SPIDER_MODULES = ['tttest.spiders']
NEWSPIDER_MODULE = 'tttest.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tttest (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 2

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'tttest.middlewares.TttestSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'tttest.middlewares.TttestDownloaderMiddleware': 543,
    'tttest.middlewares.RotateUserAgentMiddleware': 542,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'tttest.pipelines.TttestPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class TttestSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class TttestDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class RotateUserAgentMiddleware(UserAgentMiddleware):
    # Picks a random User-Agent for every outgoing request.

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        self.user_agent = random.choice(self.user_agent_list)
        if self.user_agent:
            print(self.user_agent)
            # Assign directly: DEFAULT_REQUEST_HEADERS already sets a User-Agent
            # earlier in the middleware chain, so setdefault() would never win here.
            request.headers['User-Agent'] = self.user_agent

    user_agent_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
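
Because DEFAULT_REQUEST_HEADERS in settings.py also carries a User-Agent, it is worth confirming that the rotated value actually reaches the server; that is also why the middleware above assigns the header directly instead of calling setdefault. A throwaway spider like the sketch below (the spider name and the use of httpbin.org are my own choices, not part of the project) just logs whatever headers an echo service received:

# Disposable check: httpbin.org/headers echoes back the request headers,
# so the logged output shows which User-Agent was actually sent.
import scrapy


class UACheckSpider(scrapy.Spider):
    name = 'uacheck'  # hypothetical helper spider
    start_urls = ['https://httpbin.org/headers']

    def parse(self, response):
        self.logger.info(response.text)

Run it with scrapy crawl uacheck from inside the project so the middleware configuration in settings.py applies.
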
items

The item definitions are the bridge between the spiders and database storage: they declare the fields to be collected. The spiders fill them in, and the pipeline writes them to the database.

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class TttestItem(scrapy.Item):
    # Summary fields collected from the list pages / search API.
    title = scrapy.Field()
    score = scrapy.Field()
    author = scrapy.Field()
    kind = scrapy.Field()  # set by the douban250 spider; must be declared or assignment raises KeyError
    db_id = scrapy.Field()
    directedBy = scrapy.Field()


class MovieItem(scrapy.Item):
    # Detail fields scraped from each movie's own page.
    directedBy = scrapy.Field()
    lang = scrapy.Field()
    kind = scrapy.Field()
    country = scrapy.Field()
    actor = scrapy.Field()
    writer = scrapy.Field()
    date = scrapy.Field()
    time = scrapy.Field()
    db_id = scrapy.Field()
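
Scrapy items behave like dictionaries restricted to their declared fields, which is exactly what the pipeline below leans on; assigning an undeclared key raises KeyError, which is why kind has to be declared on TttestItem for the douban250 spider. A tiny illustration, with invented values:

# Items act like dicts limited to the declared fields; values are invented.
from itemadapter import ItemAdapter
from tttest.items import TttestItem

item = TttestItem()
item['db_id'] = '1234567'
item['title'] = '某部电影'
item['score'] = '8.5'
print(item.get('directedBy', ''))  # declared but unset fields fall back to the default
print(ItemAdapter(item).asdict())  # {'db_id': '1234567', 'title': '某部电影', 'score': '8.5'}
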
pipelines

Database handling: connecting to the database, receiving items, and writing them out with SQL statements. Since the data arrives in two batches, the movie information is likewise written to the database in two passes.

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
import pymysql
from itemadapter import ItemAdapter

from tttest.items import MovieItem, TttestItem


class TttestPipeline:
    def __init__(self):
        self.connect = pymysql.connect(
            host='www.??.com',
            port=3391,
            db='',
            user='',
            passwd='',
            charset='utf8',
            use_unicode=True,
            cursorclass=pymysql.cursors.DictCursor
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # Route each item type to its own handler.
        if isinstance(item, TttestItem):
            self.process_tttest_item(item)
        if isinstance(item, MovieItem):
            self.process_movie_item(item)
        return item

    def process_tttest_item(self, item):
        # Insert the summary record, or update it if the movie already exists.
        try:
            self.cursor.execute('''
                SELECT * FROM movies_top250 WHERE db_id=%s
            ''', (item["db_id"],))
            film = self.cursor.fetchone()
            # Items from douban250 do not set directedBy, so fall back to ''.
            director = item.get('directedBy', '')
            if film is None:
                self.cursor.execute('''
                    INSERT INTO movies_top250(db_id,title,score,director)
                    VALUES(%s,%s,%s,%s)
                ''', (item['db_id'], item['title'], item['score'], director))
            else:
                self.cursor.execute('''
                    UPDATE movies_top250
                    SET title = %s,score = %s,director = %s
                    WHERE db_id = %s
                ''', (item['title'], item['score'], director, item['db_id']))
            self.connect.commit()
        except Exception as err:
            print("Error 1: " + str(err))

    def process_movie_item(self, item):
        # Fill in the detail columns for a record created above.
        try:
            self.cursor.execute('''
                UPDATE movies_top250
                SET lang = %s,kind = %s,country = %s,actor = %s,date = %s,time = %s,writer = %s
                WHERE db_id = %s
            ''', (item['lang'], item['kind'], item["country"], item["actor"], item["date"], item["time"], item["writer"], item["db_id"]))
            self.connect.commit()
            print(item['lang'])
        except Exception as err:
            print("Error 2: " + str(err))

debug

The launcher script, the origin of everything (so to speak).

from scrapy.cmdline import execute
# execute('scrapy crawl douban250'.split())
execute('scrapy crawl movie'.split())
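
Running this file from the IDE is equivalent to typing scrapy crawl movie inside the project directory. If both spiders should run in one go, Scrapy's CrawlerProcess is an alternative to cmdline.execute; a minimal sketch:

# Alternative launcher: run both spiders from one script with CrawlerProcess.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # picks up the project's settings.py
process.crawl('douban250')
process.crawl('movie')
process.start()  # blocks until both spiders finish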