Preface

This was part of a big-data practical training course: crawl the data, clean it, analyze it, and finally present it as a web page. Most of the overall work feels like front-end; this part covers the crawling, done in Python with the Scrapy framework. It is quite different from the crawlers I wrote before and feels far more modular, but for a beginner (me) it is not exactly friendly. So far I only have a basic grasp of the framework and have used it for simple crawls, so the only way forward is to dig in step by step.

Project Structure

spiders

The spider code itself. It handles resolving and following links and extracting the data (regex parsing and so on), and is very similar to the stand-alone crawler scripts I wrote before; the most obvious difference is that it does not output results directly but yields the data downstream for further processing.

movie

Parses the main listing pages and collects summary information for each movie.

import re
import json

import scrapy

from tttest.items import TttestItem, MovieItem


class MovieSpider(scrapy.Spider):
    name = 'movie'
    allowed_domains = ['movie.douban.com']
    start_urls = ['http://movie.douban.com/']

    def parse(self, response):
        # Page through Douban's JSON search API, 20 movies per request.
        for page in range(122, 123):
            print(page)
            url = ('https://movie.douban.com/j/new_search_subjects'
                   '?sort=U&range=0,10&tags=%E7%94%B5%E5%BD%B1&start=' + str(page * 20))
            yield scrapy.Request(url=url, callback=self.parse_list)

    def parse_list(self, response):
        # The endpoint returns JSON, so no HTML parsing is needed here.
        info = json.loads(response.text)

        for movie in info["data"]:
            item = TttestItem()
            item["db_id"] = movie["id"]
            item["title"] = movie["title"]
            item["score"] = movie["rate"]
            item["directedBy"] = movie['directors'][0]

            yield item
            # Follow the detail page for the remaining fields.
            yield scrapy.Request(url=movie["url"], callback=self.parse_info)

    def parse_info(self, response):
        info = MovieItem()
        info['directedBy'] = response.xpath('//*[@rel="v:directedBy"]/text()').extract_first()
        infotext = "".join(response.xpath('//div[@id="info"]').extract())
        info_title = response.xpath('//div[@id="info"]/span').extract()
        info['db_id'] = response.url.split('/')[-2]
        # Default values, in case a field is missing on this particular page
        # (otherwise the assignments below would hit an unbound variable).
        lang = kind = country = actor = date = time = writer = ''
        for i in range(0, len(info_title)):
            if "语言" in info_title[i]:
                lang = "".join(re.findall(r'<span class="pl">语言:</span>(.*?)<br>', infotext, re.S)).strip()
            if "类型" in info_title[i]:
                kind = re.findall(r'<span class="pl">类型:</span>(.*?)<br>', infotext, re.S)[0].replace('<span property="v:genre">', '').replace('</span>', '')
            if "制片国家" in info_title[i]:
                country = "".join(re.findall(r'<span class="pl">制片国家/地区:</span>(.*?)<br>', infotext, re.S)).strip()
            if "主演" in info_title[i]:
                actor = "/".join(re.findall(r'rel="v:starring">(.*?)</a>', infotext, re.S))
            if "上映日期" in info_title[i]:
                date = "".join(re.findall(r'<span class="pl">上映日期:</span> <span property="v:initialReleaseDate" content="(.*?)">', infotext, re.S))
            if "片长" in info_title[i]:
                time = "".join(re.findall(r'片长:</span> <span property="v:runtime" content="(.*?)">', infotext, re.S))
            if "编剧" in info_title[i]:
                writer = "/".join(re.findall(r'<a href="/celebrity/.*?/">(.*?)</a>', infotext, re.S))

        info['lang'] = lang
        info['kind'] = kind
        info['country'] = country
        info['actor'] = actor
        info['date'] = date
        info['time'] = time
        info['writer'] = writer
        yield info
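
For reference, parse_list depends on the shape of the JSON that the new_search_subjects endpoint returned when this was written: an object whose data array holds one entry per movie. A sketch of a single entry, with invented placeholder values, looks roughly like this:

# Rough shape of one entry in info["data"]; the keys are the ones the
# spider reads, the values below are invented placeholders.
sample_entry = {
    "id": "1234567",
    "title": "某部电影",
    "rate": "8.5",
    "directors": ["某导演"],
    "url": "https://movie.douban.com/subject/1234567/",
}
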
douban250

Parses the sub-pages it reaches and digs out further details; the content gathered by the two spiders partly overlaps.

import re

import scrapy

from tttest.items import TttestItem, MovieItem


class Douban250Spider(scrapy.Spider):
    name = 'douban250'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        # Build the paged list URLs (25 movies per page).
        for page in (0, 1):
            url = 'top250?start=' + str(page * 25)
            url = response.urljoin(url)
            yield scrapy.Request(url=url, callback=self.parse_list)

    def parser_info(self, response):
        # Scrape the "info" block on the detail page (only the language here).
        info = MovieItem()
        info['directedBy'] = response.xpath('//*[@rel="v:directedBy"]/text()').extract_first()
        infotext = "".join(response.xpath('//div[@id="info"]').extract())
        info_title = response.xpath('//div[@id="info"]/span').extract()
        info['db_id'] = response.url.split('/')[-2]
        lang = ''  # default, in case the page has no language row
        for i in range(0, len(info_title)):
            if "语言" in info_title[i]:
                lang = "".join(re.findall(r'<span class="pl">语言:</span>(.*?)<br>', infotext, re.S)).strip()
        info['lang'] = lang
        yield info

    def parse_list(self, response):
        li_list = response.xpath('//*[@id="content"]/div/div[1]/ol/li')
        for li in li_list:
            url = li.xpath('div/div[1]/a/@href').extract_first()
            item = TttestItem()
            item["title"] = li.xpath('div/div[2]/div[1]/a/span[1]/text()').extract_first()
            item["score"] = li.xpath('div/div[2]/div[2]/div/span[2]/text()').extract_first()
            item["db_id"] = url.split('/')[-2]
            # Collapse all whitespace in the director/cast line and the genre line.
            item["author"] = "".join(li.xpath('div/div[2]/div[2]/p[1]/text()[1]').extract_first().split())
            item["kind"] = "".join(li.xpath('div/div[2]/div[2]/p[1]/text()[2]').extract_first().split())

            yield scrapy.Request(url=url, callback=self.parser_info)

            yield item
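
The parse method above only asks for the first two pages. If the whole Top 250 is wanted, one option is simply range(10); another is to follow the page's own "next" link so the offsets never need hard-coding. The sketch below is a self-contained illustration of that pattern with response.follow; the spider name and the grid_view/title/next class names are my assumptions about the page markup, not something taken from the project.

# A minimal, self-contained sketch of link-following pagination.
import scrapy


class Top250PagingSpider(scrapy.Spider):
    name = 'top250_paging'  # hypothetical helper spider, not part of the project
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        # Assumed selectors: the list is an <ol class="grid_view">, each title
        # sits in a <span class="title">.
        for li in response.xpath('//ol[@class="grid_view"]/li'):
            yield {'title': li.xpath('.//span[@class="title"][1]/text()').extract_first()}
        # Assumed selector: the pagination "next" link lives in <span class="next">.
        next_href = response.xpath('//span[@class="next"]/a/@href').extract_first()
        if next_href:
            # response.follow resolves the relative href against the current URL.
            yield response.follow(next_href, callback=self.parse)
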
settings/middlewares

This part dresses the crawler's requests up as ordinary browser traffic to lower the chance of being detected and banned. Only basic configuration is done here; settings.py is shown first, followed by middlewares.py.

# Scrapy settings for tttest project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'tttest'

SPIDER_MODULES = ['tttest.spiders']
NEWSPIDER_MODULE = 'tttest.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tttest (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 2

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'tttest.middlewares.TttestSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'tttest.middlewares.TttestDownloaderMiddleware': 543,
    'tttest.middlewares.RotateUserAgentMiddleware': 542,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'tttest.pipelines.TttestPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class TttestSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class TttestDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class RotateUserAgentMiddleware(UserAgentMiddleware):
    # Picks a random User-Agent for every outgoing request.

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        self.user_agent = random.choice(self.user_agent_list)
        if self.user_agent:
            print(self.user_agent)
            # Assign directly: DEFAULT_REQUEST_HEADERS already sets a User-Agent
            # earlier in the middleware chain, so setdefault() would never win here.
            request.headers['User-Agent'] = self.user_agent

    user_agent_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
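
Because DEFAULT_REQUEST_HEADERS in settings.py also carries a User-Agent, it is worth confirming that the rotated value actually reaches the server; that is also why the middleware above assigns the header directly instead of calling setdefault. A throwaway spider like the sketch below (the spider name and the use of httpbin.org are my own choices, not part of the project) just logs whatever headers an echo service received:

# Disposable check: httpbin.org/headers echoes back the request headers,
# so the logged output shows which User-Agent was actually sent.
import scrapy


class UACheckSpider(scrapy.Spider):
    name = 'uacheck'  # hypothetical helper spider
    start_urls = ['https://httpbin.org/headers']

    def parse(self, response):
        self.logger.info(response.text)

Run it with scrapy crawl uacheck from inside the project so the middleware configuration in settings.py applies.
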
items

The item definitions are the bridge between the spiders and database storage: they declare the fields to be collected. The spiders fill them in, and the pipeline writes them to the database.

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class TttestItem(scrapy.Item):
    # Summary fields collected from the list pages / search API.
    title = scrapy.Field()
    score = scrapy.Field()
    author = scrapy.Field()
    kind = scrapy.Field()  # set by the douban250 spider; must be declared or assignment raises KeyError
    db_id = scrapy.Field()
    directedBy = scrapy.Field()


class MovieItem(scrapy.Item):
    # Detail fields scraped from each movie's own page.
    directedBy = scrapy.Field()
    lang = scrapy.Field()
    kind = scrapy.Field()
    country = scrapy.Field()
    actor = scrapy.Field()
    writer = scrapy.Field()
    date = scrapy.Field()
    time = scrapy.Field()
    db_id = scrapy.Field()
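
Scrapy items behave like dictionaries restricted to their declared fields, which is exactly what the pipeline below leans on; assigning an undeclared key raises KeyError, which is why kind has to be declared on TttestItem for the douban250 spider. A tiny illustration, with invented values:

# Items act like dicts limited to the declared fields; values are invented.
from itemadapter import ItemAdapter
from tttest.items import TttestItem

item = TttestItem()
item['db_id'] = '1234567'
item['title'] = '某部电影'
item['score'] = '8.5'
print(item.get('directedBy', ''))  # declared but unset fields fall back to the default
print(ItemAdapter(item).asdict())  # {'db_id': '1234567', 'title': '某部电影', 'score': '8.5'}
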
pipelines

Database handling: connecting to the database, receiving items, and writing them out with SQL statements. Since the data arrives in two batches, the movie information is likewise written to the database in two passes.

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
import pymysql
from itemadapter import ItemAdapter

from tttest.items import MovieItem, TttestItem


class TttestPipeline:
    def __init__(self):
        self.connect = pymysql.connect(
            host='www.??.com',
            port=3391,
            db='',
            user='',
            passwd='',
            charset='utf8',
            use_unicode=True,
            cursorclass=pymysql.cursors.DictCursor
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # Route each item type to its own handler.
        if isinstance(item, TttestItem):
            self.process_tttest_item(item)
        if isinstance(item, MovieItem):
            self.process_movie_item(item)
        return item

    def process_tttest_item(self, item):
        # Insert the summary record, or update it if the movie already exists.
        try:
            self.cursor.execute('''
                SELECT * FROM movies_top250 WHERE db_id=%s
            ''', (item["db_id"],))
            film = self.cursor.fetchone()
            # Items from douban250 do not set directedBy, so fall back to ''.
            director = item.get('directedBy', '')
            if film is None:
                self.cursor.execute('''
                    INSERT INTO movies_top250(db_id,title,score,director)
                    VALUES(%s,%s,%s,%s)
                ''', (item['db_id'], item['title'], item['score'], director))
            else:
                self.cursor.execute('''
                    UPDATE movies_top250
                    SET title = %s,score = %s,director = %s
                    WHERE db_id = %s
                ''', (item['title'], item['score'], director, item['db_id']))
            self.connect.commit()
        except Exception as err:
            print("Error 1: " + str(err))

    def process_movie_item(self, item):
        # Fill in the detail columns for a record created above.
        try:
            self.cursor.execute('''
                UPDATE movies_top250
                SET lang = %s,kind = %s,country = %s,actor = %s,date = %s,time = %s,writer = %s
                WHERE db_id = %s
            ''', (item['lang'], item['kind'], item["country"], item["actor"], item["date"], item["time"], item["writer"], item["db_id"]))
            self.connect.commit()
            print(item['lang'])
        except Exception as err:
            print("Error 2: " + str(err))

debug

The launcher script, the origin of everything (so to speak).

from scrapy.cmdline import execute
# execute('scrapy crawl douban250'.split())
execute('scrapy crawl movie'.split())
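
Running this file from the IDE is equivalent to typing scrapy crawl movie inside the project directory. If both spiders should run in one go, Scrapy's CrawlerProcess is an alternative to cmdline.execute; a minimal sketch:

# Alternative launcher: run both spiders from one script with CrawlerProcess.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # picks up the project's settings.py
process.crawl('douban250')
process.crawl('movie')
process.start()  # blocks until both spiders finish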