python爬虫入门(九)Scrapy框架之数据库保存
2023-09-11 14:19:33 时间
豆瓣电影TOP 250爬取-->>>数据保存到MongoDB
要求:
1.爬取豆瓣top 250电影名字、演员列表、评分和简介
2.设置随机UserAgent和Proxy
3.爬取到的数据保存到MongoDB数据库
items.py
# -*- coding: utf-8 -*-
import scrapy


class DoubanItem(scrapy.Item):
    """Fields collected for one movie of the Douban Top 250 list."""

    # Movie title
    title = scrapy.Field()

    # Info text of the entry (first text node of the "bd" block)
    bd = scrapy.Field()

    # Rating score
    star = scrapy.Field()

    # Short introduction / quote; the spider only sets it when present
    quote = scrapy.Field()
doubanmovie.py
# -*- coding: utf-8 -*-
import scrapy

from douban.items import DoubanItem


class DoubamovieSpider(scrapy.Spider):
    """Crawl the Douban Top 250 movie list, one 25-movie page at a time."""

    name = "doubanmovie"
    allowed_domains = ["movie.douban.com"]

    # Paging state: the list is addressed by "?start=<offset>".
    offset = 0
    url = "https://movie.douban.com/top250?start="
    start_urls = (
        url + str(offset),
    )

    def parse(self, response):
        """Yield one DoubanItem per movie on the page, then request the next page.

        Args:
            response: the downloaded list page.

        Yields:
            DoubanItem objects, followed by a Request for the next page while
            the offset is still below 225.
        """
        for each in response.xpath("//div[@class='info']"):
            # Build a fresh item for every movie. Reusing a single instance
            # makes all yielded items share state, so a movie without a quote
            # would silently inherit the quote of the previous movie.
            item = DoubanItem()

            # extract_first() returns None instead of raising IndexError when
            # a node is missing, so one malformed entry cannot abort the page
            # (and with it the pagination request below).
            # Title
            item['title'] = each.xpath(
                ".//span[@class='title'][1]/text()").extract_first()
            # Info text
            item['bd'] = each.xpath(
                ".//div[@class='bd']/p/text()").extract_first()
            # Rating
            item['star'] = each.xpath(
                ".//div[@class='star']/span[@class='rating_num']/text()"
            ).extract_first()
            # Introduction / quote: not every movie has one, so the field is
            # only set when it exists.
            quote = each.xpath(".//p[@class='quote']/span/text()").extract_first()
            if quote is not None:
                item['quote'] = quote

            yield item

        # Pages start at offsets 0, 25, ..., 225; stop after the last one.
        if self.offset < 225:
            self.offset += 25
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
pipelines.py
# -*- coding: utf-8 -*-
import pymongo
# scrapy.conf is no longer available in current Scrapy releases;
# get_project_settings() is the supported way to load the project settings
# when no crawler object is at hand.
from scrapy.utils.project import get_project_settings


class DoubanPipeline(object):
    """Item pipeline that stores every scraped item as a MongoDB document.

    Connection parameters are read from the MONGODB_HOST, MONGODB_PORT,
    MONGODB_DBNAME and MONGODB_SHEETNAME settings.
    """

    def __init__(self, settings=None):
        """Open the MongoDB connection and select the target collection.

        Args:
            settings: a mapping-like settings object. When omitted, the
                project settings are loaded, so ``DoubanPipeline()`` keeps
                working as before.
        """
        if settings is None:
            settings = get_project_settings()

        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        sheetname = settings["MONGODB_SHEETNAME"]

        # Keep a handle on the client so it can be closed in close_spider().
        self.client = pymongo.MongoClient(host=host, port=port)
        # Database, then the collection that receives the items.
        self.sheet = self.client[dbname][sheetname]

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the running crawler's settings."""
        return cls(crawler.settings)

    def process_item(self, item, spider):
        """Insert a copy of the item into MongoDB and pass the item on.

        Returns:
            The unmodified item, so later pipelines can process it.
        """
        # insert_one() replaces Collection.insert(), which was deprecated in
        # PyMongo 3 and removed in PyMongo 4. dict(item) is a copy, so the
        # "_id" key added by PyMongo does not leak into the item itself.
        self.sheet.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Release the MongoDB connection when the spider finishes."""
        self.client.close()
settings.py
# Seconds to wait between requests, and cookie handling switched off.
DOWNLOAD_DELAY = 2.5
COOKIES_ENABLED = False

# Downloader middlewares and their order values.
DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.RandomUserAgent': 100,
    'douban.middlewares.RandomProxy': 200,
}

# Pool of User-Agent strings; RandomUserAgent picks one per request.
USER_AGENTS = [
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
]

# Pool of proxies; RandomProxy picks one per request. Each entry needs an
# "ip_port" key ("host:port") and a "user_passwd" key holding the proxy
# credentials. "****" is presumably a redacted placeholder - replace it with
# real credentials. Further proxies are added as additional dicts.
PROXIES = [
    {"ip_port": "121.42.140.113:16816", "user_passwd": "****"},
]

# Item pipelines and their order values.
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}

# MongoDB host name
MONGODB_HOST = "127.0.0.1"

# MongoDB port
MONGODB_PORT = 27017

# Database name
MONGODB_DBNAME = "Douban"

# Name of the collection that stores the items
MONGODB_SHEETNAME = "doubanmovies"
middlewares.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import base64
import random

# Absolute imports: a bare "from settings import ..." only resolves through
# Python 2's implicit relative imports and fails on Python 3.
from douban.settings import PROXIES
from douban.settings import USER_AGENTS


class RandomUserAgent(object):
    """Downloader middleware that gives each request a random User-Agent."""

    def process_request(self, request, spider):
        """Set the User-Agent header to a random entry of USER_AGENTS.

        Returns None so the request continues through the middleware chain.
        """
        if not USER_AGENTS:
            # Nothing configured: leave the request untouched instead of
            # letting random.choice() raise on an empty list.
            return
        useragent = random.choice(USER_AGENTS)
        # setdefault keeps a User-Agent that is already set on the request.
        request.headers.setdefault("User-Agent", useragent)


class RandomProxy(object):
    """Downloader middleware that routes each request through a random proxy."""

    def process_request(self, request, spider):
        """Pick a random entry of PROXIES and apply it to the request.

        Sets request.meta['proxy'] and, when the entry carries credentials,
        the Proxy-Authorization header. Returns None so the request continues
        through the middleware chain.
        """
        if not PROXIES:
            # No proxy configured: send the request directly.
            return

        proxy = random.choice(PROXIES)
        request.meta['proxy'] = "http://" + proxy['ip_port']

        # Treat a missing, None or empty credential string alike as "proxy
        # without authentication"; otherwise an empty Basic header is sent.
        user_passwd = proxy.get('user_passwd')
        if user_passwd:
            # b64encode() works on bytes and returns bytes, so encode before
            # and decode after to build the header value as text. This works
            # on both Python 2 and Python 3.
            base64_userpasswd = base64.b64encode(
                user_passwd.encode('utf-8')).decode('ascii')
            # Header format expected by the proxy server.
            request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
相关文章
- 简明Python教程学习笔记_8_异常
- python模块分析之sqlite3数据库
- Python内置数据类型
- python copy()和deepcopy()解释(浅拷贝、深拷贝)
- Python怎么就火起来了?学会python可以做的兼职
- Python中的海象运算符“:=”使用方法详解
- python中difflib库用法详解
- 《 Python树莓派编程》——2.2 命令
- Python编程-基础知识-python项目包和文件的管理以及如何引用相对路径的包和模块
- 【Python】Linux Acanoda PySpark Spark
- 基于Python+AlphaBot 实现红外遥控且自动避障的嵌入式智能小车系统【100010367】
- 使用Python采集数据,制作个性化COSer图书
- Python 爬虫教程之使用 Python 从 YouTube 链接获取频道名称并存储到SQLite数据库中(教程含源码)
- Python与数据库 sqlalchemy 建立声明层表对象的两种方式
- python操作mysql数据库系列-操作MySql数据库(二)
- python连接PostgreSQL数据库的学习一
- java的sha1加密,转化为python版本
- python:找出两个列表中相同和不同的元素(使用推导式)
- 学习python第三天数据库day2
- Python字典
- Python - 3MySQL 数据库连接
- 【python 学习】——pycharm终端解释器和Python解释器配置
- 学习笔记(25):Python网络编程&并发编程-Thread其他属性和守护线程
- Selenium4 Python实现Page Factory设计模式,python新的定位方法
- Python opencv-python 简单测试
- Python数据库操作
- python封装一个工具类 ,对MySQL数据库增删改查,可多字段动态插入mysql数据库中