Simple Scraping of Sina News Data

# -*- coding: utf-8 -*-
"""
@author: sato
@file: sina_spider.py
@time: 2019-09-03 15:57

"""
import requests
import re
import multiprocessing
import os


class Spider(object):

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/76.0.3809.132 Safari/537.36'
        }
        # Sina news homepage: the daily headlines and key news sit under
        # id="wrap" > class="part_01 clearfix" > class="p_middle"
        self.base_url = 'https://news.sina.com.cn/'

    def get_news_list(self):
        # `headers` must be passed as a keyword argument; as the second
        # positional argument, requests would treat the dict as query params.
        res = requests.get(self.base_url, headers=self.headers)
        if res.status_code not in (200, 201):
            raise Exception('network error!')
        res.encoding = 'utf-8'
        # Slice out the "part_01 clearfix" headline block.
        part_01_clearfix = re.findall(r'<div class="part_01 clearfix">([\S\s]*?)<div class="part_01 clearfix" data-sudaclick="blk_livevideo">', res.text)
        if not part_01_clearfix:
            return None
        # Narrow to the middle column, then pull every article link from it.
        p_middle = re.findall(r'<div class="p_middle">([\S\s]*?)<div class="p_right">', part_01_clearfix[0])
        if p_middle:
            return re.findall(r'<a target="_blank" href="([\S\s]*?)"', p_middle[0])

    def rep_and_write(self, link):
        print(f'get data from {link}')
        ret = requests.get(url=link, headers=self.headers)
        if ret.status_code not in (200, 201):
            raise Exception(f'get {link} error!')
        ret.encoding = 'utf-8'
        content = ret.text
        titles = re.findall(r'<title>([\S\s]*?)</title>', content)
        if not titles:
            raise Exception(f'no <title> found in {link}')
        # The page title becomes the file name, so strip characters that are
        # illegal in file names first.
        title = re.sub(r'[\\/:*?"<>|]', '_', titles[0]).strip()
        with open(os.path.join('./html', title + '.html'), 'w', encoding='utf-8') as f:
            f.write(content)

    def run(self):
        links = self.get_news_list()
        if not links:
            raise Exception('error!')
        if not os.path.exists('./html'):
            os.mkdir('./html')
        pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
        # Keep the AsyncResult objects: calling .get() afterwards re-raises
        # worker exceptions that apply_async would otherwise swallow silently.
        results = [pool.apply_async(self.rep_and_write, (link,)) for link in links]
        pool.close()
        pool.join()
        for result in results:
            result.get()
        print('done')


# The __main__ guard is required by multiprocessing on platforms that use the
# "spawn" start method (Windows, and macOS on recent Python versions).
if __name__ == '__main__':
    spider = Spider()
    spider.run()
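
The regex slicing above is fragile against markup changes. As an alternative, here is a minimal sketch of the same link extraction using BeautifulSoup. It assumes beautifulsoup4 is installed (pip install beautifulsoup4) and reuses the DOM path noted in the __init__ comment (id="wrap" > class="part_01 clearfix" > class="p_middle"), which may no longer match the live page.

# Alternative sketch: CSS-selector based extraction with BeautifulSoup.
# Assumption: the selectors follow the DOM path from the comment in __init__
# and may have drifted on the live site since this post was written.
import requests
from bs4 import BeautifulSoup


def get_news_list_bs4():
    headers = {'User-Agent': 'Mozilla/5.0'}
    res = requests.get('https://news.sina.com.cn/', headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    middle = soup.select_one('#wrap .part_01.clearfix .p_middle')
    if middle is None:
        return []
    # Collect every outbound article link inside the middle column.
    return [a['href'] for a in middle.select('a[target="_blank"]')
            if a.has_attr('href')]

Selecting by id and class instead of slicing raw HTML with regular expressions keeps the extraction working even if whitespace or attribute order changes in the markup.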