zl程序教程

您现在的位置是:首页 >  后端

当前栏目

Python爬虫实战(三):爬网易新闻

Python爬虫 实战 新闻 网易
2023-09-14 08:59:06 时间

代码:

# _*_ coding:utf-8 _*_
import urllib2
import re
#import sys

#reload(sys)
#sys.setdefaultencoding('utf-8')
    
class Tool:
    """HTML-to-plain-text cleaner for 163.com article bodies.

    Patterns are applied in a fixed order by replace(): strip the
    centered-image paragraphs and anchor tags first, then convert
    structural tags to whitespace, and finally drop any leftover tags.
    """
    # <p class="f_center">...</p> wraps inline images/captions -- drop whole span
    removeImg = re.compile(r'<p class="f_center".*?</p>')
    # opening or closing anchor tags (keep the link text)
    removeAddr = re.compile(r'<a.*?>|</a>')
    # block-level boundaries become newlines
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # table cells become tabs
    replaceTD = re.compile(r'<td>')
    # paragraph openings become newline + 2-space indent
    replacePara = re.compile(r'<p.*?>')
    # BUGFIX: was r'<br<br>|<br>' -- the '<br<br>' alternative never matches,
    # so '<br><br>' yielded two newlines; collapse the pair to one newline.
    replaceBR = re.compile(r'<br><br>|<br>')
    # any tag not handled above is deleted outright
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, text):
        """Return *text* with HTML markup converted to plain text.

        Order matters: removals run before structural replacements so
        that e.g. an anchor inside a paragraph loses only its tags.
        """
        text = re.sub(self.removeImg, "", text)
        text = re.sub(self.removeAddr, "", text)
        text = re.sub(self.replaceLine, "\n", text)
        text = re.sub(self.replaceTD, "\t", text)
        text = re.sub(self.replacePara, "\n" + "  ", text)
        text = re.sub(self.replaceBR, "\n", text)
        text = re.sub(self.removeExtraTag, "", text)
        return text.strip()
        

class WYXW:
    def __init__(self,baseUrl):
        self.baseURL = baseUrl
        self.user_agent = 'Mozilla/4.0 (compatible;MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent':self.user_agent}
        #self.file = None
        self.fileName = u'网易新闻'
        self.tool = Tool()

    def get_homepage(self):
        url = self.baseURL
        request = urllib2.Request(url,headers = self.headers)
        response = urllib2.urlopen(request)
        content = response.read().decode('utf-8','ignore')
        #print content#.encode('gbk','ignore')
        return content

    def extract_url(self,homepage):
        pattern = "http://news.163.com/\d{2}/\d{4}/\d{2}/\w{16}.html"
        news_url = re.findall(pattern,homepage)
        #print news_url
        return news_url

    def extract_sub_web_time(self,sub_web):
        pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',re.S)
        time = re.findall(pattern,sub_web)
        print time[0]
        return time[0]

    def extract_sub_web_source(self,sub_web):
        pattern = re.compile(r'<a id="ne_article_source".*?>(.*?)</a>')
        source = re.findall(pattern,sub_web)
        print source[0]
        return source[0]

    def extract_sub_web_title(self,sub_web):
        #pattern = "<title>.+</title>"
        #pattern = '<h1 id="h1title" class="ep-h1">(.*?)</h1>'
        pattern = re.compile(r'<h1 id="h1title" class="ep-h1">(.*?)</h1>',re.S)
        title = re.findall(pattern,sub_web)
        if title is not None:
            print title[0]
            return title[0]
        else:
            return None

    def extract_sub_web_content(self,sub_web):
        #pattern = "<div id=\"Cnt-Main-Article-QQ\".*</div>"
        pattern = re.compile(r'<div id="endText".*?>(.*?)<!.*?-->',re.S)
        content = re.findall(pattern,sub_web)
        #print content[0]
        if content is not None:
            return content[0]
        else:
            return None

    def writeData(self,fName):
        if fName is not None: 
            file = open(fName + '.txt',"w+")
        else:
            file = open(self.fileName + '.txt',"w+")
        homepage = self.get_homepage()
        news_urls = self.extract_url(homepage)
        for url in news_urls:
            print url
            web = urllib2.urlopen(url).read()
            title = self.extract_sub_web_title(web).strip()
            content = self.extract_sub_web_content(web)
            time = self.extract_sub_web_time(web).strip()
            source = self.extract_sub_web_source(web).strip()
            if content is not None:
                content = self.tool.replace(content)
                news = title + "\n\n" + time + "\t" + source + "\n\n" + content + "\n"
                file.write(news)
                sep = "\n" + "-------------------------------------------------------------------------" + "\n"
                file.write(sep)
                print u"新闻写入成功" + "\n"
  

if __name__ == "__main__":
    # Guard the crawl behind __main__ so importing this module does not
    # trigger network I/O (the original ran the full crawl at import time).
    baseUrl = "http://news.163.com"
    wyxw = WYXW(baseUrl)
    wyxw.writeData(None)  # None -> default output file name (网易新闻.txt)