Python爬虫实战(三):爬网易新闻
时间:2023-09-14 08:59:06
代码:
# _*_ coding:utf-8 _*_
"""Crawl the NetEase news front page and dump every linked article to a text file.

For each article URL found on the front page the script stores the title,
publication time, source name and plain-text body, followed by a separator
line.  The code runs unchanged on Python 2 and Python 3.
"""
import io
import re

try:  # Python 2
    from urllib2 import Request, urlopen
except ImportError:  # Python 3
    from urllib.request import Request, urlopen


class Tool:
    """Turn an article's HTML fragment into plain text via regex substitutions."""

    # Centered image paragraphs carry no text worth keeping.
    removeImg = re.compile(r'<p class="f_center".*?</p>')
    # Drop anchor tags but keep the text between them.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Block-level boundaries become line breaks.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    replaceTD = re.compile(r'<td>')
    replacePara = re.compile(r'<p.*?>')
    # A doubled <br> collapses into a single line break; the alternation
    # must list the longer form first so it wins over a lone <br>.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Anything still looking like a tag is discarded last.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, text):
        """Return *text* with HTML markup replaced by whitespace or removed.

        The substitutions are order-sensitive: image paragraphs must go
        before generic ``</p>``/``<p ...>`` handling, and the catch-all tag
        removal must run last.  The result is stripped of surrounding
        whitespace.
        """
        text = self.removeImg.sub("", text)
        text = self.removeAddr.sub("", text)
        text = self.replaceLine.sub("\n", text)
        text = self.replaceTD.sub("\t", text)
        text = self.replacePara.sub("\n" + " ", text)
        text = self.replaceBR.sub("\n", text)
        text = self.removeExtraTag.sub("", text)
        return text.strip()


class WYXW:
    """Crawler for the news site rooted at ``baseUrl``."""

    # Dots are escaped so that only the literal host name and ".html" match.
    _URL_RE = re.compile(r'http://news\.163\.com/\d{2}/\d{4}/\d{2}/\w{16}\.html')
    _TIME_RE = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')
    _SOURCE_RE = re.compile(r'<a id="ne_article_source".*?>(.*?)</a>')
    _TITLE_RE = re.compile(r'<h1 id="h1title" class="ep-h1">(.*?)</h1>', re.S)
    _CONTENT_RE = re.compile(r'<div id="endText".*?>(.*?)<!.*?-->', re.S)
    _SEPARATOR = u"\n" + u"-------------------------------------------------------------------------" + u"\n"

    def __init__(self, baseUrl):
        """Remember the front-page URL and set up request headers and helpers."""
        self.baseURL = baseUrl
        self.user_agent = 'Mozilla/4.0 (compatible;MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        # Default output file name (without extension) used by writeData(None).
        self.fileName = u'网易新闻'
        self.tool = Tool()

    def _fetch(self, url):
        """Download *url* with the configured headers and return it as text.

        The charset announced in the Content-Type header is used when
        present; otherwise (or when the announced codec is unknown) the body
        is decoded as UTF-8.  Undecodable bytes are dropped.
        """
        response = urlopen(Request(url, headers=self.headers))
        try:
            raw = response.read()
            info = response.headers
        finally:
            response.close()
        # Python 3 messages expose get_content_charset(); Python 2 ones getparam().
        if hasattr(info, 'get_content_charset'):
            charset = info.get_content_charset()
        else:
            charset = info.getparam('charset')
        try:
            return raw.decode(charset or 'utf-8', 'ignore')
        except LookupError:
            return raw.decode('utf-8', 'ignore')

    @staticmethod
    def _first_match(pattern, text, group):
        """Return *group* of the first match of *pattern* in *text*, or None."""
        match = pattern.search(text)
        if match is None:
            return None
        return match.group(group)

    def get_homepage(self):
        """Return the front page at ``self.baseURL`` as text."""
        return self._fetch(self.baseURL)

    def extract_url(self, homepage):
        """Return every article URL found in *homepage*, in document order.

        Duplicates are kept; an empty list is returned when nothing matches.
        """
        return self._URL_RE.findall(homepage)

    def extract_sub_web_time(self, sub_web):
        """Return the first ``YYYY-MM-DD HH:MM:SS`` stamp in *sub_web*, or None.

        A found value is also printed as progress output.
        """
        stamp = self._first_match(self._TIME_RE, sub_web, 0)
        if stamp is not None:
            print(stamp)
        return stamp

    def extract_sub_web_source(self, sub_web):
        """Return the text of the ``ne_article_source`` link in *sub_web*, or None.

        A found value is also printed as progress output.
        """
        source = self._first_match(self._SOURCE_RE, sub_web, 1)
        if source is not None:
            print(source)
        return source

    def extract_sub_web_title(self, sub_web):
        """Return the article headline from *sub_web*, or None when absent.

        A found value is also printed as progress output.
        """
        title = self._first_match(self._TITLE_RE, sub_web, 1)
        if title is not None:
            print(title)
        return title

    def extract_sub_web_content(self, sub_web):
        """Return the raw HTML of the article body in *sub_web*, or None.

        The body is whatever lies between the opening ``endText`` div and the
        first HTML comment that follows it.
        """
        return self._first_match(self._CONTENT_RE, sub_web, 1)

    def writeData(self, fName):
        """Crawl all front-page articles and write them to ``<fName>.txt``.

        Args:
            fName: Output file name without extension; ``None`` selects
                ``self.fileName``.

        Pages lacking a recognisable title or body are skipped.  A missing
        time or source is written as an empty field.  Network errors raised
        while fetching propagate to the caller; the output file is closed in
        every case.
        """
        name = self.fileName if fName is None else fName
        # Fetch before opening so a failed request does not truncate an
        # existing output file.
        news_urls = self.extract_url(self.get_homepage())
        seen = set()
        with io.open(name + u'.txt', 'w', encoding='utf-8') as out:
            for url in news_urls:
                # The front page links the same article several times.
                if url in seen:
                    continue
                seen.add(url)
                print(url)
                web = self._fetch(url)
                title = self.extract_sub_web_title(web)
                content = self.extract_sub_web_content(web)
                if title is None or content is None:
                    continue
                published = self.extract_sub_web_time(web) or u""
                source = self.extract_sub_web_source(web) or u""
                news = (title.strip() + u"\n\n" + published.strip() + u"\t"
                        + source.strip() + u"\n\n"
                        + self.tool.replace(content) + u"\n")
                out.write(news)
                out.write(self._SEPARATOR)
                print(u"新闻写入成功" + "\n")


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    baseUrl = "http://news.163.com"
    wyxw = WYXW(baseUrl)
    wyxw.writeData(None)
相关文章
- 快速入门Python机器学习(37)
- Python爬虫之fiddler手机抓包
- 终于来了, 彭涛Python 爬虫训练营 !本周最低价,这次千万别错过了!
- 用Python的turtle库画太极图
- Python中的groupby分组
- Python 的数据结构
- 怎么用python打开csv文件_Python文本处理之csv-csv文件怎么打开[通俗易懂]
- python爬虫库_python爬虫实战百度云盘
- Python编程经典案例【考题】判断日期是该年中的第几天
- 盘点一个Python网络爬虫过程中中文乱码的问题
- python爬虫之lxml库xpath的基本使用
- 吐槽一下 Python 混乱的 multiprocessing 和 threading
- Python爬虫之xpath语法及案例使用
- python教程:isinstance和issubclass的用法
- 自动规整微信接收文件-python
- python爬虫增加多线程获取数据
- 【开发环境】Mac 中安装 Python3 最新版本 ( 下载 Python 最新版本 | 安装 Python3 | 验证 Python3 )
- Python基础语法-基本数据类型-字符串的常用方法
- Python class:定义类(入门必读)
- Python 中使用 MongoDB 存储爬虫数据详解编程语言
- python、java爬虫使用代理的区别
- Python语言编写电脑时间自动同步小工具
- python使用cPickle模块序列化实例