您现在的位置是：首页 > Python

当前栏目

python爬虫之网易新闻网（简洁版）

Python 文件对象

2023-04-18 14:44:00 时间

网易新闻
爬虫
python

注释挺详细了，直接上全部代码，欢迎各位大佬批评指正。

from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By from time import sleep from lxml import etree import os import requests import csv

# 创建一个无头浏览器对象 chrome_options = Options() # 设置它为无框模式 chrome_options.add_argument('–headless') # 如果在windows上运行需要加代码 chrome_options.add_argument('–disable-gpu') browser = webdriver.Chrome(chrome_options=chrome_options) # 设置一个10秒的隐式等待 browser.implicitly_wait(10) # 使用谷歌无头浏览器来加载动态js def start_get(url,news_type): browser.get(url) sleep(1) # 翻到页底 browser.execute_script('window.scrollTo(0,document.body.scrollHeight)') sleep(1) # 点击加载更多 more_btn = browser.find_elements(By.CSS_SELECTOR, '.load_more_btn') if more_btn: try: more_btn[0].click() except Exception as e: print(e) print('继续….') sleep(1) # 再次翻页到底 browser.execute_script('window.scrollTo(0,document.body.scrollHeight)') # 拿到页面源代码 source = browser.page_source parse_page(source)

# 对新闻列表页面进行解析 def parse_page(html): # 创建etree对象 tree = etree.HTML(html) new_lst = tree.xpath('//div[@class="news_title"]//a') for one_new in new_lst: title = one_new.xpath('./text()')[0] link = one_new.xpath('./@href')[0] try: write_in(title, link,news_type) except Exception as e: print(e)

# 将其写入到文件 def write_in(title, link,news_type): alist = [] print('开始写入新闻{}'.format(title)) browser.get(link) sleep(1) # 再次翻页到底 browser.execute_script('window.scrollTo(0,document.body.scrollHeight)') # 拿到页面源代码 source = browser.page_source tree = etree.HTML(source)

alist.append(news_type) alist.append(title)

con_link = link alist.append(con_link)

content_lst = tree.xpath('//div[@class="post_text"]//p') con = '' if content_lst: for one_content in content_lst: if one_content.text: con = con + ' ' + one_content.text.strip() alist.append(con)

post_time_source = tree.xpath('//div[@class="post_time_source"]')[0].text if "来源:" in post_time_source: post_time = post_time_source.split("来源:")[0] alist.append(post_time)

else: post_time = post_time_source[0].text alist.append(post_time) post_source = tree.xpath('//div[@class="post_time_source"]/a[@id="ne_article_source"]')[0].text alist.append(post_source)

# browser.get(url) tiecount = tree.xpath('//a[@class="js-tiecount js-tielink"]')[0].text alist.append(tiecount)

tiejoincount = tree.xpath('//a[@class="js-tiejoincount js-tielink"]')[0].text alist.append(tiejoincount)

# 1. 创建文件对象 f = open('网易.csv', 'a+', encoding='utf-8',newline='') # 2. 基于文件对象构建 csv写入对象 csv_writer = csv.writer(f) # 3. 构建列表头 # csv_writer.writerow(["姓名", "年龄", "性别"]) # 4. 写入csv文件内容 print(alist) csv_writer.writerow(alist) f.close()

if __name__ == '__main__': urls = browser.find_elements_by_xpath('//a[@ne-role="tab-nav"]') links =[] for one in urls: url = one.get_attribute('href') links.append(url)

for one in range(len(links)): if 'http' in links[one]: # headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'} if not os.path.exists('new'): os.mkdir('new') news_type = urls[one].text start_get(links[one],news_type)

browser.quit()

结果如下：
python爬虫之网易新闻网（简洁版）
注：本文仅用于技术交流，不得用于商业用途。不遵守者，与本文作者无关。

猜你喜欢

Jease 2.6发布 Java开源内容框架
EasyCVR对接华为iVS订阅摄像机和用户变更请求接口介绍
JVM调优总结：反思
【技术种草】cdn+轻量服务器+hugo=让博客“云原生”一下
JVM调优总结：调优方法
前端面试【JavaScript】— typeof 是否能正确判断类型？
JVM调优总结：新一代的垃圾回收算法
前端面试【JavaScript】— instanceof 能否判断基本数据类型？
JVM调优总结：典型配置举例
前端面试【JavaScript】— 能不能手动实现一下 instanceof 的功能？
前端面试【JavaScript】— Object.is和=== 有什么区别？
JVM调优总结：分代垃圾回收详述
前端面试【JavaScript】— JS中类型转换有哪几种？
WPF开发入门尝试
前端面试【JavaScript】— == 和 ===有什么区别？
一个Java程序员对2011年的回顾
前端面试【JavaScript】— 对象转原始类型是根据什么流程运行的？
JVM调优总结：垃圾回收面临的问题
直接在代码里面对list集合进行分页
JVM调优总结：基本垃圾回收算法

zl程序教程

当前栏目

python爬虫之网易新闻网（简洁版）

相关文章