操作excel文件爬取nvd.nist数据
时间:2023-09-27 14:22:25
#!/usr/bin/env python
# encoding: utf-8
# @author: jack
"""Read CVE IDs from cve.xlsx, scrape CVSS data from nvd.nist.gov,
and write the results into a new workbook (test1.xlsx).

openpyxl is used for reading because the older xlrd library has poor
compatibility with newer .xlsx files; pandas handles the output.
"""
import random
from time import sleep
import pandas as pd
from openpyxl import load_workbook
from urllib import request
from lxml import etree

# NVD detail pages are static HTML, so plain urllib + XPath is enough.
START_URL = 'https://nvd.nist.gov/vuln/detail/'


def read_cve_codes(path='cve.xlsx', sheet_name='Sheet1', first_row=1, last_row=25):
    """Return the CVE IDs stored in column D of the given sheet.

    Defaults reproduce the original hard-coded slice ``sheet["D"][1:25]``
    (skip the header cell, read up to 24 data rows).
    """
    wb = load_workbook(path)
    sheet = wb[sheet_name]
    return [cell.value for cell in sheet['D'][first_row:last_row]]


def scrape(cve_list):
    """Fetch each CVE detail page; return (v3 scores, v3 vectors, v2 vectors)."""
    score_li, vector3_li, vector2_li = [], [], []
    for url_code in cve_list:
        url = '{}{}'.format(START_URL, url_code)
        # Close the response explicitly so we don't leak sockets.
        with request.urlopen(url) as response:
            result = response.read().decode()
        html = etree.HTML(result)
        # The data-testid attributes make XPath targeting straightforward.
        v3_score = html.xpath('//span[@data-testid="vuln-cvssv3-base-score"]/text()')
        v3_vector = html.xpath('//span[@data-testid="vuln-cvssv3-vector"]/text()')
        v2_vector = html.xpath('//span[@data-testid="vuln-cvssv2-vector"]/text()')
        score_li.append(' '.join(v3_score))
        vector3_li.append(' '.join(v3_vector))
        vector2_li.append(' '.join(v2_vector))
        # The site is easy to overload (see the author's notes), so pace
        # requests; the original imported random/sleep but never used them.
        sleep(random.uniform(0.2, 0.5))
    return score_li, vector3_li, vector2_li


def write_results(score_li, vector3_li, vector2_li, path='test1.xlsx'):
    """Write the scraped columns side by side into a fresh workbook."""
    df1 = pd.DataFrame({'v3BaseScore': score_li})
    df2 = pd.DataFrame({'Vector3': vector3_li})
    df3 = pd.DataFrame({'Vector2': vector2_li})
    # The context manager saves and closes the writer on exit;
    # ExcelWriter.save() was deprecated and removed in pandas 2.0.
    with pd.ExcelWriter(path) as writer:
        df1.to_excel(writer, sheet_name='Sheet1', startcol=1, index=False)
        df2.to_excel(writer, sheet_name='Sheet1', startcol=2, index=False)
        df3.to_excel(writer, sheet_name='Sheet1', startcol=3, index=False)


if __name__ == '__main__':
    write_results(*scrape(read_cve_codes()))
业务需求:现学pandas和openpyxl,要点如下:
1、页面分析
2、定位分析
3、数据读写分析
4、网站容易爬挂,并发调低,user-agent代理可以搞起
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
import base64
import random
import time
from multiprocessing.pool import ThreadPool
from urllib import request

import pandas as pd
from lxml import etree
from openpyxl import load_workbook

# NOTE(review): wildcard import from a project module; nothing from it is
# referenced below — kept for safety, but consider removing or making explicit.
from proxies import *


def task1():
    """Scrape CVSS v3/v2 data for every CVE listed in cve.xlsx column D
    and write it to basescore.xlsx (Sheet2, columns C..G)."""
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    # Pick one random UA per run. The original built this dict, then
    # overwrote it via header.update() with a malformed key ('User-Agent:'
    # with a trailing colon) and a Host value with a leading space —
    # urllib sets Host itself, so we only send the User-Agent.
    header = {'User-Agent': random.choice(user_agent_list)}

    # Read the CVE IDs from column D (skipping the header row).
    wb = load_workbook('cve.xlsx')
    sheet = wb['Sheet1']
    cve_list = []
    for i in sheet["D"][1:]:
        cve_list.append(i.value)

    start_url = 'https://nvd.nist.gov/vuln/detail/'
    score_li = []
    vector3_li = []
    vector3_href_li = []
    vector2_li = []
    vector2_href_li = []
    for url_code in cve_list:
        # Throttle every request (the original slept once before the loop,
        # which does nothing to reduce load on the site).
        time.sleep(0.3)
        url = '{}{}'.format(start_url, url_code)
        req = request.Request(url, headers=header)
        # Close the response explicitly so we don't leak sockets.
        with request.urlopen(req) as response:
            result = response.read().decode()
        # Data extraction: NVD pages tag values with data-testid attributes.
        html = etree.HTML(result)
        score_list = html.xpath('//span[@data-testid="vuln-cvssv3-base-score"]/text()')
        vector3_list = html.xpath('//span[@data-testid="vuln-cvssv3-vector"]/text()')
        vector2_list = html.xpath('//span[@data-testid="vuln-cvssv2-vector"]/text()')
        vector3_href_list = html.xpath('//span[@data-testid="vuln-cvssv3-vector"]//a/@href')
        vector2_href_list = html.xpath('//span[@data-testid="vuln-cvssv2-vector"]//a/@href')
        score_li.append(' '.join(score_list))
        vector3_li.append(' '.join(vector3_list))
        vector3_href_li.append(' '.join(vector3_href_list))
        vector2_li.append(' '.join(vector2_list))
        vector2_href_li.append(' '.join(vector2_href_list))

    # Build one single-column DataFrame per extracted field.
    df1 = pd.DataFrame({'CVSSv3.0BaseScore': score_li})
    df2 = pd.DataFrame({'CVSS v3.0 Vector': vector3_li})
    df3 = pd.DataFrame({'CVSS v3.0 Vector link': vector3_href_li})
    df4 = pd.DataFrame({'CVSS v2.0 Vector': vector2_li})
    df5 = pd.DataFrame({'CVSS v2.0 Vector link': vector2_href_li})
    # The context manager saves and closes the file on exit;
    # ExcelWriter.save() was deprecated and removed in pandas 2.0.
    with pd.ExcelWriter('basescore.xlsx') as writer:
        df1.to_excel(writer, sheet_name='Sheet2', startcol=2, index=False)
        df2.to_excel(writer, sheet_name='Sheet2', startcol=3, index=False)
        df3.to_excel(writer, sheet_name='Sheet2', startcol=4, index=False)
        df4.to_excel(writer, sheet_name='Sheet2', startcol=5, index=False)
        df5.to_excel(writer, sheet_name='Sheet2', startcol=6, index=False)


def process_request(self, request, spider):
    """Scrapy downloader-middleware hook: attach a random HTTP proxy to
    the outgoing request, with Proxy-Authorization when credentials exist.

    NOTE: `request` here is the Scrapy request object (has .meta/.headers),
    not the urllib.request module imported above.
    """
    PROXIES = [
        {'ip_port': '61.160.233.8', 'user_pass': ''},
        {'ip_port': '125.93.149.186', 'user_pass': ''},
        {'ip_port': '58.38.86.181', 'user_pass': ''},
        {'ip_port': '119.142.86.110', 'user_pass': ''},
        {'ip_port': '124.161.16.89', 'user_pass': ''},
        {'ip_port': '61.160.233.8', 'user_pass': ''},
        {'ip_port': '101.94.131.237', 'user_pass': ''},
        {'ip_port': '219.157.162.97', 'user_pass': ''},
        {'ip_port': '61.152.89.18', 'user_pass': ''},
        {'ip_port': '139.224.132.192', 'user_pass': ''},
    ]
    proxy = random.choice(PROXIES)
    request.meta['proxy'] = "http://%s" % proxy['ip_port']
    # Only send credentials when they actually exist: the original tested
    # `is not None`, which is always true for '' and therefore sent a
    # bogus "Proxy-Authorization: Basic <b64 of empty string>" header.
    if proxy['user_pass']:
        # b64encode returns bytes; decode before concatenating with str.
        encodebytes = base64.b64encode(proxy['user_pass'].encode(encoding='utf-8'))
        request.headers['Proxy-Authorization'] = 'Basic ' + str(encodebytes, 'utf-8')


if __name__ == '__main__':
    # process_request is a middleware hook that needs (self, request, spider);
    # the original apply_async(process_request) with no arguments raised
    # TypeError inside the pool, so only the scraper task is scheduled here.
    pool = ThreadPool()
    pool.apply_async(task1)
    pool.close()
    pool.join()
相关文章
- python使用openpyxl库按 行/列 合并/拆分 Excel表格
- C# Excel文件导入操作
- Qt监控excel
- 使用EasyPoi导入Excel直接读流(不保存到本地文件)
- JavaScript - 本地上传 Excel 文件页面表格预览(不可编辑)
- 【Excel】获取一列的有效行数
- 《从Excel到R 数据分析进阶指南》一2.8 查看前10行数据
- 《机器学习与数据科学(基于R的统计学习方法)》——2.6 读取Excel文件
- Java poi Excel导出文件,Java poi 分批次导出大批量数据
- 纳德拉:Excel是微软的最佳象征 难以想象没有它的世界
- Python Excel自动化之 Openpyx如何Python程序读取和修改 Excel电子表格文件
- Excel VLOOKUP实用教程之 06 vlookup如何从使用通配符进行部分查找?(教程含数据excel)
- Excel 函数大全之 INTERCEPT function 获取线性回归线的截距
- 《Tableau数据可视化实战》——1.3节连接Excel文件
- 使用XSSFWork创建的xlsx后缀Excel文件无法打开
- clientdataset 读取excel 如果excel 文件不存在的时候 相应的gird 会不显示数据, 鼠标掠过 gird 格子 才会显示数据。 这是一个bug 哈哈
- PowerDesigner 表格导出为excel
- win10安装Offic2016以后,Word文件、Excel文件、PPT文件图标显示不正常解决方法
- Pandas之read_excel()和to_excel()函数解析
- NPOI 生成Excel (单元格合并、设置单元格样式:字段,颜色、设置单元格为下拉框并限制输入值、设置单元格只能输入数字等)
- 【文件处理】——Python pandas 写入数据到excel中
- [Excel知识技能] 将“假“日期转为“真“日期格式