python 70行完成requests抓取csdn阅读量.
Python 完成 阅读 抓取 csdn requests 70
2023-06-13 09:16:51 时间
第三方库 requests, fake_useragent
import random, requests, re, logging, time
from fake_useragent import UserAgent
日志
# Root logging setup: INFO and above, rendered as "<time>[<LEVEL>]: <message>".
logging.basicConfig(
    format='%(asctime)s[%(levelname)s]: %(message)s',
    level=logging.INFO,
)
# Module-level logger for this script.
logger = logging.getLogger(__name__)
随机获取请求头User-Agent
def getHeader():
    """Build the HTTP headers sent with one request.

    Returns:
        dict: a fixed CSDN ``Referer`` plus a ``User-Agent`` value read from
        a new ``UserAgent().random`` on every call.
    """
    user_agent = UserAgent().random
    headers = {'Referer': 'https://blog.csdn.net'}
    headers['User-Agent'] = user_agent
    return headers
解析读取量
def parse_html(text: str):
    """Extract the read count from the HTML of an article page.

    Args:
        text: HTML source of the page.

    Returns:
        int: the number found inside ``<span class="read-count">...</span>``.

    Raises:
        AttributeError: if the page contains no read-count span holding at
            least one digit.
    """
    # "+" instead of "*": an empty span must be treated as "not found" rather
    # than reaching int('') and raising a ValueError.
    match = re.search(r'<span class="read-count">([0-9]+)</span>', text)
    if match is None:
        # Same exception type the original raised for a missing span.
        raise AttributeError("read-count span not found in page")
    return int(match.group(1))
请求网页
def requestUrlText(url, timeout=10):
    """Download an article page and return its read count.

    Args:
        url: address of the article page.
        timeout: seconds passed to ``requests.get`` as its ``timeout``; without
            one a stalled connection would block the caller indefinitely.

    Returns:
        int | None: the parsed read count, or ``None`` when the request fails,
        the status code is not 200, or the count cannot be parsed.
    """
    try:
        response = requests.get(url, headers=getHeader(), timeout=timeout)
        if response.status_code == 200:
            return parse_html(response.text)
    except (requests.RequestException, AttributeError, ValueError):
        # Network failure or unparsable page: report it as "no result" below.
        pass
    # Always None on failure. The previous False is an int subclass and would
    # pass an isinstance(result, int) check in callers.
    return None
封装为请求url类
class request_url():
    """Track the read count of a single article URL."""

    def __init__(self, url: str):
        """Remember ``url``; the stored read count starts at 0."""
        self.url = url
        self.read = 0  # last read count fetched successfully

    def succeeded(self):  # deprecated
        """Emit a DEBUG log line saying the URL was fetched successfully."""
        logging.debug(f"Request Url{self.url} successfully.")

    def _record(self, result):
        """Store a fetch result and return the change it represents.

        Args:
            result: value returned by ``requestUrlText``.

        Returns:
            int | None: ``result - previous count`` when ``result`` is a real
            integer count, otherwise ``None`` (stored count left untouched).
        """
        # bool is a subclass of int, so a False result (used to signal a
        # failed request) must be rejected explicitly; otherwise it would
        # reset the stored count to 0 and report a negative change.
        if isinstance(result, bool) or not isinstance(result, int):
            return None
        self.succeeded()
        delta = result - self.read
        self.read = result
        return delta

    def _update(self):
        """Fetch once; return the change in read count, or 0 if the fetch failed."""
        delta = self._record(requestUrlText(self.url))
        return 0 if delta is None else delta

    def update(self, max_attempts=5, retry_delay=1.0):
        """Refresh the read count, retrying only when the fetch fails.

        Args:
            max_attempts: maximum number of fetches to try.
            retry_delay: seconds to sleep between failed attempts.

        Returns:
            int: change in read count since the last successful fetch. This is
            0 when the count is unchanged or every attempt failed.
        """
        for attempt in range(1, max_attempts + 1):
            delta = self._record(requestUrlText(self.url))
            if delta is not None:
                # A successful fetch ends the loop even when nothing changed;
                # an unchanged count is not a failure.
                return delta
            if attempt < max_attempts:
                time.sleep(retry_delay)
        return 0

    def readnum(self) -> int:
        """Return the last read count fetched successfully."""
        return self.read
封装总调用类
class request_urls():
    """Aggregate read counts over a collection of article URLs."""

    def __init__(self, *args):
        """Create one tracker per URL given in ``args``."""
        self.all_add = 0  # running sum of the changes reported by update()
        self.__urls = []
        self.add(*args)

    def add(self, *urls):
        """Append a ``request_url`` tracker for each URL in ``urls``."""
        self.__urls.extend(request_url(url) for url in urls)

    def get(self) -> int:
        """Refresh every tracker and return the sum of their changes."""
        return sum(tracker.update() for tracker in self.__urls)

    def total(self) -> int:
        """Return the sum of the read counts currently stored by the trackers."""
        return sum(tracker.readnum() for tracker in self.__urls)

    def update(self) -> None:
        """Refresh all trackers, accumulate the change and log a summary line."""
        add = self.get()
        self.all_add = self.all_add + add
        logger.info(f"total {self.total()}({len(self.__urls)}blogs), ↑{add} [ALL {self.all_add}]")
运行与主程序
def run(urls):
    """Poll the given article URLs forever, logging a summary after each pass.

    Args:
        urls: iterable of article URLs, unpacked into ``request_urls``.

    This function loops without end and does not return.
    """
    tracker = request_urls(*urls)
    # Initial pass: stores the current counts without adding them to all_add.
    tracker.get()
    while True:
        tracker.update()
        pause = random.randint(60, 70)  # seconds to wait before the next pass
        time.sleep(pause)
if __name__ == '__main__':
    # Article pages handed to run() when the file is executed as a script.
    article_urls = [
        "https://blog.csdn.net/m0_60394896/article/details/124571653?spm=1001.2014.3001.5502",
        "https://blog.csdn.net/m0_60394896/article/details/124530993",
        "https://blog.csdn.net/m0_60394896/article/details/124529941?spm=1001.2014.3001.5502",
        "https://blog.csdn.net/m0_60394896/article/details/124519531",
        "https://blog.csdn.net/m0_60394896/article/details/124508776",
        "https://blog.csdn.net/m0_60394896/article/details/124361092",
        "https://blog.csdn.net/m0_60394896/article/details/124094245",
        "https://blog.csdn.net/m0_60394896/article/details/124034831",
        "https://blog.csdn.net/m0_60394896/article/details/124033445",
        "https://blog.csdn.net/m0_60394896/article/details/123981398",
        "https://blog.csdn.net/m0_60394896/article/details/123772011",
        "https://blog.csdn.net/m0_60394896/article/details/123583566",
        "https://blog.csdn.net/m0_60394896/article/details/122505828",
        "https://blog.csdn.net/m0_60394896/article/details/122371110",
    ]
    run(article_urls)
全部代码
import random, requests, re, logging, time
from fake_useragent import UserAgent
# Root logging setup: INFO and above, rendered as "<time>[<LEVEL>]: <message>".
logging.basicConfig(
    format='%(asctime)s[%(levelname)s]: %(message)s',
    level=logging.INFO,
)
# Module-level logger for this script.
logger = logging.getLogger(__name__)
def getHeader():
    """Build the HTTP headers sent with one request.

    Returns:
        dict: a fixed CSDN ``Referer`` plus a ``User-Agent`` value read from
        a new ``UserAgent().random`` on every call.
    """
    user_agent = UserAgent().random
    headers = {'Referer': 'https://blog.csdn.net'}
    headers['User-Agent'] = user_agent
    return headers
def parse_html(text: str):
    """Extract the read count from the HTML of an article page.

    Args:
        text: HTML source of the page.

    Returns:
        int: the number found inside ``<span class="read-count">...</span>``.

    Raises:
        AttributeError: if the page contains no read-count span holding at
            least one digit.
    """
    # "+" instead of "*": an empty span must be treated as "not found" rather
    # than reaching int('') and raising a ValueError.
    match = re.search(r'<span class="read-count">([0-9]+)</span>', text)
    if match is None:
        # Same exception type the original raised for a missing span.
        raise AttributeError("read-count span not found in page")
    return int(match.group(1))
def requestUrlText(url, timeout=10):
    """Download an article page and return its read count.

    Args:
        url: address of the article page.
        timeout: seconds passed to ``requests.get`` as its ``timeout``; without
            one a stalled connection would block the caller indefinitely.

    Returns:
        int | None: the parsed read count, or ``None`` when the request fails,
        the status code is not 200, or the count cannot be parsed.
    """
    try:
        response = requests.get(url, headers=getHeader(), timeout=timeout)
        if response.status_code == 200:
            return parse_html(response.text)
    except (requests.RequestException, AttributeError, ValueError):
        # Network failure or unparsable page: report it as "no result" below.
        pass
    # Always None on failure. The previous False is an int subclass and would
    # pass an isinstance(result, int) check in callers.
    return None
class request_url():
    """Track the read count of a single article URL."""

    def __init__(self, url: str):
        """Remember ``url``; the stored read count starts at 0."""
        self.url = url
        self.read = 0  # last read count fetched successfully

    def succeeded(self):  # deprecated
        """Emit a DEBUG log line saying the URL was fetched successfully."""
        logging.debug(f"Request Url{self.url} successfully.")

    def _record(self, result):
        """Store a fetch result and return the change it represents.

        Args:
            result: value returned by ``requestUrlText``.

        Returns:
            int | None: ``result - previous count`` when ``result`` is a real
            integer count, otherwise ``None`` (stored count left untouched).
        """
        # bool is a subclass of int, so a False result (used to signal a
        # failed request) must be rejected explicitly; otherwise it would
        # reset the stored count to 0 and report a negative change.
        if isinstance(result, bool) or not isinstance(result, int):
            return None
        self.succeeded()
        delta = result - self.read
        self.read = result
        return delta

    def _update(self):
        """Fetch once; return the change in read count, or 0 if the fetch failed."""
        delta = self._record(requestUrlText(self.url))
        return 0 if delta is None else delta

    def update(self, max_attempts=5, retry_delay=1.0):
        """Refresh the read count, retrying only when the fetch fails.

        Args:
            max_attempts: maximum number of fetches to try.
            retry_delay: seconds to sleep between failed attempts.

        Returns:
            int: change in read count since the last successful fetch. This is
            0 when the count is unchanged or every attempt failed.
        """
        for attempt in range(1, max_attempts + 1):
            delta = self._record(requestUrlText(self.url))
            if delta is not None:
                # A successful fetch ends the loop even when nothing changed;
                # an unchanged count is not a failure.
                return delta
            if attempt < max_attempts:
                time.sleep(retry_delay)
        return 0

    def readnum(self) -> int:
        """Return the last read count fetched successfully."""
        return self.read
class request_urls():
    """Aggregate read counts over a collection of article URLs."""

    def __init__(self, *args):
        """Create one tracker per URL given in ``args``."""
        self.all_add = 0  # running sum of the changes reported by update()
        self.__urls = []
        self.add(*args)

    def add(self, *urls):
        """Append a ``request_url`` tracker for each URL in ``urls``."""
        self.__urls.extend(request_url(url) for url in urls)

    def get(self) -> int:
        """Refresh every tracker and return the sum of their changes."""
        return sum(tracker.update() for tracker in self.__urls)

    def total(self) -> int:
        """Return the sum of the read counts currently stored by the trackers."""
        return sum(tracker.readnum() for tracker in self.__urls)

    def update(self) -> None:
        """Refresh all trackers, accumulate the change and log a summary line."""
        add = self.get()
        self.all_add = self.all_add + add
        logger.info(f"total {self.total()}({len(self.__urls)}blogs), ↑{add} [ALL {self.all_add}]")
def run(urls):
    """Poll the given article URLs forever, logging a summary after each pass.

    Args:
        urls: iterable of article URLs, unpacked into ``request_urls``.

    This function loops without end and does not return.
    """
    tracker = request_urls(*urls)
    # Initial pass: stores the current counts without adding them to all_add.
    tracker.get()
    while True:
        tracker.update()
        two_hours = 60 * 60 * 2  # seconds; one pass every two hours
        time.sleep(two_hours)
if __name__ == '__main__':
    # Article pages handed to run() when the file is executed as a script.
    article_urls = [
        "https://blog.csdn.net/m0_60394896/article/details/124571653?spm=1001.2014.3001.5502",
        "https://blog.csdn.net/m0_60394896/article/details/124530993",
        "https://blog.csdn.net/m0_60394896/article/details/124529941?spm=1001.2014.3001.5502",
        "https://blog.csdn.net/m0_60394896/article/details/124519531",
        "https://blog.csdn.net/m0_60394896/article/details/124508776",
        "https://blog.csdn.net/m0_60394896/article/details/124361092",
        "https://blog.csdn.net/m0_60394896/article/details/124094245",
        "https://blog.csdn.net/m0_60394896/article/details/124034831",
        "https://blog.csdn.net/m0_60394896/article/details/124033445",
        "https://blog.csdn.net/m0_60394896/article/details/123981398",
        "https://blog.csdn.net/m0_60394896/article/details/123772011",
        "https://blog.csdn.net/m0_60394896/article/details/123583566",
        "https://blog.csdn.net/m0_60394896/article/details/122505828",
        "https://blog.csdn.net/m0_60394896/article/details/122371110",
    ]
    run(article_urls)
什么?刷取 CSDN 阅读量?那是不可能的。CSDN 主要靠以下几种方式判断你是否在用 requests 之类的脚本发起请求:
项目 | Value |
---|---|
1 | 时间戳 time.time() 时间计算是一个重要方式:如果你在过短的时间内请求多次,则只视为增加一次阅读量 |
2 | 请求头 header 这是大多数网站返回结果考虑的因素 |
3 | Referer 你从哪里来? |
4 | Cookie + IP 单个 IP 或同一个 cookie 文件请求次数过多,也会减少阅读量;这一步其实可以用代理 IP,但效果不怎么样 |
5 | 用户信息 如(4), 单个用户只能点赞收藏一次 |
阅读量并不能代表什么,点赞等综合指标上去才行,所以有时间还是要想想怎么发布一些优质的博文。
相关文章
- Python计算中位数_用频率直方图求中位数
- pycharm选择运行环境_python编程入门
- python jieba库_Python jieba库的使用说明「建议收藏」
- Python 第三方模块 科学计算 SciPy模块1 简介,常数,IO「建议收藏」
- Python中strip()函数
- python chr()和ord()_Python函数ord
- python中全局变量和局部变量详解
- Python 技巧篇-pip卸载python库实例演示,查看pip命令大全方法[通俗易懂]
- 【说站】python字典的底层原理
- Python修改文件后缀_python重命名文件名
- h5 Python_python做h5网站
- lambda表达式pythonlist_python lambda表达式简单用法【转】「建议收藏」
- python判断文件后缀_Python 判断文件后缀是否被篡改
- Python学生信息管理系统课程设计报告_python做的项目管理系统
- 上手Python之列表
- python降低scikit-learn版本
- 有没有一个在线工具可以将Python代码转换为Java代码?
- python-Python与MySQL数据库-处理MySQL查询结果
- 三种Python下载url并保存文件的代码详解编程语言
- Python常见报错问题(不定时更新)详解编程语言
- Linux中运行Python文件的步骤(linux运行python文件)
- python学习Linux、Python,体验自由的乐趣(lexlinux)
- 一步步学习:利用Python连接MySQL数据库(python连接mysql数据库)
- 跟老齐学Python之通过Python连接数据库
- Python中os.path用法分析