requests多进程爬取数据
好长时间没有更新,今天更新一次!
因为工作原因,一直在使用 pyspider 框架,有半年没有用过 requests 了,知识点也忘记了很多。今天写了一个多进程爬取 APP 图片数据的脚本,供大家参考!
import re, random, time, json, requests, datetime, os
from pyquery import PyQuery as pq
from multiprocessing import Pool

# Crawler for the "玩咖" APP: workers page through four content channels,
# fetch each article's detail JSON, and POST one record per image to the
# collection endpoint below.

# Endpoint that receives the scraped records.
_POST_URL = 'http://shijue.qingapi.cn/task_python/start'

# Query strings shared by every API call.  Two device "identities" are used
# (a HUAWEI profile for the recommendation channel, a Meizu profile for the
# other three), reproducing the original hard-coded URLs byte for byte.
_QS_HUAWEI = ('cuid=FA2B688F603E1C48EE93CB8291D5A0D5&svr=2.0.0.5&vcode=20'
              '&ovr=8.0.0&device=HUAWEI_FRD-AL00&app_id=h9999j'
              '&channel_id=90001b&client_id=UKiUuZkYs%2BYRgWJPphEM7w%3D%3D')
_QS_MEIZU = ('cuid=E8F10259E26C5509EBCFDC46459812DB&svr=2.0.0.5&vcode=20'
             '&ovr=7.1.2&device=Meizu_M6+Note&app_id=h9999j'
             '&channel_id=90001a&client_id=%2BbVBZZb5aMV2hD%2FXTvehaQ%3D%3D')

# (channel name, detail API name, key holding the article id, query string)
# — one entry per near-identical copy-pasted loop in the original list_page1.
_CHANNELS = (
    ('recommendation', 'article', 'module_id', _QS_HUAWEI),
    ('mha',            'article', 'id',        _QS_MEIZU),
    ('gallery',        'gallery', 'id',        _QS_MEIZU),
    ('mixture',        'article', 'id',        _QS_MEIZU),
)


def get_date2(mtime):
    """Format the APP's ``mtime`` field as 'YYYY-MM-DD HH:MM:SS'.

    NOTE(review): the original script called get_date2() without ever
    defining it, which is a guaranteed NameError at runtime.  This
    implementation assumes ``mtime`` is a Unix timestamp in seconds —
    confirm against the actual API payload.
    """
    return datetime.datetime.fromtimestamp(int(mtime)).strftime(
        '%Y-%m-%d %H:%M:%S')


def detail_page(page_url):
    """Fetch one article detail page and POST a record for every image.

    Records that fail to POST are appended to ./wanka_error so they can be
    replayed later.  The original used a bare ``except:`` (which also hides
    KeyboardInterrupt/SystemExit) and misspelled the local variable as
    ``filemame``; both are fixed here.
    """
    res_dict = requests.get(page_url).json()
    info = res_dict['info']
    # image_list arrives as a JSON-encoded string, hence the extra loads().
    for image_url in json.loads(info['image_list']):
        record = {
            # second-resolution timestamp + 5 random digits -> pseudo-unique id
            "pid": str(int(time.time())) + str(random.randint(10000, 99999)),
            "task_id": 257609,
            "clue_id": 437389,
            "clue_name": '玩咖',
            "company_id": 230433,
            "url": page_url,
            "pic_url": image_url,
            "client_date": get_date2(info['mtime']),
            "url_article_title": info['title'],
            # plain text of every <p> tag in the article's HTML body
            "url_article": pq(info['content'])('p').text(),
            "is_cover": 0,
        }
        payload = json.dumps({'resource': record})
        try:
            requests.post(_POST_URL, data={"data": payload})
        except requests.RequestException:
            # Best-effort fallback: persist the record locally, space-separated.
            filename = os.path.join(os.path.abspath('.'), 'wanka_error')
            with open(filename, 'a') as f:
                f.write(payload)
                f.write(' ')
        print(record)


def list_page1(pid):
    """Crawl pages [pid, pid+100) of every channel and scrape each article.

    The original repeated four near-identical loops (recommendation, mha,
    gallery, mixture); they are collapsed into one data-driven loop over
    _CHANNELS that produces identical request URLs.  A failed or malformed
    list page is skipped instead of killing the whole worker process.
    """
    for channel, detail_api, id_key, qs in _CHANNELS:
        for page in range(pid, pid + 100):
            list_url = ('http://data.gm825.com/api/channel/{ch}?pn={pn}&{qs}'
                        .format(ch=channel, pn=page, qs=qs))
            try:
                items = requests.get(list_url).json()['list']
            except (requests.RequestException, ValueError, KeyError):
                continue  # unreachable page or unexpected payload — skip it
            for each in items:
                page_url = ('http://data.gm825.com/api/{api}/getbyid'
                            '?id={mid}&{qs}'
                            .format(api=detail_api, mid=each[id_key], qs=qs))
                try:
                    detail_page(page_url)
                except Exception:
                    pass  # best-effort per article, as in the original


if __name__ == '__main__':
    # 10 worker processes; each async task crawls a disjoint 100-page window.
    pool = Pool(10)
    for i in range(10, 70):
        pool.apply_async(list_page1, args=(i * 100,))
    pool.close()
    pool.join()
代码就这些,如果有不懂的朋友可以加我Q353061949,我会给你讲解哦!
相关文章
- PyTorch版YOLOv4更新了,适用于自定义数据集
- 没有计算机文凭,我在两个月内搞定4份Offer,且收入翻倍
- 记一次生产事故:30万单就这样没了!
- 为什么不应该在分页中使用offset和limit
- 面试官让我聊聊 ArrayList 解决了数组的哪些问题
- 类脑计算机:一种新型的计算系统
- Uber为什么放弃Postgres选择迁移到MySQL?
- 无需GPT-3!国外小哥徒手开发Text2Code,数据分析代码一键生成
- 什么时候以及为什么基于树的模型可以超过神经网络模型?
- 刚刚!史上超大规模半导体交易诞生:英伟达宣布 400 亿美元收购 Arm
- 计算机世界里的“堆栈”你真的懂吗?
- 量子计算前景一片大好?MIT新研究「泼冷水」| Nature
- 你上世纪写的代码现在还work吗?挑战者:我需要读磁带的机器
- 谷歌实现全球首个量子化学模拟,用量子「计算」出化学反应过程
- 五分钟搞懂布隆过滤器,亿级数据过滤算法值得拥有
- 备战解决方案架构师考试,你需要哪些知识和技能?
- 突破:芯片存储容量提高1000倍
- 10大高性能开发宝石,我要消灭一半程序员!
- 今天这个时代到底需要什么样的技术思维?
- 希望我这是最后一次谈SaaS