Python Crawler Basics: Study Cases


These are study cases; comments and corrections are welcome.

The following cases are for personal study only; no infringement is intended.
  • Interpreter version: Python 3.8

Scrape the Sogou search results page for a given keyword (a simple web collector). The script below fetches the Sogou homepage; a keyword-based sketch follows it:

#!/usr/bin/env python
# encoding: utf-8

"""
@file: 获取搜狗首页.py
@time: 2020/2/17 9:14
"""
import requests

if __name__ == '__main__':
    # Step 1: specify the URL
    url = "https://www.sogou.com/"
    # Step 2: send the request; requests.get returns a Response object
    response = requests.get(url=url)
    # Step 3: read the response body as text
    page_text = response.text
    print(page_text)
    # Step 4: persist to disk
    with open('./sogou.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print('Scraping finished!!!')
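
The script above only grabs the Sogou homepage. As a sketch of the keyword collector the heading describes, the variant below passes the search term through the query parameter of Sogou's /web endpoint and spoofs the User-Agent; the endpoint and parameter name are taken from how Sogou search URLs appear in a browser, so treat them as assumptions rather than a documented API.

import requests

if __name__ == '__main__':
    # Sogou's search endpoint and its 'query' parameter, as observed
    # in browser address bars (an assumption, not a documented API).
    url = 'https://www.sogou.com/web'
    kw = input('enter a keyword:')
    params = {'query': kw}
    # UA spoofing: without a browser-like User-Agent, the site may
    # serve a verification page instead of the results.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.106 Safari/537.36'
    }
    response = requests.get(url=url, params=params, headers=headers)
    # Save the results page under the keyword's name.
    file_name = kw + '.html'
    with open(file_name, 'w', encoding='utf-8') as fp:
        fp.write(response.text)
    print(file_name, 'saved!')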

"Cracking" Baidu Translate:

#!/usr/bin/env python
# encoding: utf-8

"""
@file: 破解百度翻译.py
@time: 2020/2/17 12:51
"""
import json

import requests


def post_bai_du():
    # 1. Specify the URL
    post_url = 'https://fanyi.baidu.com/sug'
    # 2. Spoof the User-Agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.106 Safari/537.36 Edg/80.0.361.54'
    }
    # 3. Handle the POST parameters (same as for a GET request)
    word = input('enter a word:')
    data = {
        'kw': word
    }
    # 4. Send the request
    response = requests.post(url=post_url, data=data, headers=headers)
    # 5. Read the response; .json() returns a Python object
    #    (only call it when the response is confirmed to be JSON)
    dic_obj = response.json()
    print(dic_obj)
    # Persist to disk (not needed for on-the-fly translation)
    # file_name = word + '.json'
    # with open(file_name, 'w', encoding='utf-8') as fp:
    #     json.dump(dic_obj, fp=fp, ensure_ascii=False)
    print('over!')


if __name__ == '__main__':
    post_bai_du()
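
The sug endpoint returns word suggestions rather than a full translation. Assuming the response still has its historical shape, {'errno': 0, 'data': [{'k': term, 'v': translations}, ...]} (worth re-checking in the browser's network panel), the pairs can be unpacked like this:

import requests


def print_suggestions(word):
    # Same endpoint and UA spoofing as in post_bai_du() above.
    url = 'https://fanyi.baidu.com/sug'
    headers = {'User-Agent': 'Mozilla/5.0'}
    dic_obj = requests.post(url, data={'kw': word}, headers=headers).json()
    # 'data', 'k' and 'v' are assumed field names, based on older
    # captures of this endpoint.
    for item in dic_obj.get('data', []):
        print(item['k'], '->', item['v'])


if __name__ == '__main__':
    print_suggestions('dog')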

Scrape movie detail data from the Douban movie category charts at https://movie.douban.com/:

#!/usr/bin/env python
# encoding: utf-8

"""
@file: 爬取豆瓣电影分类排行榜.py
@time: 2020/2/17 13:21
"""
import requests
import json


def get_dou_ban():
    url = 'https://movie.douban.com/j/chart/top_list'
    param = {
        'type': '24',
        'interval_id': '100:90',
        'action': '',
        'start': '0',  # index of the first movie to return
        'limit': '20'  # how many movies to return
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.106 Safari/537.36 Edg/80.0.361.54'
    }
    response = requests.get(url=url, params=param, headers=headers)
    list_data = response.json()
    # Persist to disk (optional)
    # with open('./douban.json', 'w', encoding='utf-8') as fp:
    #     json.dump(list_data, fp=fp, ensure_ascii=False)
    print(list_data)
    print('over!')


if __name__ == '__main__':
    get_dou_ban()
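
The endpoint returns a JSON array with one object per movie. The field names used below ('title' and 'score') match what this API has returned historically, but they are assumptions and should be verified against a live response:

import requests


def print_top_movies():
    # Same chart endpoint as get_dou_ban(), printing one line per movie.
    url = 'https://movie.douban.com/j/chart/top_list'
    param = {'type': '24', 'interval_id': '100:90', 'action': '',
             'start': '0', 'limit': '20'}
    headers = {'User-Agent': 'Mozilla/5.0'}
    movies = requests.get(url, params=param, headers=headers).json()
    # 'title' and 'score' are assumed field names, observed in past
    # responses of this interface.
    for movie in movies:
        print(movie['title'], movie['score'])


if __name__ == '__main__':
    print_top_movies()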

Scrape restaurant data for a specified location from the KFC store locator at http://www.kfc.com.cn/kfccda/storelist/index.aspx:

#!/usr/bin/env python
# encoding: utf-8

"""
@file: 根据城市查询肯德基餐厅信息.py
@time: 2020/2/17 20:56
"""

import requests


def post_KFC():
    url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname"
    word = input('Enter a city:')
    data = {
        'cname': word,
        'pid': '',
        'keyword': '',
        'pageIndex': '1',  # page number of results
        'pageSize': '10'   # stores per page
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.106 Safari/537.36 Edg/80.0.361.56'
    }
    response = requests.post(url=url, data=data, headers=headers)
    list_data = response.json()
    print(list_data)
    print('over!')


if __name__ == '__main__':
    post_KFC()
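
A single request returns at most pageSize stores. As a sketch, the loop below keeps increasing pageIndex until a page comes back empty and collects every row; the 'Table1', 'storeName' and 'addressDetail' names are assumptions based on older captures of this interface, so re-check them against a live response.

import requests


def fetch_all_stores(city):
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
    headers = {'User-Agent': 'Mozilla/5.0'}
    page = 1
    stores = []
    while True:
        data = {'cname': city, 'pid': '', 'keyword': '',
                'pageIndex': str(page), 'pageSize': '10'}
        result = requests.post(url, data=data, headers=headers).json()
        # 'Table1' is assumed to hold the store rows, based on older
        # responses of this endpoint.
        rows = result.get('Table1', [])
        if not rows:
            break
        stores.extend(rows)
        page += 1
    return stores


if __name__ == '__main__':
    for store in fetch_all_stores('北京'):
        print(store.get('storeName'), store.get('addressDetail'))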

Scrape cosmetics production license data published by the National Medical Products Administration at http://125.35.6.84:81/xk/:

#!/usr/bin/env python
# encoding: utf-8

"""
@file: 国家药监局化妆品许可证.py
@time: 2020/2/17 21:58
"""

import requests


def post_GY():
    url = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList"
    word = int(input('Number of pages:'))
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.106 Safari/537.36 Edg/80.0.361.56'
    }
    # Send one request per page of results.
    for page in range(1, word + 1):
        data = {
            'on': 'true',
            'page': page,
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': ''
        }
        response = requests.post(url=url, data=data, headers=headers)
        list_data = response.json()
        print(list_data)
    print('over!')


if __name__ == '__main__':
    post_GY()
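
To keep the scraped records instead of just printing each page, a small variation collects every page's entries into one list and dumps it as JSON. The 'list' key holding the license records is an assumption based on older responses of this interface:

import json

import requests


def save_all_pages(pages):
    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    headers = {'User-Agent': 'Mozilla/5.0'}
    all_items = []
    for page in range(1, pages + 1):
        data = {'on': 'true', 'page': str(page), 'pageSize': '15',
                'productName': '', 'conditionType': '1',
                'applyname': '', 'applysn': ''}
        result = requests.post(url, data=data, headers=headers).json()
        # 'list' is the assumed key for the license records, based on
        # older responses of this endpoint.
        all_items.extend(result.get('list', []))
    with open('./licenses.json', 'w', encoding='utf-8') as fp:
        json.dump(all_items, fp=fp, ensure_ascii=False)
    print(len(all_items), 'records saved to ./licenses.json')


if __name__ == '__main__':
    save_all_pages(3)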

I hope these study cases give me a deeper feel for how crawlers work, and help me master scraping techniques faster and better.

  • Keep it up!!!