zl程序教程

您现在的位置是:首页 >  后端

当前栏目

爬虫之selenium爬取斗鱼网站

爬虫网站 selenium 爬取
2023-09-11 14:15:15 时间

爬虫之selenium爬取斗鱼网站

示例代码:

from selenium import webdriver
import time

class Douyu(object):
    """Selenium scraper for the Douyu "all rooms" directory page.

    Opens the directory in Chrome, reads the title and category text of every
    room card on the current page, prints each record, and clicks the
    pagination control to move on to the next page.

    The class attributes below are tunables: the XPath strings mirror the
    page markup the scraper was written against, and the ``*_wait`` values
    are fixed delays in seconds. Override them on a subclass or an instance.
    """

    # Locator strategy passed to find_element(s); same value as selenium's
    # ``By.XPATH``, spelled out so no extra import is needed.
    BY_XPATH = 'xpath'

    ROOM_XPATH = '//*[@id="listAll"]/section[2]/div[2]/ul/li/div/a'
    TITLE_XPATH = './div[2]/div[1]/h3'
    TYPE_XPATH = './div[2]/div[1]/span'
    POPUP_CLOSE_XPATH = '/html/body/div[2]/span[1]'
    NEXT_XPATH = '//*[@id="listAll"]/section[2]/div[2]/div/ul/li[9]/span'

    # Seconds to wait before reading the room list of a page.
    page_wait = 3
    # Seconds to wait after the initial load before closing the popup.
    popup_wait = 6

    def __init__(self):
        """Store the directory URL and start a Chrome WebDriver session."""
        self.url = 'https://www.douyu.com/directory/all'
        self.driver = webdriver.Chrome()

    def parse_data(self):
        """Collect the rooms shown on the page currently loaded in the driver.

        Waits ``page_wait`` seconds, prints how many room cards were found,
        and returns one dict per card.

        Returns:
            list[dict]: ``{'title': ..., 'type': ...}`` for every room card,
            in page order. Empty when no card matches ``ROOM_XPATH``.

        Raises:
            Exception: whatever the driver raises when a card lacks the
            title or category element (not caught here).
        """
        time.sleep(self.page_wait)
        room_list = self.driver.find_elements(self.BY_XPATH, self.ROOM_XPATH)
        print(len(room_list))
        return [
            {
                'title': room.find_element(self.BY_XPATH, self.TITLE_XPATH).text,
                'type': room.find_element(self.BY_XPATH, self.TYPE_XPATH).text,
            }
            for room in room_list
        ]

    def save_data(self, data_list):
        """Print every record in ``data_list``, one per line."""
        for data in data_list:
            print(data)

    def run(self, max_pages=None):
        """Scrape the directory page by page until no further page is found.

        Args:
            max_pages: optional upper bound on the number of pages to scrape;
                ``None`` (the default) means no bound.

        The loop ends when the page limit is reached, when the "next" control
        cannot be located or clicked, or when a page yields exactly the same
        records as the previous one.
        """
        self.driver.get(self.url)

        # The home page may show an ad popup; wait a few seconds for it to
        # appear, then close it. A missing popup is not an error.
        time.sleep(self.popup_wait)
        try:
            self.driver.find_element(self.BY_XPATH, self.POPUP_CLOSE_XPATH).click()
        except Exception as e:
            print(e)

        previous_page = None
        pages_done = 0
        while True:
            data_list = self.parse_data()

            # NOTE(review): presumably clicking "next" on the last page does
            # not raise, which would make this loop endless. An unchanged
            # page is therefore treated as the end -- confirm on the live site.
            if previous_page is not None and data_list == previous_page:
                break
            previous_page = data_list

            self.save_data(data_list)
            pages_done += 1
            if max_pages is not None and pages_done >= max_pages:
                break

            try:
                el_next = self.driver.find_element(self.BY_XPATH, self.NEXT_XPATH)
                # Scroll to the bottom so the pagination control is clickable.
                self.driver.execute_script('scrollTo(110,100000)')
                el_next.click()
            except Exception as e:
                # No usable "next" control: stop, but say why.
                print(e)
                break
if __name__ == '__main__':
    # Script entry point: scrape the directory, then always shut the browser
    # down so no Chrome/chromedriver process is left behind on error or exit.
    douyu = Douyu()
    try:
        douyu.run()
    finally:
        douyu.driver.quit()

运行效果:

思路用图: