爬取斗鱼英雄联盟主播+人气值 Weask-Blog

学习 Python，当然不能忘记 Python 的最优用处——爬虫。本次练习通过爬取斗鱼英雄联盟版块，获取该版块下所有主播的名称和人气值，并根据人气值进行排行，最后打印出来。

from urllib import request
import re,time

class Spider():
    """Crawl the Douyu League of Legends listing and rank streamers by popularity.

    Listing pages are requested one after another (``url`` + page number).
    Crawling stops when a page starts with the very first streamer again or
    when a page yields no streamer entries at all.
    """

    # Listing URL; the page number is appended to it.
    url = 'https://www.douyu.com/g_LOL/?page='

    # Browser-like request headers.
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

    # Seconds to wait for a page before giving up instead of hanging forever.
    timeout = 10

    # Main regular expressions. Raw strings keep "\s"/"\S" from being read as
    # (invalid) string escapes; the pattern text itself is unchanged.
    # NOTE(review): finaly_pattern is not referenced by any method of this
    # class; presumably it marks the disabled "next page" button - confirm.
    finaly_pattern = 'shark-pager-next shark-pager-disable shark-pager-disable-next'
    douyu_pattern = r'<span class="dy-name ellipsis fl">([\s\S]*?)</p>'
    name_pattern = r'([\s\S]*?)</span>'
    hot_pattern = r'<span class="dy-num fr"  >([\s\S]*?)</span>'

    # Sum of the numeric popularity values computed by the most recent sort.
    all_hot = 0

    def for_page(self):
        """Fetch the listing pages and collect every streamer entry.

        Returns:
            A list of ``{'name': ..., 'hot': ...}`` dicts in the order they
            were encountered; both values are the raw strings captured from
            the HTML.

        Raises:
            urllib.error.URLError: if a page cannot be downloaded.
        """
        names_hot = []
        page = 1
        # Load listing pages in a loop.
        while True:
            entries = self.__parse_page(self.__fetch_page(page))
            added = 0
            # A page without any entry means there is nothing left to read;
            # without this check such a page would make the loop spin forever.
            finished = not entries
            for entry in entries:
                # Seeing the first streamer again means the listing wrapped
                # around. Compare for equality: a substring test would also
                # fire for names that merely occur inside the first name.
                if names_hot and entry['name'] == names_hot[0]['name']:
                    finished = True
                    break
                names_hot.append(entry)
                added += 1
            print("第%s 页加载完成,共%s 位主播，正在加载下一页！"%(page,added))

            if finished:
                print("所有页面加载完成！")
                time.sleep(3)
                break
            page += 1
            # Short pause between requests.
            time.sleep(0.5)
        return names_hot

    def __fetch_page(self, page):
        """Download listing page number ``page`` and return its decoded HTML."""
        req = request.Request(url='%s%s' % (self.url, page), headers=self.headers)
        # The context manager closes the connection even if read/decode fails.
        with request.urlopen(req, timeout=self.timeout) as resp:
            return resp.read().decode()

    def __parse_page(self, htmls):
        """Extract ``{'name', 'hot'}`` dicts from one page of HTML.

        Fragments in which the name or the popularity cannot be found are
        skipped instead of raising ``IndexError``.
        """
        entries = []
        for html in re.findall(self.douyu_pattern, htmls):
            name = re.search(self.name_pattern, html)
            hot = re.search(self.hot_pattern, html)
            if name is None or hot is None:
                continue
            entries.append({'name': name.group(1), 'hot': hot.group(1)})
        return entries

    def __sort(self, names_hot):
        """Return ``names_hot`` sorted by numeric popularity, highest first.

        Also stores the total popularity of all entries in ``Spider.all_hot``.
        The total is recomputed from scratch on every call, so sorting twice
        does not double it.
        """
        names_hot = list(names_hot)
        Spider.all_hot = sum(self.__sort_seed(entry) for entry in names_hot)
        return sorted(names_hot, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, name_hot):
        """Convert an entry's popularity text into a float.

        The first (optionally decimal) number in ``name_hot['hot']`` is used
        and multiplied by 10000 when the text contains "万". Text without
        any number counts as 0.0.
        """
        found = re.search(r'\d+(?:\.\d+)?', name_hot['hot'])
        if found is None:
            return 0.0
        num_hot = float(found.group())
        if '万' in name_hot['hot']:
            num_hot *= 10000
        return num_hot

    def __show(self, names_hot):
        """Print the ranked entries followed by the streamer count and total."""
        n = 0
        for n, name_hot in enumerate(names_hot, 1):
            print('Num:%s 主播：%s  人气值：%s'%(n,name_hot['name'],name_hot['hot']))
        print('主播人数：%s   版块人气值：%s'%(n,Spider.all_hot))

    def go(self):
        """Run the whole pipeline: crawl, sort by popularity, print."""
        names_hot = self.for_page()
        names_hot = self.__sort(names_hot)
        self.__show(names_hot)

# Start the crawl only when this file is run as a script, so that importing
# the module does not trigger network requests as a side effect.
if __name__ == "__main__":
    spider = Spider()
    spider.go()

您必须 登录 才能发表评论！

您必须登录才能发表评论！