学习 Python,当然不能忘记 Python 的最优用处——爬虫。本次练习通过爬取斗鱼英雄联盟版块,获取该版块下所有主播的名称和人气值,并根据人气值进行排行,打印出来。
from urllib import request
import re
import time


class Spider():
    """Crawl the Douyu League of Legends section and rank streamers by popularity.

    The listing is fetched page by page, each streamer's name and popularity
    text are extracted with regular expressions, and the result is printed
    sorted by popularity in descending order.
    """

    # Base listing URL; the 1-based page number is appended to it.
    url = 'https://www.douyu.com/g_LOL/?page='
    # Browser-like request headers.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    # Main regular expressions.
    # NOTE(review): finaly_pattern is not referenced by any code in this class.
    finaly_pattern = 'shark-pager-next shark-pager-disable shark-pager-disable-next'
    # One streamer entry: everything from the name span up to the closing </p>.
    douyu_pattern = r'<span class="dy-name ellipsis fl">([\s\S]*?)</p>'
    # Applied to an entry: text up to the first </span> is the streamer name.
    name_pattern = r'([\s\S]*?)</span>'
    # Applied to an entry: content of the popularity span.
    hot_pattern = r'<span class="dy-num fr" >([\s\S]*?)</span>'
    # Total popularity of the most recently sorted list.
    all_hot = 0
    # Seconds to wait between two page requests.
    page_delay = 0.5
    # Seconds to wait after the last page before returning.
    finish_delay = 3
    # Seconds before a page request is abandoned.
    timeout = 10

    def _fetch(self, url):
        """Download *url* with the configured headers and return the decoded HTML."""
        req = request.Request(url=url, headers=self.headers)
        # The context manager closes the connection even if reading fails.
        with request.urlopen(req, timeout=self.timeout) as resp:
            charset = resp.headers.get_content_charset() or 'utf-8'
            return resp.read().decode(charset)

    def _parse(self, htmls):
        """Return the ``{'name': ..., 'hot': ...}`` entries found in one page of HTML.

        Entries in which either the name or the popularity cannot be located
        are skipped instead of aborting the whole crawl.
        """
        entries = []
        for chunk in re.findall(self.douyu_pattern, htmls):
            name = re.search(self.name_pattern, chunk)
            hot = re.search(self.hot_pattern, chunk)
            if name is None or hot is None:
                continue
            entries.append({'name': name.group(1), 'hot': hot.group(1)})
        return entries

    def for_page(self):
        """Load listing pages until they run out and return all streamer entries.

        Crawling stops when a page starts repeating the very first streamer
        collected (the stop condition the original code relied on: the listing
        wrapping around past the last page) or when a page contains no entries
        at all.

        Returns:
            A list of ``{'name': str, 'hot': str}`` dicts in listing order.
        """
        names_hot = []
        page = 1
        # Load the listing page by page.
        while True:
            entries = self._parse(self._fetch(self.url + '%s' % page))
            added = 0
            # An empty page means there is nothing left to read; without this
            # guard the loop would never end if the markup stopped matching.
            finished = not entries
            for entry in entries:
                # Exact comparison: a substring test would stop early on a
                # streamer whose name is contained in the first one's name.
                if names_hot and entry['name'] == names_hot[0]['name']:
                    finished = True
                    break
                names_hot.append(entry)
                added += 1
            print("第%s 页加载完成,共%s 位主播,正在加载下一页!" % (page, added))
            page += 1
            if finished:
                print("所有页面加载完成!")
                time.sleep(self.finish_delay)
                break
            time.sleep(self.page_delay)
        return names_hot

    def __sort(self, names_hot):
        """Return *names_hot* sorted by popularity, highest first.

        Also stores the summed popularity in ``Spider.all_hot``. The total is
        assigned, not accumulated, so repeated calls do not inflate it.
        """
        keyed = [(self.__sort_seed(entry), entry) for entry in names_hot]
        Spider.all_hot = sum(value for value, _ in keyed)
        # list.sort is stable, so ties keep their listing order.
        keyed.sort(key=lambda pair: pair[0], reverse=True)
        return [entry for _, entry in keyed]

    def __sort_seed(self, name_hot):
        """Convert an entry's popularity text to a number.

        A "万" suffix multiplies the value by 10000 and a decimal part is
        honoured, so "12.5万" gives 125000.0. Text without any digits counts
        as 0.0.
        """
        text = name_hot['hot']
        match = re.search(r'\d+(?:\.\d+)?', text)
        if match is None:
            return 0.0
        value = float(match.group())
        if '万' in text:
            value *= 10000
        return value

    def __show(self, names_hot):
        """Print the ranked streamers followed by the section totals."""
        for rank, name_hot in enumerate(names_hot, start=1):
            print('Num:%s 主播:%s 人气值:%s' % (rank, name_hot['name'], name_hot['hot']))
        print('主播人数:%s 版块人气值:%s' % (len(names_hot), Spider.all_hot))

    def go(self):
        """Run the whole pipeline: crawl, sort, print."""
        names_hot = self.for_page()
        names_hot = self.__sort(names_hot)
        self.__show(names_hot)


# Only crawl when executed as a script, so importing the module has no side effects.
if __name__ == '__main__':
    spider = Spider()
    spider.go()