学习 Python,当然不能忘记 Python 的最优用处——爬虫。本次练习通过爬取斗鱼英雄联盟版块,获取该版块下所有主播的名称和人气值,并根据人气值进行排行后打印出来。
from urllib import request
import re,time
class Spider():
    """Scrape streamer names and popularity values from a Douyu listing.

    The listing is fetched page by page from ``url`` + page number, each page
    is parsed with the regular expressions below, and the collected entries
    are sorted by popularity (descending) and printed.
    """

    # Alternative listing: 'https://www.douyu.com/directory/myFollow'
    # Base URL; the 1-based page number is appended to it.
    url = 'https://www.douyu.com/g_LOL/?page='
    # Browser-like request headers sent with every page request.
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    # Class string of a disabled "next page" button (not used by the methods
    # of this class; kept as a public attribute).
    finaly_pattern = 'shark-pager-next shark-pager-disable shark-pager-disable-next'
    # One streamer fragment: from the name <span> up to the closing </p>.
    douyu_pattern = r'<span class="dy-name ellipsis fl">([\s\S]*?)</p>'
    # Inside a fragment: the streamer name is everything before the first </span>.
    name_pattern = r'([\s\S]*?)</span>'
    # Inside a fragment: the popularity text.
    hot_pattern = r'<span class="dy-num fr" >([\s\S]*?)</span>'
    # Total popularity of the most recently sorted list (set by the sort step).
    all_hot = 0

    def _parse_page(self, htmls):
        """Return the ``{'name': ..., 'hot': ...}`` entries found in *htmls*.

        Fragments in which either the name or the popularity pattern does not
        match are skipped instead of raising ``IndexError``.
        """
        entries = []
        for fragment in re.findall(Spider.douyu_pattern, htmls):
            name = re.search(Spider.name_pattern, fragment)
            hot = re.search(Spider.hot_pattern, fragment)
            if name is None or hot is None:
                continue
            entries.append({'name': name.group(1), 'hot': hot.group(1)})
        return entries

    def for_page(self):
        """Download listing pages until the listing is exhausted.

        Crawling stops when a page yields no entries, or when an entry's name
        equals the very first collected name (the listing has started to
        repeat).

        Returns:
            A list of ``{'name': str, 'hot': str}`` dicts in page order.
        """
        names_hot = []
        page = 1
        while True:
            url = Spider.url + '%s' % page
            # Send browser-like headers with the request.
            req = request.Request(url=url, headers=Spider.headers)
            # The context manager closes the HTTP response after reading.
            with request.urlopen(req) as response:
                htmls = response.read().decode()

            entries = self._parse_page(htmls)
            added = 0
            # An empty page means there is nothing left to load; without this
            # check the loop would never terminate on such a page.
            finished = not entries
            for entry in entries:
                # Exact comparison: a substring test would stop the crawl
                # early for any name contained in the first streamer's name.
                if names_hot and entry['name'] == names_hot[0]['name']:
                    finished = True
                    break
                names_hot.append(entry)
                added += 1

            print("第%s 页加载完成,共%s 位主播,正在加载下一页!"%(page,added))
            page += 1
            if finished:
                print("所有页面加载完成!")
                time.sleep(3)
                break
            # Short pause between page requests.
            time.sleep(0.5)
        return names_hot

    def __sort(self, names_hot):
        """Return *names_hot* sorted by popularity, highest first.

        Also stores the total popularity of the list in ``Spider.all_hot``.
        The total is recomputed on every call rather than accumulated, so
        sorting more than once does not inflate it.
        """
        Spider.all_hot = sum(self.__sort_seed(entry) for entry in names_hot)
        return sorted(names_hot, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, name_hot):
        """Convert an entry's popularity text to a float.

        The first decimal number in the text is used (including its
        fractional part) and is multiplied by 10000 when the text contains
        '万'. Text without any number yields 0.0.
        """
        text = name_hot['hot']
        match = re.search(r'\d+(?:\.\d+)?', text)
        if match is None:
            return 0.0
        num_hot = float(match.group())
        if '万' in text:
            num_hot *= 10000
        return num_hot

    def __show(self, names_hot):
        """Print every entry with its rank, then the count and total popularity."""
        n = 0
        for n, name_hot in enumerate(names_hot, start=1):
            print('Num:%s 主播:%s 人气值:%s'%(n,name_hot['name'],name_hot['hot']))
        print('主播人数:%s 版块人气值:%s'%(n,Spider.all_hot))

    def go(self):
        """Run the whole pipeline: crawl, sort by popularity, print."""
        names_hot = self.for_page()
        names_hot = self.__sort(names_hot)
        self.__show(names_hot)
# Script entry point: create a Spider and run its go() pipeline
# (for_page -> sort -> show) as soon as this module is executed.
spider = Spider()
spider.go()

