# Crawl the Weibo hot-search board and the comments of the related posts
# (根据微博热搜爬取热搜内容以及相关博文的评论信息)
#!/usr/bin/python3
# -*- coding:utf-8 -*-
from lxml import etree
import requests
def resouban(url):
    """Fetch the Weibo hot-search board, log every entry, and let the user pick one.

    Scrapes the board page at *url*, appends each entry (rank, title, heat,
    link) to 热搜数据.txt, prompts the user for an index, and returns the
    chosen entry's (title, link) pair.

    :param url: URL of the hot-search summary page.
    :return: tuple (title_text, entry_url) for the entry the user selected.
    :raises KeyError: if the user types an index that is not on the board.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Mobile Safari/537.36",
        # NOTE(review): hard-coded personal session cookie — it expires;
        # replace with a fresh one before running.
        "cookie": (
            "SINAGLOBAL=5702871423690.898.1595515471453; "
            "SCF=Ah2tNvhR8eWX01S-DmF8uwYWORUbgfA0U3GnciJplYvqE1sn2zJtPdkJ9ork9dAVV8G7m-9kbF-PwIHsf3jHsUw.; "
            "SUB=_2A25NDifYDeRhGeBK7lYS9ifFwjSIHXVu8UmQrDV8PUJbkNANLRmlkW1NR7rne18NXZNqVxsfD3DngazoVlT-Fvpf; "
            "SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhhI1TcfcjnxZJInnV-kd405NHD95QcSh-Xe0q41K.RWs4DqcjQi--ciK.RiKLsi--Ni-24i-iWi--Xi-z4iKyFi--fi-2XiKLhSKeEeBtt; "
            "wvr=6; _s_tentry=www.sogou; UOR=,,www.sogou; "
            "Apache=9073188868783.379.1611369496580; "
            "ULV=1611369496594:3:3:3:9073188868783.379.1611369496580:1611281802597; "
            "webim_unReadCount=%7B%22time%22%3A1611369649613%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A63%2C%22msgbox%22%3A0%7D"
        ),
    }
    # Reconstructed: this request line was garbled in the extracted source.
    resp = requests.get(url, headers=headers)
    html = etree.HTML(resp.text)
    divs = html.xpath('/html/body/div/section/ul')
    resou_dic = {}
    resouxinxi_num = 0
    # XPath li[] indexing is 1-based, so i=0 never matches and is skipped
    # by the empty-result guard below.
    for i in range(0, 60):
        for div in divs:
            text = div.xpath(f'./li[{i}]/a/span/text()')
            if not text:
                continue
            text = text[0]
            num = div.xpath(f'./li[{i}]/a/strong/text()')
            hot_num = div.xpath(f'./li[{i}]/a/span/em/text()')
            son_url = div.xpath(f'./li[{i}]/a/@href')
            # Domain reconstructed — the source literal was truncated to
            # 's.weibo'; TODO confirm it matches the site actually scraped.
            son_url = 'https://s.weibo.com' + son_url[0]
            resou_dic[f'{i}'] = [num, text, hot_num, son_url]
    with open('热搜数据.txt', 'a', encoding='utf-8') as f:
        for key in resou_dic:
            # [1:] drops the leading '[' of the list repr, as the original did.
            f.write(str(resou_dic[key])[1:] + '\n')
            resouxinxi_num += 1
        f.write('\n\n')
    print('⼀共有热搜:' + str(resouxinxi_num) + '条')
    print("输⼊你想要获取第⼏条热搜")
    num = input()
    return resou_dic[num][1], resou_dic[num][3]
def weiboxinxi(url):
    """Fetch the blog posts listed under one hot-search topic and log them.

    Calls the JSON API at *url* (built by the caller from a hot-search link),
    appends the topic description plus, for each post, the blogger's info and
    the post's id/time/comment/like/repost counts to 页⾯博⽂.txt.

    :param url: getIndex container API URL for one hot-search topic.
    :raises KeyError: if the response JSON lacks the expected 'data'/'cards' keys.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Mobile Safari/537.36",
        # NOTE(review): same hard-coded session cookie as resouban() — expires.
        "cookie": (
            "SINAGLOBAL=5702871423690.898.1595515471453; "
            "SCF=Ah2tNvhR8eWX01S-DmF8uwYWORUbgfA0U3GnciJplYvqE1sn2zJtPdkJ9ork9dAVV8G7m-9kbF-PwIHsf3jHsUw.; "
            "SUB=_2A25NDifYDeRhGeBK7lYS9ifFwjSIHXVu8UmQrDV8PUJbkNANLRmlkW1NR7rne18NXZNqVxsfD3DngazoVlT-Fvpf; "
            "SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhhI1TcfcjnxZJInnV-kd405NHD95QcSh-Xe0q41K.RWs4DqcjQi--ciK.RiKLsi--Ni-24i-iWi--Xi-z4iKyFi--fi-2XiKLhSKeEeBtt; "
            "wvr=6; _s_tentry=www.sogou; UOR=,,www.sogou; "
            "Apache=9073188868783.379.1611369496580; "
            "ULV=1611369496594:3:3:3:9073188868783.379.1611369496580:1611281802597; "
            "webim_unReadCount=%7B%22time%22%3A1611369649613%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A63%2C%22msgbox%22%3A0%7D"
        ),
    }
    # Reconstructed: this request line was garbled in the extracted source.
    response = requests.get(url, headers=headers)
    data = response.json()
    with open('页⾯博⽂.txt', 'a', encoding='utf-8') as f:
        # cards[0] carries the topic summary ('desc') and the heat-analysis
        # page link ('scheme').
        f.write(data['data']['cards'][0]['desc'] + '\n')
        f.write('热度分析页⾯:' + data['data']['cards'][0]['scheme'] + '\n\n')
        xinxi_num = 0
        for card in data['data']['cards']:
            # Only cards carrying an 'mblog' payload are actual posts.
            if 'mblog' in card:
                mblog = card['mblog']
                f.write("博主信息:" + '\n')
                f.write('博主id:' + str(mblog['user']['id']) + '\n')
                f.write('博主博名:' + str(mblog['user']['screen_name']) + '\n')
                f.write('博主粉丝数:' + str(mblog['user']['followers_count']) + '\n')
                f.write('博⽂内容' + str(mblog['text']) + '\n')
                f.write('博⽂id:' + str(mblog['id']) + '\t'
                        + '发博时间:' + str(mblog['created_at']) + '\t'
                        + '评论数:' + str(mblog['comments_count']) + '\t'
                        + '点赞数:' + str(mblog['attitudes_count']) + '\t'
                        + '转发数:' + str(mblog['reposts_count']) + '\n')
                f.write('\n\n')
                xinxi_num += 1
    print("共有博⽂" + str(xinxi_num) + "条")
# TODO 有的是特有信息,可以根据场景再改进代码
def get_conmmtens():
    """Let the user pick one logged blog post and fetch its comments.

    Reads 页⾯博⽂.txt (written by weiboxinxi), splits it into per-post
    records on blank lines, prompts for a record index, extracts that post's
    id from its last line, builds the hot-comment API URL and hands it to
    commtents().

    :raises IndexError: if the typed index is outside the logged records
        (no input validation, as noted in the original TODOs).
    """
    with open('页⾯博⽂.txt', 'r', encoding='utf-8') as f:
        # Reconstructed: the read call was garbled in the extracted source.
        content = f.read()
    xinxi = content.split('\n\n')
    print('输⼊要获取第⼏条博⽂的评论')
    # int() instead of eval() — never eval raw user input.
    idx = int(input())
    # Record layout (from weiboxinxi): last line is
    # '博⽂id:<id>\t发博时间:...' — take the id after the first colon.
    bowen_id = xinxi[idx].split('\n')[-1].split('\t')[0].split(':')[1]
    # Domain reconstructed — the source literal was truncated to 'm.weibo';
    # TODO confirm against the live API host.
    url = ('https://m.weibo.cn/comments/hotflow?'
           + f'id={bowen_id}&mid={bowen_id}' + '&max_id_type=0')
    commtents(url, bowen_id)
# #    TODO 调⽤评论函数
def commtents(url, id):
    """Download the hot-comment list at *url* and append it to 评论数据.txt.

    For each comment, writes the text (HTML tags stripped at the first '<'),
    commenter id/name, creation time and like count. When the response JSON
    lacks the expected 'data'/'data' keys (post closed/removed), logs a
    placeholder line with the post's detail URL instead.

    :param url: hotflow comments API URL.
    :param id: the post (mblog) id, used only to build the fallback link.
    """
    # Reconstructed: this request line was garbled in the extracted source.
    response = requests.get(url)
    data = response.json()
    try:
        cards = data['data']['data']
    except (KeyError, TypeError):
        # No comment payload — record the post link so it can be revisited.
        with open('评论数据.txt', 'a', encoding='utf-8') as f:
            # Domain reconstructed — source literal was truncated to 'm.weibo'.
            f.write('这个博⽂没有什么数据可以值得获取的'
                    + f'https://m.weibo.cn/detail/{id}' + '\n\n')
            f.write('\n\n')
            f.write('\n\n')
        return
    with open('评论数据.txt', 'a', encoding='utf-8') as f:
        for c in cards:
            # Some cards may miss keys; the original noted c.get(...) as an
            # option but relied on direct indexing — kept as-is.
            commtents_info = {
                'text': c['text'].split('<')[0],
                'user_id': c['user']['id'],
                'name': c['user']['screen_name'],
                'time': c['created_at'],
                'like': c['like_count'],
            }
            for key, value in commtents_info.items():
                f.write(str(key) + ":" + str(value) + '\n')
            f.write('\n\n')
# TODO 写⼀下逻辑
# TODO 先获取热搜榜,再根据热搜榜得到每个热搜的url进⾏拼接,得到有博⽂信息的url,再获得博⽂的相关信息,
# TODO 然后获取博⽂id,再进⾏拼接,进⼊博⽂页⾯,得到博⽂评论信息
def main():
    """Interactive driver: hot-search board -> topic posts -> post comments.

    Loops forever; on input '1' it scrapes the board, lets the user pick a
    hot search, fetches that topic's posts, then fetches comments for one
    chosen post.
    """
    while True:
        print('输⼊数字1获取微博热搜')
        # int() instead of eval() — never eval raw user input.
        n = int(input())
        if n == 1:
            # Domains reconstructed — the source literals were truncated to
            # 's.weibo'/'m.weibo'; TODO confirm against the live hosts.
            url = 'https://s.weibo.com/top/summary?cate=realtimehot'
            son_text, son_url = resouban(url)
            print(son_text)
            # Splice the hot-search query (between '=' and '&' of the entry
            # URL) into the getIndex container API URL.
            resou_url = ('https://m.weibo.cn/api/container/getIndex?containerid=231522type%3D1%26q%3D'
                         + son_url[son_url.find("=") + 1:son_url.rfind("&")]
                         + '&page_type=searchall')
            weiboxinxi(resou_url)
            get_conmmtens()


if __name__ == '__main__':
    main()