Crawling YouTube Video Comments and Running Sentiment Analysis
1. Reference code
2. Modified code (the code described in this article crawls comments by video address)
3. Further upgraded code (can crawl by keyword; just change key to your own Google API key)
4. Applying for a Google API key
5. Crawling the comments (you can keep only comments above a chosen number of likes; a proxy/VPN is needed to reach YouTube)
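The keyword-based crawl mentioned in section 3 is not listed here. As a reference, a minimal sketch against the public YouTube Data API v3 search endpoint; searchByKeyword is a hypothetical helper, not the article's own code:

import requests

YOUTUBE_SEARCH_LINK = ('https://www.googleapis/youtube/v3/search'
                       '?part=snippet&type=video&maxResults=50&q={query}&key={key}')

def searchByKeyword(query, key):
    """Return the videoIds of the top search results for `query`."""
    page = requests.get(YOUTUBE_SEARCH_LINK.format(query=query, key=key))
    page.raise_for_status()
    return [item['id']['videoId'] for item in page.json().get('items', [])]

# Example: videoId_all = searchByKeyword('data science', 'your-google-api-key')

The IDs it returns can be fed directly into the crawler below.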
import lxml
import requests
import time
import sys
import progress_bar as PB
import json

YOUTUBE_IN_LINK = ('https://www.googleapis/youtube/v3/commentThreads'
                   '?part=snippet&maxResults=100&order=relevance'
                   '&pageToken={pageToken}&videoId={videoId}&key={key}')
YOUTUBE_LINK = ('https://www.googleapis/youtube/v3/commentThreads'
                '?part=snippet&maxResults=100&order=relevance'
                '&videoId={videoId}&key={key}')
key = 'key'  # change to the Google API key you applied for
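# For reference, a trimmed illustration (not a full response) of the commentThreads
# JSON consumed below -- only the fields this script actually reads are shown:
# {
#   "nextPageToken": "...",
#   "items": [
#     {"snippet": {"topLevelComment": {"snippet": {
#       "textOriginal": "Great video!",
#       "likeCount": 4200
#     }}}}
#   ]
# }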
def commentExtract(videoId, count=-1):
    print("\nComments downloading")
    # Fetch the first page; retry while rate-limited (429), give up otherwise
    page_info = (YOUTUBE_LINK.format(videoId=videoId, key=key))
    while page_info.status_code != 200:
        if page_info.status_code != 429:
            print("Comments disabled")
            sys.exit()
        time.sleep(20)
        page_info = (YOUTUBE_LINK.format(videoId=videoId, key=key))
    page_info = page_info.json()
    # test
    # print(page_info)

    comments = []
    co = 0
    for i in range(len(page_info['items'])):
        # Keep only comments with 3000+ likes; change the threshold to suit your needs
        if page_info['items'][i]['snippet']['topLevelComment']['snippet']['likeCount'] >= 3000:
            comments.append(page_info['items'][i]['snippet']['topLevelComment']['snippet']['textOriginal'])
            co += 1
            if co == count:
                PB.progress(co, count, cond=True)
                return comments
    PB.progress(co, count)

    # INFINITE SCROLLING: keep following nextPageToken until no pages remain
    while 'nextPageToken' in page_info:
        temp = page_info
        page_info = (YOUTUBE_IN_LINK.format(videoId=videoId, key=key, pageToken=page_info['nextPageToken']))
        while page_info.status_code != 200:
            time.sleep(20)
            page_info = (YOUTUBE_IN_LINK.format(videoId=videoId, key=key, pageToken=temp['nextPageToken']))
        page_info = page_info.json()

        for i in range(len(page_info['items'])):
            # Apply the same like-count filter to the paginated results
            if page_info['items'][i]['snippet']['topLevelComment']['snippet']['likeCount'] >= 3000:
                comments.append(page_info['items'][i]['snippet']['topLevelComment']['snippet']['textOriginal'])
                co += 1
                if co == count:
                    PB.progress(co, count, cond=True)
                    return comments
        PB.progress(co, count)
    PB.progress(count, count, cond=True)
    print()
    return comments
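The progress_bar module imported above is not included in the article. A minimal hypothetical stand-in, matching only the PB.progress(done, total, cond=...) calls used here:

# progress_bar.py -- illustrative stand-in, not the author's module
import sys

def progress(done, total, cond=False):
    """Render a one-line progress bar; cond=True marks completion."""
    if total <= 0:  # count == -1 means "fetch everything", so the total is unknown
        sys.stdout.write('\rComments fetched: {}'.format(done))
    else:
        filled = int(50 * done / total)
        bar = '#' * filled + '-' * (50 - filled)
        sys.stdout.write('\r[{}] {}/{}'.format(bar, done, total))
    if cond:
        sys.stdout.write(' done')
    sys.stdout.flush()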
6. Classifying comments as positive or negative with a classifier, and computing the proportion of positive and negative comments

import training_classifier as tcl
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os.path
import pickle
from statistics import mode
from nltk.classify import ClassifierI
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder as BCF
import itertools
from nltk.classify import NaiveBayesClassifier
def features(words):
    temp = word_tokenize(words)
    # Collapse consecutive duplicate tokens
    words = [temp[0]]
    for i in range(1, len(temp)):
        if temp[i] != temp[i - 1]:
            words.append(temp[i])
    scoreF = BigramAssocMeasures.chi_sq
    # bigram count
    n = 150
    bigrams = BCF.from_words(words).nbest(scoreF, n)
    return dict([word, True] for word in itertools.chain(words, bigrams))
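To make the feature shape concrete: consecutive duplicates are collapsed and the top chi-squared bigrams are appended, so a tiny input produces

# Illustration only
print(features("great great video"))
# -> {'great': True, 'video': True, ('great', 'video'): True}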
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self.__classifiers = classifiers

    def classify(self, comments):
        # Each wrapped classifier votes; the majority label wins
        votes = []
        for c in self.__classifiers:
            v = c.classify(comments)
            votes.append(v)
        con = mode(votes)
        # Confidence = share of classifiers that voted for the winning label
        choice_votes = unt(mode(votes))
        conf = (1.0 * choice_votes) / len(votes)
        return con, conf
def sentiment(comments):
    # Train and pickle the classifier on the first run
    if not os.path.isfile('classifier.pickle'):
        aining()  # assumed entry point of the training_classifier module (not shown)
    fl = open('classifier.pickle', 'rb')
    classifier = pickle.load(fl)
    fl.close()

    pos = 0
    neg = 0
    for words in comments:
        # print(words)
        comment = features(words)
        sentiment_value, confidence = VoteClassifier(classifier).classify(comment)
        if sentiment_value == 'positive':  # and confidence * 100 >= 60:
            pos += 1
        else:
            neg += 1
    print("\nPositive sentiment : ", (pos * 100.0 / len(comments)))
    print("\nNegative sentiment : ", (neg * 100.0 / len(comments)))
7. Tokenizing the comments and visualizing the most frequent words as a word cloud
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')

def fancySentiment(comments):
    # English stopwords, punctuation and the "n't" token are filtered out
    stopword = set(stopwords.words('english') + list(string.punctuation) + ['n\'t'])
    filtered_comments = []
    for i in comments:
        words = word_tokenize(i)
        temp_filter = ""
        for w in words:
            if w not in stopword:
                temp_filter += str(w)
                temp_filter += ' '
        filtered_comments.append(temp_filter)
    filtered_comments_str = ' '.join(filtered_comments)
    sentiment = WordCloud(background_color='orange', max_words=100)
    sentiment.generate(filtered_comments_str)
    # with open('', 'w', encoding='utf-8') as f:
    #     f.write(str(sentiment.generate(filtered_comments_str)))
    plt.figure()
    plt.imshow(sentiment)
    plt.axis("off")
    plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
    plt.margins(0, 0)
    plt.savefig("final.png", dpi=300)
    plt.show()
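A quick smoke test of the function; the sample comments below are made up purely to exercise the word cloud:

if __name__ == '__main__':
    fancySentiment([
        "Absolutely loved this video, the editing is great",
        "Too many ads, could not enjoy the content",
        "Great music and great pacing, would watch again",
    ])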
8. Invocation

Run driver.py:

import comment_downloader as CD
import fancySentiment as FS
# import sys
# sys.path.append('E:/爬取utube 评论/YouTube-Sentiment-Analysis/CommentSentiment/')
import sentimentYouTube as SYT
import requests
import json
def main():
    # EXAMPLE videoID = 'tCXGJQYZ9JA'
    # videoId = input("Enter the videoID : ")
    # Put the IDs of the videos whose comments you want to crawl into this list
    videoId_all = ['FWMIPukvdsQ','QHTnuI9IKBA','LTejJnrzGPM','_jUJrIWp2I4','OrXiXDUQia8','wUJ-57SAE5A','Yx4JnDez1sk','fhkE3e7lT_g','K92fPB3lKCc']

    # Fetch the number of comments
    # if count = -1, fetch all comments
    # count = int(input("Enter the no. of comment to extract : "))
    count = 2000  # number of comments to fetch per video address

    comments = []
    # verified_proxies.json holds proxy IPs so that each video is crawled through a
    # different proxy and the crawler's own IP does not get banned; run ip.py to
    # collect proxies. Expected shape (illustrative):
    # [{"type": "http", "host": "1.2.3.4", "port": "8080"}, ...]
    with open('verified_proxies.json', encoding='utf-8') as f:
        # for line in f:
        a = json.load(f)
        # final[a['type']] = a['host']+':'+a['port']
    flag = 0  # initialized outside the loop so each video really gets the next proxy
    for videoId in videoId_all:
        requests.adapters.DEFAULT_RETRIES = 20
        s = requests.session()
        # s.proxies = {"http": "27.152.8.152:9999", "https": "117.57.91.131:24978"}
        s.keep_alive = False  # close each HTTP connection instead of keeping it alive
        s.proxies = {a[flag]['type']: str(a[flag]['host']) + ':' + str(a[flag]['port'])}
        flag = flag + 1
        comments = comments + mentExtract(videoId, count)
    # print(comments)
    # Record the crawled comments (only those above the configured like threshold)
    with open('', 'w', encoding='utf-8') as f:
        for i in comments:
            f.write(i + '\n')
    SYT.sentiment(comments)
    FS.fancySentiment(comments)
if __name__ == '__main__':
    main()

9. Highlights of this project

1. It can crawl several videos in one run and analyze all of the collected comments together, which fixes the original program's limitation of crawling only one URL at a time.