import pandas as pd
import jieba
data = pd.read_csv(r"E:\", sep='\t', names=['label', 'text'])  # read the data and name the columns
#print(data.head())
data['cut_message'] = data["text"].apply(lambda x: ' '.join(jieba.cut(x)))  # segment each message with jieba and join the words with spaces
#print(data.head())
x = data['cut_message'].values
y = data['label'].values
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.1)  # test:train = 1:9
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
vectorizer = CountVectorizer()
x_train_termcounts = vectorizer.fit_transform(train_x)  # bag-of-words term counts for the training set
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_termcounts)  # re-weight the counts with TF-IDF
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(x_train_tfidf, train_y)  # train a multinomial naive Bayes classifier on the TF-IDF features
x_input_termcounts = vectorizer.transform(test_x)  # transform the test set with the already-fitted vectorizer
x_input_tfidf = tfidf_transformer.transform(x_input_termcounts)
predicted_categories = classifier.predict(x_input_tfidf)
from sklearn.metrics import accuracy_score
print('accuracy:', accuracy_score(test_y, predicted_categories))
#output some examples
category_map = {
0:'normal',
1:'spam'
}
for sentence,category,real in zip(test_x[:10],predicted_categories[:10],test_y[:10]):
print('\nmessage_content:',sentence,'\npredicted_type:',category_map[category],'real_values:',category_map[real])
Finally, a few example predictions are printed for display. (I am still not very familiar with the functions in the sklearn package, and I should practice more with the jieba and pandas packages as well.)
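As a side note for practicing sklearn, the same counts -> TF-IDF -> naive Bayes chain can also be wrapped in sklearn's Pipeline, so fitting and prediction work directly on the segmented text. This is only a minimal sketch reusing the train_x/train_y/test_x/test_y variables and the imports from above:

from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('counts', CountVectorizer()),   # bag-of-words counts
    ('tfidf', TfidfTransformer()),   # TF-IDF weighting
    ('nb', MultinomialNB())          # multinomial naive Bayes
])
text_clf.fit(train_x, train_y)
print('pipeline accuracy:', accuracy_score(test_y, text_clf.predict(test_x)))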