python携程酒店评论_Python基于selenium爬取携程酒店评论信息
爬取站点
任意一个携程酒店的详细链接,这里给出了四个,准备开四个线程爬取:
准备工作
Python版本:Python3.6
安装selenium模块:
pip3 install selenium
安装lxml模块:
pip3 install lxml
使用到的知识
selenium模块的使用
XPath的使用
多线程
IO
代码
import base64
import os
import threading
import time

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# 爬取一个站点的评论信息
class ScrapyOne(object):
    """Scrape all guest comments of one Ctrip hotel detail page with Selenium.

    The scraper drives a Chrome window, walks the comment pager from page 1
    to the last page and appends every comment, one per line, to a text file.

    Attributes:
        url: Hotel detail page URL.
        page: Number of the comment page currently shown (1-based).
        allPage: Total number of comment pages; filled in by ``main``.
        output_path: File the comments are appended to.
        reClickNum: Remaining error retries, shared by all page turns.
        driver: The Selenium Chrome driver.
    """

    # Seconds to wait for the comment list to re-render after a click or a
    # reload. Keep this generous: reading the page too early yields nothing.
    PAGE_WAIT_SECONDS = 3

    # Upper bound on click-and-check rounds for a single page turn, so a
    # pager that never reaches the requested page cannot hang the crawl.
    MAX_CLICKS_PER_TURN = 20

    def __init__(self, url, output_path=None):
        """Store the crawl state and open a Chrome window.

        Args:
            url: Hotel detail page URL.
            output_path: File to append comments to. When omitted, a
                per-URL file name is derived so that concurrent scrapers
                do not write into the same file.
        """
        super(ScrapyOne, self).__init__()
        self.url = url
        self.page = 1
        self.allPage = 1
        # NOTE(review): the original file name was lost from the source this
        # code was recovered from; deriving it from the URL is an assumption.
        if output_path is None:
            output_path = self._defaultOutputPath(url)
        self.output_path = output_path
        # NOTE(review): the initial value was lost as well; 5 follows the
        # original "stop after more than 5 retries" comment - confirm.
        self.reClickNum = 5
        # Opening the browser comes last so nothing is launched if the
        # plain attribute setup above fails.
        self.driver = webdriver.Chrome()

    @staticmethod
    def _defaultOutputPath(url):
        """Return a file-system-safe, per-URL output file name."""
        encoded = base64.urlsafe_b64encode(url.encode('utf-8'))
        return encoded.decode('ascii') + '.txt'

    def _requestPage(self, page):
        """Type ``page`` into the pager's input box and click its button."""
        box = self.driver.find_element(By.ID, 'cPageNum')
        box.clear()
        box.send_keys(str(page))
        self.driver.find_element(By.ID, 'cPageBtn').click()

    def _currentPage(self):
        """Return the page number the pager currently highlights."""
        marker = self.driver.find_element(
            By.XPATH, "//*/a[@class='current']/span")
        return int(marker.text)

    def nextPage(self):
        """Advance the comment list by one page.

        The page number is requested repeatedly, sleeping between rounds,
        until the pager reports it. Errors raised while the page is
        re-rendering cost one unit of ``reClickNum`` and trigger a reload.

        Returns:
            True when the next page is shown (``page`` is then incremented);
            False when the last page is already shown, the retry budget is
            used up, or the pager never reached the requested page.
        """
        if self.page >= self.allPage:
            return False
        target = self.page + 1
        while self.reClickNum > 0:
            try:
                for _ in range(self.MAX_CLICKS_PER_TURN):
                    self._requestPage(target)
                    if self._currentPage() == target:
                        self.page = target
                        return True
                    time.sleep(self.PAGE_WAIT_SECONDS)
                return False
            except (WebDriverException, IndexError, ValueError):
                # Typically the page refreshed underneath us (stale or
                # missing element, empty page label).
                self.reClickNum -= 1
                if self.reClickNum > 0:
                    self.driver.refresh()
                    time.sleep(self.PAGE_WAIT_SECONDS)
        return False

    def scrapyComment(self):
        """Save every comment found on the page that is currently shown."""
        tree = etree.HTML(self.driver.page_source)
        for text in tree.xpath("//*[@class='J_commentDetail']/text()"):
            # Drop line breaks and surrounding blanks so that each comment
            # occupies exactly one line of the output file.
            text = text.replace('\n', '').replace('\r', '').strip()
            if text:
                self.save(text)
        print('[%s]\t%s\t第%s页爬取完成' % (
            time.strftime('%Y-%m-%d %H:%M:%S'), self.url, self.page))

    def getPage(self):
        """Return the total number of comment pages shown by the pager.

        NOTE(review): the XPath reads the 7th pager link; presumably this
        is the last-page link - verify for hotels with only a few pages.
        """
        last_link = self.driver.find_element(
            By.XPATH,
            "//*[@id='divCtripComment']/div[4]/div/div[1]/a[7]/span")
        return int(last_link.text)

    def save(self, line):
        """Append ``line`` plus a newline to ``output_path``."""
        with open(self.output_path, 'a', encoding='UTF-8-sig') as f:
            f.write(line + '\n')

    def main(self):
        """Open the hotel page, scrape every comment page, close the browser."""
        try:
            # This opens the page in the Chrome window.
            self.driver.get(self.url)
            self.allPage = self.getPage()
            while True:
                self.scrapyComment()
                if not self.nextPage():
                    break
            print('爬取完成', self.url)
        finally:
            # quit() also stops the chromedriver process, and runs even
            # when scraping fails half-way.
            self.driver.quit()
# 爬取线程
class ScrapyThread(threading.Thread):
    """Worker thread that scrapes the comments of a single hotel page."""

    def __init__(self, url):
        """Remember which hotel page this thread will scrape.

        Args:
            url: Hotel detail page URL handed to ``ScrapyOne``.
        """
        super(ScrapyThread, self).__init__()
        self.url = url

    def run(self):
        """Scrape the hotel page; runs in the new thread after ``start()``."""
        one = ScrapyOne(self.url)
        one.main()
if __name__ == "__main__":
    # Hotel detail pages to scrape; one worker thread is started per page.
    # NOTE(review): scheme and host were missing from the recovered source
    # and are reconstructed here - confirm the URLs before running.
    urlList = [
        'http://hotels.ctrip.com/hotel/6278770.html#ctm_ref=hod_hp_hot_dl_n_2_7',
        'http://hotels.ctrip.com/hotel/6657909.html#ctm_ref=hod_hp_hot_dl_n_2_8',
        'http://hotels.ctrip.com/hotel/441351.html#ctm_ref=hod_hp_hot_dl_n_2_1',
        'http://hotels.ctrip.com/hotel/5470972.html#ctm_ref=hod_hp_hot_dl_n_2_3',
    ]
    threads = [ScrapyThread(url) for url in urlList]
    for thread in threads:
        thread.start()
    # Wait for every scraper so the script ends only when all are done.
    for thread in threads:
        thread.join()
程序运行结果:
查看保存的评论:
一行一条评论。
实测爬取速度大概一分钟三四百条评论,太快了会被携程forbidden,因此需要注意sleep。