The full process of accessing a web page:

- Ordinary user: open a browser --> send a request to the target site --> receive the response data --> render it in the browser.
- Crawler program: simulate a browser --> send a request to the target site --> receive the response data --> extract the useful data --> save it to a local file or database.
The crawling workflow (sketched in code right after this list):

1. Send a request (the requests module)
2. Receive the response data (returned by the server)
3. Parse and extract the data (BeautifulSoup, or the re regular-expression module)
4. Save the data
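As a minimal sketch of these four steps end to end (the URL and output file name here are placeholders, not the real target):

```python
import requests
from bs4 import BeautifulSoup

# 1. Send a request (requests module); example.com is a placeholder target
resp = requests.get('https://example.com', timeout=10)
# 2. Receive the response data returned by the server
html = resp.text
# 3. Parse and extract data (BeautifulSoup here; the re module would also work)
soup = BeautifulSoup(html, 'html.parser')
title_tag = soup.find('title')
title = title_tag.text if title_tag else ''
# 4. Save the data
with open('page_title.txt', 'w', encoding='UTF-8') as f:
    f.write(title)
```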
The requests module:

requests is a simple, easy-to-use HTTP library implemented in Python; official documentation: /zh_CN/latest/
`requests.get(url)` sends an HTTP GET request and returns the server's response.
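For instance, a minimal sketch of a GET request (example.com is a placeholder):

```python
import requests

resp = requests.get('https://example.com')  # send an HTTP GET request
print(resp.status_code)   # 200 on success
print(resp.encoding)      # encoding guessed from the response headers
print(resp.text[:200])    # the start of the response body as text
```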
The BeautifulSoup library:

BeautifulSoup is a Python library for extracting data from HTML or XML files. Documentation: adthedocs.io/zh_CN/v4.4.0/. BeautifulSoup supports the HTML parser in Python's standard library and also a number of third-party parsers, one of which is lxml.
Construct it as BeautifulSoup(markup, "html.parser") or BeautifulSoup(markup, "lxml"); lxml is the recommended parser because it is faster.
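A small sketch of both constructor forms on a throwaway snippet of markup:

```python
from bs4 import BeautifulSoup

markup = '<div><h3>参赛学员</h3></div><table class="roster"><tr><td>data</td></tr></table>'

soup_std = BeautifulSoup(markup, 'html.parser')   # standard-library parser, no extra install
soup_lxml = BeautifulSoup(markup, 'lxml')         # lxml parser, needs `pip install lxml`

print(soup_lxml.find('h3').text)                      # 参赛学员
print(soup_lxml.find('table', {'class': 'roster'}))   # the whole <table> tag
```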
## 1. Crawl all contestant information for Youth With You Season 2 (《青春有你2》) from Baidu Baike and return the page data

```python
import json
import re
import requests
import datetime
from bs4 import BeautifulSoup
import os

# Get today's date and format it for the file names used later, e.g. 20200420
today = datetime.date.today().strftime('%Y%m%d')
def crawl_wiki_data():
    """
    Crawl the contestant information for Youth With You Season 2 from Baidu Baike and return the HTML table.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    url = 'https://baike.baidu.com/item/青春有你第二季'

    try:
        response = requests.get(url, headers=headers)
        # Passing a document (a plain string also works) to the BeautifulSoup
        # constructor yields a document object
        soup = BeautifulSoup(response.text, 'lxml')
        # Returns every <table> tag whose class is "table-view log-set-param"
        tables = soup.find_all('table', {'class': 'table-view log-set-param'})

        crawl_table_title = "参赛学员"
        for table in tables:
            # Search the tags and strings that precede the current node
            table_titles = table.find_previous('div').find_all('h3')
            for title in table_titles:
                if crawl_table_title in title.text:
                    return table
    except Exception as e:
        print(e)
```
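To make the table-locating logic above concrete, here is a toy reconstruction with made-up HTML that mimics the Baike layout (the real page differs in detail):

```python
from bs4 import BeautifulSoup

# Made-up HTML: a heading <div> immediately followed by the roster table
toy_html = '''
<div><h3>参赛学员</h3></div>
<table class="table-view log-set-param"><tr><td>...</td></tr></table>
'''
soup = BeautifulSoup(toy_html, 'lxml')
for table in soup.find_all('table', {'class': 'table-view log-set-param'}):
    # find_previous('div') walks backwards from the table to the nearest preceding <div>
    heading_div = table.find_previous('div')
    if heading_div and any('参赛学员' in h3.text for h3 in heading_div.find_all('h3')):
        print('Matched table:', table)
```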
## 2. Parse the crawled page data and save it as a JSON file
```python
def parse_wiki_data(table_html):
    '''
    Parse the contestant information out of the HTML returned from Baidu Baike and
    save it as a JSON file in the work directory, named after today's date.
    '''
    bs = BeautifulSoup(str(table_html), 'lxml')
    all_trs = bs.find_all('tr')

    error_list = ['\'', '\"']

    stars = []
    # Skip the header row
    for tr in all_trs[1:]:
        all_tds = tr.find_all('td')

        star = {}
        # Name
        star["name"] = all_tds[0].text
        # Link to the contestant's own Baidu Baike page
        star["link"] = 'https://baike.baidu.com' + all_tds[0].find('a').get('href')
        # Birthplace
        star["zone"] = all_tds[1].text
        # Zodiac sign
        star["constellation"] = all_tds[2].text
        # Height
        star["height"] = all_tds[3].text
        # Weight
        star["weight"] = all_tds[4].text

        # Flower language; strip any single or double quotes from it
        flower_word = all_tds[5].text
        for c in flower_word:
            if c in error_list:
                flower_word = flower_word.replace(c, '')
        star["flower_word"] = flower_word

        # Company
        if not all_tds[6].find('a') is None:
            star["company"] = all_tds[6].find('a').text
        else:
            star["company"] = all_tds[6].text

        stars.append(star)

    json_data = json.loads(str(stars).replace("\'", "\""))
    with open('work/' + today + '.json', 'w', encoding='UTF-8') as f:
        json.dump(json_data, f, ensure_ascii=False)
```
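One caveat: the `str(stars).replace("\'", "\"")` round-trip breaks as soon as a field itself contains a quote character (which is why the code strips quotes out of `flower_word`). Since `stars` is already a list of dicts, a sturdier variant, as a sketch, replaces the last three lines of the function and dumps it directly:

```python
    # Equivalent save, with no string round-trip that stray quotes could break
    with open('work/' + today + '.json', 'w', encoding='UTF-8') as f:
        json.dump(stars, f, ensure_ascii=False)
```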
## 3. Crawl each contestant's photos from Baidu Baike and save them

```python
def crawl_pic_urls():
    '''
    Crawl each contestant's photos from Baidu Baike and save them.
    '''
    with open('work/' + today + '.json', 'r', encoding='UTF-8') as file:
        json_array = json.loads(file.read())

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }

    for star in json_array:
        name = star['name']
        link = star['link']

        # Crawl each contestant's photos below, collecting every image URL in the list pic_urls
        pic_urls = []
        try:
            # [1] Fetch the contestant's Baike page
            response_baike = requests.get(link, headers=headers)
            soup_baike = BeautifulSoup(response_baike.text, 'lxml')

            # [2] Locate the album tag on the page and extract the album URL
            summary_div = soup_baike.find_all('div', {'class': 'summary-pic'})
            a_label = summary_div[0].a
            album_url = 'https://baike.baidu.com' + a_label['href']

            # [3] Walk through the album
            response_album = requests.get(album_url, headers=headers)
            soup_album = BeautifulSoup(response_album.text, 'lxml')
            pic_list_label = soup_album.find('div', {'class': 'pic-list'})

            # [4.1] If high-resolution images are not needed, the commented-out
            # code below can be used instead
            # for imgz in pic_list_label.find_all('img'):
            #     pic_urls.append(imgz['src'])

            # [4.2] Fetch the high-resolution images
            for a in pic_list_label.find_all('a'):
                big_pic_url = 'https://baike.baidu.com' + a['href']
                response_big_album = requests.get(big_pic_url, headers=headers)
                soup_big_album = BeautifulSoup(response_big_album.text, 'lxml')
                big_img = soup_big_album.find('img', {'id': 'imgPicture'})
                pic_urls.append(big_img['src'])
        except Exception as e:
            print(e)

        # Download every image in pic_urls into a folder named after the contestant
        down_pic(name, pic_urls)
def down_pic(name, pic_urls):
    '''
    Download every image in the link list pic_urls into a folder named after the contestant.
    '''
    path = 'work/' + 'pics/' + name + '/'
    if not os.path.exists(path):
        os.makedirs(path)

    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            string = str(i + 1) + '.jpg'
            with open(path + string, 'wb') as f:
                f.write(pic.content)
                print('Downloaded image %s: %s' % (str(i + 1), str(pic_url)))
        except Exception as e:
            print('Failed to download image %s: %s' % (str(i + 1), str(pic_url)))
            print(e)
            continue
```
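A hypothetical standalone call to `down_pic`, with placeholder URLs, just to show the on-disk layout it produces:

```python
# Assumes down_pic is defined as above; both URLs are placeholders
down_pic('test_star', [
    'https://example.com/a.jpg',
    'https://example.com/b.jpg',
])
# Resulting layout: work/pics/test_star/1.jpg, work/pics/test_star/2.jpg
```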
## 4. Print the paths of all crawled images

```python
def show_pic_path(path):
    '''
    Walk through every crawled image and print each one's absolute path.
    '''
    pic_num = 0
    for (dirpath, dirnames, filenames) in os.walk(path):
        for filename in filenames:
            pic_num += 1
            print("Photo %d: %s" % (pic_num, os.path.join(dirpath, filename)))
    print("Crawled %d photos of Youth With You Season 2 contestants in total" % pic_num)
if __name__ == '__main__':
    # Crawl the contestant information for Youth With You Season 2 from Baidu Baike; returns HTML
    html = crawl_wiki_data()

    # Parse the HTML into contestant records and save them as a JSON file
    parse_wiki_data(html)

    # Crawl the photos from each contestant's Baidu Baike page and save them
    crawl_pic_urls()

    # Print the paths of the crawled contestant photos
    show_pic_path('/home/aistudio/work/pics/')

    print("All information has been crawled!")
```