The full process of accessing a web page:

- Ordinary user: open a browser --> send a request to the target site --> receive the response data --> render it in the browser.
- Crawler program: simulate a browser --> send a request to the target site --> receive the response data --> extract the useful data --> save it to a local file or database.
The crawling workflow (sketched in code right after this list):

1. Send a request (the requests module)
2. Receive the response data (returned by the server)
3. Parse and extract the data (BeautifulSoup, or the re regular-expression module)
4. Save the data
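As a minimal sketch of these four steps end to end (the URL and output file name here are placeholders, not the real target):

```python
import requests
from bs4 import BeautifulSoup

# 1. Send a request (requests module); example.com is a placeholder target
resp = requests.get('https://example.com', timeout=10)
# 2. Receive the response data returned by the server
html = resp.text
# 3. Parse and extract data (BeautifulSoup here; the re module would also work)
soup = BeautifulSoup(html, 'html.parser')
title_tag = soup.find('title')
title = title_tag.text if title_tag else ''
# 4. Save the data
with open('page_title.txt', 'w', encoding='UTF-8') as f:
    f.write(title)
```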
The requests module:

requests is a simple, easy-to-use HTTP library implemented in Python; official documentation: /zh_CN/latest/
`requests.get(url)` sends an HTTP GET request and returns the server's response.
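For instance, a minimal sketch of a GET request (example.com is a placeholder):

```python
import requests

resp = requests.get('https://example.com')  # send an HTTP GET request
print(resp.status_code)   # 200 on success
print(resp.encoding)      # encoding guessed from the response headers
print(resp.text[:200])    # the start of the response body as text
```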
The BeautifulSoup library:

BeautifulSoup is a Python library for extracting data from HTML or XML files. Documentation: adthedocs.io/zh_CN/v4.4.0/. BeautifulSoup supports the HTML parser in Python's standard library and also a number of third-party parsers, one of which is lxml.
Construct it as BeautifulSoup(markup, "html.parser") or BeautifulSoup(markup, "lxml"); lxml is the recommended parser because it is faster.
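A small sketch of both constructor forms on a throwaway snippet of markup:

```python
from bs4 import BeautifulSoup

markup = '<div><h3>参赛学员</h3></div><table class="roster"><tr><td>data</td></tr></table>'

soup_std = BeautifulSoup(markup, 'html.parser')   # standard-library parser, no extra install
soup_lxml = BeautifulSoup(markup, 'lxml')         # lxml parser, needs `pip install lxml`

print(soup_lxml.find('h3').text)                      # 参赛学员
print(soup_lxml.find('table', {'class': 'roster'}))   # the whole <table> tag
```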
## 1. Crawl all contestant information for Youth With You Season 2 (《青春有你2》) from Baidu Baike and return the page data

```python
import json
import re
import requests
import datetime
from bs4 import BeautifulSoup
import os

# Get today's date and format it for the file names used later, e.g. 20200420
today = datetime.date.today().strftime('%Y%m%d')
def crawl_wiki_data():
    """
    Crawl the contestant information for Youth With You Season 2 from Baidu Baike and return the HTML table.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    url = 'https://baike.baidu.com/item/青春有你第二季'

    try:
        response = requests.get(url, headers=headers)
        # Passing a document (a plain string also works) to the BeautifulSoup
        # constructor yields a document object
        soup = BeautifulSoup(response.text, 'lxml')
        # Returns every <table> tag whose class is "table-view log-set-param"
        tables = soup.find_all('table', {'class': 'table-view log-set-param'})

        crawl_table_title = "参赛学员"
        for table in tables:
            # Search the tags and strings that precede the current node
            table_titles = table.find_previous('div').find_all('h3')
            for title in table_titles:
                if crawl_table_title in title.text:
                    return table
    except Exception as e:
        print(e)
```
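To make the table-locating logic above concrete, here is a toy reconstruction with made-up HTML that mimics the Baike layout (the real page differs in detail):

```python
from bs4 import BeautifulSoup

# Made-up HTML: a heading <div> immediately followed by the roster table
toy_html = '''
<div><h3>参赛学员</h3></div>
<table class="table-view log-set-param"><tr><td>...</td></tr></table>
'''
soup = BeautifulSoup(toy_html, 'lxml')
for table in soup.find_all('table', {'class': 'table-view log-set-param'}):
    # find_previous('div') walks backwards from the table to the nearest preceding <div>
    heading_div = table.find_previous('div')
    if heading_div and any('参赛学员' in h3.text for h3 in heading_div.find_all('h3')):
        print('Matched table:', table)
```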
## 2. Parse the crawled page data and save it as a JSON file
```python
def parse_wiki_data(table_html):
    '''
    Parse the contestant information out of the HTML returned from Baidu Baike and
    save it as a JSON file in the work directory, named after today's date.
    '''
    bs = BeautifulSoup(str(table_html), 'lxml')
    all_trs = bs.find_all('tr')

    error_list = ['\'', '\"']

    stars = []
    # Skip the header row
    for tr in all_trs[1:]:
        all_tds = tr.find_all('td')

        star = {}
        # Name
        star["name"] = all_tds[0].text
        # Link to the contestant's own Baidu Baike page
        star["link"] = 'https://baike.baidu.com' + all_tds[0].find('a').get('href')
        # Birthplace
        star["zone"] = all_tds[1].text
        # Zodiac sign
        star["constellation"] = all_tds[2].text
        # Height
        star["height"] = all_tds[3].text
        # Weight
        star["weight"] = all_tds[4].text

        # Flower language; strip any single or double quotes from it
        flower_word = all_tds[5].text
        for c in flower_word:
            if c in error_list:
                flower_word = flower_word.replace(c, '')
        star["flower_word"] = flower_word

        # Company
        if not all_tds[6].find('a') is None:
            star["company"] = all_tds[6].find('a').text
        else:
            star["company"] = all_tds[6].text

        stars.append(star)

    json_data = json.loads(str(stars).replace("\'", "\""))
    with open('work/' + today + '.json', 'w', encoding='UTF-8') as f:
        json.dump(json_data, f, ensure_ascii=False)
```
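One caveat: the `str(stars).replace("\'", "\"")` round-trip breaks as soon as a field itself contains a quote character (which is why the code strips quotes out of `flower_word`). Since `stars` is already a list of dicts, a sturdier variant, as a sketch, replaces the last three lines of the function and dumps it directly:

```python
    # Equivalent save, with no string round-trip that stray quotes could break
    with open('work/' + today + '.json', 'w', encoding='UTF-8') as f:
        json.dump(stars, f, ensure_ascii=False)
```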
## 3. Crawl each contestant's photos from Baidu Baike and save them

```python
def crawl_pic_urls():
    '''
    Crawl each contestant's photos from Baidu Baike and save them.
    '''
    with open('work/' + today + '.json', 'r', encoding='UTF-8') as file:
        json_array = json.loads(file.read())

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }

    for star in json_array:
        name = star['name']
        link = star['link']

        # Crawl each contestant's photos below, collecting every image URL in the list pic_urls
        pic_urls = []
        try:
            # [1] Fetch the contestant's Baike page
            response_baike = requests.get(link, headers=headers)
            soup_baike = BeautifulSoup(response_baike.text, 'lxml')

            # [2] Locate the album tag on the page and extract the album URL
            summary_div = soup_baike.find_all('div', {'class': 'summary-pic'})
            a_label = summary_div[0].a
            album_url = 'https://baike.baidu.com' + a_label['href']

            # [3] Walk through the album
            response_album = requests.get(album_url, headers=headers)
            soup_album = BeautifulSoup(response_album.text, 'lxml')
            pic_list_label = soup_album.find('div', {'class': 'pic-list'})

            # [4.1] If high-resolution images are not needed, the commented-out
            # code below can be used instead
            # for imgz in pic_list_label.find_all('img'):
            #     pic_urls.append(imgz['src'])

            # [4.2] Fetch the high-resolution images
            for a in pic_list_label.find_all('a'):
                big_pic_url = 'https://baike.baidu.com' + a['href']
                response_big_album = requests.get(big_pic_url, headers=headers)
                soup_big_album = BeautifulSoup(response_big_album.text, 'lxml')
                big_img = soup_big_album.find('img', {'id': 'imgPicture'})
                pic_urls.append(big_img['src'])
        except Exception as e:
            print(e)

        # Download every image in pic_urls into a folder named after the contestant
        down_pic(name, pic_urls)
def down_pic(name, pic_urls):
    '''
    Download every image in the link list pic_urls into a folder named after the contestant.
    '''
    path = 'work/' + 'pics/' + name + '/'
    if not os.path.exists(path):
        os.makedirs(path)

    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            string = str(i + 1) + '.jpg'
            with open(path + string, 'wb') as f:
                f.write(pic.content)
                print('Downloaded image %s: %s' % (str(i + 1), str(pic_url)))
        except Exception as e:
            print('Failed to download image %s: %s' % (str(i + 1), str(pic_url)))
            print(e)
            continue
```
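A hypothetical standalone call to `down_pic`, with placeholder URLs, just to show the on-disk layout it produces:

```python
# Assumes down_pic is defined as above; both URLs are placeholders
down_pic('test_star', [
    'https://example.com/a.jpg',
    'https://example.com/b.jpg',
])
# Resulting layout: work/pics/test_star/1.jpg, work/pics/test_star/2.jpg
```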
## 4. Print the paths of all crawled images

```python
def show_pic_path(path):
    '''
    Walk through every crawled image and print each one's absolute path.
    '''
    pic_num = 0
    for (dirpath, dirnames, filenames) in os.walk(path):
        for filename in filenames:
            pic_num += 1
            print("Photo %d: %s" % (pic_num, os.path.join(dirpath, filename)))
    print("Crawled %d photos of Youth With You Season 2 contestants in total" % pic_num)
if __name__ == '__main__':
    # Crawl the contestant information for Youth With You Season 2 from Baidu Baike; returns HTML
    html = crawl_wiki_data()

    # Parse the HTML into contestant records and save them as a JSON file
    parse_wiki_data(html)

    # Crawl the photos from each contestant's Baidu Baike page and save them
    crawl_pic_urls()

    # Print the paths of the crawled contestant photos
    show_pic_path('/home/aistudio/work/pics/')

    print("All information has been crawled!")
```