python爬⾍笔记⼀:爬取⾖瓣中指定的明星所有图⽚------------------------------------------------------------------------
打开页⾯F12进⼊开发者⼯具,查看 下载的页数,以及每页最多显⽰30张
a_list = content.find_all('div', attrs={'class': 'cover'})  # 获取网页中所有封面 div 标签对象
picture_list = []
for d in a_list:
plist=d.find('img')['src']
picture_list.append(plist)
获取(共348张)并根据正则规则获取纯数字:348
clist = content.find('span', attrs={'class', 'count'}) # 获取
ret = re.findall(r'\d+', clist.get_text())
写得太痛苦,直接放完整代码,⾃⾏看看理解就是了:
import re
import time
import requests
import os
from bs4 import BeautifulSoup
import lxml
# NOTE: headers must be sent with every request, otherwise Douban returns an
# empty response body; keeping only the 'User-Agent' entry is sufficient.
# The remaining entries below are kept (commented out) as a record of the
# full browser request; the Cookie/Host lines were truncated by extraction.
headers={
# 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
# 'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8',
# 'Cache-Control':'max-age=0',
# 'Connection':'keep-alive',
# 'Cookie':'ll="118254"; bid=bzf7LGz3pZA; ... (truncated)',
# 'sec-ch-ua':'" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
# 'sec-ch-ua-mobile':'?0',
# 'Sec-Fetch-Dest':'document',
# 'Sec-Fetch-Mode':'navigate',
# 'Sec-Fetch-Site':'none',
# 'Sec-Fetch-User':'?1',
# 'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}
# POST payload; empty because the photo pages require no form data.
data = {}
def get_poster_url(res):
    """Extract every photo URL from one Douban photo-list page.

    Parameters:
        res: a requests Response whose ``.text`` is the photo page HTML.
    Returns:
        list[str]: the ``src`` of the <img> inside each ``div.cover``.
    """
    # Build a BeautifulSoup tree from the page source for easy querying.
    content = BeautifulSoup(res.text, 'lxml')
    # NOTE: attrs must be a dict mapping attribute -> value; the original
    # tutorial used a set literal {'class','cover'} by mistake.
    a_list = content.find_all('div', attrs={'class': 'cover'})
    # Each cover div wraps exactly one <img>; collect its src attribute.
    return [d.find('img')['src'] for d in a_list]
def getCount(id, session):
    """Fetch the first photo page and read the total photo count.

    Parameters:
        id: Douban celebrity id (string or int).
        session: a requests.Session used for the request (keeps cookies).
    Returns:
        [res, count]: the Response of the first page and the total number
        of photos (0 when the count span is missing or has no digits).
    """
    url = 'https://movie.douban.com/celebrity/{0}/photos/'.format(id)
    # POST with the (empty) module-level `data`, matching fire()'s requests.
    res = session.post(url=url, data=data, headers=headers)
    content = BeautifulSoup(res.text, 'lxml')
    # The page shows "(共348张)" inside <span class="count">; pull the digits.
    clist = content.find('span', attrs={'class': 'count'})
    ret = re.findall(r'\d+', clist.get_text()) if clist is not None else []
    if ret:
        return [res, int(ret[0])]
    return [res, 0]
def fire(mc, id, session):
    """Crawl every photo page of one celebrity and download all images.

    Parameters:
        mc: display name of the celebrity (used only for progress output).
        id: Douban celebrity id.
        session: shared requests.Session.
    """
    # First page is fetched by getCount(), which also tells us the total.
    res, pagenums = getCount(id, session)
    if pagenums == 0:
        return
    page = 0
    # Douban paginates 30 photos per page; `i` is the start offset.
    for i in range(0, pagenums, 30):
        print("\n开始爬取{0}{1}页:{2}~{3}张/共{4}张\n".format(mc,page+1,page*30+1,(page+1)*30,pagenums))
        url = 'https://movie.douban.com/celebrity/{0}/photos/?type=C&start={1}&sortby=like&size=a&subtype=a'.format(id, i)
        if i > 0:
            # Page 0 was already fetched inside getCount(); reuse that response.
            res = session.post(url=url, data=data, headers=headers)
        piclist = get_poster_url(res)
        download_picture(piclist, session)
        page += 1
        # Be polite to the server: pause one second between pages.
        time.sleep(1)
def download_picture(pic_l, session):
    """Download each image URL in `pic_l` into the local ./picture folder.

    Parameters:
        pic_l: iterable of image URLs.
        session: shared requests.Session used for the GET requests.
    """
    if not os.path.exists('picture'):
        os.mkdir('picture')
    for i in pic_l:
        print("\r开始下载图⽚{0}".format(i))
        res = session.get(i)
        # Last path segment of the URL is the image file name.
        p_name = i.split('/')[-1]
        # os.path.join keeps this portable (the original hard-coded '\\').
        with open(os.path.join('picture', p_name), 'wb') as f:
            f.write(res.content)
# (display name, Douban celebrity id) pairs to crawl.
mxarr=[('⼭⼝百惠','1014823'),('刘涛','1011562')]
if __name__ == '__main__':
    # A Session automatically maintains cookies across requests, so all
    # pages of one crawl share the same login/anti-bot state.
    session = requests.Session()
    for mc, cid in mxarr:
        fire(mc, cid, session)