# 爬取目标网页内容并保存在word
import os
import re
import sys
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request

import docx
from bs4 import BeautifulSoup
from docx import Document
from docx.oxml.ns import qn  # Chinese (East Asian) font settings
from docx.shared import Pt  # font settings
from docx.shared import RGBColor  # font settings
def getHtlm(url):
    """Download *url* and return the parsed page, or 0 for a "404" page.

    Args:
        url: Address of the page to fetch. Surrounding whitespace (for
            example the newline left behind by ``readline``) is stripped
            before the request is made.

    Returns:
        The parsed ``BeautifulSoup`` document, or the integer ``0`` when the
        server answers with HTTP status 404 or when the page's ``<title>``
        text starts with ``"404"``.

    Raises:
        urllib.error.URLError: For network failures and for HTTP errors
            other than 404.
    """
    try:
        # The context manager closes the connection once parsing is done.
        with urllib.request.urlopen(url.strip()) as page:
            soup = BeautifulSoup(page, 'html.parser')
    except urllib.error.HTTPError as err:
        # A real 404 status is reported the same way as a "404" title page.
        if err.code == 404:
            return 0
        raise

    titles = soup.select('title')
    # A page with no <title>, or one without a single text child, cannot be
    # recognised as a 404 page by its title, so it is returned as-is.
    title_text = titles[0].string if titles else None
    if title_text is not None and title_text[0:3] == '404':
        return 0
    return soup
def getHtlmcode(url1):
    """Build the one-line summary string for the page at *url1*.

    The summary is assembled from the first ``<h2>`` inside the first
    ``div[class=titArea]`` and from the second and fourth ``<dd>`` elements
    inside the first ``div[class=detArea]``.

    Args:
        url1: Address of the page to summarise; passed to ``getHtlm``.

    Returns:
        ``"<h2 text>:<dd[1] text>:<dd[3] text>"``. When the fourth ``<dd>``
        has no single text child, the last part is omitted and the result
        ends with the second colon. Returns the integer ``0`` when
        ``getHtlm`` reports a 404 page or when the expected elements are
        missing from the page.
    """
    htlm = getHtlm(url1)
    if htlm == 0:
        return 0

    title_area = htlm.select_one('div[class=titArea]')
    detail_area = htlm.select_one('div[class=detArea]')
    if title_area is None or detail_area is None:
        return 0

    heading = title_area.select_one('h2')
    fields = detail_area.select('dd')
    # Indexes 1 and 3 are read below, so at least four <dd> are required.
    if heading is None or len(fields) < 4:
        return 0

    # ``.string`` is None when a tag does not hold exactly one text child;
    # treat that as empty text instead of failing on the concatenation.
    gen = (heading.string or '') + ':'
    bb = (fields[1].string or '') + ':'
    cc = fields[3].string

    if cc is None:
        return gen + bb
    return gen + bb + cc
# Script body: read URLs line by line from 企业.txt, fetch a summary string for
# each one and append it to a single paragraph of a new Word document.
i = 0
document = Document()  # new Word document
# One paragraph created outside the loop; every result is added to it as a
# run, which avoids blank lines between entries.
p = document.add_paragraph('')

# The input file is expected next to this script, so no path is needed.
with open("企业.txt", "r") as f:
    # Only the first 3 lines of the input file are processed.
    while i < 3:
        print(i)  # progress counter, useful for locating a failing line
        content = f.readline()
        if not content:  # end of file
            break
        # NOTE(review): indentation was lost in the original listing; the
        # counter is assumed to count every line read, not only successes.
        i += 1

        url = content.strip()  # drop the trailing newline from readline
        if not url:
            continue  # blank line: nothing to fetch

        htlm = getHtlmcode(url)
        time.sleep(1)  # pause between requests
        if htlm != 0:
            run = p.add_run(htlm + '\n')
            # Setting the font name first creates the rFonts element that the
            # East Asian override below modifies.
            run.font.name = u'宋体'
            run.font.size = Pt(15)
            r = run._element
            r.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

# An earlier, disabled variant of this script saved a separate document after
# 100 lines (路线1.docx) and after 200 lines (路线2.docx); everything is now
# written to one file.
document.save('路线1.docx')