zl程序教程

您现在的位置是:首页 >  后端

当前栏目

Python通过解析网页实现看报程序的方法

Python网页方法程序 实现 解析 通过
2023-06-13 09:15:40 时间

本文所述实例基于Python实现了一个看报程序:通过解析网页查看图片报纸《参考消息》,并将当天的图片报纸自动下载到本地供查看。具体实现代码如下:

#coding=gbk
import os
import re
import socket
import sys
import time
import urllib2

# Default timeout, in seconds, for every socket created after this call, so
# that an unresponsive server cannot block the script forever.
# The socket module is called directly instead of through urllib2.socket,
# which only works because urllib2 happens to import socket internally.
timeout = 10
socket.setdefaulttimeout(timeout)

# Download the newspaper site's home page into home_page; on any failure the
# error is printed and the script exits with a non-zero status.
home_url = "http://www.hqck.net"
home_page = ""
try:
    home_page_context = urllib2.urlopen(home_url)
    try:
        home_page = home_page_context.read()
    finally:
        # Release the connection even if the read fails part-way.
        home_page_context.close()
except urllib2.HTTPError as e:
    # The server answered, but with an error status code.
    print(e.code)
    sys.exit(1)
except urllib2.URLError as e:
    # No HTTP response at all; URLError carries a reason, not a status
    # code, so reading e.code here would itself raise AttributeError.
    print(e.reason)
    sys.exit(1)
except Exception as e:
    # Top-level catch-all (e.g. a socket timeout during read()): report the
    # actual exception instead of referencing an unbound name.
    print(e)
    sys.exit(1)
else:
    print("Read home page finishd.")
    print("-------------------------------------------------")

# Locate the link to the newest issue in home_page and build its absolute URL
# in full_news_url.
#
# The pattern matches the whole <a class="item-baozhi" ...> anchor.  Whitespace
# between attributes is matched with \s+ and "[^>]*" tolerates additional
# attributes (such as rel=...) after href, so small markup changes do not
# break the match.
reg_str = (r'<a\s+class="item-baozhi"\s+'
           r'href="/arc/jwbt/ckxx/\d{4}/\d{4}/\w+\.html"'
           r'[^>]*>\s*<span\s+class.+>.+</span>\s*</a>')

news_url_reg = re.compile(reg_str)

today_cankao_news = news_url_reg.findall(home_page)

if not today_cankao_news:
    print("Cannot find today's news!")
    sys.exit(1)

# The first matching anchor is used as the latest issue.
my_news = today_cankao_news[0]
print("Latest news link = " + my_news)
print("")

# Cut the site-relative path "/arc/... .html" out of the matched anchor tag.
url_s = my_news.find("/arc/")
url_e = my_news.find(".html") + len(".html")

print("Link index = [" + str(url_s) + "," + str(url_e) + "]")
my_news = my_news[url_s:url_e]
print("part url = " + my_news)

full_news_url = home_url + my_news
print("full url = " + full_news_url)
print("")

# Prepare the download directory: <root>\<YYYY-MM-DD>\ for today's issue.
image_folder = "E:\\new_folder\\"

# With no explicit time tuple, strftime formats the current local time.
today_num = time.strftime("%Y-%m-%d")

# Keep the trailing separator: file names are later built by plain string
# concatenation onto image_folder.
image_folder = image_folder + today_num + "\\"
if not os.path.exists(image_folder):
    # makedirs also creates the missing root folder, so one call is enough.
    os.makedirs(image_folder)
print("News image folder = " + image_folder)
print("")

# Fetch the first page of the issue and read the total page count from it.
#
# context_uri is the issue URL without its ".html" suffix; page 1 is
# "<context_uri>.html" and later pages are "<context_uri>_<N>.html".
context_uri = full_news_url[0:-5]

first_page_url = context_uri + ".html"
try:
    first_page_context = urllib2.urlopen(first_page_url)
    try:
        first_page = first_page_context.read()
    finally:
        first_page_context.close()
except urllib2.HTTPError as e:
    print(e.code)
    sys.exit(1)
except urllib2.URLError as e:
    # Connection-level failure: there is no status code, only a reason.
    print(e.reason)
    sys.exit(1)

# The page count is read from the first "共<N>页" ("<N> pages in total") text.
# NOTE(review): assumes the page bytes use the same encoding as this source
# file (gbk per the coding line), otherwise find() cannot match - confirm.
tot_page_index = first_page.find("共")
if tot_page_index < 0:
    print("Cannot find the total page count!")
    sys.exit(1)

# Only a short window after the marker is inspected.
tmp_str = first_page[tot_page_index:tot_page_index + 10]
end_s = tmp_str.find("页")

# len("共") replaces the hard-coded offset 2 so the slice follows the byte
# length of the marker literal.
page_num = tmp_str[len("共"):end_s].strip()
if end_s < 0 or not page_num.isdigit():
    print("Cannot find the total page count!")
    sys.exit(1)
print(page_num)

page_count = int(page_num)
print("Total " + page_num + " pages:")
print("")

# Download every page image of the issue into image_folder.
#
# download_suc ends up False if any page could not be fetched or saved.  A
# page whose HTML cannot be loaded or contains no image URL aborts the loop;
# a failure while saving one image is reported and the loop continues.
page_index = 1
download_suc = True

# Compiled once: the pattern does not change between pages.
reg_str = r"http://image\S+jpg"
image_reg = re.compile(reg_str)

for page_index in range(1, page_count + 1):
    # Page 1 has no numeric suffix; page N (N > 1) is "<stem>_N.html".
    page_url = context_uri
    if page_index > 1:
        page_url = page_url + "_" + str(page_index)
    page_url = page_url + ".html"
    print("News page link = " + page_url)

    try:
        news_img_page_context = urllib2.urlopen(page_url)
    except urllib2.URLError as e:
        print(e.reason)
        download_suc = False
        break

    try:
        news_img_page = news_img_page_context.read()
    finally:
        news_img_page_context.close()

    image_results = image_reg.findall(news_img_page)
    if not image_results:
        print("Cannot find news page " + str(page_index) + "!")
        download_suc = False
        break

    # The first image URL found on the page is used as the newspaper scan.
    image_url = image_results[0]
    print("News image url = " + image_url)

    image_name = image_folder + "page_" + str(page_index) + ".jpg"
    print("Getting image...")
    try:
        # Opening the URL is inside the try so that a failed image request
        # is reported like any other save failure instead of crashing.
        news_image_context = urllib2.urlopen(image_url)
        try:
            with open(image_name, "wb") as imgf:
                # Stream in 10 KiB chunks instead of loading the whole image.
                while True:
                    data = news_image_context.read(1024 * 10)
                    if not data:
                        break
                    imgf.write(data)
        finally:
            news_image_context.close()
    except Exception as e:
        download_suc = False
        print("Save image " + str(page_index) + " failed!")
        # str() is required: concatenating the exception object itself to a
        # string would raise TypeError inside the handler.
        print("Unexpected error: " + type(e).__name__ + ": " + str(e))
    else:
        print("Save image " + str(page_index) + " succeed!")
    print("")

# Final report: show the target folder only when every page was saved.
if download_suc:
    print("News download succeed! Path = \"" + str(image_folder) + "\"")
    print("Enjoy it! ^^")
else:
    print("news download failed!")