Python example: crawling torrent downloads from web pages with urllib2


Grabbing torrents with the urllib2 and re modules

Approach

1. Log in to the forum programmatically (only needed for boards that require login to view; a minimal login sketch follows this list).

2. Visit the target board.

3. Traverse the threads: fetch a given listing page, then collect the URLs of every thread on it.

4. Visit each thread URL in turn and extract the torrent download address from the page source, using regular expressions or a third-party page-parsing library (see the parser-based sketch after the code listing).

5. Visit the torrent page and download the .torrent file.
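
Note that step 1 is skipped in the script below (the board crawled here did not need a login; cookielib is imported but never used). For a board that does require it, a minimal sketch, assuming a hypothetical login.php form with username/password fields (the actual URL and field names depend on the forum software):

import urllib
import urllib2
import cookielib

# keep cookies across requests so the login session persists
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)

# hypothetical form fields and URL -- adjust to your forum software
loginData = urllib.urlencode({"username": "yourname", "password": "yourpass"})
opener.open("http://xxx.yyy.zzz/login.php", loginData)
# later urllib2.urlopen() calls now send the session cookie automatically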

The code is as follows:


import urllib
import urllib2
import cookielib
import re
import sys
import os

# site is the website address | fid is the board id
site = "http://xxx.yyy.zzz/"
source = "thread0806.php?fid=x&search=&page="

# directory where downloaded .torrent files are saved
btSave = "./clyzwm/"
if os.path.isdir(btSave):
    print btSave + " already exists"
else:
    os.mkdir(btSave)

logfile = "./clyzwm/down.log"
errorfile = "./clyzwm/error.log"
sucfile = "./clyzwm/success.log"

# default request headers (note: the POST below sets its own User-Agent)
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36",
           "Referer": "http://xxx.yyy.zzz/"}

def btDown(url, dirPath):
    logger(logfile, "download file: " + url)
    try:
        # pageCode = urllib2.urlopen(url).read()
        # print pageCode
        # pull the "ref" token out of the torrent page URL
        btStep1 = re.findall("http://[\w]+\.[\w]+\.[\w]{0,4}/[\w]{2,6}\.php\?[\w]{2,6}=([\w]+)", url, re.I)
        # print btStep1
        if len(btStep1) > 0:
            ref = btStep1[0]
            downsite = ""
            downData = {}
            if len(ref) > 20:
                # long ref: the download form lives on the same site and also needs a hidden "reff" field
                downsite = re.findall("http://www\.[\w]+\.[\w]+/", url)[0]
                downsite = downsite + "download.php"
                reff = re.findall("input\stype=\"hidden\"\sname=\"reff\"\svalue=\"([\w=]+)\"", urllib2.urlopen(url).read(), re.I)[0]
                downData = {"ref": ref, "reff": reff, "submit": "download"}
            else:
                downsite = "http://www.downhh.com/download.php"
                downData = {"ref": ref, "rulesubmit": "download"}
            # print "bt site - " + downsite + "\ndownData: "
            # print downData
            # POST the form to fetch the .torrent data
            downData = urllib.urlencode(downData)
            downReq = urllib2.Request(downsite, downData)
            downReq.add_header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36")
            downPost = urllib2.urlopen(downReq)
            stream = downPost.read(-1)
            if len(stream) > 1000:  # anything shorter is assumed to be an error page, not a torrent
                downPost.close()
                name = btStep1[0] + ".torrent"
                fw = open(dirPath + name, "wb")  # binary mode: torrent files are not text
                fw.write(stream)
                fw.close()
                logger(sucfile, url + "\n")
            else:
                logger(errorfile, url + "\n")
    except urllib2.URLError, e:
        print e.reason


def logger(logfile, msg):
    print msg
    fw = open(logfile, "a")
    fw.write(msg)
    fw.close()

for i in range(1, 1000):
    logger(logfile, "\n\n\n@ page " + str(i) + " ...")
    part = site + source + str(i)

    content = urllib2.urlopen(part).read()
    content = content.decode("gbk").encode("utf8")  # the board pages are GBK-encoded
    # print content

    # collect the thread URLs on this listing page
    pages = re.findall("<a\s+href=\"(htm_data/[\d]+/[\d]+/[\d]+\.html).*?<\/a>", content, re.I)
    # print pages

    for page in pages:
        page = site + page
        # logger(logfile, "\n# visiting " + page + " ...")
        pageCode = urllib2.urlopen(page).read()
        # print pageCode
        # look for the redirect wrapper that points at the torrent page
        zzJump = re.findall("http://www\.viidii\.info/\?http://[\w]+/[\w]+\?[\w]{2,6}=[\w]+", pageCode)
        # zzJump = re.findall("http://www\.viidii\.info/\?http://[\w/\?=]*", pageCode)
        if len(zzJump) > 0:
            zzJump = zzJump[0]
            # print "- jump page - " + zzJump
            zzPage = re.findall("http://[\w]+\.[\w]+\.[\w]+/link[\w]?\.php\?[\w]{2,6}=[\w]+", pageCode)
            if len(zzPage) > 0:
                zzPage = zzPage[0]
                logger(logfile, "\n- torrent page - " + zzPage)
                btDown(zzPage, btSave)
            else:
                logger(logfile, "\n. NOT FOUND .")
        else:
            logger(logfile, "\n... NOT FOUND ...")
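
Step 4 notes that a third-party page-parsing library can stand in for the regular expressions. As an illustration only (the script above uses re exclusively), the thread-link extraction could look roughly like this with BeautifulSoup, assuming content already holds the decoded listing page:

import re
from bs4 import BeautifulSoup  # third-party: pip install beautifulsoup4

soup = BeautifulSoup(content, "html.parser")
# match the same htm_data/.../....html thread links as the regex version
pages = [a["href"] for a in soup.find_all("a", href=re.compile(r"htm_data/\d+/\d+/\d+\.html"))]

A parser tolerates attribute-order and whitespace changes that would break the hand-written regex, at the cost of an extra dependency.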