Implementation code for a multi-threaded crawler that batch-downloads pcgames picture URLs and saves them to XML

2023-06-13 09:14:44

The implementation code is as follows:


#coding=gbk
from xml.dom import minidom, Node
import urllib2, re, os

def readsrc(src):
    # fetch the raw HTML of a page; return None on failure
    try:
        url = urllib2.urlopen(src)
        content = url.read()  # .decode("utf-8")
        return content
    except:
        print "error"
        return None
def pictype(content):
    """
    Scrape the site's navigation bar to get the picture categories.
    Returns a list of dicts: "addr" is the link of a category, "name" is its name.
    Returns None on error.
    """
    p = re.compile(r"<ul>(.*)</ul>", re.S)
    r = p.search(content)
    if r:
        content = r.group()
    else:
        return None
    p = re.compile(r"<li\s*.*?>\s*<a href *= *\"(?P<addr>.*?)\">(?P<name>.*?)\s*</a>\s*</li>")

    l = [i.groupdict() for i in p.finditer(content)]
    l = l[1:]  # drop the first navigation entry
    if len(l): return l
    else: return None
def pageinfo(src):
    """
    Get the details of one listing page.
    Returns a list of dicts with:
    name:    picture name
    cutaddr: thumbnail address
    picaddr: address of the picture's own page
    """
    d = os.path.split(src)[0]
    try:
        url = urllib2.urlopen(src)
        content = url.read()  # .decode("utf-8")
    except:
        print "error"
        return None
    # find all the picture info in a page
    p = re.compile(r"<ul.*?>(.*?)</ul>", re.S)
    r = p.findall(content)
    if not r: return None
    r = r[1]  # the picture list is in the second <ul> block
    p = re.compile(r"<li><a href=\"(?P<picaddr>.*?)\".*?><img.*?alt=\"(?P<name>.*?)\" *src=\"(?P<cutaddr>.*?)\" */></a>.*?</li>")
    l = [i.groupdict() for i in p.finditer(r)]
    for i in l:
        i["picaddr"] = d + "/" + i["picaddr"]
    if len(l): return l
    else: return None

def nextpageaddr(src):
    """
    Extract the address of the next page from a page's HTML source.
    Returns None on the last page.
    """
    content = readsrc(src)
    p = re.compile(r"<a class=\"next\" href=\"(.*?)\">.*?</a>")
    r = p.search(content)
    if r:
        return os.path.dirname(src) + "/" + r.group(1)
    else:
        return None
def picinfoaddr(src):
    """
    Takes the URL of a picture page in a gallery.
    Returns the address of the page listing all pictures of that gallery.
    """
    content = readsrc(src)
    p = re.compile(r"<div class=\"picinfo\">.*?<a href=\"(?P<addr>.*?)\".*?>.*?</div>", re.S)
    r = p.search(content)
    if r:
        return os.path.dirname(src) + "/" + r.group(1)
    else:
        return None
def parseinfo(content):
    """
    Parse the HTML of a gallery's "all pictures" page and return its details:
    kw:    keywords
    title: title
    type:  category
    pic:   list of picture addresses; append _220x165, _medium or _small for other sizes
    """
    info = {}
    temp = str()

    # title
    temp = ""
    r = re.search("<h1>(.*?)</h1>", content)  # get the picture title
    if r:
        temp = r.group(1)
    info["title"] = temp

    # keyword
    temp = ""
    r = re.search("<meta name=\"keywords\" content=\"(.*?)\"/>", content)
    if r:
        temp = r.group(1)
    info["kw"] = temp

    # type
    r = re.findall("<i><a.*?>(.*?)</a></i>.*?&gt", content)
    if r:
        info["type"] = ":".join(r)
    else:
        info["type"] = ""
    r = re.search("<ul class=\".*?\">(.*?)</ul>", content, re.S)
    if not r: return None
    content = r.group(1)  # filter content
#   print content
    r = re.findall("<a href=\".*?<img.*?src=\"(.*?)\".*?</a>", content)

    for index, i in enumerate(r):
        r[index] = i[0:i.rfind("_")]  # strip the size suffix after the last "_"
#       print r[index]
    info["pic"] = r
    return info
import threading

class mthread(threading.Thread):
    def __init__(self, tp, addr, lock):
        threading.Thread.__init__(self)
        self.tp = tp
        self.lock = lock
        self.addr = addr
        self.thread_stop = False
        self.picdoc = None

    def run(self):
        # all nodes are created from the shared module-level document "doc"
        self.picdoc = doc.createElement("urlclass")
#       print self.tp
        self.picdoc.setAttribute("type", self.tp)
        while self.addr:
            m = pageinfo(self.addr)
            if not m:  # stop if the page could not be fetched or parsed
                break
            for i in m:
#               print i["picaddr"]
                picaddr = picinfoaddr(i["picaddr"])
#               print picaddr
                info = parseinfo(readsrc(picaddr))
                name = info["title"]

                picture = doc.createElement("picture")

                title = doc.createElement("title")
                title.appendChild(doc.createTextNode(info["title"]))
                picture.appendChild(title)

                keyword = doc.createElement("keywords")
                keyword.appendChild(doc.createTextNode(info["kw"]))
                picture.appendChild(keyword)

                tp = doc.createElement("pictype")
                tp.appendChild(doc.createTextNode(info["type"]))
                picture.appendChild(tp)

                cuturl = doc.createElement("piccut")
                cuturl.appendChild(doc.createTextNode(i["cutaddr"]))
                picture.appendChild(cuturl)

                urls = doc.createElement("urls")
                self.lock.acquire()
                print "downloading", name
                self.lock.release()
                for picurl in info["pic"]:
                    singleurl = doc.createElement("url")
                    singleurl.appendChild(doc.createTextNode(picurl + ".jpg"))
                    urls.appendChild(singleurl)

                picture.appendChild(urls)
                self.picdoc.appendChild(picture)
            self.addr = nextpageaddr(self.addr)
#       f = open("c:\\" + self.tp + ".xml", "w")
#       f.write(doc.toprettyxml(indent=""))
#       f.close()

    def stop(self):
        self.thread_stop = True


path="C:\\pict\\"#下载的路径
#importsys
sys.exit(12)
content=readsrc("http://photos.pcgames.com.cn/cate/3/1.html")
r=pictype(content)
lt=[]
doc=minidom.Document()
root=doc.createElement("url_resource")
root.setAttribute("type","url")
root.setAttribute("urltype","image")
root.setAttribute("imgfmt","jpg")
doc.appendChild(root)
lock=threading.RLock()
foriaddrinr:
   print"downloadingtype:",iaddr["name"]
   addr=iaddr["addr"]
   th=mthread(iaddr["name"],addr,lock)
   lt.append(th)
   th.start()
fortinlt:
   t.join()
   root.appendChild(t.picdoc)

print"write"
f=open("c:\\"+"urls"+".xml","w")
f.write(doc.toprettyxml(indent=""))
f.close()
printdoc.toprettyxml()
print"end"