zl程序教程

您现在的位置是:首页 >  后端

当前栏目

Python读取HTML中指定元素生成Excel文件示例

Python文件HTML 示例 生成 读取 指定 元素
2023-06-13 09:15:24 时间

使用Python 2.7编写的示例:读取HTML中的指定元素,并生成Excel文件。

代码如下:


#coding=gbk
import codecs
import os
import string
import time

import xlrd
import xlwt
from bs4 import BeautifulSoup
from xlrd import open_workbook

class LogMsg(object):
    """File logger that writes every message at one fixed severity.

    The instance attaches a ``logging.FileHandler`` to the logger returned by
    ``logging.getLogger()`` and sets that logger's level.  ``output`` then
    emits each message at the logger's effective level, so a message is never
    filtered out by the level chosen at construction time.

    Any failure is reported with a short message on stdout and terminates the
    process with exit status 1.
    """

    # Numeric values of logging.DEBUG, INFO, WARNING, ERROR and CRITICAL.
    _LEVELS = (10, 20, 30, 40, 50)

    def __init__(self, logfile, Level=0):
        """Open *logfile* for logging and select the severity.

        Args:
            logfile: Path of the log file handed to ``logging.FileHandler``.
            Level: 10, 20, 30, 40 or 50 selects DEBUG, INFO, WARNING, ERROR
                or CRITICAL; any other value selects ``logging.NOTSET``.

        Raises:
            SystemExit: With code 1, after printing ``loginiterror!``, when
                the logger cannot be set up (e.g. the file cannot be opened).
        """
        try:
            import logging

            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            formatter = logging.Formatter(
                "[%(asctime)s]:%(message)s", "%Y%m%d%H:%M:%S"
            )
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)
            if Level in self._LEVELS:
                # Look the value up so the logger always receives the
                # canonical int, even if the caller passed e.g. 10.0.
                self.logger.setLevel(self._LEVELS[self._LEVELS.index(Level)])
            else:
                self.logger.setLevel(logging.NOTSET)
        except Exception:
            # Single-argument print(...) parses the same on Python 2 and 3.
            print("loginiterror!")
            raise SystemExit(1)

    def output(self, logInfo):
        """Write *logInfo* at the logger's current effective level.

        When the effective level is not one of the five standard severities
        the message is written at INFO.

        Raises:
            SystemExit: With code 1, after printing ``logoutputerror!``, when
                the logging call raises.
        """
        level = self.logger.getEffectiveLevel()
        try:
            if level in self._LEVELS:
                self.logger.log(level, logInfo)
            else:
                self.logger.info(logInfo)
        except Exception:
            print("logoutputerror!")
            raise SystemExit(1)

    def close(self):
        """Detach the file handler from the logger and close its file.

        Raises:
            SystemExit: With code 1, after printing ``logclosederror!``, when
                detaching or closing fails.
        """
        try:
            self.logger.removeHandler(self.hdlr)
            # Removing the handler alone leaves the log file open; close it
            # so the file descriptor is released.
            self.hdlr.close()
        except Exception:
            print("logclosederror!")
            raise SystemExit(1)

# Run-wide settings, evaluated once when the module is loaded.

# Directory that is scanned for .html files and receives the .xls output.
DATAPATH = "/data/pyExample/"

# time.strftime() formats the current local time when no time tuple is given.
Logtime = time.strftime("%Y%m%d%H%M%S")  # second resolution, used in the .xls name
logFileTime = time.strftime("%Y%m%d")    # day resolution, used in the log file name

# The log file name carries the day stamp; level 20 selects INFO in LogMsg.
Logfile = "/data/pyExample/logs/htmlparser_%s.log" % (logFileTime,)
log = LogMsg(Logfile, 20)

# Name of the workbook written by this run.
XLSname = "".join(("dangjian_", Logtime, ".xls"))


# Captions written to row 0 of the sheet, in column order.
HEADER = (
    "内容类型",
    "栏目名称",
    "栏目编号",
    "内容名称",
    "时长",
    "关键字",
    "看点",
    "作者",
    "来源",
    "子内容1",
    "子内容2",
)

# Positions, within the flattened list of <td> texts of one HTML file, of the
# cells that feed the sheet.
FOLDER_CELL = 6        # folder name; the same cell is reused as the keyword
NAME_CELL = 4          # content name
VIDEO_CELL = 10        # first sub-content (video) name
DURATION_CELL = 16     # cell whose digits give the duration
DESCRIPTION_CELL = 36  # description; also the highest index that is read


def extract_cells(path):
    """Return the GBK-encoded text of every <td> in the HTML file at *path*.

    The file is decoded as GBK and parsed one line at a time, so only cells
    whose markup sits on a single line are picked up.
    """
    # "with" closes the file even if decoding raises part-way through.
    with codecs.open(path, "r", "gbk") as html_file:
        lines = html_file.readlines()
    if not lines:
        log.output("notline")

    cells = []
    for line in lines:
        # strip() also removes the newline, so an empty result -- not "\n" --
        # is what marks a blank line.
        if not line.strip():
            log.output("该处是空行")
            continue
        # NOTE(review): this removes every space, including those inside
        # tags; the literal may originally have been an HTML entity such as
        # "&nbsp;" -- confirm against the real input files.
        line = line.replace(" ", "")
        soup = BeautifulSoup(line)
        for td in soup.find_all("td"):
            cells.append(td.text.encode("gbk"))
    return cells


def duration_text(cell):
    """Return the digits of *cell*, read as one integer, times 60, as text.

    Presumably this converts minutes to seconds -- confirm with the data
    owner.  Returns None when *cell* contains no ASCII digit.
    """
    digits = "".join(ch for ch in cell if ch in string.digits)
    if not digits:
        return None
    return "%i" % (int(digits) * 60)


def build_row(cells):
    """Map the flattened <td> texts of one file to the 11 sheet columns.

    Returns None when *cells* is too short to hold every required position or
    when the duration cell has no digits, so the caller can skip the file
    instead of aborting the whole run.
    """
    if len(cells) <= DESCRIPTION_CELL:
        return None
    duration = duration_text(cells[DURATION_CELL])
    if duration is None:
        return None
    return [
        "",
        cells[FOLDER_CELL],
        "",
        cells[NAME_CELL],
        duration,
        cells[FOLDER_CELL],
        cells[DESCRIPTION_CELL],
        "管理员",
        "华数编辑",
        cells[VIDEO_CELL],
        "",
    ]


if __name__ == "__main__":
    wbk = xlwt.Workbook(encoding="gbk")
    sheet = wbk.add_sheet("基本内容导入模板")
    for col, caption in enumerate(HEADER):
        sheet.write(0, col, caption)

    k = 0  # number of data rows written so far
    for f in os.listdir(DATAPATH):
        if os.path.splitext(f)[1] != ".html":
            continue
        log.output("当前文件:" + f)
        content = extract_cells(os.path.join(DATAPATH, f))

        # Dump every cell with its position so the fixed indices above can be
        # checked against the page layout.
        for idx, cell in enumerate(content):
            print("%s , %s" % (idx, cell))
            log.output(cell)
            log.output(idx)
        print("----------------------------------------")

        row = build_row(content)
        if row is None:
            log.output(
                "skip %s: need more than %d <td> cells and a numeric duration"
                % (f, DESCRIPTION_CELL)
            )
            continue

        # Echo folder, content name, duration, keyword, description and video
        # name -- the columns that come from the HTML file.
        for col in (1, 3, 4, 5, 6, 9):
            print(row[col])
        log.output("输出xls数据:" + ",".join(row) + ",")
        print(k)
        for col, value in enumerate(row):
            sheet.write(k + 1, col, value)
        k += 1

    wbk.save(DATAPATH + XLSname)

    print("=========================================")