您现在的位置是：首页 > 后端

当前栏目

Python读取HTML中指定元素生成Excel文件示例

Python 文件 HTML 示例生成读取指定元素

2023-06-13 09:15:24 时间

使用Python 2.7编写的示例：读取HTML中的指定元素，并生成Excel文件。

复制代码，代码如下：

#coding=gbk
import codecs
import os
import string
import time

import xlrd
import xlwt
from bs4 import BeautifulSoup
from xlrd import open_workbook

class LogMsg:
    """Small helper that appends messages to a log file via the root logger.

    The instance attaches a ``logging.FileHandler`` to the logger returned by
    ``logging.getLogger()`` (the root logger) and sets that logger's level.
    Every failure path prints a short message and terminates the process with
    exit status 1.
    """

    # Numeric values of logging.DEBUG, INFO, WARNING, ERROR and CRITICAL.
    _STANDARD_LEVELS = (10, 20, 30, 40, 50)
    # logging.INFO: used by output() when the effective level is non-standard.
    _FALLBACK_LEVEL = 20

    def __init__(self, logfile, Level=0):
        """Attach a file handler for *logfile* and apply *Level*.

        Args:
            logfile: Path of the log file to open.
            Level: One of 10, 20, 30, 40 or 50; any other value selects
                ``logging.NOTSET``.

        Raises:
            SystemExit: With status 1 if the logger cannot be set up.
        """
        try:
            # Imported here because the block does not rely on a module-level
            # ``logging`` import.
            import logging

            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            # NOTE(review): the date format has no separator between the day
            # and the hour; confirm that this is intended.
            formatter = logging.Formatter(
                "[%(asctime)s]:%(message)s", "%Y%m%d%H:%M:%S"
            )
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)

            chosen = logging.NOTSET
            for candidate in self._STANDARD_LEVELS:
                if Level == candidate:
                    chosen = candidate
                    break
            self.logger.setLevel(chosen)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
            # SystemExit are not reported as logging failures.
            print("loginiterror!")
            raise SystemExit(1)

    def output(self, logInfo):
        """Log *logInfo* at the logger's current effective level.

        If the effective level is not one of the five standard levels the
        message is logged at INFO.

        Raises:
            SystemExit: With status 1 if the logging call raises.
        """
        level = self.logger.getEffectiveLevel()
        if level not in self._STANDARD_LEVELS:
            level = self._FALLBACK_LEVEL
        try:
            self.logger.log(level, logInfo)
        except Exception:
            print("logoutputerror!")
            raise SystemExit(1)

    def close(self):
        """Detach the file handler from the logger and close its file.

        Raises:
            SystemExit: With status 1 if detaching or closing raises.
        """
        try:
            self.logger.removeHandler(self.hdlr)
            # removeHandler() only detaches the handler; close it explicitly
            # so the underlying file descriptor is released.
            self.hdlr.close()
        except Exception:
            print("logclosederror!")
            raise SystemExit(1)

# Run timestamp (second resolution); used in the name of the generated .xls.
# time.strftime() formats the current local time when no time tuple is given.
Logtime = time.strftime("%Y%m%d%H%M%S")
# Day stamp; used in the name of the log file.
logFileTime = time.strftime("%Y%m%d")
Logfile = "/data/pyExample/logs/htmlparser_%s.log" % (logFileTime,)
# Shared logger for the script; 20 selects logging.INFO in LogMsg.
log = LogMsg(Logfile, 20)

# Directory that is scanned for .html files and receives the workbook.
DATAPATH = "/data/pyExample/"
XLSname = "".join(["dangjian_", Logtime, ".xls"])

def parse_duration_seconds(text):
    """Return the duration found in *text*, converted to seconds, as a string.

    All ASCII digits in *text* are concatenated and read as a number of
    minutes, which is then multiplied by 60.

    Raises:
        ValueError: If *text* contains no digit.
    """
    digits = "".join(ch for ch in text if ch in "0123456789")
    return "%i" % (int(digits) * 60)


def build_row(cells):
    """Map the extracted <td> texts of one page to the 11 worksheet columns.

    The positions (4, 6, 10, 16, 36) are fixed offsets into the list of cell
    texts; cell 6 is used for both the folder name and the keyword.

    Raises:
        IndexError: If *cells* has fewer than 37 entries.
        ValueError: If cell 16 contains no digit.
    """
    folder_name = cells[6]
    content_name = cells[4]
    duration = parse_duration_seconds(cells[16])
    key_word = cells[6]
    description = cells[36]
    video_name = cells[10]
    return [
        "",
        folder_name,
        "",
        content_name,
        duration,
        key_word,
        description,
        "管理员",
        "华数编辑",
        video_name,
        "",
    ]


def read_td_cells(path):
    """Return the gbk-encoded text of every <td> found in the file at *path*.

    The file is decoded as gbk and parsed one line at a time.
    """
    cells = []
    # ``with`` guarantees the handle is closed even if reading fails.
    with codecs.open(path, "r", "gbk") as html_file:
        lines = html_file.readlines()
    if not lines:
        log.output("notline")
    for line in lines:
        # strip() also removes the newline, so comparing the stripped line
        # with "\n" could never match; test for emptiness instead.
        if not line.strip():
            log.output("该处是空行")
            continue
        # NOTE(review): this removes every space, including those inside
        # tags; the literal may originally have been an HTML entity such as
        # "&nbsp;" that was lost when the script was published. Confirm.
        line = line.replace(" ", "")
        # Name the parser explicitly so the result does not depend on which
        # optional parsers are installed.
        soup = BeautifulSoup(line, "html.parser")
        for td in soup.findAll("td"):
            cells.append(td.text.encode("gbk"))
    return cells


def main():
    """Build one worksheet row per .html file in DATAPATH and save the .xls.

    The script targets Python 2.7: cell texts are handled as gbk-encoded
    byte strings. Files whose cells cannot be mapped to a row are logged and
    skipped so the remaining files are still exported.
    """
    wbk = xlwt.Workbook(encoding="gbk")
    sheet = wbk.add_sheet("基本内容导入模板")
    headers = (
        "内容类型",
        "栏目名称",
        "栏目编号",
        "内容名称",
        "时长",
        "关键字",
        "看点",
        "作者",
        "来源",
        "子内容1",
        "子内容2",
    )
    for col, title in enumerate(headers):
        sheet.write(0, col, title)

    k = 0
    # Sorted so that the row order is reproducible between runs.
    for f in sorted(os.listdir(DATAPATH)):
        if os.path.splitext(f)[1] != ".html":
            continue
        log.output("当前文件：" + f)
        content = read_td_cells(os.path.join(DATAPATH, f))

        # enumerate() reports the true position even for duplicate cells.
        for position, cell in enumerate(content):
            print("%s , %s" % (position, cell))
            log.output(cell)
            log.output(position)
        print("----------------------------------------")

        try:
            row = build_row(content)
        except (IndexError, ValueError) as err:
            log.output("skip %s: %s" % (f, err))
            continue

        # Folder, content name, duration, keyword, description, video name.
        for col in (1, 3, 4, 5, 6, 9):
            print(row[col])
        log.output("输出xls数据：" + "," + ",".join(row[1:]) + ",")
        print(k)
        for col, value in enumerate(row):
            sheet.write(k + 1, col, value)
        k += 1

    wbk.save(os.path.join(DATAPATH, XLSname))

    print("=========================================")


if __name__ == "__main__":
    main()

猜你喜欢

轻松获取Redis Set（获取redis set）
javascriptAutoScroller函数类
redis学习之redis分布式（三）
ladon生成xmlrpc标准的webservice时报错解决办法详解编程语言
ORA-06564: object string does not exist ORACLE 报错故障修复远程处理
Java暂停/挂起线程（suspend()）和恢复线程（resume()）
AWS正尝试使用ChatGPT；BuzzFeed也因使用ChatGPT致股价上涨3倍；理想吉利纷纷重仓智能化丨每日大事件
Oracle默认的约束完美守护数据安全（oracle中的默认约束）
meyOracle Academy让下一代数据科学家持续受益（oracle acad）
Redis无需数据库即可运行（redis需要数据库吗）
ORA-39940: Child reference partitioned table string.string in tablespace string or parent table string.string in tablespace string is included but not both. ORACLE 报错故障修复远程处理

zl程序教程

当前栏目

Python读取HTML中指定元素生成Excel文件示例

相关文章