Python读取HTML中指定元素生成Excel文件示例
用Python 2.7编写的示例:读取HTML中的指定元素,并生成Excel文件
#coding=gbk
import codecs
import logging
import os
import string
import time

import xlrd
import xlwt
from bs4 import BeautifulSoup
from xlrd import open_workbook
class LogMsg:
    """Small wrapper that routes messages to a log file via the root logger.

    The constructor attaches a ``logging.FileHandler`` to the root logger and
    sets the root logger's level; ``output`` then emits each message at the
    logger's current effective level, and ``close`` detaches the handler.
    """

    # Numeric levels the wrapper recognises; any other value is treated as
    # NOTSET by the constructor and as INFO by ``output``.
    _LEVELS = (
        logging.DEBUG,
        logging.INFO,
        logging.WARNING,
        logging.ERROR,
        logging.CRITICAL,
    )

    def __init__(self, logfile, Level=0):
        """Attach a file handler for *logfile* to the root logger.

        Args:
            logfile: Path handed to ``logging.FileHandler``.
            Level: Numeric threshold (10, 20, 30, 40 or 50). Any other value
                selects ``logging.NOTSET``.

        Raises:
            SystemExit: With status 1, after printing a message, if the
                handler cannot be created or installed.
        """
        try:
            # self.logger is the root logger; self.hdlr is the handler owned
            # by this instance (detached and closed again in close()).
            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            self.hdlr.setFormatter(
                logging.Formatter("[%(asctime)s]:%(message)s", "%Y%m%d%H:%M:%S")
            )
            self.logger.addHandler(self.hdlr)
            threshold = next(
                (known for known in self._LEVELS if known == Level),
                logging.NOTSET,
            )
            self.logger.setLevel(threshold)
        except Exception:
            # Exception (not a bare except) so Ctrl-C and SystemExit pass through.
            print("loginiterror!")
            raise SystemExit(1)

    def output(self, logInfo):
        """Emit *logInfo* at the logger's effective level.

        If the effective level is not one of the five standard levels, the
        message is emitted at INFO.

        Raises:
            SystemExit: With status 1, after printing a message, if the
                logging call raises.
        """
        level = self.logger.getEffectiveLevel()
        if level not in self._LEVELS:
            level = logging.INFO
        try:
            self.logger.log(level, logInfo)
        except Exception:
            print("logoutputerror!")
            raise SystemExit(1)

    def close(self):
        """Detach this instance's handler from the root logger and close it.

        Raises:
            SystemExit: With status 1, after printing a message, if detaching
                or closing the handler raises.
        """
        try:
            self.logger.removeHandler(self.hdlr)
            # removeHandler() alone leaves the underlying file open.
            self.hdlr.close()
        except Exception:
            print("logclosederror!")
            raise SystemExit(1)
# Directory that is listed for *.html input and that receives the workbook.
DATAPATH = "/data/pyExample/"

# Second-resolution stamp (local time) used in the workbook file name.
Logtime = time.strftime("%Y%m%d%H%M%S")
# Day-resolution stamp (local time) used in the log file name.
logFileTime = time.strftime("%Y%m%d")

Logfile = "/data/pyExample/logs/htmlparser_%s.log" % (logFileTime,)
XLSname = "".join(("dangjian_", Logtime, ".xls"))

# Module-wide logger; 20 is the numeric value LogMsg maps to INFO.
log = LogMsg(Logfile, 20)
# Titles written to row 0 of the sheet, in column order.
_HEADER_TITLES = (
    "内容类型",
    "栏目名称",
    "栏目编号",
    "内容名称",
    "时长",
    "关键字",
    "看点",
    "作者",
    "来源",
    "子内容1",
    "子内容2",
)

# Highest <td> index a row is built from; shorter pages cannot be mapped.
_LAST_CELL_INDEX = 36


def _read_td_cells(path):
    """Return the GBK-encoded text of every <td> in the HTML file at *path*.

    The file is decoded as GBK and parsed line by line, so a <td> element
    is only found when it opens and closes on the same line.
    """
    # The with-block closes the file even if reading or decoding raises.
    with codecs.open(path, "r", "gbk") as html_file:
        lines = html_file.readlines()
    if not lines:
        log.output("notline")

    cells = []
    for line in lines:
        if not line.strip():
            # The previous test compared the stripped line with "\n", which
            # can never match, so blank lines were never reported.
            log.output("该处是空行")
            continue
        # NOTE(review): the replaced literal appears as a single space in the
        # extracted source; it may originally have been "&nbsp;" - confirm.
        soup = BeautifulSoup(line.replace(" ", ""))
        cells.extend(td.text.encode("gbk") for td in soup.findAll("td"))
    return cells


def _fields_from_cells(cells):
    """Pick the exported fields out of one page's positional <td> texts.

    Returns a tuple ``(folder, name, duration, keyword, description, video)``
    or ``None`` when the page has too few cells or its duration cell holds
    no digits.
    """
    if len(cells) <= _LAST_CELL_INDEX:
        return None
    digits = "".join(ch for ch in cells[16] if ch.isdigit())
    if not digits:
        return None
    # presumably minutes converted to seconds - TODO confirm the unit.
    str_duration = "%i" % (int(digits) * 60)
    # NOTE(review): folder name and keyword both read cell 6, as in the
    # original script - confirm that this is intended.
    return (cells[6], cells[4], str_duration, cells[6], cells[36], cells[10])


if __name__ == "__main__":
    # Targets Python 2.7: cell texts and literals are GBK byte strings that
    # xlwt decodes using the workbook encoding.
    wbk = xlwt.Workbook(encoding="gbk")
    sheet = wbk.add_sheet("基本内容导入模板")
    for col, title in enumerate(_HEADER_TITLES):
        sheet.write(0, col, title)

    k = 0  # number of data rows written so far
    for f in os.listdir(DATAPATH):
        if os.path.splitext(f)[1] != ".html":
            continue
        log.output("当前文件:" + f)
        content = _read_td_cells(DATAPATH + f)

        # enumerate() reports the true position even for duplicate cell texts.
        for idx, cell in enumerate(content):
            print("%d , %s" % (idx, cell))
            log.output(cell)
            log.output(idx)
        print("----------------------------------------")

        fields = _fields_from_cells(content)
        if fields is None:
            # One malformed page should not abort the whole export.
            log.output("skip file (unexpected table layout): " + f)
            continue
        folderName, contentName, str_duration, keyWord, desciption, videoName_1 = fields

        print(folderName)
        print(contentName)
        print(str_duration)
        print(keyWord)
        print(desciption)
        print(videoName_1)
        log.output("输出xls数据:" + "," + folderName + ",," + contentName + "," + str_duration + "," + keyWord + "," + desciption + ",管理员,华数编辑," + videoName_1 + ",,")
        print(k)

        row = (
            "",
            folderName,
            "",
            contentName,
            str_duration,
            keyWord,
            desciption,
            "管理员",
            "华数编辑",
            videoName_1,
            "",
        )
        for col, value in enumerate(row):
            sheet.write(k + 1, col, value)
        k += 1

    # NOTE(review): the original nesting was lost in extraction; saving once
    # after all files have been processed is assumed.
    wbk.save(DATAPATH + XLSname)
    print("=========================================")
相关文章
- 快速入门Python机器学习(25)
- vscode怎么快速生成html模板_vscode怎样新建HTML文件
- mt4 python_一个使用Python自动化交易外汇MT4脚本实现「建议收藏」
- python安装不了whl文件_Python安装whl文件过程图解
- python删除文件中指定内容
- python删除首行_Python删除文件第一行
- 【Python】python文件打开方式详解——a、a+、r+、w+、rb、rt区别[通俗易懂]
- 怎么用python打开csv文件_Python文本处理之csv-csv文件怎么打开[通俗易懂]
- 跟我一起学Python从入门到精通《PyInstaller制作可执行exe文件》
- 【说站】Python psd-tools如何转换文件
- 【说站】python两种不同的文件流读写
- 【说站】Python f-string字符串格式化的介绍
- 【说站】python字典如何遍历数据
- python pkl文件_Python字符串格式化输出的方式包括
- python线程池的使用
- python解析json文件
- PYTHON链家租房数据分析:岭回归、LASSO、随机森林、XGBOOST、KERAS神经网络、KMEANS聚类、地理可视化|附代码数据
- Python+chatGPT编程5分钟快速上手,强烈推荐!!!
- python-Python与PostgreSQL数据库-处理PostgreSQL查询结果
- Linux创建Python文件的步骤(linux新建python文件)
- 掌握Python访问MySQL的新技能(python访问mysql)
- 文件Linux 打开 HTML 文件的方法(linux打开html)
- Linux环境下安装Python(linux装python)
- 文件MySQL数据库存储HTML文件.(mysql存html)
- 文件MySQL 使用之禅 利用HTML文件实现完美保存(mysql保存html)
- 在Linux上运行Python脚本的简单指南(linux运行python)
- Linux环境实现HTML文件编辑(linux编辑html)
- 通过C++学习Python