Java实现PDF读取
在pom.xml中引用pdfbox
<!-- Maven build descriptor for the cicdtest project; the build output is packaged as a jar. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates (groupId : artifactId : version). -->
<groupId>com.cicdtest</groupId>
<artifactId>cicdtest</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>cicdtest</name>
<url>http://maven.apache.org</url>
<properties>
<!-- Source files are read as UTF-8. -->
<!-- NOTE(review): no maven.compiler.source/target is set here, so the Java language level
     falls back to the compiler plugin's default; confirm it matches the JDK in use. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- JUnit, available on the test classpath only (scope "test"). -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<!-- Apache PDFBox: provides the org.apache.pdfbox classes imported by the PDF class. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.26</version>
</dependency>
</dependencies>
</project>
package pdf.txt;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * Extracts the text of a PDF file with Apache PDFBox and optionally saves it as a text file.
 *
 * <p>All methods are best-effort: failures while reading or writing are reported on
 * {@code System.err} via {@code printStackTrace()} and are not propagated to the caller.
 * Text files are written as UTF-8.
 */
public class PDF {

    /** PDF read by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_PDF_PATH =
            "D:/Project/e-Statement/estatement_pdf/2021_07_09_estatement/8000054710_est_9ec4a09254a67c1690837ef62f64f9e9.pdf";

    /** Fixed output file used by {@link #save_result_to_txt(String)}. */
    private static final String DEFAULT_TXT_PATH = "D:/PDF.txt";

    /** Extension (including the dot) that is stripped when deriving the text file name. */
    private static final String PDF_EXTENSION = ".pdf";

    /**
     * Prints the text of a PDF to standard output and saves it to the fixed text file.
     *
     * @param args optional; {@code args[0]} is the PDF to read, otherwise a built-in
     *             sample path is used
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        PDF p = new PDF();
        String content = p.Get_PDF_Content(path);
        System.out.println(content);
        p.save_result_to_txt(content);
    }

    /**
     * Writes the trimmed {@code content} to the fixed file {@code D:/PDF.txt},
     * replacing any previous content.
     *
     * @param content text to save
     * @return the path of the text file, whether or not the write succeeded
     */
    public String save_result_to_txt(String content) {
        writeText(DEFAULT_TXT_PATH, content);
        return DEFAULT_TXT_PATH;
    }

    /**
     * Extracts the text of the PDF at {@code path} and writes it next to the PDF, using
     * the same file name with the extension changed to {@code .txt}.
     *
     * <p>If extraction fails the extracted text is empty, so an empty text file is written.
     *
     * @param path location of the PDF file
     * @return the path of the text file, whether or not the write succeeded
     */
    public String save_result_to_txt_with_path(String path) {
        String content = Get_PDF_Content(path);
        File pdfFile = new File(path);
        String txtName = toTxtName(pdfFile.getName());
        // A bare file name has no parent directory; write into the working directory then.
        File folder = pdfFile.getParentFile();
        String filepath = (folder == null) ? txtName : folder + "/" + txtName;
        writeText(filepath, content);
        return filepath;
    }

    /**
     * Reads the text of a PDF by driving {@link PDFParser} directly over a
     * {@link RandomAccessRead}, using a {@link PDFTextStripper} with default settings.
     *
     * @param path location of the PDF file
     * @return the extracted text, or an empty string if the file could not be read
     */
    public String read_PDF(String path) {
        String text = "";
        File file = new File(path);
        FileInputStream in = null;
        RandomAccessRead source = null;
        PDDocument pdDocument = null;
        try {
            in = new FileInputStream(file);
            source = new RandomAccessBufferedFileInputStream(in);
            PDFParser parser = new PDFParser(source);
            parser.parse();
            pdDocument = parser.getPDDocument();
            PDFTextStripper stripper = new PDFTextStripper();
            text = stripper.getText(pdDocument);
        } catch (Exception e) {
            // Broad on purpose: any failure yields the empty-string result.
            e.printStackTrace();
        } finally {
            // Release in reverse order of acquisition, even when parsing failed half-way.
            closeAndReport(pdDocument);
            closeAndReport(source);
            closeAndReport(in);
        }
        return text;
    }

    /**
     * Reads the text of all pages of a PDF via {@link PDDocument#load(File)}, with the
     * stripper's sort-by-position option enabled.
     *
     * @param path location of the PDF file
     * @return the extracted text, or an empty string if the file could not be read
     */
    public String Get_PDF_Content(String path) {
        File pdfFile = new File(path);
        PDDocument document = null;
        String content = "";
        try {
            document = PDDocument.load(pdfFile);
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(true);
            // Cover the whole document: first page through the last page.
            stripper.setStartPage(1);
            stripper.setEndPage(document.getNumberOfPages());
            content = stripper.getText(document);
        } catch (Exception e) {
            // Broad on purpose: any failure yields the empty-string result.
            e.printStackTrace();
        } finally {
            closeAndReport(document);
        }
        return content;
    }

    /**
     * Derives the text file name for a PDF file name: a trailing {@code .pdf} (any case)
     * is replaced by {@code .txt}; otherwise {@code .txt} is appended. The result is
     * therefore never equal to the input name, so the source file cannot be overwritten.
     */
    private static String toTxtName(String pdfName) {
        int extLength = PDF_EXTENSION.length();
        int extStart = pdfName.length() - extLength;
        // regionMatches returns false for a negative offset, so short names are safe.
        boolean hasPdfExtension = pdfName.regionMatches(true, extStart, PDF_EXTENSION, 0, extLength);
        String baseName = hasPdfExtension ? pdfName.substring(0, extStart) : pdfName;
        return baseName + ".txt";
    }

    /**
     * Writes the trimmed {@code content} to {@code filepath} as UTF-8, creating the file
     * or replacing its content. Failures are printed, not thrown.
     */
    private static void writeText(String filepath, String content) {
        FileOutputStream out = null;
        try {
            // Encode before opening the stream so a bad argument cannot truncate the file.
            byte[] bytes = content.trim().getBytes("UTF-8");
            out = new FileOutputStream(new File(filepath));
            out.write(bytes);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeAndReport(out);
        }
    }

    /** Closes {@code resource} if it is non-null, printing (not throwing) any close failure. */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
相关文章
- Java 基本的IO流
- Java基础__Java中集合类
- 【Java】java扩展机制SPI 实现
- JAVA学习(五):Java面向对象编程基础
- Java操作Mongo
- Java实现 蓝桥杯 历届试题 最大子阵
- Java 蓝桥杯 算法训练 字符串的展开 (JAVA语言实现)
- 【JAVA】java中的length和length()
- java多线程 -- 同步鎖
- 【JAVA】 01-Java基础知识
- 在Java中可以使用自定义的java.net.InetAddress实现来解决虚拟hosts的问题
- Atitit web httphandler的实现 java python node.js c# net php 目录 1.1. Java 过滤器 servelet1 1.2. Python的
- atitit groovy 总结java 提升效率
- How to improve Java's I/O performance( 提升 java i/o 性能)
- 【java】Java连接mysql数据库及mysql驱动jar包下载和使用
- 【Java】java 性能监控及工具
- Java通过PDF模板导出数据 adobe acrobat的PDF编辑器 itextpdf java导出文件输出流
- 【错误记录】Android Studio 编译报错 ( The dependency contains Java 8 bytecode. Please enable desugaring by )
- JAVA语言之Java 中不同的并行实现的性能比较
- Java:openjdk: error: Student is abstract; cannot be instantiated;java编译环境
- 最灵活的PDF:Docotic.Pdf 8.7.13797 Crack
- Java 继承
- 【java】Java 接口(Interface)
- 【java】Java 重写(Override)与重载(Overload)
- 【java】Java线程池实现原理及业务中的实践
- JAVA开发讲义(二)-Java程序设计之数据之谜一