您现在的位置是：首页 > 后端

当前栏目

Java实现PDF读取

JAVA PDF 实现读取

2023-09-11 14:22:10 时间

在pom.xml中引用pdfbox

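<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>com.cicdtest</groupId>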
<artifactId>cicdtest</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>

<name>cicdtest</name>
<url>http://maven.apache.org</url>

<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.26</version>
</dependency>
</dependencies>

</project>

package pdf.txt;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;

import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * Small utility that extracts the text of a PDF file with Apache PDFBox 2.x
 * and optionally stores the extracted text in a plain-text file.
 *
 * <p>All methods are best-effort: failures are reported on the console and an
 * empty string (for readers) or the intended output path (for writers) is
 * still returned, so callers never see an I/O exception.
 */
public class PDF {

    /** Input file used by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_PDF_PATH =
            "D:/Project/e-Statement/estatement_pdf/2021_07_09_estatement/8000054710_est_9ec4a09254a67c1690837ef62f64f9e9.pdf";

    /** Output file used by {@link #save_result_to_txt(String)}. */
    private static final String DEFAULT_TXT_PATH = "D:/PDF.txt";

    /**
     * Extracts the text of a PDF, prints it, and saves it to the default text file.
     *
     * @param args optional; {@code args[0]} is the PDF to read. When absent,
     *             the built-in default path is used.
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        PDF p = new PDF();
        String content = p.Get_PDF_Content(path);
        System.out.println(content);
        p.save_result_to_txt(content);
    }

    /**
     * Writes the trimmed {@code content} to the default text file, replacing
     * any previous content.
     *
     * @param content text to store
     * @return the path of the text file, whether or not the write succeeded
     */
    public String save_result_to_txt(String content) {
        writeText(DEFAULT_TXT_PATH, content);
        return DEFAULT_TXT_PATH;
    }

    /**
     * Extracts the text of the PDF at {@code path} and writes it next to the
     * PDF, using the same file name with {@code .pdf} replaced by {@code .txt}.
     *
     * @param path location of the PDF file
     * @return the path of the text file, whether or not the write succeeded
     */
    public String save_result_to_txt_with_path(String path) {
        String content = Get_PDF_Content(path);
        File pdf = new File(path);
        String pdfName = pdf.getName();

        String txtName = pdfName.replace(".pdf", ".txt");
        if (txtName.equals(pdfName)) {
            // No ".pdf" in the name (e.g. "X.PDF"): the target would be the
            // source file itself, so append the extension instead of
            // overwriting the PDF with its own text.
            txtName = pdfName + ".txt";
        }

        // A bare file name has no parent; write relative to the working directory.
        File parent = pdf.getParentFile();
        String filepath = (parent == null) ? txtName : parent.toString() + "/" + txtName;

        writeText(filepath, content);
        return filepath;
    }

    /**
     * Reads the text of a PDF through the explicit parser API
     * ({@link PDFParser} over a {@link RandomAccessRead}).
     *
     * @param path location of the PDF file
     * @return the extracted text, or an empty string if reading failed
     */
    public String read_PDF(String path) {
        String text = "";
        File file = new File(path);
        // Every resource is closed even when parsing or text extraction fails.
        try (FileInputStream in = new FileInputStream(file);
             RandomAccessRead source = new RandomAccessBufferedFileInputStream(in)) {
            PDFParser parser = new PDFParser(source);
            parser.parse();
            try (PDDocument document = parser.getPDDocument()) {
                PDFTextStripper stripper = new PDFTextStripper();
                text = stripper.getText(document);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return text;
    }

    /**
     * Reads the text of every page of a PDF via {@code PDDocument.load},
     * with the stripper's sort-by-position option enabled.
     * {@link #read_PDF(String)} shows the equivalent parser-based approach.
     *
     * @param path location of the PDF file
     * @return the extracted text, or an empty string if reading failed
     */
    public String Get_PDF_Content(String path) {
        File pdfFile = new File(path);
        String content = "";
        // try-with-resources closes the document even if extraction throws.
        try (PDDocument document = PDDocument.load(pdfFile)) {
            // Total page count, used as the last page to extract.
            int pages = document.getNumberOfPages();
            PDFTextStripper stripper = new PDFTextStripper();
            // Enable the stripper's sort-by-position option.
            stripper.setSortByPosition(true);
            stripper.setStartPage(1);
            stripper.setEndPage(pages);
            content = stripper.getText(document);
        } catch (Exception e) {
            System.out.println(e);
        }
        return content;
    }

    /**
     * Writes the trimmed {@code content} to {@code filepath} as UTF-8,
     * creating the file or replacing its content. Errors are printed rather
     * than propagated.
     */
    private void writeText(String filepath, String content) {
        try {
            // Encode before opening the stream so a failure here does not
            // truncate an existing file. UTF-8 is explicit because the
            // platform default charset can lose non-ASCII characters.
            byte[] bytes = content.trim().getBytes(java.nio.charset.StandardCharsets.UTF_8);
            try (FileOutputStream out = new FileOutputStream(new File(filepath))) {
                out.write(bytes);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

猜你喜欢

【树莓派4B学习】六、树莓派4BOpenCV的视频/摄像头基本操作
Design Tic-Tac Toe
Python实现基于最小二乘法的线性回归
NI开启射频、微波及无线通信测试全国巡回研讨会
[转载]Linux libusb "LIBUSB_ERROR_ACCESS" libusb_open返回值为-3 error解决方法
《第一本Docker书（修订版）》——2.2 在Ubuntu和Debian中安装Docker
解决Eclipse、Android Studio ADT AVD不能检测到手机
贵州运用大数据创新政府公共服务
谈谈一些有趣的CSS题目（十七）-- 不可思议的颜色混合模式 mix-blend-mode
中国移动联招商局进军大数据征信
基于SNN脉冲神经网络的FPGA实现介绍
XML组成结构以及C#通过DTD验证规范性
[LeetCode] Power of Three 判断3的次方数
Vue 之 Toast 消息提示插件的简单封装
《Python编程实战：运用设计模式、并发和程序库创建高质量程序》—— 2.1　适配器模式
php_curl模拟登录有验证码实例
jQuery碎语(1) 基础、选择要操作的元素、处理DOM元素
js 补零方法，如果不足位数
【区间DP】XOR-pyramid
Google Earth Engine APPS（GEE）—— Landsat 数据的时间序列分析来监测森林转化和退化 (CODED)整体框架（万字长文）
Linux script命令 —— 终端里的记录器
Linux环境下Kafka的安装与使用(SpringBoot整合云服务器上的Kafka)

相关主题

Java动态数组
java.lang.Integer
Java　复习

zl程序教程

当前栏目

Java实现PDF读取

在pom.xml中引用pdfbox

相关文章