zl程序教程

您现在的位置是:首页 >  其他

当前栏目

PDF 转 TXT 后格式化处理 [MD]

PDF 处理 格式化 txt md
2023-09-14 09:00:05 时间

博文地址

我的GitHub 我的博客 我的微信 我的邮箱
baiqiantao baiqiantao bqt20094 baiqiantao@sina.com

目录

PDF 转 TXT 后格式化处理

/**
 * Re-flows plain text that was extracted from a PDF.
 *
 * <p>The input file is read as UTF-8 and processed in three steps:
 * <ol>
 *   <li>watermark blocks (the {@link #IGNORE_CONTENT} line plus a fixed number of
 *       neighbouring lines) and lines consisting solely of {@code "o"} are dropped;</li>
 *   <li>each remaining line except the last is inspected: a long line that does not end
 *       with one of {@link #TAGS_END_CHARS} is joined with the following line, a short
 *       non-empty line followed by a long line gets an extra blank line after it, and an
 *       empty line followed by another empty line is collapsed;</li>
 *   <li>the result is written to the output file as UTF-8 with {@code '\n'} line ends.</li>
 * </ol>
 */
public class PdfUtils {
    /**
     * Length threshold (in chars) separating "long" lines from "short" ones.
     * The whole heuristic hinges on this value; tune it to the width of the source PDF.
     */
    private static final int MIN_CHARS = 38;
    /** When true, every processed line is prefixed with its original length in brackets. */
    private static final boolean DEBUG = false;
    /** Exact text of the watermark line that triggers removal of its surrounding block. */
    private static final String IGNORE_CONTENT = "本文档资源来自互联网,仅供个人学习交流,请勿用作商业";
    /**
     * Characters that, at the end of a long line, mean the line break must be kept.
     * NOTE(review): the original list contained the closing quote twice; one of them was
     * possibly meant to be a different character - confirm against the author's source.
     */
    private static final char[] TAGS_END_CHARS = { '。', '!', '”' };
    /** Internal marker appended to a line whose trailing line break must be suppressed. */
    private static final String TAG_NO_LINE = "【不换行】";
    /** Number of already kept lines removed when a watermark line is found. */
    private static final int LINES_BEFORE_WATERMARK = 3;
    /** Number of input lines skipped after a watermark line. */
    private static final int LINES_AFTER_WATERMARK = 4;
    /** Charset name used for both reading and writing. */
    private static final String ENCODING = "UTF-8";

    public static void main(String[] args) {
        replaceFileContent("D:\\from.txt", "D:\\to.txt");
    }

    /**
     * Reads {@code from}, re-flows its lines and writes the result to {@code to}.
     *
     * <p>I/O failures are reported on standard error and are not propagated. When the
     * input cannot be read, the output file is not touched.
     *
     * @param from path of the UTF-8 text file to read
     * @param to   path of the UTF-8 text file to create or overwrite
     */
    public static void replaceFileContent(String from, String to) {
        List<String> originalLines;
        try {
            originalLines = readLines(from);
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }

        List<String> contentList = dropBoilerplate(originalLines);
        markLineBreaks(contentList);
        writeFile(to, contentList);
    }

    /** Reads every line of the given UTF-8 file into a list. */
    private static List<String> readLines(String file) throws IOException {
        List<String> lines = new ArrayList<>();
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(new FileInputStream(file), ENCODING))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        }
        return lines;
    }

    /** Returns a copy of {@code lines} without watermark blocks and without lines equal to "o". */
    private static List<String> dropBoilerplate(List<String> lines) {
        List<String> kept = new ArrayList<>();
        for (int i = 0; i < lines.size(); i++) {
            String line = lines.get(i);
            if (line.equals(IGNORE_CONTENT)) {
                // Remove at most as many lines as were actually kept: a watermark near the
                // top of the input must not index below zero.
                int toDrop = Math.min(LINES_BEFORE_WATERMARK, kept.size());
                for (int k = 0; k < toDrop; k++) {
                    kept.remove(kept.size() - 1);
                }
                // Together with the loop increment this skips the watermark line itself
                // and the LINES_AFTER_WATERMARK lines that follow it.
                i += LINES_AFTER_WATERMARK;
            } else if (!line.equals("o")) {
                // Lines consisting solely of "o" are dropped (presumably list-bullet
                // artifacts of the PDF extraction - confirm).
                kept.add(line);
            }
        }
        return kept;
    }

    /**
     * Decides, in place, how each line except the last one is terminated: either marks it
     * with {@link #TAG_NO_LINE} (no line break) or appends an extra {@code "\n"}.
     * The last line is left untouched because the decision may need the following line.
     */
    private static void markLineBreaks(List<String> contentList) {
        for (int i = 0; i < contentList.size() - 1; i++) {
            String currentLine = contentList.get(i);
            if (currentLine.length() >= MIN_CHARS) {
                // Long line: probably wrapped by the PDF layout, unless it ends a sentence.
                char lastChar = currentLine.charAt(currentLine.length() - 1);
                if (!isHasChar(lastChar)) {
                    contentList.set(i, currentLine + TAG_NO_LINE);
                }
            } else {
                String nextLine = contentList.get(i + 1);
                if (currentLine.length() == 0) {
                    // Collapse runs of blank lines: only the last blank line keeps its break.
                    if (nextLine.length() == 0) {
                        contentList.set(i, TAG_NO_LINE);
                    }
                } else if (nextLine.length() >= MIN_CHARS) {
                    // Short line followed by a long one: separate them with a blank line.
                    contentList.set(i, currentLine + "\n");
                }
            }
            if (DEBUG) {
                contentList.set(i, "【" + currentLine.length() + "】" + contentList.get(i));
            }
        }
    }

    /**
     * Writes the lines to {@code file} as UTF-8.
     *
     * <p>A line ending with {@link #TAG_NO_LINE} is written without that marker and without
     * a line break; every other line is followed by {@code '\n'}. Only the trailing marker
     * is removed, so marker text occurring inside the line's own content is preserved.
     * Failures are reported on standard error.
     *
     * @param file        path of the file to create or overwrite
     * @param contentList lines to write, possibly carrying the no-break marker
     */
    private static void writeFile(String file, List<String> contentList) {
        try (PrintWriter writer =
                new PrintWriter(new OutputStreamWriter(new FileOutputStream(file), ENCODING))) {
            for (String line : contentList) {
                if (line.endsWith(TAG_NO_LINE)) {
                    writer.append(line.substring(0, line.length() - TAG_NO_LINE.length()));
                } else {
                    writer.append(line).append("\n");
                }
            }
            // PrintWriter never throws on write errors; it has to be asked explicitly.
            if (writer.checkError()) {
                System.err.println("Failed to write " + file);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Returns true when {@code c} is one of {@link #TAGS_END_CHARS}. */
    private static boolean isHasChar(char c) {
        for (char endChar : TAGS_END_CHARS) {
            if (endChar == c) {
                return true;
            }
        }
        return false;
    }
}

2017-03-09