import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import org.apache.log4j.Logger; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.pdfbox.encryption.DecryptDocument; import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.util.PDFTextStripper; import org.textmining.text.extraction.WordExtractor; public class FileReader { /** * Description:解析文件,返回文档内容 <br> * * * @param logger * Longger * @param realPath * String * @return String */ public static String readOneFile(Logger logger, String realPath) { FileInputStream in = null; BufferedReader reader = null; String content = ""; try { File file = new File(realPath); if (!file.canRead()) { logger.error(MessageCode.getPageMessage("iiss.info.common.filenotread") + realPath); return ""; } String suffix = realPath.substring(realPath.lastIndexOf(".") + 1, realPath.length()); if ("doc".equalsIgnoreCase(suffix)) { WordExtractor extractor = new WordExtractor(); content = extractor.extractText(new FileInputStream(file)); if (logger.isDebugEnabled()) { logger.debug("summary=" + content); } }//解析excel文件 else if ("xls".equalsIgnoreCase(suffix)) { StringBuffer rowData = new StringBuffer(KeyConstant.INITIAL_BUFFER); in = new FileInputStream(file); // 创建对Excel对象 HSSFWorkbook workbook = new HSSFWorkbook(in); //获得excel的页数 int sheetNo = workbook.getNumberOfSheets(); if (sheetNo > 0) { for (int i = 0; i < sheetNo; i++) { // 获取每一页对象 HSSFSheet sheet = workbook.getSheetAt(i); if (sheet != null) { //逐行获得内容 HSSFRow row = null; for (int j = 0; j <= sheet.getLastRowNum(); j++) { try { row = sheet.getRow(j); } catch (Exception e) { row = null; } if (row == null) { continue; } //逐个单元格获得内容 HSSFCell cell = null; String fieldValue = null; for (int k = 0; k <= row.getLastCellNum() - 1; k++) { try { cell = row.getCell((short)k); } catch (Exception e) { cell = row.createCell((short)k); cell.setCellType(HSSFCell.CELL_TYPE_STRING); cell.setCellValue(""); } fieldValue = POITools.getCellValue(cell); if (fieldValue != null && !"".equals(fieldValue)) { rowData.append(fieldValue); rowData.append(""); } } } } } } content = rowData.toString(); if (logger.isDebugEnabled()) { logger.debug("summary=" + content); } } else if ("txt".equalsIgnoreCase(suffix)) { in = new FileInputStream(file); reader = new BufferedReader(new InputStreamReader(in)); StringBuffer sBuffer = new StringBuffer(); String s = null; do { s = reader.readLine(); if (s != null) { sBuffer.append(s); } } while (s != null); content = sBuffer.toString(); if (logger.isDebugEnabled()) { logger.debug("summary=" + content); } } else if ("html".equalsIgnoreCase(suffix) || "htm".equalsIgnoreCase(suffix)) { HTMLParser parser = new HTMLParser(file); content = parser.getContent(); } else if ("pdf".equalsIgnoreCase(suffix)) { PDDocument pdf = null; try { PDFParser parser = new PDFParser(new FileInputStream(file)); parser.parse(); pdf = parser.getPDDocument(); if (pdf.isEncrypted()) { DecryptDocument decryptor = new DecryptDocument(pdf); decryptor.decryptDocument(""); } PDFTextStripper stripper = new PDFTextStripper(); content = stripper.getText(pdf); if (logger.isDebugEnabled()) { logger.debug("summary=" + content); } } catch (Exception e) { logger.error(e, e); } catch (OutOfMemoryError t) { logger.error(t, t); } finally { try { if (pdf != null) { pdf.close(); } } catch (IOException e) { logger.error(MessageCode.getPageMessage("iiss.info.common.readfilefail") + realPath); } } } else { content = " "; } } catch (FileNotFoundException e) { logger.error(MessageCode.getPageMessage("iiss.info.common.filenotfound") + realPath); } catch (IOException e) { logger.error(MessageCode.getPageMessage("iiss.info.common.readfilefail") + realPath); } catch (InterruptedException e) { logger.error(MessageCode.getPageMessage("iiss.info.common.readhtmlfilefail") + realPath); } catch (Exception e) { logger.error(MessageCode.getPageMessage("iiss.info.common.parsefilefail") + e, e); } finally { try { if (in != null) { in.close(); } } catch (IOException e) { logger.error(e, e); } try { if (reader != null) { reader.close(); } } catch (IOException e) { logger.error(e, e); } } return content; } }