信息抽取:OFFICE文档使用POI控件,PDF可以使用PDFBOX0.7.3控件

JAVA读取WORD,EXCEL,POWERPOINT,PDF文件的方法

关键字: word, excel, powerpoint, pdf, pdfbox

OFFICE文档使用POI控件,PDF可以使用PDFBOX 0.7.3控件,完全支持中文。用XPDF也行,不过感觉PDFBOX比较好,而且作者也在持续更新。水平有限,万望各位指正。

WORD:

Java代码 复制代码
  1. import org.apache.lucene.document.Document;   
  2. import org.apache.lucene.document.Field;   
  3. import org.apache.poi.hwpf.extractor.WordExtractor;   
  4.   
  5. import java.io.File;   
  6. import java.io.InputStream;   
  7. import java.io.FileInputStream;   
  8.   
  9. import com.search.code.Index;   
  10.   
  11. public Document getDocument(Index index, String url, String title, InputStream is) throws DocCenterException {   
  12.   
  13.    String bodyText = null;   
  14.   try {   
  15.     WordExtractor ex = new WordExtractor(is);//is是WORD文件的InputStream   
  16.     bodyText = ex.getText();   
  17.    if(!bodyText.equals("")){   
  18.      index.AddIndex(url, title, bodyText);   
  19.     }   
  20.    }catch (DocCenterException e) {   
  21.    throw new DocCenterException("无法从该Mocriosoft Word文档中提取内容", e);   
  22.    }catch(Exception e){   
  23.     e.printStackTrace();   
  24.    }   
  25. }   
  26.   return null;   
  27. }  
import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.poi.hwpf.extractor.WordExtractor;import java.io.File;import java.io.InputStream;import java.io.FileInputStream;import com.search.code.Index;public Document getDocument(Index index, String url, String title, InputStream is) throws DocCenterException { String bodyText = null; try { WordExtractor ex = new WordExtractor(is);//is是WORD文件的InputStream bodyText = ex.getText(); if(!bodyText.equals("")){ index.AddIndex(url, title, bodyText); } }catch (DocCenterException e) { throw new DocCenterException("无法从该Mocriosoft Word文档中提取内容", e); }catch(Exception e){ e.printStackTrace(); }} return null; }

  

Excel:

Java代码 复制代码
  1. import org.apache.lucene.document.Document;   
  2. import org.apache.lucene.document.Field;   
  3.   
  4. import org.apache.poi.hwpf.extractor.WordExtractor;   
  5. import   org.apache.poi.hssf.usermodel.HSSFWorkbook;   
  6. import   org.apache.poi.hssf.usermodel.HSSFSheet;   
  7. import   org.apache.poi.hssf.usermodel.HSSFRow;   
  8. import   org.apache.poi.hssf.usermodel.HSSFCell;   
  9.   
  10. import java.io.File;   
  11. import java.io.InputStream;   
  12. import java.io.FileInputStream;   
  13.   
  14. import com.search.code.Index;   
  15.   
  16.   
  17.   
  18. public Document getDocument(Index index, String url, String title, InputStream is) throws DocCenterException {   
  19.    StringBuffer content = new StringBuffer();   
  20.   try{   
  21.     HSSFWorkbook   workbook   =  new   HSSFWorkbook(is);//创建对Excel工作簿文件的引用   
  22.    for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {   
  23.     if (null != workbook.getSheetAt(numSheets)) {   
  24.       HSSFSheet aSheet = workbook.getSheetAt(numSheets);//获得一个sheet   
  25.         for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) {   
  26.          if (null != aSheet.getRow(rowNumOfSheet)) {   
  27.            HSSFRow aRow = aSheet.getRow(rowNumOfSheet); //获得一个行   
  28.           for (short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++) {   
  29.            if (null != aRow.getCell(cellNumOfRow)) {   
  30.              HSSFCell aCell = aRow.getCell(cellNumOfRow);//获得列值   
  31.              content.append(aCell.getStringCellValue());   
  32.             }   
  33.            }   
  34.           }   
  35.          }   
  36.      }   
  37.     }   
  38.    if(!content.equals("")){   
  39.      index.AddIndex(url, title, content.toString());   
  40.     }   
  41.    }catch (DocCenterException e) {   
  42.   
  43.    throw new DocCenterException("无法从该Mocriosoft Word文档中提取内容", e);   
  44.    }catch(Exception   e)   {   
  45.     System.out.println("已运行xlRead()   :   "   +   e   );   
  46.    }   
  47.   return null;   
  48. }  
import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.poi.hwpf.extractor.WordExtractor;import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFCell; import java.io.File;import java.io.InputStream;import java.io.FileInputStream;import com.search.code.Index; public Document getDocument(Index index, String url, String title, InputStream is) throws DocCenterException { StringBuffer content = new StringBuffer(); try{ HSSFWorkbook workbook = new HSSFWorkbook(is);//创建对Excel工作簿文件的引用 for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) { if (null != workbook.getSheetAt(numSheets)) { HSSFSheet aSheet = workbook.getSheetAt(numSheets);//获得一个sheet for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) { if (null != aSheet.getRow(rowNumOfSheet)) { HSSFRow aRow = aSheet.getRow(rowNumOfSheet); //获得一个行 for (short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++) { if (null != aRow.getCell(cellNumOfRow)) { HSSFCell aCell = aRow.getCell(cellNumOfRow);//获得列值 content.append(aCell.getStringCellValue()); } } } } } } if(!content.equals("")){ index.AddIndex(url, title, content.toString()); } }catch (DocCenterException e) { throw new DocCenterException("无法从该Mocriosoft Word文档中提取内容", e); }catch(Exception e) { System.out.println("已运行xlRead() : " + e ); } return null; }

 

 

PowerPoint:

Java代码 复制代码
  1. import java.io.InputStream;   
  2.   
  3. import org.apache.lucene.document.Document;   
  4. import org.apache.poi.hslf.HSLFSlideShow;   
  5. import org.apache.poi.hslf.model.TextRun;   
  6. import org.apache.poi.hslf.model.Slide;   
  7. import org.apache.poi.hslf.usermodel.SlideShow;   
  8.   
  9. public Document getDocument(Index index, String url, String title, InputStream is)   
  10. throws DocCenterException {   
  11.    StringBuffer content = new StringBuffer("");   
  12.   try{   
  13.     SlideShow ss = new SlideShow(new HSLFSlideShow(is));//is 为文件的InputStream,建立SlideShow   
  14.     Slide[] slides = ss.getSlides();//获得每一张幻灯片   
  15.    for(int i=0;i      TextRun[] t = slides[i].getTextRuns();//为了取得幻灯片的文字内容,建立TextRun   
  16.     for(int j=0;j       content.append(t[j].getText());//这里会将文字内容加到content中去   
  17.      }   
  18.      content.append(slides[i].getTitle());   
  19.     }   
  20.     index.AddIndex(url, title, content.toString());   
  21.    }catch(Exception ex){   
  22.     System.out.println(ex.toString());   
  23.    }   
  24.   return null;   
  25. }   
  26.   
  27.    
import java.io.InputStream;import org.apache.lucene.document.Document;import org.apache.poi.hslf.HSLFSlideShow;import org.apache.poi.hslf.model.TextRun;import org.apache.poi.hslf.model.Slide;import org.apache.poi.hslf.usermodel.SlideShow; public Document getDocument(Index index, String url, String title, InputStream is) throws DocCenterException { StringBuffer content = new StringBuffer(""); try{ SlideShow ss = new SlideShow(new HSLFSlideShow(is));//is 为文件的InputStream,建立SlideShow Slide[] slides = ss.getSlides();//获得每一张幻灯片 for(int i=0;i TextRun[] t = slides[i].getTextRuns();//为了取得幻灯片的文字内容,建立TextRun for(int j=0;j content.append(t[j].getText());//这里会将文字内容加到content中去 } content.append(slides[i].getTitle()); } index.AddIndex(url, title, content.toString()); }catch(Exception ex){ System.out.println(ex.toString()); } return null; }

PDF:

Java代码 复制代码
  1. import java.io.InputStream;   
  2. import java.io.IOException;   
  3. import org.apache.lucene.document.Document;   
  4.   
  5. import org.pdfbox.cos.COSDocument;   
  6. import org.pdfbox.pdfparser.PDFParser;   
  7. import org.pdfbox.pdmodel.PDDocument;   
  8. import org.pdfbox.pdmodel.PDDocumentInformation;   
  9. import org.pdfbox.util.PDFTextStripper;   
  10.   
  11. import com.search.code.Index;   
  12.   
  13.   
  14.   
  15. public Document getDocument(Index index, String url, String title, InputStream is)throws DocCenterException {   
  16.      
  17.    COSDocument cosDoc = null;   
  18.   try {   
  19.     cosDoc = parseDocument(is);   
  20.    } catch (IOException e) {   
  21.     closeCOSDocument(cosDoc);   
  22.    throw new DocCenterException("无法处理该PDF文档", e);   
  23.    }   
  24.   if (cosDoc.isEncrypted()) {   
  25.    if (cosDoc != null)   
  26.      closeCOSDocument(cosDoc);   
  27.    throw new DocCenterException("该PDF文档是加密文档,无法处理");   
  28.    }   
  29.    String docText = null;   
  30.   try {   
  31.     PDFTextStripper stripper = new PDFTextStripper();   
  32.     docText = stripper.getText(new PDDocument(cosDoc));   
  33.    } catch (IOException e) {   
  34.     closeCOSDocument(cosDoc);   
  35.    throw new DocCenterException("无法处理该PDF文档", e);   
  36.    }   
  37.   
  38.    PDDocument pdDoc = null;   
  39.   try {   
  40.     pdDoc = new PDDocument(cosDoc);   
  41.     PDDocumentInformation docInfo = pdDoc.getDocumentInformation();   
  42.    if(docInfo.getTitle()!=null && !docInfo.getTitle().equals("")){   
  43.      title = docInfo.getTitle();   
  44.     }   
  45.   
  46.    } catch (Exception e) {   
  47.     closeCOSDocument(cosDoc);   
  48.     closePDDocument(pdDoc);   
  49.     System.err.println("无法取得该PDF文档的元数据" + e.getMessage());   
  50.    } finally {   
  51.     closeCOSDocument(cosDoc);   
  52.     closePDDocument(pdDoc);   
  53.    }   
  54.      
  55.   return null;   
  56. }   
  57.   
  58. private static COSDocument parseDocument(InputStream is) throws IOException {   
  59.    PDFParser parser = new PDFParser(is);   
  60.    parser.parse();   
  61.   return parser.getDocument();   
  62. }   
  63.   
  64. private void closeCOSDocument(COSDocument cosDoc) {   
  65.   if (cosDoc != null) {   
  66.    try {   
  67.      cosDoc.close();   
  68.     } catch (IOException e) {   
  69.     }   
  70.    }   
  71. }   
  72.   
  73. private void closePDDocument(PDDocument pdDoc) {   
  74.   if (pdDoc != null) {   
  75.    try {   
  76.      pdDoc.close();   
  77.     } catch (IOException e) {   
  78.     }   
  79.    }   
  80. }  
import java.io.InputStream;
import java.io.IOException;
import org.apache.lucene.document.Document;

import org.pdfbox.cos.COSDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;

import com.search.code.Index;

/**
 * Extracts the text of a PDF document and adds it to the index.
 *
 * The title stored in the PDF metadata, when present and non-empty, replaces
 * the {@code title} argument. Encrypted documents are rejected.
 *
 * @param index index that receives the extracted text through {@code AddIndex}
 * @param url   passed through unchanged to {@code index.AddIndex}
 * @param title fallback title used when the PDF metadata carries none
 * @param is    input stream of the PDF file; this method does not close it
 * @return always {@code null}
 * @throws DocCenterException when the PDF cannot be parsed or read, or when it
 *         is encrypted
 */
public Document getDocument(Index index, String url, String title, InputStream is) throws DocCenterException {
  COSDocument cosDoc = null;
  try {
    cosDoc = parseDocument(is);
  } catch (IOException e) {
    closeCOSDocument(cosDoc);
    throw new DocCenterException("无法处理该PDF文档", e);
  }
  if (cosDoc == null) {
    throw new DocCenterException("无法处理该PDF文档");
  }
  if (cosDoc.isEncrypted()) {
    closeCOSDocument(cosDoc);
    throw new DocCenterException("该PDF文档是加密文档,无法处理");
  }

  // One PDDocument serves both text extraction and metadata lookup, and the
  // finally block is the single place where the documents are released.
  PDDocument pdDoc = null;
  try {
    pdDoc = new PDDocument(cosDoc);
    PDFTextStripper stripper = new PDFTextStripper();
    String docText = stripper.getText(pdDoc);
    String resolvedTitle = resolveTitle(pdDoc, title);
    // NOTE(review): assumes AddIndex declares no checked exception other than
    // DocCenterException - confirm against the Index class.
    if (docText != null && docText.length() > 0) {
      index.AddIndex(url, resolvedTitle, docText);
    }
  } catch (IOException e) {
    throw new DocCenterException("无法处理该PDF文档", e);
  } finally {
    closeCOSDocument(cosDoc);
    closePDDocument(pdDoc);
  }

  return null;
}

/** Returns the title from the PDF metadata, or {@code fallback} when it is missing or unreadable. */
private String resolveTitle(PDDocument pdDoc, String fallback) {
  try {
    PDDocumentInformation docInfo = pdDoc.getDocumentInformation();
    String metaTitle = docInfo.getTitle();
    if (metaTitle != null && !metaTitle.equals("")) {
      return metaTitle;
    }
  } catch (Exception e) {
    // Metadata is optional: report the problem and keep the caller's title.
    System.err.println("无法取得该PDF文档的元数据" + e.getMessage());
  }
  return fallback;
}

/**
 * Runs the PDF parser over the stream and hands back the resulting document.
 *
 * @param is input stream of the PDF file
 * @return the document produced by the parser
 * @throws IOException when the parser fails
 */
private static COSDocument parseDocument(InputStream is) throws IOException {
  PDFParser parser = new PDFParser(is);
  parser.parse();
  return parser.getDocument();
}

/**
 * Closes the given COS document, ignoring a {@code null} argument and any
 * {@code IOException} raised by the close call.
 *
 * @param cosDoc document to close; may be {@code null}
 */
private void closeCOSDocument(COSDocument cosDoc) {
  if (cosDoc != null) {
    try {
      cosDoc.close();
    } catch (IOException ignored) {
      // Deliberately ignored: closing is best effort.
    }
  }
}

/**
 * Closes the given PD document, ignoring a {@code null} argument and any
 * {@code IOException} raised by the close call.
 *
 * @param pdDoc document to close; may be {@code null}
 */
private void closePDDocument(PDDocument pdDoc) {
  if (pdDoc != null) {
    try {
      pdDoc.close();
    } catch (IOException ignored) {
      // Deliberately ignored: closing is best effort.
    }
  }
}

代码复制可能出错,不过代码经过测试,绝对能用,POI为3.0-rc4,PDFBOX为0.7.3