4

lucene5(6)读取word、pdf内容

 2 years ago
source link: https://wakzz.cn/2017/10/01/lucene5/(6)%E8%AF%BB%E5%8F%96word%E3%80%81pdf%E5%86%85%E5%AE%B9/
Go to the source link to view the article. You can view the picture content, updated content and better typesetting reading experience. If the link is broken, please click the button below to view the snapshot at that time.

lucene5(6)读取word、pdf内容

祈雨的博客
2017-10-01

No bibi 亮代码

/**
 * Reads a legacy Word ({@code .doc}) file and prints its extracted text to standard output.
 *
 * @throws Exception if reading the file or extracting its text fails
 */
@Test
public void readWord2003() throws Exception {
    // try-with-resources closes the extractor, the document and the stream (in that
    // order) even when a constructor or getText() throws, so the file handle is not leaked.
    try (InputStream is = new FileInputStream("E:/test.doc");
         HWPFDocument doc2003 = new HWPFDocument(is);
         WordExtractor word2003 = new WordExtractor(doc2003)) {
        System.out.println(word2003.getText());
    }
}

/**
 * Reads an OOXML Word ({@code .docx}) file and prints its extracted text to standard output.
 *
 * @throws Exception if reading the file or extracting its text fails
 */
@Test
public void readWord2007() throws Exception {
    // try-with-resources closes the extractor, the document and the stream (in that
    // order) even when a constructor or getText() throws, so the file handle is not leaked.
    try (InputStream is = new FileInputStream("E:/test.docx");
         XWPFDocument doc2007 = new XWPFDocument(is);
         XWPFWordExtractor word2007 = new XWPFWordExtractor(doc2007)) {
        System.out.println(word2007.getText());
    }
}

/**
 * Reads a PDF file and prints its extracted text to standard output.
 *
 * @throws Exception if loading the PDF or extracting its text fails
 */
@Test
public void readPDF() throws Exception {
    // try-with-resources closes the document and then the stream even when
    // load() or getText() throws, so the file handle is not leaked.
    try (InputStream is = new FileInputStream("E:/test.pdf");
         PDDocument document = PDDocument.load(is)) {
        // Create a PDFTextStripper, the object that strips text out of the document.
        PDFTextStripper stripper = new PDFTextStripper();
        // Extract the text content.
        String content = stripper.getText(document);
        System.out.println(content);
    }
}

About Joyk


Aggregate valuable and interesting links.
Joyk means Joy of geeK