Initial commit for Apache Tika
This commit is contained in:
@@ -0,0 +1,81 @@
|
||||
package com.baeldung.tika;
|
||||
|
||||
import static org.hamcrest.CoreMatchers.containsString;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertThat;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.junit.Test;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
public class TikaUnitTest {
|
||||
private ClassLoader loader = this.getClass().getClassLoader();
|
||||
|
||||
@Test
|
||||
public void whenUsingDetector_thenDocumentTypeIsReturned() throws IOException {
|
||||
InputStream stream = loader.getResourceAsStream("tika.txt");
|
||||
String mediaType = TikaAnalysis.detectDocTypeUsingDetector(stream);
|
||||
|
||||
assertEquals("application/pdf", mediaType);
|
||||
|
||||
stream.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void whenUsingFacade_thenDocumentTypeIsReturned() throws IOException {
|
||||
InputStream stream = loader.getResourceAsStream("tika.txt");
|
||||
String mediaType = TikaAnalysis.detectDocTypeUsingFacade(stream);
|
||||
|
||||
assertEquals("application/pdf", mediaType);
|
||||
|
||||
stream.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void whenUsingParser_thenContentIsReturned() throws IOException, TikaException, SAXException {
|
||||
InputStream stream = loader.getResourceAsStream("tika.docx");
|
||||
String content = TikaAnalysis.extractContentUsingParser(stream);
|
||||
|
||||
assertThat(content, containsString("Apache Tika - a content analysis toolkit"));
|
||||
assertThat(content, containsString("detects and extracts metadata and text"));
|
||||
|
||||
stream.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void whenUsingFacade_thenContentIsReturned() throws IOException, TikaException {
|
||||
InputStream stream = loader.getResourceAsStream("tika.docx");
|
||||
String content = TikaAnalysis.extractContentUsingFacade(stream);
|
||||
|
||||
assertThat(content, containsString("Apache Tika - a content analysis toolkit"));
|
||||
assertThat(content, containsString("detects and extracts metadata and text"));
|
||||
|
||||
stream.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void whenUsingParser_thenMetadataIsReturned() throws IOException, TikaException, SAXException {
|
||||
InputStream stream = loader.getResourceAsStream("tika.xlsx");
|
||||
Metadata metadata = TikaAnalysis.extractMetadatatUsingParser(stream);
|
||||
|
||||
assertEquals("org.apache.tika.parser.DefaultParser", metadata.get("X-Parsed-By"));
|
||||
assertEquals("Microsoft Office User", metadata.get("Author"));
|
||||
|
||||
stream.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void whenUsingFacade_thenMetadataIsReturned() throws IOException, TikaException {
|
||||
InputStream stream = loader.getResourceAsStream("tika.xlsx");
|
||||
Metadata metadata = TikaAnalysis.extractMetadatatUsingFacade(stream);
|
||||
|
||||
assertEquals("org.apache.tika.parser.DefaultParser", metadata.get("X-Parsed-By"));
|
||||
assertEquals("Microsoft Office User", metadata.get("Author"));
|
||||
|
||||
stream.close();
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user