diff --git a/apache-tika/.gitignore b/apache-tika/.gitignore new file mode 100644 index 0000000000..5f88621edc --- /dev/null +++ b/apache-tika/.gitignore @@ -0,0 +1,3 @@ +*.docx +temp.xls +temp.xlsx \ No newline at end of file diff --git a/apache-tika/pom.xml b/apache-tika/pom.xml new file mode 100644 index 0000000000..34013dee89 --- /dev/null +++ b/apache-tika/pom.xml @@ -0,0 +1,25 @@ + + 4.0.0 + com.baeldung + apache-tika + 0.0.1-SNAPSHOT + + + com.baeldung + parent-modules + 1.0.0-SNAPSHOT + + + + 1.17 + + + + + org.apache.tika + tika-parsers + ${tika.version} + + + \ No newline at end of file diff --git a/apache-tika/src/main/java/com/baeldung/tika/TikaAnalysis.java b/apache-tika/src/main/java/com/baeldung/tika/TikaAnalysis.java new file mode 100644 index 0000000000..85eafc7c08 --- /dev/null +++ b/apache-tika/src/main/java/com/baeldung/tika/TikaAnalysis.java @@ -0,0 +1,67 @@ +package com.baeldung.tika; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.tika.Tika; +import org.apache.tika.detect.DefaultDetector; +import org.apache.tika.detect.Detector; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** Static helper methods that call Apache Tika to detect a document type, extract body text, and read metadata; each task has a Detector or Parser variant and a Tika facade variant. */ public class TikaAnalysis { + /** Detects the media type of {@code stream} with a {@link DefaultDetector} and a new, empty {@link Metadata}, then returns {@code MediaType.toString()}. An IOException from the detector is propagated. */ public static String detectDocTypeUsingDetector(InputStream stream) throws IOException { + Detector detector = new DefaultDetector(); + Metadata metadata = new Metadata(); + + MediaType mediaType = detector.detect(stream, metadata); + return mediaType.toString(); + } + + /** Returns the string produced by {@code Tika.detect(stream)} on a new {@link Tika} facade instance. */ public static String detectDocTypeUsingFacade(InputStream stream) throws IOException { + Tika tika = new Tika(); + String mediaType = tika.detect(stream); + return mediaType; + } + + /** Parses {@code stream} with an {@link AutoDetectParser} into a {@link BodyContentHandler} and returns the {@code toString()} of that handler; the Metadata passed to the parser is discarded. */ public static String 
extractContentUsingParser(InputStream stream) throws IOException, TikaException, SAXException { + Parser parser = new AutoDetectParser(); + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + + parser.parse(stream, handler, metadata, context); + return handler.toString(); + } + + /** Returns the string produced by {@code Tika.parseToString(stream)} on a new {@link Tika} facade instance. */ public static String extractContentUsingFacade(InputStream stream) throws IOException, TikaException { + Tika tika = new Tika(); + String content = tika.parseToString(stream); + return content; + } + + /** Parses {@code stream} with an {@link AutoDetectParser} and returns the {@link Metadata} instance that was handed to the parser; the content handler is not returned. */ public static Metadata extractMetadatatUsingParser(InputStream stream) throws IOException, SAXException, TikaException { + Parser parser = new AutoDetectParser(); + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + + parser.parse(stream, handler, metadata, context); + return metadata; + } + + /** Calls {@code Tika.parse(stream, metadata)} with a new {@link Metadata} and returns that instance; the value returned by {@code parse} is ignored. */ public static Metadata extractMetadatatUsingFacade(InputStream stream) throws IOException, TikaException { + Tika tika = new Tika(); + Metadata metadata = new Metadata(); + + tika.parse(stream, metadata); + return metadata; + } +} diff --git a/apache-tika/src/test/java/com/baeldung/tika/TikaUnitTest.java b/apache-tika/src/test/java/com/baeldung/tika/TikaUnitTest.java new file mode 100644 index 0000000000..555a796d59 --- /dev/null +++ b/apache-tika/src/test/java/com/baeldung/tika/TikaUnitTest.java @@ -0,0 +1,81 @@ +package com.baeldung.tika; + +import static org.hamcrest.CoreMatchers.containsString; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThat; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.junit.Test; +import org.xml.sax.SAXException; + +/** JUnit tests for {@link TikaAnalysis}; each test loads a resource through the class loader of this test class. NOTE(review): every test calls stream.close() after its assertions and outside any try/finally, so the close call is skipped when an assertion throws. */ public class TikaUnitTest { + private ClassLoader loader = this.getClass().getClassLoader(); + + /** Detector variant: expects {@code application/pdf} for the resource named tika.txt. NOTE(review): presumably the file holds PDF content despite its extension - confirm. */ @Test + public void 
whenUsingDetector_thenDocumentTypeIsReturned() throws IOException { + InputStream stream = loader.getResourceAsStream("tika.txt"); + String mediaType = TikaAnalysis.detectDocTypeUsingDetector(stream); + + assertEquals("application/pdf", mediaType); + + stream.close(); + } + + /** Facade variant of the type-detection test; same resource and expected value. */ @Test + public void whenUsingFacade_thenDocumentTypeIsReturned() throws IOException { + InputStream stream = loader.getResourceAsStream("tika.txt"); + String mediaType = TikaAnalysis.detectDocTypeUsingFacade(stream); + + assertEquals("application/pdf", mediaType); + + stream.close(); + } + + /** Parser variant: checks that the text extracted from tika.docx contains two expected phrases. NOTE(review): the .gitignore added in this patch ignores *.docx and no tika.docx appears among the added resources shown here - confirm the file is available on the test classpath. */ @Test + public void whenUsingParser_thenContentIsReturned() throws IOException, TikaException, SAXException { + InputStream stream = loader.getResourceAsStream("tika.docx"); + String content = TikaAnalysis.extractContentUsingParser(stream); + + assertThat(content, containsString("Apache Tika - a content analysis toolkit")); + assertThat(content, containsString("detects and extracts metadata and text")); + + stream.close(); + } + + /** Facade variant of the content-extraction test; same resource and expected phrases. */ @Test + public void whenUsingFacade_thenContentIsReturned() throws IOException, TikaException { + InputStream stream = loader.getResourceAsStream("tika.docx"); + String content = TikaAnalysis.extractContentUsingFacade(stream); + + assertThat(content, containsString("Apache Tika - a content analysis toolkit")); + assertThat(content, containsString("detects and extracts metadata and text")); + + stream.close(); + } + + /** Parser variant: checks the X-Parsed-By and Author metadata values read from tika.xlsx. */ @Test + public void whenUsingParser_thenMetadataIsReturned() throws IOException, TikaException, SAXException { + InputStream stream = loader.getResourceAsStream("tika.xlsx"); + Metadata metadata = TikaAnalysis.extractMetadatatUsingParser(stream); + + assertEquals("org.apache.tika.parser.DefaultParser", metadata.get("X-Parsed-By")); + assertEquals("Microsoft Office User", metadata.get("Author")); + + stream.close(); + } + + /** Facade variant of the metadata test; same resource and expected values. */ @Test + public void whenUsingFacade_thenMetadataIsReturned() throws IOException, TikaException { + InputStream stream = 
loader.getResourceAsStream("tika.xlsx"); + Metadata metadata = TikaAnalysis.extractMetadatatUsingFacade(stream); + + assertEquals("org.apache.tika.parser.DefaultParser", metadata.get("X-Parsed-By")); + assertEquals("Microsoft Office User", metadata.get("Author")); + + stream.close(); + } +} diff --git a/apache-tika/src/test/resources/tika.txt b/apache-tika/src/test/resources/tika.txt new file mode 100644 index 0000000000..26923836de Binary files /dev/null and b/apache-tika/src/test/resources/tika.txt differ diff --git a/apache-tika/src/test/resources/tika.xlsx b/apache-tika/src/test/resources/tika.xlsx new file mode 100644 index 0000000000..110a0b2dd5 Binary files /dev/null and b/apache-tika/src/test/resources/tika.xlsx differ diff --git a/pom.xml b/pom.xml index 75cf4b84b8..1731e1661c 100644 --- a/pom.xml +++ b/pom.xml @@ -38,6 +38,7 @@ apache-cxf apache-fop apache-poi + apache-tika apache-thrift autovalue axon @@ -383,4 +384,4 @@ - \ No newline at end of file +