diff --git a/apache-tika/.gitignore b/apache-tika/.gitignore
new file mode 100644
index 0000000000..5f88621edc
--- /dev/null
+++ b/apache-tika/.gitignore
@@ -0,0 +1,3 @@
+*.docx
+temp.xls
+temp.xlsx
\ No newline at end of file
diff --git a/apache-tika/pom.xml b/apache-tika/pom.xml
new file mode 100644
index 0000000000..34013dee89
--- /dev/null
+++ b/apache-tika/pom.xml
@@ -0,0 +1,25 @@
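+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>com.baeldung</groupId>
+    <artifactId>apache-tika</artifactId>
+    <version>0.0.1-SNAPSHOT</version>
+
+    <parent>
+        <groupId>com.baeldung</groupId>
+        <artifactId>parent-modules</artifactId>
+        <version>1.0.0-SNAPSHOT</version>
+    </parent>
+
+    <properties>
+        <tika.version>1.17</tika.version>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-parsers</artifactId>
+            <version>${tika.version}</version>
+        </dependency>
+    </dependencies>
+</project>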
\ No newline at end of file
diff --git a/apache-tika/src/main/java/com/baeldung/tika/TikaAnalysis.java b/apache-tika/src/main/java/com/baeldung/tika/TikaAnalysis.java
new file mode 100644
index 0000000000..85eafc7c08
--- /dev/null
+++ b/apache-tika/src/main/java/com/baeldung/tika/TikaAnalysis.java
@@ -0,0 +1,142 @@
+package com.baeldung.tika;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+
+import org.apache.tika.Tika;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Static helpers demonstrating Apache Tika through both its low-level
+ * {@link Detector}/{@link Parser} API and the {@link Tika} facade.
+ *
+ * <p>Callers own the streams they pass in and should close them.
+ */
+public class TikaAnalysis {
+    /**
+     * Detects a document's media type with a {@link DefaultDetector}.
+     *
+     * <p>Tika's {@link Detector} contract expects a stream that supports mark/reset so the
+     * leading bytes can be inspected and the stream rewound. A stream without that support
+     * is wrapped in a {@link BufferedInputStream} first; in that case the bytes read during
+     * detection are consumed from the caller's stream.
+     *
+     * @param stream document content to inspect
+     * @return the detected media type rendered as a string, e.g. {@code application/pdf}
+     * @throws IOException if reading from the stream fails
+     */
+    public static String detectDocTypeUsingDetector(InputStream stream) throws IOException {
+        Detector detector = new DefaultDetector();
+        Metadata metadata = new Metadata();
+
+        InputStream markable = stream;
+        if (stream != null && !stream.markSupported()) {
+            markable = new BufferedInputStream(stream);
+        }
+
+        MediaType mediaType = detector.detect(markable, metadata);
+        return mediaType.toString();
+    }
+
+    /**
+     * Detects a document's media type through the {@link Tika} facade.
+     *
+     * @param stream document content to inspect
+     * @return the media type string reported by {@link Tika#detect(InputStream)}
+     * @throws IOException if reading from the stream fails
+     */
+    public static String detectDocTypeUsingFacade(InputStream stream) throws IOException {
+        return new Tika().detect(stream);
+    }
+
+    /**
+     * Extracts a document's body text with an {@link AutoDetectParser}.
+     *
+     * @param stream document content to parse
+     * @return the text collected by a {@link BodyContentHandler}
+     * @throws IOException if reading from the stream fails
+     * @throws TikaException if the document cannot be parsed
+     * @throws SAXException if the content handler rejects the parser's output
+     */
+    public static String extractContentUsingParser(InputStream stream) throws IOException, TikaException, SAXException {
+        Parser parser = new AutoDetectParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+
+        parser.parse(stream, handler, metadata, context);
+        return handler.toString();
+    }
+
+    /**
+     * Extracts a document's text through the {@link Tika} facade.
+     *
+     * @param stream document content to parse
+     * @return the text returned by {@link Tika#parseToString(InputStream)}
+     * @throws IOException if reading from the stream fails
+     * @throws TikaException if the document cannot be parsed
+     */
+    public static String extractContentUsingFacade(InputStream stream) throws IOException, TikaException {
+        return new Tika().parseToString(stream);
+    }
+
+    /**
+     * Extracts a document's metadata with an {@link AutoDetectParser}.
+     *
+     * <p>The misspelled method name is kept as-is for existing callers.
+     *
+     * @param stream document content to parse
+     * @return the metadata populated during parsing
+     * @throws IOException if reading from the stream fails
+     * @throws SAXException if the content handler rejects the parser's output
+     * @throws TikaException if the document cannot be parsed
+     */
+    public static Metadata extractMetadatatUsingParser(InputStream stream) throws IOException, SAXException, TikaException {
+        Parser parser = new AutoDetectParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+
+        parser.parse(stream, handler, metadata, context);
+        return metadata;
+    }
+
+    /**
+     * Extracts a document's metadata through the {@link Tika} facade.
+     *
+     * <p>{@link Tika#parse(InputStream, Metadata)} returns a {@link Reader} over the
+     * extracted text. The reader is drained so parsing has finished before the metadata is
+     * returned, and closed so nothing backing it is left dangling.
+     *
+     * <p>The misspelled method name is kept as-is for existing callers.
+     *
+     * @param stream document content to parse
+     * @return the metadata populated during parsing
+     * @throws IOException if reading or parsing the document fails
+     * @throws TikaException kept in the signature for compatibility with existing callers
+     */
+    public static Metadata extractMetadatatUsingFacade(InputStream stream) throws IOException, TikaException {
+        Tika tika = new Tika();
+        Metadata metadata = new Metadata();
+
+        try (Reader reader = tika.parse(stream, metadata)) {
+            char[] discarded = new char[4096];
+            while (reader.read(discarded) != -1) {
+                // Only the metadata matters here; the extracted text is thrown away.
+            }
+        }
+        return metadata;
+    }
+}
diff --git a/apache-tika/src/test/java/com/baeldung/tika/TikaUnitTest.java b/apache-tika/src/test/java/com/baeldung/tika/TikaUnitTest.java
new file mode 100644
index 0000000000..555a796d59
--- /dev/null
+++ b/apache-tika/src/test/java/com/baeldung/tika/TikaUnitTest.java
@@ -0,0 +1,97 @@
+package com.baeldung.tika;
+
+import static org.hamcrest.CoreMatchers.containsString;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertThat;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+import org.xml.sax.SAXException;
+
+/**
+ * Exercises each {@link TikaAnalysis} helper against the fixtures on the test classpath.
+ *
+ * <p>Every stream is opened in try-with-resources so it is closed even when an assertion fails.
+ */
+public class TikaUnitTest {
+    private final ClassLoader loader = getClass().getClassLoader();
+
+    /**
+     * Opens a classpath fixture, failing the test with a clear message when it is missing
+     * instead of passing {@code null} on to Tika.
+     *
+     * @param name resource name relative to the classpath root
+     * @return an open stream that the caller must close
+     */
+    private InputStream openFixture(String name) {
+        InputStream stream = loader.getResourceAsStream(name);
+        assertNotNull("Test resource not found on the classpath: " + name, stream);
+        return stream;
+    }
+
+    @Test
+    public void whenUsingDetector_thenDocumentTypeIsReturned() throws IOException {
+        // The fixture has a .txt name but the assertion expects PDF, i.e. detection by
+        // content rather than by file name.
+        try (InputStream stream = openFixture("tika.txt")) {
+            String mediaType = TikaAnalysis.detectDocTypeUsingDetector(stream);
+
+            assertEquals("application/pdf", mediaType);
+        }
+    }
+
+    @Test
+    public void whenUsingFacade_thenDocumentTypeIsReturned() throws IOException {
+        try (InputStream stream = openFixture("tika.txt")) {
+            String mediaType = TikaAnalysis.detectDocTypeUsingFacade(stream);
+
+            assertEquals("application/pdf", mediaType);
+        }
+    }
+
+    @Test
+    public void whenUsingParser_thenContentIsReturned() throws IOException, TikaException, SAXException {
+        // NOTE(review): this module's .gitignore lists *.docx; confirm tika.docx is committed.
+        try (InputStream stream = openFixture("tika.docx")) {
+            String content = TikaAnalysis.extractContentUsingParser(stream);
+
+            assertThat(content, containsString("Apache Tika - a content analysis toolkit"));
+            assertThat(content, containsString("detects and extracts metadata and text"));
+        }
+    }
+
+    @Test
+    public void whenUsingFacade_thenContentIsReturned() throws IOException, TikaException {
+        try (InputStream stream = openFixture("tika.docx")) {
+            String content = TikaAnalysis.extractContentUsingFacade(stream);
+
+            assertThat(content, containsString("Apache Tika - a content analysis toolkit"));
+            assertThat(content, containsString("detects and extracts metadata and text"));
+        }
+    }
+
+    @Test
+    public void whenUsingParser_thenMetadataIsReturned() throws IOException, TikaException, SAXException {
+        try (InputStream stream = openFixture("tika.xlsx")) {
+            Metadata metadata = TikaAnalysis.extractMetadatatUsingParser(stream);
+
+            assertEquals("org.apache.tika.parser.DefaultParser", metadata.get("X-Parsed-By"));
+            assertEquals("Microsoft Office User", metadata.get("Author"));
+        }
+    }
+
+    @Test
+    public void whenUsingFacade_thenMetadataIsReturned() throws IOException, TikaException {
+        try (InputStream stream = openFixture("tika.xlsx")) {
+            Metadata metadata = TikaAnalysis.extractMetadatatUsingFacade(stream);
+
+            assertEquals("org.apache.tika.parser.DefaultParser", metadata.get("X-Parsed-By"));
+            assertEquals("Microsoft Office User", metadata.get("Author"));
+        }
+    }
+}
diff --git a/apache-tika/src/test/resources/tika.txt b/apache-tika/src/test/resources/tika.txt
new file mode 100644
index 0000000000..26923836de
Binary files /dev/null and b/apache-tika/src/test/resources/tika.txt differ
diff --git a/apache-tika/src/test/resources/tika.xlsx b/apache-tika/src/test/resources/tika.xlsx
new file mode 100644
index 0000000000..110a0b2dd5
Binary files /dev/null and b/apache-tika/src/test/resources/tika.xlsx differ
diff --git a/pom.xml b/pom.xml
index 75cf4b84b8..1731e1661c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -38,6 +38,7 @@
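         <module>apache-cxf</module>
         <module>apache-fop</module>
         <module>apache-poi</module>
+        <module>apache-tika</module>
         <module>apache-thrift</module>
         <module>autovalue</module>
         <module>axon</module>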
@@ -383,4 +384,4 @@
-</project>
\ No newline at end of file
+</project>