BAEL-1557 Intro to Apache OpenNlp

mansi2392 committed 2018-03-30 11:28:42 +05:30
parent 764ccb54a9
commit db10f66e60
15 changed files with 301773 additions and 0 deletions

ChunkerTest.java · View File

@@ -0,0 +1,36 @@
package com.baeldung.apache.opennlp;
import java.io.FileInputStream;
import java.io.InputStream;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
/**
*
* @author Parth
*/
public class ChunkerTest {
@Test
public void givenSentence_whenChunk_thenGetChunks() throws Exception {
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
String[] tokens = tokenizer.tokenize("He reckons the current account deficit will narrow to only 8 billion.");
InputStream inputStreamPOSTagger = new FileInputStream("src/main/resources/models/en-pos-maxent.bin");
POSModel posModel = new POSModel(inputStreamPOSTagger);
POSTaggerME posTagger = new POSTaggerME(posModel);
String[] tags = posTagger.tag(tokens);
InputStream inputStreamChunker = new FileInputStream("src/main/resources/models/en-chunker.bin");
ChunkerModel chunkerModel = new ChunkerModel(inputStreamChunker);
ChunkerME chunker = new ChunkerME(chunkerModel);
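// chunk() returns one tag per token: B- marks the start of a phrase (NP, VP, PP), I- a continuation, O a token outside any chunk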
String[] chunks = chunker.chunk(tokens, tags);
assertThat(chunks).contains("B-NP", "B-VP", "B-NP", "I-NP", "I-NP", "I-NP", "B-VP", "I-VP", "B-PP", "B-NP", "I-NP", "I-NP", "O");
}
}

LanguageDetectorAndTrainingData.java · View File

@@ -0,0 +1,41 @@
package com.baeldung.apache.opennlp;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import opennlp.tools.langdetect.Language;
import opennlp.tools.langdetect.LanguageDetector;
import opennlp.tools.langdetect.LanguageDetectorFactory;
import opennlp.tools.langdetect.LanguageDetectorME;
import opennlp.tools.langdetect.LanguageDetectorModel;
import opennlp.tools.langdetect.LanguageDetectorSampleStream;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
public class LanguageDetectorAndTrainingData {
@Test
public void givenTextInPortuguese_whenLanguageDetection_thenCorrectLanguageIsDetected() throws FileNotFoundException, IOException {
InputStreamFactory dataIn = new MarkableFileInputStreamFactory(new File("src/main/resources/models/DoccatSample.txt"));
ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, "UTF-8");
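// Each line of DoccatSample.txt is expected to hold a language code, a tab, and the sample text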
LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream);
TrainingParameters params = new TrainingParameters();
params.put(TrainingParameters.ITERATIONS_PARAM, 100);
params.put(TrainingParameters.CUTOFF_PARAM, 5);
params.put("DataIndexer", "TwoPass");
params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");
LanguageDetectorModel model = LanguageDetectorME.train(sampleStream, params, new LanguageDetectorFactory());
LanguageDetector ld = new LanguageDetectorME(model);
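// predictLanguages returns every language known to the model, ranked by confidence (best match first)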
Language[] languages = ld.predictLanguages("estava em uma marcenaria na Rua Bruno");
assertThat(Arrays.asList(languages).toString()).contains("pob (0.9999999950605625)", "ita (4.939427661577956E-9)", "spa (9.665954064665144E-15)",
"fra (8.250349924885834E-25)");
}
}

LemmetizerTest.java · View File

@@ -0,0 +1,31 @@
package com.baeldung.apache.opennlp;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import opennlp.tools.lemmatizer.DictionaryLemmatizer;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
public class LemmetizerTest {
@Test
public void givenSentence_whenLemmetize_thenGetLemmas() throws Exception {
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
String[] tokens = tokenizer.tokenize("John has a sister named Penny.");
InputStream inputStreamPOSTagger = new FileInputStream("src/main/resources/models/en-pos-maxent.bin");
POSModel posModel = new POSModel(inputStreamPOSTagger);
POSTaggerME posTagger = new POSTaggerME(posModel);
String[] tags = posTagger.tag(tokens);
InputStream dictLemmatizer = new FileInputStream("src/main/resources/models/en-lemmatizer.dict");
DictionaryLemmatizer lemmatizer = new DictionaryLemmatizer(dictLemmatizer);
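// Look up the lemma for each (token, POS tag) pair; tokens missing from the dictionary are returned as "O"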
String[] lemmas = lemmatizer.lemmatize(tokens, tags);
assertThat(lemmas).contains("O", "have", "a", "sister", "name", "O", "O");
}
}

NamedEntityRecognitionTest.java · View File

@@ -0,0 +1,40 @@
package com.baeldung.apache.opennlp;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.util.Span;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
public class NamedEntityRecognitionTest {
@Test
public void givenTextWithPersonNames_whenNER_thenGetPersonNamesList() throws Exception {
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
String[] tokens = tokenizer.tokenize("John is 26 years old. His best friend's name is Leonard. He has a sister named Penny.");
InputStream inputStreamNameFinder = new FileInputStream("src/main/resources/models/en-ner-person.bin");
TokenNameFinderModel model = new TokenNameFinderModel(inputStreamNameFinder);
NameFinderME nameFinderME = new NameFinderME(model);
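// find() returns Span objects with the start (inclusive) and end (exclusive) token indexes of each detected name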
List<Span> spans = Arrays.asList(nameFinderME.find(tokens));
assertThat(spans.toString()).isEqualTo("[[0..1) person, [13..14) person, [20..21) person]");
List<String> names = new ArrayList<>();
int k = 0;
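// Join the tokens covered by each span to rebuild the detected names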
for (Span s : spans) {
names.add("");
for (int index = s.getStart(); index < s.getEnd(); index++) {
names.set(k, names.get(k) + tokens[index]);
}
k++;
}
assertThat(names).contains("John","Leonard","Penny");
}
}

POSTaggerTest.java · View File

@@ -0,0 +1,25 @@
package com.baeldung.apache.opennlp;
import java.io.FileInputStream;
import java.io.InputStream;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
public class POSTaggerTest {
@Test
public void givenSentence_whenPOSTagging_thenGetTags() throws Exception {
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
String[] tokens = tokenizer.tokenize("John has a sister named Penny.");
InputStream inputStreamPOSTagger = new FileInputStream("src/main/resources/models/en-pos-maxent.bin");
POSModel posModel = new POSModel(inputStreamPOSTagger);
POSTaggerME posTagger = new POSTaggerME(posModel);
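// Tags follow the Penn Treebank tag set, e.g. NNP = proper noun, VBZ = verb (3rd person singular), DT = determiner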
String[] tags = posTagger.tag(tokens);
assertThat(tags).contains("NNP", "VBZ", "DT", "NN", "VBN", "NNP", ".");
}
}

SentenceDetectionTest.java · View File

@@ -0,0 +1,29 @@
package com.baeldung.apache.opennlp;
import java.io.FileInputStream;
import java.io.InputStream;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
public class SentenceDetectionTest {
@Test
public void givenText_whenDetectSent_thenGetSentences() throws Exception {
String paragraph = "This is a statement. This is another statement. Now is an abstract word for time, "
+ "that is always flying. And my email address is google@gmail.com.";
InputStream is = new FileInputStream("src/main/resources/models/en-sent.bin");
SentenceModel model = new SentenceModel(is);
SentenceDetectorME sdetector = new SentenceDetectorME(model);
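// sentDetect splits the paragraph on sentence boundaries; the trained model does not break on the period inside the email address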
String[] sentences = sdetector.sentDetect(paragraph);
assertThat(sentences).contains("This is a statement.",
"This is another statement.",
"Now is an abstract word for time, that is always flying.",
"And my email address is google@gmail.com.");
}
}

TokenizerTest.java · View File

@@ -0,0 +1,36 @@
package com.baeldung.apache.opennlp;
import java.io.FileInputStream;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.Test;
public class TokenizerTest {
@Test
public void givenString_whenTokenize_thenGetTokens() throws Exception {
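// TokenizerME is OpenNLP's learnable tokenizer; it needs the pre-trained en-token.bin maxent model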
FileInputStream fileInputStream = new FileInputStream("src/main/resources/models/en-token.bin");
TokenizerModel model = new TokenizerModel(fileInputStream);
TokenizerME tokenizer = new TokenizerME(model);
String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
}
@Test
public void givenString_whenWhitespaceTokenizer_thenGetTokens() throws Exception {
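// WhitespaceTokenizer splits only on whitespace, so the trailing period stays attached to "Resource."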
WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource.");
}
@Test
public void givenString_whenSimpleTokenizer_thenGetTokens() throws Exception {
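// SimpleTokenizer splits on character classes and needs no model; the punctuation becomes a separate token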
SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
}
}