remove stop words from string
This commit is contained in:
@@ -0,0 +1,73 @@
|
||||
package com.baeldung.string.performance;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.openjdk.jmh.annotations.Benchmark;
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||
import org.openjdk.jmh.annotations.Fork;
|
||||
import org.openjdk.jmh.annotations.Mode;
|
||||
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||
import org.openjdk.jmh.annotations.Scope;
|
||||
import org.openjdk.jmh.annotations.Setup;
|
||||
import org.openjdk.jmh.annotations.State;
|
||||
|
||||
|
||||
@Fork(value = 3, warmups = 1)
|
||||
@State(Scope.Benchmark)
|
||||
@BenchmarkMode(Mode.AverageTime)
|
||||
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||
public class RemovingStopwordsPerformanceComparison {
|
||||
|
||||
private String data;
|
||||
|
||||
private List<String> stopwords;
|
||||
|
||||
private String stopwordsRegex;
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
org.openjdk.jmh.Main.main(args);
|
||||
}
|
||||
|
||||
@Setup
|
||||
public void setup() throws IOException {
|
||||
data = new String(Files.readAllBytes(Paths.get("src/main/resources/shakespeare-hamlet.txt")));
|
||||
data = data.toLowerCase();
|
||||
stopwords = Files.readAllLines(Paths.get("src/main/resources/english_stopwords.txt"));
|
||||
stopwordsRegex = stopwords.stream().collect(Collectors.joining("|", "\\b(", ")\\b\\s?"));
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public String removeManually() {
|
||||
String[] allWords = data.split(" ");
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for(String word:allWords) {
|
||||
if(! stopwords.contains(word)) {
|
||||
builder.append(word);
|
||||
builder.append(' ');
|
||||
}
|
||||
}
|
||||
return builder.toString().trim();
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public String removeAll() {
|
||||
ArrayList<String> allWords = Stream.of(data.split(" "))
|
||||
.collect(Collectors.toCollection(ArrayList<String>::new));
|
||||
allWords.removeAll(stopwords);
|
||||
return allWords.stream().collect(Collectors.joining(" "));
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public String replaceRegex() {
|
||||
return data.replaceAll(stopwordsRegex, "");
|
||||
}
|
||||
|
||||
}
|
||||
127
java-strings-2/src/main/resources/english_stopwords.txt
Normal file
127
java-strings-2/src/main/resources/english_stopwords.txt
Normal file
@@ -0,0 +1,127 @@
|
||||
i
|
||||
me
|
||||
my
|
||||
myself
|
||||
we
|
||||
our
|
||||
ours
|
||||
ourselves
|
||||
you
|
||||
your
|
||||
yours
|
||||
yourself
|
||||
yourselves
|
||||
he
|
||||
him
|
||||
his
|
||||
himself
|
||||
she
|
||||
her
|
||||
hers
|
||||
herself
|
||||
it
|
||||
its
|
||||
itself
|
||||
they
|
||||
them
|
||||
their
|
||||
theirs
|
||||
themselves
|
||||
what
|
||||
which
|
||||
who
|
||||
whom
|
||||
this
|
||||
that
|
||||
these
|
||||
those
|
||||
am
|
||||
is
|
||||
are
|
||||
was
|
||||
were
|
||||
be
|
||||
been
|
||||
being
|
||||
have
|
||||
has
|
||||
had
|
||||
having
|
||||
do
|
||||
does
|
||||
did
|
||||
doing
|
||||
a
|
||||
an
|
||||
the
|
||||
and
|
||||
but
|
||||
if
|
||||
or
|
||||
because
|
||||
as
|
||||
until
|
||||
while
|
||||
of
|
||||
at
|
||||
by
|
||||
for
|
||||
with
|
||||
about
|
||||
against
|
||||
between
|
||||
into
|
||||
through
|
||||
during
|
||||
before
|
||||
after
|
||||
above
|
||||
below
|
||||
to
|
||||
from
|
||||
up
|
||||
down
|
||||
in
|
||||
out
|
||||
on
|
||||
off
|
||||
over
|
||||
under
|
||||
again
|
||||
further
|
||||
then
|
||||
once
|
||||
here
|
||||
there
|
||||
when
|
||||
where
|
||||
why
|
||||
how
|
||||
all
|
||||
any
|
||||
both
|
||||
each
|
||||
few
|
||||
more
|
||||
most
|
||||
other
|
||||
some
|
||||
such
|
||||
no
|
||||
nor
|
||||
not
|
||||
only
|
||||
own
|
||||
same
|
||||
so
|
||||
than
|
||||
too
|
||||
very
|
||||
s
|
||||
t
|
||||
can
|
||||
will
|
||||
just
|
||||
don
|
||||
should
|
||||
now
|
||||
4922
java-strings-2/src/main/resources/shakespeare-hamlet.txt
Normal file
4922
java-strings-2/src/main/resources/shakespeare-hamlet.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,60 @@
|
||||
package com.baeldung.string;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
public class RemoveStopwordsUnitTest {
|
||||
final String original = "The quick brown fox jumps over the lazy dog";
|
||||
final String target = "quick brown fox jumps lazy dog";
|
||||
static List<String> stopwords;
|
||||
|
||||
@BeforeClass
|
||||
public static void loadStopwords() throws IOException {
|
||||
stopwords = Files.readAllLines(Paths.get("src/main/resources/english_stopwords.txt"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void whenRemoveStopwordsManually_thenSuccess() {
|
||||
String[] allWords = original.toLowerCase()
|
||||
.split(" ");
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (String word : allWords) {
|
||||
if (!stopwords.contains(word)) {
|
||||
builder.append(word);
|
||||
builder.append(' ');
|
||||
}
|
||||
}
|
||||
|
||||
String result = builder.toString().trim();
|
||||
assertEquals(result, target);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void whenRemoveStopwordsUsingRemoveAll_thenSuccess() {
|
||||
ArrayList<String> allWords = Stream.of(original.toLowerCase()
|
||||
.split(" "))
|
||||
.collect(Collectors.toCollection(ArrayList<String>::new));
|
||||
allWords.removeAll(stopwords);
|
||||
String result = allWords.stream().collect(Collectors.joining(" "));
|
||||
assertEquals(result, target);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void whenRemoveStopwordsUsingRegex_thenSuccess() {
|
||||
String stopwordsRegex = stopwords.stream()
|
||||
.collect(Collectors.joining("|", "\\b(", ")\\b\\s?"));
|
||||
String result = original.toLowerCase().replaceAll(stopwordsRegex, "");
|
||||
assertEquals(result, target);
|
||||
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user