From acb317d17164a9abdb49731a6f66bd4573a88b66 Mon Sep 17 00:00:00 2001
From: macroscopic64
Date: Fri, 7 Feb 2020 23:08:53 +0530
Subject: [PATCH] BAEL-3480 Java Fast pattern matching using trie and suffix tree

---
 .../baeldung/algorithms/suffixtree/Node.java  |  83 ++++++++
 .../algorithms/suffixtree/SuffixTree.java     | 209 ++++++++++++++++++
 .../suffixtree/SuffixTreeUnitTest.java        |  80 +++++++
 3 files changed, 372 insertions(+)
 create mode 100644 algorithms-searching/src/main/java/com/baeldung/algorithms/suffixtree/Node.java
 create mode 100644 algorithms-searching/src/main/java/com/baeldung/algorithms/suffixtree/SuffixTree.java
 create mode 100644 algorithms-searching/src/test/java/com/baeldung/algorithms/suffixtree/SuffixTreeUnitTest.java

diff --git a/algorithms-searching/src/main/java/com/baeldung/algorithms/suffixtree/Node.java b/algorithms-searching/src/main/java/com/baeldung/algorithms/suffixtree/Node.java
new file mode 100644
--- /dev/null
+++ b/algorithms-searching/src/main/java/com/baeldung/algorithms/suffixtree/Node.java
@@ -0,0 +1,83 @@
+package com.baeldung.algorithms.suffixtree;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A node of the suffix tree: a text fragment, the child nodes below it and, for a node that
+ * ends a suffix, the start position of that suffix in the indexed text.
+ */
+public class Node {
+    private String text;
+    private List<Node> children;
+    private int position;
+
+    /**
+     * Creates a node without children.
+     *
+     * @param word the text fragment stored in this node
+     * @param position start position of the suffix ending here; negative when there is none
+     */
+    public Node(String word, int position) {
+        this.text = word;
+        this.position = position;
+        this.children = new ArrayList<>();
+    }
+
+    public String getText() {
+        return text;
+    }
+
+    public void setText(String text) {
+        this.text = text;
+    }
+
+    public int getPosition() {
+        return position;
+    }
+
+    public void setPosition(int position) {
+        this.position = position;
+    }
+
+    public List<Node> getChildren() {
+        return children;
+    }
+
+    public void setChildren(List<Node> children) {
+        this.children = children;
+    }
+
+    /**
+     * Renders this node and all its descendants, one node per line.
+     *
+     * @param depthIndicator prefix printed before this node; every level below adds one tab
+     * @return the rendering, each line terminated by a newline
+     */
+    public String printTree(String depthIndicator) {
+        StringBuilder output = new StringBuilder();
+        appendTree(output, depthIndicator);
+        return output.toString();
+    }
+
+    // Writes into one shared builder so the partial output is not re-copied at every level.
+    private void appendTree(StringBuilder output, String depthIndicator) {
+        output.append(depthIndicator)
+            .append(text);
+        if (position > -1) {
+            output.append('[')
+                .append(position)
+                .append(']');
+        }
+        output.append('\n');
+
+        for (Node child : children) {
+            child.appendTree(output, depthIndicator + "\t");
+        }
+    }
+
+    @Override
+    public String toString() {
+        return printTree("");
+    }
+}
diff --git a/algorithms-searching/src/main/java/com/baeldung/algorithms/suffixtree/SuffixTree.java b/algorithms-searching/src/main/java/com/baeldung/algorithms/suffixtree/SuffixTree.java
new file mode 100644
--- /dev/null
+++ b/algorithms-searching/src/main/java/com/baeldung/algorithms/suffixtree/SuffixTree.java
@@ -0,0 +1,209 @@
+package com.baeldung.algorithms.suffixtree;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A naively built suffix tree used to find every occurrence of a pattern in the indexed text.
+ * Each suffix is stored with a trailing "$" so that no suffix is a prefix of another one.
+ */
+public class SuffixTree {
+    private static final Logger LOGGER = LoggerFactory.getLogger(SuffixTree.class);
+    private static final String WORD_TERMINATION = "$";
+    private static final int POSITION_UNDEFINED = -1;
+    private Node root;
+    private String fullText;
+
+    public SuffixTree() {
+        root = new Node("", POSITION_UNDEFINED);
+        fullText = "";
+    }
+
+    /**
+     * Indexes every suffix of {@code text} and appends the text to the already indexed text.
+     * Occurrences that span the boundary between two added texts are not indexed.
+     *
+     * @param text the text to add
+     */
+    public void addText(String text) {
+        // Matches are marked in fullText, so positions must be offset by what is already there.
+        int offset = fullText.length();
+        for (int i = 0; i < text.length(); i++) {
+            addSuffix(text.substring(i) + WORD_TERMINATION, offset + i);
+        }
+        fullText += text;
+    }
+
+    /**
+     * Finds all occurrences of {@code pattern} in the indexed text.
+     *
+     * @param pattern the text to look for
+     * @return the indexed text once per occurrence, ordered by position, with that occurrence
+     *         wrapped in square brackets; empty when nothing matches or the pattern is null or empty
+     */
+    public List<String> searchText(String pattern) {
+        LOGGER.info("Searching for pattern \"{}\"", pattern);
+        if (pattern == null || pattern.isEmpty()) {
+            return new ArrayList<>();
+        }
+
+        List<Node> nodes = getAllNodesInTraversePath(pattern, root, false);
+        if (nodes.isEmpty()) {
+            return new ArrayList<>();
+        }
+
+        // A trailing null marks a pattern that ran past the tree, i.e. no match.
+        Node lastNode = nodes.get(nodes.size() - 1);
+        if (lastNode == null) {
+            return new ArrayList<>();
+        }
+
+        return getPositions(lastNode).stream()
+            .sorted()
+            .map(position -> markPatternInText(position, pattern))
+            .collect(Collectors.toList());
+    }
+
+    private void addSuffix(String suffix, int position) {
+        LOGGER.info(">>>>>>>>>>>> Adding new suffix {}", suffix);
+        List<Node> nodes = getAllNodesInTraversePath(suffix, root, true);
+        if (nodes.isEmpty()) {
+            addChildNode(root, suffix, position);
+        } else {
+            Node lastNode = nodes.remove(nodes.size() - 1);
+
+            // The remaining nodes are ancestors whose text is already part of the path,
+            // so only what follows them still has to be merged into lastNode.
+            int coveredLength = nodes.stream()
+                .mapToInt(node -> node.getText().length())
+                .sum();
+            extendNode(lastNode, suffix.substring(coveredLength), position);
+        }
+        LOGGER.info("{}", printTree());
+    }
+
+    // Collects the start positions of all suffixes that end in the given node or below it.
+    private List<Integer> getPositions(Node node) {
+        List<Integer> positions = new ArrayList<>();
+        // Checking the position (instead of a trailing "$") also covers the empty-text leaf
+        // that is created when the same suffix is added a second time.
+        if (node.getPosition() != POSITION_UNDEFINED) {
+            positions.add(node.getPosition());
+        }
+        for (Node child : node.getChildren()) {
+            positions.addAll(getPositions(child));
+        }
+        return positions;
+    }
+
+    private String markPatternInText(Integer startPosition, String pattern) {
+        // Clamp the end so a pattern that also matched the trailing "$" cannot run past fullText.
+        int endPosition = Math.min(startPosition + pattern.length(), fullText.length());
+        String matchingTextLHS = fullText.substring(0, startPosition);
+        String matchingText = fullText.substring(startPosition, endPosition);
+        String matchingTextRHS = fullText.substring(endPosition);
+        return matchingTextLHS + "[" + matchingText + "]" + matchingTextRHS;
+    }
+
+    private void addChildNode(Node parentNode, String text, int position) {
+        parentNode.getChildren()
+            .add(new Node(text, position));
+    }
+
+    // Merges newText into node: splits the node where the texts diverge, then adds the rest as a child.
+    private void extendNode(Node node, String newText, int position) {
+        String currentText = node.getText();
+        String commonPrefix = getLongestCommonPrefix(currentText, newText);
+
+        // Compare by content; "!=" on strings only compares references.
+        if (!commonPrefix.equals(currentText)) {
+            String childText = currentText.substring(commonPrefix.length());
+            splitNodeToParentAndChild(node, commonPrefix, childText);
+        }
+
+        String remainingText = newText.substring(commonPrefix.length());
+        addChildNode(node, remainingText, position);
+    }
+
+    // Keeps parentNewText in parentNode and moves the rest of its text, its position and its
+    // children into a new single child.
+    private void splitNodeToParentAndChild(Node parentNode, String parentNewText, String childNewText) {
+        Node childNode = new Node(childNewText, parentNode.getPosition());
+        childNode.getChildren()
+            .addAll(parentNode.getChildren());
+
+        parentNode.getChildren()
+            .clear();
+        parentNode.getChildren()
+            .add(childNode);
+        parentNode.setText(parentNewText);
+        parentNode.setPosition(POSITION_UNDEFINED);
+    }
+
+    private String getLongestCommonPrefix(String str1, String str2) {
+        int compareLength = Math.min(str1.length(), str2.length());
+        for (int i = 0; i < compareLength; i++) {
+            if (str1.charAt(i) != str2.charAt(i)) {
+                return str1.substring(0, i);
+            }
+        }
+        return str1.substring(0, compareLength);
+    }
+
+    /**
+     * Walks down from {@code startNode} along {@code pattern} and returns the visited nodes.
+     *
+     * @param isAllowPartialMatch when true (used while inserting) the walk stops at, and
+     *        includes, the node the pattern has to be merged into; when false (used while
+     *        searching) an unmatched pattern yields an empty list or a trailing null
+     */
+    private List<Node> getAllNodesInTraversePath(String pattern, Node startNode, boolean isAllowPartialMatch) {
+        List<Node> nodes = new ArrayList<>();
+        if (pattern.isEmpty()) {
+            return nodes;
+        }
+
+        for (Node currentNode : startNode.getChildren()) {
+            String nodeText = currentNode.getText();
+            // Empty-text leaves (duplicate suffixes) cannot continue any path.
+            if (nodeText.isEmpty() || pattern.charAt(0) != nodeText.charAt(0)) {
+                continue;
+            }
+
+            if (isAllowPartialMatch && pattern.length() <= nodeText.length()) {
+                nodes.add(currentNode);
+                return nodes;
+            }
+
+            int compareLength = Math.min(nodeText.length(), pattern.length());
+            for (int j = 1; j < compareLength; j++) {
+                if (pattern.charAt(j) != nodeText.charAt(j)) {
+                    if (isAllowPartialMatch) {
+                        nodes.add(currentNode);
+                    }
+                    return nodes;
+                }
+            }
+
+            nodes.add(currentNode);
+            if (pattern.length() > compareLength) {
+                List<Node> nodes2 = getAllNodesInTraversePath(pattern.substring(compareLength), currentNode, isAllowPartialMatch);
+                if (nodes2.isEmpty() && !isAllowPartialMatch) {
+                    nodes.add(null);
+                    return nodes;
+                }
+                nodes.addAll(nodes2);
+            }
+            return nodes;
+        }
+        return nodes;
+    }
+
+    public String printTree() {
+        return root.printTree("");
+    }
+}
diff --git a/algorithms-searching/src/test/java/com/baeldung/algorithms/suffixtree/SuffixTreeUnitTest.java b/algorithms-searching/src/test/java/com/baeldung/algorithms/suffixtree/SuffixTreeUnitTest.java
new file mode 100644
--- /dev/null
+++ b/algorithms-searching/src/test/java/com/baeldung/algorithms/suffixtree/SuffixTreeUnitTest.java
@@ -0,0 +1,80 @@
+package com.baeldung.algorithms.suffixtree;
+
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class SuffixTreeUnitTest {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(SuffixTreeUnitTest.class);
+
+    private static SuffixTree suffixTree;
+
+    @BeforeClass
+    public static void setUp() {
+        suffixTree = new SuffixTree();
+        suffixTree.addText("bananabanana");
+        printTree();
+    }
+
+    @Test
+    public void givenSuffixTree_whenSearchingForA_thenReturn6Matches() {
+        List<String> matches = suffixTree.searchText("a");
+        matches.forEach(m -> LOGGER.info(m));
+        Assert.assertArrayEquals(new String[] { "b[a]nanabanana", "ban[a]nabanana", "banan[a]banana", "bananab[a]nana", "bananaban[a]na", "bananabanan[a]" }, matches.toArray());
+    }
+
+    @Test
+    public void givenSuffixTree_whenSearchingForNab_thenReturn1Match() {
+        List<String> matches = suffixTree.searchText("nab");
+        matches.forEach(m -> LOGGER.info(m));
+        Assert.assertArrayEquals(new String[] { "bana[nab]anana" }, matches.toArray());
+    }
+
+    @Test
+    public void givenSuffixTree_whenSearchingForNag_thenReturnNoMatches() {
+        List<String> matches = suffixTree.searchText("nag");
+        matches.forEach(m -> LOGGER.info(m));
+        Assert.assertArrayEquals(new String[] {}, matches.toArray());
+    }
+
+    @Test
+    public void givenSuffixTree_whenSearchingForBanana_thenReturn2Matches() {
+        List<String> matches = suffixTree.searchText("banana");
+        matches.forEach(m -> LOGGER.info(m));
+        Assert.assertArrayEquals(new String[] { "[banana]banana", "banana[banana]" }, matches.toArray());
+    }
+
+    @Test
+    public void givenSuffixTree_whenSearchingForNa_thenReturn4Matches() {
+        List<String> matches = suffixTree.searchText("na");
+        matches.forEach(m -> LOGGER.info(m));
+        Assert.assertArrayEquals(new String[] { "ba[na]nabanana", "bana[na]banana", "bananaba[na]na", "bananabana[na]" }, matches.toArray());
+    }
+
+    @Test
+    public void givenSuffixTree_whenSearchingForX_thenReturnNoMatches() {
+        List<String> matches = suffixTree.searchText("x");
+        matches.forEach(m -> LOGGER.info(m));
+        Assert.assertArrayEquals(new String[] {}, matches.toArray());
+    }
+
+    @Test
+    public void givenTextAddedTwice_whenSearchingForB_thenPositionsReferToFullText() {
+        SuffixTree tree = new SuffixTree();
+        tree.addText("ab");
+        tree.addText("ab");
+        List<String> matches = tree.searchText("b");
+        matches.forEach(m -> LOGGER.info(m));
+        Assert.assertArrayEquals(new String[] { "a[b]ab", "aba[b]" }, matches.toArray());
+    }
+
+    private static void printTree() {
+        LOGGER.info("\n" + suffixTree.printTree());
+        LOGGER.info("==============================================");
+    }
+}