Merge pull request #12 from Kabhal/normalization

Normalization + hardbreaks fixes
2016-02-16 12:43:30 +01:00
parent d26789dd8b 3d31f71f69
commit bff7be0714
4 changed files with 55 additions and 7 deletions
--- a/build.gradle
+++ b/build.gradle
@@ -48,6 +48,8 @@ dependencies {
    compile 'com.google.guava:guava'
    compile "commons-codec:commons-codec"
    testCompile 'junit:junit'
+    testCompile "org.mockito:mockito-core"
+
    testCompile 'ch.qos.logback:logback-classic'
 }

@@ -55,6 +57,7 @@ dependencyManagement {
    dependencies {
        dependency "org.slf4j:slf4j-api:1.7.12"
        dependency "junit:junit:4.11"
+        dependency "org.mockito:mockito-core:1.9.5"
        dependency "ch.qos.logback:logback-classic:1.1.2"
        dependency "commons-collections:commons-collections:3.2.1"
        dependency "org.apache.commons:commons-lang3:3.2.1"
--- a/src/main/java/io/github/robwin/markup/builder/AbstractMarkupDocBuilder.java
+++ b/src/main/java/io/github/robwin/markup/builder/AbstractMarkupDocBuilder.java
@@ -36,8 +36,8 @@ import java.util.regex.Pattern;
 */
 public abstract class AbstractMarkupDocBuilder implements MarkupDocBuilder {

-    private static final Pattern ANCHOR_FORBIDDEN_PATTERN = Pattern.compile("[^0-9a-zA-Z-_]+");
-    private static final Pattern ANCHOR_PUNCTUATION_PATTERN = Pattern.compile("\\p{Punct}+");
+    private static final Pattern ANCHOR_UNIGNORABLE_PATTERN = Pattern.compile("[^0-9a-zA-Z-_]+");
+    private static final Pattern ANCHOR_IGNORABLE_PATTERN = Pattern.compile("[\\p{InCombiningDiacriticalMarks}@#&(){}\\[\\]!$*%+=/:.;,?\\\\<>|]+");
    private static final Pattern ANCHOR_SPACE_PATTERN = Pattern.compile("[\\s]+");

    protected StringBuilder documentBuilder = new StringBuilder();
@@ -159,15 +159,15 @@ public abstract class AbstractMarkupDocBuilder implements MarkupDocBuilder {
     */
    protected String normalizeAnchor(Markup spaceEscape, String anchor) {
        String normalizedAnchor = anchor.trim();
-        String trimAnchor = normalizedAnchor;
        normalizedAnchor = Normalizer.normalize(normalizedAnchor, Normalizer.Form.NFD);
-        normalizedAnchor = ANCHOR_PUNCTUATION_PATTERN.matcher(normalizedAnchor).replaceAll("");
+        normalizedAnchor = ANCHOR_IGNORABLE_PATTERN.matcher(normalizedAnchor).replaceAll("");
+        normalizedAnchor = normalizedAnchor.trim();
        normalizedAnchor = normalizedAnchor.toLowerCase();
        normalizedAnchor = ANCHOR_SPACE_PATTERN.matcher(normalizedAnchor).replaceAll(spaceEscape.toString());

-        String validAnchor = ANCHOR_FORBIDDEN_PATTERN.matcher(normalizedAnchor).replaceAll("");
+        String validAnchor = ANCHOR_UNIGNORABLE_PATTERN.matcher(normalizedAnchor).replaceAll("");
        if (validAnchor.length() != normalizedAnchor.length())
-            normalizedAnchor = DigestUtils.md5Hex(trimAnchor);
+            normalizedAnchor = DigestUtils.md5Hex(normalizedAnchor);
        else
            normalizedAnchor = validAnchor;

--- a/src/main/java/io/github/robwin/markup/builder/asciidoc/AsciiDoc.java
+++ b/src/main/java/io/github/robwin/markup/builder/asciidoc/AsciiDoc.java
@@ -30,7 +30,7 @@ public enum AsciiDoc implements Markup {
    TABLE_COLUMN_DELIMITER("|"),
    TABLE_COLUMN_DELIMITER_ESCAPE("\\|"), // AsciiDoctor supports both \| and {vbar}
    LISTING("----"),
-    HARDBREAKS(":hardbreaks:"),
+    HARDBREAKS("[%hardbreaks]"),
    DOCUMENT_TITLE("= "),
    SECTION_TITLE_LEVEL1("== "),
    SECTION_TITLE_LEVEL2("=== "),
--- a/src/test/java/io/github/robwin/markup/builder/AbstractMarkupDocBuilderTest.java
+++ b/src/test/java/io/github/robwin/markup/builder/AbstractMarkupDocBuilderTest.java
@@ -0,0 +1,45 @@
+package io.github.robwin.markup.builder;
+
+import io.github.robwin.markup.builder.asciidoc.AsciiDoc;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.junit.Before;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import static org.junit.Assert.*;
+import static org.mockito.Mockito.mock;
+
+public class AbstractMarkupDocBuilderTest { // Unit test for AbstractMarkupDocBuilder.normalizeAnchor.
+
+    AbstractMarkupDocBuilder builder; // abstract class under test; assigned a Mockito mock in setUp()
+
+    @Before
+    public void setUp() {
+        builder = mock(AbstractMarkupDocBuilder.class, Mockito.CALLS_REAL_METHODS); // unstubbed calls run the real code, so normalizeAnchor is exercised without a concrete subclass
+    }
+
+    private String normalize(String anchor) { // shorthand: normalize 'anchor' using the AsciiDoc space escape
+        return builder.normalizeAnchor(AsciiDoc.SPACE_ESCAPE, anchor);
+    }
+
+    private void assertNormalization(String result, String anchor) { // 'result' is the expected normalized form of 'anchor'
+        assertEquals(result, normalize(anchor));
+    }
+
+    @Test
+    public void testNormalizeAnchor() throws Exception {
+        assertNormalization("", ""); // empty input stays empty
+        assertNormalization("anchor", "anchor"); // already-valid anchor is returned unchanged
+        assertNormalization("anchor", "aNcHoR"); // result is lower-cased
+        assertNormalization("__anchor__", "_ anchor _"); // whitespace becomes the space escape (the expected values imply it is "_")
+        assertNormalization("-_anchor_-", "- anchor -"); // '-' is a permitted anchor character
+        assertNormalization("classic-simple_anchor", "classic-simple_anchor"); // '-' and '_' are kept as-is
+        assertNormalization("an_chor", "     an    chor  "); // outer whitespace trimmed, inner run collapsed to a single escape
+        assertNormalization("anchor", "#  anchor  &"); // ignorable punctuation removed, then the remainder is trimmed again
+        assertNormalization(DigestUtils.md5Hex("\u0240"), "\u0240"); // character outside [0-9a-zA-Z-_] triggers the MD5 hex fallback
+        assertNormalization(normalize("\u0240"), " \u0240 "); // surrounding whitespace does not change the resulting hash
+        assertNormalization(DigestUtils.md5Hex("µu_\u0240this_-_"), "  µ&|ù \u0240This .:/-_#  "); // the hash is computed over the normalized text, not the raw input
+        assertNormalization("this_is_a_funky_string", "Tĥïŝ ĩš â fůňķŷ Šťŕĭńġ"); // diacritics are stripped (NFD decomposition, combining marks removed)
+        assertNormalization("", "  @#&(){}[]!$*%+=/:.;,?\\<>| "); // input made only of ignorable characters yields an empty anchor
+    }
+}