diff --git a/src/main/java/io/github/robwin/markup/builder/AbstractMarkupDocBuilder.java b/src/main/java/io/github/robwin/markup/builder/AbstractMarkupDocBuilder.java
index 570456e8..8b5fe11c 100644
--- a/src/main/java/io/github/robwin/markup/builder/AbstractMarkupDocBuilder.java
+++ b/src/main/java/io/github/robwin/markup/builder/AbstractMarkupDocBuilder.java
@@ -36,8 +36,10 @@ import java.util.regex.Pattern;
  */
 public abstract class AbstractMarkupDocBuilder implements MarkupDocBuilder {
 
-    private static final Pattern ANCHOR_FORBIDDEN_PATTERN = Pattern.compile("[^0-9a-zA-Z-_]+");
-    private static final Pattern ANCHOR_PUNCTUATION_PATTERN = Pattern.compile("\\p{Punct}+");
+    // Anything outside [0-9a-zA-Z-_]; if such a character survives normalization the anchor is replaced by an MD5 hash.
+    private static final Pattern ANCHOR_UNIGNORABLE_PATTERN = Pattern.compile("[^0-9a-zA-Z-_]+");
+    // Characters silently dropped from anchors: combining diacritical marks (left over after NFD) and listed punctuation.
+    private static final Pattern ANCHOR_IGNORABLE_PATTERN = Pattern.compile("[\\p{InCombiningDiacriticalMarks}@#&(){}\\[\\]!$*%+=/:.;,?\\\\<>|]+");
     private static final Pattern ANCHOR_SPACE_PATTERN = Pattern.compile("[\\s]+");
 
     protected StringBuilder documentBuilder = new StringBuilder();
@@ -159,15 +161,17 @@ public abstract class AbstractMarkupDocBuilder implements MarkupDocBuilder {
      */
     protected String normalizeAnchor(Markup spaceEscape, String anchor) {
         String normalizedAnchor = anchor.trim();
-        String trimAnchor = normalizedAnchor;
         normalizedAnchor = Normalizer.normalize(normalizedAnchor, Normalizer.Form.NFD);
-        normalizedAnchor = ANCHOR_PUNCTUATION_PATTERN.matcher(normalizedAnchor).replaceAll("");
-        normalizedAnchor = normalizedAnchor.toLowerCase();
+        normalizedAnchor = ANCHOR_IGNORABLE_PATTERN.matcher(normalizedAnchor).replaceAll("");
+        // Dropping ignorable characters can expose new leading/trailing whitespace, so trim again.
+        normalizedAnchor = normalizedAnchor.trim();
+        // Locale.ROOT keeps the anchor independent of the JVM default locale (e.g. Turkish dotless i).
+        normalizedAnchor = normalizedAnchor.toLowerCase(java.util.Locale.ROOT);
         normalizedAnchor = ANCHOR_SPACE_PATTERN.matcher(normalizedAnchor).replaceAll(spaceEscape.toString());
 
-        String validAnchor = ANCHOR_FORBIDDEN_PATTERN.matcher(normalizedAnchor).replaceAll("");
+        String validAnchor = ANCHOR_UNIGNORABLE_PATTERN.matcher(normalizedAnchor).replaceAll("");
         if (validAnchor.length() != normalizedAnchor.length())
-            normalizedAnchor = DigestUtils.md5Hex(trimAnchor);
+            normalizedAnchor = DigestUtils.md5Hex(normalizedAnchor);
         else
             normalizedAnchor = validAnchor;
 
diff --git a/src/test/java/io/github/robwin/markup/builder/AbstractMarkupDocBuilderTest.java b/src/test/java/io/github/robwin/markup/builder/AbstractMarkupDocBuilderTest.java
new file mode 100644
index 00000000..f018443c
--- /dev/null
+++ b/src/test/java/io/github/robwin/markup/builder/AbstractMarkupDocBuilderTest.java
@@ -0,0 +1,52 @@
+package io.github.robwin.markup.builder;
+
+import io.github.robwin.markup.builder.asciidoc.AsciiDoc;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.junit.Before;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import static org.junit.Assert.*;
+import static org.mockito.Mockito.mock;
+
+/**
+ * Tests for {@link AbstractMarkupDocBuilder#normalizeAnchor}, using the AsciiDoc space escape.
+ */
+public class AbstractMarkupDocBuilderTest {
+
+    AbstractMarkupDocBuilder builder;
+
+    @Before
+    public void setUp() {
+        // The builder is abstract: mock it, but let the real normalizeAnchor run.
+        builder = mock(AbstractMarkupDocBuilder.class, Mockito.CALLS_REAL_METHODS);
+    }
+
+    private String normalize(String anchor) {
+        return builder.normalizeAnchor(AsciiDoc.SPACE_ESCAPE, anchor);
+    }
+
+    private void assertNormalization(String result, String anchor) {
+        assertEquals(result, normalize(anchor));
+    }
+
+    @Test
+    public void testNormalizeAnchor() throws Exception {
+        assertNormalization("", "");
+        assertNormalization("anchor", "anchor");
+        assertNormalization("anchor", "aNcHoR");
+        assertNormalization("__anchor__", "_ anchor _");
+        assertNormalization("-_anchor_-", "- anchor -");
+        assertNormalization("classic-simple_anchor", "classic-simple_anchor");
+        assertNormalization("an_chor", " an chor ");
+        assertNormalization("anchor", "# anchor &");
+        assertNormalization(DigestUtils.md5Hex("\u0240"), "\u0240");
+        assertNormalization(normalize("\u0240"), " \u0240 ");
+        assertNormalization(DigestUtils.md5Hex("µu_\u0240this_-_"), " µ&|ù \u0240This .:/-_# ");
+        assertNormalization("this_is_a_funky_string", "Tĥïŝ ĩš â fůňķŷ Šťŕĭńġ");
+        // Every character in the ignorable set is dropped, leaving an empty anchor.
+        assertNormalization("", " @#&(){}[]!$*%+=/:.;,?\\<>| ");
+        // Non-ASCII symbols are not in the ignorable set, so the anchor falls back to an MD5 hash.
+        assertNormalization(DigestUtils.md5Hex("\u00a7\u00a1\u00a3\u00bf"), " \u00a7\u00a1\u00a3\u00bf ");
+    }
+}