Merge pull request #12 from Kabhal/normalization

Normalization + hardbreaks fixes
This commit is contained in:
Robert Winkler
2016-02-16 12:43:30 +01:00
4 changed files with 55 additions and 7 deletions

View File

@@ -48,6 +48,8 @@ dependencies {
compile 'com.google.guava:guava'
compile "commons-codec:commons-codec"
testCompile 'junit:junit'
testCompile "org.mockito:mockito-core"
testCompile 'ch.qos.logback:logback-classic'
}
@@ -55,6 +57,7 @@ dependencyManagement {
dependencies {
dependency "org.slf4j:slf4j-api:1.7.12"
dependency "junit:junit:4.11"
dependency "org.mockito:mockito-core:1.9.5"
dependency "ch.qos.logback:logback-classic:1.1.2"
dependency "commons-collections:commons-collections:3.2.1"
dependency "org.apache.commons:commons-lang3:3.2.1"

View File

@@ -36,8 +36,8 @@ import java.util.regex.Pattern;
*/
public abstract class AbstractMarkupDocBuilder implements MarkupDocBuilder {
private static final Pattern ANCHOR_FORBIDDEN_PATTERN = Pattern.compile("[^0-9a-zA-Z-_]+");
private static final Pattern ANCHOR_PUNCTUATION_PATTERN = Pattern.compile("\\p{Punct}+");
private static final Pattern ANCHOR_UNIGNORABLE_PATTERN = Pattern.compile("[^0-9a-zA-Z-_]+");
private static final Pattern ANCHOR_IGNORABLE_PATTERN = Pattern.compile("[\\p{InCombiningDiacriticalMarks}@#&(){}\\[\\]!$*%+=/:.;,?\\\\<>|]+");
private static final Pattern ANCHOR_SPACE_PATTERN = Pattern.compile("[\\s]+");
protected StringBuilder documentBuilder = new StringBuilder();
@@ -159,15 +159,15 @@ public abstract class AbstractMarkupDocBuilder implements MarkupDocBuilder {
*/
protected String normalizeAnchor(Markup spaceEscape, String anchor) {
String normalizedAnchor = anchor.trim();
String trimAnchor = normalizedAnchor;
normalizedAnchor = Normalizer.normalize(normalizedAnchor, Normalizer.Form.NFD);
normalizedAnchor = ANCHOR_PUNCTUATION_PATTERN.matcher(normalizedAnchor).replaceAll("");
normalizedAnchor = ANCHOR_IGNORABLE_PATTERN.matcher(normalizedAnchor).replaceAll("");
normalizedAnchor = normalizedAnchor.trim();
normalizedAnchor = normalizedAnchor.toLowerCase();
normalizedAnchor = ANCHOR_SPACE_PATTERN.matcher(normalizedAnchor).replaceAll(spaceEscape.toString());
String validAnchor = ANCHOR_FORBIDDEN_PATTERN.matcher(normalizedAnchor).replaceAll("");
String validAnchor = ANCHOR_UNIGNORABLE_PATTERN.matcher(normalizedAnchor).replaceAll("");
if (validAnchor.length() != normalizedAnchor.length())
normalizedAnchor = DigestUtils.md5Hex(trimAnchor);
normalizedAnchor = DigestUtils.md5Hex(normalizedAnchor);
else
normalizedAnchor = validAnchor;

View File

@@ -30,7 +30,7 @@ public enum AsciiDoc implements Markup {
TABLE_COLUMN_DELIMITER("|"),
TABLE_COLUMN_DELIMITER_ESCAPE("\\|"), // AsciiDoctor supports both \| and {vbar}
LISTING("----"),
HARDBREAKS(":hardbreaks:"),
HARDBREAKS("[%hardbreaks]"),
DOCUMENT_TITLE("= "),
SECTION_TITLE_LEVEL1("== "),
SECTION_TITLE_LEVEL2("=== "),

View File

@@ -0,0 +1,45 @@
package io.github.robwin.markup.builder;
import io.github.robwin.markup.builder.asciidoc.AsciiDoc;
import org.apache.commons.codec.digest.DigestUtils;
import org.junit.Before;
import org.junit.Test;
import org.mockito.Mockito;
import static org.junit.Assert.*;
import static org.mockito.Mockito.mock;
/**
 * Unit tests for {@link AbstractMarkupDocBuilder#normalizeAnchor}.
 *
 * <p>The builder under test is abstract, so it is created as a Mockito mock
 * configured with {@link Mockito#CALLS_REAL_METHODS}; every anchor is
 * normalized with {@link AsciiDoc#SPACE_ESCAPE} as the space replacement.
 */
public class AbstractMarkupDocBuilderTest {

    AbstractMarkupDocBuilder builder;

    /** Creates a fresh builder mock that delegates to the real method bodies. */
    @Before
    public void setUp() {
        builder = mock(AbstractMarkupDocBuilder.class, Mockito.CALLS_REAL_METHODS);
    }

    /**
     * Runs the anchor normalization of the builder under test.
     *
     * @param rawAnchor anchor text to normalize
     * @return the normalized anchor
     */
    private String anchorFor(String rawAnchor) {
        return builder.normalizeAnchor(AsciiDoc.SPACE_ESCAPE, rawAnchor);
    }

    /**
     * Asserts that normalizing {@code rawAnchor} yields {@code expected}.
     *
     * @param expected  the normalized anchor that is expected
     * @param rawAnchor anchor text handed to the builder
     */
    private void expectAnchor(String expected, String rawAnchor) {
        String actual = anchorFor(rawAnchor);
        assertEquals(expected, actual);
    }

    /**
     * Checks a series of raw anchors against their expected normalized form.
     */
    @Test
    public void testNormalizeAnchor() throws Exception {
        // Empty, already-valid and mixed-case input.
        expectAnchor("", "");
        expectAnchor("anchor", "anchor");
        expectAnchor("anchor", "aNcHoR");

        // Inner whitespace is expected as '_', while '-' and '_' are kept.
        expectAnchor("__anchor__", "_ anchor _");
        expectAnchor("-_anchor_-", "- anchor -");
        expectAnchor("classic-simple_anchor", "classic-simple_anchor");
        expectAnchor("an_chor", " an chor ");

        // '#' and '&' are expected to vanish together with the surrounding blanks.
        expectAnchor("anchor", "# anchor &");

        // Input containing U+0240 is expected to come back as an MD5 hex digest.
        expectAnchor(DigestUtils.md5Hex("\u0240"), "\u0240");
        String bareDigest = anchorFor("\u0240");
        expectAnchor(bareDigest, " \u0240 ");
        expectAnchor(DigestUtils.md5Hex("µu_\u0240this_-_"), " µ&|ù \u0240This .:/-_# ");

        // Accented letters are expected to reduce to their plain lowercase form.
        expectAnchor("this_is_a_funky_string", "Tĥïŝ ĩš â fůňķŷ Šťŕĭńġ");

        // Input made only of the listed punctuation is expected to normalize to "".
        expectAnchor("", " @#&(){}[]!$*%+=/:.;,?\\<>| ");
    }
}