cleanup and testing work

This commit is contained in:
eugenp
2016-10-12 08:00:02 +03:00
parent 2aa2f43c6e
commit eb7650eead
24 changed files with 259 additions and 271 deletions

View File

@@ -10,31 +10,31 @@ import com.gargoylesoftware.htmlunit.html.HtmlPage;
public class HtmlUnitWebScraping {
public static void main(final String[] args) throws Exception {
try (final WebClient webClient = new WebClient()) {
public static void main(final String[] args) throws Exception {
try (final WebClient webClient = new WebClient()) {
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setJavaScriptEnabled(false);
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setJavaScriptEnabled(false);
final HtmlPage page = webClient.getPage("http://www.baeldung.com/full_archive");
final HtmlAnchor latestPostLink = (HtmlAnchor) page.getByXPath("(//ul[@class='car-monthlisting']/li)[1]/a").get(0);
final HtmlPage page = webClient.getPage("http://www.baeldung.com/full_archive");
final HtmlAnchor latestPostLink = (HtmlAnchor) page.getByXPath("(//ul[@class='car-monthlisting']/li)[1]/a").get(0);
System.out.println("Entering: " + latestPostLink.getHrefAttribute());
System.out.println("Entering: " + latestPostLink.getHrefAttribute());
final HtmlPage postPage = latestPostLink.click();
final HtmlPage postPage = latestPostLink.click();
final HtmlHeading1 heading1 = (HtmlHeading1) postPage.getByXPath("//h1").get(0);
System.out.println("Title: " + heading1.getTextContent());
final HtmlHeading1 heading1 = (HtmlHeading1) postPage.getByXPath("//h1").get(0);
System.out.println("Title: " + heading1.getTextContent());
final List<HtmlHeading2> headings2 = (List<HtmlHeading2>) postPage.getByXPath("//h2");
final List<HtmlHeading2> headings2 = (List<HtmlHeading2>) postPage.getByXPath("//h2");
final StringBuilder sb = new StringBuilder(heading1.getTextContent());
for (final HtmlHeading2 h2 : headings2) {
sb.append("\n").append(h2.getTextContent());
}
final StringBuilder sb = new StringBuilder(heading1.getTextContent());
for (final HtmlHeading2 h2 : headings2) {
sb.append("\n").append(h2.getTextContent());
}
System.out.println(sb.toString());
}
}
System.out.println(sb.toString());
}
}
}