Merge branch 'master' into master

Loredana Crusoveanu authored on 2019-06-11 21:47:42 +03:00; committed by GitHub.
243 changed files with 5206 additions and 2213 deletions.

com/baeldung/crawler4j/CrawlerStatistics.java

@@ -0,0 +1,22 @@
package com.baeldung.crawler4j;

public class CrawlerStatistics {

    private int processedPageCount = 0;
    private int totalLinksCount = 0;

    public void incrementProcessedPageCount() {
        processedPageCount++;
    }

    public void incrementTotalLinksCount(int linksCount) {
        totalLinksCount += linksCount;
    }

    public int getProcessedPageCount() {
        return processedPageCount;
    }

    public int getTotalLinksCount() {
        return totalLinksCount;
    }
}
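
Note: the controllers below hand a single CrawlerStatistics instance to every crawler thread, and the plain int counters above are not thread-safe, so concurrent increments can be lost. A minimal thread-safe sketch, assuming AtomicInteger counters are acceptable; the class name is hypothetical and not part of this commit:

package com.baeldung.crawler4j;

import java.util.concurrent.atomic.AtomicInteger;

// Hypothetical thread-safe variant of CrawlerStatistics; not part of this commit.
public class AtomicCrawlerStatistics {

    private final AtomicInteger processedPageCount = new AtomicInteger();
    private final AtomicInteger totalLinksCount = new AtomicInteger();

    public void incrementProcessedPageCount() {
        processedPageCount.incrementAndGet();
    }

    public void incrementTotalLinksCount(int linksCount) {
        totalLinksCount.addAndGet(linksCount);
    }

    public int getProcessedPageCount() {
        return processedPageCount.get();
    }

    public int getTotalLinksCount() {
        return totalLinksCount.get();
    }
}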

com/baeldung/crawler4j/HtmlCrawler.java

@@ -0,0 +1,48 @@
package com.baeldung.crawler4j;

import java.util.Set;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class HtmlCrawler extends WebCrawler {

    // Skip URLs that point to static resources rather than HTML pages.
    private static final Pattern EXCLUSIONS = Pattern.compile(".*(\\.(css|js|xml|gif|jpg|png|mp3|mp4|zip|gz|pdf))$");

    private final CrawlerStatistics stats;

    public HtmlCrawler(CrawlerStatistics stats) {
        this.stats = stats;
    }

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String urlString = url.getURL().toLowerCase();
        // Stay within the Baeldung site and avoid the excluded extensions.
        return !EXCLUSIONS.matcher(urlString).matches()
          && urlString.startsWith("https://www.baeldung.com/");
    }

    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        stats.incrementProcessedPageCount();

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String title = htmlParseData.getTitle();
            String text = htmlParseData.getText();
            String html = htmlParseData.getHtml();
            Set<WebURL> links = htmlParseData.getOutgoingUrls();
            stats.incrementTotalLinksCount(links.size());

            System.out.printf("Page with title '%s' %n", title);
            System.out.printf("   Text length: %d %n", text.length());
            System.out.printf("   HTML length: %d %n", html.length());
            System.out.printf("   %d outbound links %n", links.size());
        }
    }
}

com/baeldung/crawler4j/HtmlCrawlerController.java

@@ -0,0 +1,36 @@
package com.baeldung.crawler4j;

import java.io.File;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class HtmlCrawlerController {

    public static void main(String[] args) throws Exception {
        // Folder where crawler4j keeps its intermediate crawl data.
        File crawlStorage = new File("src/test/resources/crawler4j");
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorage.getAbsolutePath());
        config.setMaxDepthOfCrawling(2);

        int numCrawlers = 12;

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        controller.addSeed("https://www.baeldung.com/");

        CrawlerStatistics stats = new CrawlerStatistics();
        CrawlController.WebCrawlerFactory<HtmlCrawler> factory = () -> new HtmlCrawler(stats);

        // start() blocks until the crawl is finished.
        controller.start(factory, numCrawlers);

        System.out.printf("Crawled %d pages %n", stats.getProcessedPageCount());
        System.out.printf("Total number of outbound links = %d %n", stats.getTotalLinksCount());
    }
}

com/baeldung/crawler4j/ImageCrawler.java

@@ -0,0 +1,49 @@
package com.baeldung.crawler4j;

import java.io.File;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.BinaryParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class ImageCrawler extends WebCrawler {

    // Unlike HtmlCrawler's exclusions, jpg is deliberately absent so JPEG URLs pass through.
    private static final Pattern EXCLUSIONS = Pattern.compile(".*(\\.(css|js|xml|gif|png|mp3|mp4|zip|gz|pdf))$");
    private static final Pattern IMG_PATTERNS = Pattern.compile(".*(\\.(jpg|jpeg))$");

    // Target directory for downloaded images; stored but not yet used in visit().
    private final File saveDir;

    public ImageCrawler(File saveDir) {
        this.saveDir = saveDir;
    }

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String urlString = url.getURL().toLowerCase();
        if (EXCLUSIONS.matcher(urlString).matches()) {
            return false;
        }
        // Visit JPEG files anywhere, plus any page within the Baeldung domain.
        return IMG_PATTERNS.matcher(urlString).matches()
          || urlString.startsWith("https://www.baeldung.com/");
    }

    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        if (IMG_PATTERNS.matcher(url).matches()
            && page.getParseData() instanceof BinaryParseData) {
            String extension = url.substring(url.lastIndexOf("."));
            int contentLength = page.getContentData().length;
            System.out.printf("Extension is '%s' with content length %d %n", extension, contentLength);
        }
    }
}
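
The saveDir field above is never read: visit() only prints the extension and content length. A sketch of a method that could be added to ImageCrawler to persist the bytes, assuming saveDir exists; the helper name and the hash-based file naming are illustrative, not part of this commit:

    // Hypothetical helper for ImageCrawler; not part of this commit.
    private void saveImage(Page page, String extension) {
        // Derive a file name from the URL hash; the naming scheme is an assumption.
        String fileName = page.getWebURL().getURL().hashCode() + extension;
        try {
            java.nio.file.Files.write(new File(saveDir, fileName).toPath(), page.getContentData());
        } catch (java.io.IOException e) {
            e.printStackTrace();
        }
    }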

com/baeldung/crawler4j/ImageCrawlerController.java

@@ -0,0 +1,36 @@
package com.baeldung.crawler4j;

import java.io.File;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class ImageCrawlerController {

    public static void main(String[] args) throws Exception {
        File crawlStorage = new File("src/test/resources/crawler4j");
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorage.getAbsolutePath());
        // Binary content must be enabled, or image responses never reach visit().
        config.setIncludeBinaryContentInCrawling(true);
        config.setMaxPagesToFetch(500);

        File saveDir = new File("src/test/resources/crawler4j");
        int numCrawlers = 12;

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        controller.addSeed("https://www.baeldung.com/");

        CrawlController.WebCrawlerFactory<ImageCrawler> factory = () -> new ImageCrawler(saveDir);
        controller.start(factory, numCrawlers);
    }
}

com/baeldung/crawler4j/MultipleCrawlerController.java

@@ -0,0 +1,54 @@
package com.baeldung.crawler4j;

import java.io.File;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class MultipleCrawlerController {

    public static void main(String[] args) throws Exception {
        File crawlStorageBase = new File("src/test/resources/crawler4j");
        CrawlConfig htmlConfig = new CrawlConfig();
        CrawlConfig imageConfig = new CrawlConfig();

        // Each controller needs its own storage folder and PageFetcher.
        htmlConfig.setCrawlStorageFolder(new File(crawlStorageBase, "html").getAbsolutePath());
        imageConfig.setCrawlStorageFolder(new File(crawlStorageBase, "image").getAbsolutePath());
        imageConfig.setIncludeBinaryContentInCrawling(true);

        htmlConfig.setMaxPagesToFetch(500);
        imageConfig.setMaxPagesToFetch(1000);

        PageFetcher pageFetcherHtml = new PageFetcher(htmlConfig);
        PageFetcher pageFetcherImage = new PageFetcher(imageConfig);

        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcherHtml);

        CrawlController htmlController = new CrawlController(htmlConfig, pageFetcherHtml, robotstxtServer);
        CrawlController imageController = new CrawlController(imageConfig, pageFetcherImage, robotstxtServer);

        htmlController.addSeed("https://www.baeldung.com/");
        imageController.addSeed("https://www.baeldung.com/");

        CrawlerStatistics stats = new CrawlerStatistics();
        CrawlController.WebCrawlerFactory<HtmlCrawler> htmlFactory = () -> new HtmlCrawler(stats);

        File saveDir = new File("src/test/resources/crawler4j");
        CrawlController.WebCrawlerFactory<ImageCrawler> imageFactory = () -> new ImageCrawler(saveDir);

        // startNonBlocking returns immediately, so the two crawls run concurrently.
        imageController.startNonBlocking(imageFactory, 7);
        htmlController.startNonBlocking(htmlFactory, 10);

        htmlController.waitUntilFinish();
        System.out.printf("Crawled %d pages %n", stats.getProcessedPageCount());
        System.out.printf("Total number of outbound links = %d %n", stats.getTotalLinksCount());

        imageController.waitUntilFinish();
        System.out.println("Image Crawler is finished.");
    }
}

com/baeldung/okhttp/ResponseDecoderUnitTest.java

@@ -0,0 +1,102 @@
package com.baeldung.okhttp;

import java.io.InputStreamReader;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;

import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.ResponseBody;
import okhttp3.mockwebserver.MockResponse;
import okhttp3.mockwebserver.MockWebServer;

import org.junit.Assert;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;

public class ResponseDecoderUnitTest {

    @Rule public ExpectedException exceptionRule = ExpectedException.none();

    @Rule public MockWebServer server = new MockWebServer();

    SimpleEntity sampleResponse;

    MockResponse mockResponse;

    OkHttpClient client;

    @Before
    public void setUp() {
        sampleResponse = new SimpleEntity("Baeldung");
        client = new OkHttpClient.Builder().build();
        mockResponse = new MockResponse()
          .setResponseCode(200)
          .setHeader("Content-Type", "application/json")
          .setBody(new Gson().toJson(sampleResponse));
    }

    @Test
    public void givenJacksonDecoder_whenGetStringOfResponse_thenExpectSimpleEntity() throws Exception {
        server.enqueue(mockResponse);

        Request request = new Request.Builder()
          .url(server.url(""))
          .build();

        ResponseBody responseBody = client
          .newCall(request)
          .execute()
          .body();

        Assert.assertNotNull(responseBody);
        Assert.assertNotEquals(0, responseBody.contentLength());

        ObjectMapper objectMapper = new ObjectMapper();
        SimpleEntity entity = objectMapper.readValue(responseBody.string(), SimpleEntity.class);

        Assert.assertNotNull(entity);
        Assert.assertEquals(sampleResponse.getName(), entity.getName());
    }

    @Test
    public void givenGsonDecoder_whenGetByteStreamOfResponse_thenExpectSimpleEntity() throws Exception {
        server.enqueue(mockResponse);

        Request request = new Request.Builder()
          .url(server.url(""))
          .build();

        ResponseBody responseBody = client
          .newCall(request)
          .execute()
          .body();

        Assert.assertNotNull(responseBody);
        Assert.assertNotEquals(0, responseBody.contentLength());

        Gson gson = new Gson();
        SimpleEntity entity = gson.fromJson(new InputStreamReader(responseBody.byteStream()), SimpleEntity.class);

        Assert.assertNotNull(entity);
        Assert.assertEquals(sampleResponse.getName(), entity.getName());
    }

    @Test
    public void givenGsonDecoder_whenGetStringOfResponse_thenExpectSimpleEntity() throws Exception {
        server.enqueue(mockResponse);

        Request request = new Request.Builder()
          .url(server.url(""))
          .build();

        ResponseBody responseBody = client
          .newCall(request)
          .execute()
          .body();

        Assert.assertNotNull(responseBody);

        Gson gson = new Gson();
        SimpleEntity entity = gson.fromJson(responseBody.string(), SimpleEntity.class);

        Assert.assertNotNull(entity);
        Assert.assertEquals(sampleResponse.getName(), entity.getName());
    }
}
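
The three tests above cover Jackson with a string body and Gson with both a byte stream and a string. For the remaining combination, Jackson can also consume the byte stream directly via ObjectMapper.readValue(InputStream, Class). A sketch of such a test that could be added to the class above; illustrative, not part of this commit:

    @Test
    public void givenJacksonDecoder_whenGetByteStreamOfResponse_thenExpectSimpleEntity() throws Exception {
        server.enqueue(mockResponse);

        Request request = new Request.Builder()
          .url(server.url(""))
          .build();

        ResponseBody responseBody = client
          .newCall(request)
          .execute()
          .body();

        Assert.assertNotNull(responseBody);

        // readValue(InputStream, Class) parses the stream without buffering it into a String first.
        ObjectMapper objectMapper = new ObjectMapper();
        SimpleEntity entity = objectMapper.readValue(responseBody.byteStream(), SimpleEntity.class);

        Assert.assertNotNull(entity);
        Assert.assertEquals(sampleResponse.getName(), entity.getName());
    }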

com/baeldung/okhttp/SimpleEntity.java

@@ -0,0 +1,22 @@
package com.baeldung.okhttp;

public class SimpleEntity {

    protected String name;

    public SimpleEntity(String name) {
        this.name = name;
    }

    // The no-arg constructor is needed for Jackson's default deserialization.
    public SimpleEntity() {
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }
}