本篇文章的原理分析围绕 WebMagic 的四大组件(Downloader、PageProcessor、Scheduler、Pipeline)展开,不清楚的小伙伴可以先看小编的上一篇文章《WebMagic初探》。分析过程以爬虫的一次运行为主线:可以运行下方的程序,然后通过 debug 跟随小编一起了解四大组件是如何运行的。
/**
 * Sample WebMagic {@code PageProcessor}: starting from the cnblogs home page it queues the
 * post links found in the post list, and for every URL that matches the post pattern it
 * prints the text of the {@code Header1_HeaderTitle} element and counts the page.
 *
 * <p>{@link #main} starts the spider with {@code thread(5)}, so {@link #process} may be
 * invoked from several worker threads. The page counter is therefore an atomic integer;
 * a plain {@code int} incremented with {@code ++} could lose updates.
 */
public class WyNewsProcessor implements PageProcessor {
    // Site-level crawl configuration: retry times set to 3, sleep time set to 100.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    // Number of post pages processed so far. Fully qualified so that no new import is
    // required; atomic because process() is shared by the spider's worker threads.
    private static final java.util.concurrent.atomic.AtomicInteger count =
            new java.util.concurrent.atomic.AtomicInteger();

    /**
     * Handles one downloaded page: post pages are printed and counted, every other page
     * is only used to discover further post links.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        // Does the URL look like a cnblogs post page?
        // NOTE(review): the dots in this pattern are unescaped and the character class
        // contains spaces; left unchanged here so that matching behaviour stays the same.
        boolean isPostPage = page.getUrl().regex("https://www.cnblogs.com/[a-z 0-9 -]+/p/[0-9]{7}.html").match();
        if (!isPostPage) {
            // Listing page: queue every post link selected by the XPath expression.
            page.addTargetRequests(page.getHtml().xpath("//*[@id=\"post_list\"]/div/div[@class='post_item_body']/h3/a/@href").all());
        } else {
            System.out.println("抓到的内容:作者昵称" + page.getHtml().xpath("//*[@id=\"Header1_HeaderTitle\"]/text()").get());
            // Atomic increment: safe when several worker threads reach this line together.
            count.incrementAndGet();
        }
    }

    /**
     * @return the crawl configuration used by the spider for this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs the crawl with 5 threads and prints the number of post pages counted
     * once {@code run()} has returned.
     */
    public static void main(String[] args) {
        Spider.create(new WyNewsProcessor()).addUrl("https://www.cnblogs.com/").thread(5).run();
        System.out.println("结束了" + count.get());
    }
}
项目运行后,首先创建 Spider,并通过 addUrl 添加起始 URL:
/**
 * Static factory used at program start-up: builds a spider around the given processor.
 *
 * @param pageProcessor the processor the new spider will use
 * @return a new {@code Spider} constructed with {@code pageProcessor}
 */
public static Spider create(PageProcessor pageProcessor) {
    Spider spider = new Spider(pageProcessor);
    return spider;
}
//获取Site
public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite();
}
/**
 * Adds one request per given URL (accepted as varargs) and then signals that new URLs
 * are available.
 *
 * @param urls the URLs to add
 * @return this spider, so calls can be chained
 */
public Spider addUrl(String... urls) {
    for (int i = 0; i < urls.length; i++) {
        Request request = new Request(urls[i]);
        addRequest(request);
    }
    signalNewUrl();
    return this;
}
/**
 * Signals all threads waiting on {@code newUrlCondition}, while holding
 * {@code newUrlLock}.
 */
private void signalNewUrl() {
    // Acquire the lock BEFORE entering the try block: if lock() itself fails, the
    // finally clause must not call unlock() on a lock this thread does not hold.
    newUrlLock.lock();
    try {
        newUrlCondition.signalAll();
    } finally {
        newUrlLock.unlock();
    }
}
Spider 在构造时通过 getSite() 得到抓取网站的相关配置(编码、抓取间隔、重试次数等)。
页面下载成功后,会调用 onDownloadSuccess 方法:
/**
 * Handles a downloaded page. When the page's status code is among the site's accepted
 * codes, the user's {@code process()} implementation is invoked, further requests are
 * extracted, and (unless the result items are marked as skipped) every pipeline is run.
 * Otherwise the status code is logged. In both cases the method then sleeps for the
 * site's configured sleep time.
 *
 * @param request the request that produced the page (used for logging)
 * @param page    the downloaded page
 */
private void onDownloadSuccess(Request request, Page page) {
    boolean statusAccepted = site.getAcceptStatCode().contains(page.getStatusCode());
    if (!statusAccepted) {
        logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
    } else {
        // Calls the process() method overridden by the user.
        pageProcessor.process(page);
        extractAndAddRequests(page, spawnUrl);
        boolean skipPipelines = page.getResultItems().isSkip();
        if (!skipPipelines) {
            for (Pipeline pipeline : pipelines) {
                pipeline.process(page.getResultItems(), this);
            }
        }
    }
    sleep(site.getSleepTime());
}
XPath 用于对下载得到的页面进行筛选,提取有效信息:
/**
 * Selects from this node using the given XPath expression.
 *
 * @param xpath the XPath expression
 * @return the result of {@code selectElements} for the selector built from {@code xpath}
 */
@Override
public Selectable xpath(String xpath) {
    return selectElements(Selectors.xpath(xpath));
}
/**
 * Applies the selector to every element of this node and collects the results.
 *
 * <p>If the selector reports {@code hasAttribute()}, the results are collected as strings
 * and wrapped in a {@code PlainText}; otherwise the selected elements are wrapped in an
 * {@code HtmlNode}.
 *
 * @param elementSelector the selector to apply to each element
 * @return a {@code PlainText} or {@code HtmlNode} holding the collected results
 */
protected Selectable selectElements(BaseElementSelector elementSelector) {
    ListIterator<Element> cursor = getElements().listIterator();

    if (elementSelector.hasAttribute()) {
        // Attribute selection: treat the results as plain text.
        List<String> texts = new ArrayList<String>();
        while (cursor.hasNext()) {
            texts.addAll(elementSelector.selectList(checkElementAndConvert(cursor)));
        }
        return new PlainText(texts);
    }

    // Element selection: keep the matched elements as HTML nodes.
    List<Element> matched = new ArrayList<Element>();
    while (cursor.hasNext()) {
        matched.addAll(elementSelector.selectElements(checkElementAndConvert(cursor)));
    }
    return new HtmlNode(matched);
}
/**
 * Creates a plain-text result holding the matched strings.
 * The given list is stored as-is in {@code sourceTexts} (no copy is made).
 *
 * @param sourceTexts the matched strings
 */
public PlainText(List<String> sourceTexts) {
this.sourceTexts = sourceTexts;
}
线程启动时,这里截取了 run() 中调用的两个方法来了解:checkRunningStat 方法涉及并发操作,它用到了一个状态值 stat,
stat 的类型是 AtomicInteger,该类型提供原子操作,具体可以参考其他文章,这里不做深入探讨。
initComponent 方法负责进行初始化操作。
/**
 * Starts the spider (excerpt: the remainder of the body is elided in this listing).
 */
public void run() {
// Switch the state to running; throws IllegalStateException if it is already running.
checkRunningStat();
// Fill in default components, prepare the thread pool and queue the start requests.
initComponent();
...省略
}
/**
 * Moves {@code stat} to {@code STAT_RUNNING} with a compare-and-set retry loop.
 *
 * @throws IllegalStateException if {@code stat} is already {@code STAT_RUNNING}
 */
private void checkRunningStat() {
    int observed;
    do {
        observed = stat.get();
        if (observed == STAT_RUNNING) {
            throw new IllegalStateException("Spider is already running!");
        }
        // Retry when another thread changed stat between get() and compareAndSet().
    } while (!stat.compareAndSet(observed, STAT_RUNNING));
}
/**
 * Initializes the spider before it runs: installs a default downloader and a default
 * pipeline when none are configured, passes the thread number to the downloader,
 * (re)creates the thread pool when it is missing or shut down, moves the start requests
 * into the request queue, and records the start time.
 */
protected void initComponent() {
    // Default downloader.
    if (downloader == null) {
        this.downloader = new HttpClientDownloader();
    }
    // Default pipeline.
    if (pipelines.isEmpty()) {
        pipelines.add(new ConsolePipeline());
    }
    downloader.setThread(threadNum);

    boolean poolUnavailable = threadPool == null || threadPool.isShutdown();
    if (poolUnavailable) {
        // Use the supplied executor service only when it is present and not shut down.
        boolean executorUsable = executorService != null && !executorService.isShutdown();
        threadPool = executorUsable
                ? new CountableThreadPool(threadNum, executorService)
                : new CountableThreadPool(threadNum);
    }

    if (startRequests != null) {
        for (Request startRequest : startRequests) {
            addRequest(startRequest);
        }
        startRequests.clear();
    }
    startTime = new Date();
}
至此,程序运行完毕。如果对代码的解读有误,请多多指正,多多交流。