这个方法的缺点是速度慢,优点是暂时不用考虑数据共享的问题。不过单线程爬虫仍然有很大的局限,
所以暂时先写一个单线程版本,下次在此基础上优化出多线程版本。
package com.lhsjohn.spider;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class JsoupTest2 {
public static void main(String[] args) throws Exception {
int index = 118;
String Sindex = "";
while (index <= 999) {
if (index <= 9) {
Sindex = "00" + index;
}
if (index > 9 && index <= 99) {
Sindex = "0" + index;
}
if (index > 99) {
Sindex = "" + index;
}
index++;
System.out.println(Sindex);
crawNews(Sindex);
}
}
public static void crawNews(String index) throws Exception {
String url = "http://news.ifeng.com/a/20180922/60080" + index + "_0.shtml";
org.jsoup.nodes.Document document = Jsoup.connect(url).timeout(50000000).get();
String selection = ".yc_main.wrap";
Elements elements = document.select(selection);
// if (!elements.isEmpty()) {
// Element element = elements.get(0);
// }
if(elements.isEmpty()) {
Elements elements2 = document.select("#artical #artical_topic");
if(!elements2.isEmpty()) {
System.out.println(elements2.get(0).text());
return;
}else {
return;
}
}
Element element = elements.get(0);
Elements elements2 = element.select(".yc_tit");
if (!elements2.isEmpty()) {
Element element2 = elements2.get(0);
Elements children = element2.children();
for (Element element3 : children) {
// System.out.println(element3.tagName());
if (element3.tagName().equals("h1")) {
System.out.println("标题:" + element3.text());
}
if (element3.tagName().equals("p")) {
Elements children2 = element3.children();
Element element4 = children2.get(0);
System.out.println("时间:" + element4.text());
Element element5 = children2.get(1);
System.out.println("来源:" + element5.text());
System.out.println("来源链接:" + element5.attr("href"));
}
}
}
}