package com.tanzhou.spiders;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawler test program: fetches one page of Baidu search results with jsoup and
 * writes the page head plus the collected result elements to a local HTML file.
 *
 * @author Administrator
 */
public class Main {

    /** File the crawled markup is written to; also the value returned by {@link #processPage}. */
    private static final String OUTPUT_PATH = "D:/workspace/Spiders/WebContent/jsp/css.html";

    /** How many consecutive element ids are looked up per call. */
    private static final int RESULTS_PER_PAGE = 10;

    /**
     * Crawls one page of Baidu results for {@code word} and writes it to {@link #OUTPUT_PATH}.
     *
     * <p>The document's {@code <head>} is written first, followed by every entry of the list
     * obtained from {@code Ha.getList()} after this call has appended the elements whose ids are
     * {@code x + 1 .. x + 10} (ids {@code 1 .. x + 10} when {@code x <= 0}).
     *
     * @param word search term; sent as the {@code wd} query parameter (URL-encoded by jsoup)
     * @param x    result offset; sent as the {@code pn} query parameter
     * @return the path of the file that was written
     * @throws IOException              if the page cannot be fetched or the file cannot be written
     * @throws IllegalArgumentException if {@code word} is {@code null}
     */
    public static String processPage(String word, int x) throws IOException {
        if (word == null) {
            throw new IllegalArgumentException("word must not be null");
        }

        // Ha is declared elsewhere, so its element type is unknown here and the raw type is kept.
        // NOTE(review): the list may be shared between calls, in which case entries from earlier
        // calls are written out again below - confirm against Ha.
        List list = Ha.getList();

        // Passing the parameters through data(...) lets jsoup URL-encode them; concatenating the
        // raw word into the URL broke on spaces, '&', '#' and non-ASCII search terms.
        Document doc = Jsoup.connect("https://www.baidu.com/s")
                .data("wd", word)
                .data("pn", String.valueOf(x))
                .get();
        Elements head = doc.select("head");

        int firstId = x > 0 ? x + 1 : 1;
        int endId = x + RESULTS_PER_PAGE + 1; // exclusive
        for (int id = firstId; id < endId; id++) {
            Element result = doc.getElementById(String.valueOf(id));
            // getElementById returns null when the page has no element with that id
            // (short result page, changed layout); skip it instead of failing with an NPE.
            if (result != null) {
                list.add(result.toString());
            }
        }

        // FileOutputStream creates the file when it is missing, so no explicit createNewFile()
        // is needed. try-with-resources flushes and closes the writer even if a write fails.
        try (Writer writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(OUTPUT_PATH), "UTF-8"))) {
            writer.write(head.toString());
            for (Object fragment : list) {
                writer.append((CharSequence) fragment);
            }
        }
        return OUTPUT_PATH;
    }

    /**
     * Manual smoke test: crawls the results for "haha" starting at offset 10.
     *
     * @param args unused
     * @throws IOException if fetching or writing fails
     */
    public static void main(String[] args) throws IOException {
        processPage("haha", 10);
    }
}
package com.tanzhou.spiders;
import java.io.IOException;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
/**
 * Servlet mapped to {@code /SpiderServlet}. A POST runs {@code Main.processPage} for the
 * submitted search word and forwards the request to the page that call produced.
 */
@WebServlet(urlPatterns= "/SpiderServlet")
public class SpiderServlet extends HttpServlet {

    private static final long serialVersionUID = 1L;

    /** Offset step between consecutive {@code Main.processPage} calls. */
    private static final int RESULTS_PER_PAGE = 10;

    /** Upper bound on the requested result count, so one request cannot trigger unbounded crawling. */
    private static final int MAX_RESULTS = 100;

    /**
     * Crawls the requested number of results and forwards to the generated page.
     *
     * <p>Request parameters: {@code word} (required search term) and {@code num} (optional result
     * count; falls back to {@value #RESULTS_PER_PAGE} when missing or not a number, and is clamped
     * to {@value #RESULTS_PER_PAGE}..{@value #MAX_RESULTS}). One crawl is run per full block of
     * {@value #RESULTS_PER_PAGE} results.
     *
     * @param request  the incoming request
     * @param response the response; receives a 400 error when {@code word} is missing or blank
     * @throws ServletException if the crawler returns a path without a "jsp" segment
     * @throws IOException      if crawling or forwarding fails
     */
    @Override
    protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        request.setCharacterEncoding("utf-8");

        String word = request.getParameter("word");
        if (word == null || word.trim().isEmpty()) {
            // Previously a missing word was crawled as the literal text "null".
            response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Missing required parameter: word");
            return;
        }

        // The parsed value used to be overwritten with a hard-coded 10, which made the
        // "num" parameter dead; it is now honoured within safe bounds.
        int resultCount = parseResultCount(request.getParameter("num"));
        int pages = resultCount / RESULTS_PER_PAGE; // always >= 1 because of the lower clamp

        String generatedPath = null;
        for (int page = 0; page < pages; page++) {
            generatedPath = Main.processPage(word, page * RESULTS_PER_PAGE);
        }

        // Only the part of the file path starting at "jsp" is used as the forward target.
        int jspIndex = generatedPath == null ? -1 : generatedPath.indexOf("jsp");
        if (jspIndex < 0) {
            throw new ServletException("Unexpected crawler output path: " + generatedPath);
        }
        String target = "/" + generatedPath.substring(jspIndex);
        log("Forwarding to " + target);
        request.getRequestDispatcher(target).forward(request, response);
    }

    /**
     * Does nothing: a GET request triggers no crawling and writes no response body.
     */
    @Override
    protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
    }

    /** Parses the requested result count, defaulting on bad input and clamping to the allowed range. */
    private static int parseResultCount(String num) {
        if (num == null) {
            return RESULTS_PER_PAGE;
        }
        int requested;
        try {
            requested = Integer.parseInt(num.trim());
        } catch (NumberFormatException ignored) {
            // Not a number: fall back to a single page rather than failing the request.
            return RESULTS_PER_PAGE;
        }
        return Math.max(RESULTS_PER_PAGE, Math.min(requested, MAX_RESULTS));
    }
}