需求:
使用 HttpClient 爬取:白居易的《琵琶行》
http://www.shicimingju.com/chaxun/list/4059.html
要求:输入上述url 返回白居易的琵琶行
效果如下:
一、该项目使用 Spring Boot 的多组件方式,即需要一个前端页面和一个为其提供数据的后端 API 接口。项目目录结构如下:
二、思路:获取前端传入的 url 地址,通过 HttpClients 获取整个页面内容,再通过 Jsoup 进行解析,取出相关标签下面的内容。
三、maven需要的依赖及版本
<parent>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-parent</artifactId>
    <version>2.1.3.RELEASE</version>
    <relativePath/>
</parent>

<groupId>com.alibaba</groupId>
<artifactId>httpclient</artifactId>
<version>1.0.0</version>

<properties>
    <java.version>1.8</java.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.8</version>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter</artifactId>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.7.3</version>
    </dependency>
    <!--lombok-->
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
    </dependency>
</dependencies>
controller:
两个工具类
public class HttpClientUtil {
public static StringgetHtml(String url) {
//1.生成httpclient,相当于该打开一个浏览器
CloseableHttpClient httpClient = HttpClients.createDefault();
CloseableHttpResponse response =null;
//2.创建get请求,相当于在浏览器地址栏输入 网址
// HttpGet request = new HttpGet("http://www.shicimingju.com/chaxun/list/4059.html");
HttpGet request =new HttpGet(url);
try {
//3.执行get请求,相当于在输入地址栏后敲回车键
response = httpClient.execute(request);
//4.判断响应状态为200,进行处理
if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
//5.获取响应内容
HttpEntity httpEntity = response.getEntity();
String html = EntityUtils.toString(httpEntity, "utf-8");
// Jsoup 解析网页数据
Document document = Jsoup.parse(html);
// 获取目标内容
Elements item_content = document.getElementsByClass("item_content");
// String text = item_content.text();
return item_content.toString();
}else {
//如果返回状态不是200,比如404(页面不存在)等,根据情况做处理,这里略
System.out.println("返回状态不是200");
System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
return "不是200";
}
}catch (ClientProtocolException e) {
e.printStackTrace();
}catch (IOException e) {
e.printStackTrace();
}finally {
//6.关闭
HttpClientUtils.closeQuietly(response);
HttpClientUtils.closeQuietly(httpClient);
}
return "请输入正确url地址";
}
}
@Setter
@Getter
@NoArgsConstructor
public class JsonResult {
public static final int CODE_SUCCESS =200;
public static final StringMSG_SUCCESS ="操作成功";
public static final int CODE_NOLOGIN =401;
public static final StringMSG_NOLOGIN ="请先登录";
public static final int CODE_ERROR =500;
public static final StringMSG_ERROR ="系统异常,请联系管理员";
public static final int CODE_ERROR_PARAM =501; // 参数异常
private int code; // 用来区分不同的结果, 不是true或false
private Stringmsg; // 处理操作, 还要携带的数据
private T data;
public JsonResult(int code, String msg, T data){
this.code = code;
this.msg = msg;
this.data = data;
}
public static JsonResultsuccess(T data){
return new JsonResult(CODE_SUCCESS, MSG_SUCCESS, data);
}
public static JsonResultsuccess(){
return new JsonResult(CODE_SUCCESS, MSG_SUCCESS, null);
}
public static JsonResulterror(int code, String msg, T data){
return new JsonResult(code, msg, data);
}
public static JsonResultdefaultError(){
return new JsonResult(CODE_ERROR, MSG_ERROR, null);
}
public static JsonResultnoLogin() {
return new JsonResult(CODE_NOLOGIN, MSG_NOLOGIN, null);
}
}
主方法:
@SpringBootApplication
public class WebSiteAppimplements WebMvcConfigurer {
//跨域访问
@Bean
public WebMvcConfigurercorsConfigurer() {
return new WebMvcConfigurer() {
@Override
//重写父类提供的跨域请求处理的接口
public void addCorsMappings(CorsRegistry registry) {
//添加映射路径
registry.addMapping("/**")
//放行哪些原始域
.allowedOrigins("*")
//是否发送Cookie信息
.allowCredentials(true)
//放行哪些原始域(请求方式)
.allowedMethods("GET", "POST", "PUT", "DELETE","OPTIONS")
//放行哪些原始域(头部信息)
.allowedHeaders("*")
//暴露哪些头部信息(因为跨域访问默认不能获取全部头部信息)
.exposedHeaders("Header1", "Header2");
}
};
}
public static void main(String[] args) {
SpringApplication.run(WebSiteApp.class, args);
}
}
配置端口: application.properties
server.port=8081
前端用到jquery的插件:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
    <!-- jQuery is loaded by its own tag: a script element that has a src
         attribute does not run inline code placed inside it. -->
    <script src="../jquery/jquery.js"></script>
    <script>
        $(function () {
            // Fetch the poem whenever the URL input loses focus.
            $('#url').blur(function () {
                // Read the URL typed by the user.
                var url = $("#url").val();
                console.log(url);
                // Ask the back end to fetch and parse that page.
                $.get('http://localhost:8081/htmlclient/list', {url: url}, function (data) {
                    console.log(data.data);
                    if (data.code == 200) {
                        // Append the returned markup to the result container.
                        $("#item_content").append(data.data);
                    }
                });
            });
        });
    </script>
</head>
<body>
<form id="myForm">
    <table border="1" cellspacing="0">
        <tr>
            <td>url:<input type="text" id="url"></td>
        </tr>
    </table>
</form>
<div class="item_content" id="item_content"></div>
</body>
</html>