HttpClient分享
HttpClient 最初是 Apache Jakarta Commons 下的子项目(现已归入 Apache HttpComponents 项目),它是一个高效、功能丰富、持续更新的 HTTP 协议客户端编程工具包,并且支持 HTTP 协议最新的版本和建议。
1. 初识HttpClient 使用HttpClient爬取某网站
前置知识
- Java基础
- 网络知识基础
代码如下图:
/**
 * Sends a plain GET request to https://www.tuicool.com/ with a default
 * HttpClient and logs the response body decoded with DEFAULT_CHARASET.
 *
 * @throws Exception if executing the request or reading the entity fails
 */
@Test
public void helloHttpClient() throws Exception {
    String uri = "https://www.tuicool.com/";
    HttpGet httpGet = new HttpGet(uri);
    // try-with-resources closes the response first and then the client,
    // even when execute() or the entity read throws.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        HttpEntity entity = response.getEntity();
        String entityStr = EntityUtils.toString(entity, DEFAULT_CHARASET);
        logger.info(entityStr);
    }
}
爬取网页被拦截
为什么呢?浏览器能正常访问
在浏览器的请求头中可以看到 User-Agent 信息,于是我们在请求中模仿浏览器:
/**
 * Requests https://www.tuicool.com/ while presenting a browser User-Agent,
 * then logs the status code, the Content-Type value and the response body.
 *
 * @throws Exception if executing the request or reading the entity fails
 */
@Test
public void analogBrowser() throws Exception {
    String uri = "https://www.tuicool.com/";
    HttpGet httpGet = new HttpGet(uri);
    // Request headers are plain key/value pairs; a browser User-Agent is set here,
    // and it could equally be the identifier of an Android or iOS client.
    httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36");
    // try-with-resources closes the response first and then the client,
    // even when execute() or the entity read throws.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        logger.info("返回响应状态编码为[ {} ]", response.getStatusLine().getStatusCode());
        HttpEntity entity = response.getEntity();
        // Guard against a response without an entity before dereferencing it.
        if (null != entity) {
            logger.info("返回内容编码类型为[ {} ]", entity.getContentType().getValue());
            String entityStr = EntityUtils.toString(entity, DEFAULT_CHARASET);
            logger.info(entityStr);
        }
    }
}
模仿浏览器后允许访问
接下来我们搞点事情,为了以后从某站下载某些不可描述的资料做准备,下载一张图片
/**
 * Downloads the image at https://aimg2.tuicool.com/YnmA3y3.jpg!index and
 * writes it to /home/left/aaa.jpg, logging the status code and Content-Type.
 *
 * @throws Exception if the request fails or the file cannot be written
 */
@Test
public void dowloadPicture() throws Exception {
    String uri = "https://aimg2.tuicool.com/YnmA3y3.jpg!index";
    HttpGet httpGet = new HttpGet(uri);
    // Request headers are plain key/value pairs; a browser User-Agent is set here,
    // and it could equally be the identifier of an Android or iOS client.
    httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36");
    // try-with-resources closes the response first and then the client,
    // even when the request or the file copy throws.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        HttpEntity entity = response.getEntity();
        if (null != entity) {
            logger.info("返回响应状态编码为[ {} ]", response.getStatusLine().getStatusCode());
            logger.info("返回内容编码类型为[ {} ]", entity.getContentType().getValue());
            // Stream the image bytes straight to disk. The content stream is closed
            // here explicitly because the copy call is not relied upon to close it.
            try (InputStream inputStream = entity.getContent()) {
                FileUtils.copyToFile(inputStream, new File("/home/left/aaa.jpg"));
            }
        }
    }
}
控制台显示
图片下载成功
使用代理
/**
 * Requests http://www.i2finance.net/ through an HTTP proxy configured on the
 * request, then logs the proxy in use, the status code and the Content-Type.
 *
 * @throws Exception if executing the request fails
 */
@Test
public void helloProxyip() throws Exception {
    String uri = "http://www.i2finance.net/";
    HttpGet httpGet = new HttpGet(uri);
    // Request headers are plain key/value pairs; a browser User-Agent is set here,
    // and it could equally be the identifier of an Android or iOS client.
    httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36");
    HttpHost proxy = new HttpHost("103.235.199.93", 49176);
    RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
    // Attach the proxy configuration to this request.
    httpGet.setConfig(config);
    // try-with-resources closes the response first and then the client,
    // even when execute() throws (e.g. the proxy is unreachable).
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        HttpEntity entity = response.getEntity();
        if (null != entity) {
            logger.info("此次为代理访问,代理信息为[{}]", httpGet.getConfig().getProxy());
            logger.info("返回响应状态编码为[ {} ]", response.getStatusLine().getStatusCode());
            logger.info("返回内容编码类型为[ {} ]", entity.getContentType().getValue());
        }
    }
}
关于超时:可以通过设置超时时间(单位:毫秒)来决定何时放弃当前请求,避免长时间空等。
/**
 * Requests http://www.i2finance.net/ with explicit timeouts (in milliseconds)
 * set on the request, then logs the timeouts in effect, the status code and
 * the Content-Type.
 *
 * @param args unused
 * @throws Exception if executing the request fails, including when a timeout expires
 */
public static void main(String[] args) throws Exception {
    String uri = "http://www.i2finance.net/";
    HttpGet httpGet = new HttpGet(uri);
    // Request headers are plain key/value pairs; a browser User-Agent is set here,
    // and it could equally be the identifier of an Android or iOS client.
    httpGet.setHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36");
    // NOTE(review): only the connection-request and socket timeouts are set here;
    // no connect timeout is configured — confirm whether that is intended.
    RequestConfig config = RequestConfig.custom()
            .setConnectionRequestTimeout(100)
            .setSocketTimeout(10)
            .build();
    // Attach the timeout configuration to this request.
    httpGet.setConfig(config);
    // try-with-resources closes the response first and then the client,
    // even when execute() throws because a timeout expired.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        HttpEntity entity = response.getEntity();
        if (null != entity) {
            // No proxy is configured in this example, so report the timeouts in
            // effect instead of the (always null) proxy.
            logger.info("此次请求设置了超时,连接请求超时[{}]毫秒,读取超时[{}]毫秒",
                    config.getConnectionRequestTimeout(), config.getSocketTimeout());
            logger.info("返回响应状态编码为[ {} ]", response.getStatusLine().getStatusCode());
            logger.info("返回内容编码类型为[ {} ]", entity.getContentType().getValue());
        }
    }
}