HttpClient:
(1)实现了所有 HTTP 的方法(GET,POST,PUT,DELETE 等)
(2)支持自动转向
(3)支持 HTTPS 协议
(4)支持代理服务器等
//pom.xml
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
//基本用法
/**
 * Minimal walkthrough of Apache HttpClient: one form-encoded POST followed by one GET
 * with query parameters, both sent to the same URL and printed to standard output.
 *
 * <p>The client and every response are opened in try-with-resources so that they are
 * released even when a request or the body decoding throws.
 */
public class HttpClientDemo {
    /**
     * Runs the POST and GET examples in sequence.
     *
     * @param args unused
     * @throws Exception if building the URI, executing a request or reading a body fails
     */
    public static void main(String[] args) throws Exception {
        String url = "http://www.baidu.com";
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            // ---- POST request ----
            HttpPost post = new HttpPost(url);
            // Request headers
            post.addHeader("head1", "123");
            post.setHeader("Cookie", "aaa");
            // Form parameters
            List<NameValuePair> kvList = new ArrayList<>();
            kvList.add(new BasicNameValuePair("param1", "111"));
            kvList.add(new BasicNameValuePair("param2", "222"));
            // Wrap the parameters in a URL-encoded form entity and attach it as the body
            StringEntity entity = new UrlEncodedFormEntity(kvList, "UTF8");
            post.setEntity(entity);
            // Execute the POST; the response is closed as soon as the body has been read
            try (CloseableHttpResponse response = httpClient.execute(post)) {
                String result = EntityUtils.toString(response.getEntity(), "UTF8");
                System.out.println(result);
            }

            // ---- GET request ----
            // GET parameters travel in the query string, so build a URI that carries them
            URIBuilder uriBuilder = new URIBuilder(url);
            List<NameValuePair> list = new ArrayList<>();
            list.add(new BasicNameValuePair("h1", "123"));
            uriBuilder.setParameters(list);
            HttpGet httpGet = new HttpGet(uriBuilder.build());
            // Execute the GET
            try (CloseableHttpResponse resp = httpClient.execute(httpGet)) {
                String ret = EntityUtils.toString(resp.getEntity(), "UTF8");
                System.out.println(ret);
            }
        }
    }
}
//工具类
/**
 * Static helpers built on Apache HttpClient: form and JSON POSTs executed on a fresh default
 * client, and GETs executed on a client that shares one pooled connection manager.
 *
 * <p>Failure contract (unchanged from the original): the POST helpers log and return
 * {@code null}; the GET helpers log and return an empty string.
 *
 * <p>NOTE(review): the HTTPS socket factory created in the static initialiser trusts every
 * certificate and disables host-name verification, and its enabled-protocol list still names
 * SSLv3/TLSv1. That is kept as-is because callers may rely on it, but it offers no protection
 * against man-in-the-middle attacks and should be reviewed before production use.
 */
public class HttpClientUtil {
    private static final Logger logger = LoggerFactory.getLogger(HttpClientUtil.class);
    private static final String HTTP = "http";
    private static final String HTTPS = "https";
    private static SSLConnectionSocketFactory sslsf = null;
    private static PoolingHttpClientConnectionManager cm = null;

    static {
        try {
            SSLContextBuilder builder = new SSLContextBuilder();
            // Trust everything: no identity check is performed on the server certificate.
            builder.loadTrustMaterial(null, new TrustStrategy() {
                @Override
                public boolean isTrusted(X509Certificate[] x509Certificates, String s) throws CertificateException {
                    return true;
                }
            });
            sslsf = new SSLConnectionSocketFactory(builder.build(), new String[]{"SSLv2Hello", "SSLv3", "TLSv1", "TLSv1.2"}, null, NoopHostnameVerifier.INSTANCE);
            Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
                    .register(HTTP, new PlainConnectionSocketFactory())
                    .register(HTTPS, sslsf)
                    .build();
            cm = new PoolingHttpClientConnectionManager(registry);
            // Upper bound on pooled connections across all routes.
            // NOTE(review): the per-route limit is left at the library default — confirm it is sufficient.
            cm.setMaxTotal(200);
        } catch (Exception e) {
            // Leaves sslsf/cm null; the GET helpers then fall back to HttpClient's defaults.
            logger.error("Failed to initialise the shared HTTPS connection manager", e);
        }
    }

    /**
     * Sends a URL-encoded form POST and returns the response body decoded as UTF-8.
     *
     * @param url    target URL
     * @param heads  request headers to add, or {@code null} for none
     * @param params form fields to send as the body, or {@code null} for no body
     * @return the response body, or {@code null} if the request failed
     */
    public static String doPost(String url, Map<String,String> heads, Map<String,String> params){
        try {
            HttpPost post = new HttpPost(url);
            if (heads != null) {
                for (Entry<String, String> entry : heads.entrySet()) {
                    post.addHeader(entry.getKey(), entry.getValue());
                }
            }
            if (params != null) {
                // Wrap the form fields in an entity and use it as the request body
                StringEntity entity = new UrlEncodedFormEntity(toNameValuePairs(params), "UTF8");
                post.setEntity(entity);
            }
            return postForString(post);
        } catch (IOException | ParseException e) {
            logger.error("post请求异常:{}", e.getMessage(), e);
        }
        return null;
    }

    /**
     * Sends a JSON POST ({@code Content-Type: application/json}) and returns the response body
     * decoded as UTF-8.
     *
     * @param url    target URL
     * @param params JSON document to send as the body, or {@code null} for no body
     * @return the response body, or {@code null} if the request failed
     */
    public static String doPostWithEntity(String url, JSONObject params){
        try {
            return postForString(newJsonPost(url, params));
        } catch (IOException | ParseException e) {
            logger.error("post请求异常:{}", e.getMessage(), e);
        }
        return null;
    }

    /**
     * Sends a JSON POST and returns the raw response bytes when the response is a JPEG image.
     * Any other content type is treated as a failure: the body is logged as a warning.
     *
     * @param url    target URL
     * @param params JSON document to send as the body, or {@code null} for no body
     * @return the image bytes, or {@code null} if the response was not {@code image/jpeg} or
     *         the request failed
     */
    public static byte[] doPostToStream(String url, JSONObject params){
        try {
            HttpPost post = newJsonPost(url, params);
            // try-with-resources: the original returned from the JPEG branch without closing either resource
            try (CloseableHttpClient httpClient = HttpClients.createDefault();
                 CloseableHttpResponse response = httpClient.execute(post)) {
                HttpEntity httpEntity = response.getEntity();
                ContentType contentType = ContentType.getOrDefault(httpEntity);
                if (ContentType.IMAGE_JPEG.getMimeType().equals(contentType.getMimeType())) {
                    return EntityUtils.toByteArray(httpEntity);
                }
                String result = EntityUtils.toString(httpEntity, "UTF8");
                logger.warn("doPostToStream方法请求失败:{}", result);
            }
        } catch (IOException | ParseException e) {
            logger.error("post请求异常:{}", e.getMessage(), e);
        }
        return null;
    }

    /**
     * Sends a GET through the shared pooled connection manager and the trust-all HTTPS socket
     * factory, and returns the response body decoded as UTF-8.
     *
     * @param url    base URL
     * @param heads  request headers to add, or {@code null} for none
     * @param params query parameters to set on the URL, or {@code null} for none
     * @return the response body, or an empty string if the request failed
     */
    public static String doGetWithHttps(String url, Map<String,String> heads, Map<String,String> params){
        String result = "";
        // The connection manager is shared, so closing this client does not shut the pool down.
        try (CloseableHttpClient httpClient = HttpClients.custom()
                .setSSLSocketFactory(sslsf)
                .setConnectionManager(cm)
                .setConnectionManagerShared(true)
                .build()) {
            // GET parameters travel in the query string, so build a URI that carries them
            URIBuilder uriBuilder = new URIBuilder(url);
            if (params != null) {
                uriBuilder.setParameters(toNameValuePairs(params));
            }
            HttpGet httpGet = new HttpGet(uriBuilder.build());
            if (heads != null) {
                for (Entry<String, String> entry : heads.entrySet()) {
                    httpGet.addHeader(entry.getKey(), entry.getValue());
                }
            }
            // Execute the GET request
            result = getForString(httpClient, httpGet);
        } catch (Exception e) {
            logger.error("doGetWithHttps请求异常:{}", e.getMessage(), e);
        }
        return result;
    }

    /**
     * Sends a GET through the shared pooled connection manager using the supplied cookie store,
     * and returns the response body decoded as UTF-8.
     *
     * @param url         URL to fetch
     * @param cookieStore cookie store the client reads from and writes to
     * @return the response body, or an empty string if the request failed
     */
    public static String doGetWithCookie(String url, CookieStore cookieStore){
        String result = "";
        try (CloseableHttpClient httpClient = HttpClients.custom()
                .setDefaultCookieStore(cookieStore)
                .setSSLSocketFactory(sslsf)
                .setConnectionManager(cm)
                .setConnectionManagerShared(true)
                .build()) {
            URIBuilder uriBuilder = new URIBuilder(url);
            HttpGet httpGet = new HttpGet(uriBuilder.build());
            // Execute the GET request
            result = getForString(httpClient, httpGet);
        } catch (Exception e) {
            logger.error("doGetWithCookie请求异常:{}", e.getMessage(), e);
        }
        return result;
    }

    /** Copies the map entries into a list of name/value pairs, in the map's iteration order. */
    private static List<NameValuePair> toNameValuePairs(Map<String, String> params) {
        List<NameValuePair> pairs = new ArrayList<>();
        for (Entry<String, String> entry : params.entrySet()) {
            pairs.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
        }
        return pairs;
    }

    /** Builds a POST with a JSON content-type header and, when params is non-null, its JSON text as the body. */
    private static HttpPost newJsonPost(String url, JSONObject params) {
        HttpPost post = new HttpPost(url);
        post.addHeader("Content-Type", "application/json");
        if (params != null) {
            post.setEntity(new StringEntity(params.toJSONString(), "UTF8"));
        }
        return post;
    }

    /** Executes the POST on a fresh default client, returns the UTF-8 body, and always closes client and response. */
    private static String postForString(HttpPost post) throws IOException {
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(post)) {
            return EntityUtils.toString(response.getEntity(), "UTF8");
        }
    }

    /** Executes the GET on the given client, returns the UTF-8 body, and always closes the response. */
    private static String getForString(CloseableHttpClient httpClient, HttpGet httpGet) throws IOException {
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            return EntityUtils.toString(response.getEntity(), "UTF8");
        }
    }
}
Jsoup:
(1)一款Java的HTML解析器,主要用来对HTML解析
(2)虽然也支持从某个地址直接去爬取网页源码,但是只支持HTTP,HTTPS协议,支持不够丰富
//pom.xml
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>
//基本用法
// Fetch the page with a manually supplied Cookie header, then print the parsed document.
Document document = Jsoup.connect(url).header("Cookie", "your cookies").get();
System.out.println(document);
爬虫实现思路
在做爬虫时,经常会遇到需要登录的情况,比如写脚本抢票之类的,凡是涉及个人信息的页面都需要先登录。对于这类问题主要有两种解决方式:第一种方式是手动设置 cookie,即先在网站上登录,复制登录后的 cookies,然后在爬虫程序中手动设置 HTTP 请求中的 Cookie 属性。这种方式适用于采集频次不高、采集周期短的场景;因为 cookie 会失效,如果长期采集就需要频繁重新设置 cookie,这不是一种可行的办法。第二种方式是使用程序模拟登录,通过模拟登录获取 cookies。这种方式适用于长期采集该网站的场景,因为每次采集前都会先登录,这样就不需要担心 cookie 过期的问题。
1、模拟登录之Jsoup方式
/**
 * Simulates a Douban login with Jsoup and then opens the personal-centre page.
 *
 * <p>Flow: POST the login form, take the cookies from the login response, and send them
 * with a second request to the personal-centre URL. The form field names used here can be
 * discovered by submitting a wrong account/password in the browser and inspecting the request.
 *
 * @param loginUrl    login URL the form is posted to
 * @param userInfoUrl personal-centre URL fetched with the login cookies
 * @throws IOException if either HTTP request fails
 */
public void jsoupLogin(String loginUrl,String userInfoUrl) throws IOException {
    // Login form parameters
    Map<String,String> data = new HashMap<>();
    data.put("name","your_account");
    data.put("password","your_password");
    data.put("remember","false");
    data.put("ticket","");
    data.put("ck","");
    Connection.Response login = Jsoup.connect(loginUrl)
            .ignoreContentType(true) // skip the response content-type check
            .followRedirects(false)  // do not follow redirects
            .postDataCharset("utf-8")
            .header("Upgrade-Insecure-Requests","1")
            .header("Accept","application/json")
            .header("Content-Type","application/x-www-form-urlencoded")
            .header("X-Requested-With","XMLHttpRequest")
            .header("User-Agent","Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36")
            .data(data)
            .method(Connection.Method.POST)
            .execute();
    login.charset("UTF-8");
    // The login response now carries the cookies issued by the server;
    // reuse them for the personal-centre request.
    Document document = Jsoup.connect(userInfoUrl)
            .cookies(login.cookies())
            .get();
    // get() either returns a parsed document or throws, so no null check is needed here.
    Element element = document.select(".info h1").first();
    if (element == null) {
        System.out.println("没有找到 .info h1 标签");
        return;
    }
    String userName = element.ownText();
    System.out.println("豆瓣我的网名为:" + userName);
}
2、模拟登录之httpclient方式
/**
 * Simulates a Douban login with Apache HttpClient and then fetches the personal-centre page.
 *
 * <p>Unlike the Jsoup variant, no cookies are copied by hand: both requests go through the
 * same client instance, which is expected to keep the cookies set by the login response and
 * send them with the follow-up request.
 *
 * <p>The client and both responses are closed via try-with-resources, including on failure.
 *
 * @param loginUrl    login URL the form is posted to
 * @param userInfoUrl personal-centre URL fetched after a successful login
 * @throws Exception if the login URL is not a valid URI, or a request or body read fails
 */
public void httpClientLogin(String loginUrl,String userInfoUrl) throws Exception{
    HttpUriRequest login = RequestBuilder.post()
            .setUri(new URI(loginUrl)) // login URL
            .setHeader("Upgrade-Insecure-Requests","1")
            .setHeader("Accept","application/json")
            .setHeader("Content-Type","application/x-www-form-urlencoded")
            .setHeader("X-Requested-With","XMLHttpRequest")
            .setHeader("User-Agent","Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36")
            // Account information
            .addParameter("name","your_account")
            .addParameter("password","your_password")
            .addParameter("remember","false")
            .addParameter("ticket","")
            .addParameter("ck","")
            .build();
    try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
        // Simulate the login; only the status code is needed, so release the response right away
        int loginStatus;
        try (CloseableHttpResponse response = httpclient.execute(login)) {
            loginStatus = response.getStatusLine().getStatusCode();
        }
        if (loginStatus == 200) {
            // Request the personal-centre page on the same client
            HttpGet httpGet = new HttpGet(userInfoUrl);
            try (CloseableHttpResponse userResponse = httpclient.execute(httpGet)) {
                HttpEntity entity = userResponse.getEntity();
                String body = EntityUtils.toString(entity, "utf-8");
                // Shortcut: just check whether the expected nickname appears in the page
                System.out.println("缺心眼那叫单纯是否查找到?"+(body.contains("缺心眼那叫单纯")));
            }
        } else {
            System.out.println("httpclient 模拟登录豆瓣失败了!!!!");
        }
    }
}
两者的不同之处在于:httpclient 能够像浏览器一样保存 session 会话,登录之后 cookie 会被保存下来,之后在同一个 httpclient 内发起的请求都会自动带上该 cookie。