1.先添加依赖
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
2.写如下代码
package com.stylefeng.guns.common.utils;
import com.auth0.jwt.internal.org.apache.commons.lang3.StringUtils;
import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Simple multi-threaded web crawler: scrapes image URLs from paginated
 * search pages on 51yuansu.com and downloads them to a local directory.
 *
 * @author wy
 * @date 2018/9/21 9:55
 */
public class RepTileTest2 {
private static final StringfilePath ="E:\\imgeTest";
// private static int initCount = 1;
public static final ThreadLocaltempCount =new ThreadLocal() {
@Override
protected IntegerinitialValue() {
System.out.println("当前线程初始化:" + Thread.currentThread().getName());
return 1;
}
};
public ListrepTileGo(String url) {
Document document =null;
List imgUrList =new ArrayList();
try {
document = Jsoup.connect(url).get();
Elements elements = document.getElementsByTag("img");
for (Element element : elements) {
String imgUrl = element.attr("data-src");
System.out.println("爬到图片地址:" + imgUrl);
imgUrList.add(imgUrl);
}
}catch (IOException e) {
System.out.println("卧槽,爬虫爬GG了");
}
return imgUrList;
}
/**
* @author:wy
* @date: 2018/9/21 13:37
* @description: 计算下一页
*/
public StringnextUrl(String url) {
//最后一次出现的下标
int lastIndex = url.lastIndexOf("-") +1;
int total = url.length();
//拿到页数参数
String lastPage = url.substring(lastIndex, total);
//下一页 默认第一页
String nextPage ="";
if (StringUtils.isNotEmpty(lastPage)) {
//计算下一页
nextPage = String.valueOf(Integer.valueOf(lastPage) +1);
}
//拼接url地址
StringBuilder sb =new StringBuilder(url);
StringBuilder nextUlr = sb.replace(lastIndex, total, nextPage);
System.out.println(nextUlr);
return nextUlr.toString();
}
/**
* @author:Zhang jc
* @date: 2018/9/21 14:33
* @description: 下载图片
*/
public static void downLoadImage(String imgUrl, String imgType) {
if (StringUtils.isEmpty(imgUrl)) {
return;
}
String fileName = imgType +tempCount.get();
tempCount.set(Integer.valueOf(tempCount.get()) +1);
String mkdirPath =filePath +"\\" + fileName +".jpg";
File imgMkdir =new File(filePath);
if (!createMkdir(imgMkdir)) {
System.out.println("文件创建失败!");
return;
}
System.out.println("文件创建成功!");
try {
URL url =new URL(imgUrl);
InputStream inputStream = url.openConnection().getInputStream();
FileOutputStream fileOutputStream =new FileOutputStream(new File(mkdirPath));
byte[] bs =new byte[1024];
int len;
while ((len = inputStream.read(bs)) != -1) {
fileOutputStream.write(bs, 0, len);
}
}catch (Exception e) {
e.printStackTrace();
}
}
private static boolean createMkdir(File file) {
try {
if (file.exists()) {
System.out.println("文件已经存在!");
return true;
}
FileUtils.forceMkdir(file);
return true;
}catch (IOException e) {
return false;
}
}
static class ZjcsSmallReptileimplements Runnable {
private Stringurl;
private StringimgType;
public ZjcsSmallReptile(String url, String imgType) {
this.url = url;
this.imgType = imgType;
}
@Override
public void run() {
int count =0;
RepTileTest2 repTile =new RepTileTest2();
//保存所有爬到的图片地址
List list =new ArrayList<>();
while (count <=227) {
count++;
url = repTile.nextUrl(url);
List partList = repTile.repTileGo(url);
for (String part : partList) {
list.add(part);
}
}
if (list.isEmpty()) {
return;
}
for (String imgUrl : list) {
System.out.println("开始下载图片,图片地址:" + imgUrl);
RepTileTest2.downLoadImage(imgUrl, imgType);
System.out.println("下载图片结束=======");
}
}
}
public static void main(String[] args) {
String url ="http://www.51yuansu.com/search/guoqing-0-0-0-0-0";
Map map =new HashMap<>();
map.put("guoqing", "http://www.51yuansu.com/search/guoqing-0-0-0-0-0");
map.put("zhongqiujie", "http://www.51yuansu.com/search/zhongqiujie-0-0-0-0-0");
map.put("qiutian", "http://www.51yuansu.com/search/qiutian-0-0-0-0-0");
for (Map.Entry entry : map.entrySet()) {
ZjcsSmallReptile reptile =new ZjcsSmallReptile(entry.getValue(), entry.getKey());
Thread thread =new Thread(reptile);
thread.start();
}
}
}