主管让做个抓取淘宝数据的功能,但是淘宝的比较难,我先从扒新浪新闻开始。
环境:Apache 提供免费的 HttpClient 源码和 JAR 包下载,可以登录这里下载,笔者用的是 4.5.1 版本。
参考 Apache 提供的例子,使用正则表达式做出如下程序。
```java
/**
 * Minimal crawler demo: downloads the Sina hot-news index page, follows every
 * single-quoted absolute {@code http://} link found on it, and prints the
 * {@code <title>} and the tag-free {@code <p>} paragraphs of each linked page.
 *
 * <p>Pages are scanned with regular expressions, not an HTML parser, so only
 * elements whose text contains no nested tag are reported.
 */
public class Main {

    /** Index page whose links are crawled by {@link #main(String[])}. */
    private static final String INDEX_URL = "http://news.sina.com.cn/hotnews/";

    // Patterns are immutable and thread-safe; compile them once instead of per page.
    private static final Pattern TITLE_PATTERN = Pattern.compile("<title>[^<]*</title>");
    private static final Pattern PARAGRAPH_PATTERN = Pattern.compile("<p>[^<]*</p>");
    private static final Pattern LINK_PATTERN = Pattern.compile("href='http://[^']*'");

    /**
     * Downloads one page and prints its title (prefixed with {@code ---}) followed by
     * every tag-free paragraph, one per line.
     *
     * <p>A dedicated HTTP client is created and closed for this single call.
     *
     * @param url absolute URL of the page to download
     * @throws Exception if the request cannot be executed or the body cannot be read
     */
    public static void Detail(String url) throws Exception {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        try {
            printDetail(httpclient, url);
        } finally {
            httpclient.close();
        }
    }

    /**
     * Crawls the hot-news index: prints each discovered link, the details of the
     * linked page, and a running counter.
     *
     * @param args ignored
     * @throws Exception if the index page itself cannot be downloaded
     */
    public static void main(String[] args) throws Exception {
        // One client for the whole crawl so pooled connections can be reused.
        CloseableHttpClient httpclient = HttpClients.createDefault();
        try {
            String indexHtml = fetch(httpclient, INDEX_URL, "UTF-8");
            Matcher matcher = LINK_PATTERN.matcher(indexHtml);
            int i = 1;
            while (matcher.find()) {
                String link = matcher.group();
                // Drop the leading "href='" (6 chars) and the trailing quote.
                link = link.substring(6, link.length() - 1);
                System.out.println(link);
                try {
                    printDetail(httpclient, link);
                } catch (Exception e) {
                    // One unreachable or malformed link must not abort the remaining crawl.
                    System.err.println("Failed to fetch " + link + ": " + e);
                }
                System.out.println(i++);
            }
        } finally {
            httpclient.close();
        }
    }

    /** Fetches {@code url} with the given client and prints its title and paragraphs. */
    private static void printDetail(CloseableHttpClient httpclient, String url)
            throws java.io.IOException {
        // URLs containing "comments" are decoded as UTF-8, everything else as GBK.
        // NOTE(review): presumably mirrors how the target site encodes those pages - confirm.
        String encoding = url.contains("comments") ? "utf-8" : "gbk";
        System.out.println(encoding);
        String html = fetch(httpclient, url, encoding);

        Matcher title = TITLE_PATTERN.matcher(html);
        if (title.find()) {
            String str = title.group();
            // Strip "<title>" (7 chars) and "</title>" (8 chars).
            System.out.println("---" + str.substring(7, str.length() - 8));
        }

        Matcher paragraph = PARAGRAPH_PATTERN.matcher(html);
        while (paragraph.find()) {
            String str = paragraph.group();
            // Strip "<p>" (3 chars) and "</p>" (4 chars).
            System.out.println(str.substring(3, str.length() - 4));
        }
    }

    /**
     * Executes a GET request, prints the request URI and the status line, and returns
     * the response body as text.
     *
     * @param httpclient     client used to execute the request; not closed here
     * @param url            absolute URL to request
     * @param defaultCharset charset passed to {@code EntityUtils.toString} as the default
     * @return the response body, or an empty string when the response has no entity
     */
    private static String fetch(CloseableHttpClient httpclient, String url, String defaultCharset)
            throws java.io.IOException {
        HttpGet httpget = new HttpGet(url);
        System.out.println("Executing request " + httpget.getURI());
        CloseableHttpResponse response = httpclient.execute(httpget);
        try {
            System.out.println("----------------------------------------");
            System.out.println(response.getStatusLine());
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                // No body to decode; an empty page simply yields no matches.
                return "";
            }
            // The entity is read in full here, so no abort() on the request is needed.
            return EntityUtils.toString(entity, defaultCharset);
        } finally {
            response.close();
        }
    }
}
```