import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import com.juxinli.jobscrawler.service.CleanWebService;
import lombok.extern.slf4j.Slf4j;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.springframework.stereotype.Service;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
 * {@link CleanWebService} implementation that cleans an HTML page with HtmlCleaner,
 * converts it to a W3C DOM and extracts text or attribute values via XPath.
 */
@Slf4j
@Service
public class CleanWebServiceImpl implements CleanWebService {

    /** Placeholder put into results when a node has no usable text or attribute value. */
    private static final String NULL_PLACEHOLDER = "Null";

    /** Minimum length of the array returned by {@link #getNodeArray(String, String)}. */
    private static final int MIN_ARRAY_LENGTH = 6;

    /**
     * Cleans the given HTML and evaluates an XPath expression against the resulting DOM.
     *
     * @param pageString raw HTML of the page
     * @param xpath      XPath expression to evaluate
     * @return the result of evaluating the expression as a node set, or {@code null}
     *         if the expression could not be evaluated
     */
    @Override
    public Object fetchNode(String pageString, String xpath) {
        HtmlCleaner cleaner = new HtmlCleaner();
        TagNode cleanedRoot = cleaner.clean(pageString);
        Document dom = null;
        try {
            dom = new DomSerializer(new CleanerProperties()).createDOM(cleanedRoot);
        } catch (ParserConfigurationException e) {
            // If DOM creation fails, dom stays null and the expression is still
            // evaluated with a null context item below, exactly as before.
            log.error(e.getLocalizedMessage(), e);
        }
        XPath xPath = XPathFactory.newInstance().newXPath();
        Object rootNode = null;
        try {
            rootNode = xPath.evaluate(xpath, dom, XPathConstants.NODESET);
        } catch (XPathExpressionException e) {
            log.error("xpath提取出错", e);
        }
        return rootNode;
    }

    /**
     * Collects the value of one attribute from every node matched by the XPath expression.
     *
     * @param pageString raw HTML of the page
     * @param xpath      XPath expression selecting the nodes
     * @param attr       name of the attribute to read from each node
     * @return one entry per matched node: the attribute's text, or {@code "Null"} when the
     *         node has no such attribute (or cannot carry attributes at all); empty when
     *         nothing matched
     */
    @Override
    public List<String> getNodeListByAttr(String pageString, String xpath, String attr) {
        Object rootNode = fetchNode(pageString, xpath);
        List<String> attrContentList = new ArrayList<>();
        if (rootNode instanceof NodeList) {
            NodeList nodeList = (NodeList) rootNode;
            for (int i = 0; i < nodeList.getLength(); i++) {
                Node node = nodeList.item(i);
                // getAttributes() is null for non-element nodes (text, comment, ...).
                Node attrNode = node.getAttributes() == null
                        ? null
                        : node.getAttributes().getNamedItem(attr);
                // Exactly one entry per node: the original added "Null" and then still
                // dereferenced the missing attribute, which threw a NullPointerException.
                if (attrNode == null) {
                    attrContentList.add(NULL_PLACEHOLDER);
                } else {
                    attrContentList.add(attrNode.getTextContent());
                }
            }
        }
        return attrContentList;
    }

    /**
     * Collects the text content of every node matched by the XPath expression.
     *
     * @param pageString raw HTML of the page
     * @param xpath      XPath expression selecting the nodes
     * @return one entry per matched node: its text content, or {@code "Null"} when the
     *         text is {@code null} or empty; empty when nothing matched
     */
    @Override
    public List<String> getNodeList(String pageString, String xpath) {
        Object rootNode = fetchNode(pageString, xpath);
        List<String> contentList = new ArrayList<>();
        if (rootNode instanceof NodeList) {
            NodeList nodeList = (NodeList) rootNode;
            for (int i = 0; i < nodeList.getLength(); i++) {
                String text = nodeList.item(i).getTextContent();
                // Compare by content; the original used != "" which compares references
                // and therefore never produced the placeholder.
                boolean blank = text == null || text.isEmpty();
                contentList.add(blank ? NULL_PLACEHOLDER : text);
            }
        }
        return contentList;
    }

    /**
     * Collects the text content of the matched nodes into an array.
     * Only used to extract page numbers.
     *
     * @param pageString raw HTML of the page
     * @param xpath      XPath expression selecting the nodes
     * @return an array with at least six slots; slot {@code i} holds the text of the
     *         i-th matched node ({@code "Null"} when that text is {@code null}), and
     *         slots without a matching node stay {@code null}
     */
    @Override
    public String[] getNodeArray(String pageString, String xpath) {
        Object rootNode = fetchNode(pageString, xpath);
        NodeList nodeList = rootNode instanceof NodeList ? (NodeList) rootNode : null;
        int matchCount = nodeList == null ? 0 : nodeList.getLength();
        // Keep the historical minimum of six slots but grow when more nodes match,
        // instead of overflowing a fixed-size array.
        String[] contentArray = new String[Math.max(MIN_ARRAY_LENGTH, matchCount)];
        for (int i = 0; i < matchCount; i++) {
            Node node = nodeList.item(i);
            if (node == null) {
                continue;
            }
            String text = node.getTextContent();
            contentArray[i] = text != null ? text : NULL_PLACEHOLDER;
        }
        return contentArray;
    }
}
2019-07-17
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。
推荐阅读更多精彩内容
- 译者注:NHL选秀已经过去了将近1个月时间,让我们再回顾一下又一批的锦鲤们在选秀场的表现吧。 温哥华 -- Jac...
- 泰国曼谷,一个废弃的商场变成了鱼类的天堂 树木为了生存,顺着人行道的砖缝分离扎根。 澳大利亚悉尼,树木长满了废弃的...
- 有天晚上,在连续被两个女生说“丑”之后,我出离的愤怒。怒写七个段子,展开一场“审美运动”。没错,审判美,为丑代言。...