/**
 * Collect the human-readable text fragments found in an HTML template.
 *
 * Text is taken from the title/alt/placeholder attribute values and from the
 * content between tags (comments, <style> and <script> blocks are removed
 * first). Multi-line fragments are split into one entry per line. Fragments
 * that are purely numeric, contain no CJK ideograph / ASCII letter / digit, or
 * look like a lone icon entity (e.g. "&#xe601;") are discarded.
 *
 * @param string $html Raw HTML markup.
 * @return array List of ['text' => string, 'len' => int] rows, de-duplicated
 *               and ordered by character length, longest first.
 */
public static function getResourceByTemp($html)
{
    // Attribute values are read from the untouched markup, before any stripping.
    $attrTexts = [];
    foreach (['title', 'alt', 'placeholder'] as $attrName) {
        preg_match_all("/{$attrName}=\"(.*?)\"/", $html, $found);
        if (!empty($found[1])) {
            $attrTexts = array_unique(array_merge($attrTexts, $found[1]));
        }
    }

    $html = preg_replace("/<!--.*?-->/is", '', $html); // drop comments
    $html = preg_replace("/<style.*?>.*?<\/style>/is", '', $html); // drop <style> blocks
    $html = preg_replace("/<script.*?>.*?<\/script>/is", '', $html); // drop <script> blocks

    // Replace every remaining tag with a marker that is unlikely to occur in
    // real text, then split on it to obtain the text between tags.
    $separator = '::#::myself::#::';
    $html = preg_replace("/<.*?>/is", $separator, $html);
    $pieces = array_filter(explode($separator, $html));
    if (!empty($attrTexts)) {
        $pieces = array_filter(array_unique(array_merge($pieces, $attrTexts)));
    }

    // One candidate per non-blank line. The previous code tested
    // strpos(PHP_EOL, $v) (arguments swapped), which never matched, so
    // multi-line fragments were never split.
    $candidates = [];
    foreach ($pieces as $piece) {
        foreach (preg_split('/\r\n|\r|\n/', trim($piece)) as $line) {
            $line = trim($line);
            if ($line !== '') {
                $candidates[] = $line;
            }
        }
    }

    $kept = [];
    foreach ($candidates as $text) {
        // Purely numeric entries carry no translatable text.
        if (is_numeric($text)) {
            continue;
        }
        // Nothing removed means the entry has no CJK ideograph, letter or digit,
        // i.e. it consists of symbols only.
        $withoutWordChars = preg_replace("/[\x{4e00}-\x{9fa5}A-Za-z0-9]/u", '', $text);
        if ($text === $withoutWordChars) {
            continue;
        }
        // Icon-font entities such as "&#xe601;" (7 or 8 bytes long).
        $byteLength = strlen($text);
        if (strpos($text, '&#x') !== false && ($byteLength == 7 || $byteLength == 8)) {
            continue;
        }
        $kept[] = $text;
    }

    $result = [];
    foreach (array_unique($kept) as $text) {
        $result[] = ['text' => trim($text), 'len' => mb_strlen($text)];
    }
    array_multisort(array_column($result, 'len'), SORT_DESC, $result); // longest first
    return $result;
}
// Get the page's title, keywords and description.
/**
 * Extract the title, keywords and description from an HTML page.
 *
 * @param string $data Raw HTML, or a value that self::isUrl() accepts, in
 *                     which case its contents are downloaded first.
 * @return array Subset of the keys 'title', 'keywords' and 'description';
 *               a key is present only when a non-empty value was matched.
 */
public static function get_sitemeta($data)
{
    if (self::isUrl($data)) {
        // The previous code passed an undefined $url here, so nothing was fetched.
        // NOTE(review): this downloads whatever URL the caller supplies —
        // confirm callers only pass trusted URLs.
        $data = file_get_contents($data);
    }

    $meta = [];
    if (empty($data)) {
        // Nothing to parse (empty input or failed download).
        return $meta;
    }

    // Title
    preg_match('/<TITLE>([\w\W]*?)<\/TITLE>/si', $data, $matches);
    if (!empty($matches[1])) {
        $meta['title'] = $matches[1];
    }

    // Keywords and description share the same four markup variants, tried in
    // this order; the first variant yielding a non-empty value wins.
    $variants = [
        '/<META\s+name="%s"\s+content="([\w\W]*?)"/si',
        "/<META\s+name='%s'\s+content='([\w\W]*?)'/si",
        '/<META\s+content="([\w\W]*?)"\s+name="%s"/si',
        '/<META\s+http-equiv="%s"\s+content="([\w\W]*?)"/si',
    ];
    foreach (['keywords', 'description'] as $field) {
        foreach ($variants as $variant) {
            preg_match(sprintf($variant, $field), $data, $matches);
            if (!empty($matches[1])) {
                $meta[$field] = $matches[1];
                break;
            }
        }
    }

    return $meta;
}