(.*)<\/div>/iUs";

    // Inline style attribute for WeChat article images.
    private $imageStyle = 'style="max-width: 100%;height:auto"';

    /**
     * Fetch the raw content behind a URL.
     *
     * @param string $url Address to download.
     * @return false|string Whatever file_get_contents() returns for the URL.
     * @author bignerd
     * @since 2016-08-16T10:13:58+0800
     */
    private function _get($url)
    {
        return file_get_contents($url);
    }

    /**
     * Crawl a WeChat article by URL and return its basic info together with
     * the article body in two variants (with and without images).
     *
     * @param string $url Article address.
     * @return mixed The value of error(-1, ...) when nothing could be fetched,
     *               the error payload produced by contentHandle() when the body
     *               cannot be extracted, otherwise the value of success() wrapping
     *               the basic info merged with 'content_html' and 'content_text'.
     */
    public function crawByUrl($url)
    {
        $content = $this->_get($url);
        if (empty($content)) {
            return error(-1, '⽂章不存在');
        }

        $basicInfo = $this->articleBasicInfo($content);
        $content_result = $this->contentHandle($content);

        // contentHandle() hands back either an [html, text] pair or an error
        // payload; a negative 'code' entry identifies the latter.
        if (!empty($content_result['code']) && $content_result['code'] < 0) {
            return $content_result;
        }

        list($content_html, $content_text) = $content_result;

        return success(0, '', array_merge($basicInfo, [
            'content_html' => $content_html,
            'content_text' => $content_text,
        ]));
    }

    /**
     * Process the fetched WeChat article source: extract the article body and
     * rewrite its image links.
     *
     * @author bignerd
     * @since 2016-08-16T15:59:27+0800
     * @param string $content Fetched WeChat article source.
     * @return mixed [HTML with images, HTML without images] on success, or the
     *               value of error(-1, ...) when the body pattern does not match.
     */
    private function contentHandle($content)
    {
        preg_match_all($this->wxContentDiv, $content, $html_matches);
        if (empty(array_filter($html_matches))) {
            return error(-1, '⽂章不存在');
        }

        // Only the first match of the body pattern is used.
        $content_html = $html_matches[1][0];
        $content_html = str_replace("preview.html", "player.html", $content_html);

        // HTML with images: turn each data-src attribute into a src attribute
        // whose value comes from img(getImg(...)), both defined outside this view.
        $content_html = preg_replace_callback('/data-src="(.*?)"/', function ($matches) {
            return 'src="' . img($this->getImg($matches[1])) . '" ';
        }, $content_html);

        // HTML without images: the previous empty pattern ('//s') matched nothing
        // useful and left every image in place, so strip the <img> tags explicitly.
        $content_text = preg_replace('/<img\b[^>]*>/is', '', $content_html);

        return [$content_html, $content_text];
    }

/** * 获取⽂章的基本信息 * @author bignerd * @since 2016-08-16T17:16:32+0800 * @param $content ⽂章详情源码 * @return $basicInfo */ private function articleBasicInfo($content) { //待获取item $item = [ 'ct' => 'date',//发布时间 'msg_title' => 'title',//标题 'msg_desc' => 'digest',//描述 'msg_link' => 'content_url',//⽂章链接 'msg_cdn_url' => 'cover',//封⾯图⽚链接 'nickname' => 'wechatname',//公众号名称 ]; $basicInfo = [ 'author' => '', 'copyright_stat' => '', ]; foreach ($item as $k => $v) { if ($k == 'msg_title') $pattern = '/var ' . $k . ' = \'(.*?)\'\.html\(false\);/s'; else $pattern = '/var ' . $k . ' = "\'(.*?)\'";/s'; preg_match_all($pattern, $content, $matches); if (array_key_exists(1, $matches) && !empty($matches[1][0])) { $basicInfo[$v] = trim($this->htmlTransform($matches[1][0])); } else { $basicInfo[$v] = ''; } } // // 获取作者 // preg_match('/(.*?)<\/em>/s', $content, $matchAuthor); // if(!empty($matchAuthor[1])) $basicInfo['author'] = $matchAuthor[1]; // // ⽂章类型 // preg_match('/