概述
闲来无事,研究了下搜狗的微信搜索,顺手写了两个工具:查询文章的关键词排名和公众号的关键词排名。用到的知识很简单:分析搜狗微信搜索参数和正则匹配。点击下面链接可直接访问工具:
微信文章排名查询
微信公众号排名查询
实现思路
1.分析搜狗微信搜索的主要参数,说明如下:
query:查询的关键词
type:查询类型:1-查询公众号;2-查询文章
p:当前页码
num:查询每页的结果数,默认为10
_ast:当前请求的时间
2.为保证查询参数正确,我首先模拟访问首页,获取页面中其它含义不明确的参数,再将它们与自定义的查询参数以及相关 cookies 数据一起,作为一个完整的请求发送。
3.用正则表达式匹配排名数据即可。
为保证能检索到排名,需要确认一个文章的识别码(公众号识别码就是微信号),具体如下:
/**
 * Query the Sogou WeChat search rank of an article or official account for a keyword.
 *
 * The home page is requested first so that the cookies it sets and the hidden
 * form fields it contains are sent along with the actual search request. Only
 * the first result page (requested with num=100) is inspected.
 *
 * @param string $keyword Search keyword.
 * @param string $id      Identifier to look for: a substring of the article URL
 *                        when $type is 2, or the WeChat account ID when $type is 1.
 * @param int    $type    Search type: 1 = official accounts, 2 = articles.
 * @return int 0 when no rank was found; otherwise the 1-based position in the results.
 */
function getRank($keyword, $id, $type = 2)
{
    // An empty identifier can never be ranked. Bail out before any network
    // traffic; on PHP 8 strpos() with an empty needle would match every result.
    $id = (string) $id;
    if ($id === '') {
        return 0;
    }

    // Simulate entering through the home page.
    $client = new HttpClient('weixin.sogou.com');
    if (!$client->get('/')) {
        die('查询发生错误');
    }

    $param = array(
        'query' => $keyword,
        'type'  => $type,
        'p'     => 1,
        'num'   => 100,
        '_ast'  => time(),
    );

    // Collect the extra hidden form fields of the home page. Attributes are
    // read one by one so the match does not depend on their order in the tag.
    // NOTE(review): as in the original code, a hidden field overrides a
    // same-named entry of $param -- confirm this is intended.
    if (preg_match_all('/<input\b[^>]*>/i', $client->content, $tags)) {
        foreach ($tags[0] as $tag) {
            if (!preg_match('/\btype=(["\'])hidden\1/i', $tag)) {
                continue;
            }
            if (!preg_match('/\bname=(["\'])(.+?)\1/i', $tag, $name)) {
                continue;
            }
            $value = preg_match('/\bvalue=(["\'])(.*?)\1/i', $tag, $val) ? $val[2] : '';
            $param[$name[2]] = $value;
        }
    }

    if (!$client->get('/weixin', $param)) {
        return 0;
    }

    // Loose comparison on purpose: callers may pass the type as a string.
    if ($type == 2) {
        // Article search. Each result looks like:
        //   <h4><a id="sogou_vr_..._title_2" href="http://mp.weixin.qq.com/s?__biz=..." target="_blank">title</a></h4>
        $patten = '/<h4>.*<a\b[^>]*\bhref="([^"]+)"[^>]*>(.+)<\/a>.*<\/h4>/Us';
        preg_match_all($patten, $client->content, $result);
        if (!empty($result[1])) {
            foreach ($result[1] as $k => $val) {
                if (strpos($val, $id) !== false) {
                    return $k + 1;
                }
            }
        }
        return 0;
    }

    // Official-account search. The account ID follows the "微信号:" label, e.g.
    //   <h4>微信号:yidongwang2003</h4>
    // The original pattern expected a closing </span>; both endings are accepted.
    $patten = '/微信号:([^<]+)<\/(?:span|h4)>/U';
    preg_match_all($patten, $client->content, $result);
    if (!empty($result[1])) {
        // Strict search on trimmed strings avoids PHP's numeric-string juggling.
        $k = array_search($id, array_map('trim', $result[1]), true);
        if ($k !== false) {
            return $k + 1;
        }
    }

    return 0;
}
其中 $client 是 HttpClient 类的实例,该类主要用于发送 HTTP 请求。代码如下:
/**
 * Small HTTP/1.0 client built directly on fsockopen().
 *
 * Supports GET and POST, gzip-encoded responses, cookie and referer
 * persistence between requests, basic authorization and same-host redirects.
 * Request methods return true on success and false on failure; the reason for
 * a failure is available through getError().
 */
class HttpClient
{
    // Request vars
    public $host;
    public $port;
    public $path;
    public $method;
    public $postdata = '';
    public $cookies = array();
    public $referer;
    public $accept = 'text/xml,application/xml,application/xhtml+xml,text/html,text/plain,image/png,image/jpeg,image/gif,*/*';
    public $accept_encoding = 'gzip';
    public $accept_language = 'en-us';
    public $user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36';

    // Options
    public $timeout = 20;
    public $use_gzip = true;
    // If true, received cookies are placed in the $this->cookies array ready for the next request.
    // Note: This currently ignores the cookie path (and time) completely. Time is not important,
    // but path could possibly lead to security problems.
    public $persist_cookies = true;
    public $persist_referers = true; // For each request, sends path of last request as referer
    public $debug = false;
    public $handle_redirects = true; // Automatically redirect if Location or URI header is found
    public $max_redirects = 5;
    public $headers_only = false; // If true, stops receiving once headers have been read.

    // Basic authorization variables
    public $username;
    public $password;

    // Response vars
    public $status;
    public $headers = array();
    public $content = '';
    public $errormsg;

    // Tracker variables
    public $redirect_count = 0;
    public $cookie_host = '';

    /**
     * @param string $host Host name to connect to (without scheme).
     * @param int    $port TCP port, 80 by default.
     */
    public function __construct($host, $port = 80)
    {
        $this->host = $host;
        $this->port = $port;
    }

    /**
     * Legacy PHP 4-style constructor name, kept as an ordinary method so code
     * that calls it explicitly keeps working. PHP 8 no longer treats a method
     * named after the class as a constructor, hence __construct() above.
     */
    public function HttpClient($host, $port = 80)
    {
        $this->__construct($host, $port);
    }

    /**
     * Perform a GET request.
     *
     * @param string            $path Request path.
     * @param array|string|bool $data Optional query data, appended after "?".
     * @return bool True on success, false on failure.
     */
    public function get($path, $data = false)
    {
        $this->path = $path;
        $this->method = 'GET';
        // A body left over from an earlier post() must not be sent with a GET.
        $this->postdata = '';
        if ($data) {
            $this->path .= '?' . $this->buildQueryString($data);
        }
        return $this->doRequest();
    }

    /**
     * Perform a POST request with form-urlencoded data.
     *
     * @param string       $path Request path.
     * @param array|string $data Form data.
     * @return bool True on success, false on failure.
     */
    public function post($path, $data)
    {
        $this->path = $path;
        $this->method = 'POST';
        $this->postdata = $this->buildQueryString($data);
        return $this->doRequest();
    }

    /**
     * Turn an array into a URL-encoded query string; non-array input is returned unchanged.
     * A nested array produces one "key=value" pair per element under the same key.
     *
     * @param array|string $data
     * @return string
     */
    public function buildQueryString($data)
    {
        if (!is_array($data)) {
            return $data;
        }
        $pairs = array();
        foreach ($data as $key => $val) {
            $values = is_array($val) ? $val : array($val);
            foreach ($values as $single) {
                $pairs[] = urlencode($key) . '=' . urlencode($single);
            }
        }
        return implode('&', $pairs);
    }

    /**
     * Perform the actual HTTP request using the current path, method and post data.
     *
     * @return bool True on success, false on failure (see getError()).
     */
    public function doRequest()
    {
        // Warnings are suppressed because the failure is reported through $errno/$errstr.
        $fp = @fsockopen($this->host, $this->port, $errno, $errstr, $this->timeout);
        if (!$fp) {
            switch ($errno) {
                case -3:
                    $this->errormsg = 'Socket creation failed (-3)';
                    break;
                case -4:
                    $this->errormsg = 'DNS lookup failure (-4)';
                    break;
                case -5:
                    $this->errormsg = 'Connection refused or timed out (-5)';
                    break;
                default:
                    $this->errormsg = 'Connection failed (' . $errno . ')';
                    $this->errormsg .= ' ' . $errstr;
            }
            $this->debug($this->errormsg);
            return false;
        }
        socket_set_timeout($fp, $this->timeout);
        $request = $this->buildRequest();
        $this->debug('Request', $request);
        fwrite($fp, $request);

        // Reset all the variables that should not persist between requests
        $this->headers = array();
        $this->content = '';
        $this->errormsg = '';

        $inHeaders = true;
        $atStart = true;
        // Now start reading back the response
        while (!feof($fp)) {
            $line = fgets($fp, 4096);
            if ($line === false) {
                // Read error or timeout: stop instead of spinning on a stream
                // that never reaches EOF.
                break;
            }
            if ($atStart) {
                // Deal with first line of returned data
                $atStart = false;
                if (!preg_match('/HTTP\/(\\d\\.\\d)\\s*(\\d+)\\s*(.*)/', $line, $m)) {
                    $this->errormsg = "Status code line invalid: " . htmlentities($line);
                    $this->debug($this->errormsg);
                    fclose($fp); // do not leak the socket on the error path
                    return false;
                }
                $this->status = $m[2];
                $this->debug(trim($line));
                continue;
            }
            if ($inHeaders) {
                if (trim($line) == '') {
                    // Blank line: end of headers
                    $inHeaders = false;
                    $this->debug('Received Headers', $this->headers);
                    if ($this->headers_only) {
                        break; // Skip the rest of the input
                    }
                    continue;
                }
                if (!preg_match('/([^:]+):\\s*(.*)/', $line, $m)) {
                    // Not a header line; skip to the next one
                    continue;
                }
                $key = strtolower(trim($m[1]));
                $val = trim($m[2]);
                // Deal with the possibility of multiple headers of same name
                if (isset($this->headers[$key])) {
                    if (is_array($this->headers[$key])) {
                        $this->headers[$key][] = $val;
                    } else {
                        $this->headers[$key] = array($this->headers[$key], $val);
                    }
                } else {
                    $this->headers[$key] = $val;
                }
                continue;
            }
            // We're not in the headers, so append the line to the contents
            $this->content .= $line;
        }
        fclose($fp);

        if ($atStart) {
            // Nothing was received at all, so there is no status line to trust.
            $this->errormsg = "Status code line invalid: ";
            $this->debug($this->errormsg);
            return false;
        }

        // If data is compressed, uncompress it
        if (isset($this->headers['content-encoding']) && $this->headers['content-encoding'] == 'gzip') {
            $this->debug('Content is gzip encoded, unzipping it');
            // Strip the 10-byte gzip header before inflating.
            // See http://www.php.net/manual/en/function.gzencode.php
            $this->content = substr($this->content, 10);
            $this->content = gzinflate($this->content);
        }

        // If $persist_cookies, deal with any cookies
        if ($this->persist_cookies && isset($this->headers['set-cookie'])) {
            $cookies = $this->headers['set-cookie'];
            if (!is_array($cookies)) {
                $cookies = array($cookies);
            }
            foreach ($cookies as $cookie) {
                // Only the leading name=value pair is kept. The trailing ";" is
                // optional so cookies sent without attributes are stored too.
                if (preg_match('/^\s*([^=;]+)=([^;]*)/', $cookie, $m)) {
                    $this->cookies[$m[1]] = $m[2];
                }
            }
        }

        // If $persist_referers, set the referer ready for the next request
        if ($this->persist_referers) {
            $this->debug('Persisting referer: ' . $this->getRequestURL());
            $this->referer = $this->getRequestURL();
        }

        // Finally, if handle_redirects and a redirect is sent, do that
        if ($this->handle_redirects) {
            $location = isset($this->headers['location']) ? $this->headers['location'] : '';
            $uri = isset($this->headers['uri']) ? $this->headers['uri'] : '';
            if ($location || $uri) {
                // Count only real redirects; counting every request made the
                // client fail after max_redirects ordinary requests.
                if (++$this->redirect_count >= $this->max_redirects) {
                    $this->errormsg = 'Number of redirects exceeded maximum ('.$this->max_redirects.')';
                    $this->debug($this->errormsg);
                    $this->redirect_count = 0;
                    return false;
                }
                $url = parse_url($location.$uri);
                // This will FAIL if redirect is to a different site
                $target = isset($url['path']) ? $url['path'] : '/';
                if (isset($url['query'])) {
                    $target .= '?' . $url['query'];
                }
                return $this->get($target);
            }
            // Chain finished: start counting afresh for the next request.
            $this->redirect_count = 0;
        }
        return true;
    }

    /**
     * Assemble the raw request: header lines joined by CRLF, a blank line, then the body.
     *
     * @return string
     */
    public function buildRequest()
    {
        $headers = array();
        // Using 1.1 leads to all manner of problems, such as "chunked" encoding
        $headers[] = "{$this->method} {$this->path} HTTP/1.0";
        $headers[] = "Host: {$this->host}";
        $headers[] = "User-Agent: {$this->user_agent}";
        $headers[] = "Accept: {$this->accept}";
        if ($this->use_gzip) {
            $headers[] = "Accept-encoding: {$this->accept_encoding}";
        }
        $headers[] = "Accept-language: {$this->accept_language}";
        if ($this->referer) {
            $headers[] = "Referer: {$this->referer}";
        }
        // Cookies
        if ($this->cookies) {
            $cookie = 'Cookie: ';
            foreach ($this->cookies as $key => $value) {
                $cookie .= "$key=$value; ";
            }
            $headers[] = $cookie;
        }
        // Basic authentication
        if ($this->username && $this->password) {
            $headers[] = 'Authorization: BASIC ' . base64_encode($this->username . ':' . $this->password);
        }
        // If this is a POST, set the content type and length
        if ($this->postdata) {
            $headers[] = 'Content-Type: application/x-www-form-urlencoded';
            $headers[] = 'Content-Length: ' . strlen($this->postdata);
        }
        // HTTP requires CRLF line endings and an empty line before the body.
        return implode("\r\n", $headers) . "\r\n\r\n" . $this->postdata;
    }

    /** @return string Status code of the last response. */
    public function getStatus()
    {
        return $this->status;
    }

    /** @return string Body of the last response. */
    public function getContent()
    {
        return $this->content;
    }

    /** @return array Headers of the last response, keyed by lower-cased name. */
    public function getHeaders()
    {
        return $this->headers;
    }

    /**
     * @param string $header Header name, case-insensitive.
     * @return string|array|bool Header value(s), or false when the header is absent.
     */
    public function getHeader($header)
    {
        $header = strtolower($header);
        return isset($this->headers[$header]) ? $this->headers[$header] : false;
    }

    /** @return string Error message of the last failed request. */
    public function getError()
    {
        return $this->errormsg;
    }

    /** @return array Cookies currently held, name => value. */
    public function getCookies()
    {
        return $this->cookies;
    }

    /** @return string Absolute http:// URL of the current request path. */
    public function getRequestURL()
    {
        $url = 'http://' . $this->host;
        if ($this->port != 80) {
            $url .= ':' . $this->port;
        }
        $url .= $this->path;
        return $url;
    }

    // Setter methods

    /** Set the Referer header value (method name kept as-is for compatibility). */
    public function setReferee($string)
    {
        $this->referer = $string;
    }

    public function setUserAgent($string)
    {
        $this->user_agent = $string;
    }

    public function setAuthorization($username, $password)
    {
        $this->username = $username;
        $this->password = $password;
    }

    public function setCookies($array)
    {
        $this->cookies = $array;
    }

    // Option setting methods

    public function useGzip($boolean)
    {
        $this->use_gzip = $boolean;
    }

    public function setPersistCookies($boolean)
    {
        $this->persist_cookies = $boolean;
    }

    public function setPersistReferers($boolean)
    {
        $this->persist_referers = $boolean;
    }

    public function setHandleRedirects($boolean)
    {
        $this->handle_redirects = $boolean;
    }

    public function setMaxRedirects($num)
    {
        $this->max_redirects = $num;
    }

    public function setHeadersOnly($boolean)
    {
        $this->headers_only = $boolean;
    }

    public function setDebug($boolean)
    {
        $this->debug = $boolean;
    }

    // "Quick" static methods

    /**
     * Fetch a URL with GET in one call.
     *
     * @param string $url Absolute URL.
     * @return string|bool Response body, or false on failure.
     */
    public static function quickGet($url)
    {
        $bits = parse_url($url);
        $host = $bits['host'];
        $port = isset($bits['port']) ? $bits['port'] : 80;
        $path = isset($bits['path']) ? $bits['path'] : '/';
        if (isset($bits['query'])) {
            $path .= '?' . $bits['query'];
        }
        $client = new HttpClient($host, $port);
        if (!$client->get($path)) {
            return false;
        }
        return $client->getContent();
    }

    /**
     * Send form data to a URL with POST in one call.
     *
     * @param string       $url  Absolute URL.
     * @param array|string $data Form data.
     * @return string|bool Response body, or false on failure.
     */
    public static function quickPost($url, $data)
    {
        $bits = parse_url($url);
        $host = $bits['host'];
        $port = isset($bits['port']) ? $bits['port'] : 80;
        $path = isset($bits['path']) ? $bits['path'] : '/';
        $client = new HttpClient($host, $port);
        if (!$client->post($path, $data)) {
            return false;
        }
        return $client->getContent();
    }

    /**
     * Print a debug box when the $debug option is enabled.
     *
     * @param string $msg    Message to show.
     * @param mixed  $object Optional value dumped with print_r() and HTML-escaped.
     */
    public function debug($msg, $object = false)
    {
        if (!$this->debug) {
            return;
        }
        print ' <div style="border: 1px solid red; padding: 0.5em; margin: 0.5em;"><strong>HttpClient Debug:</strong> ' .$msg;
        if ($object) {
            ob_start();
            print_r($object);
            $content = htmlentities(ob_get_contents());
            ob_end_clean();
            print $content;
        }
        print '</div> ';
    }
}
备注:微信排名变动比较大,所以排名的准确性比百度要低好多。本示例仅供参考!
转载请注明:勇哥实验室 » 微信文章和公众号排名查询工具