PHP Classes

File: GoogleCrawler.class.php

Recommend this page to a friend!
  Classes of Jonathan Schmidt-Dominé   GoogleCrawler   GoogleCrawler.class.php   Download  
File: GoogleCrawler.class.php
Role: Class source
Content type: text/plain
Description: GoogleCrawler-Class
Class: GoogleCrawler
Perform searches and retrieve results from Google
Author: By Jonathan Schmidt-Dominé
Last change: all proxies working again
Date: 12 years ago
Size: 7,593 bytes
 

Contents

Class file image Download
<?php
/*
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * (C) Jonathan Schmidt-Dominé 2008-2012 < devel@the-user.org >
 */
/**
 * Retrieves Google web search results.
 *
 * Constructing an instance performs the search right away: the result page is
 * fetched through one of the web proxies listed in $glypeProxies and then
 * scraped with plain string searches. The outcome is exposed through the
 * public properties $content, $results and $numresults.
 */
class GoogleCrawler
{
    /** @var string HTML of the result page as returned by getUrlViaProxy(). */
    public $content;

    /** @var array[] Parsed hits; each entry has the keys 'url', 'title', 'description' and 'cache-url' (string or null). */
    public $results;

    /** @var int Total number of hits as printed on the result page; 0 when the figure cannot be located. */
    public $numresults;

    /** @var bool Whether the proxy chosen for the current attempt is flagged in $base64required. */
    private $usebase64;

    /** @var string[] Proxy entry points (host and path, without scheme). */
    private static $glypeProxies = array(
        '2proxy.info/browse.php',
        'flation.info/browse.php',
        'orbing.info/browse.php',
        'voye.info/browse.php',
        'iners.info/browse.php',
        'horted.info/browse.php',
        'omzeil.com/browse.php',
        'byewall.com/browse.php',
        'iplama.com/browse.php',
        'bouncevia.nl/browse.php'
    );

    /** @var bool[] Parallel to $glypeProxies: true when that proxy's rewritten links carry base64 data. */
    private static $base64required = array(
        true,
        false,
        false,
        false,
        false,
        false,
        true,
        true,
        true,
        true
    );

    /** @var int Number of randomly chosen proxies tried before giving up. */
    private static $glypeMaxTries = 5;

    /**
     * Fetches a document over plain HTTP (port 80) and returns its body.
     *
     * @param string $url          Target URL; a leading "http://" is optional.
     * @param int    $maxRedirects How many "Location:" redirects may still be followed.
     *
     * @return string|false Response body, or false when the host is missing, the
     *                      connection fails, the read times out or the redirect
     *                      budget is exhausted.
     */
    public static function getUrl($url, $maxRedirects = 5)
    {
        if (strncmp($url, 'http://', 7) === 0) {
            $url = (string) substr($url, 7);
        }

        // Split "host/path?query" at the first '/' or '?'; a URL without either
        // is a bare host and is requested as "/".
        $hostLength = strcspn($url, '/?');
        $host = (string) substr($url, 0, $hostLength);
        $path = (string) substr($url, $hostLength);
        if ($host === '') {
            return false;
        }
        if ($path === '' || $path[0] !== '/') {
            $path = '/' . $path;
        }

        $fp = fsockopen($host, 80);
        if ($fp === false) {
            return false;
        }
        // Only a valid stream may be configured, hence after the failure check.
        stream_set_timeout($fp, 5);

        // HTTP header lines end in CRLF. "Connection: close" makes the server
        // end the stream after the body so that reading to EOF terminates
        // instead of running into the read timeout.
        $request = "GET $path HTTP/1.1\r\n"
            . "Host: $host\r\n"
            . "User-Agent: Mozilla/5.0 (X11; U; Linux i686; de; rv:1.9.0.3) Gecko/2008092700 Firefox/3.0.3\r\n"
            . "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
            . "Referer: http://$url\r\n"
            . "Cookie: \r\n"
            . "Connection: close\r\n\r\n";
        fwrite($fp, $request);

        $chunked = false;
        while (!feof($fp)) {
            // No length limit: a capped read would cut long header lines apart.
            $line = fgets($fp);
            if ($line === false || $line === "\n" || $line === "\r\n") {
                break; // read failure or the blank line that ends the header section
            }
            if (strncasecmp($line, 'Location:', 9) === 0) {
                fclose($fp);
                if ($maxRedirects <= 0) {
                    return false; // guards against redirect loops
                }
                $target = trim(substr($line, 9));
                if ($target !== '' && $target[0] === '/') {
                    $target = $host . $target; // path-only redirect stays on this host
                }
                return self::getUrl($target, $maxRedirects - 1);
            }
            if (strncasecmp($line, 'Transfer-Encoding:', 18) === 0 && stripos($line, 'chunked') !== false) {
                $chunked = true;
            }
        }

        $body = stream_get_contents($fp);
        $info = stream_get_meta_data($fp);
        fclose($fp);
        if ($body === false || $info['timed_out']) {
            return false;
        }
        return $chunked ? self::decodeChunkedBody($body) : $body;
    }

    /**
     * Removes HTTP/1.1 chunked transfer framing from a response body.
     *
     * @param string $body Body made of "<hex size>[;extension]CRLF<data>CRLF" records.
     *
     * @return string The concatenated chunk data; the input is returned untouched
     *                when it does not look like chunked framing.
     */
    private static function decodeChunkedBody($body)
    {
        $decoded = '';
        $offset = 0;
        $length = strlen($body);
        while ($offset < $length) {
            $lineEnd = strpos($body, "\r\n", $offset);
            if ($lineEnd === false) {
                return $body;
            }
            $sizeField = substr($body, $offset, $lineEnd - $offset);
            $extension = strpos($sizeField, ';');
            if ($extension !== false) {
                $sizeField = substr($sizeField, 0, $extension);
            }
            $sizeField = trim($sizeField);
            if (!preg_match('/^[0-9a-fA-F]+$/', $sizeField)) {
                return $body;
            }
            $size = (int) hexdec($sizeField);
            if ($size === 0) {
                break; // zero-sized chunk terminates the body
            }
            $decoded .= substr($body, $lineEnd + 2, $size);
            $offset = $lineEnd + 2 + $size + 2; // skip the data and its trailing CRLF
        }
        return $decoded;
    }

    /**
     * preg_replace_callback() handler: URL-decodes the first capture group.
     *
     * @param array $results Match array.
     *
     * @return string
     */
    private static function urldecode1(array $results)
    {
        return urldecode($results[1]);
    }

    /**
     * Replaces a Google redirect link ("http://[www.]google.<tld>/url?q=<target>...")
     * with its URL-decoded target; any other URL is returned unchanged.
     *
     * @param string $url
     *
     * @return string
     */
    private static function removeGoogleSpyingUrl($url)
    {
        return preg_replace_callback('~^http://(?:www\.)?google\.[a-zA-Z]+/url\?q=([^&]*).*$~', array(__CLASS__, 'urldecode1'), $url);
    }

    /**
     * preg_replace_callback() handler used by getUrlViaProxy(): turns a
     * proxy-wrapped href back into a direct one.
     *
     * @param array $match Match array; $match[1] is the value of the proxy's "u" parameter.
     *
     * @return string Replacement href attribute.
     */
    private function base64Callback(array $match)
    {
        $tmp = urldecode(html_entity_decode($match[1]));
        if ($this->usebase64)
            $tmp = base64_decode($tmp);
        // NOTE(review): 'http' is prepended only for base64 proxies, presumably
        // because they drop that prefix before encoding - confirm against the proxy.
        return 'href="' . self::removeGoogleSpyingUrl(($this->usebase64 ? 'http' : '') . htmlspecialchars($tmp)) . '"';
    }

    /**
     * Fetches a URL through randomly chosen proxies and unwraps the links that
     * point back at the proxy.
     *
     * Each attempt prints "Proxy: <proxy>" to the output.
     *
     * @param string $url URL to fetch.
     *
     * @return string Fetched document with proxy hrefs rewritten.
     *
     * @throws Exception When none of the $glypeMaxTries attempts yields a document.
     */
    public function getUrlViaProxy($url)
    {
        $proxyCount = count(self::$glypeProxies);
        for ($i = 0; $i < self::$glypeMaxTries; ++$i) {
            $id = mt_rand(0, $proxyCount - 1);
            $proxy = self::$glypeProxies[$id];
            $this->usebase64 = self::$base64required[$id];
            echo 'Proxy: ' . htmlspecialchars($proxy);

            $data = self::getUrl($proxy . "?u=" . urlencode($url) . ($this->usebase64 ? "&b=25" : "&b=0"));
            if (!is_string($data) || $data === '') {
                continue;
            }

            // The proxy name is quoted so that its dots match literally.
            $pattern = '~href="http://' . preg_quote($proxy, '~') . '\?u=([^&"]*)[^"]*"~';
            $rewritten = preg_replace_callback($pattern, array($this, 'base64Callback'), $data);
            if ($rewritten !== null) {
                return $rewritten;
            }
            // A null result signals a PCRE error; try another proxy.
        }
        throw new Exception("Can't get any Google-Results");
    }

    // Markup of one hit, as expected by parseResults():
    // <h2 class="r"><a class="l" href="{url}" class="l" >{title}</a>(...stuff...)<div class="std"><span class="s">{description}</span>...<span class="a">{url} - </span><nobr><a href="{cache-url}" class="fl"(...don't care...)

    /**
     * Runs the search and parses the result page.
     *
     * @param string $keywordsGot Search terms.
     * @param int    $pageNum     Result page; 0 sends no "start" parameter, otherwise
     *                            start = ($pageNum - 1) * $number.
     * @param int    $number      Value of the "num" parameter.
     * @param string $lang        Value of the "hl" parameter.
     * @param string $googleurl   Search endpoint.
     *
     * @throws Exception When the page cannot be fetched through any proxy.
     */
    public function __construct($keywordsGot, $pageNum = 1, $number = 10, $lang = 'en', $googleurl = 'http://www.google.com/cse')
    {
        $query = "?q=" . urlencode($keywordsGot);
        if ($pageNum != 0) {
            $query .= "&start=" . (($pageNum - 1) * $number);
        }
        $query .= "&num=" . $number . "&ie=UTF-8&oe=UTF-8&hl=" . $lang;

        $this->content = (string) $this->getUrlViaProxy($googleurl . $query);

        $parsed = self::parseContent($this->content);
        $this->numresults = $parsed['numresults'];
        $this->results = $parsed['results'];
    }

    /**
     * Extracts the total hit count and the individual hits from a result page.
     *
     * @param string $html Result page markup.
     *
     * @return array array('numresults' => int, 'results' => array[])
     */
    private static function parseContent($html)
    {
        list($count, $offset) = self::parseResultCount($html);
        return array(
            'numresults' => $count,
            'results' => self::parseResults($html, $offset)
        );
    }

    /**
     * Reads the total hit count: the third bold number after the statistics cell.
     *
     * @param string $html Result page markup.
     *
     * @return array array(count, offset just behind the count); array(0, 0) when
     *               the expected markup is missing.
     */
    private static function parseResultCount($html)
    {
        $marker = '<td nowrap align="right"><font size="-1">';
        $pos = strpos($html, $marker);
        if ($pos === false) {
            return array(0, 0);
        }
        $pos += strlen($marker);

        // Skip the first two bold values that precede the total.
        for ($i = 0; $i < 2; ++$i) {
            $pos = strpos($html, '</b>', $pos);
            if ($pos === false) {
                return array(0, 0);
            }
            $pos += 4;
        }

        $open = strpos($html, '<b>', $pos);
        if ($open === false) {
            return array(0, 0);
        }
        $close = strpos($html, '</b>', $open + 3);
        if ($close === false) {
            return array(0, 0);
        }

        // Drop '.' and ',' thousands separators before converting.
        $digits = str_replace(array('.', ','), '', substr($html, $open + 3, $close - $open - 3));
        return array(intval($digits), $close + 4);
    }

    /**
     * Collects the hits that follow $offset. Parsing stops at the first hit
     * whose markup is incomplete.
     *
     * @param string $html   Result page markup.
     * @param int    $offset Position at which to start looking.
     *
     * @return array[] Entries with the keys 'url', 'title', 'description', 'cache-url'.
     */
    private static function parseResults($html, $offset)
    {
        $linkMarker = '<h2 class="r"><a class="l" href="';
        $descMarker = '<div class="std"><span class="s">';
        $cacheMarker = '<nobr><a href="http://';

        $results = array();
        $pos = strpos($html, $linkMarker, $offset);
        while ($pos !== false) {
            $urlStart = $pos + strlen($linkMarker);
            $urlEnd = strpos($html, '"', $urlStart);
            if ($urlEnd === false) {
                break;
            }

            $tagEnd = strpos($html, '>', $urlEnd);
            if ($tagEnd === false) {
                break;
            }
            $titleEnd = strpos($html, '</a>', $tagEnd);
            if ($titleEnd === false) {
                break;
            }

            $descPos = strpos($html, $descMarker, $titleEnd);
            if ($descPos === false) {
                break;
            }
            $descStart = $descPos + strlen($descMarker);
            $descEnd = strpos($html, '</span>', $descStart);
            if ($descEnd === false) {
                break;
            }

            $entry = array(
                'url' => html_entity_decode(substr($html, $urlStart, $urlEnd - $urlStart)),
                'title' => substr($html, $tagEnd + 1, $titleEnd - $tagEnd - 1),
                'description' => substr($html, $descStart, $descEnd - $descStart),
                'cache-url' => null
            );

            // The cache link belongs to this hit only if it precedes the next hit.
            $cachePos = strpos($html, $cacheMarker, $descEnd);
            $nextPos = strpos($html, $linkMarker, $descEnd);
            if ($cachePos !== false && ($nextPos === false || $cachePos < $nextPos)) {
                $hrefStart = $cachePos + strlen($cacheMarker);
                $quote = strpos($html, '"', $hrefStart);
                $slashes = strpos($html, '//', $hrefStart);
                // The cache URL is what follows the next "//" inside the href.
                // NOTE(review): when the href holds no second "//" the entry
                // stays null; confirm that this is the desired fallback.
                if ($quote !== false && $slashes !== false && $slashes < $quote) {
                    $entry['cache-url'] = 'http://' . html_entity_decode(substr($html, $slashes + 2, $quote - $slashes - 2));
                }
            }

            $results[] = $entry;
            $pos = $nextPos;
        }
        return $results;
    }
}
?>