Login   Register  
PHP Classes
elePHPant
Icontem

File: GoogleCrawler.class.php

Recommend this page to a friend!
Stumble It! Bookmark in del.icio.us
  Classes of Jonathan Schmidt-Dominé  >  GoogleCrawler  >  GoogleCrawler.class.php  >  Download  
File: GoogleCrawler.class.php
Role: Class source
Content type: text/plain
Description: GoogleCrawler-Class
Class: GoogleCrawler
Perform searches and retrieve results from Google
Author: By Jonathan Schmidt-Dominé
Last change: all proxies working again
Date: 2 years ago
Size: 7,593 bytes
 

Contents

Class file image Download
<?php
/*
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * (C) Jonathan Schmidt-Dominé 2008-2012 < devel@the-user.org >
 */
/**
 * Retrieves Google web search results.
 *
 * The result page is requested through a randomly chosen web proxy from a
 * built-in list and is then scraped with plain string searches. After
 * construction the public properties hold the raw page, the parsed result
 * list and the reported number of results.
 */
class GoogleCrawler
{
    /** @var string Raw HTML of the fetched result page, with proxy links rewritten to their targets. */
    public $content;
    /** @var array List of results; every entry has the keys 'url', 'title', 'description' and 'cache-url' (string or null). */
    public $results;
    /** @var int Number of results scraped from the page; 0 when the counter could not be located. */
    public $numresults;
    /** @var bool Whether the proxy chosen for the current request uses base64-encoded target URLs. */
    private $usebase64;

    /** @var array Proxy entry points (host and path, without scheme). */
    private static $glypeProxies = array(
        '2proxy.info/browse.php',
        'flation.info/browse.php',
        'orbing.info/browse.php',
        'voye.info/browse.php',
        'iners.info/browse.php',
        'horted.info/browse.php',
        'omzeil.com/browse.php',
        'byewall.com/browse.php',
        'iplama.com/browse.php',
        'bouncevia.nl/browse.php'
    );

    /** @var array Per-proxy flag, index-aligned with $glypeProxies: true when that proxy needs base64 mode. */
    private static $base64required = array(
        true,
        false,
        false,
        false,
        false,
        false,
        true,
        true,
        true,
        true
    );

    /** @var int How many randomly chosen proxies are tried before giving up. */
    private static $glypeMaxTries = 5;

    /**
     * Fetches a document over plain HTTP (port 80) and returns its body.
     *
     * A leading "http://" is optional. "Location:" headers are followed.
     *
     * NOTE(review): the request is sent as HTTP/1.1 but a chunked response
     * body is not decoded here - confirm the servers in use do not chunk.
     *
     * @param string $url          Target URL, with or without the "http://" prefix.
     * @param int    $maxRedirects Remaining number of redirects that may be followed.
     * @return string|false Response body, or false when the host is empty, the
     *                      connection fails, the read times out or too many
     *                      redirects occur.
     */
    public static function getUrl($url, $maxRedirects = 10)
    {
        if (strncmp($url, 'http://', 7) === 0)
        {
            $url = substr($url, 7);
        }

        // A URL without any path ("example.com") addresses the document root.
        $slashPos = strpos($url, '/');
        if ($slashPos === false)
        {
            $host = $url;
            $path = '/';
        }
        else
        {
            $host = substr($url, 0, $slashPos);
            $path = substr($url, $slashPos);
        }
        if ($host === '')
        {
            return false;
        }

        $fp = fsockopen($host, 80);
        if (!$fp)
        {
            return false;
        }
        // Only configure the stream once we know the connection succeeded.
        stream_set_timeout($fp, 5);

        // The body is read until EOF below, so the server is asked to close
        // the connection instead of keeping it alive.
        $request = implode("\r\n", array(
            "GET $path HTTP/1.1",
            "Host: $host",
            'User-Agent: Mozilla/5.0 (X11; U; Linux i686; de; rv:1.9.0.3) Gecko/2008092700 Firefox/3.0.3',
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            "Referer: http://$url",
            'Cookie: ',
            'Connection: close'
        )) . "\r\n\r\n";
        fwrite($fp, $request);

        // Consume the header section; it ends at the first empty line.
        while (!feof($fp))
        {
            // No length limit: long Location headers must not be truncated.
            $line = fgets($fp);
            if ($line === false || $line === "\n" || $line === "\r\n")
            {
                break;
            }
            if (strncasecmp($line, 'Location:', 9) === 0)
            {
                fclose($fp);
                if ($maxRedirects <= 0)
                {
                    return false;
                }
                return self::getUrl(trim(substr($line, 9)), $maxRedirects - 1);
            }
        }

        $body = stream_get_contents($fp);
        $info = stream_get_meta_data($fp);
        fclose($fp);

        if ($body === false || $info['timed_out'])
        {
            return false;
        }
        return $body;
    }

    /**
     * preg_replace_callback() helper: URL-decodes the first capture group.
     *
     * @param array $results Match array; index 1 holds the encoded URL.
     * @return string
     */
    private static function urldecode1(array $results)
    {
        return urldecode($results[1]);
    }

    /**
     * Unwraps a Google redirect link ("http://google.tld/url?q=TARGET&...")
     * into TARGET. Any other URL is returned unchanged.
     *
     * @param string $url
     * @return string
     */
    private static function removeGoogleSpyingUrl($url)
    {
        return preg_replace_callback(
            '~^http://(?:www\.)?google\.[a-zA-Z]+/url\?q=([^&]*).*$~',
            array(__CLASS__, 'urldecode1'),
            $url
        );
    }

    /**
     * preg_replace_callback() helper that turns one proxied href back into a
     * direct link.
     *
     * @param array $match Match array; index 1 is the proxy's "u" parameter value.
     * @return string Complete href="..." attribute pointing at the real target.
     */
    private function base64Callback(array $match)
    {
        $target = urldecode(html_entity_decode($match[1]));
        $prefix = '';
        if ($this->usebase64)
        {
            // In base64 mode the decoded value lacks the leading "http",
            // which is therefore prepended again.
            $target = base64_decode($target);
            $prefix = 'http';
        }
        return 'href="' . self::removeGoogleSpyingUrl($prefix . htmlspecialchars($target)) . '"';
    }

    /**
     * Fetches $url through one of the configured proxies.
     *
     * Up to $glypeMaxTries randomly selected proxies are tried. Links in the
     * returned page that point back into the proxy are rewritten to their
     * real targets. The name of each tried proxy is echoed.
     *
     * @param string $url URL to fetch.
     * @return string Page content with rewritten links.
     * @throws Exception When none of the tried proxies delivered any content.
     */
    public function getUrlViaProxy($url)
    {
        for ($i = 0; $i < self::$glypeMaxTries; ++$i)
        {
            $id = array_rand(self::$glypeProxies);
            $proxy = self::$glypeProxies[$id];
            $this->usebase64 = self::$base64required[$id];
            echo 'Proxy: ' . htmlspecialchars($proxy);

            $data = self::getUrl($proxy . "?u=" . urlencode($url) . ($this->usebase64 ? "&b=25" : "&b=0"));
            if ($data)
            {
                // The proxy name contains regex metacharacters ('.'), so it
                // has to be quoted before being embedded in the pattern.
                return preg_replace_callback(
                    '~href="http://' . preg_quote($proxy, '~') . '\?u=([^&"]*)[^"]*"~',
                    array($this, 'base64Callback'),
                    $data
                );
            }
        }
        throw new Exception("Can't get any Google-Results");
    }

    /**
     * Runs a search and parses the result page.
     *
     * @param string $keywordsGot Search terms.
     * @param int    $pageNum     1-based result page; 0 omits the "start" parameter.
     * @param int    $number      Results per page.
     * @param string $lang        Interface language code.
     * @param string $googleurl   Search endpoint.
     * @throws Exception When the page could not be fetched through any proxy.
     */
    public function __construct($keywordsGot, $pageNum = 1, $number = 10, $lang = 'en', $googleurl = 'http://www.google.com/cse')
    {
        $query = $googleurl . "?q=" . urlencode($keywordsGot);
        if ((int) $pageNum !== 0)
        {
            $query .= "&start=" . (($pageNum - 1) * $number);
        }
        $query .= "&num=" . $number . "&ie=UTF-8&oe=UTF-8&hl=" . $lang;

        $this->content = $this->getUrlViaProxy($query);
        $this->parseContent();
    }

    /**
     * strpos() wrapper that reports "not found" instead of failing when the
     * offset is itself a failed search result or lies outside the haystack.
     *
     * @param string    $haystack
     * @param string    $needle
     * @param int|false $offset
     * @return int|false
     */
    private static function findFrom($haystack, $needle, $offset)
    {
        if ($offset === false || $offset < 0 || $offset > strlen($haystack))
        {
            return false;
        }
        return strpos($haystack, $needle, $offset);
    }

    /**
     * Fills $numresults and $results from $content.
     */
    private function parseContent()
    {
        $html = (string) $this->content;
        $cursor = $this->parseNumResults($html);
        $this->results = self::parseResults($html, $cursor);
    }

    /**
     * Extracts the result counter, i.e. the content of the third <b> element
     * following the statistics cell, with '.' and ',' separators removed.
     *
     * @param string $html
     * @return int Offset at which the result list search should start
     *             (0 when the counter was not found).
     */
    private function parseNumResults($html)
    {
        $this->numresults = 0;

        $pos = self::findFrom($html, '<td nowrap align="right"><font size="-1">', 0);
        // Skip the ends of the first two <b> elements, then locate the third one.
        $steps = array(
            array('</b>', 34),
            array('</b>', 4),
            array('<b>', 4)
        );
        foreach ($steps as $step)
        {
            if ($pos === false)
            {
                return 0;
            }
            $pos = self::findFrom($html, $step[0], $pos + $step[1]);
        }
        if ($pos === false)
        {
            return 0;
        }

        $start = $pos + 3;
        $end = self::findFrom($html, '</b>', $start);
        if ($end === false)
        {
            return 0;
        }

        $this->numresults = intval(str_replace(array('.', ','), '', substr($html, $start, $end - $start)));
        return $end + 4;
    }

    /**
     * Extracts the individual results. The markup searched for is:
     *
     * <h2 class="r"><a class="l" href="{url}" class="l" >{title}</a> ...
     * <div class="std"><span class="s">{description}</span> ...
     * <span class="a">{url} - </span><nobr><a href="{cache-url}" class="fl" ...
     *
     * Parsing stops at the first result whose url, title or description is
     * incomplete. 'cache-url' is null when a result has no usable cache link.
     *
     * @param string $html
     * @param int    $offset Where to start searching.
     * @return array
     */
    private static function parseResults($html, $offset)
    {
        $resultMarker = '<h2 class="r"><a class="l" href="';
        $descriptionMarker = '<div class="std"><span class="s">';
        $cacheMarker = '<nobr><a href="http://';

        $results = array();
        $pos = self::findFrom($html, $resultMarker, $offset);
        while ($pos !== false)
        {
            $curr = array();

            $urlStart = $pos + strlen($resultMarker);
            $urlEnd = self::findFrom($html, '"', $urlStart);
            if ($urlEnd === false)
            {
                break;
            }
            $curr['url'] = html_entity_decode(substr($html, $urlStart, $urlEnd - $urlStart));

            $titleStart = self::findFrom($html, '>', $urlEnd);
            $titleEnd = self::findFrom($html, '</a>', $titleStart);
            if ($titleStart === false || $titleEnd === false)
            {
                break;
            }
            $curr['title'] = substr($html, $titleStart + 1, $titleEnd - $titleStart - 1);

            $descriptionPos = self::findFrom($html, $descriptionMarker, $titleEnd);
            if ($descriptionPos === false)
            {
                break;
            }
            $descriptionStart = $descriptionPos + strlen($descriptionMarker);
            $descriptionEnd = self::findFrom($html, '</span>', $descriptionStart);
            if ($descriptionEnd === false)
            {
                break;
            }
            $curr['description'] = substr($html, $descriptionStart, $descriptionEnd - $descriptionStart);

            $cachePos = self::findFrom($html, $cacheMarker, $descriptionEnd);
            $nextPos = self::findFrom($html, $resultMarker, $descriptionEnd);

            $curr['cache-url'] = null;
            // A cache link belongs to this result only if it precedes the next result.
            if ($cachePos !== false && ($nextPos === false || $cachePos < $nextPos))
            {
                $hrefStart = $cachePos + strlen($cacheMarker);
                $hrefEnd = self::findFrom($html, '"', $hrefStart);
                // The cache address is what follows the second '//' inside the href.
                $innerPos = self::findFrom($html, '//', $hrefStart);
                if ($hrefEnd !== false && $innerPos !== false && $innerPos < $hrefEnd)
                {
                    $curr['cache-url'] = 'http://' . html_entity_decode(substr($html, $innerPos + 2, $hrefEnd - $innerPos - 2));
                }
            }

            $results[] = $curr;
            $pos = $nextPos;
        }
        return $results;
    }
}
?>