Login   Register  
PHP Classes
elePHPant
Icontem

File: spider_simple.class.php

Recommend this page to a friend!
Stumble It! Stumble It! Bookmark in del.icio.us Bookmark in del.icio.us
  Classes of vivek  >  Web Crawler using MySQL DB  >  spider_simple.class.php  >  Download  
File: spider_simple.class.php
Role: Class source
Content type: text/plain
Description: DBCrawler
Class: Web Crawler using MySQL DB
Retrieve Web pages and store links in a database
Author: By
Last change: enhancement for class
Date: 6 years ago
Size: 5,269 bytes
 

Contents

Class file image Download
<?

/**
* About author:
*  vivekanandan
* email: vivekanandan8@gmail.com
*
* If you want any help with the spider or anything else in PHP, just mail me.
*
* About class:
*  WebSpider    -  constructor; sets the domain & URL to map
*  
*   processTagInPageData()     - processes the anchor tags & frame tags, as Googlebot does
*   fetchURLPageData()         - returns the HTML page content for the given URL
*     isURLExists()            - checks whether the given URL has already been added to the DB  
*   displayDomainRecords    - displays the records from the DB    
*     StoreUniqueURL             - stores the URL in the DB if it is not already there 
*     processSpecificTagbyType-  parses each matching tag & truncates the parsed tag from the string 
*/



// Show PHP errors in the output of any script that includes this file.
ini_set('display_errors', '1');


/* table structure 

CREATE TABLE `spider` (
  `id` bigint(20) NOT NULL auto_increment,
  `domain` varchar(150) NOT NULL,
  `url` varchar(2000) NOT NULL,
  `parentid` int(11) NOT NULL,
  `visitflag` int(11) NOT NULL,
  `type` varchar(20) NOT NULL,
  `level` mediumint(9) NOT NULL,
  PRIMARY KEY  (`id`)
)
*/


/**
 * Minimal web spider: downloads a page with cURL, extracts link targets from
 * <a href=...> and <frame src=...> tags and records every distinct URL of a
 * domain in the MySQL table `spider`.
 *
 * The database settings (mDBHost, mDBUserName, mDBPassword, mDBDatabase) are
 * NOT set by the constructor; assign them on the instance before calling any
 * method that touches the database.
 *
 * All SQL goes through mysqli prepared statements, so callers must pass raw
 * (unescaped) values.
 */
class WebSpider
{
    /** Depth passed to the constructor; stored only, no method of this class reads it. */
    public $mMaxDepth;
    /** Domain under which URLs are stored; also sent as the HTTP referer. */
    public $mDomain;
    /** MySQL host name. */
    public $mDBHost;
    /** MySQL user name. */
    public $mDBUserName;
    /** MySQL password. */
    public $mDBPassword;
    /** MySQL database (schema) that contains the `spider` table. */
    public $mDBDatabase;
    /** Kept for backward compatibility; no method of this class writes it. */
    public $mURLPageData;
    /** Start URL fetched by fetchURLDataandParseURL(). */
    public $mURL;

    /** Lazily created mysqli link shared by all queries of this instance. */
    private $mDBLink = null;

    /**
     * @param string $pmDomain domain the crawl belongs to
     * @param int    $pmDepth  maximum crawl depth
     * @param string $pmURL    start URL
     */
    public function __construct($pmDomain, $pmDepth, $pmURL)
    {
        // A PHP 4 style constructor (method named like the class) is no longer
        // invoked by `new` on PHP 8, hence __construct().
        $this->mDomain   = $pmDomain;
        $this->mMaxDepth = $pmDepth;
        $this->mURL      = $pmURL;
    }

    /**
     * Returns the database link, connecting on first use.
     *
     * @return mysqli
     * @throws RuntimeException when the connection cannot be established
     */
    private function getConnection()
    {
        if ($this->mDBLink === null) {
            $oLink = new mysqli($this->mDBHost, $this->mDBUserName, $this->mDBPassword, $this->mDBDatabase);
            if ($oLink->connect_errno) {
                throw new RuntimeException('Database connection failed: ' . $oLink->connect_error);
            }
            $this->mDBLink = $oLink;
        }

        return $this->mDBLink;
    }

    /**
     * Prepares a statement on the shared link.
     *
     * @param string $pmSQL SQL text with `?` placeholders
     * @return mysqli_stmt
     * @throws RuntimeException when the statement cannot be prepared
     */
    private function prepareStatement($pmSQL)
    {
        $oLink = $this->getConnection();
        $oStmt = $oLink->prepare($pmSQL);
        if ($oStmt === false) {
            throw new RuntimeException('Could not prepare statement: ' . $oLink->error);
        }

        return $oStmt;
    }

    /**
     * Executes a prepared statement and fails loudly instead of silently.
     *
     * @param mysqli_stmt $pmStmt
     * @throws RuntimeException when execution fails
     */
    private function executeStatement($pmStmt)
    {
        if (!$pmStmt->execute()) {
            throw new RuntimeException('Could not execute statement: ' . $pmStmt->error);
        }
    }

    /**
     * Escapes a value for output inside HTML text.
     *
     * @param mixed $pmValue
     * @return string
     */
    private function escapeHtml($pmValue)
    {
        return htmlspecialchars((string) $pmValue, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }

    /**
     * Counts the rows already stored for a domain/URL pair.
     *
     * @param string $pmDomain
     * @param string $pmURL    raw URL (do not pre-escape it)
     * @return int number of matching rows; 0 means the URL is not stored yet
     * @throws RuntimeException on database errors
     */
    public function isURLExists($pmDomain, $pmURL)
    {
        $vDomain = (string) $pmDomain;
        $vURL    = (string) $pmURL;

        $oStmt = $this->prepareStatement('SELECT COUNT(id) FROM spider WHERE domain = ? AND url = ?');
        $oStmt->bind_param('ss', $vDomain, $vURL);
        $this->executeStatement($oStmt);

        $vCount = 0;
        $oStmt->bind_result($vCount);
        $oStmt->fetch();
        $oStmt->close();

        return (int) $vCount;
    }

    /**
     * Prints an HTML table with every URL stored for the given domain,
     * preceded by the domain name and the total number of URLs.
     *
     * @param string $pmDomain
     * @throws RuntimeException on database errors
     */
    public function displayDomainRecords($pmDomain)
    {
        $vDomain = (string) $pmDomain;

        $oStmt = $this->prepareStatement('SELECT url, type FROM spider WHERE domain = ? ORDER BY id ASC');
        $oStmt->bind_param('s', $vDomain);
        $this->executeStatement($oStmt);
        // Buffer the result so the row count is known before the table is printed.
        $oStmt->store_result();

        $vURL  = null;
        $vType = null;
        $oStmt->bind_result($vURL, $vType);

        print "<strong>Domain</strong> : " . $this->escapeHtml($vDomain)
            . " <strong>Total URL</strong> :" . $oStmt->num_rows;

        echo "\n<table width='80%'  cellspacing='2' cellpadding='2' border=\"1\">\n";
        echo "  <tr>\n";
        echo "    <td><strong>URL</strong></td>\n";
        echo "    <td><strong>Type</strong></td>\n";
        echo "  </tr>\n";

        while ($oStmt->fetch()) {
            // Stored URLs come from crawled pages and are untrusted: escape them.
            echo "  <tr>\n";
            echo "    <td>" . $this->escapeHtml($vURL) . "</td>\n";
            echo "    <td>" . $this->escapeHtml($vType) . "</td>\n";
            echo "  </tr>\n";
        }

        echo "</table>\n";

        $oStmt->close();
    }

    /**
     * Inserts the URL for the domain unless the same domain/URL pair is
     * already stored. New rows get visitflag 0.
     *
     * @param string $pmDomain
     * @param string $pmURL      raw URL (do not pre-escape it)
     * @param int    $pmParentId id of the row the URL was found on
     * @param int    $pmLevel    crawl level of the URL
     * @param string $pmType     kind of link, e.g. 'index' or the tag name
     * @throws RuntimeException on database errors
     */
    public function StoreUniqueURL($pmDomain, $pmURL, $pmParentId, $pmLevel, $pmType)
    {
        if ($this->isURLExists($pmDomain, $pmURL) != 0) {
            return;
        }

        $vDomain   = (string) $pmDomain;
        $vURL      = (string) $pmURL;
        $vParentId = (int) $pmParentId;
        $vType     = (string) $pmType;
        $vLevel    = (int) $pmLevel;

        $oStmt = $this->prepareStatement(
            'INSERT INTO `spider` (`domain`, `url`, `parentid`, `visitflag`, `type`, `level`)
             VALUES (?, ?, ?, 0, ?, ?)'
        );
        $oStmt->bind_param('ssisi', $vDomain, $vURL, $vParentId, $vType, $vLevel);
        $this->executeStatement($oStmt);
        $oStmt->close();
    }

    /**
     * Finds the first occurrence of the tag that carries the attribute and
     * returns the attribute value.
     *
     * The match is case-insensitive, the attribute has to be inside the same
     * tag (before its closing `>`), and the value may be double-quoted,
     * single-quoted or unquoted. The tag name must be followed by whitespace,
     * so '<a' does not match '<abbr' or '<area'.
     *
     * @param string $pmData         HTML to search
     * @param string $pmTagName      tag opener including '<', e.g. '<a'
     * @param string $pmAtributeName attribute including '=', e.g. 'href='
     * @return array|false array('url' => value, 'str' => rest of $pmData
     *                     starting at the attribute), or false when no
     *                     further tag with that attribute exists
     */
    public function fetchLinkfromTag($pmData, $pmTagName, $pmAtributeName)
    {
        if (!is_string($pmData) || $pmData === '' || (string) $pmTagName === '' || (string) $pmAtributeName === '') {
            return false;
        }

        $vPattern = '/' . preg_quote($pmTagName, '/') . '\s[^>]*?(?<![\w\-])('
                  . preg_quote($pmAtributeName, '/')
                  . '\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s>]*)))/is';

        $aMatch = array();
        if (preg_match($vPattern, $pmData, $aMatch, PREG_OFFSET_CAPTURE) !== 1) {
            return false; // no (further) link found: stop the search
        }

        // 'str' starts at the attribute, i.e. behind the tag opener, so it is
        // always shorter than $pmData and repeated calls are guaranteed to end.
        return array(
            "url" => $aMatch[2][0],
            "str" => (string) substr($pmData, $aMatch[1][1]),
        );
    }

    /**
     * Downloads the page behind the URL.
     *
     * @param string $vURL
     * @return string|false page body, or false when the transfer failed
     */
    public function fetchURLPageData($vURL)
    {
        $rCurlRes = curl_init();
        if ($rCurlRes === false) {
            return false;
        }

        curl_setopt($rCurlRes, CURLOPT_URL, $vURL);
        curl_setopt($rCurlRes, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13)');
        curl_setopt($rCurlRes, CURLOPT_REFERER, $this->mDomain);
        curl_setopt($rCurlRes, CURLOPT_AUTOREFERER, true);
        curl_setopt($rCurlRes, CURLOPT_HEADER, 0);         // keep header info out of the response
        curl_setopt($rCurlRes, CURLOPT_RETURNTRANSFER, 1); // return the response data instead of TRUE

        return curl_exec($rCurlRes);
    }

    /**
     * Stores the start URL of the crawl as type 'index', parent 0, level 1.
     *
     * @throws RuntimeException on database errors
     */
    public function ProcessSpiderInit()
    {
        $this->StoreUniqueURL($this->mDomain, $this->mURL, 0, 1, 'index');
    }

    /**
     * Stores the attribute value of every matching tag in the page data.
     * Each URL is stored with parent id 1, level 1 and the tag name as type.
     *
     * @param string|false $pmData      HTML to scan; false/empty stores nothing
     * @param string       $pmTagName   tag opener including '<', e.g. '<a'
     * @param string       $pmAttribute attribute including '=', e.g. 'href='
     * @throws RuntimeException on database errors
     */
    public function processSpecificTagbyType($pmData, $pmTagName, $pmAttribute)
    {
        $vRemaining = $pmData;

        while (($aResult = $this->fetchLinkfromTag($vRemaining, $pmTagName, $pmAttribute)) !== false) {
            if ($aResult['url'] !== '') {
                $this->StoreUniqueURL($this->mDomain, $aResult['url'], 1, 1, $pmTagName);
            }
            // Continue behind the tag that was just handled.
            $vRemaining = $aResult['str'];
        }
    }

    /**
     * Extracts and stores the targets of anchor tags and frame tags.
     *
     * @param string|false $pmData HTML of the page
     * @throws RuntimeException on database errors
     */
    public function processTagInPageData($pmData)
    {
        $this->processSpecificTagbyType($pmData, '<a', "href=");
        $this->processSpecificTagbyType($pmData, '<frame', "src=");
    }

    /**
     * Fetches the start URL and stores every link found on that page.
     *
     * @throws RuntimeException on database errors
     */
    public function fetchURLDataandParseURL()
    {
        $vData = $this->fetchURLPageData($this->mURL);
        $this->processTagInPageData($vData);
    }
}

?>