PHP Classes

File: spider_simple.class.php

Recommend this page to a friend!
  Classes of vivek   Web Crawler using MySQL DB   spider_simple.class.php   Download  
File: spider_simple.class.php
Role: Class source
Content type: text/plain
Description: DBCrawler
Class: Web Crawler using MySQL DB
Retrieve Web pages and store links in a database
Author: By
Last change: enhancement for class
Date: 15 years ago
Size: 5,269 bytes
 

Contents

Class file image Download
<?

/**
* About author:
* vivekanandan
* email: vivekanandan8@gmail.com
*
* If you want to any help on spider ot any thing in php just mail me
*
* About class:
* WebSpider - constructor set teh domain & url to map it
*
* processTagInPageData() - it process the anchor tag & frame tag as googlebot does
* fetchURLPageData() - it returns the html page content for the given URL
* isURLExists() - it checks wheather the given url is added in DB
* displayDomainRecords - it displays teh records from DB
* StoreUniqueURL - it stores the unique url in DB
* processSpecificTagbyType- it parse each & every tag & truncate the parsed tag from the string
*/



ini_set("display_errors",1);


/* table structure

CREATE TABLE `spider` (
  `id` bigint(20) NOT NULL auto_increment,
  `domain` varchar(150) NOT NULL,
  `url` varchar(2000) NOT NULL,
  `parentid` int(11) NOT NULL,
  `visitflag` int(11) NOT NULL,
  `type` varchar(20) NOT NULL,
  `level` mediumint(9) NOT NULL,
  PRIMARY KEY (`id`)
)
*/


class WebSpider {

var
$mMaxDepth;
var
$mDomain;
var
$mDBHost;
var
$mDBUserName;
var
$mDBPassword;
var
$mDBDatabase;
var
$mURLPageData;
var
$mURL;


    function
WebSpider($pmDomain, $pmDepth,$pmURL) {
   
       
$this->mDomain = $pmDomain;
       
$this->mMaxDepth = $pmDepth;
       
$this->mURL = $pmURL;
                   
    }
   
    function
isURLExists($pmDomain, $pmURL) {

       
mysql_connect($this->mDBHost,$this->mDBUserName,$this->mDBPassword);
       
mysql_select_db($this->mDBDatabase);
       
       
$vSQL = "SELECT count( id ) AS cnt FROM spider
                    WHERE domain = '
$pmDomain' and url = '$pmURL'";
       
$rs = mysql_query($vSQL);
       
$oRecord = mysql_fetch_assoc($rs);
        return
$oRecord['cnt'];
       
    }

    function
displayDomainRecords($pmDomain){
   
          
mysql_connect($this->mDBHost,$this->mDBUserName,$this->mDBPassword);
       
mysql_select_db($this->mDBDatabase);
       
       
$vSQL = "select count(id) as cnt from spider where domain = '$pmDomain' ";
       
$rsURLList = mysql_query($vSQL);
       
$vCnt = mysql_fetch_assoc($rsURLList);
       
           
       
$vSQL = "select * from spider where domain = '$pmDomain' order by id asc ";
       
$rsURLList = mysql_query($vSQL);
        print
"<strong>Domain</strong> : ".$pmDomain ." <strong>Total URL</strong> :".$vCnt['cnt'];
       
?>
<table width='80%' cellspacing='2' cellpadding='2' border="1">
          <tr>
            <td><strong>URL</strong></td>
            <td><strong>Type</strong></td>
          </tr>
        <?
       
while($aRec = mysql_fetch_assoc($rsURLList)) { ?>
             <tr>
            <td><?php echo $aRec['url'] ?></td>
            <td><?php echo htmlspecialchars($aRec['type'] )?></td>

          </tr>
         <? } ?>
         </table>
       
        <?
       

   
   
}
   
    function
StoreUniqueURL($pmDomain, $pmURL, $pmParentId=0, $pmLevel,$pmType){
   
       
mysql_connect($this->mDBHost,$this->mDBUserName,$this->mDBPassword);
       
mysql_select_db($this->mDBDatabase);
   
       
$pmURL = mysql_real_escape_string($pmURL);
        if(
$this->isURLExists($pmDomain,$pmURL)==0) {
           
$vURLSQL = " INSERT INTO `spider` ( domain, `url` , `parentid` , `visitflag` , `type` , `level` )
                            VALUES ('
$pmDomain' , '$pmURL', '$pmParentId', '0', '$pmType', '$pmLevel' )";
           
mysql_query($vURLSQL);
        }
       
    }


    function
fetchLinkfromTag($pmData, $pmTagName, $pmAtributeName){
   
       
$vPos = strpos($pmData, $pmTagName);
        if(
$vPos === false){
            return
false; // if no link found stop search
       
}
       
$vPos += strlen($vStr);
       
$vSubStr = substr($pmData,$vPos);
       
       
$vHrefPos = strpos($vSubStr, $pmAtributeName);
       
$vSubStr = substr($vSubStr, $vHrefPos);
   
       
       
$url = explode('"',$vSubStr);
        return array(
"url"=>$url[1],"str"=>$vSubStr);
       
    }




    function
fetchURLPageData($vURL) {

       
$rCurlRes = curl_init();
       
curl_setopt($rCurlRes, CURLOPT_URL,$vURL);
       
curl_setopt($rCurlRes, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13)');
       
curl_setopt($rCurlRes, CURLOPT_REFERER, $this->mDomain);
       
curl_setopt($rCurlRes, CURLOPT_AUTOREFERER, true);
       
curl_setopt($rCurlRes, CURLOPT_HEADER, 0); // set to 0 to eliminate header info from response
       
curl_setopt($rCurlRes, CURLOPT_RETURNTRANSFER, 1); // Returns response data instead of TRUE(1
       
$res = curl_exec($rCurlRes);
        return
$res;
    }
   
   function
ProcessSpiderInit() {
        
   
$this->StoreUniqueURL($this->mDomain, $this->mURL, 0,1,'index');
   
   }
    

   function
processSpecificTagbyType($pmData, $pmTagName, $pmAttribute) {
     
    do {
           
$aResult = $this->fetchLinkfromTag($pmData,$pmTagName, $pmAttribute);
           
           
$vURL = $aResult['url'];
           
$pmData = $aResult['str'];
            if(
$pmData) {
               
$this->StoreUniqueURL($this->mDomain, $vURL , 1, 1, $pmTagName);
            }
           
$vIndex++;
           
     }while(
$pmData);
  
   }


  
   function
processTagInPageData($pmData) {
  
         
$this->processSpecificTagbyType($pmData,'<a',"href=");
      
$this->processSpecificTagbyType($pmData,'<frame', "src=");
  
   }
  
  
   function
fetchURLDataandParseURL() {

     
$vData = $this->fetchURLPageData($this->mURL);
        
$this->processTagInPageData($vData);
  }

}

?>