Login   Register  
PHP Classes
elePHPant
Icontem

File: example.php

Recommend this page to a friend!
Stumble It! Stumble It! Bookmark in del.icio.us Bookmark in del.icio.us
  Classes of Trev Tune  >  PHP Search Engine Crawler  >  example.php  >  Download  
File: example.php
Role: Example script
Content type: text/plain
Description: A simple example on how to crawl several pages for links.
Class: PHP Search Engine Crawler
Crawl pages and extract links, images and metadata
Author: By Trev Tune
Last change: Fixed bugs and added a new parameter (singledomain)
Date: 9 months ago
Size: 1,667 bytes
 

Contents

Class file image Download
<!Doctype html>
<html>
<head>
<title>My crawler</title>
</head>
<body>

<?php 

//error_reporting(-1);

/*Example link crawler
*@package :Simple crawler
*author : Trev Tune
*/

include 'crawler.class.php';

 
/* @seenlinks
*
* A multidimensional array of all crawled links, keyed first by domain
* and then by link.
*
* e.g. print_r($seenlinks) may produce
* array(
*   [domain] => array( [link1] => link1 )
* )
*/


// Read the start URL from the query string. A missing or non-string "url"
// parameter is treated as empty instead of raising a notice / type error.
$url = (isset($_GET['url']) && is_string($_GET['url'])) ? trim($_GET['url']) : '';

// Filled in by crawl(): $seenlinks[domain][link] = link.
$seenlinks = array();

// Only start crawling when there is actually something to crawl; with an
// empty URL $seenlinks simply stays empty.
if ($url !== '') {
    crawl($url, 2);
}


/**
 * Recursively crawl a page and record every visited URL in the global
 * $seenlinks array as $seenlinks[host][url] = url.
 *
 * @param string $url          Page to crawl.
 * @param int    $depth        Number of levels still to follow; the recursion
 *                             stops when it reaches 0 (or below).
 * @param bool   $singledomain When true, links whose host differs from the
 *                             host of $url are skipped.
 * @return void
 */
function crawl($url, $depth = 2, $singledomain = false)
{
    global $seenlinks;

    // Have we crawled to the specified depth? "<= 0" also stops a negative
    // depth, which would otherwise never hit the terminating condition.
    if ($depth <= 0) {
        return;
    }

    $domain = host($url);

    // Have we already crawled this url?
    if (isset($seenlinks[$domain][$url])) {
        return;
    }

    // Note: "new" always yields an object, so no falsy check is needed here.
    $crawler = new crawler($url);

    // Record the page before descending so that cyclic links terminate.
    $seenlinks[$domain][$url] = $url;

    $links = $crawler->getLinks();
    if (!is_array($links) && !($links instanceof Traversable)) {
        return;
    }

    // The link itself is the key of each entry returned by getLinks().
    foreach ($links as $link => $a) {
        // Does the user want to crawl only a specific domain? Skip just this
        // link (not the rest of the list) when it points somewhere else.
        if ($singledomain && $domain != host($link)) {
            continue;
        }

        crawl($link, $depth - 1, $singledomain);
    }
}

/**
 * Extract the host name of a URL, without a leading "www." prefix.
 *
 * @param string $url URL to inspect.
 * @return string Host name, or an empty string when the URL has no host
 *                component or cannot be parsed.
 */
function host($url)
{
    $host = parse_url($url, PHP_URL_HOST);

    // parse_url() gives null when there is no host and false for seriously
    // malformed URLs; normalise both to an empty string.
    if (!is_string($host)) {
        return '';
    }

    // Strip only a leading "www." (case-insensitively). Replacing it anywhere
    // in the name would turn e.g. "mywww.example.com" into "myexample.com".
    if (strncasecmp($host, 'www.', 4) === 0) {
        $host = (string) substr($host, 4);
    }

    return $host;
}

// Nothing was recorded by the crawl: report it and stop. $url comes from the
// request, so it is escaped before being written into the page.
if (count($seenlinks) == 0) {
    die("No links found for " . htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'));
}

echo "

<div class='crawler'>
Domains/subdomains found = " . count($seenlinks) . "<br/>";

// One section per domain, listing every link recorded for it.
foreach ($seenlinks as $domain => $links) {
    // Domains and links originate from crawled pages, i.e. untrusted input:
    // escape them for the HTML context.
    $safeDomain = htmlspecialchars((string) $domain, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

    echo "<br/> Domain " . $safeDomain . " has " . count($links) . " links <br>";

    foreach ($links as $link) {
        $safeLink = htmlspecialchars((string) $link, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

        echo " <br/>  " . $safeLink . " <br/>";

        // Separator after each link.
        echo "<hr/>";
    }

    // Separator after each domain.
    echo "<hr/>";
}

?>

</div>
</body>
</html>