PHP Classes
Icontem

File: phpWebHacks.php


  Search   All class groups All class groups   Latest entries Latest entries   Top 10 charts Top 10 charts   Newsletter Newsletter   Blog Blog   Forums Forums   Help FAQ Help FAQ  
  Login   Register  
Recommend this page to a friend! ReTweet ReTweet Stumble It! Stumble It! Bookmark in del.icio.us Bookmark in del.icio.us
  Classes of Nashruddin Amin  >  Hacker's HTTP Client  >  phpWebHacks.php  
File: phpWebHacks.php
Role: Class source
Content type: text/plain
Description: HTTP client
Class: Hacker's HTTP Client
Automate retrieval and processing of Web pages
 

Contents

Class file image Download
<?php
/**
 * phpWebHacks.php 1.5
 * This class is a powerful tool for HTTP scripting with PHP.
 * It simulates a web browser, only that you use it with lines of code
 * rather than mouse and keyboard.
 *
 * See the documentation at http://php-http.com/documentation
 * See the examples at http://php-http.com/examples
 *
 * Author  Nashruddin Amin - me@nashruddin.com
 * License GPL
 * Website http://php-http.com
 */

class phpWebHacks 
{ 
	private $_user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0';
	private $_boundary 	 = '----PhPWebhACKs-RoCKs--';
	private $_useproxy 	 = false;
	private $_proxy_host = '';
	private $_proxy_port = '';
	private $_proxy_user = '';
	private $_proxy_pass = '';
	private $_usegzip 	 = false;
	private $_log 		 = false;
	private $_debugdir   = '.log';
	private $_debugnum   = 1;
	private $_delay 	 = 1;
	private $_body 		 = array();
	private $_cookies 	 = array();
	private $_addressbar = '';
	private $_multipart  = false;
	private $_timestart  = 0;
	private $_bytes 	 = 0;

	/**
	 * Constructor
	 */
	public function __construct()
	{
		$this->setDebug(true);

		/* check if zlib is available */
		if (function_exists('gzopen')) {
			$this->_usegzip = true;
		}

		/* start time */
		$this->_timestart = microtime(true);
	} 

	/**
	 * Destructor
	 */
	public function __destruct()
	{
		/* remove temporary file for gzip encoding */
		if (file_exists('tmp.gz')) {
			unlink('tmp.gz');
		}

		/* get elapsed time and transferred bytes */
		$time  = sprintf("%02.1f", microtime(true) - $this->_timestart);
		$bytes = sprintf("%d", ceil($this->_bytes / 1024));

		/* log */
		if ($this->_log) {
			$fp = fopen("$this->_debugdir/headers.txt", 'a');
			fputs($fp, "------ Transferred " . $bytes . "kb in $time sec ------\r\n");
			fclose($fp);
		}
	} 

	/** 
	 * HEAD 
	 */
	public function head($url)
	{ 
		return $this->fetch($url, 'HEAD');
	} 

	/**
	 * GET
	 */
	public function get($url)
	{ 
		return $this->fetch($url, 'GET');
	} 

	/**
	 * POST
	 */
	public function post($url, $form = array(), $files = array())
	{ 
		return $this->fetch($url, 'POST', 10, $form, $files);
	} 

	/**
	 * Make HTTP request
	 */
	protected function fetch($url, $method, $maxredir = 10, $form = array(), $files = array())
	{
		/* convert to absolute if relative URL */
		$url = $this->getAbsUrl($url, $this->_addressbar);

		/* only http or https */
		if (substr($url, 0, 4) != 'http') return '';

		/* cache URL */
		$this->_addressbar = $url;

		/* build request */
		$reqbody = $this->getReqBody($form, $files);
		$reqhead = $this->getReqHead($url, $method, strlen($reqbody), empty($files) ? false : true);

		/* log request */
		if ($this->_log) {
			$this->logHttpStream($url, $reqhead, $reqbody);
		}

		/* parse URL and convert to local variables:
		   $scheme, $host, $path */
		$parts = parse_url($url);
		if (!$parts) { 
			die("Invalid URL!\n");
		} else { 
			foreach($parts as $key=>$val) $$key = $val;
		} 

		/* open connection */
		if ($this->_useproxy) {
			$fp = @fsockopen($this->_proxy_host, $this->_proxy_port);
		} else  {
			$fp = @fsockopen(($scheme=='https' ? "ssl://$host" : $host), $scheme == 'https' ? 443 : 80);
		}

		/* always check */
		if (!$fp) {
			die("Cannot connect to $host!\n");
		}

		/* send request & read response */
		@fputs($fp, $reqhead.$reqbody);
		for($res=''; !feof($fp); $res.=@fgets($fp, 4096)) {} 
		fclose($fp);

		/* set delay between requests. behave! */
		sleep($this->_delay);

		/* transferred bytes */
		$this->_bytes += (strlen($reqhead)+ strlen($reqbody)+ strlen($res));

		/* get response header & body */
		list($reshead, $resbody) = explode("\r\n\r\n", $res, 2);

		/* convert header to associative array */
		$head = $this->parseHead($reshead);

		/* return immediately if HEAD */
		if ($method == 'HEAD') { 
			if ($this->_log) $this->logHttpStream($url, $reshead, null);
			return $head;
		} 

		/* cookies */
		if (!empty($head['Set-Cookie'])) {
			$this->saveCookies($head['Set-Cookie'], $url);
		}
			
		/* referer */
		if ($head['Status']['Code'] == 200) {
			$this->_referer = $url;
		}
			
		/* transfer-encoding: chunked */
		if ($head['Transfer-Encoding'] == 'chunked') {
			$body = $this->joinChunks($resbody);
		} else {
			$body = $resbody;
		} 

		/* content-encoding: gzip */
		if ($head['Content-Encoding'] == 'gzip') {
			@file_put_contents('tmp.gz', $body);
			$fp = @gzopen('tmp.gz', 'r');
			for($body = ''; !@gzeof($fp); $body.=@gzgets($fp, 4096)) {}
			@gzclose($fp);
		} 

		/* log response */
		if ($this->_log) {
			$this->logHttpStream($url, $reshead, $body);
		}

		/* cache body */
		array_unshift($this->_body, $body);

		/* redirects: 302 */
		if (isset($head['Location']) && $maxredir > 0) {
			$this->fetch($this->getAbsUrl($head['Location'], $url), 'GET', $maxredir--);
		}

		/* parse meta tags */
		$meta = $this->parseMetaTags($body);

		/* redirects: <meta http-equiv=refresh...> */
		if (isset($meta['http-equiv']['refresh']) && $maxredir > 0) { 
			list($delay, $loc) = explode(';', $meta['http-equiv']['refresh'], 2);
			$loc = substr(trim($loc), 4);
			if (!empty($loc) && $loc != $url)
				$this->fetch($this->getAbsUrl($loc, $url), 'GET', $maxredir--);
		}

		/* get body and clear cache */
		$body = $this->_body[0];
		for($i = 1; $i < count($this->_body); $i++) {
			unset($this->_body[$i]);
		}

		return $body;
	} 

	/**
	 * Build request header
	 */
	protected function getReqHead($url, $method, $bodylen = 0, $sendfile = true)
	{
		/* parse URL elements to local variables:
		   $scheme, $host, $path, $query, $user, $pass */
		$parts = parse_url($url);
		foreach($parts as $key=>$val) $$key = $val;

		/* setup path */
		$path = empty($path)  ? '/' : $path 
			  .(empty($query) ? ''  : "?$query");
			
		/* request header */	
		if ($this->_useproxy) {
			$head = "$method $url HTTP/1.1\r\nHost: $this->_proxy_host\r\n";
		} else  {
			$head = "$method $path HTTP/1.1\r\nHost: $host\r\n";
		}

		/* cookies */
		$head .= $this->getCookies($url);

		/* content-type */
		if ($method == 'POST' && ($sendfile || $this->_multipart)) {
			$head .= "Content-Type: multipart/form-data; boundary=$this->_boundary\r\n";
		} elseif ($method == 'POST') {
			$head .= "Content-Type: application/x-www-form-urlencoded\r\n";
		}

		/* set the content length if POST */
		if ($method == 'POST') {
			$head .= "Content-Length: $bodylen\r\n";
		}

		/* basic authentication */
		if (!$this->_useproxy && !empty($user) && !empty($pass)) {
			$head .= "Authorization: Basic ". base64_encode("$user:$pass")."\r\n";
		}

		/* basic authentication for proxy */
		if ($this->_useproxy && !empty($this->_proxy_user) && !empty($this->_proxy_pass)) {
			$head .= "Authorization: Basic ". base64_encode("$this->_proxy_user:$this->_proxy_pass")."\r\n";
		}

		/* gzip */
		if ($this->_usegzip) {
			$head .= "Accept-Encoding: gzip\r\n";
		}

		/* make it like real browsers */
		if (!empty($this->_user_agent)) {
			$head .= "User-Agent: $this->_user_agent\r\n";
		}
		if (!empty($this->_referer)) {
			$head .= "Referer: $this->_referer\r\n";
		}

		/* no pipelining yet */
		$head .= "Connection: Close\r\n\r\n";

		/* request header is ready */
		return $head;
	} 

	/**
	 * Build request body
	 */
	protected function getReqBody($form = array(), $files = array())
	{ 
		/* check for parameters */
		if (empty($form) && empty($files)) 
			return '';

		$body = '';
		$tmp  = array();

		/* only form available: x-www-urlencoded */
		if (!empty($form) &&  empty($files) && !$this->_multipart) { 
			foreach($form as $key=>$val)
				$tmp[] = $key .'='. urlencode($val);
			return implode('&', $tmp);
		} 

		/* form */
		foreach($form as $key=>$val) {
			$body .= "--$this->_boundary\r\nContent-Disposition: form-data; name=\"" . $key ."\"\r\n\r\n" . $val ."\r\n";
		}

		/* files */
		foreach($files as $key=>$val) { 
			if (!file_exists($val)) continue;
			$body .= "--$this->_boundary\r\n"
				   . "Content-Disposition: form-data; name=\"" . $key . "\"; filename=\"" . basename($val) . "\"\r\n"
				   . "Content-Type: " . $this->getMimeType($val) . "\r\n\r\n"
				   . file_get_contents($val) . "\r\n";
		} 

		/* request body is ready! */
		return $body."--$this->_boundary--";
	} 

	/**
	 * convert response header to associative array
	 */
	protected function parseHead($str)
	{
		$lines = explode("\r\n", $str);

		list($ver, $code, $msg) = explode(' ', array_shift($lines), 3);
		$stat = array('Version' => $ver, 'Code' => $code, 'Message' => $msg);

		$head = array('Status' => $stat);

		foreach($lines as $line) { 
			list($key, $val) = explode(':', $line, 2);
			if ($key == 'Set-Cookie') {
				$head['Set-Cookie'][] = trim($val);
			} else {
				$head[$key] = trim($val);
			}
		} 

		return $head;
	} 

	/**
	 * Read chunked pages
	 */
	protected function joinChunks($str)
	{
		$CRLF = "\r\n";
		for($tmp = $str, $res = ''; !empty($tmp); $tmp = trim($tmp)) { 
			if (($pos = strpos($tmp, $CRLF)) === false) return $str;
			$len = hexdec(substr($tmp, 0, $pos));
			$res.= substr($tmp, $pos + strlen($CRLF), $len);
			$tmp = substr($tmp, $pos + strlen($CRLF) + $len);
		} 
		return $res;
	} 

	/**
	 * Save cookies from server
	 */
	protected function saveCookies($set_cookies, $url) 
	{ 
		foreach($set_cookies as $str) 
		{
			$parts = explode(';', $str);

			/* extract cookie parts to local variables:
			   $name, $value, $domain, $path, $expires, $secure, $httponly */
			foreach($parts as $part) { 
				list($key, $val) = explode('=', trim($part), 2);

				$k = strtolower($key);

				if ($k == 'secure' || $k == 'httponly') {
					$$k = true;
				} elseif ($k == 'domain' || $k == 'path' || $k == 'expires') {
					$$k = $val;
				} else {
					$name  = $key;
					$value = $val;
				}
			} 

			/* cookie's domain */
			if (empty($domain)) {
				$domain = parse_url($url, PHP_URL_HOST);
			}

			/* cookie's path */	
			if (empty($path)) {
				$path = parse_url($url, PHP_URL_PATH);
				$path = preg_replace('#/[^/]*$#', '', $path);
				$path = empty($path) ? '/' : $path;
			} 

			/* cookie's expire time */
			if (!empty($expires)) {
				$expires = strtotime($expires);
			}
				
			/* setup cookie ID, a simple trick to add/update existing cookie
			   and cleanup local variables later */
			$id = md5("$domain;$path;$name");

			/* add/update cookie */
			$this->_cookies[$id] = array(
				'domain'   => substr_count($domain, '.') == 1 ? ".$domain" : $domain, 
				'path'     => $path, 
				'expires'  => $expires, 
				'name'     => $name, 
				'value'    => $value, 
				'secure'   => $secure, 
				'httponly' => $httponly
			);

			/* cleanup local variables */
			foreach($this->_cookies[$id] as $key=>$val) unset($$key);
		} 

		return true;
	} 

	/**
	 * Get cookies for URL
	 */
	protected function getCookies($url)
	{
		$tmp = array();
		$res = array();

		/* remove expired cookies first */
		foreach($this->_cookies as $id=>$cookie) {
			if (empty($cookie['expires']) || $cookie['expires'] >= time()) {
				$tmp[$id] = $cookie;
			}
		}

		/* cookies ready */
		$this->_cookies = $tmp;

		/* parse URL to local variables:
		   $scheme, $host, $path, $query */
		$parts = parse_url($url);
		foreach($parts as $key=>$val) $$key = $val;

		if (empty($path)) $path = '/';

		/* get all cookies for this domain and path */
		foreach($this->_cookies as $cookie) {
			$d = substr($host, -1 * strlen($cookie['domain']));
			$p = substr($path, 0, strlen($cookie['path']));
			
			if (($d == $cookie['domain'] || ".$d" == $cookie['domain']) && $p == $cookie['path']) { 
				if ($cookie['secure'] == true  && $scheme == 'http') {
					continue;
				}
				$res[] = $cookie['name'].'='.$cookie['value'];
			}
		} 

		/* return the string for HTTP header */
		return (empty($res) ? '' : 'Cookie: '.implode('; ', $res)."\r\n");
	} 

	/**
	 * Convert relative URL to absolute URL
	 */
	protected function getAbsUrl($loc, $parent)
	{ 
		/* parameters is required */
		if (empty($loc) && empty($parent)) return;

		$loc = str_replace('&amp;', '&', $loc);

		/* return if URL is abolute */
		if (parse_url($loc, PHP_URL_SCHEME) != '') return $loc;

		/* handle anchors and query's part */
		$c = substr($loc, 0, 1);
		if ($c == '#' || $c == '&') return "$parent$loc";

		/* handle query string */
		if ($c == '?') {
			$pos = strpos($parent, '?');
			if ($pos !== false) $parent = substr($parent, 0, $pos);
			return "$parent$loc";
		}

		/* parse URL and convert to local variables:
		   $scheme, $host, $path */
		$parts = parse_url($parent);
		foreach ($parts as $key=>$val) $$key = $val;

		/* remove non-directory part from path */
		$path = preg_replace('#/[^/]*$#', '', $path);

		/* set path to '/' if empty */
		$path = preg_match('#^/#', $loc) ? '/' : $path;

		/* dirty absolute URL */
		$abs = "$host$path/$loc";

		/* replace '//', '/./', '/foo/../' with '/' */
		while($abs = preg_replace(array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#'), '/', $abs, -1, $count)) 
			if (!$count) break;

		/* absolute URL */
		return "$scheme://$abs";
	} 

	/**
	 * Convert meta tags to associative array
	 */
	protected function parseMetaTags($html) 
	{ 
		/* extract to </head> */
		if (($pos = strpos(strtolower($html), '</head>')) === false) { 
			return array();
		} else {
			$head = substr($html, 0, $pos);
		} 

		/* get page's title */
		preg_match("/<title>(.+)<\/title>/siU", $head, $m);
		$meta = array('title' => $m[1]);

		/* get all <meta...> */
		preg_match_all('/<meta\s+[^>]*name\s*=\s*[\'"][^>]+>/siU', $head, $m);
		foreach($m[0] as $row) { 
			preg_match('/name\s*=\s*[\'"](.+)[\'"]/siU', $row, $key);
			preg_match('/content\s*=\s *[\'"](.+)[\'"]/siU', $row, $val);
			if (!empty($key[1]) && !empty($val[1]))
				$meta[$key[1]] = $val[1];
		} 

		/* get <meta http-equiv=refresh...> */
		preg_match('/<meta[^>]+http-equiv\s*=\s*[\'"]?refresh[\'"]?[^>]+content\s*=\s*[\'"](.+)[\'"][^>]*>/siU', $head, $m);
		if (!empty($m[1])) {
			$meta['http-equiv']['refresh'] = preg_replace('/&#0?39;/', '', $m[1]);
		} 
		return $meta;
	} 

	/**
	 * Convert form to associative array
	 */
	public function parseForm($name_or_id, $action = '', $str = '')
	{ 
		if (empty($str) && empty($this->_body[0])) 
			return array();
			
		$body = empty($str) ? $this->_body[0] : $str;

		/* extract the form */
		$re = '(<form[^>]+(id|name)\s*=\s*(?(?=[\'"])[\'"]'.$name_or_id.'[\'"]|\b'.$name_or_id.'\b)[^>]*>.+<\/form>)';
		if (!preg_match("/$re/siU", $body, $form)) 
			return array();

		/* check if enctype=multipart/form-data */
		if (preg_match('/<form[^>]+enctype[^>]+multipart\/form-data[^>]*>/siU', $form[1], $a))
			$this->_multipart = true;
		else 
			$this->_multipart = false;
			
		/* get form's action */
		preg_match('/<form[^>]+action\s*=\s*(?(?=[\'"])[\'"]([^\'"]+)[\'"]|([^>\s]+))[^>]*>/si', $form[1], $a);
		$action = empty($a[1]) ? html_entity_decode($a[2]) : html_entity_decode($a[1]);

		/* select all <select..> with default values */
		$re = '<select[^>]+name\s*=\s*(?(?=[\'"])[\'"]([^>]+)[\'"]|\b([^>]+)\b)[^>]*>'
			. '.+value\s*=\s*(?(?=[\'"])[\'"]([^>]+)[\'"]|\b([^>]+)\b)[^>]+\bselected\b'
			. '.+<\/select>';
		preg_match_all("/$re/siU", $form[1], $a);

		foreach($a[1] as $num=>$key) {
			$val = $a[3][$num];
			if ($val == '') $val = $a[4][$num];
			if ($key == '') $key = $a[2][$num];
			$res[$key] = html_entity_decode($val);
		} 

		/* get all <input...> */
		preg_match_all('/<input([^>]+)\/?>/siU', $form[1], $a);

		/* convert to associative array */
		foreach($a[1] as $b) { 
			preg_match_all('/([a-z]+)\s*=\s*(?(?=[\'"])[\'"]([^"]+)[\'"]|\b(.+)\b)/siU', trim($b), $c);
			
			$element = array();
			
			foreach($c[1] as $num=>$key) {
				$val = $c[2][$num];
				if ($val == '') $val = $c[3][$num];
				$element[$key] = $val;
			}
			
			$type = strtolower($element['type']);
			
			/* only radio or checkbox with default values */
			if ($type == 'radio' || $type == 'checkbox') 
				if (!preg_match('/\s+\bchecked\b/', $b)) continue;
				
			/* remove buttons and file */	
			if ($type == 'file' || $type == 'submit' || $type == 'reset' || $type == 'button') 
				continue;
			
			/* remove unnamed elements */
			if ($element['name'] == '' && $element['id'] == '') 
				continue;
			
			/* cool */
			$key = $element['name'] == '' ? $element['id'] : $element['name'];
			$res[$key] = html_entity_decode($element['value']);
		} 

		return $res;
	} 

	/**
	 * Get mime type for a file
	 */
	protected function getMimeType($filename)
	{ 
		/* list of mime type. add more rows to suit your need */
		$mimetypes = array(
			'jpg'  => 'image/jpeg',
			'jpe'  => 'image/jpeg',
			'jpeg' => 'image/jpeg',
			'gif'  => 'image/gif',
			'png'  => 'image/png',
			'tiff' => 'image/tiff',
			'html' => 'text/html',
			'txt'  => 'text/plain',
			'pdf'  => 'application/pdf',
			'zip'  => 'application/zip'
		);

		/* get file extension */
		preg_match('#\.([^\.]+)$#', strtolower($filename), $e);

		/* get mime type */
		foreach($mimetypes as $ext=>$mime)
			if ($e[1] == $ext) return $mime;

		/* this is the default mime type */
		return 'application/octet-stream';
	} 

	/**
	 * Log HTTP request/response
	 */
	protected function logHttpStream($url, $head, $body)
	{ 
		/* open log file */
		if (($fp = @fopen("$this->_debugdir/headers.txt", 'a')) == false) return;

		/* get method */
		$m = substr($head, 0, 4);

		/* append the requested URL for HEAD, GET and POST */
		if ($m == 'HEAD' || $m == 'GET ' || $m == 'POST')
			$head = str_repeat('-', 90) . "\r\n$url\r\n\r\n" . trim($head);

		/* header */
		@fputs($fp, trim($head)."\r\n\r\n");

		/* request body */
		if ($m == 'POST' &&  strpos($head, 'Content-Length: ') !== false) {
			/* skip binary contents */
			$find = 'Content-Type: \s*([^\s]+)\r\n\r\n(.+)\r\n';
			$repl = "Content-Type: $1\r\n\r\n <... File contents ...>\r\n";
			$body = preg_replace('/'.$find .'/siU', $repl, $body);

			@fputs($fp, "$body\r\n\r\n");
		} 

		/* response body */
		if (substr($head, 0, 7) == 'HTTP/1.' && strpos($head, 'text/html') !== false && !empty($body)) {
			$tmp = "$this->_debugdir/" . $this->_debugnum++ . '.html';
			@file_put_contents($tmp, $body);
			@fputs($fp, "<... See page contents in $tmp ...>\r\n\r\n");
		}

		@fclose($fp);
	} 

	public function setDebug($bool)
	{
		$this->_log = $bool;

		if (!$this->_log) return;

		/* create directory */
		if (!is_dir($this->_debugdir)) { 
			mkdir($this->_debugdir);
			chmod($this->_debugdir, 0644);
		}

		/* empty debug directory */
		$items = scandir($this->_debugdir);
		foreach($items as $item) { 
			if ($item == '.' || $item == '..') continue;
			unlink("$this->_debugdir/$item");
		}
	} 

	/**
	 * Set proxy
	 */
	public function setProxy($host, $port, $user = '', $pass = '')
	{
		$this->_proxy_host = $host;
		$this->_proxy_port = $port;
		$this->_proxy_user = $user;
		$this->_proxy_pass = $pass;
		$this->_useproxy   = true;
	} 

	/**
	 * Set delay between requests
	 */
	public function setInterval($sec)
	{ 
		if (!preg_match('/^\d+$/', $sec) || $sec <= 0) {
			$this->_delay = 1;
		} else { 
			$this->_delay = $sec;
		}
	} 

	/**
	 * Assign a name for this HTTP client
	 */
	public function setUserAgent($ua)
	{
		$this->_user_agent = $ua;
	}
}


 
  Advertise on this site Advertise on this site   Site map Site map   Statistics Statistics   Site tips Site tips   Privacy policy Privacy policy   Contact Contact  

For more information send a message to :
info at phpclasses dot org.
Copyright (c) Icontem 1999-2009 PHP Classes - PHP Class Scripts
  PHP Book Reviews - Reviews of books and other products