PHP Classes
elePHPant
Icontem

File: ParseHtml.php

Recommend this page to a friend!
  Classes of Grigori Kochanov  >  PHP server-side web browser  >  ParseHtml.php  >  Download  
File: ParseHtml.php
Role: Class source
Content type: text/plain
Description: class that makes changes in HTML
Class: PHP server-side web browser
Browsing the Web via PHP
Author: By
Last change:
Date: 13 years ago
Size: 6,905 bytes
 

Contents

Class file image Download
<?php
/***********
* This script takes HTML retrieved from the remote server and rewrites all links
* so that they point back to the script itself, with the original URL passed as a parameter
*
*/
/**
 * Rewrites the links of an HTML page so that they lead back to this script.
 *
 * Every tag listed in $tag_attributes has its URL attribute replaced by
 * "<script url>?php_browser_uri=<url-encoded absolute target>". Relative
 * targets are resolved against the URI object of the page being rewritten.
 * The rewritten page is available in $html once the object is constructed.
 */
class ParseHtml
{
    /** Tag modifications; not referenced by any method of this class. */
    public $tag_modifications = array();

    /** Page as received from the web server (unmodified). */
    public $source_html;

    /** Rewritten HTML to send to the user agent. */
    public $html = "";

    /**
     * Object with the parsed URL of the page. The methods below read its
     * scheme, user, pass, host, port_string and path properties.
     */
    public $URI;

    /** URL of the running script, built from the host and SCRIPT_NAME. */
    public $script_url = "";

    /** Map of upper-case tag name => upper-case attribute holding the URL. */
    public $tag_attributes = array(
        "A"    => "HREF",
        "LINK" => "HREF",
        "IMG"  => "SRC",
        "FORM" => "ACTION",
    );

    /**
     * Parses $html and stores the rewritten page in $this->html.
     *
     * @param string $html    page source received from the remote server
     * @param object $URI_obj parsed URL of that page (see $URI)
     */
    public function __construct($html, $URI_obj)
    {
        $this->source_html = $html;
        $this->URI = $URI_obj;
        $this->script_url = $this->script_base_url('SCRIPT_NAME');
        $this->html = $this->rewrite_html((string) $html);
    }

    /**
     * PHP 4-style constructor name, kept so that existing explicit calls
     * ($obj->ParseHtml(...), parent::ParseHtml(...)) keep working. PHP 8 no
     * longer treats a method named after the class as the constructor, which
     * is why the real work lives in __construct().
     */
    public function ParseHtml($html, $URI_obj)
    {
        $this->__construct($html, $URI_obj);
    }

    /**
     * Tells whether "<..." starts an opening tag, i.e. is not "</...".
     *
     * @param string $tag tag text including the leading "<"
     * @return bool
     */
    public function is_opening_tag($tag)
    {
        return !preg_match("~^< */~", $tag);
    }

    /**
     * Rewrites the URL attribute of a single tag.
     *
     * @param string $tag_content tag text without the surrounding "<" and ">"
     * @return string the tag text, rewritten when the tag is listed in
     *                $tag_attributes and carries a rewritable URL, otherwise
     *                returned unchanged
     */
    public function change_tag($tag_content)
    {
        // The tag name ends at the first whitespace of any kind (space, tab,
        // newline); a tag without whitespace has no attributes to rewrite.
        if (!preg_match('~^(\S+)\s~', $tag_content, $name_match)) {
            return $tag_content;
        }
        $tag_name = strtoupper($name_match[1]);
        if (!isset($this->tag_attributes[$tag_name])) {
            return $tag_content;
        }
        $attr_name = $this->tag_attributes[$tag_name];

        // Group 1: everything up to the attribute (lazy, so the FIRST
        //          occurrence wins; the trailing \s keeps "data-href" from
        //          matching "href").
        // Group 2: the value - double-quoted, single-quoted (closing quote
        //          optional, as before) or unquoted. (?| ... ) is a branch
        //          reset, so all three alternatives capture into group 2.
        // Group 3: the rest of the tag, kept verbatim including whitespace.
        // Limitation: the attribute name is not recognised as being inside
        // another attribute's quoted value.
        $pattern = '~^(.*?\s)' . preg_quote($attr_name, '~')
            . '\s*=\s*(?|"([^"]*)"?|\'([^\']*)\'?|(\S+))(.*)~is';
        if (!preg_match($pattern, $tag_content, $found)) {
            return $tag_content;
        }
        $tag_part_before_attribute = $found[1];
        $tag_part_after_attribute = $found[3];

        // Attribute values are HTML-encoded in the page ("&amp;" means "&").
        $attr_value = trim(htmlspecialchars_decode($found[2], ENT_QUOTES));

        // mailto:, javascript:, data:, in-page "#anchor" links and the like
        // cannot be fetched through the script - leave such tags alone.
        if (!$this->is_rewritable_url($attr_value)) {
            return $tag_content;
        }

        // Convert a relative address to an absolute one, then point the
        // attribute at the script with the original URL as its parameter.
        $new_attr_value = $this->convert_relative_link($attr_value);
        $new_attr_value = $this->link_url_to_script($new_attr_value);

        // Escape by plain byte replacement (independent of the page charset)
        // so that nothing from the remote page can break out of the quotes.
        $new_attr_value = str_replace(
            array('&', '"', '<', '>'),
            array('&amp;', '&quot;', '&lt;', '&gt;'),
            $new_attr_value
        );

        return $tag_part_before_attribute
            . $attr_name . '="' . $new_attr_value . '"'
            . $tag_part_after_attribute;
    }

    /**
     * Converts a URL found in the page into an absolute URL.
     *
     * URLs that already carry a scheme are returned unchanged. Everything
     * else is resolved against $this->URI: "//host/..." inherits the scheme,
     * "/path" replaces the whole path, and other paths are appended to the
     * directory of the current page. "." and ".." segments are resolved and
     * back/repeated slashes are normalised in the path only; the query string
     * and fragment are passed through untouched.
     *
     * @param string $relative_url URL as written in the page (entity-decoded)
     * @return string absolute URL
     */
    public function convert_relative_link($relative_url)
    {
        $relative_url = (string) $relative_url;

        // Anything starting with "scheme:" is already absolute.
        if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $relative_url)) {
            return $relative_url;
        }

        // Protocol-relative URL: reuse the scheme name of the current page.
        // Assumes $this->URI->scheme starts with the scheme name (e.g.
        // "http://"); falls back to "http" when that cannot be read.
        if (strncmp($relative_url, '//', 2) === 0) {
            $scheme_name = 'http';
            if (preg_match('~^[a-z][a-z0-9+.\-]*~i', (string) $this->URI->scheme, $scheme_match)) {
                $scheme_name = $scheme_match[0];
            }
            return $scheme_name . ':' . $relative_url;
        }

        // Split "path" from "?query#fragment" so that only the path is
        // normalised.
        $path_length = strcspn($relative_url, '?#');
        $path_part = strtr((string) substr($relative_url, 0, $path_length), '\\', '/');
        $suffix = (string) substr($relative_url, $path_length);

        $base_path = strtr((string) $this->URI->path, '\\', '/');
        if ($path_part === '') {
            // Query-only, fragment-only or empty reference: same document.
            $new_path = $base_path;
        } elseif ($path_part[0] === '/') {
            // Root-relative reference replaces the whole path.
            $new_path = $path_part;
        } else {
            // Everything up to and including the last "/" is the directory;
            // unlike dirname() this keeps "/dir/" for a base path "/dir/".
            $last_slash = strrpos($base_path, '/');
            $directory = ($last_slash === false) ? '/' : substr($base_path, 0, $last_slash + 1);
            $new_path = $directory . $path_part;
        }
        if ($new_path === '' || $new_path[0] !== '/') {
            $new_path = '/' . $new_path;
        }

        // Replace repeated slashes with a single one, then resolve dots.
        $new_path = preg_replace('~/{2,}~', '/', $new_path);
        $new_path = $this->remove_dot_segments($new_path);

        return $this->URI->scheme . $this->URI->user . $this->URI->pass
            . $this->URI->host . $this->URI->port_string
            . $new_path . $suffix;
    }

    /**
     * Wraps a URL so that it is passed to this script as a parameter.
     *
     * The fragment (if any) is not sent as part of the parameter; it is
     * appended to the resulting URL so the browser still jumps to the anchor.
     *
     * @param string $url absolute URL of the target
     * @return string "<script url>?php_browser_uri=<encoded url>[#fragment]"
     */
    public function link_url_to_script($url)
    {
        $url = (string) $url;
        $script_url = $this->script_base_url('PHP_SELF');

        // Detect the fragment; "!== false" so a "#" at position 0 counts too.
        $fragment = "";
        $fragment_pos = strpos($url, "#");
        if ($fragment_pos !== false) {
            $fragment = substr($url, $fragment_pos);
            $url = (string) substr($url, 0, $fragment_pos);
        }

        // Encode the target URI to be passed as a parameter.
        return $script_url . "?php_browser_uri=" . urlencode($url) . $fragment;
    }

    /**
     * Walks through the page, copies text verbatim and passes every opening
     * tag through change_tag().
     *
     * As in the original implementation, tag content is trimmed, and a final
     * "<" without a matching ">" is treated as the last tag and closed.
     *
     * @param string $html page source
     * @return string rewritten page
     */
    protected function rewrite_html($html)
    {
        $result = '';
        $offset = 0; // position of the parser
        $length = strlen($html);

        while ($offset < $length) {
            $tag_open_pos = strpos($html, '<', $offset);
            if ($tag_open_pos === false) {
                // No more tags - add the rest of the contents.
                $result .= substr($html, $offset);
                break;
            }
            // Text between the previous tag and this one.
            $result .= substr($html, $offset, $tag_open_pos - $offset);

            $tag_close_pos = strpos($html, '>', $tag_open_pos + 1);
            if ($tag_close_pos === false) {
                // Unterminated tag: process whatever is left and stop.
                $tag_content = trim((string) substr($html, $tag_open_pos + 1));
                $result .= '<' . $this->rewrite_tag($tag_content) . '>';
                break;
            }

            $tag_content = trim(
                substr($html, $tag_open_pos + 1, $tag_close_pos - $tag_open_pos - 1)
            );
            $result .= '<' . $this->rewrite_tag($tag_content) . '>';
            // Move the parser behind the end of the tag.
            $offset = $tag_close_pos + 1;
        }

        return $result;
    }

    /** Applies change_tag() to opening tags; closing tags pass through. */
    protected function rewrite_tag($tag_content)
    {
        if ($this->is_opening_tag("<$tag_content")) {
            return $this->change_tag($tag_content);
        }
        return $tag_content;
    }

    /**
     * Tells whether a URL can be fetched through the script: fragment-only
     * links and schemes other than http/https are left for the browser.
     */
    protected function is_rewritable_url($url)
    {
        if ($url !== '' && $url[0] === '#') {
            return false;
        }
        if (preg_match('~^([a-z][a-z0-9+.\-]*):~i', $url, $scheme_match)) {
            return in_array(strtolower($scheme_match[1]), array('http', 'https'), true);
        }
        return true;
    }

    /**
     * Resolves "." and ".." segments of an absolute path. ".." never climbs
     * above the root, and a trailing "." or ".." leaves a trailing slash.
     */
    protected function remove_dot_segments($path)
    {
        $segments = explode('/', $path);
        $last_segment = end($segments);
        $resolved = array();

        foreach ($segments as $segment) {
            if ($segment === '.') {
                continue;
            }
            if ($segment === '..') {
                // Element 0 is the empty string before the leading slash;
                // never pop it.
                if (count($resolved) > 1) {
                    array_pop($resolved);
                }
                continue;
            }
            $resolved[] = $segment;
        }
        if ($last_segment === '.' || $last_segment === '..') {
            $resolved[] = '';
        }

        return implode('/', $resolved);
    }

    /**
     * Builds "<scheme>://<host><path>" for the running script from $_SERVER.
     * Missing entries (e.g. when run from the command line) become empty
     * strings instead of raising notices.
     *
     * @param string $path_key $_SERVER key holding the script path
     */
    protected function script_base_url($path_key)
    {
        $https = isset($_SERVER['HTTPS']) ? strtolower((string) $_SERVER['HTTPS']) : '';
        $scheme = ($https !== '' && $https !== 'off') ? 'https' : 'http';
        $host = isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '';
        $path = isset($_SERVER[$path_key]) ? $_SERVER[$path_key] : '';

        return $scheme . '://' . $host . $path;
    }
//end of the class
}
?>