Login   Register  
PHP Classes
elePHPant
Icontem

File: class.htmlparser.php

Recommend this page to a friend!
Stumble It! Stumble It! Bookmark in del.icio.us Bookmark in del.icio.us
  Classes of Lin Zhemin  >  HTMLparser  >  class.htmlparser.php  >  Download  
File: class.htmlparser.php
Role: ???
Content type: text/plain
Description: The class itself
Class: HTMLparser
Author: By
Last change:
Date: 12 years ago
Size: 16,718 bytes
 

Contents

Class file image Download
<?
// Copyright(C)MMI, Lin Zhemin.
// PHP HTML parser class
// Copyright declared according to GPLv2.
//
// GeOu{AhC
// GTp rowspan (vTj)
// G|B DTD, , CSS, PHP, ASP 


$DEBUGLEVEL = 0;            // T
$SHOWINDENT = 4;            // Y


// HU http://www.w3.org/TR/html4/index/elements.html

// HTML n end tag  (bWW End Tag = )
$HTMLbrac = array(
        'a', 'abbr', 'acronym', 'address', 'applet',
        'b', 'bdo', 'big', 'blockquote', 'button',
        'caption', 'center', 'cite', 'code',
        'del', 'dfn', 'dir', 'div', 'dl',
        'em', 'fieldset', 'font', 'form', 'frameset',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'i', 'iframe', 'ins',
        'kbd',
        'label', 'legend',
        'map', 'menu',
        'noframes', 'noscript',
        'object', 'ol', 'optgroup',
        'pre',
        'q',
        's', 'samp', 'script', 'select', 'small', 'span', 'strike', 'strong', '
style', 'sub', 'sup',
        'table', 'textarea', 'title', 'tt',
        'u', 'ul',
        'var',
        'html', 'body', 'head'  // oOvXRI
        );

// HTML ]tO (WGEnd Tag = O)
// @w]tO
$HTMLcont = array_merge($HTMLbrac, array(
            'colgroup', 'dd', 'dt', 'li', 'option', 'p',
            'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
            ));

// HTML L]tO (WGEnd Tag = F, Empty = E)
$HTMLnocn = array(
        'area', 'base', 'basefont', 'br', 'col', 'frame',
        'hr', 'img', 'input', 'isindex', 'link', 'meta',
        'param');

// NH
$HTMLCloseTag = array(
        'p'  => array('table')
        );




function HTMLdebug($l, $s) {
global $DEBUGLEVEL;
if($l > $DEBUGLEVEL)
    return;
echo "DEBUG: $s\n";
}




// Container
class Container {
    var $obj = array();
    var $iden;
    var $cnt = 0;   // obj 

    function push(&$o) {
        HTMLdebug(5, ' Container::push');
        $this->cnt++;
        return array_push($this->obj, $o);
    }
    function pop() {
        return array_pop($this->obj);
    }
    function show($in = 0) {
        HTMLdebug(7, " Container::show($in)");
        $r = "";
        for($i = 0; $i < $this->cnt; $i++) {
            $r .= $i.'C'.$this->obj[$i]->show($in);
        }
        return $r;
    }
    function out() {
        HTMLdebug(7, ' Container::out');
        $r[] = $this;
        for($i = 0; $i < $this->cnt; $i++) {
            $r[] = $this->obj[$i]->out();
        }
        return $r;
    }
    function type() {
        return "Container";
    }
}

class HTMLtxt {
    var $txt;

    function HTMLtxt(&$s) {
        HTMLdebug(5, " HTMLtxt::HTMLtxt");
        HTMLdebug(4, "HTMLtxt = $s");
        $this->txt = trim($s);
    }
    function show($in = 0) {
        HTMLdebug(7, " HTMLtxt::show($in)");
        return 'r'.str_repeat(' ', $in).$this->txt."\n";
    }
    function out() {
        HTMLdebug(7, ' HTMLtxt::out');
        return $this->txt;
    }
    function iden() {
        return "HTMLtxt";
    }
    function type() {
        return "HTMLtxt";
    }
}





// HTML Tag 
class HTMLattr {
    var $element;
    var $property;

    function HTMLattr($e, $p = "") {
        HTMLdebug(5, " HTMLattr::HTMLattr");
        $this->element = trim($e);
        $this->property = trim($p);
    }
    function show($in = 0) {
        HTMLdebug(7, " HTMLattr::show($in)");
        return ''.str_repeat(' ', $in).$this->element.' = '.$this->property
."\n";
    }
    function out() {        HTMLdebug(7, ' HTMLattr::out');
        return $this;
    }
    function type() {
        return "HTMLattr";
    }
    function iden() {
        return "HTMLattr";
    }
}






// HTML Tag  class
class HTMLtag {
    var $tag;    function HTMLtag($t) {
        HTMLdebug(5, " HTMLtag::HTMLtag");
        if(!preg_match('/(\/?\w+)(.*)/', $t, $a)) {
            die('_ tag');
        }
        $this->tag = strtolower($a[1]);
        if(count($a) > 2) {
            preg_match_all('/([\w-]+)(=\s*"?([^"]*)"?)?/', $a[2], $b);
                for($i = 0; $i < count($b[0]); $i++) {
                    $attr = new HTMLattr($b[1][$i], $b[3][$i]);
                    $this->push($attr);
                }
        }
    }
    function push(&$attr) {
        HTMLdebug(5, ' HTMLtag::push');
        $this->cnt++;
        array_push($this->attr, $attr);
    }            if($this->attr[$i]->element == $a) {
                return $this->attr[$i]->property;
            }
        }
        return NULL;
    }
    function show($in = 0) {
        global $SHOWINDENT;
        HTMLdebug(7, " HTMLtag::show($in)");
        $r = ''.str_repeat(' ', $in).'['.$this->tag."]\n";
        for($i = 0; $i < $this->cnt; $i++) {
            $r .= $i.'T'.$this->attr[$i]->show($in + $SHOWINDENT);
        }
        return $r;
    }
    function out() {
        HTMLdebug(7, ' HTMLtag::out');
        $r[] = $this;
        for($i = 0; $i < $this->cnt; $i++) {
            $r[] = $this->attr[$i]->out();
        }
        return $r;
    }
    function iden() {
        return $this->tag;
    }
    function type() {
        return "HTMLtag";
    }
}





// HTML para: hAtdh| container
// G
//      $sup Whr
//      $s   nBzry
//      $t   yz
// $this->iden er
// $this->obj[0] eyz
// $this->obj[1..]  subnodes
// $this->sup OWhr
class HTMLpara extends Container {
    var $iden;
    var $sup;

    function HTMLpara($sup, $s, $t = "") {
        HTMLdebug(5, " HTMLpara::HTMLpara, Wh [$sup]");
        $this->sup = $sup;
        if(is_object($t)) {
            $this->push($t);
        }
        if($s) {
            if(eregi('^<table', $s)) {
                $this->fixtable($s);
            }
            $this->parse(&$s);
        }
    }



    function iden() {
        HTMLdebug(5, ' HTMLpara::iden');
        return $this->iden;
    }


    function push(&$o) {
        HTMLdebug(5, ' HTMLpara::push');
        if(!isset($this->iden)) {
            $this->iden = $o->iden();
            HTMLdebug(3, 'eO ['.$this->iden.']');
        }
        if($o->type() == "HTMLpara") {
            HTMLdebug(3, 'e ['.$this->iden.']  e ['.$o->iden().']');
        } else {
            HTMLdebug(3, 'e ['.$this->iden.']   ['.$o->iden().']');
        }
        $this->cnt++;
        array_push($this->obj, $o);
    }


    function show($in = 0) {
        global $SHOWINDENT;
        HTMLdebug(7, " HTMLpara::show($in)");
        $r = 'e'.str_repeat(' ', $in).'{'.$this->iden."}\n";
        $r .= Container::show($in + $SHOWINDENT);
        return $r;
    }

    function out() {
        HTMLdebug(7, ' HTMLpara::out');
        $r[] = $this;
        for($i = 0; $i < $this->cnt; $i++) {
            $r[] = $this->obj[$i]->out();
        }
        return $r;
    }

    function type() {
        return "HTMLpara";
    }


    function parse($s) {
        HTMLdebug(5, ' HTMLpara::parse');
        while($s) {
            $this->parsetxt($s);
            $this->parsetag($s);
        }
        HTMLdebug(3, "[".$this->iden."] RF");
    }


    // RGr
    function parsetxt(&$s) {
        HTMLdebug(6, ' HTMLpara::parsetxt');
        $TxtRe = '/^([^<]*)/';
        if(preg_match($TxtRe, $s, $r)) {
            $txt = trim($r[1]);
            if(!empty($txt)) {
                $t = new HTMLtxt($txt);
                HTMLdebug(2, 'e ['.$this->iden.'] F ['.$t->iden().'] 
');
                $this->push($t);
                $s = trim(substr($s, strlen($r[1])));
                HTMLdebug(8, 's = '.$s);
            }
        }
    }


    // R: 
    function parsetag(&$s) {
        global $HTMLcont, $HTMLbrac;
        HTMLdebug(6, ' HTMLpara::parsetag');
        $TagRe = '/^<(\/?\w+[^>]*)>/';
        if(preg_match($TagRe, $s, $r)) {
            $tag = trim($r[1]);
            $t = new HTMLtag($tag);
            HTMLdebug(2, 'e ['.$this->iden.'] F ['.$t->iden().'] ')
;

            // S]tOC
            if(!in_array($t->tag, $HTMLcont)) {
                HTMLdebug(4, "$t->tag ]tOA~ parse ");
                $this->parsetagmono($s, $t, $tag);
                return true;
            }
            // Y
            $bndb = preg_quote(strtolower($tag), '/');
            $bnde = explode(" ", $tag);
            $bnde = strtolower($bnde[0]);
            $bnde = preg_quote($bnde, '/');

            // Bz
            if(in_array($t->tag, $HTMLbrac)) {
                HTMLdebug(4, "n [$tag]");
                $ns = $this->parsetagbrac($s, $bndb, $bnde);
            } else {
                HTMLdebug(4, " [$tag]");
                $ns = $this->parsetagunbrac($s, $tag, $bndb, $bnde);
            }

            $c = new HTMLpara($this->iden, trim($ns), $t);

            // SBz
            if($t->tag == 'table') {
                $c->parsetable();
            }
            $this->push($c);
        }
    }


    function parsetagmono(&$s, &$t, &$tag) {
        HTMLdebug(6, ' HTMLpara::parsetagmono');
        $s = trim(substr($s, strlen($tag) + 2));
        $this->push($t);
        HTMLdebug(8, 's = '.$s);
    }


    function parsetagbrac(&$s, &$bndb, &$bnde) {
        HTMLdebug(6, ' HTMLpara::parsetagbrac');
        $len = $this->parsebalance($s, $bnde);
        $p = trim(substr($s, 0, $len));
        $s = trim(substr($s, $len));
        $re = '/(<'.$bndb.'>(.*)(<\/'.$bnde.'[^>]*>))/i';
        // v
        if(preg_match($re, $p, $q)) {
            HTMLdebug(4, 'e: '.$q[2]);
            HTMLdebug(8, 's = '.$s);
            $ns = &$q[2];
        } else {
            HTMLdebug(0, '~Gregex = '.$re);
            HTMLdebug(0, '~Gp = '.$p);
            HTMLdebug(0, '~Gs = '.$s);
            die('io '.__line__);
        }
        return $ns;
    }


    function parsetagunbrac(&$s, &$tag, &$bndb, &$bnde) {
        global $HTMLbrac, $HTMLCloseTag;
        HTMLdebug(6, ' HTMLpara::parsetagunbrac');
        HTMLdebug(5, 'J '.$s);
        HTMLdebug(5, 'tag = '.$tag);
        $p = substr($s, strlen($tag) + 2);
        $skip = 0;
        while($p) {
            // Dr
            if(preg_match('/^([^<]+)/i', $p, $q)) {
                HTMLdebug(7, '[^<]+ = '.$q[1]);
                $k = strlen($q[1]);
                $skip += $k;
                $p = substr($p, $k);
            }
            // 
            if(preg_match('/^(<\/?([\w]+)[^>]*>[^<]*)/i', $p, $q)) {
                $CloseTag = &$q[2];
                HTMLdebug(7, 'q[2] = '.$CloseTag);
                if($CloseTag == $bnde) {
                    break;
                }
                $ct = $HTMLCloseTag["$bnde"];
                if(is_array($ct) && in_array($CloseTag, $ct)) {
                    break;
                }
            }
            // L_
            if(in_array($q[2], $HTMLbrac)) {
                $len = $this->parsebalance($p, $q[2]);
                $skip += $len;
                $p = substr($p, $len);
                continue;
            }
            $k= strlen($q[1]);
            $p = substr($p, $k);
            $skip += $k;
        }
        $re = '/(<'.$bndb.'>(.{'.$skip.'})(<\/?'.$bnde.'[^>]*>)?)/i';
        HTMLdebug(5, 'regex = '.$re);

        // v
        if(preg_match($re, $s, $p)) {
            HTMLdebug(4, "e: $p[2]");
            if($p[3] == '</'.$bnde.'>') {   // pGF
                $s = trim(substr($s, strlen($p[1])));
            } else {
                $s = trim(substr($s, strlen($p[1]) - strlen($p[3])));
            }
            HTMLdebug(8, 's = '.$s);
            $ns = &$p[2];
        } else {
            die('io, line: '.__line__);
        }
        return $ns;

    }


    // X
    function parsebalance($s, $bnde) {
        HTMLdebug(6, ' HTMLpara::parsebalance');
        HTMLdebug(5, 'J '.$s.' / '.$bnde);
        $tmp = $s;
        $balance = 0;
        $len = 0;
        $re = '/^(<\/?(\w+)[^>]*>([^<]*))/i';
        HTMLdebug(6, 'regex: '.$re);
        do {
            if(!preg_match($re, $tmp, $p)) {
                HTMLdebug(6, ' regex F');
                break;
            }
            if(strtolower($p[2]) != $bnde) {
                $sl = strlen($p[1]);
                $len += $sl;
                $tmp = trim(substr($tmp, $sl));
                continue;
            }
            if($p[1][1] == '/') {
                $balance--;
            } else {
                $balance++;
            }
            $sl = strlen($p[1]);
            $len += $sl;
            $tmp = trim(substr($tmp, strlen($p[1])));
            HTMLdebug(6, 'balance = '.$balance.' len = '.$len.' sl = '.$sl);
        } while($balance != 0);
            if($balance) {
                HTMLdebug(0, 'R~: '.$s);
                die('Line '.__line__);
            }
            $len -= strlen($p[3]);
            return $len;
    }


    // ApC
    function parsetable() {
        // `NGSBz rowspan p
        HTMLdebug(6, ' HTMLpara::parsetable');

        $td = array();
        $tr = 0;
        for($i = 1; $i < $this->cnt; $i++) {
            $o = &$this->obj[$i];
            if($o->iden() == 'tr') {
                $tr++;
                $ttd = 0;
                for($j = 1; $j < $o->cnt; $j++) {
                    $p = &$o->obj[$j];
                    if($p->iden() == 'td' || $p->iden() == 'th') {
                        if(($csp = $p->obj[0]->getattr('colspan')) != NULL) {
                            $ttd += $csp;
                        } else {
                            $ttd++;
                        }
                    }
                }
                $td[] = $ttd;
            }        }
        $max = 0;
        for($i = 0; $i < count($td); $i++) {
            if($max < $td[$i]) {
                HTMLdebug(4, "td[".$i."] = ".$td[$i]);
                $max = $td[$i];
            }
        }
        $td = $max;
        HTMLdebug(3, ' '.$tr.' CA'.$td.'');
        // oO table I
        $this->obj[0]->push(new HTMLattr('rows', $tr));
        $this->obj[0]->push(new HTMLattr('cols', $td));
    }


    //  table
    function fixtable(&$s) {
        HTMLdebug(6, ' HTMLpara::fixtable');
        HTMLdebug(8, 'JF '.$s);
        $re = '/^(<(\/?\w+)[^>]*>[^<]*)/i';
        $tmp = $s;
        $tags = array();
        $len = 0;
        while(preg_match($re, $tmp, $p)) {
            if($p[2] == 'td' || $p[2] == 'th') {
                break;
            }
            $tags[] = $p[2];
            $sl = strlen($p[1]);
            $len += $sl;
            $tmp = substr($tmp, $sl);
        }
        // [J@ <tr>
        $r = substr($s, 0, $len);
        if(!in_array('tr', $tags)) {
            HTMLdebug(6, '[JF <tr> ');
            $r .= '<tr>';
        }
        $oldlen = $len;
        while(preg_match($re, $tmp, $p)) {
            $ot = $tags;
            $ol = $len;
            $tags = $p[2];            $len = strlen($p[1]);
            $tmp = substr($tmp, $len);
        }
        // h@ <tr>
        if($ot == 'tr') {
            HTMLdebug(5, 'F <tr> ');
            $r .= substr($s, $oldlen, strlen($s) - $oldlen - $len - $ol) .
                  substr($s, strlen($s) - $len);
        } else {
            $r .= substr($s, $oldlen);
        }
        $s = trim($r);
    }
}





// HTML J class
class HTMLin extends Container {
    function HTMLin($s) {
        HTMLdebug(5, " HTMLin::HTMLin()");
        $s = strtr($s, "\n", ' ');
        $s = strtr($s, "\r", '');
        $s = preg_replace('/<!.*>/U', '', $s);          // Bz DTD//CSS
        $s = preg_replace('/<[?%].*[?%]>/U', '', $s);   // Bz PHP/ASP 
        $s = preg_replace('/>\s+</U', '><', $s);        // d
        $s = strchr($s, '<');                       // Bz@eF

        $this->iden = "HTMLin";
        $t = new HTMLtag("HTMLin");
        $this->push(new HTMLpara($this->iden, $s, $t));
    }
    function show($in = 0) {
        HTMLdebug(7, " HTMLin::show($in)");
        $r = Container::show($in);
        return $r;
    }
    function out() {
        HTMLdebug(7, ' HTMLin::out');
        for($i = 0; $i < $this->cnt; $i++) {
            $r[] = $this->obj[$i]->out();
        }
        return $r;
    }
    function type() {
        return "HTMLin";
    }
    function iden() {
        return "HTMLin";
    }
}
?>



    function getattr($a) {
        HTMLdebug(5, ' HTMLtag::getattr');
        for($i = 0; $i < $this->cnt; $i++) {

    var $attr = array();
    var $cnt = 0;   // attr