Login   Register  
PHP Classes
elePHPant
Icontem

File: wText.php

Recommend this page to a friend!
Stumble It! Stumble It! Bookmark in del.icio.us Bookmark in del.icio.us
  Classes of Till Wehowski  >  wText  >  wText.php  >  Download  
File: wText.php
Role: Class source
Content type: text/plain
Description: main class
Class: wText
Detect and filter spam in text
Author: By
Last change:
Date: 1 year ago
Size: 8,218 bytes
 

Contents

Class file image Download
<?php

/*
  wText - This class analyzes text and reports:
          Out: - Spammy factor in percent
               - Text weight in percent
          In:  - Text to analyze in constructor
               - Stopwords in constructor
               - Spamwords with positive (spam) or negative (interesting content) values in constructor
               - optional config array in detect method
          Required: A list of stopwords and a list of spamwords or phrases in the desired language
               
  Version - 1.0.0

  Author - Till Wehowski
  License - Do What The Fuck You Want To Public License
            I would be happy about a backlink to my homepage (Webfan.de), but it's not an obligation.
            
  More   - http://phpclasses.org
           http://www.webfan.de (Author Homepage)

  Docu  - Please read the code
  
    count: Number of words in the original text (after noise characters are removed)
    count_clean: Number of words left after removing stopwords
    count_spammy: Number of spamword occurrences found in the text
    value: count_clean as a percentage of count
    spammy: Percentage of the non-stopword words that were removed as spamwords
    spammyPoints: Sum of (occurrences * value) over all spamwords
    spammyValue: spammyPoints / count * factor
  
  Example:
$txt = '';
     foreach($_POST as $k => $p)
      {
        $p = trim($p);
        $p = (string)$p;
        $txt .= $p;
        $txt .= ' ';
      }
     $s = new wText($txt, $stopwords, $spamwords, TRUE );
     $r = $s->calc();
     $isSpam = $s->detect();
     
     $html = '';
     //var_dump($txt, $s->result);
     $html.='Testergebnis:<br />';
     $html.='Spam? ';
     if($isSpam === FALSE)
      {
       $html.='<span style="color:green;">NEIN</span>';
      }else{
              $html.='<span style="color:red;">JA</span>'; 
           }
     
     $html.='<ul>';

      foreach($r as $name => $value)
       {
          $html.='<li>';
           $html.='<b>'.$name.'</b>: '.$value;
          $html.='</li>';
       }

     $html.='</ul>';

     $r = $s->getBuf();
     $html.='Datengrundlage:';
     $html.='<ul>';

      foreach($r as $name => $value)
       {
          $html.='<li>';
           $html.='<b>'.strip_tags($name).'</b>: '.strip_tags($value);
          $html.='</li>';
       }

     $html.='</ul>';


     echo $html;
*/


/**
 * Scores a piece of text for "spamminess".
 *
 * The text is stripped of HTML, optionally lowercased, and cleared of
 * punctuation ("noise"). Stopwords and spamwords are then removed in turn and
 * the remaining word counts are turned into the metrics stored in $result:
 *
 *   count         words in the noise-free text
 *   count_clean   words left after removing stopwords
 *   count_spammy  number of spamword occurrences found
 *   value         count_clean as a percentage of count
 *   spammy        percentage of the non-stopword words removed as spamwords
 *   spammyPoints  sum of (occurrences * value) over all spamwords
 *   spammyValue   spammyPoints / count * factor
 *
 * Every ratio is reported as 0 when its denominator is 0 (e.g. empty text).
 */
class wText
{
    /** Multiplier applied to spammyPoints / count when computing spammyValue. */
    public $factor;

    /** Metrics produced by calc(); an empty array until calc() or detect() has run. */
    public $result;

    /** HTML-free input text, padded with one space on each side. */
    private $txt;

    /** Intermediate text buffers: 'content', 'clean' and 'clean_spammy'. */
    private $buf;

    /** List of array('word' => ..., 'lang' => ...) entries. */
    private $stopwords = array();

    /** List of array('word' => ..., 'lang' => ..., 'value' => ...) entries. */
    private $spamwords = array();

    /** Strings that are replaced by a space before words are counted. */
    private $noise = array();

    /** Anything other than FALSE lowercases the text before analysis. */
    private $lower;

    /**
     * @param string $text      text (may contain HTML) to analyze
     * @param array  $stopwords array of array('word' => 'theWord', 'lang' => 'de')
     * @param array  $spamwords array of array('word' => 'theWord', 'lang' => 'en', 'value' => 0.00)
     * @param bool   $lowercase lowercase the text before analysis unless FALSE
     * @param number $factor    multiplier used for spammyValue
     */
    public function __construct($text, $stopwords = array(), $spamwords = array(), $lowercase = TRUE, $factor = 100)
    {
        $this->factor = $factor;

        if (is_array($stopwords) && count($stopwords) > 0) {
            $this->stopwords = $stopwords;
        }
        if (is_array($spamwords) && count($spamwords) > 0) {
            $this->spamwords = $spamwords;
        }

        $this->lower  = $lowercase;
        // The padding lets the "\s<word>\b" patterns match the very first word.
        $this->txt    = ' ' . $this->html2txt($text) . ' ';
        $this->buf    = array();
        $this->result = array();

        // The HTML entities must come first: str_replace() works through the
        // list in order, and once ';' or '&' has been replaced the entities
        // could no longer match and would leave "amp" / "nbsp" behind as words.
        // NOTE(review): the available source had one additional entry between
        // '$' and '@' that was empty (possibly a lost non-ASCII character); an
        // empty search string is a no-op for str_replace(), so it is omitted.
        $this->noise = array(
            '&amp;', '&nbsp;',
            '.', ':', ';', ',', '!', '?', '-', '_', '+', '=', '~', '`', '*', '&', '^', '%',
            '(', ')', '{', '}', "'", '"', '\\', '/', '|', '[', ']', '#', '$', '@',
        );
    }
    //eof constructor

    /**
     * Runs the analysis.
     *
     * @return array the metrics described in the class comment
     */
    public function calc()
    {
        $this->doCalc();

        return $this->result;
    }
    //eof calc

    /**
     * Applies thresholds to the calculated metrics.
     *
     * Only $conf['SPAM'] is read. A "min_*" entry flags the text as spam when
     * the metric is below it, a "max_*" entry when the metric is above it.
     * An entry that is absent or exactly 0 disables that check.
     * NOTE(review): the comparison operand was lost in the available source;
     * "0 disables the check" is reconstructed from the 0 defaults - confirm.
     *
     * Runs the analysis first if calc() has not been called yet.
     *
     * @param array $conf threshold configuration
     * @return bool TRUE when any threshold is violated, FALSE otherwise
     */
    public function detect($conf = array(
        'APP'  => array('online' => 1, 'factor' => 100),
        'SPAM' => array(
            'min_count'        => 0,
            'min_count_clean'  => 0,
            'max_count_spammy' => 10,
            'min_value'        => 40,
            'max_spammy'       => 20,
            'max_spammyPoints' => 20,
            'max_spammyValue'  => 40
        )
    ))
    {
        if (!isset($this->result['count'])) {
            $this->doCalc();
        }

        $limits = (is_array($conf) && isset($conf['SPAM']) && is_array($conf['SPAM']))
            ? $conf['SPAM']
            : array();

        $minimums = array(
            'min_count'       => 'count',
            'min_count_clean' => 'count_clean',
            'min_value'       => 'value',
        );
        foreach ($minimums as $key => $metric) {
            if (isset($limits[$key]) && $limits[$key] !== 0 && $this->result[$metric] < $limits[$key]) {
                return TRUE;
            }
        }

        $maximums = array(
            'max_count_spammy' => 'count_spammy',
            'max_spammy'       => 'spammy',
            'max_spammyPoints' => 'spammyPoints',
            'max_spammyValue'  => 'spammyValue',
        );
        foreach ($maximums as $key => $metric) {
            if (isset($limits[$key]) && $limits[$key] !== 0 && $this->result[$metric] > $limits[$key]) {
                return TRUE;
            }
        }

        return FALSE;
    }
    //eof detect

    /**
     * Fills $this->buf and $this->result from $this->txt.
     */
    private function doCalc()
    {
        $this->result = array(
            'count'        => 0,
            'count_clean'  => 0,
            'count_spammy' => 0,
            'value'        => 0,
            'spammy'       => 0,
            'spammyPoints' => 0,
            'spammyValue'  => 0,
        );

        // 1. Plain, noise-free text.
        $content = strip_tags($this->txt);
        if ($this->lower !== FALSE) {
            $content = strtolower($content);
        }
        $content = str_replace($this->noise, ' ', $content);
        $this->buf['content'] = $content;

        $total = str_word_count($content, 0);
        $this->result['count'] = $total;

        // 2. Remove stopwords.
        $clean = $content;
        foreach ($this->stopwords as $entry) {
            $word = $this->wordOf($entry);
            if ($word !== '') {
                $clean = $this->removeWord($clean, $word);
            }
        }
        $clean = $this->collapseWhitespace($clean);
        $this->buf['clean'] = $clean;

        $cleanCount = str_word_count($clean, 0);
        $this->result['count_clean'] = $cleanCount;

        // 3. Remove spamwords and tally their weighted occurrences.
        // Occurrences are counted on a whitespace-collapsed copy so that
        // phrases match regardless of how the words were separated.
        $haystack = $this->collapseWhitespace($content);
        $remaining = $clean;
        foreach ($this->spamwords as $entry) {
            $word = $this->wordOf($entry);
            if ($word === '') {
                continue;
            }
            $remaining = $this->removeWord($remaining, $word);

            // The content was lowercased above, so the needle has to be too.
            $needle = ($this->lower !== FALSE) ? strtolower($word) : $word;
            $hits   = $this->countOccurrences($haystack, $needle);
            $weight = isset($entry['value']) ? $entry['value'] : 0;

            $this->result['spammyPoints'] += $hits * $weight;
            $this->result['count_spammy'] += $hits;
        }
        $remaining = $this->collapseWhitespace($remaining);
        $this->buf['clean_spammy'] = $remaining;

        // 4. Ratios; a zero denominator leaves the metric at 0.
        if ($total > 0) {
            $this->result['value'] = round(($cleanCount / $total) * 100, 2);
            $this->result['spammyValue'] = round(
                ($this->result['spammyPoints'] / $total) * $this->factor,
                2
            );
        }
        if ($cleanCount > 0) {
            $this->result['spammy'] = round(
                100 - ((str_word_count($remaining, 0) / $cleanCount) * 100),
                2
            );
        }
    }
    //eo prepare

    /**
     * Returns the trimmed 'word' of a stopword/spamword entry, or '' when the
     * entry is not an array or has no 'word' key.
     */
    private function wordOf($entry)
    {
        if (!is_array($entry) || !isset($entry['word'])) {
            return '';
        }

        return trim((string)$entry['word']);
    }

    /**
     * Replaces every case-insensitive "<whitespace><word><word boundary>"
     * occurrence in $haystack with a single space.
     */
    private function removeWord($haystack, $word)
    {
        // The delimiter is passed to preg_quote() so a '/' in $word cannot end the pattern.
        $pattern = '/\s' . preg_quote($word, '/') . '\b/i';
        $out = preg_replace($pattern, ' ', $haystack);

        return ($out === null) ? $haystack : $out;
    }

    /**
     * Counts occurrences of $needle in $haystack that are delimited by
     * whitespace (or the string ends) on both sides. Lookarounds are used so
     * that directly repeated words, which share a separating space, are all counted.
     */
    private function countOccurrences($haystack, $needle)
    {
        if ($needle === '') {
            return 0;
        }
        $pattern = '/(?<!\S)' . preg_quote($needle, '/') . '(?!\S)/';
        $found = preg_match_all($pattern, $haystack, $matches);

        return ($found === FALSE) ? 0 : $found;
    }

    /**
     * Replaces each run of whitespace with a single space.
     */
    private function collapseWhitespace($text)
    {
        $out = preg_replace('/\s+/', ' ', $text);

        return ($out === null) ? $text : $out;
    }

    /**
     * Returns the intermediate text buffers ('content', 'clean',
     * 'clean_spammy'); empty until the analysis has run.
     *
     * @return array
     */
    public function getBuf()
    {
        return $this->buf;
    }

    /**
     * Removes scripts, style blocks, tags and comments from an HTML string.
     *
     * @param string $document HTML or plain text
     * @return string text without markup
     */
    public function html2txt($document)
    {
        // Script and style blocks are removed together with their content
        // BEFORE the generic tag pattern runs; otherwise only the tags would
        // disappear and the JavaScript/CSS source would remain in the text.
        $search = array(
            '@<script[^>]*?>.*?</script>@si',   // Strip out javascript
            '@<style[^>]*?>.*?</style>@si',     // Strip style blocks (lazy, one block at a time)
            '@<[\/\!]*?[^<>]*?>@si',            // Strip out HTML tags
            '@<![\s\S]*?--[ \t\n\r]*>@',        // Strip multi-line comments including CDATA
        );
        $text = preg_replace($search, '', $document);
        $text = strip_tags($text);

        return $text;
    }
}
//EOF