PHP Classes
elePHPant
Icontem

File: wText.php

Recommend this page to a friend!
Stumble It! Stumble It! Bookmark in del.icio.us Bookmark in del.icio.us
  Classes of Till Wehowski  >  wText  >  wText.php  >  Download  
File: wText.php
Role: Class source
Content type: text/plain
Description: main class
Class: wText
Detect and filter spam in text
Author: By
Last change:
Date: 1 year ago
Size: 8,218 bytes
 

Contents

Class file image Download
<?php

/*
  wText - This class analyzes text, related to:
          Out: - Spammyfactor in percent
               - TextWeight in percent
          In: - Text to analyze in constructor
               - Stopwords in constructor
               - Spamwords with positive(spam) or negative(interesting content) values in constructor
               - optional config array in detect method
          Required: List of stopwords and list of spamwords or phrases in the desired language
              
  Version - 1.0.0

  Author - Till Wehowski
  License - Do What The Fuck You Want To Public License
            I would be happy about a backlink to my homepage (Webfan.de), but it's not an obligation.
           
  More - http://phpclasses.org
           http://www.webfan.de (Author Homepage)

  Docu - Please read the code
 
    count: Number of words of original text
    count_clean: Number of words without stopwords
    count_spammy: Number of spamword occurrences found in the text
    value: count_clean in percent of count
    spammy: Percentage of the non-stopword words that were removed as spamwords
    spammyPoints: spam points regarding weighted spamwords
    spammyValue: spammyPoints / count * factor
 
  Example:
$txt = '';
     foreach($_POST as $k => $p)
      {
        $p = trim($p);
        $p = (string)$p;
        $txt .= $p;
        $txt .= ' ';
      }
     $s = new wText($txt, $stopwords, $spamwords, TRUE );
     $r = $s->calc();
     $isSpam = $s->detect();
    
     $html = '';
     //var_dump($txt, $s->result);
     $html.='Testergebnis:<br />';
     $html.='Spam? ';
     if($isSpam === FALSE)
      {
       $html.='<span style="color:green;">NEIN</span>';
      }else{
              $html.='<span style="color:red;">JA</span>';
           }
    
     $html.='<ul>';

      foreach($r as $name => $value)
       {
          $html.='<li>';
           $html.='<b>'.$name.'</b>: '.$value;
          $html.='</li>';
       }

     $html.='</ul>';

     $r = $s->getBuf();
     $html.='Datengrundlage:';
     $html.='<ul>';

      foreach($r as $name => $value)
       {
          $html.='<li>';
           $html.='<b>'.strip_tags($name).'</b>: '.strip_tags($value);
          $html.='</li>';
       }

     $html.='</ul>';


     echo $html;
*/


/**
 * Scores a text for spam using a stopword list and a weighted spamword list.
 *
 * Usage: construct with the text and the word lists, call calc() to obtain
 * the metrics, then detect() to compare them against thresholds.
 *
 * Metrics produced by calc() (also kept in $result):
 *   count        - words in the text after tag/noise removal
 *   count_clean  - words left after removing stopwords
 *   count_spammy - number of spamword occurrences found
 *   value        - count_clean as a percentage of count
 *   spammy       - percentage of the clean words that were spamwords
 *   spammyPoints - sum of (occurrences * value) over all spamwords
 *   spammyValue  - spammyPoints / count * factor
 *
 * Word counting uses str_word_count(), so digits-only tokens are not counted
 * and multibyte text is handled byte-wise.
 */
class wText
{
    /** Multiplier applied to spammyPoints / count when computing spammyValue. */
    public $factor;

    /** Metrics of the last calc()/detect() run; empty until one of them ran. */
    public $result;

    /** Tag-stripped input text, padded with one space on each side. */
    private $txt;

    /** Intermediate strings: 'content', 'clean' and 'clean_spammy'. */
    private $buf;

    /*
     * The word lists and the noise list are per-instance data. They used to be
     * declared static while being accessed through $this, which raised a
     * notice and silently created dynamic properties instead.
     */
    private $stopwords = array();
    private $spamwords = array();
    private $noise = array();

    /** When not FALSE the text is lowercased before analysis. */
    private $lower;

    /**
     * @param string $text      Text (may contain HTML) to analyze.
     * @param array  $stopwords List of array('word' => 'theWord', 'lang' => 'de');
     *                          plain strings are accepted as well.
     * @param array  $spamwords List of array('word' => 'theWord', 'lang' => 'en', 'value' => 0.00);
     *                          positive values mark spam, negative values
     *                          mark interesting content.
     * @param bool   $lowercase Pass FALSE to keep the original letter case.
     * @param number $factor    Multiplier for spammyValue.
     */
    public function __construct($text, $stopwords = array(), $spamwords = array(), $lowercase = TRUE, $factor = 100)
    {
        $this->factor = $factor;

        if (is_array($stopwords)) {
            $this->stopwords = $stopwords;
        }
        if (is_array($spamwords)) {
            $this->spamwords = $spamwords;
        }

        $this->lower  = $lowercase;
        $this->txt    = ' ' . $this->html2txt($text) . ' ';
        $this->buf    = array();
        $this->result = array();

        // str_replace() works through this list in order, so the HTML
        // entities must come before '&' and ';'. Otherwise "&nbsp;" is torn
        // apart into the bogus word "nbsp".
        $this->noise = array(
            '&amp;', '&nbsp;',
            '.', ':', ';', ',', '!', '?', '-', '_', '+', '=', '~', '`', '*', '&', '^', '%',
            '(', ')', '{', '}', "'", '"', '\\', '/', '|', '[', ']', '#', '$', '@',
        );
    }

    /**
     * Analyzes the text and returns the metrics.
     *
     * @return array The metrics listed in the class comment.
     */
    public function calc()
    {
        $this->doCalc();

        return $this->result;
    }

    /**
     * Compares the metrics against thresholds.
     *
     * Only $conf['SPAM'] is read here. A threshold that is missing or is
     * exactly integer 0 is skipped. "min_*" thresholds flag the text when the
     * metric is below them, "max_*" thresholds when the metric is above them.
     * If calc() has not been run yet, the metrics are computed first.
     *
     * @param array $conf Threshold configuration, see the default value.
     * @return bool TRUE if at least one active threshold is violated.
     */
    public function detect($conf = array(
        'APP' => array('online' => 1, 'factor' => 100),
        'SPAM' => array('min_count' => 0,
                        'min_count_clean' => 0,
                        'max_count_spammy' => 10,
                        'min_value' => 40,
                        'max_spammy' => 20,
                        'max_spammyPoints' => 20,
                        'max_spammyValue' => 40
                        ) ) )
    {
        if (!is_array($this->result) || !isset($this->result['count'])) {
            $this->doCalc();
        }

        $limits = (is_array($conf) && isset($conf['SPAM']) && is_array($conf['SPAM']))
            ? $conf['SPAM']
            : array();

        // threshold key => array(metric name, TRUE when the threshold is a lower bound)
        $checks = array(
            'min_count'        => array('count', true),
            'min_count_clean'  => array('count_clean', true),
            'max_count_spammy' => array('count_spammy', false),
            'min_value'        => array('value', true),
            'max_spammy'       => array('spammy', false),
            'max_spammyPoints' => array('spammyPoints', false),
            'max_spammyValue'  => array('spammyValue', false),
        );

        foreach ($checks as $key => $check) {
            if (!isset($limits[$key]) || $limits[$key] === 0) {
                continue;
            }

            $actual = $this->result[$check[0]];
            $violated = $check[1] ? ($actual < $limits[$key]) : ($actual > $limits[$key]);
            if ($violated) {
                return true;
            }
        }

        return false;
    }

    /**
     * Fills $this->result and $this->buf from the stored text.
     */
    private function doCalc()
    {
        $metrics = array('count', 'count_clean', 'count_spammy', 'value', 'spammy', 'spammyPoints', 'spammyValue');
        foreach ($metrics as $metric) {
            $this->result[$metric] = 0;
        }

        // 1. Plain content: no tags, optionally lowercased, noise turned into spaces.
        $content = strip_tags($this->txt);
        if ($this->lower !== false) {
            $content = strtolower($content);
        }
        $content = str_replace($this->noise, ' ', $content);

        $this->buf['content']  = $content;
        $this->result['count'] = str_word_count($content);

        // 2. Content without stopwords.
        $clean = $content;
        foreach ($this->stopwords as $entry) {
            $clean = $this->removeWord($clean, $this->wordOf($entry));
        }
        $clean = $this->collapseWhitespace($clean);

        $this->buf['clean']          = $clean;
        $this->result['count_clean'] = str_word_count($clean);

        // 3. Content without stopwords and spamwords, plus the spam score.
        $cleanSpammy = $clean;
        foreach ($this->spamwords as $entry) {
            $word = $this->wordOf($entry);
            if ($word === '') {
                continue;
            }

            $cleanSpammy = $this->removeWord($cleanSpammy, $word);

            // Count whitespace-delimited occurrences with lookarounds so
            // that neighbouring hits ("spam spam") do not compete for the
            // separating space, and case-insensitively like removeWord().
            $pattern = '/(?<!\S)' . preg_quote($word, '/') . '(?!\S)/i';
            $hits = (int) preg_match_all($pattern, $content, $unused);

            $weight = (is_array($entry) && isset($entry['value']) && is_numeric($entry['value']))
                ? $entry['value'] + 0
                : 0;

            $this->result['spammyPoints'] += $hits * $weight;
            $this->result['count_spammy'] += $hits;
        }
        $cleanSpammy = $this->collapseWhitespace($cleanSpammy);
        $this->buf['clean_spammy'] = $cleanSpammy;

        // 4. Ratios. Every division is guarded: an empty text has no words.
        $total      = $this->result['count'];
        $totalClean = $this->result['count_clean'];

        if ($total > 0) {
            $this->result['value'] = round(($totalClean / $total) * 100, 2);
            $this->result['spammyValue'] = round(
                ($this->result['spammyPoints'] / $total) * $this->factor,
                2
            );
        }

        if ($totalClean > 0) {
            $remaining = str_word_count($cleanSpammy);
            $this->result['spammy'] = round(100 - (($remaining / $totalClean) * 100), 2);
        }
    }

    /**
     * Returns the trimmed word of a list entry, or '' if the entry has none.
     */
    private function wordOf($entry)
    {
        if (is_array($entry)) {
            return isset($entry['word']) ? trim((string) $entry['word']) : '';
        }

        return is_scalar($entry) ? trim((string) $entry) : '';
    }

    /**
     * Replaces every whitespace-preceded occurrence of $word (case-insensitive,
     * ending at a word boundary) with a single space.
     */
    private function removeWord($text, $word)
    {
        if ($word === '') {
            return $text;
        }

        // The delimiter must be passed to preg_quote(), otherwise a word
        // containing "/" terminates the pattern early.
        $stripped = preg_replace('/\s' . preg_quote($word, '/') . '\b/i', ' ', $text);

        // preg_replace() yields NULL on a PCRE error; keep the text in that case.
        return $stripped === null ? $text : $stripped;
    }

    /**
     * Collapses each run of whitespace into one space.
     */
    private function collapseWhitespace($text)
    {
        $collapsed = preg_replace('/\s+/', ' ', $text);

        return $collapsed === null ? $text : $collapsed;
    }

    /**
     * Returns the intermediate strings of the last calc() run.
     *
     * @return array Keys 'content', 'clean' and 'clean_spammy'; empty before calc().
     */
    public function getBuf()
    {
        return $this->buf;
    }

    /**
     * Removes scripts, style blocks, comments and tags from an HTML string.
     *
     * @param string $document HTML or plain text.
     * @return string The remaining text.
     */
    public function html2txt($document)
    {
        // Element contents (script, style, comments) must be removed before
        // the generic tag pattern runs; otherwise only the surrounding tags
        // disappear and the CSS/comment text stays behind. The style pattern
        // deliberately has no "U" modifier: with "U" the ".*?" would become
        // greedy and swallow everything between two style blocks.
        $search = array(
            '@<script[^>]*?>.*?</script>@si',   // Strip out javascript
            '@<style[^>]*?>.*?</style>@si',     // Strip style blocks including their content
            '@<!--.*?--[ \t\n\r]*>@s',          // Strip (multi-line) comments
            '@<[\/\!]*?[^<>]*?>@si',            // Strip out remaining HTML tags
        );

        $text = preg_replace($search, '', $document);
        if ($text === null) {
            // PCRE failure (e.g. backtrack limit): fall back to strip_tags() alone.
            $text = $document;
        }

        return strip_tags($text);
    }
}
//EOF