PHP Classes
elePHPant
Icontem

File: getLanguage.php

Recommend this page to a friend!
  Classes of Jill Lingoff  >  Sweeper  >  getLanguage.php  >  Download  
File: getLanguage.php
Role: Auxiliary script
Content type: text/plain
Description: Auxiliary script
Class: Sweeper
Clean HTML to remove unwanted tags and attributes
Author: By
Last change:
Date: 5 months ago
Size: 12,531 bytes
 

Contents

Class file image Download
<?php

//include('retidy.php');

/**
 * Guess whether a document is English, French or bilingual.
 *
 * $source is either HTML markup or the path of a file containing markup.
 * It is treated as markup when it contains "<body" or any character that is
 * not allowed in a file name (: * ? " < > |); otherwise it is treated as a
 * path and the file is read. Note that this means a path containing ':'
 * (for example a Windows drive letter) is treated as markup.
 *
 * The heuristics below are tried in order and the first one that reaches a
 * verdict wins:
 *   1. language markers in the path (only when $source is a path);
 *   2. lang attributes on the <html> tag;
 *   3. density of e-acute (raw or as an HTML entity);
 *   4. presence of a set of accented characters typical of French;
 *   5. dictionary lookup of words sampled from the body text. This step
 *      needs the ReTidy class and the word lists abbr/eng/words.txt and
 *      abbr/fra/mots.txt (relative to the working directory); it is skipped
 *      when ReTidy is not loaded.
 *
 * Side effect: method 5 prints its word counts as HTML fragments.
 *
 * @param string $source HTML markup, or a path to a file containing markup.
 * @return string "english", "french", "english_and_french" or "unknown".
 */
function getLanguage($source) {
	$language = "unknown";
	$source = (string) $source;

	// Always initialised, so the path heuristics can test it safely even
	// when $source was recognised as markup straight away.
	$sourceIsAFilename = false;
	if(strpos($source, "<body") !== false) {
		$contents = $source;
	} else {
		$sourceIsAFilename = true;
		// '/' and '\\' are deliberately absent: they are legal in paths.
		foreach(array(':', '*', '?', '"', '<', '>', '|') as $non_filename_character) {
			if(strpos($source, $non_filename_character) !== false) {
				$sourceIsAFilename = false;
				break;
			}
		}
		if($sourceIsAFilename) {
			// An unreadable or missing file yields empty contents; the path
			// itself can still decide the language in method 1.
			$contents = (is_file($source) && is_readable($source)) ? file_get_contents($source) : false;
			if($contents === false) {
				$contents = "";
			}
		} else {
			$contents = $source;
		}
	}

	// method 1 to find the language of a file (look at the path)
	if($sourceIsAFilename) {
		$language_from_path = getLanguage_fromPath($source);
		if($language_from_path !== null) {
			return $language_from_path;
		}
	}

	// method 2 to find the language of a file (look for lang attributes on the <html> tag)
	if(preg_match('/<html[^<>]*?>/is', $contents, $html_tag_matches)) {
		preg_match_all('/lang\s*=\s*(["\'])(.*?)\1/is', $html_tag_matches[0], $lang_matches);
		foreach($lang_matches[2] as $lang_value) {
			// Only the primary subtag matters: "fr-CA" is French, "en_US" is English.
			$parts = preg_split('/[-_]/', strtolower(trim($lang_value)));
			if($parts[0] === "fr") {
				return "french";
			}
			if($parts[0] === "en") {
				return "english";
			}
		}
	}

	// Raw accented characters are encoded differently in UTF-8 and Latin-1,
	// so pick the byte sequences that fit the document.
	$is_utf8 = (preg_match('//u', $contents) === 1);

	// method 3 to find the language of a file (look for french density)
	$e_acute_pattern = '/' . ($is_utf8 ? '\xC3\xA9' : '\xE9') . '|&eacute;|&#233;|&#xE9;/i';
	$strlen = strlen($contents);
	if($strlen > 5000) {
		$code_divisions = (int) floor($strlen / 1000);
		for($division_count = 0; $division_count < $code_divisions; $division_count++) {
			// NOTE(review): the third argument of substr() is a length, so these
			// windows grow by 1000 bytes per step. Kept as-is because the
			// threshold of 50 appears tuned to that behaviour - confirm.
			$window = substr($contents, $division_count * 1000, ($division_count + 1) * 1000);
			if(preg_match_all($e_acute_pattern, $window, $e_acute_matches) > 50) {
				return "french";
			}
		}
	} elseif($strlen > 1000) {
		// Skip the first 300 bytes and inspect the following 700.
		$window = substr($contents, 300, 700);
		if(preg_match_all($e_acute_pattern, $window, $e_acute_matches) > 10) {
			return "french";
		}
	}

	// method 4 to find the language of a file (look for french characters)
	// The raw list mirrors the entity list: e-acute, E-acute, a-grave,
	// A-grave, e-grave, o-circumflex.
	$french_entities = array("&eacute;", "&Eacute;", "&agrave;", "&Agrave;", "&egrave;", "&ocirc;");
	$french_characters = $is_utf8
		? array("\xC3\xA9", "\xC3\x89", "\xC3\xA0", "\xC3\x80", "\xC3\xA8", "\xC3\xB4")
		: array("\xE9", "\xC9", "\xE0", "\xC0", "\xE8", "\xF4");
	if(getLanguage_containsAll($contents, $french_entities) || getLanguage_containsAll($contents, $french_characters)) {
		return "french";
	}

	// method 5 uses a dictionary search
	if(!class_exists('ReTidy')) {
		return $language;
	}
	$body_code = ReTidy::getBodyCode($contents);
	if($body_code === false) {
		return $language;
	}
	$body_code = ReTidy::tagless($body_code); // (2009-08-24)
	$number_of_words = 200;
	$minimum_number_of_characters = 100;
	$body_length = strlen($body_code);
	// The second test keeps the sampling step at 1 or more; a step of 0 would loop forever.
	if($body_length < $minimum_number_of_characters || $body_length < $number_of_words) {
		return $language;
	}
	$step_size = (int) floor($body_length / $number_of_words);

	// Sample the next space-delimited word at evenly spaced offsets. Using
	// the word as the array key removes duplicates; offsets after which no
	// word is found are skipped rather than counted.
	$sampled_words = array();
	for($position = 0; $position < $body_length; $position += $step_size) {
		if(preg_match('/ ([a-z]{1,}) /i', $body_code, $word_match, 0, $position)) {
			$sampled_words[$word_match[1]] = true;
		}
	}
	$arrayMatches = array_keys($sampled_words);
	$arraySize = sizeof($arrayMatches);
	if($arraySize === 0) {
		return $language;
	}

	$count_of_english_matches = getLanguage_countKnownWords($arrayMatches, getLanguage_loadWordSet("abbr/eng/words.txt"));
	print("Number of words found: " . $arraySize . "<br>\r\n");
	print("Number of english words found: " . $count_of_english_matches . "<br>\r\n");
	// Whole percentages, rounded down.
	$english_percent = (int) floor($count_of_english_matches * 100 / $arraySize);
	if($english_percent > 90) {
		return "english";
	}

	$count_of_french_matches = getLanguage_countKnownWords($arrayMatches, getLanguage_loadWordSet("abbr/fra/mots.txt"));
	print("Number of french words found: " . $count_of_french_matches . "<br>\r\n");
	$french_percent = (int) floor($count_of_french_matches * 100 / $arraySize);
	if($french_percent > 90) {
		return "french";
	}

	// Roughly half of the words in each dictionary: call the document bilingual.
	if($english_percent > 45 && $english_percent < 55 && $french_percent > 45 && $french_percent < 55) {
		return "english_and_french";
	}
	return $language;
}

/**
 * Derive a language from markers in a path (case-insensitive).
 *
 * Directory markers (/eng/, /english/, /fra/, /french/) take priority over
 * file name suffixes such as "_e.htm", "-fr.php" or "_bil.html". The suffix
 * is not anchored to the end of the path.
 *
 * @param string $path
 * @return string|null The language, or null when the path gives no hint.
 */
function getLanguage_fromPath($path) {
	$lowered = strtolower($path);
	if(strpos($lowered, "/eng/") !== false || strpos($lowered, "/english/") !== false) {
		return "english";
	}
	if(strpos($lowered, "/fra/") !== false || strpos($lowered, "/french/") !== false) {
		return "french";
	}
	// ".htm" also covers ".html" and ".asp" also covers ".aspx" because the
	// match is a plain substring match.
	if(preg_match('/(?:[_-](?:e|en|eng)| eng)\.(?:htm|xml|php|asp)/', $lowered)) {
		return "english";
	}
	if(preg_match('/(?:[_-](?:f|fr|fra)| fre)\.(?:htm|xml|php|asp)/', $lowered)) {
		return "french";
	}
	if(preg_match('/[_-](?:bil|english_and_french)\.html/', $lowered)) {
		return "english_and_french";
	}
	return null;
}

/**
 * Tell whether every needle occurs somewhere in the haystack.
 *
 * @param string $haystack
 * @param array $needles Non-empty strings.
 * @return bool
 */
function getLanguage_containsAll($haystack, $needles) {
	foreach($needles as $needle) {
		if(strpos($haystack, $needle) === false) {
			return false;
		}
	}
	return true;
}

/**
 * Load a one-word-per-line dictionary as a lookup table keyed by word.
 *
 * Any of \r\n, \r or \n is accepted as a line ending. An unreadable file
 * yields an empty table, so no word will match.
 *
 * @param string $file
 * @return array Words as keys.
 */
function getLanguage_loadWordSet($file) {
	$raw = (is_file($file) && is_readable($file)) ? file_get_contents($file) : false;
	if($raw === false) {
		return array();
	}
	return array_flip(preg_split('/\r\n|\r|\n/', $raw));
}

/**
 * Count how many of the given words, once lowercased, are keys of $wordSet.
 *
 * @param array $words
 * @param array $wordSet Lookup table as built by getLanguage_loadWordSet().
 * @return int
 */
function getLanguage_countKnownWords($words, $wordSet) {
	$count = 0;
	foreach($words as $word) {
		if(isset($wordSet[strtolower($word)])) {
			$count++;
		}
	}
	return $count;
}

?>