File: docs/files/UrlExtractor.php.txt

Recommend this page to a friend!
  Classes of João Ribeiro  >  PHP URL Extractor  >  docs/files/UrlExtractor.php.txt  >  Download  
File: docs/files/UrlExtractor.php.txt
Role: Documentation
Content type: text/plain
Description: Documentation
Class: PHP URL Extractor
Extract URLs of images and metadata from Web pages
Author: By Joao Ribeiro
Last change: Generated updated documentation
Updated class docs
Date: 6 years ago
Size: 12,032 bytes
 

Contents

Class file image Download
<?php

/**
* Class UrlExtractor
*
* PHP version 5
*
* @category Utilities
* @package  UrlExtractor
* @author   Joao Ribeiro <joaopedrocr@gmail.com>
* @license  http://www.gnu.org/copyleft/gpl.html GNU General Public License
* @link     http://urlextractor.joaoperibeiro.com
*/

namespace rollbackpt\UrlExtractor;

/**
 * PHP class to extract images and meta data information from URLs.
 *
 * A call to extractAll() downloads the page, then collects the page title,
 * the description, the keywords and the image URLs found in the meta tags
 * and in the <img> elements of the document.
 *
 * @category Utilities
 * @package  UrlExtractor
 * @author   Joao Ribeiro <joaopedrocr@gmail.com>
 * @license  http://www.gnu.org/copyleft/gpl.html GNU General Public License
 * @link     http://urlextractor.joaoperibeiro.com
 *
 * @TODO: Extract thumbnails from videos
 * @TODO: Split the code into smaller classes (One to handle meta tags
 * and other to handle images and thumbnails)
 * @TODO: Change get_meta_tags to Regex
 * @TODO: Add method to get existing extracted data
 */

class UrlExtractor
{
    /**
     * Transport selector: true downloads the page with cURL, false uses
     * file_get_contents().
     */
    const CURL = true;

    /**
     * URL passed to the last extractAll() call
     *
     * @var string $url
     */
    protected $url;

    /**
     * Host (scheme included) extracted from the URL
     *
     * @var string $host
     */
    protected $host;

    /**
     * Array to store all the images extracted from the URL
     *
     * @var array $images
     */
    public $images = array();

    /**
     * Title extracted from the URL
     *
     * @var string $title
     */
    public $title;

    /**
     * Description extracted from the URL
     *
     * @var string $description
     */
    public $description;

    /**
     * Array to store the keywords extracted from the URL
     *
     * @var array $keywords
     */
    public $keywords = array();

    /**
     * Meta tag names to extract, grouped by the class attribute they feed.
     * Within a group, a later name overwrites (scalar attributes) or is
     * appended after (array attributes) an earlier one.
     *
     * @var array $metaTagNames
     */
    protected $metaTagNames = array(
        'title' => array(
            'twitter:title',
            'og:title'
        ),
        'description' => array(
            'description',
            'twitter:description',
            'og:description'
        ),
        'keywords' => array(
            'keywords'
        ),
        'images' => array(
            'twitter:image',
            'twitter:image:src',
            'og:image'
        )
    );

    /**
     * Class constructor.
     *
     * @return void
     */
    public function __construct()
    {
        // Empty constructor for future implementations
    }

    /**
     * Function extractAll
     *
     * Extract all the elements from the URL
     * (title, description, keywords and images)
     *
     * @param string  $url  Address of the page to inspect
     * @param boolean $json Define if the result is returned in an array or a
     * json string.
     *
     * @return array|string Return an array or JSON string with the url info
     * (title, description, keywords and images) or an error message
     * (array('error' => 'Invalid URL')) when the page can't be downloaded
     *
     * @throws \InvalidArgumentException When $url is empty
     */
    public function extractAll($url, $json = true)
    {
        if (empty($url)) {
            // Fully qualified on purpose: inside this namespace a bare
            // "Exception" resolves to rollbackpt\UrlExtractor\Exception,
            // which does not exist.
            throw new \InvalidArgumentException("URL can't be empty!");
        }
        $this->url = $url;

        // Clean variables from old calls
        $this->host = "";
        $this->title = "";
        $this->description = "";
        $this->keywords = array();
        $this->images = array();

        // Get the url contents for extraction
        if (self::CURL) {
            $urlContent = $this->curlGetContents($this->url);
        } else {
            $urlContent = @file_get_contents($this->url);
        }

        // The failure check must come before any string manipulation:
        // str_replace() would turn false into "" and hide the error.
        if (!is_string($urlContent)) {
            $error = array('error' => 'Invalid URL');

            return ($json) ? json_encode($error) : $error;
        }

        // Avoid errors in the Regex matcher because of glued metatags
        $urlContent = str_replace("<meta", "\n<meta", $urlContent);

        // Order matters: the <title> element wins over og:/twitter: titles
        $this->getHost($this->url);
        $this->getMetaTagsByProperty($urlContent);
        $this->getPageTitle($urlContent);
        $this->getMetaTagsByName($urlContent);
        $this->getImages($urlContent);

        $urlInfo = array(
            'title' => $this->title,
            'description' => $this->description,
            // Before assign, remove duplicate keywords and reorder the array
            'keywords' => array_values(array_unique($this->keywords)),
            // Before assign, remove duplicate images and reorder the array
            'images' => array_values(array_unique($this->images))
        );

        return ($json) ? json_encode($urlInfo) : $urlInfo;
    }

    /**
     * Function getHost
     *
     * Get the host from the URL (Ex: http://localhost.com
     * is the host extracted from http://localhost.com/test/index.php)
     * and store it in $this->host. An empty string is stored when no host
     * can be found.
     *
     * @param string $url
     *
     * @return void
     */
    protected function getHost($url)
    {
        $parts = parse_url($url);

        // Preferred path: a URL with an explicit scheme and host. This also
        // copes with hosts that contain no dot (http://localhost/...) and
        // with URLs that have a query string but no path.
        if (is_array($parts) && isset($parts['scheme'], $parts['host'])) {
            $host = $parts['scheme'] . '://' . $parts['host'];
            if (isset($parts['port'])) {
                $host .= ':' . $parts['port'];
            }
            $this->host = $host;

            return;
        }

        // Scheme-less input (Ex: example.com/page): keep the regex lookup
        $pattern = '/([^:]*:\/\/)?([^\/]*\.)*([^\/\.]+\.[^\/]+)/i';

        $this->host = preg_match($pattern, $url, $results) ? $results[0] : "";
    }

    /**
     * Function getPageTitle
     *
     * Get the text inside the title tag. A title that was already taken
     * from the meta tags is kept when the page has no (or an empty)
     * <title> element.
     *
     * @param string $urlContent Page content to get the title from
     *
     * @return void
     */
    protected function getPageTitle($urlContent)
    {
        $pageTitle = $this->getText($urlContent, "<title>", "</title>");

        if ($pageTitle !== "") {
            $this->title = $pageTitle;
        }
    }

    /**
     * Function getMetaTagsByName
     *
     * Get the regular meta tags (description and keywords), whatever the
     * order of their name and content attributes
     *
     * @param string $urlContent Url content to get meta tags from
     *
     * @return void
     */
    protected function getMetaTagsByName($urlContent)
    {
        $pattern = '/<meta.*?name=["|\'](description|keywords)["|\'][^<]*?content=["|\'](.*?)["|\'].*?>|<meta.*?content=["|\'](.*?)["|\'][^<]*?name=["|\'](description|keywords)["|\'].*?>/i';

        preg_match_all($pattern, $urlContent, $results);

        $metaTags = $this->formatMetaTagsArray($results);

        if ($metaTags !== false) {
            $this->setUrlAtributes($metaTags);
        }
    }

    /**
     * Function getMetaTagsByProperty
     *
     * Get property meta tags like open graph for example
     * (Ex: <meta property="og:title" content="The Rock" />)
     *
     * @param string $urlContent Url content to get meta tags from
     *
     * @return void
     */
    protected function getMetaTagsByProperty($urlContent)
    {
        $pattern = '/<meta.*?property=["|\'](.*?)["|\'][^<]*?content=["|\'](.*?)["|\'].*?>|<meta.*?content=["|\'](.*?)["|\'][^<]*?property=["|\'](.*?)["|\'].*?>/i';

        preg_match_all($pattern, $urlContent, $results);

        $metaTags = $this->formatMetaTagsArray($results);

        if ($metaTags !== false) {
            $this->setUrlAtributes($metaTags);
        }
    }

    /**
     * Function getImages
     *
     * Get the images referenced by the <img> elements of the page and
     * append the usable ones to $this->images
     *
     * @param string $urlContent
     *
     * @return void
     */
    protected function getImages($urlContent)
    {
        $pattern = '/<img.*?src=["|\'](.*?)["|\'].*?>/i';

        preg_match_all($pattern, $urlContent, $results);

        foreach ($results[1] as $image) {
            $image = $this->checkImageUrl($image);
            if ($image !== null) {
                array_push($this->images, $image);
            }
        }
    }

    /**
     * Function setUrlAtributes
     *
     * Set the class attributes from the meta tags found. Scalar attributes
     * (title, description) are overwritten, array attributes (keywords,
     * images) are appended to.
     *
     * @param array $metaTags Meta tag contents indexed by meta tag name
     *
     * @return void
     */
    protected function setUrlAtributes($metaTags)
    {
        foreach ($this->metaTagNames as $key => $names) {
            foreach ($names as $name) {
                // empty() also covers names missing from $metaTags
                if (empty($metaTags[$name])) {
                    continue;
                }
                $content = $metaTags[$name];

                if (!is_array($this->$key)) {
                    $this->$key = $content;
                    continue;
                }

                if ($key != 'keywords') {
                    array_push($this->$key, $content);
                    continue;
                }

                // Hard coded rule to split keywords by ","
                foreach (explode(",", $content) as $keyword) {
                    $keyword = trim($keyword);
                    // "a, b," or "a,,b" must not produce empty keywords
                    if ($keyword !== "") {
                        array_push($this->$key, $keyword);
                    }
                }
            }
        }
    }

    /**
     * Function formatMetaTagsArray
     *
     * Utility function used by getMetaTagsByName and getMetaTagsByProperty
     * to turn the preg_match_all() result into the array expected by
     * setUrlAtributes (meta tag name => content).
     *
     * Both patterns capture name-first tags in groups 1 (name) and
     * 2 (content) and content-first tags in groups 4 (name) and 3 (content);
     * every match is read from whichever pair it filled, so pages mixing the
     * two attribute orders keep all their tags.
     *
     * @param array $array Result of preg_match_all() (pattern order)
     *
     * @return array|boolean Returns the array with the meta tags found or false
     * when none of the known meta tag names was found
     */
    protected function formatMetaTagsArray($array)
    {
        if (empty($array[0])) {
            return false;
        }

        $metaTags = array();
        foreach (array_keys($array[0]) as $index) {
            if (isset($array[1][$index]) && $array[1][$index] !== "") {
                $name = $array[1][$index];
                $content = isset($array[2][$index]) ? $array[2][$index] : "";
            } elseif (isset($array[4][$index]) && $array[4][$index] !== "") {
                $name = $array[4][$index];
                $content = isset($array[3][$index]) ? $array[3][$index] : "";
            } else {
                continue;
            }

            $metaTags[$name] = $content;

            // The patterns are case-insensitive but setUrlAtributes looks
            // names up exactly: also expose the lower-case spelling so that
            // name="Description" is not lost.
            $lowerName = strtolower($name);
            if ($lowerName !== $name) {
                $metaTags[$lowerName] = $content;
            }
        }

        $pattern = '/^(' . $this->getPropertyRuleString() . ')/i';
        if (!preg_grep($pattern, array_keys($metaTags))) {
            return false;
        }

        return $metaTags;
    }

    /**
     * Function checkImageUrl
     *
     * Utility function used by getImages to check image URL
     * and complete relative URLs
     *
     * @param string $url Url of the image to be checked
     *
     * @return string|null Image url, or null when the URL does not point to
     * a jpg, gif, png, jpeg or bmp file
     */
    protected function checkImageUrl($url)
    {
        // Heuristic for "already absolute": does not start with . or /
        // and has a dot (or ?) somewhere before the image extension
        $absolutePattern = '/^[^(\.|\/)].*?[\.?].*?\.(jpg|gif|png|jpeg|bmp)/i';
        $imagePattern = '/\.(jpg|gif|png|jpeg|bmp)$/i';

        // Drop "../" segments; the remainder is resolved against the host
        $url = preg_replace('/\.\.\//i', '', $url);

        if (preg_match($absolutePattern, $url)) {
            return $url;
        }

        if (!preg_match($imagePattern, $url)) {
            return null;
        }

        // Protocol-relative URL (//cdn.example.com/a.jpg): it already names
        // its own host, only the scheme of the page is missing
        if (strpos($url, '//') === 0) {
            $scheme = parse_url($this->host, PHP_URL_SCHEME);

            return ($scheme ? $scheme : 'http') . ':' . $url;
        }

        return ($url[0] === '/') ? $this->host . $url : $this->host . '/' . $url;
    }

    /**
     * Function getText
     *
     * Utility function that extract text between start and end points
     *
     * @param string $text
     * @param string $start
     * @param string $end
     *
     * @return string The text extracted, or an empty string when the start
     * point is not present in the text
     */
    protected function getText($text, $start, $end)
    {
        $afterStart = explode($start, $text);
        if (!isset($afterStart[1])) {
            return "";
        }

        $beforeEnd = explode($end, $afterStart[1]);

        return $beforeEnd[0];
    }

    /**
     * Function getPropertyRuleString
     *
     * Goes through all the property names and generates a regex rule to match
     * at least one of them
     *
     * @return string Property names concatenated with | to make a regex rule
     */
    protected function getPropertyRuleString()
    {
        $names = array();
        foreach ($this->metaTagNames as $type) {
            foreach ($type as $tag) {
                $names[] = $tag;
            }
        }

        return trim(implode("|", $names), "|");
    }

    /**
     * Function curlGetContents
     *
     * Same as file_get_contents but using curl to avoid getting error 403
     * forbidden because the request doesn't have a valid user agent.
     * Redirects are followed, as file_get_contents does by default. When the
     * cURL extension is not available the call falls back to
     * file_get_contents.
     *
     * @param string $url Url to get the contents from using Curl
     *
     * @return string|boolean $output Contents obtained from the url or false
     * on failure
     */
    protected function curlGetContents($url)
    {
        if (!function_exists('curl_init')) {
            return @file_get_contents($url);
        }

        // create curl resource
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        // set url
        curl_setopt($ch, CURLOPT_URL, $url);

        // return the transfer as a string
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

        // follow a bounded number of redirects (http -> https, www, ...)
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_MAXREDIRS, 5);

        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13');

        // $output contains the output string (false on failure)
        $output = curl_exec($ch);

        // close curl resource to free up system resources; the call has no
        // effect since PHP 8.0, so it is only made on older versions
        if (PHP_VERSION_ID < 80000) {
            curl_close($ch);
        }

        return $output;
    }
}

For more information send a message to info at phpclasses dot org.