PHP Classes
elePHPant
Icontem

File: parsers/test.php

Recommend this page to a friend!
  Classes of Raskin Veniamin  >  PHP Address Parser  >  parsers/test.php  >  Download  
File: parsers/test.php
Role: Auxiliary script
Content type: text/plain
Description: Auxiliary script
Class: PHP Address Parser
Extract address and other contact types from text
Author: By
Last change:
Date: 7 months ago
Size: 3,902 bytes
 

 

Contents

Class file image Download
<?php


/**
 * Download the body of $url with a browser-like GET request.
 *
 * The request carries an English Accept-language header and a desktop
 * Chrome User-Agent string.
 *
 * @param string $url Location to read.
 *
 * @return string|false Whatever file_get_contents() yields for the request.
 */
function getContent($url) {
    $headers  = "Accept-language: en\r\n";
    $headers .= "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36\r\n";

    $httpOptions = [
        "method" => "GET",
        "header" => $headers
    ];

    return file_get_contents(
        $url,
        false,
        stream_context_create([ "http" => $httpOptions ])
    );
}

/**
 * Collect link targets, phone numbers and e-mail addresses from <a href> tags.
 *
 * Every href found in $content is classified as follows:
 *  - values containing '#' are skipped;
 *  - values starting with "tel:" contribute the remainder to the phone list;
 *  - values starting with "mailto:" contribute the remainder to the e-mail list;
 *  - values that are neither "scheme://..." nor "//host/..." are treated as
 *    site-local paths and resolved against $site;
 *  - everything else is kept verbatim as a link.
 *
 * @param string $content HTML to scan.
 * @param string $site    Base URL used to resolve site-local paths.
 *
 * @return array Three arrays in the order [links, phones, emails]. Each one
 *               has been passed through array_unique(), so keys may have gaps.
 */
function parseLinks($content, $site = 'http://base.site') {
    $regexp = "<a\s[^>]*href=(\"??)([^\" >]*?)\\1[^>]*>(.*)<\/a>";
    $links  = [];
    $emails = [];
    $phones = [];

    if (preg_match_all("/$regexp/siU", $content, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $match) {
            // The pattern only consumes double quotes around the attribute
            // value; strip single quotes here so the prefix checks below see
            // the real start of the URL.
            $url = trim($match[2], '\'');

            if (strpos($url, '#') !== false) {
                continue;
            }

            // The scheme must be a prefix: the substr() offsets are only
            // meaningful when "tel:" / "mailto:" sit at position 0.
            if (strncasecmp($url, 'tel:', 4) === 0) {
                $phones[] = substr($url, 4);
                continue;
            }
            if (strncasecmp($url, 'mailto:', 7) === 0) {
                $emails[] = substr($url, 7);
                continue;
            }

            // Absolute means "scheme://" or protocol-relative "//". A path
            // that merely contains the text "http" is still site-local.
            if (!preg_match('~^(?:[a-z][a-z0-9+.\-]*:)?//~i', $url)) {
                $url = rtrim($site, '/') . '/' . ltrim($url, '/');
            }

            $links[] = $url;
        }
    }

    return array_map('array_unique', [ $links, $phones, $emails ]);
}

/**
 * Find the distinct e-mail addresses that occur in a piece of text.
 *
 * Matching is greedy on purpose: with the ungreedy (U) modifier the top-level
 * domain was cut at the first label that happened to end on a word boundary,
 * so "info@example.co.uk" came back as "info@example.co". The top-level
 * domain may be any run of two or more letters, so long TLDs such as
 * ".online" are accepted as well.
 *
 * @param string $content Plain text to scan.
 *
 * @return array List of unique addresses in order of first appearance.
 */
function parseEmails($content) {
    $regexp = '[A-Za-z\d._%+-]+@[A-Za-z\d.-]+\.[A-Za-z]{2,}\b';

    if (!preg_match_all("/$regexp/i", $content, $matches)) {
        return [];
    }

    // $matches[0] holds every full match; keep the first occurrence of each.
    return array_values(array_unique($matches[0]));
}


/**
 * Find phone numbers in a piece of text and return them in normalised form.
 *
 * Each pattern is applied in extended (x) mode, so the spaces inside the
 * patterns are layout only. Every full match is normalised with
 * normallyPhone() and duplicates are removed afterwards, so two differently
 * formatted spellings of the same number are reported once.
 *
 * Different patterns may match overlapping parts of one written number (with
 * and without its prefix); those yield distinct normalised strings and are
 * all returned.
 *
 * @param string $content Plain text to scan.
 *
 * @return array List of unique normalised phone strings, in order of first match.
 */
function parsePhone($content) {
    $regexps = [
        '(\+\d\s)? \(? (\d{3})? \)? (?(1) [\-\s] ) \d{3}-\d{4}',
        '(\+\d\s)? \(? (\d{3})? \)? (?(1) [\-\s] ) \d{3}-\d{2}-\d{2}',
        '(\d\s)? \(? (\d{3})? \)? (?(1) [\-\s] ) \d{3}-\d{2}-\d{2}',
        '\(? (\d{3})? \)? (?(1) [\-\s] ) \d{3}-\d{2}-\d{2}'
    ];
    $found = [];

    foreach ($regexps as $regexp) {
        if (preg_match_all("/$regexp/x", $content, $matches)) {
            foreach ($matches[0] as $raw) {
                $found[] = normallyPhone($raw);
            }
        }
    }

    // De-duplicate after normalising. Values stay strings; keying an array by
    // an all-digit string would silently turn it into an integer.
    return array_values(array_unique($found));
}

/**
 * Reduce a phone string to its digits and plus signs.
 *
 * @param string $phone Phone number as written in the text.
 *
 * @return string $phone with every character other than 0-9 and '+' removed.
 */
function normallyPhone($phone) {
    return preg_replace('/[^0-9\+]/', '', $phone);
}



/**
 * Assemble an address string from fragments of $content that start with one
 * of the marker strings listed in $names.
 *
 * A fragment is a marker, one or two whitespace characters, and then the
 * shortest run of text that ends at a comma or at two whitespace characters.
 * The fragment is stored with its marker included, under the marker's
 * position in $names; the stored fragments are then joined with ', ' in
 * that order. A later fragment for the same marker replaces an earlier one.
 *
 * NOTE(review): the markers appear here as runs of '?'. Presumably they were
 * non-ASCII words lost in an encoding conversion - TODO confirm against the
 * upstream file. As written, the joined pattern starts with "(?\." which
 * PCRE is unlikely to accept, so this function may not match anything.
 *
 * @param string $content Plain text to scan.
 *
 * @return string The collected fragments joined with ', ' ('' when none).
 */
function
parseAddress($content) {
   
// Markers are stored in regex form (dots pre-escaped) because they are
// spliced into the pattern below without further quoting.
$names = [
       
'?\.', '??????? ?????:', '??\.', '???????', '??\.', '?\.', '?\.', '??\.', '??'
   
];
   
// Group 1 is the marker; the U modifier makes ".+" stop at the first
// comma or double whitespace, and s lets "." cross line breaks.
$regexps = [
       
'('.join('|', $names).')\s{1,2}.+(\,|\s{2})'
   
];
   
$result = [];

    foreach(
$regexps as $regexp) {
        if(
preg_match_all("/$regexp/siU", $content, $matches, PREG_SET_ORDER)) {

            foreach (
$matches as $match) {
               
$val = trim($match[0]);
               
$key = trim($match[1]);
                if(
$val && $key) {
                    // Drop the comma that terminated the fragment, if any.
                    if(
substr($val, -1) === ',') {
                       
$val = substr($val, 0, -1);
                    }
                   
// Re-escape the dots so the matched marker can be looked up in the
// regex-form $names list.
// NOTE(review): array_search() returns false when nothing matches (for
// example when the i modifier matched a different letter case); false
// used as an array key lands on index 0 - confirm this is acceptable.
$index = array_search(str_replace('.', '\.', $key), $names);
                   
$result[$index] = $val;
                }
            }
        }
    }

   
// Order the fragments by the position of their marker in $names.
ksort($result);
    return
join(', ', $result);
}

/**
 * Download a page and dump the contact details found in its text.
 *
 * The page is stripped of HTML tags, then phones and the address are
 * var_dump()ed and every e-mail address is echoed on its own
 * "Email: ..." line.
 *
 * When the page cannot be downloaded a single failure line is printed and
 * nothing is parsed. Previously the false returned by getContent() was fed
 * straight into the parsers.
 *
 * @param string $site URL of the page to inspect.
 *
 * @return void
 */
function parse($site)
{
    $content = getContent($site);
    if ($content === false) {
        echo "Failed to load: ", $site, PHP_EOL;
        return;
    }

    $contentNoHtml = strip_tags($content);

    $emails = parseEmails($contentNoHtml);

    $phones = parsePhone($contentNoHtml);
    var_dump($phones);

    $address = parseAddress($contentNoHtml);
    var_dump($address);

    foreach ($emails as $email) {
        echo "Email: ", $email, PHP_EOL;
    }
}




// Script entry point: run the parsers against a sample contacts page.
// $url stays defined at file scope, exactly as before.
parse($url = 'http://www.okna98.ru/contacts/');