PHP Classes
elePHPant
Icontem

File: abbr/getOrgAcro.php

Recommend this page to a friend!
  Classes of Jill Lingoff  >  Sweeper  >  abbr/getOrgAcro.php  >  Download  
File: abbr/getOrgAcro.php
Role: Auxiliary script
Content type: text/plain
Description: Auxiliary script
Class: Sweeper
Clean HTML to remove unwanted tags and attributes
Author: By
Last change:
Date: 8 months ago
Size: 5,768 bytes
 

 

Contents

Class file image Download
<?php

// This script was used to fetch Canadian government organization names and their acronyms,
// as well as contact information looked up with those acronyms, from the government electronic directory services.

// English pass: download the organization index, pair each organization's acronym
// with its display name, and save one "ACRONYM Name" line per organization.
$OrganizationListPage = file_get_contents("http://direct.srv.gc.ca/cgi-bin/direct500/");
$OrganizationArray = findOrgsInURLs($OrganizationListPage);
$OrganizationNameArray = findOrgNamesInURLs($OrganizationListPage);

// Start from an empty string so the first `.=` does not read an undefined variable.
$ToWrite = "";
foreach ($OrganizationArray as $index => $acro) {
    // Keep the text before the first hyphen (presumably the English half of an
    // "ENG-FRA" acronym pair). A value without a hyphen is used as-is instead of
    // reading a capture group that does not exist.
    if (preg_match("/([^\-]*)-/", $acro, $matches)) {
        $match = $matches[1];
    } else {
        $match = $acro;
    }
    // The acronym and name lists come from two different patterns, so a name is
    // not guaranteed to exist at every index.
    $name = isset($OrganizationNameArray[$index]) ? $OrganizationNameArray[$index] : "";
    $ToWrite .= $match . " " . $name . "\r\n";
}
$file = "eng/GOC/acronyms.txt";
// file_put_contents() truncates and writes like fopen('w') + fwrite() + fclose(), but
// on failure it only raises a warning instead of handing `false` to fwrite()/fclose().
file_put_contents($file, $ToWrite);

// French pass: same procedure as the English one, against the French index page.
$FraOrganizationListPage = file_get_contents("http://direct.srv.gc.ca/cgi-bin/direct500/XFo%3dGC%2cc%3dCA");
$FraOrganizationArray = FrafindOrgsInURLs($FraOrganizationListPage);
$FraOrganizationNameArray = FrafindOrgNamesInURLs($FraOrganizationListPage);

// Start from an empty string so the first `.=` does not read an undefined variable.
$FraToWrite = "";
foreach ($FraOrganizationArray as $Fraindex => $Fraacro) {
    // Keep the text after the first hyphen (presumably the French half of an
    // "ENG-FRA" acronym pair). A value without a hyphen is used as-is instead of
    // reading a capture group that does not exist.
    if (preg_match("/-([^\-]*)/", $Fraacro, $Framatches)) {
        $Framatch = $Framatches[1];
    } else {
        $Framatch = $Fraacro;
    }
    // The acronym and name lists come from two different patterns, so a name is
    // not guaranteed to exist at every index.
    $Franame = isset($FraOrganizationNameArray[$Fraindex]) ? $FraOrganizationNameArray[$Fraindex] : "";
    $FraToWrite .= $Framatch . " " . $Franame . "\r\n";
}
$Frafile = "fra/GDC/acronyms.txt";
// file_put_contents() truncates and writes like fopen('w') + fwrite() + fclose(), but
// on failure it only raises a warning instead of handing `false` to fwrite()/fclose().
file_put_contents($Frafile, $FraToWrite);

// Stop the script here with exit status 0. While this call is in place, none of the
// top-level statements below it (the contact-information scrape) are executed.
exit(
0);

// Contact-information stage: query the title search page of every organization
// and gather all the links found on those pages into one flat list.
$searchPrefix = "http://direct.srv.gc.ca/cgi-bin/direct500/SEou%3d";
$searchSuffix = "%2co%3dGC%2cc%3dCA?SV=web&SF=Title&ST=contains&x=31&y=20";

$URLArray = array();
foreach ($OrganizationArray as $OrgAcro) {
    $PageWithPeople = file_get_contents($searchPrefix . $OrgAcro . $searchSuffix);
    $linksOnPage = findURLs($PageWithPeople);
    $URLArray = array_merge($URLArray, $linksOnPage);
}

// Extraction rules for one downloaded detail page, listed in output-column order.
// Each key is a PCRE pattern and each value is the preg_replace() template that keeps
// only the capture group holding the wanted text ('$3' and '${3}' are equivalent ways
// of writing a group reference). FindStuff() walks this array in order, so the order
// here must stay in step with the header row built further down.
$rxpArray = array(
   
// Name: the rest of the line after <h2>, up to the following "<a".
   
'/(<h2>)([\s]{0,10})([^\r\n]*)([\s]{0,10})(<a)/is' => '$3',
   
// Title: the text after the <div class=text> opener, up to the double <br> that
// precedes the "title of person" marker.
   
'/(<!-- Display detailed information -->)([\s]{0,10})(<div class="*text"*>)([\s]{0,10})([^<]*)(<br>)(<br>)([\s]{0,15})(<!-- title of person -->)/is' => '$5',
   
// Organization 1: the text between the "title of person" marker and the <br> that
// precedes the "top level OU" marker.
   
'/(<!-- title of person -->)([\s]{0,10})([\w ,\-\(\)]*)([\s]{0,10})(<br>)([\s]{0,10})(<!-- top level OU -->)/is' => '${3}',
   
// Organization 2: the text between the "top level OU" marker and the <br> that
// precedes the "immediate OU" marker.
// NOTE(review): the `*` after the second whitespace group looks unintentional - confirm.
   
'/(<!-- top level OU -->)([\s]{0,10})([\w ,\-\(\)]*)([\s]{0,10})*(<br>)([\s]{0,10})(<!-- immediate OU -->)/is' => '$3',
   
// Address: the text between the first two <br> tags after the address marker.
// NOTE(review): "Contry" is part of the pattern; presumably it mirrors the marker
// comment in the page itself - verify before correcting the spelling.
   
'/(<!-- Address - PO Box - Mail stop - City - Province - Contry - Postal code -->)([\s]{0,10})(<br>)([^<]*)(<br>)/is' => '$4',
   
// City, Province: the text between the two <br> tags that follow "ITEM=[]$-->".
   
'/(ITEM=\[\]\$-->)([\s]{0,10})(<br>)([\w ,\-\(\)]*)([\s]{0,10})(<br>)/is' => '$4',
   
// Country: the word enclosed by two <br> tags, one text run before the telephone marker.
   
'/(<br>)([\w]*)(<br>)([\s]{0,10})([\w \-\(\)]*)([\s]{0,10})(<!-- Telephone - Alternate telephone - Secure telephone - Fax - Secure Fax - TDD -->)/is' => '$2',
   
// Postal Code: the text between the last <br> and the telephone marker.
   
'/(<br>)([\s]{0,10})([\w ]*)([\s]{0,10})(<!-- Telephone - Alternate telephone - Secure telephone - Fax - Secure Fax - TDD -->)/is' => '$3',
   
// Telephone 1: the <dd> value of the "Telephone:" entry that opens the <dl> list.
   
'/(<!-- Telephone - Alternate telephone - Secure telephone - Fax - Secure Fax - TDD -->)([\s]{0,10})(<dl>)([\s]{0,10})(<dt>Telephone:<\/dt><dd>)([^<]*)(<\/dd>)/is' => '$6',
   
// Telephone 2: the <dd> value located immediately before the "Fax:" entry.
   
'/(<dd>)([\w\(\)\- ]*)(<\/dd>)([\s]{0,10})(<dt>Fax:<\/dt><dd>)([^<]*)(<\/dd>)([\s]{0,10})(<\/dl>)([\s]{0,10})(<!-- X400 address -->)/is' => '$2',
   
// Fax: the <dd> value of the "Fax:" entry that closes the <dl> list.
   
'/(<dt>Fax:<\/dt><dd>)([^<]*)(<\/dd>)([\s]{0,10})(<\/dl>)([\s]{0,10})(<!-- X400 address -->)/is' => '$2',
    );
   
// Header row of the tab-separated spreadsheet. The separator is written as the "\t"
// escape rather than a literal tab character, so it cannot be silently turned into a
// space by an editor or a copy/paste and break the column layout.
$columnNames = array(
    "Name",
    "Title",
    "Organization 1",
    "Organization 2",
    "Address",
    "City, Province",
    "Country",
    "Postal Code",
    "Telephone 1",
    "Telephone 2",
    "Fax",
);
$record = implode("\t", $columnNames) . "\r\n";

// Download every collected page, extract one record from each and append it to the
// spreadsheet text, then write the result to disk.
foreach ($URLArray as $file) {
    $fileContents = file_get_contents($file);
    if (false === $fileContents) {
        // The download failed (PHP has already raised a warning); there is nothing
        // to extract, so do not add a blank row for this URL.
        continue;
    }

    $WhatToAdd = FindStuff($fileContents, $rxpArray);
    // FindStuff() emits one separator per column plus a line break even when no
    // pattern matched, so an empty record is recognised by trimming all whitespace
    // rather than by comparing against a single separator.
    if ('' !== trim($WhatToAdd)) {
        $record .= $WhatToAdd;
    }
}

WriteFile("GEDS-record.txt", $record);

/**
 * Writes a string to a file, replacing any previous contents.
 *
 * @param string $strTargetx Path of the file to create or overwrite.
 * @param string $tpx        Data to write.
 * @return bool True when the whole string was written, false otherwise.
 */
function WriteFile($strTargetx, $tpx) {
    // permission must be modified so that this file can be written to.
    $handle = fopen($strTargetx, 'w');
    if (false === $handle) {
        // fopen() has already raised a warning; passing `false` on to fwrite() and
        // fclose() would only add further errors (a TypeError on PHP 8).
        return false;
    }

    $bytesWritten = fwrite($handle, $tpx);
    fclose($handle);

    return $bytesWritten === strlen((string) $tpx);
}

/**
 * Builds one tab-separated record by applying every extraction rule to a page.
 *
 * For each rule the first match of the pattern is located and reduced to the text
 * selected by the rule's replacement template. A rule that does not match
 * contributes an empty field, so every record has the same number of columns.
 *
 * @param string $strToFindOn Text to extract from.
 * @param array  $rxpArray    Map of PCRE pattern => preg_replace() replacement template.
 * @return string One field plus a tab per rule, in array order, terminated by "\r\n".
 */
function FindStuff($strToFindOn, $rxpArray) {
    $newRecordToAppend = "";
    foreach ($rxpArray as $rxp => $replacement) {
        // Only touch $matches[0] when the pattern really matched; otherwise the
        // field is empty and no undefined-index read takes place.
        if (1 === preg_match($rxp, $strToFindOn, $matches)) {
            $field = ReplaceStuff($matches[0], $rxp, $replacement);
        } else {
            $field = "";
        }
        // The separator is an explicit "\t" escape (the output is a tab-separated
        // spreadsheet) so it cannot degrade into a space when the file is edited.
        $newRecordToAppend .= $field . "\t";
    }

    return $newRecordToAppend . "\r\n";
}

/**
 * Collects the href value of every `<li><a href="...">` link in the given markup.
 *
 * @param string $strToFindOn Markup to scan.
 * @return array The href values, in document order (empty when nothing matches).
 */
function FindURLs($strToFindOn) {
    $linkPattern = "/<li><a\shref=\"([^\"]*)\"/is";
    $captures = array();
    preg_match_all($linkPattern, $strToFindOn, $captures, PREG_PATTERN_ORDER);

    // Group 1 holds the quoted href value.
    return $captures[1];
}

/**
 * Extracts the organization identifier embedded in each English ("XEou") directory
 * link of the given markup.
 *
 * @param string $strToFindOn Markup to scan.
 * @return array The text found between "XEou%3d" and "%2co%3dGC%2cc%3dCA" in each
 *               matching link, in document order.
 */
function FindOrgsInURLs($strToFindOn) {
    $orgLinkPattern = "/<li><a\shref=\"http:\/\/direct\.srv\.gc\.ca\/cgi\-bin\/direct500\/XEou%3d([^\"]*)%2co%3dGC%2cc%3dCA\"/is";
    $captures = array();
    preg_match_all($orgLinkPattern, $strToFindOn, $captures, PREG_PATTERN_ORDER);

    // Group 1 is the identifier part of the URL.
    return $captures[1];
}

/**
 * Extracts the organization identifier embedded in each French ("XFou") directory
 * link of the given markup.
 *
 * @param string $strToFindOn Markup to scan.
 * @return array The text found between "XFou%3d" and "%2co%3dGC%2cc%3dCA" in each
 *               matching link, in document order.
 */
function FraFindOrgsInURLs($strToFindOn) {
    $orgLinkPattern = "/<li><a\shref=\"http:\/\/direct\.srv\.gc\.ca\/cgi\-bin\/direct500\/XFou%3d([^\"]*)%2co%3dGC%2cc%3dCA\"/is";
    $captures = array();
    preg_match_all($orgLinkPattern, $strToFindOn, $captures, PREG_PATTERN_ORDER);

    // Group 1 is the identifier part of the URL.
    return $captures[1];
}

/**
 * Extracts the link text of each English ("XEou") directory link in the given markup.
 *
 * @param string $strToFindOn Markup to scan.
 * @return array The text that follows each matching opening tag, up to the next "<",
 *               in document order.
 */
function FindOrgNamesInURLs($strToFindOn) {
    $namedLinkPattern = "/<li><a\shref=\"http:\/\/direct\.srv\.gc\.ca\/cgi\-bin\/direct500\/XEou%3d([^\"]*)%2co%3dGC%2cc%3dCA\">([^<]*)/is";
    $captures = array();
    preg_match_all($namedLinkPattern, $strToFindOn, $captures, PREG_PATTERN_ORDER);

    // Group 2 is the link text; group 1 (the identifier) is not needed here.
    return $captures[2];
}

/**
 * Extracts the link text of each French ("XFou") directory link in the given markup.
 *
 * @param string $strToFindOn Markup to scan.
 * @return array The text that follows each matching opening tag, up to the next "<",
 *               in document order.
 */
function FraFindOrgNamesInURLs($strToFindOn) {
    $namedLinkPattern = "/<li><a\shref=\"http:\/\/direct\.srv\.gc\.ca\/cgi\-bin\/direct500\/XFou%3d([^\"]*)%2co%3dGC%2cc%3dCA\">([^<]*)/is";
    $captures = array();
    preg_match_all($namedLinkPattern, $strToFindOn, $captures, PREG_PATTERN_ORDER);

    // Group 2 is the link text; group 1 (the identifier) is not needed here.
    return $captures[2];
}

/**
 * Applies a regular-expression replacement to a string.
 *
 * @param string $strToReplaceOn Text to transform.
 * @param string $rxp            PCRE pattern, delimiters included.
 * @param string $replacement    Replacement template (may reference capture groups).
 * @return string|null The result of preg_replace() for these arguments.
 */
function ReplaceStuff($strToReplaceOn, $rxp, $replacement) {
    return preg_replace($rxp, $replacement, $strToReplaceOn);
}

?>