#! /usr/local/bin/php -d include_path=/usr/local/www/portal.postgresql.org/pear/lib -q
<?php
/**
 * Script for postgresql.org static mirror generation.
 *
 * @version $Revision: 1.18 $
 */
/**
 * Crawls a web site starting from one URL and stores every fetched page under
 * an output directory.
 *
 * The object attaches itself as a listener to the HTTP_Client it creates in
 * crawl(); fetched pages reach it through update().
 */
class HTTP_Mirror extends HTTP_Request_Listener
{
    /**
     * Mirror options, see the constructor for the recognised keys.
     * Only the optional keys have defaults; 'prefix', 'index' and
     * 'outputDir' are expected to be supplied by the caller.
     * @var array
     */
    var $_options = array(
        'reject' => array(),
        'check'  => false
    );

    /**
     * URLs waiting to be fetched
     * @var array
     */
    var $_queue = array();

    /**
     * URLs already fetched, plus the alias URLs recorded by _savePage()
     * @var array
     */
    var $_visited = array();

    /**
     * URL currently being processed
     * @var string
     */
    var $_url = '';

    /**
     * Log object used for all reporting
     * @var object
     */
    var $_log = null;

    /**
     * Class constructor, sets the options
     *
     * Allowed options are:
     *   'prefix'    prefix for URLs to follow
     *   'index'     name of index file
     *   'outputDir' directory to save files to
     *   'reject'    array of regex patterns for URLs not to follow.
     *               Patterns are automatically anchored at 'prefix' in the
     *               start, and # is used as pattern terminator.
     *   'check'     if set the output dir is checked for stale files and
     *               nothing is updated.
     *
     * @access public
     * @param  object  Log
     * @param  array   options
     */
    function HTTP_Mirror(&$log, $options)
    {
        $this->HTTP_Request_Listener();
        $this->_log =& $log;
        $this->_options = array_merge($this->_options, $options);
    }

    /**
     * Crawls the site
     *
     * Fetches the starting URL and then every URL that ends up in the queue.
     * When the 'check' option is set, the output directory is scanned for
     * stale files afterwards.
     *
     * @access public
     * @param  string  starting URL
     */
    function crawl($start)
    {
        $this->_queue = array($start);

        // "=& new" is a parse error since PHP 7; plain assignment is enough.
        $client = new HTTP_Client();
        $client->enableHistory(false);
        $client->attach($this);

        $this->_log->log('Mirroring started', PEAR_LOG_INFO);
        $startedAt = time();
        $urls      = 0; // counting $this->_visited will give wrong results
        while (!empty($this->_queue)) {
            $this->_url = array_shift($this->_queue);
            array_push($this->_visited, $this->_url);
            $this->_log->log('Saving page ' . $this->_url, PEAR_LOG_DEBUG);
            $client->get($this->_url);
            $urls++;
        }
        $this->_log->log(
            'Mirroring finished. ' . $urls . ' page(s) saved, ' .
            (time() - $startedAt) . ' second(s) spent', PEAR_LOG_INFO
        );

        if (!empty($this->_options['check'])) {
            $this->check_directory($this->_options['outputDir']);
        }
    }

    /**
     * Reports files below a directory that were not seen during the crawl
     *
     * Recurses into subdirectories. '.htaccess', 'sync_timestamp' and
     * 'web_sync_timestamp' are skipped, and so are files whose path relative
     * to 'outputDir' matches one of the 'reject' patterns. Every other file
     * whose URL is not in the visited list is logged.
     *
     * @access public
     * @param  string  directory to scan
     */
    function check_directory($dir)
    {
        $d = opendir($dir) or die("Failed to open directory $dir\n");
        // Compare against false explicitly: an entry named "0" is falsy and
        // would otherwise end the loop early.
        while (false !== ($f = readdir($d))) {
            if ('.' === $f || '..' === $f) {
                continue;
            }
            if (is_dir($dir . '/' . $f)) {
                $this->check_directory($dir . '/' . $f);
                continue;
            }
            if ('.htaccess' === $f || 'sync_timestamp' === $f || 'web_sync_timestamp' === $f) {
                continue;
            }

            // Directory part relative to outputDir; empty for top-level files
            // (substr() returns false there on PHP < 8, hence the cast).
            $sd = (string) substr($dir, strlen($this->_options['outputDir']) + 1);
            // Relative path without a leading slash, so that '^'-anchored
            // reject patterns also apply to files directly in outputDir.
            $relative = ('' === $sd ? '' : $sd . '/') . $f;

            if (!empty($this->_options['reject'])) {
                foreach ($this->_options['reject'] as $pattern) {
                    if (preg_match('#^' . $pattern . '#', $relative)) {
                        continue 2;
                    }
                }
            }

            $fn = $this->_options['prefix'] . '/' . $relative;
            if (!in_array($fn, $this->_visited, true)) {
                $this->_log->log('File ' . $sd . '/' . $f . ' no longer on website!');
            }
        }
        closedir($d);
    }

    /**
     * Listener callback
     *
     * On 'httpSuccess' the response body is saved and, for text/html
     * responses, its links are added to the queue. On 'httpError' the
     * response code is logged together with the current URL. Other events
     * are ignored.
     *
     * @access public
     * @param  object  object that emitted the event; must provide currentResponse()
     * @param  string  event name
     * @param  mixed   additional event data (not used)
     */
    function update(&$subject, $event, $data)
    {
        switch ($event) {
        case 'httpSuccess':
            $response = $subject->currentResponse();
            $this->_savePage($response['body']);
            if ($this->_isHtml($response)) {
                $this->_addToQueue($this->_extractLinks($response['body']));
            }
            break;

        case 'httpError':
            // log the broken link
            $response = $subject->currentResponse();
            $this->_log->log('HTTP error ' . $response['code'] . ' at page ' . $this->_url, PEAR_LOG_ERR);
            break;
        } // switch
    }

    /**
     * Tells whether a response declares a text/html content type
     *
     * HTTP header names are case-insensitive, so the lookup does not depend
     * on how the key is spelled in the headers array; a missing header simply
     * yields false.
     *
     * @access private
     * @param  array   response array with a 'headers' element
     * @return bool
     */
    function _isHtml($response)
    {
        if (empty($response['headers']) || !is_array($response['headers'])) {
            return false;
        }
        $headers = array_change_key_case($response['headers'], CASE_LOWER);
        return isset($headers['content-type'])
               && is_string($headers['content-type'])
               && 0 == strncasecmp($headers['content-type'], 'text/html', 9);
    }

    /**
     * Extracts the links from the page
     *
     * Collects href attributes of a/link/area tags, src attributes of
     * img/script/input/frame/iframe tags and background attributes of
     * body/table/td/th tags.
     *
     * @access private
     * @param  string  page content
     * @return array   list of links (expanded to fully-qualified URLs)
     */
    function _extractLinks($body)
    {
        // Attribute value: quoted (captured in group 2) or unquoted, running
        // up to whitespace or '>' (captured in group 3).
        $value = '\s*=\s*(["\'])?(?(1)(.*?)\1|([^\s>]+))';

        // search for <base> tag, if found this should serve as a base for URLs
        $base = $this->_url;
        if (preg_match('/<base[^>]+href' . $value . '/is', $body, $matches)) {
            $href = trim(empty($matches[3]) ? $matches[2] : $matches[3]);
            if ('' !== $href) {
                $base = $href;
            }
        }

        $patterns = array(
            '/<(?:a|link|area)[^>]+href' . $value . '/is',
            '/<(?:img|script|input|i?frame)[^>]+src' . $value . '/is',
            '/<(?:body|table|td|th)[^>]+background' . $value . '/is'
        );
        $found = array();
        foreach ($patterns as $pattern) {
            if (preg_match_all($pattern, $body, $m)) {
                $found = array_merge($found, $m[2], $m[3]);
            }
        }

        $links = array();
        foreach ($found as $link) {
            $link = trim($link);
            if (!empty($link)) {
                $links[] = $this->_expandLink($base, $link);
            }
        }
        return array_unique($links);
    }

    /**
     * Expands the link to the fully-qualified URL
     *
     * Fragments are dropped. Links using a scheme other than http(s) are
     * returned untouched. The 'index' option is appended when the resulting
     * path ends in a slash.
     *
     * @access private
     * @param  string  page containing the link
     * @param  string  link text
     * @return string  fully-qualified URL
     */
    function _expandLink($source, $link)
    {
        if (preg_match('!^https?://!i', $link)) {
            // absolute URI, nothing to expand, just remove fragment
            return preg_replace('/#.*$/', '', $link);
        }
        if ('//' == substr($link, 0, 2)) {
            // Link of the form //host/path: it names its own host, so only
            // the scheme is taken from the source page.
            $sourceParts = parse_url($source);
            $scheme      = (is_array($sourceParts) && !empty($sourceParts['scheme']))
                           ? $sourceParts['scheme'] : 'http';
            return preg_replace('/#.*$/', '', $scheme . ':' . $link);
        }

        $parts = parse_url($link);
        if (!is_array($parts)) {
            // parse_url() gave up on a malformed link, treat it like an empty one
            return $source;
        }
        // is this the link in some other protocol?
        if (!empty($parts['scheme'])) {
            return $link;
        }
        // empty link or link to an anchor on the same page
        if (0 == count($parts) || (1 == count($parts) && !empty($parts['fragment']))) {
            return $source;
        }

        $url = new Net_URL($source);
        if (!empty($parts['path'])) {
            if ('/' == $parts['path'][0]) {
                // path is absolute on the source host
                $url->path = Net_URL::resolvePath($parts['path']);
            } elseif ('/' == substr($url->path, -1)) {
                // source is a directory, the link is relative to it
                $url->path = Net_URL::resolvePath($url->path . $parts['path']);
            } else {
                // source is a file, the link is relative to its directory
                $dirName   = dirname($url->path);
                $url->path = Net_URL::resolvePath((DIRECTORY_SEPARATOR == $dirName ? '' : $dirName) . '/' . $parts['path']);
            }
        }
        if ('/' == substr($url->path, -1)) {
            $url->path .= $this->_options['index'];
        }
        if (!empty($parts['query'])) {
            $url->addRawQueryString($parts['query']);
        } else {
            $url->querystring = array();
        }
        $url->anchor = '';
        return $url->getUrl();
    }

    /**
     * Adds the URLs to the queue
     *
     * URLs that were already visited or queued, that do not start with the
     * 'prefix' option, or that match a 'reject' pattern are skipped.
     *
     * @access private
     * @param  array   list of URLs to add
     */
    function _addToQueue($links = array())
    {
        $prefix       = $this->_options['prefix'];
        $prefixLength = strlen($prefix);
        // The prefix is literal text, not a pattern: quote it so that e.g.
        // the dots of the host name only match dots.
        $quotedPrefix = preg_quote($prefix, '#');
        $rejects      = empty($this->_options['reject']) ? array() : $this->_options['reject'];

        // remove already visited / scheduled links
        $newLinks = array_diff($links, $this->_visited, $this->_queue);
        foreach ($newLinks as $link) {
            // reject external links, this will naturally take care of non-HTTP links also
            if (0 !== strpos($link, $prefix)) {
                continue;
            }
            // The prefix has to end on a URL boundary, otherwise
            // "http://host.example.org" would pass for prefix "http://host".
            if ('/' != substr($prefix, -1)) {
                $next = (string) substr($link, $prefixLength, 1);
                if ('' !== $next && false === strpos('/?#', $next)) {
                    continue;
                }
            }
            // reject links by pattern
            foreach ($rejects as $pattern) {
                if (preg_match('#^' . $quotedPrefix . '/' . $pattern . '#', $link)) {
                    continue 2;
                }
            }
            array_push($this->_queue, $link);
        }
    }

    /**
     * Saves the current page
     *
     * The file name is derived from the path of the current URL; a query
     * string, if any, is appended as '-<query>.html'. The alternative URLs
     * under which the saved file is reachable are added to the visited list
     * so that they are neither fetched again nor reported as stale.
     *
     * @access private
     * @param  string  page contents
     */
    function _savePage($body)
    {
        $parts = parse_url($this->_url);
        if (!is_array($parts)) {
            $parts = array();
        }
        // A URL without a path ("http://host") designates the root document.
        $rawPath = empty($parts['path']) ? '/' : $parts['path'];
        $file    = $this->_options['outputDir'] . Net_URL::resolvePath(urldecode($rawPath));
        if ('/' == substr($file, -1)) {
            $file .= $this->_options['index'];
            array_push($this->_visited, $this->_url . $this->_options['index']);
        }
        if (!empty($parts['query'])) {
            // NOTE(review): the query string is used in the file name as is,
            // including any '/' it may contain.
            $file .= '-' . $parts['query'] . '.html';
        }

        // XXX: postgresql.org specific code
        $fname = basename($file);
        if (!preg_match('/\\.[a-z-]+$/', $fname)) {
            // if file has no extension, append .html.en
            $file .= '.html.en';
            array_push($this->_visited, $this->_url . '.html.en');
            array_push($this->_visited, $this->_url . '.html');
        } elseif (!preg_match('/\\.[a-z]{2}(-[a-z]{2})?$/', $fname)) {
            // file has extension, but not language, append .en
            $file .= '.en';
            array_push($this->_visited, $this->_url . '.en');
        }
        $this->_writeFile($file, $body);
        // if file has .en appended, save it without .en also
        if ('.en' == substr($file, -3)) {
            $this->_writeFile(substr($file, 0, -3), $body);
        }
        if ('.en' == substr($this->_url, -3)) {
            array_push($this->_visited, substr($this->_url, 0, -3));
        }
        // /XXX
    }

    /**
     * Writes a file
     *
     * Does nothing when the 'check' option is set. A file that already
     * holds exactly the given content is left untouched. Failures are logged
     * with PEAR_LOG_EMERG and terminate the script.
     *
     * @access private
     * @param  string  file name
     * @param  string  file contents
     */
    function _writeFile($fileName, $content)
    {
        // When just checking contents, never write the file.
        if (!empty($this->_options['check'])) {
            return;
        }

        $dirName = dirname($fileName);
        if (!$this->_createDir($dirName)) {
            $this->_log->log('Cannot create output directory ' . $dirName, PEAR_LOG_EMERG);
            die();
        }

        if (is_file($fileName)) {
            // Reads empty files correctly too, and the strict comparison
            // keeps numeric-looking contents ("1e3" vs "1000") distinct.
            $oldContent = @file_get_contents($fileName);
            if (false !== $oldContent && $oldContent === $content) {
                // No changes in the file, no need to write it out
                return;
            }
        }

        if (!($fp = @fopen($fileName, 'w'))) {
            $this->_log->log('Cannot save to file ' . $fileName, PEAR_LOG_EMERG);
            die();
        }
        $written = fwrite($fp, $content);
        fclose($fp);
        if (false === $written || $written < strlen($content)) {
            // an incomplete write would leave a truncated page behind
            $this->_log->log('Cannot save to file ' . $fileName, PEAR_LOG_EMERG);
            die();
        }
    }

    /**
     * Recursively creates a directory
     *
     * @access private
     * @param  string  directory name
     * @return bool    whether the directory exists after the call
     */
    function _createDir($path)
    {
        if (is_dir($path)) {
            return true;
        }
        if ($path == dirname($path)) {
            // reached the top of the path without finding an existing directory
            return false;
        }
        if (!$this->_createDir(dirname($path))) {
            return false;
        }
        // mkdir() also fails when the directory appeared in the meantime,
        // which is fine for our purposes.
        return @mkdir($path, 0755) || is_dir($path);
    }
}
/**
 * Error callback: prints the error's message and user info, separated by a
 * newline, and terminates the script.
 *
 * @param object error object providing getMessage() and getUserInfo()
 */
function _foo($error)
{
    $report = $error->getMessage() . "\n" . $error->getUserInfo();
    exit($report);
}
// Process command line arguments
$argsConfig = array(
    'log' => array(
        'short'   => 'l',
        'min'     => 1,
        'max'     => 1,
        'default' => './mirror.log',
        'desc'    => 'File to use for logging'
    ),
    'verbose' => array(
        'short'   => 'v',
        'max'     => 0,
        'desc'    => 'Be verbose: output the progress to stdout'
    ),
    'reject' => array(
        'short'   => 'r',
        'min'     => 1,
        'max'     => -1,
        'desc'    => 'Pattern(s) to reject the URLs'
    ),
    'output' => array(
        'short'   => 'o',
        'min'     => 1,
        'max'     => 1,
        'default' => '../static',
        'desc'    => 'Directory to save pages to'
    ),
    'server' => array(
        'short'   => 's',
        'min'     => 1,
        'max'     => 1,
        'default' => 'wwwmaster.postgresql.org',
        'desc'    => 'Server to mirror'
    ),
    'check' => array(
        'short'   => 'c',
        'max'     => 0,
        'desc'    => 'Check for stale files in output directory'
    )
);

$args =& Console_Getargs::factory($argsConfig);
if (PEAR::isError($args)) {
    $header = "Script for postgresql.org static mirror generation\n" .
              "Usage: " . basename(__FILE__) . " [options]\n\n";
    if (CONSOLE_GETARGS_ERROR_USER == $args->getCode()) {
        // bad arguments: show the help text together with the complaint
        echo Console_Getargs::getHelp($argsConfig, $header, $args->getMessage()) . "\n";
    } elseif (CONSOLE_GETARGS_HELP == $args->getCode()) {
        // help was requested explicitly
        echo Console_Getargs::getHelp($argsConfig, $header) . "\n";
    } else {
        // do not exit silently on an error we have no special handling for
        echo $args->getMessage() . "\n";
    }
    exit();
}

// Create log object to use: always a file log, plus a console log when
// running verbosely
if (!$args->getValue('verbose')) {
    $log =& Log::factory('file', $args->getValue('log'), 'mirror', array(), PEAR_LOG_INFO);
} else {
    $log =& Log::factory('composite');
    $log->addChild(Log::factory('file', $args->getValue('log'), 'mirror', array(), PEAR_LOG_INFO));
    $log->addChild(Log::factory('console', null, 'mirror'));
}

// Patterns to reject: normalise whatever was given on the command line to an array
$reject = $args->getValue('reject');
if (empty($reject)) {
    $reject = array();
} elseif (!is_array($reject)) {
    $reject = array($reject);
}

set_time_limit(0);
// From here on every PEAR error is handed to _foo()
PEAR::setErrorHandling(PEAR_ERROR_CALLBACK, '_foo');

// "=& new" is a parse error since PHP 7; plain assignment is enough.
$spider = new HTTP_Mirror($log, array(
    'outputDir' => $args->getValue('output'),
    'prefix'    => 'http://' . $args->getValue('server'),
    'index'     => 'index.html.en',
    // built-in reject patterns first, then the ones from the command line
    'reject'    => array_merge(array('files/', 'layout/', 'redir\?', 'redir/', 'download/mirrors-ftp\?file'), $reject),
    'check'     => $args->getValue('check')
));
$spider->crawl('http://' . $args->getValue('server') . '/index.html.en');
?>