#! /usr/local/bin/php -d include_path=/usr/local/www/portal.postgresql.org/pear/lib -q
<?php
/**
 * Script for postgresql.org static mirror generation.
 *
 * Crawls a web server starting from its index page and stores every fetched
 * page below an output directory. With the 'check' option nothing is written;
 * instead the output directory is scanned for files the crawl did not visit.
 */

// NOTE(review): the opening tag and the require_once lines below were missing
// from the reviewed copy of this file. They are reconstructed from the PEAR
// classes the code references -- confirm against the packages installed in
// the include_path given on the first line.
require_once 'PEAR.php';
require_once 'HTTP/Client.php';
require_once 'HTTP/Request/Listener.php';
require_once 'Net/URL.php';
require_once 'Console/Getargs.php';
require_once 'Log.php';

/**
 * Site spider that saves every page it is notified about.
 *
 * The object attaches itself as a listener to an HTTP_Client; for every
 * successful response it saves the body and, for HTML pages, queues the
 * links found in it.
 *
 * The class is written in PHP 4 style (var properties, class-named
 * constructor, "=& new"); those constructs are deliberately kept as they are.
 *
 * @version $Revision: 1.18 $
 */
class HTTP_Mirror extends HTTP_Request_Listener
{
    /**
     * Options passed to the constructor
     * @var array
     */
    var $_options = array();

    /**
     * URLs waiting to be fetched
     * @var array
     */
    var $_queue = array();

    /**
     * URLs already fetched, plus the alias URLs recorded by _savePage()
     * @var array
     */
    var $_visited = array();

    /**
     * URL currently being fetched
     * @var string
     */
    var $_url = '';

    /**
     * Log object
     * @var object
     */
    var $_log = null;

    /**
     * Class constructor, sets the options
     *
     * Allowed options are:
     *   'prefix'    prefix for URLs to follow
     *   'index'     name of index file
     *   'outputDir' directory to save files to
     *   'reject'    array of regex patterns for URLs not to follow.
     *               patterns are automatically anchored at 'prefix' in the
     *               start, and # is used as pattern terminator.
     *   'check'     if set the output dir is checked for stale files and
     *               nothing is updated.
     *
     * @access public
     * @param  object  Log
     * @param  array   options
     */
    function HTTP_Mirror(&$log, $options)
    {
        $this->HTTP_Request_Listener();
        $this->_log =& $log;
        $this->_options = array_merge($this->_options, $options);
    }

    /**
     * Crawls the site
     *
     * Fetches URLs from the queue until it is empty; the queue is refilled
     * from update() while pages are being fetched. When the 'check' option
     * is set, the output directory is scanned for stale files afterwards.
     *
     * @access public
     * @param  string  starting URL
     */
    function crawl($start)
    {
        $this->_queue = array($start);

        $client =& new HTTP_Client();
        $client->enableHistory(false);
        $client->attach($this);

        $this->_log->log('Mirroring started', PEAR_LOG_INFO);
        $startedAt = time();

        // Pages are counted separately: _savePage() also pushes alias URLs
        // onto $this->_visited, so counting that array gives wrong results.
        $urls = 0;
        while (!empty($this->_queue)) {
            $this->_url = array_shift($this->_queue);
            array_push($this->_visited, $this->_url);
            $this->_log->log('Saving page ' . $this->_url, PEAR_LOG_DEBUG);
            $client->get($this->_url);
            $urls++;
        }

        $this->_log->log(
            'Mirroring finished. ' . $urls . ' page(s) saved, ' .
            (time() - $startedAt) . ' second(s) spent',
            PEAR_LOG_INFO
        );

        // empty() so that a missing 'check' option simply means "no check"
        if (!empty($this->_options['check'])) {
            $this->check_directory($this->_options['outputDir']);
        }
    }

    /**
     * Reports files in the output directory that the crawl did not visit
     *
     * Walks $dir recursively. Bookkeeping files (.htaccess, sync_timestamp,
     * web_sync_timestamp) and files matching a 'reject' pattern are skipped;
     * every other file whose URL is not in the visited list is logged.
     * Terminates the script if the directory cannot be opened.
     *
     * @access public
     * @param  string  directory to check, at or below 'outputDir'
     */
    function check_directory($dir)
    {
        $d = opendir($dir) or die("Failed to open directory $dir\n");

        // Path of $dir relative to the output directory, '' for the root.
        // The cast covers substr() returning false on older PHP versions.
        $sd = (string) substr($dir, strlen($this->_options['outputDir']) + 1);

        // Compare against false explicitly: an entry named "0" is falsy and
        // would otherwise end the loop early.
        while (false !== ($f = readdir($d))) {
            if ('.' === $f || '..' === $f) {
                continue;
            }
            if (is_dir($dir . '/' . $f)) {
                $this->check_directory($dir . '/' . $f);
                continue;
            }
            if ('.htaccess' === $f || 'sync_timestamp' === $f || 'web_sync_timestamp' === $f) {
                continue;
            }
            if (!empty($this->_options['reject'])) {
                foreach ($this->_options['reject'] as $pattern) {
                    if (preg_match('#^' . $pattern . '#', $sd . '/' . $f)) {
                        continue 2;
                    }
                }
            }

            $fn = $this->_options['prefix'] . '/' . ('' === $sd ? '' : $sd . '/') . $f;
            if (!in_array($fn, $this->_visited)) {
                $this->_log->log('File ' . $sd . '/' . $f . ' no longer on website!');
            }
        }
        closedir($d);
    }

    /**
     * Listener callback, receives events from the HTTP client
     *
     * On 'httpSuccess' the response body is saved and, for text/html
     * responses, its links are queued. On 'httpError' the response code is
     * logged together with the current URL. Other events are ignored.
     *
     * @access public
     * @param  object  object sending the event; must provide currentResponse()
     * @param  string  event name
     * @param  mixed   additional event data (not used)
     */
    function update(&$subject, $event, $data)
    {
        switch ($event) {
            case 'httpSuccess':
                $response =& $subject->currentResponse();
                $this->_savePage($response['body']);
                $headers = isset($response['headers']) ? $response['headers'] : array();
                // Header names and media types are case-insensitive
                $type = $this->_getHeader($headers, 'Content-Type');
                if (0 === strncasecmp($type, 'text/html', 9)) {
                    $this->_addToQueue($this->_extractLinks($response['body']));
                }
                break;

            case 'httpError':
                $response =& $subject->currentResponse();
                // log the broken link
                $this->_log->log(
                    'HTTP error ' . $response['code'] . ' at page ' . $this->_url,
                    PEAR_LOG_ERR
                );
                break;
        } // switch
    }

    /**
     * Looks up a header value ignoring the case of the header name
     *
     * @access private
     * @param  array   headers, name => value
     * @param  string  header name
     * @return string  header value, empty string if the header is absent
     */
    function _getHeader($headers, $name)
    {
        if (!is_array($headers)) {
            return '';
        }
        $wanted = strtolower($name);
        foreach ($headers as $key => $value) {
            if (strtolower((string) $key) === $wanted) {
                return is_array($value) ? (string) reset($value) : (string) $value;
            }
        }
        return '';
    }

    /**
     * Extracts the links from the page
     *
     * Collects href attributes of a/link/area tags, src attributes of
     * img/script/input/frame/iframe tags and background attributes of
     * body/table/td/th tags.
     *
     * @access private
     * @param  string  page content
     * @return array   list of links (expanded to fully-qualified URLs)
     */
    function _extractLinks($body)
    {
        // search for <base> tag, if found this should serve as a base for URLs
        $base = '';
        if (preg_match("/<base[^>]+href\s*=\s*([\"'])?(?(1)(.*?)\\1|([^\s>]+))/is", $body, $matches)) {
            // group 2 holds a quoted value, group 3 an unquoted one
            $base = trim(empty($matches[3]) ? $matches[2] : $matches[3]);
        }
        if ('' === $base) {
            $base = $this->_url;
        }

        preg_match_all("/<(?:a|link|area)[^>]+href\s*=\s*([\"'])?(?(1)(.*?)\\1|([^\s>]+))/is", $body, $hrefMatches);
        preg_match_all("/<(?:img|script|input|i?frame)[^>]+src\s*=\s*([\"'])?(?(1)(.*?)\\1|([^\s>]+))/is", $body, $srcMatches);
        preg_match_all("/<(?:body|table|td|th)[^>]+background\s*=\s*([\"'])?(?(1)(.*?)\\1|([^\s>]+))/is", $body, $bgMatches);

        $candidates = array_map(
            'trim',
            array_merge(
                $hrefMatches[2], $hrefMatches[3],
                $srcMatches[2], $srcMatches[3],
                $bgMatches[2], $bgMatches[3]
            )
        );

        $links = array();
        foreach ($candidates as $link) {
            if (!empty($link)) {
                $links[] = $this->_expandLink($base, $link);
            }
        }
        return array_unique($links);
    }

    /**
     * Expands the link to the fully-qualified URL
     *
     * Fragments are removed. A path ending in '/' gets the 'index' option
     * appended. Links using a scheme other than http(s) are returned as is.
     *
     * @access private
     * @param  string  page containing the link
     * @param  string  link text
     * @return string  fully-qualified URL
     */
    function _expandLink($source, $link)
    {
        if (preg_match('!^https?://!i', $link)) {
            // absolute URI, nothing to expand, just remove fragment
            return preg_replace('/#.*$/', '', $link);
        }

        $parts = parse_url($link);
        // parse_url() gives false for seriously malformed input; such a link
        // cannot be resolved, so treat it like an empty one
        if (!is_array($parts)) {
            return $source;
        }
        // is this the link in some other protocol?
        if (!empty($parts['scheme'])) {
            return $link;
        }
        // empty link or link to an anchor on the same page
        if (0 == count($parts) || (1 == count($parts) && !empty($parts['fragment']))) {
            return $source;
        }

        $url =& new Net_URL($source);
        if (!empty($parts['path'])) {
            if ('/' == $parts['path'][0]) {
                // path is absolute on the server
                $url->path = Net_URL::resolvePath($parts['path']);
            } elseif ('/' == substr($url->path, -1)) {
                // source is a directory, append to it
                $url->path = Net_URL::resolvePath($url->path . $parts['path']);
            } else {
                // source is a file, resolve against its directory
                $dirName = dirname($url->path);
                $url->path = Net_URL::resolvePath(
                    (DIRECTORY_SEPARATOR == $dirName ? '' : $dirName) . '/' . $parts['path']
                );
            }
        }
        if ('/' == substr($url->path, -1)) {
            $url->path .= $this->_options['index'];
        }
        if (!empty($parts['query'])) {
            $url->addRawQueryString($parts['query']);
        } else {
            $url->querystring = array();
        }
        $url->anchor = '';

        return $url->getUrl();
    }

    /**
     * Adds the URLs to the queue
     *
     * Skips URLs that were already visited or queued, URLs that do not start
     * with the 'prefix' option and URLs matching one of the 'reject' patterns.
     *
     * @access private
     * @param  array   list of URLs to add
     */
    function _addToQueue($links = array())
    {
        $prefix = $this->_options['prefix'];
        // The prefix is literal text, not a pattern: quote it before it is
        // put into the regular expression (# is the delimiter used below)
        $quotedPrefix = preg_quote($prefix, '#');
        $reject = empty($this->_options['reject']) ? array() : $this->_options['reject'];

        // remove already visited / scheduled links
        $newLinks = array_diff($links, $this->_visited, $this->_queue);
        foreach ($newLinks as $link) {
            // reject external links, this will naturally take care of non-HTTP links also
            if (0 !== strpos($link, $prefix)) {
                continue;
            }
            // reject links by pattern
            foreach ($reject as $pattern) {
                if (preg_match('#^' . $quotedPrefix . '/' . $pattern . '#', $link)) {
                    continue 2;
                }
            }
            array_push($this->_queue, $link);
        }
    }

    /**
     * Saves the current page
     *
     * The file name is 'outputDir' plus the decoded URL path; directory URLs
     * are stored under the 'index' name, and a query string is appended as
     * "-<query>.html". Alias URLs that map to the same file are recorded as
     * visited so they are neither fetched again nor reported as stale.
     *
     * NOTE(review): the query string is copied into the file name verbatim,
     * so a query containing "/" or ".." affects the directory written to --
     * confirm that the mirrored server is trusted.
     *
     * @access private
     * @param  string  page contents
     */
    function _savePage($body)
    {
        $parts = parse_url($this->_url);
        // A URL without a path component (http://host) refers to the root
        $hasPath = isset($parts['path']);
        $path = Net_URL::resolvePath(urldecode($hasPath ? $parts['path'] : '/'));
        $file = $this->_options['outputDir'] . $path;

        if ('/' == substr($file, -1)) {
            $file .= $this->_options['index'];
            array_push(
                $this->_visited,
                $this->_url . ($hasPath ? '' : '/') . $this->_options['index']
            );
        }
        if (!empty($parts['query'])) {
            $file .= '-' . $parts['query'] . '.html';
        }

        // XXX: postgresql.org specific code
        if (true) {
            $fname = basename($file);
            // if file has no extension, append .html.en
            if (false === strpos($fname, '.') || !preg_match('/\\.[a-z-]+$/', $fname)) {
                $file .= '.html.en';
                array_push($this->_visited, $this->_url . '.html.en');
                array_push($this->_visited, $this->_url . '.html');
            // file has extension, but not language, append .en
            } elseif (!preg_match('/\\.[a-z]{2}(-[a-z]{2})?$/', $fname)) {
                $file .= '.en';
                array_push($this->_visited, $this->_url . '.en');
            }

            $this->_writeFile($file, $body);
            // if file has .en appended, save it without .en also
            if ('.en' == substr($file, -3)) {
                $this->_writeFile(substr($file, 0, -3), $body);
            }
            if ('.en' == substr($this->_url, -3)) {
                array_push($this->_visited, substr($this->_url, 0, -3));
            }
        } else {
        // /XXX
            $this->_writeFile($file, $body);
        }
    }

    /**
     * Writes a file
     *
     * Does nothing when the 'check' option is set. An existing file with
     * identical contents is left untouched. If the directory cannot be
     * created or the file cannot be written, the problem is logged with
     * PEAR_LOG_EMERG and the script terminates.
     *
     * @access private
     * @param  string  file name
     * @param  string  file contents
     */
    function _writeFile($fileName, $content)
    {
        // When just checking contents, never write the file.
        if (!empty($this->_options['check'])) {
            return;
        }

        $dirName = dirname($fileName);
        if (!is_dir($dirName) && !$this->_createDir($dirName)) {
            $this->_log->log('Cannot create output directory ' . $dirName, PEAR_LOG_EMERG);
            die();
        }

        // No changes in the file, no need to write it out. The comparison is
        // strict: "==" compares numeric-looking strings by value.
        if (is_file($fileName)) {
            $oldContent = @file_get_contents($fileName);
            if (false !== $oldContent && $content === $oldContent) {
                return;
            }
        }

        if (!($fp = @fopen($fileName, 'w'))) {
            $this->_log->log('Cannot save to file ' . $fileName, PEAR_LOG_EMERG);
            die();
        }
        $written = fwrite($fp, $content);
        fclose($fp);
        // A short write would leave a truncated page in the mirror
        if (false === $written || $written < strlen($content)) {
            $this->_log->log('Incomplete write to file ' . $fileName, PEAR_LOG_EMERG);
            die();
        }
    }

    /**
     * Recursively creates a directory
     *
     * @access private
     * @param  string  directory name
     * @return bool    whether the directory exists after the call
     */
    function _createDir($path)
    {
        if (is_dir($path)) {
            return true;
        }
        $parent = dirname($path);
        // dirname() no longer changes the path: nothing left to climb to
        if ($path == $parent) {
            return false;
        }
        if (!$this->_createDir($parent)) {
            return false;
        }
        // mkdir() also fails when something else created the directory in the
        // meantime, which is fine for our purposes
        return @mkdir($path, 0755) || is_dir($path);
    }
}

/**
 * PEAR error callback: prints the error message and user info, then exits
 *
 * @param object error object providing getMessage() and getUserInfo()
 */
function _foo($error)
{
    die($error->getMessage() . "\n" . $error->getUserInfo());
}

// Process command line arguments
$argsConfig = array(
    'log' => array(
        'short'   => 'l',
        'min'     => 1,
        'max'     => 1,
        'default' => './mirror.log',
        'desc'    => 'File to use for logging'
    ),
    'verbose' => array(
        'short' => 'v',
        'max'   => 0,
        'desc'  => 'Be verbose: output the progress to stdout'
    ),
    'reject' => array(
        'short' => 'r',
        'min'   => 1,
        'max'   => -1,
        'desc'  => 'Pattern(s) to reject the URLs'
    ),
    'output' => array(
        'short'   => 'o',
        'min'     => 1,
        'max'     => 1,
        'default' => '../static',
        'desc'    => 'Directory to save pages to'
    ),
    'server' => array(
        'short'   => 's',
        'min'     => 1,
        'max'     => 1,
        'default' => 'wwwmaster.postgresql.org',
        'desc'    => 'Server to mirror'
    ),
    'check' => array(
        'short' => 'c',
        'max'   => 0,
        'desc'  => 'Check for stale files in output directory'
    )
);

$args =& Console_Getargs::factory($argsConfig);
if (PEAR::isError($args)) {
    $header = "Script for postgresql.org static mirror generation\n" .
              "Usage: " . basename(__FILE__) . " [options]\n\n";
    if (CONSOLE_GETARGS_ERROR_USER == $args->getCode()) {
        echo Console_Getargs::getHelp($argsConfig, $header, $args->getMessage()) . "\n";
    } elseif (CONSOLE_GETARGS_HELP == $args->getCode()) {
        echo Console_Getargs::getHelp($argsConfig, $header) . "\n";
    }
    exit();
}

// Create log object to use
if (!$args->getValue('verbose')) {
    $log =& Log::factory('file', $args->getValue('log'), 'mirror', array(), PEAR_LOG_INFO);
} else {
    // Children are kept in variables first: addChild() is handed each one
    // as a variable rather than as a bare function result
    $log =& Log::factory('composite');
    $fileLog =& Log::factory('file', $args->getValue('log'), 'mirror', array(), PEAR_LOG_INFO);
    $consoleLog =& Log::factory('console', null, 'mirror');
    $log->addChild($fileLog);
    $log->addChild($consoleLog);
}

// Patterns to reject
$reject = $args->getValue('reject');
if (empty($reject)) {
    $reject = array();
} elseif (!is_array($reject)) {
    $reject = array($reject);
}

set_time_limit(0);
PEAR::setErrorHandling(PEAR_ERROR_CALLBACK, '_foo');

$spider =& new HTTP_Mirror($log, array(
    'outputDir' => $args->getValue('output'),
    'prefix'    => 'http://' . $args->getValue('server'),
    'index'     => 'index.html.en',
    'reject'    => array_merge(
        array('files/', 'layout/', 'redir\?', 'redir/', 'download/mirrors-ftp\?file'),
        $reject
    ),
    'check'     => $args->getValue('check')
));
$spider->crawl('http://' . $args->getValue('server') . '/index.html.en');
?>