#!/usr/bin/php
<?php
// PHP Regex Spider v. 2.0
// Copyright 2004 Frank Mitchell. All rights reserved.
// http://thiefsystems.org/ccs/phpregexspider
echo "\n";

// ---------------------------------------------------------------------------
// Configuration
// ---------------------------------------------------------------------------

// URL the crawl starts from.
$start = 'http://thiefsystems.org/ccs/';

// PCRE pattern run against every fetched page. The text captured by the
// first group is what ends up in the list of search results.
$search = '~<em>(.*?)<\/em>~';

// true:  keep the query string ("?a=b") on extracted links.
// false: strip it before the link is queued.
$follow_queries = true;

// true: rewrite 'http://www.' to 'http://' on extracted links.
$convert_www = true;

// true: run each page through html_entity_decode() before it is searched.
// The original author warns that enabling this may make the spider
// seg. fault on pages with malformed HTML.
$convert_html = false;

// File extensions and URL schemes that are never followed. The order of the
// entries is not significant to the lookups; extensions come first and the
// schemes are on the last line.
$dont_follow = array(
    'jpg', 'gif', 'png', 'ico',
    'zip', 'rar', 'tar', 'gz',
    'c|', 'c', 'pl', 'py', 'js', 'jar', 'reg', 'orig', 'exe', 'java', 'class',
    'css', 'xml', 'txt', 'dvi', 'ps', 'lot', 'doc', 'ppt', 'pdf', 'lit',
    'mp3', 'wav', 'ra', 'pm', 'mpg', 'mpeg', 'mso', 'psd', 'swf', 'img',
    'vhdl', 'dat', 'cpp', 'cls', 'tex', 'clq',
    'mailto', 'javascript', 'news', 'feed', 'file'
);
// Build information about the site we're going to search.
//
// parse_url() only returns false for seriously malformed input. For a string
// such as 'example.org/page' it returns an array that has a 'path' but no
// 'scheme' or 'host', so both parts are checked explicitly; otherwise the
// crawl would start with $b_scheme and $b_host undefined.
$url = parse_url($start);
if(is_array($url) && isset($url['scheme'], $url['host']))
{
    // Scheme and host of the site being crawled; used later to decide
    // whether an extracted link is internal.
    $b_scheme = $url['scheme'];
    $b_host = $url['host'];
}
else
{
    echo("\nError!\n");
    echo('Description: Unable to parse starting URL. ');
    echo("Please enter a different URL to start from.\n");
    echo("Starting URL: " .$start. "\n\n");
    // Non-zero status so that calling shells and scripts can see the failure.
    exit(1);
}
// ---------------------------------------------------------------------------
// Crawl
// ---------------------------------------------------------------------------

/**
 * Collapse "." and ".." segments of an absolute URL path (RFC 3986, 5.2.4).
 *
 * @param string $path Path starting with "/".
 * @return string The path with every dot segment resolved. ".." never climbs
 *                above the root.
 */
function spider_remove_dot_segments($path)
{
    $segments = explode('/', $path);
    $last = count($segments) - 1;
    $out = array();
    foreach($segments as $i => $segment)
    {
        if($segment === '.' || $segment === '..')
        {
            // $out[0] is the empty string in front of the leading slash and
            // must survive, hence "> 1".
            if($segment === '..' && count($out) > 1)
            {
                array_pop($out);
            }
            // A trailing dot segment names a directory: keep the slash.
            if($i === $last)
            {
                $out[] = '';
            }
            continue;
        }
        $out[] = $segment;
    }
    return implode('/', $out);
}

/**
 * Turn an href found on a page into an absolute URL.
 *
 * @param string $link      The href, non-empty, with any fragment removed.
 * @param array  $parts     Result of parse_url($link).
 * @param string $scheme    Scheme of the page the href was found on.
 * @param string $authority "host" or "host:port" of that page.
 * @param string $page_path Path of that page: '' or a string starting with '/'.
 * @return string Absolute URL.
 */
function spider_resolve_link($link, $parts, $scheme, $authority, $page_path)
{
    // Already absolute: nothing to resolve.
    if(isset($parts['scheme']))
    {
        return $link;
    }
    // Network-path reference (//host/path): only the scheme is inherited.
    if(strpos($link, '//') === 0)
    {
        return $scheme.':'.$link;
    }
    // Split "path?query" so dot segments are only resolved in the path.
    $cut = strcspn($link, '?');
    $path = (string) substr($link, 0, $cut);
    $query = (string) substr($link, $cut);
    if($path === '')
    {
        // Query-only reference: same document, different query.
        $path = ($page_path === '') ? '/' : $page_path;
    }
    elseif($path[0] !== '/')
    {
        // Relative path: resolve against the page's directory, which is
        // everything up to the last slash of the page path. (dirname() is
        // not usable here: dirname('/ccs/') is '/', not '/ccs'.)
        $slash = strrpos($page_path, '/');
        $dir = ($slash === false) ? '' : (string) substr($page_path, 0, $slash);
        $path = $dir.'/'.$path;
    }
    return $scheme.'://'.$authority.spider_remove_dot_segments($path).$query;
}

// Initialize our array of links. Each key is a URL, each value its state:
//  0 = found but not fetched yet, 1 = fetched, -1 = could not be retrieved.
$links = array($start => 0);
// Links in the order they were found; walked front to back so that finding
// the next unvisited link does not require rescanning $links.
$queue = array($start);
// Initialize our array of search results ($gold_seen is used to keep the
// list free of duplicates).
$gold = array();
$gold_seen = array();

// Host that links must belong to (exactly, or as a sub-domain) to be
// followed. When 'www.' is being stripped from links, strip it here too so
// converted and unconverted links are judged alike.
$site_host = strtolower($b_host);
if($convert_www && strpos($site_host, 'www.') === 0)
{
    $site_host = substr($site_host, 4);
}
$site_suffix = '.'.$site_host;

// Keep crawling until we run out of links. count() is re-evaluated on every
// pass because the loop body appends to $queue.
for($next = 0; $next < count($queue); $next++)
{
    $p_link = $queue[$next];
    // Mark this link as having been seen.
    $links[$p_link] = 1;
    // Get the contents of the link we're currently looking at.
    // Remove the @ symbol if you want to see all warnings for pages that
    // could not be retrieved.
    $contents = @file_get_contents($p_link);
    if($contents === false)
    {
        // Mark this link as being unretrievable. An empty page is not a
        // failure, hence the strict comparison above.
        $links[$p_link] = -1;
        continue;
    }
    // Convert any HTML characters we find, including quotes.
    if($convert_html)
    {
        $contents = html_entity_decode($contents, ENT_QUOTES);
    }
    // What link are we following?
    echo('Following link: '.$p_link."\n");

    // Build information about the link we're currently looking at, falling
    // back to the start URL's scheme and host if a part is missing.
    $page = parse_url($p_link);
    if(!is_array($page))
    {
        $page = array();
    }
    $p_scheme = isset($page['scheme']) ? $page['scheme'] : $b_scheme;
    $p_host = isset($page['host']) ? $page['host'] : $b_host;
    if(isset($page['port']))
    {
        $p_host .= ':'.$page['port'];
    }
    $p_path = isset($page['path']) ? $page['path'] : '';

    // Extract all the search matches from the current page and put the new
    // ones into our pot of gold.
    preg_match_all($search, $contents, $search_results);
    // Index 1 only exists when $search has a capture group.
    $matches = isset($search_results[1]) ? $search_results[1] : array();
    foreach($matches as $result)
    {
        if(!isset($gold_seen[$result]))
        {
            $gold_seen[$result] = true;
            $gold[] = $result;
        }
    }

    // Extract the links from the current page.
    preg_match_all('~href *= *(\'|")(.*?)\1~i', $contents, $link_results);
    foreach($link_results[2] as $c_link)
    {
        // Decode the link in case it's been encoded, then trim whitespace.
        // NOTE(review): urldecode() also turns '+' into a space and '%23'
        // into '#'; kept as in the original - confirm this is wanted.
        $c_link = trim(urldecode($c_link));
        // Remove the fragment; it never changes which document is fetched.
        $c_link = (string) substr($c_link, 0, strcspn($c_link, '#'));
        // Nothing left, or a NUL byte that file_get_contents() would reject.
        if($c_link === '' || strpos($c_link, "\0") !== false)
        {
            continue;
        }
        // If we can't parse the URL, don't continue.
        $url = parse_url($c_link);
        if($url === false)
        {
            continue;
        }
        // Only http and https can be fetched and searched; also honour any
        // scheme listed in $dont_follow.
        if(isset($url['scheme']))
        {
            $c_scheme = strtolower($url['scheme']);
            if(($c_scheme !== 'http' && $c_scheme !== 'https')
                || in_array($c_scheme, $dont_follow, true))
            {
                continue;
            }
        }
        // If this link is external, we don't want to follow it. The host
        // must be the site host or end in ".<site host>"; a plain substring
        // test would also accept hosts like "<site host>.example.com".
        if(isset($url['host']))
        {
            $c_host = strtolower($url['host']);
            if($c_host !== $site_host
                && substr($c_host, -strlen($site_suffix)) !== $site_suffix)
            {
                continue;
            }
        }
        // Skip links to files on our don't follow list. The extension is
        // taken from the path only, so dots in the host name or in the
        // query string are not mistaken for one.
        $c_ext = '';
        if(isset($url['path']))
        {
            $c_ext = strtolower(pathinfo($url['path'], PATHINFO_EXTENSION));
        }
        if($c_ext !== '' && in_array($c_ext, $dont_follow, true))
        {
            continue;
        }
        // Remove queries from the end of a link.
        if(!$follow_queries)
        {
            $c_link = (string) substr($c_link, 0, strcspn($c_link, '?'));
            if($c_link === '')
            {
                continue;
            }
        }
        // Our link's made it this far, so make it absolute.
        $c_link = spider_resolve_link($c_link, $url, $p_scheme, $p_host, $p_path);
        // Remove any www. stuff from the start of our link.
        if($convert_www)
        {
            $c_link = preg_replace('~^(https?://)www\.~i', '$1', $c_link);
        }
        // Add our extracted link to our list of links to look at.
        if(!isset($links[$c_link]))
        {
            $links[$c_link] = 0;
            $queue[] = $c_link;
        }
    }
}
// How many links did we end up finding vs. searching?
// $links maps each URL to 1 (fetched) or -1 (could not be retrieved) once
// the crawl is over. Either value may be absent from the tally - e.g. no 1
// at all when the starting URL itself was unreachable - so default to 0.
$tally = array_count_values($links);
$searched = isset($tally[1]) ? $tally[1] : 0;
$bad = isset($tally[-1]) ? $tally[-1] : 0;
$found = $searched + $bad;
echo("\nTotal number of links found was ".$found.".");
echo("\nTotal number of links searched was ".$searched.".");
echo("\nTotal number of bad links was ".$bad.".\n\n");
// What kind of search results did we get?
echo("\nSearch results: \n\n");
foreach($gold as $result)
{
    echo($result. "\n");
}
echo("\nTotal number of search results found was ".count($gold).".\n\n");
?>