Frank Mitchell

PHP regex spider

#!/usr/bin/php
<?php
// PHP Regex Spider v. 2.0
// Copyright 2004 Frank Mitchell. All rights reserved.
// http://thiefsystems.org/ccs/phpregexspider
echo "\n";

// URL the crawl begins at. Its scheme and host define the site being searched.
$start = 'http://thiefsystems.org/ccs/';

// Pattern applied to every fetched page.
// The text captured by group 1 is what ends up in the search results.
$search = '~<em>(.*?)<\/em>~';

// true:  links keep their query string ("?a=b") when they are queued.
// false: the query string is removed before the link is queued.
$follow_queries = true;

// true: rewrite 'http://www.' to 'http://' in links before they are queued.
$convert_www = true;

// true: decode HTML entities (quotes included) in each page before searching.
// The original author warns that enabling this may make the spider
// seg. fault on pages with malformed HTML code.
$convert_html = false;

// File extensions and URL schemes that are never followed.
$dont_follow = array(
  'jpg', 'gif', 'png', 'ico', 'zip', 'rar', 'tar', 'gz', 'c|', 'c',
  'pl', 'py', 'js', 'jar', 'reg', 'orig', 'exe', 'java', 'class', 'css',
  'xml', 'txt', 'dvi', 'ps', 'lot', 'doc', 'ppt', 'pdf', 'lit', 'mp3',
  'wav', 'ra', 'pm', 'mpg', 'mpeg', 'mso', 'psd', 'swf', 'img', 'vhdl',
  'dat', 'cpp', 'cls', 'tex', 'clq', 'mailto', 'javascript', 'news', 'feed',
  'file'
);

// Build information about the site we're going to search.
// parse_url() returns false only for seriously malformed URLs. A string such
// as 'thiefsystems.org/ccs/' parses without error but yields no scheme or
// host, so both parts are checked here: the crawl needs them to recognise
// external links and to resolve relative ones.
$url = parse_url($start);
if(is_array($url) && isset($url['scheme'], $url['host']))
{
  $b_scheme = $url['scheme'];
  $b_host = $url['host'];
}
else
{
  echo("\nError!\n");
  echo('Description: Unable to parse starting URL. ');
  echo("Please enter a different URL to start from.\n");
  echo("Starting URL: " .$start. "\n\n");

  // Non-zero status so shell callers can tell the run failed.
  exit(1);
}

// Initialize our array of links.
// Keys are absolute URLs; values are the crawl state of that URL:
// 0 = waiting to be fetched, 1 = fetched, -1 = could not be retrieved.
$links = array($start => 0);

// Initialize our array of search results (unique matches, discovery order).
$gold = array();

// Keep crawling until we run out of links.
// The search is strict and its result is compared against false, so the loop
// ends only when no queued link is left and never because of a falsy key.
while(($p_link = array_search(0, $links, true)) !== false)
{

  // Mark this link as having been seen.
  $links[$p_link] = 1;

  // Get the contents of the link we're currently looking at.
  // If we fail this, there's no point in going further.
  // Remove the @ symbol if you want to see all warnings for pages that
  // could not be retrieved.
  // Only a real failure (false) counts as unretrievable; an empty page is
  // still a page that was fetched.
  $contents = @ file_get_contents($p_link);
  if($contents === false)
  {
    // Mark this link as being unretrievable.
    $links[$p_link] = -1;
    continue;
  }

  // Convert any HTML characters we find, including quotes.
  if($convert_html)
  {
    $contents = html_entity_decode($contents, ENT_QUOTES);
  }

  // What link are we following?
  echo('Following link: '.$p_link."\n");

  // Build information about the link we're currently looking at.
  // Missing parts fall back to the start URL's scheme and host.
  $url = parse_url($p_link);
  if(!is_array($url))
  {
    $url = array();
  }
  $p_scheme = isset($url['scheme']) ? $url['scheme'] : $b_scheme;
  $p_host = isset($url['host']) ? $url['host'] : $b_host;

  // scheme://host[:port] of the current page; relative links hang off this.
  $p_root = $p_scheme.'://'.$p_host;
  if(isset($url['port']))
  {
    $p_root .= ':'.$url['port'];
  }

  // Full path of the current page, always starting with a slash.
  $p_page = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
  if($p_page[0] != '/')
  {
    $p_page = '/'.$p_page;
  }

  // Directory of the current page without leading and trailing slashes.
  // Everything up to the last slash is the directory, so '/ccs/' yields
  // 'ccs' (dirname() would have yielded '/') and '/ccs/a.html' yields 'ccs'.
  $p_path = trim((string) substr($p_page, 0, (int) strrpos($p_page, '/')), '/');

  // Extract all the search matches from the current page.
  preg_match_all($search, $contents, $search_results);

  // Put the search results into our pot of gold, skipping duplicates.
  // Group 1 is absent when $search has no capture group or the match failed.
  if(isset($search_results[1]))
  {
    foreach($search_results[1] as $result)
    {
      if(!in_array($result, $gold, true))
      {
        $gold[] = $result;
      }
    }
  }

  // Extract the links from the current page.
  preg_match_all('~href *= *(\'|")(.*?)\1~i', $contents, $link_results);
  if(!isset($link_results[2]))
  {
    $link_results[2] = array();
  }

  // Loop through our extracted links and manipulate them.
  foreach($link_results[2] as $c_link)
  {

    // Decode the link in case it's been encoded, then trim any whitespace.
    // NOTE(review): decoding before parsing turns %23 / %3F into real '#' and
    // '?' characters; kept because the original did the same.
    $c_link = trim(urldecode($c_link));
    if($c_link === '')
    {
      continue;
    }

    // Build information about our extracted link.
    // If we can't parse the URL, don't continue.
    $url = parse_url($c_link);
    if(!is_array($url))
    {
      continue;
    }

    // Skip schemes on our don't follow list, and anything that is not
    // http(s): such links cannot be fetched as pages of this site.
    $c_scheme = isset($url['scheme']) ? strtolower($url['scheme']) : '';
    if($c_scheme !== '')
    {
      if(in_array($c_scheme, $dont_follow, true))
      {
        continue;
      }
      if($c_scheme !== 'http' && $c_scheme !== 'https')
      {
        continue;
      }
      if(!isset($url['host']))
      {
        continue;
      }
    }

    // If this link is external, we don't want to follow it.
    // The test is a substring match so subdomains of the start host count as
    // internal. It also covers scheme-relative links ("//host/path").
    if(isset($url['host'])
      && strpos(strtolower($url['host']), strtolower($b_host)) === false)
    {
      continue;
    }

    // Get the extension for this particular link. Only the last segment of
    // the path is inspected, so a host name's TLD or a dot inside the query
    // string is never mistaken for a file extension.
    $c_file = isset($url['path']) ? $url['path'] : '';
    $c_cut = strrpos($c_file, '/');
    if($c_cut !== false)
    {
      $c_file = (string) substr($c_file, $c_cut + 1);
    }
    $c_cut = strrpos($c_file, '.');
    $c_ext = ($c_cut === false) ? '' : strtolower((string) substr($c_file, $c_cut + 1));

    // Skip links to files on our don't follow list.
    if($c_ext !== '' && in_array($c_ext, $dont_follow, true))
    {
      continue;
    }

    // Remove fragments from the end of a link.
    $c_cut = strpos($c_link, '#');
    if($c_cut !== false)
    {
      $c_link = (string) substr($c_link, 0, $c_cut);
    }

    // Remove queries from the end of a link.
    if(!$follow_queries)
    {
      $c_cut = strpos($c_link, '?');
      if($c_cut !== false)
      {
        $c_link = (string) substr($c_link, 0, $c_cut);
      }
    }

    // Nothing left means the link pointed at the current page itself.
    if($c_link === '')
    {
      continue;
    }

    // Turn the link into an absolute URL.
    if($c_scheme !== '')
    {
      // Already absolute (http or https): keep it as written.
    }
    elseif(isset($url['host']))
    {
      // Scheme-relative link ("//host/path"): borrow the page's scheme.
      $c_link = $p_scheme.':'.$c_link;
    }
    else
    {
      // Relative link. Set the query aside so that only the path is resolved.
      $c_query = '';
      $c_cut = strpos($c_link, '?');
      if($c_cut !== false)
      {
        $c_query = substr($c_link, $c_cut);
        $c_link = (string) substr($c_link, 0, $c_cut);
      }

      if($c_link === '')
      {
        // Query only ("?a=b"): it applies to the current page.
        $c_full = $p_page;
      }
      elseif($c_link[0] == '/')
      {
        // The URL is of the form: /directory/file
        $c_full = $c_link;
      }
      else
      {
        // The URL is of the form: file, ./file or ../directory/file
        $c_full = '/'.($p_path === '' ? '' : $p_path.'/').$c_link;
      }

      // Resolve "." and ".." segment by segment. A dot segment in last
      // position leaves a trailing slash behind ("/a/b/.." becomes "/a/").
      $c_parts = explode('/', (string) substr($c_full, 1));
      $c_last = count($c_parts) - 1;
      $c_stack = array();
      foreach($c_parts as $c_index => $c_part)
      {
        if($c_part === '.' || $c_part === '..')
        {
          if($c_part === '..')
          {
            array_pop($c_stack);
          }
          if($c_index === $c_last)
          {
            $c_stack[] = '';
          }
        }
        else
        {
          $c_stack[] = $c_part;
        }
      }

      // Assemble the correct path for our link.
      $c_link = $p_root.'/'.implode('/', $c_stack).$c_query;
    }

    // Remove any www. stuff from the start of our link.
    // Anchored to the start so a URL embedded in a query is left alone.
    if($convert_www)
    {
      $c_link = preg_replace('~^(https?://)www\.~i', '\1', $c_link);
    }

    // Add our extracted link to our list of links to look at.
    if(!array_key_exists($c_link, $links))
    {
      $links[$c_link] = 0;
    }
  }
}

// How many links did we end up finding vs. searching?
// array_count_values() only reports states that actually occur in $links,
// so the fetched (1) and unretrievable (-1) tallies are defaulted to zero.
// Without this the report breaks when no page could be fetched at all.
$count = array_count_values($links) + array(1 => 0, -1 => 0);

// Every link found was either fetched or turned out to be bad.
$count[2] = $count[1] + $count[-1];

echo("\nTotal number of links found was ".$count[2].".");
echo("\nTotal number of links searched was ".$count[1].".");
echo("\nTotal number of bad links was ".$count[-1].".\n\n");

// What kind of search results did we get?
$count = count($gold);

echo("\nSearch results: \n\n");
foreach($gold as $result)
{
  echo($result. "\n");
}
echo("\nTotal number of search results found was ".$count.".\n\n");
?>