Master Summoner
- Joined
- Nov 14, 2004
- Messages
- 533
- Reaction score
- 1
Ok, first of all: It's still buggy.
This script's function is to crawl a website for links.
Only the 1st loop works as intended.
But then again, v0.04b speaks for itself.
Just thought that someone may find this useful.
Also, any improvements on it would be quite welcome ;]
And yeah, it's CLI. More fun this way.
Crawler.php
library.php
Enjoy.
[Fd]
This script's function is to crawl a website for links.
Only the 1st loop works as intended.
But then again, v0.04b speaks for itself.
Just thought that someone may find this useful.
Also, any improvements on it would be quite welcome ;]
And yeah, it's CLI. More fun this way.
Crawler.php
PHP:
<?php
/**
* Author: Otixa, The Scythe
* Date: 26 Jun 08
* Description: Web crawler, that crawls websites in search for links.
*/
// Crawler entry point (CLI): asks for a start URL, collects the links on that
// page, normalises them, then repeatedly follows them for as long as the user
// answers "y" at the prompt.
set_time_limit(0);
require('library.php');
//--------------------------
out('Scythe Link Crawler v0.04b'."\n");
out('Enter URL: ', 0);
$URL = prepareURL(in());
out('Loading '.$URL.' ...');
out('Please wait.'."\n");
//--------------------------
// Phase 1: raw href values found on the start page.
$links = getLinks($URL);
if (count($links) === 0) {
    out('No links found. Terminating.');
    die();
}
out('Phase 1 complete.');

// Phase 2: turn every raw href into a crawlable URL. fixURL() returns null
// for links that should be skipped (anchors, javascript: pseudo-links).
$clinks = array();
foreach ($links as $link) {
    // Resolve each link once instead of once for the test and once for the value.
    $fixed = fixURL(trim($link), $URL);
    if ($fixed != null) {
        $clinks[] = $fixed;
    }
}
if (count($clinks) === 0) {
    // Nothing left to trace; stop here rather than entering phase 3 with an
    // empty list.
    out('No usable links found. Terminating.');
    die();
}
out('Phase 2 complete. Links found: '.count($clinks));
out('Proceding to phase 3: Tracing ...');

// Phase 3+: each pass feeds the accumulated list back into loopLinks().
$phase = 3;
$all = $clinks;
while (true) {
    $all = loopLinks($all, $phase);
    out(count($all).' results found. Loop again? (y/n)');
    // Accept "y" and "Y"; any other answer ends the crawl.
    if (strtolower(in()) !== 'y') {
        break;
    }
    $phase++;
}
print_r($all);
?>
library.php
PHP:
<?php
/**
 * Print a string to standard output.
 *
 * @param string $string Text to print.
 * @param mixed  $nl     When truthy (the default) a "\n" is appended;
 *                       pass a falsy value to stay on the same line.
 */
function out($string, $nl = true) {
    $suffix = '';
    if ($nl) {
        $suffix = "\n";
    }
    echo $string.$suffix;
}
/**
 * Read a single line from standard input.
 *
 * A fresh php://stdin handle is opened for the read and closed again
 * before returning.
 *
 * @return string The line with leading and trailing whitespace removed.
 */
function in() {
    $handle = fopen('php://stdin', 'r');
    $line = fgets($handle);
    fclose($handle);
    $line = trim($line);
    return $line;
}
/**
 * Download a page and return the href value of every <a ...>...</a> on it.
 *
 * The href may be double-quoted, single-quoted or unquoted. Values are
 * returned exactly as written in the markup (no resolving, no trimming).
 *
 * If the page cannot be read at all, an error message is printed and the
 * script terminates with exit status 1. A page that is merely empty is not
 * an error and yields an empty array.
 *
 * @param string $URL Address (or local path) to read.
 * @return array List of raw href strings, in document order.
 */
function getLinks($URL) {
    // '@' keeps PHP's own warning off the console; the failure is reported
    // through out() below instead.
    $html = @file_get_contents($URL);
    if ($html === false) {
        out('Fatal error: Could not connect to URL ('.$URL.') - Terminating.');
        exit(1);
    }

    // (?| ... ) is a branch-reset group: whichever quoting style matches, the
    // quote lands in group 1 and the URL in group 2. The third alternative is
    // the unquoted form. Group 3 is the anchor text.
    $pattern = '/<a\s[^>]*?href=(?|(")([^" >]*)"|(\')([^\' >]*)\'|()([^" >]*))[^>]*?>(.*?)<\/a>/si';
    if (!preg_match_all($pattern, $html, $matches)) {
        // No anchors found, or the regex engine gave up on this input.
        return array();
    }
    return $matches[2];
}
/**
 * Normalise a user-entered address into a base URL.
 *
 * Surrounding whitespace is removed, "http://" is prepended when the input
 * has no http:// or https:// scheme, and a trailing "/" is guaranteed so that
 * relative links can be appended directly.
 *
 * @param string $URL Address as typed by the user.
 * @return string URL starting with a scheme and ending with "/".
 */
function prepareURL($URL) {
    $URL = trim($URL);
    // Leave an existing http:// or https:// scheme alone (case-insensitive);
    // only bare host names get the default scheme.
    if (!preg_match('#^https?://#i', $URL)) {
        $URL = 'http://'.$URL;
    }
    // substr() with a negative offset replaces the old $URL{...} offset
    // syntax, which current PHP versions no longer parse.
    if (substr($URL, -1) !== '/') {
        $URL .= '/';
    }
    return $URL;
}
/**
 * Turn a raw href into a URL the crawler can fetch, or null to skip it.
 *
 * Rules, in order:
 *  - empty href           -> the base page itself
 *  - "#..."               -> null (in-page anchor)
 *  - "javascript..."      -> null
 *  - "/path"              -> base + "path"
 *  - "./path"             -> base + "path"
 *  - "http..."            -> returned unchanged
 *  - anything else        -> base + href (includes "?query" and "../path")
 *
 * @param string $URL      Raw href value.
 * @param string $basePage URL of the page the link was found on; expected to
 *                         end with "/" because links are appended as-is.
 * @return string|null Resolved URL, or null when the link should be ignored.
 */
function fixURL($URL, $basePage) {
    $URL = (string) $URL;
    // Guard before touching offset 0: an empty href points at the current page.
    if ($URL === '') {
        return $basePage;
    }

    $first = $URL[0];
    if ($first === '#') {
        return null;
    }
    if (strncmp($URL, 'javascript', 10) === 0) {
        return null;
    }
    if ($first === '/') {
        // Drop the leading slash; the base already ends with one.
        return $basePage.substr($URL, 1);
    }
    if (strncmp($URL, './', 2) === 0) {
        // Only a literal "./" prefix is stripped, so "../x" and ".hidden"
        // keep all of their characters.
        return $basePage.substr($URL, 2);
    }
    if (strncmp($URL, 'http', 4) === 0) {
        return $URL;
    }
    // Every remaining relative form is appended to the base, including
    // names that merely start with "j" such as "jobs.html".
    //Meh, should make a func to parse the base and remove any ?s
    return $basePage.$URL;
}
/**
 * Crawl one level deeper: fetch every URL in $links and collect the links
 * found on those pages.
 *
 * The result starts with the (de-duplicated) input URLs, followed by every
 * newly discovered URL in the order it was first seen. A progress line
 * "Phase <phase>.<n> complete." is printed after each fetched page.
 *
 * Note: pages are fetched through getLinks(), which ends the whole script
 * when a URL cannot be read, so one dead link aborts the crawl.
 *
 * @param array $links URLs to visit.
 * @param int   $phase Phase number, used only in the progress output.
 * @return array Input URLs plus newly found URLs, without duplicates.
 */
function loopLinks($links, $phase) {
    if (!is_array($links)) {
        out('Fatal error: Links not in array.');
        exit(1);
    }

    $found = array();   // result list, in discovery order
    $seen = array();    // URL => true, for constant-time duplicate checks

    // Seed the result with every input URL (appending, not overwriting).
    foreach ($links as $link) {
        if (!isset($seen[$link])) {
            $seen[$link] = true;
            $found[] = $link;
        }
    }

    // Iterate over a snapshot so URLs discovered during this pass are not
    // crawled until the caller asks for another pass.
    $toVisit = $found;
    $cycle = 1;
    foreach ($toVisit as $current) {
        foreach (getLinks($current) as $raw) {
            $fixed = fixURL(trim($raw), $current);
            if ($fixed === null || $fixed === '') {
                continue;
            }
            // Duplicates are checked against everything collected so far.
            if (!isset($seen[$fixed])) {
                $seen[$fixed] = true;
                $found[] = $fixed;
            }
        }
        out('Phase '.$phase.'.'.$cycle.' complete.');
        $cycle++;
    }

    // Return only after every page has been visited.
    return $found;
}
?>
Enjoy.
[Fd]