tokenizer.phps

#!/usr/bin/php
<?php
// Reads every regular file directly inside the directory named by the first
// command-line argument, splits each file's contents into tokens on a fixed
// set of delimiter characters, and prints every distinct token once, one per
// line, in the order in which it was first encountered.
//
// Usage: tokenizer.php <directory>
//
// Exits with status 1 (message on stderr) when the argument is missing or the
// directory cannot be opened. Files that cannot be read are reported on
// stderr and skipped.

// Whole files are loaded into memory at once, so allow for large inputs.
ini_set("memory_limit", "600M");

// Characters that separate tokens, in strtok() format.
// NOTE(review): tab and carriage return are not in this list, so they remain
// inside tokens - confirm whether that is intended.
$tokenlist = " \\/.\"#',?=&!:-|<>()[]@~\n";

if (!isset($argv[1]) || $argv[1] === "") {
	fwrite(STDERR, "Usage: " . (isset($argv[0]) ? $argv[0] : "tokenizer.php") . " <directory>\n");
	exit(1);
}

// File names are appended directly to $dir below, so make sure it ends with a
// path separator; an argument that already has one is used unchanged.
$dir = $argv[1];
$last = substr($dir, -1);
if ($last !== "/" && $last !== DIRECTORY_SEPARATOR) {
	$dir .= DIRECTORY_SEPARATOR;
}

$handle = opendir($dir);
if ($handle === false) {
	fwrite(STDERR, "tokenizer: cannot open directory: " . $dir . "\n");
	exit(1);
}

// Set of distinct tokens, stored as array keys so that duplicates collapse.
$keys = array();
while (false !== ($file = readdir($handle))) {
	$path = $dir . $file;
	// Skips ".", "..", subdirectories and any other entry is_file() rejects.
	if (!is_file($path)) {
		continue;
	}

	$str = file_get_contents($path);
	if ($str === false) {
		fwrite(STDERR, "tokenizer: cannot read file: " . $path . "\n");
		continue;
	}

	// strtok() skips empty parts, so runs of delimiters produce no tokens.
	// Compare against false explicitly: a token such as "0" is falsy.
	for ($token = strtok($str, $tokenlist); $token !== false; $token = strtok($tokenlist)) {
		$keys[$token] = true;
	}
}
closedir($handle);

foreach (array_keys($keys) as $key) {
	echo $key . "\n";
}
?>