Парсер словарного запаса из истории Pidgin
Материал из Wiki - thelogin.ru
<?php

/**
 * Vocabulary statistics for Pidgin chat logs (command-line script).
 *
 * Recursively reads every file below a Pidgin log directory, extracts the
 * messages written by one nickname (or, with -incoming, by everybody else),
 * splits them into words and prints the most frequently used ones.
 */

// HOME is taken straight from the environment; no shell needs to be spawned
// for that. A missing variable becomes an empty string.
$_ENV['HOME'] = (string) getenv('HOME');

// Nickname, exactly as it is written in the logs, whose messages count as
// "my" messages.
$_ENV['USER'] = 'themylogin';

// The mbstring.internal_encoding ini option is deprecated; the function call
// sets the same thing.
if (function_exists('mb_internal_encoding')) {
    mb_internal_encoding('UTF-8');
}

/**
 * Command-line front end: parses the options and prints the statistics.
 */
class PidginHistoryParserApp
{
    /**
     * Parses the options, builds the word statistics and prints them.
     *
     * Recognised options: -incoming, -n=<count>, -path=<directory>.
     * Any other argument prints the usage text and terminates the script.
     *
     * @param array $arguments Raw argument vector; element 0 (the script
     *                         name) is ignored.
     */
    public function __construct($arguments)
    {
        $incoming = false;
        $n = 50;
        $path = $_ENV['HOME'] . '/.purple/logs';

        // A missing argument vector is treated as "no options given".
        if (!is_array($arguments)) {
            $arguments = array();
        }
        unset($arguments[0]);

        foreach ($arguments as $argument) {
            if (strpos($argument, '-incoming') === 0) {
                $incoming = true;
            } elseif (strpos($argument, '-n=') === 0) {
                // Always an integer, so the limit check in showStats() does
                // not depend on string/number comparison rules.
                $n = (int) substr($argument, 3);
            } elseif (strpos($argument, '-path=') === 0) {
                $path = (string) substr($argument, 6);
            } else {
                $this->usage();
                exit;
            }
        }

        $PidginHistoryParser = new PidginHistoryParser($path, $n, $incoming);
        $PidginHistoryParser->showStats();
    }

    /**
     * Prints a one-line usage summary to standard output.
     */
    private function usage()
    {
        echo 'Использование: pidgin-history [-incoming] [-n=50] [-path=' . $_ENV['HOME'] . '/.purple/logs]';
        echo "\n";
    }
}

/**
 * Collects word usage counts from all log files below a directory.
 */
class PidginHistoryParser
{
    /** @var bool true: count messages of everybody except $_ENV['USER']; false: only that nickname */
    private $incoming;

    /** @var int number of entries showStats() prints; values below 1 print everything */
    private $n;

    /** @var array single characters that are stripped from every word */
    private $replace = array();

    /** @var array word => number of occurrences */
    private $words = array();

    /**
     * Reads every log below $basedir and counts the words of the selected
     * messages. A directory that does not exist yields empty statistics.
     *
     * @param string     $basedir  Root directory of the logs.
     * @param int|string $n        How many of the most frequent words showStats() prints.
     * @param bool       $incoming Count other people's messages instead of the user's own.
     */
    public function __construct($basedir, $n, $incoming)
    {
        $this->n = (int) $n;
        $this->incoming = (bool) $incoming;
        $this->replace = str_split(" \r\n\t.,?!");

        foreach ($this->getAllHistoryFiles($basedir) as $historyFile) {
            foreach ($this->getMyMessages($historyFile) as $message) {
                foreach ($this->getMessageWords($message) as $word) {
                    $word = $this->parseWord($word);
                    // Compared against '' on purpose: empty() would also
                    // throw away the word "0".
                    if ($word !== '') {
                        $this->registerWord($word);
                    }
                }
            }
        }
    }

    /**
     * Returns the CONTENTS (not the names) of every file below $basedir,
     * descending into sub-directories.
     *
     * @param string $basedir Directory to scan.
     *
     * @return array List of file contents; empty when the directory is
     *               missing or cannot be opened.
     */
    private function getAllHistoryFiles($basedir)
    {
        $files = array();

        if (!is_dir($basedir)) {
            return $files;
        }

        $profileDir = opendir($basedir);
        if ($profileDir === false) {
            // Without this guard readdir() would be called with false.
            return $files;
        }

        while (($history = readdir($profileDir)) !== false) {
            if ($history === '.' || $history === '..') {
                continue;
            }

            $entryPath = $basedir . '/' . $history;
            if (is_dir($entryPath)) {
                foreach ($this->getAllHistoryFiles($entryPath) as $value) {
                    $files[] = $value;
                }
                continue;
            }

            $contents = file_get_contents($entryPath);
            if ($contents !== false) {
                // Unreadable files are skipped instead of being stored as false.
                $files[] = $contents;
            }
        }

        closedir($profileDir);

        return $files;
    }

    /**
     * Extracts the message texts of the selected side of the conversation.
     *
     * The log is cut at every "(…HH:MM:SS) " stamp that opens the text or
     * follows a newline; each piece is expected to look like
     * "nickname: text". Pieces without ": " or with an empty text are
     * ignored.
     *
     * @param string $historyText Contents of one log file.
     *
     * @return array List of message texts.
     */
    private function getMyMessages($historyText)
    {
        $myMessages = array();

        $chunks = preg_split("#(^\(|\n\()[^)]*[0-9]{2}:[0-9]{2}:[0-9]{2}\) #U", $historyText, -1);
        if ($chunks === false) {
            return $myMessages;
        }

        foreach ($chunks as $message) {
            $parts = explode(": ", $message, 2);
            if (count($parts) < 2 || $parts[1] === '') {
                continue;
            }
            list($nickName, $text) = $parts;

            // Own messages are wanted when $incoming is false, everybody
            // else's when it is true.
            $isMine = ($nickName === $_ENV['USER']);
            if ($isMine !== $this->incoming) {
                $myMessages[] = $text;
            }
        }

        return $myMessages;
    }

    /**
     * Splits a message into whitespace-separated tokens.
     *
     * @param string $messageText Text of one message.
     *
     * @return array Tokens; may contain empty strings at the edges.
     */
    protected function getMessageWords($messageText)
    {
        // UTF-8 mode keeps the split from ever landing inside a multi-byte
        // character.
        $tokens = preg_split('/\s+/u', $messageText);
        if ($tokens === false) {
            // Text that is not valid UTF-8 is split byte-wise instead.
            $tokens = preg_split('/\s{1,}/', $messageText);
        }

        return ($tokens === false) ? array() : $tokens;
    }

    /**
     * Normalises one token: strips the characters listed in $this->replace
     * and lower-cases the rest.
     *
     * @param string $word Raw token.
     *
     * @return string Normalised word; may be empty.
     */
    protected function parseWord($word)
    {
        // Single-quoted on purpose to keep the original behaviour: this is
        // the two-character sequence backslash + n, not a line feed. The
        // inserted space is removed again by the next replacement.
        $word = str_replace('\n', ' ', $word);
        $word = str_replace($this->replace, '', $word);

        if (function_exists('mb_strtolower')) {
            return mb_strtolower($word, 'UTF-8');
        }

        // Without mbstring only ASCII letters are reliably lower-cased.
        return strtolower($word);
    }

    /**
     * Increments the usage counter of a word.
     *
     * @param string $word Normalised word.
     */
    protected function registerWord($word)
    {
        if (isset($this->words[$word])) {
            $this->words[$word]++;
        } else {
            $this->words[$word] = 1;
        }
    }

    /**
     * Prints the most frequent words, one "word => count" line each,
     * most frequent first. Stops after $this->n lines; a limit below 1
     * prints every word.
     */
    public function showStats()
    {
        arsort($this->words);

        $printed = 0;
        foreach ($this->words as $word => $timesUsed) {
            echo "$word => $timesUsed\n";
            $printed++;
            if ($printed === $this->n) {
                break;
            }
        }
    }
}

$PidginHistoryParserApp = new PidginHistoryParserApp(isset($argv) ? $argv : array());
?>