Парсер словарного запаса из истории SIM
Материал из Wiki - thelogin.ru
<?php
/**
 * SIM history vocabulary counter.
 *
 * Scans every file in a SIM history directory, extracts message texts,
 * splits them into words and prints the most frequently used words.
 *
 * Usage: sim-history [-incoming] [-n=50] [-path=<history directory>]
 */

// All multibyte string handling in this script works on UTF-8 text.
// (The "mbstring.internal_encoding" INI setting is deprecated; the function is not.)
mb_internal_encoding('UTF-8');

/**
 * Command-line front end: parses the arguments, runs the parser and prints
 * the statistics.
 */
class SimHistoryParserApp
{
    /**
     * Parses the command line and prints the word statistics.
     *
     * Prints the usage text and terminates with exit status 1 when an
     * argument is not recognised or "-n=" is not a positive integer.
     *
     * @param array $arguments Raw argument vector; element 0 (script name) is ignored.
     */
    public function __construct($arguments)
    {
        $incoming = false;
        $n = 50;
        $path = $this->defaultHistoryPath();

        // Skip the script name. The cast keeps a missing $argv from breaking the loop.
        foreach (array_slice((array) $arguments, 1) as $argument) {
            if (strpos($argument, '-incoming') === 0) {
                $incoming = true;
            } elseif (strpos($argument, '-n=') === 0
                && preg_match('/^[1-9][0-9]*$/', (string) substr($argument, 3))
            ) {
                $n = (int) substr($argument, 3);
            } elseif (strpos($argument, '-path=') === 0) {
                $path = (string) substr($argument, 6);
            } else {
                $this->usage();
                exit(1);
            }
        }

        $SimHistoryParser = new SimHistoryParser($path, $n, $incoming);
        $SimHistoryParser->showStats();
    }

    /**
     * Builds the default history location: $HOME/.kde/share/apps/sim/$USER/history.
     *
     * getenv() is used instead of $_ENV because $_ENV is only populated when
     * the "variables_order" INI setting contains "E".
     *
     * @return string
     */
    private function defaultHistoryPath()
    {
        return (string) getenv('HOME') . '/.kde/share/apps/sim/' . (string) getenv('USER') . '/history';
    }

    /**
     * Prints the one-line usage text, including the default history path.
     */
    private function usage()
    {
        echo 'Использование: sim-history [-incoming] [-n=50] [-path=' . $this->defaultHistoryPath() . ']';
        echo "\n";
    }
}

/**
 * Counts how often each word occurs in the messages of a SIM history directory.
 */
class SimHistoryParser
{
    /** @var bool True to count messages followed by "Flags=1", false to count the others. */
    private $incoming;

    /** @var int Number of words printed by showStats(). */
    private $n;

    /** @var string[] Single-byte characters stripped from every word. */
    private $replace = array();

    /** @var array Map of word => number of occurrences. */
    private $words = array();

    /**
     * Reads all history files below $basedir and counts the words they contain.
     *
     * @param string     $basedir  Directory containing the history files.
     * @param int|string $n        How many words showStats() prints.
     * @param bool       $incoming Which messages to count, see getMyMessages().
     */
    public function __construct($basedir, $n, $incoming)
    {
        $this->n = (int) $n;
        $this->incoming = $incoming;

        // Strip every ASCII byte that is not a Latin letter: control characters,
        // space, digits and punctuation (0-64, 91-96, 123-127). The table must
        // stop at 127: bytes >= 128 are parts of multi-byte UTF-8 characters
        // (e.g. Cyrillic "р" is 0xD1 0x80) and removing them corrupts the text.
        $this->replace = array_map('chr', array_merge(range(0, 64), range(91, 96), range(123, 127)));

        foreach ($this->getAllHistoryFiles($basedir) as $historyFile) {
            foreach ($this->getMyMessages($historyFile) as $message) {
                foreach ($this->getMessageWords($message) as $word) {
                    $word = $this->parseWord($word);
                    if ($word !== '') {
                        $this->registerWord($word);
                    }
                }
            }
        }
    }

    /**
     * Returns the contents of every regular file directly inside $basedir.
     *
     * If the directory cannot be opened, a message is written to stderr and an
     * empty list is returned, so the statistics are simply empty.
     *
     * @param string $basedir
     * @return string[]
     */
    private function getAllHistoryFiles($basedir)
    {
        $profileDir = is_dir($basedir) ? opendir($basedir) : false;
        if ($profileDir === false) {
            file_put_contents('php://stderr', "Не удалось открыть каталог истории: {$basedir}\n");
            return array();
        }

        $files = array();
        while (false !== ($history = readdir($profileDir))) {
            $file = $basedir . '/' . $history;
            // Skips ".", ".." and any sub-directories.
            if (!is_file($file)) {
                continue;
            }
            $contents = file_get_contents($file);
            if ($contents !== false) {
                $files[] = $contents;
            }
        }
        closedir($profileDir);

        return $files;
    }

    /**
     * Extracts the message texts from the contents of one history file.
     *
     * A message is a "[Message]" line followed by a Text="..." line. In
     * incoming mode the next line must start with "Flags=1"; otherwise the
     * next line must start with any character other than "F".
     *
     * @param string $historyText
     * @return string[] The captured contents of the Text="..." values.
     */
    private function getMyMessages($historyText)
    {
        if ($this->incoming) {
            $regex = 'Flags=1';
        } else {
            $regex = '[^F]';
        }

        preg_match_all(
            '|\[Message\]' . "\n" . 'Text="([^"]*)"' . "\n" . "{$regex}|Us",
            $historyText,
            $myMessages
        );

        return $myMessages[1];
    }

    /**
     * Splits a message text into raw (not yet normalised) words.
     *
     * Besides real whitespace, the literal two-character sequence "\n"
     * (backslash + n) is treated as a separator, so that words on adjacent
     * lines are not glued together.
     *
     * @param string $messageText
     * @return string[]
     */
    protected function getMessageWords($messageText)
    {
        $messageText = str_replace('\n', ' ', $messageText);

        return preg_split('/\s+/', $messageText, -1, PREG_SPLIT_NO_EMPTY);
    }

    /**
     * Normalises one raw word: removes ASCII non-letters and lower-cases it.
     *
     * @param string $word
     * @return string Possibly empty when nothing but stripped characters was left.
     */
    protected function parseWord($word)
    {
        // Kept for callers that bypass getMessageWords(): without this the
        // backslash would be stripped and a stray "n" would stay in the word.
        $word = str_replace('\n', ' ', $word);
        $word = str_replace($this->replace, '', $word);

        return mb_strtolower($word, 'UTF-8');
    }

    /**
     * Increments the usage counter of $word.
     *
     * @param string $word
     */
    protected function registerWord($word)
    {
        if (isset($this->words[$word])) {
            $this->words[$word]++;
        } else {
            $this->words[$word] = 1;
        }
    }

    /**
     * Prints "word => count" lines, most frequent first, stopping after the
     * configured number of words. A limit below 1 never matches the counter,
     * so all words are printed in that case.
     */
    public function showStats()
    {
        arsort($this->words);

        $shown = 0;
        foreach ($this->words as $word => $timesUsed) {
            echo "$word => $timesUsed\n";
            if (++$shown === $this->n) {
                break;
            }
        }
    }
}

$SimHistoryParserApp = new SimHistoryParserApp($argv);