Парсер словарного запаса из истории SIM

Материал из Wiki - thelogin.ru
Перейти к: навигация, поиск
Писалось, в основном, в электричке, строго не судите
<?php
// Make UTF-8 the default encoding for all mb_* functions. The function call
// replaces the 'mbstring.internal_encoding' INI directive, which is deprecated.
mb_internal_encoding('UTF-8');
 
/**
 * Command-line front end: turns the script arguments into the settings for
 * SimHistoryParser, runs it and prints the resulting statistics.
 */
class SimHistoryParserApp
{
	/**
	 * Parses the argument vector and immediately runs the parser.
	 *
	 * Recognised arguments:
	 *  -incoming   passed to SimHistoryParser as its $incoming flag;
	 *  -n=N        number of words to print (decimal digits only, default 50);
	 *  -path=DIR   directory containing the history files.
	 * Any other argument (including a malformed -n) prints the usage line
	 * and terminates the script.
	 *
	 * @param array $arguments Argument vector; element 0 (the script name) is ignored.
	 */
	public function __construct($arguments)
	{
		$incoming = false;
		$n        = 50;
		$path     = $this->defaultPath();

		unset($arguments[0]);
		foreach ($arguments as $argument)
		{
			if ($argument === '-incoming')
			{
				$incoming = true;
			} elseif (preg_match('/^-n=(\d+)$/D', $argument, $match)) {
				// Only plain digits are accepted so that the limit is a real integer.
				$n        = (int) $match[1];
			} elseif (strpos($argument, '-path=') === 0) {
				$path     = substr($argument, 6);
			} else {
				$this->usage();
				exit;
			}
		}

		$SimHistoryParser = new SimHistoryParser($path, $n, $incoming);
		$SimHistoryParser->showStats();
	}

	/**
	 * Builds the default history directory from the HOME and USER
	 * environment variables.
	 *
	 * getenv() is used instead of $_ENV because $_ENV is left empty when the
	 * "variables_order" INI setting does not contain "E".
	 *
	 * @return string
	 */
	private function defaultPath()
	{
		return getenv('HOME') . '/.kde/share/apps/sim/' . getenv('USER') . '/history';
	}

	/**
	 * Prints the one-line usage message, showing the default history path.
	 */
	private function usage()
	{
		echo 'Использование: sim-history [-incoming] [-n=50] [-path=' . $this->defaultPath() . ']';
		echo "\n";
	}
}
 
/**
 * Counts how often each word occurs in the messages found in the history
 * files of one directory and prints the most frequent words.
 */
class SimHistoryParser
{
	private $incoming;
	private $n;
	private $replace = array();
	private $words   = array();

	/**
	 * Reads every history file in $basedir and builds the word counters.
	 *
	 * @param string $basedir  Directory containing the history files.
	 * @param int    $n        Maximum number of words showStats() prints;
	 *                         a value of 0 or less means "no limit".
	 * @param bool   $incoming Selects which message pattern getMyMessages() uses.
	 */
	public function __construct($basedir, $n, $incoming)
	{
		$this->n        = (int) $n;
		$this->incoming = $incoming;

		// Strip every 7-bit ASCII byte that is not a Latin letter (A-Z, a-z).
		// The list must stop at 127: bytes from 128 upwards are parts of
		// multi-byte UTF-8 characters (e.g. the Cyrillic letter U+0440 is the
		// byte pair D1 80), so removing byte 128 would corrupt such characters.
		foreach (array_merge(range(0, 64), range(91, 96), range(123, 127)) as $code)
		{
			$this->replace[] = chr($code);
		}

		foreach ($this->getAllHistoryFiles($basedir) as $historyFile)
		{
			foreach($this->getMyMessages($historyFile) as $message)
			{
				foreach ($this->getMessageWords($message) as $word)
				{
					$word = $this->parseWord($word);
					if (!empty($word))
					{
						$this->registerWord($word);
					}
				}
			}
		}
	}

	/**
	 * Returns the contents of every regular file directly inside $basedir.
	 *
	 * Sub-directories and unreadable files are skipped. If the directory
	 * cannot be opened, opendir() emits its own warning and an empty list
	 * is returned.
	 *
	 * @param string $basedir
	 * @return array List of file contents (strings).
	 */
	private function getAllHistoryFiles($basedir)
	{
		$files = array();
		$profileDir = opendir($basedir);
		if ($profileDir === false)
		{
			return $files;
		}

		while (false !== ($history = readdir($profileDir)))
		{
			$historyPath = $basedir . '/' . $history;
			// is_file() also filters out the "." and ".." entries.
			if (!is_file($historyPath))
			{
				continue;
			}
			$contents = file_get_contents($historyPath);
			if ($contents !== false)
			{
				$files[] = $contents;
			}
		}
		closedir($profileDir);

		return $files;
	}

	/**
	 * Extracts the message texts from the contents of one history file.
	 *
	 * A message is a "[Message]" line followed by a 'Text="..."' line. With
	 * $incoming set, the next line must begin with "Flags=1"; otherwise the
	 * next line must begin with any character other than "F".
	 * NOTE(review): presumably a leading "Flags" line marks a received
	 * message - confirm against the history file format.
	 *
	 * @param string $historyText
	 * @return array Captured message texts (without the surrounding quotes).
	 */
	private function getMyMessages($historyText)
	{
		if ($this->incoming)
		{
			$regex = 'Flags=1';
		} else {
			$regex = '[^F]';
		}
		preg_match_all('|\[Message\]' . "\n" .
			       'Text="([^"]*)"'  . "\n" .
			       "{$regex}|Us", $historyText, $myMessages);
		return $myMessages[1];
	}

	/**
	 * Splits a message text into whitespace-separated tokens.
	 *
	 * The two-character sequence backslash + "n" is turned into a space
	 * first, so that words on both sides of it become separate tokens
	 * instead of being glued together when parseWord() strips non-letters.
	 *
	 * @param string $messageText
	 * @return array
	 */
	protected function getMessageWords($messageText)
	{
		$messageText = str_replace('\n', ' ', $messageText);
		return preg_split('/\s{1,}/', $messageText);
	}

	/**
	 * Normalises one token: removes every byte from the strip list built in
	 * the constructor and lower-cases the rest as UTF-8.
	 *
	 * @param string $word
	 * @return string Possibly empty when the token contained no letters.
	 */
	protected function parseWord($word)
	{
		$word = str_replace('\n',           ' ', $word);
		$word = str_replace($this->replace, '',  $word);
		// The encoding is passed explicitly so the result does not depend on INI settings.
		return mb_strtolower($word, 'UTF-8');
	}

	/**
	 * Increments the usage counter of $word, creating it on first use.
	 *
	 * @param string $word
	 */
	protected function registerWord($word)
	{
		if (isset($this->words[$word]))
		{
			$this->words[$word]++;
		} else {
			$this->words[$word] = 1;
		}
	}

	/**
	 * Prints "word => count" lines, most frequent word first, stopping
	 * after the configured number of lines. A limit of 0 or less never
	 * matches the line counter, so every word is printed.
	 */
	public function showStats()
	{
		arsort($this->words);
		$shown = 0;
		foreach ($this->words as $word => $timesUsed)
		{
			echo "$word => $timesUsed\n";
			$shown++;
			if ($shown === $this->n)
			{
				break;
			}
		}
	}
}
 
// Script entry point: constructing the application object parses $argv,
// runs the history parser and prints the statistics.
$SimHistoryParserApp = new SimHistoryParserApp($argv);
?>
Личные инструменты
Пространства имён
Варианты
Действия
Навигация
Инструменты
Тэги