Парсер словарного запаса из истории Pidgin

Материал из Wiki - thelogin.ru
Перейти к: навигация, поиск
<?php
// Take the home directory straight from the process environment rather than
// spawning a shell for it; an unset HOME becomes an empty string.
$_ENV['HOME'] = (string) getenv('HOME');
// Nickname that the parser compares against the sender of every log message.
$_ENV['USER'] = 'themylogin';
// The mbstring.internal_encoding INI directive is deprecated, so the encoding
// used by the mb_* functions is set through the dedicated function instead.
mb_internal_encoding('UTF-8');
 
/**
 * Command-line front end: interprets the script arguments, runs the history
 * parser and prints the resulting word statistics.
 */
class PidginHistoryParserApp
{
    /**
     * Parses the argument vector and immediately runs the parser.
     *
     * Recognised arguments:
     *   -incoming     count other people's messages instead of the user's own;
     *   -n=<count>    number of words to print (non-negative integer, default 50;
     *                 0 prints every word because the parser's counter never equals it);
     *   -path=<dir>   root directory of the logs (default: $_ENV['HOME']/.purple/logs).
     *
     * Any other argument, a -n value that is not a non-negative integer, or an
     * empty -path value prints the usage line and terminates the script with
     * exit status 1.
     *
     * @param array $arguments Raw argument vector; element 0 (the script name) is ignored.
     */
    public function __construct($arguments)
    {
        $incoming = false;
        $n        = 50;
        $path     = $_ENV['HOME'] . '/.purple/logs';

        unset($arguments[0]);
        foreach ($arguments as $argument)
        {
            if (strpos($argument, '-incoming') === 0)
            {
                $incoming = true;
            } elseif (strpos($argument, '-n=') === 0) {
                // substr() yields false (old PHP) or '' for a bare "-n=", hence the cast.
                $value = (string) substr($argument, 3);
                if (!preg_match('/^[0-9]+$/', $value))
                {
                    $this->usage();
                    exit(1);
                }
                $n = (int) $value;
            } elseif (strpos($argument, '-path=') === 0) {
                $value = (string) substr($argument, 6);
                if ($value === '')
                {
                    $this->usage();
                    exit(1);
                }
                $path = $value;
            } else {
                $this->usage();
                exit(1);
            }
        }

        $PidginHistoryParser = new PidginHistoryParser($path, $n, $incoming);
        $PidginHistoryParser->showStats();
    }

    /**
     * Prints the one-line usage summary, including the default log path,
     * followed by a newline.
     */
    private function usage()
    {
        echo 'Использование: pidgin-history [-incoming] [-n=50] [-path=' . $_ENV['HOME'] . '/.purple/logs]';
        echo "\n";
    }
}
 
/**
 * Builds word-frequency statistics from plain-text chat logs found under a
 * directory tree.
 *
 * The whole tree is scanned in the constructor; showStats() then prints the
 * most frequently used words. The nickname in $_ENV['USER'] decides which
 * messages count as the user's own.
 */
class PidginHistoryParser
{
    /** @var bool True to count messages from everyone except $_ENV['USER'], false to count only that user's. */
    private $incoming;
    /** @var int Maximum number of words printed by showStats(); zero or negative prints all of them. */
    private $n;
    /** @var string[] Single characters stripped from every word before it is counted. */
    private $replace = array();
    /** @var int[] Usage counters keyed by normalised word. */
    private $words   = array();

    /**
     * Scans every file below $basedir and counts the words of the selected messages.
     *
     * @param string $basedir  Root directory of the log tree.
     * @param mixed  $n        Number of words showStats() prints; converted to int.
     * @param bool   $incoming Count other people's messages instead of the user's own.
     *
     * @throws RuntimeException When $basedir or one of its sub-directories cannot be opened.
     */
    public function __construct($basedir, $n, $incoming)
    {
        // The limit may arrive as a string from the command line; normalise it once.
        $this->n        = (int) $n;
        $this->incoming = $incoming;

        $this->replace = str_split(" \r\n\t.,?!");

        foreach ($this->getAllHistoryFiles($basedir) as $historyFile)
        {
            // Read one file at a time so only a single log is held in memory.
            $historyText = file_get_contents($historyFile);
            if ($historyText === false)
            {
                continue;
            }

            foreach ($this->getMyMessages($historyText) as $message)
            {
                foreach ($this->getMessageWords($message) as $word)
                {
                    $word = $this->parseWord($word);
                    // Compare against '' explicitly: empty() would also discard the word "0".
                    if ($word !== '' && $word !== false && $word !== null)
                    {
                        $this->registerWord($word);
                    }
                }
            }
        }
    }

    /**
     * Recursively collects the paths of all regular entries below $basedir.
     *
     * @param string $basedir Directory to scan.
     *
     * @return string[] Paths of every non-directory entry found.
     *
     * @throws RuntimeException When a directory cannot be opened.
     */
    private function getAllHistoryFiles($basedir)
    {
        // Checking first avoids looping on readdir() with an invalid handle.
        $profileDir = is_dir($basedir) ? opendir($basedir) : false;
        if ($profileDir === false)
        {
            throw new RuntimeException('Cannot open log directory: ' . $basedir);
        }

        $files = array();
        while (false !== $history = readdir($profileDir))
        {
            if ($history == '.' || $history == '..')
            {
                continue;
            }

            $entry = $basedir . '/' . $history;
            if (is_dir($entry))
            {
                foreach ($this->getAllHistoryFiles($entry) as $value)
                {
                    $files[] = $value;
                }
            } else {
                $files[] = $entry;
            }
        }
        closedir($profileDir);

        return $files;
    }

    /**
     * Extracts the message bodies to be counted from the text of one log file.
     *
     * The text is split on "(... HH:MM:SS) " prefixes located at the start of
     * the text or right after a newline; each piece is then split once on
     * ": " into nickname and message body. Pieces without a body are skipped.
     *
     * @param string $historyText Full contents of one log file.
     *
     * @return string[] Bodies of the user's own messages, or of everybody
     *                  else's messages when incoming mode is enabled.
     */
    private function getMyMessages($historyText)
    {
        $myMessages = array();

        $pieces = preg_split("#(^\(|\n\()[^)]*[0-9]{2}:[0-9]{2}:[0-9]{2}\) #U", $historyText, -1);
        if ($pieces === false)
        {
            return $myMessages;
        }

        foreach ($pieces as $message)
        {
            $parts = explode(": ", $message, 2);
            // No ": " separator or nothing after it; "0" is a legitimate body.
            if (count($parts) < 2 || $parts[1] === '')
            {
                continue;
            }
            list($nickName, $text) = $parts;

            $isOwn = ($nickName === $_ENV['USER']);
            if ($this->incoming ? !$isOwn : $isOwn)
            {
                $myMessages[] = $text;
            }
        }

        return $myMessages;
    }

    /**
     * Splits a message body into raw words on runs of whitespace.
     *
     * @param string $messageText Message body.
     *
     * @return string[] Raw words; may contain empty strings at the edges.
     */
    protected function getMessageWords($messageText)
    {
        return preg_split('/\s{1,}/', $messageText);
    }

    /**
     * Normalises a raw word: strips punctuation and whitespace, lower-cases it.
     *
     * The first replacement targets the literal two-character sequence
     * backslash + "n" (the pattern is single-quoted, so it is not a newline);
     * the space it inserts is removed again by the second replacement.
     * NOTE(review): whether a real newline was intended here is unclear.
     *
     * @param string $word Raw word.
     *
     * @return string Normalised word, possibly empty.
     */
    protected function parseWord($word)
    {
        $word = str_replace('\n',           ' ', $word);
        $word = str_replace($this->replace, '',  $word);
        // Name the encoding explicitly so the result does not depend on global mbstring settings.
        return mb_strtolower($word, 'UTF-8');
    }

    /**
     * Increments the usage counter of a word, creating it on first use.
     *
     * @param string $word Normalised word.
     */
    protected function registerWord($word)
    {
        if (isset($this->words[$word]))
        {
            $this->words[$word]++;
        } else {
            $this->words[$word] = 1;
        }
    }

    /**
     * Prints "word => count" lines, most frequent first, stopping after the
     * configured number of words. A limit of zero or less prints every word,
     * because the running counter never equals it.
     */
    public function showStats()
    {
        arsort($this->words);
        $shown = 0;
        foreach ($this->words as $word => $timesUsed)
        {
            echo "$word => $timesUsed\n";
            $shown++;
            if ($shown === $this->n)
            {
                break;
            }
        }
    }
}
 
// Script entry point: constructing the application object parses $argv,
// runs the history parser and prints the statistics.
$PidginHistoryParserApp = new PidginHistoryParserApp($argv);
?>
Личные инструменты
Пространства имён
Варианты
Действия
Навигация
Инструменты
Тэги