Библиотека для cis, online, cms1

2016-06-29 18:51:32 +03:00 · 2016-06-29 18:51:32 +03:00 · 3c2e614d87
commit 3c2e614d87
269 changed files with 39854 additions and 0 deletions
--- a/core/search/htmlhelper.php
+++ b/core/search/htmlhelper.php
@ -0,0 +1,85 @@
+<?php
+
+/**
+ * Extracts plain text from an HTML document.
+ *
+ * Removes <script> blocks and every tag, collapses the whitespace that
+ * follows a line break, and decodes a small fixed set of HTML entities.
+ *
+ * @param string $document HTML source
+ *
+ * @return string Text with the markup removed
+ */
+function stripText($document)
+{
+    // preg_replace() applies the patterns in array order. "&amp;" is decoded
+    // last so that an escaped entity such as "&amp;lt;" yields the literal
+    // text "&lt;" instead of being decoded a second time.
+    $search = array("'<script[^>]*?>.*?</script>'si" => "",    // strip out javascript
+                    "'<[\/\!]*?[^<>]*?>'si"          => "",    // strip out html tags
+                    "'([\r\n])[\s]+'"                => "\\1", // strip out white space
+                    "'&(quot|#34|#034|#x22);'i"      => "\"",  // replace html entities
+                    "'&(lt|#60|#060|#x3c);'i"        => "<",
+                    "'&(gt|#62|#062|#x3e);'i"        => ">",
+                    "'&(nbsp|#160|#xa0);'i"          => " ",
+                    "'&(iexcl|#161);'i"              => chr(161),
+                    "'&(cent|#162);'i"               => chr(162),
+                    "'&(pound|#163);'i"              => chr(163),
+                    "'&(copy|#169);'i"               => chr(169),
+                    "'&(reg|#174);'i"                => chr(174),
+                    "'&(deg|#176);'i"                => chr(176),
+                    "'&(amp|#38|#038|#x26);'i"       => "&");  // must stay last, see above
+    return preg_replace(array_keys($search), array_values($search), $document);
+}
+
+/**
+ * Splits text into an array of words.
+ *
+ * Every run of non-word characters (\W) acts as one separator, so leading or
+ * trailing separators produce empty strings in the result.
+ *
+ * @param string $document Text to split
+ *
+ * @return array List of words
+ */
+function tokenize ($document)
+{
+    return preg_split("/[\W]+/", $document);
+}
+
+
+/**
+ * Searches backwards from $offset for the first character that is listed
+ * in $needle.
+ *
+ * @param string $haystack String to scan
+ * @param array  $needle   Characters to look for
+ * @param int    $offset   Position to start from, counted from the start of
+ *                         the string; a falsy value (e.g. a failed strpos()
+ *                         result) means 0, a value past the end is clamped
+ *                         to the last character
+ *
+ * @return int|false Position of the match, or false when none is found
+ */
+function indexRight ($haystack, $needle, $offset = 0)
+{
+    if ((bool)$offset === false) $offset = 0;
+    // Clamp so that $haystack[$offset] is never read out of range; for an
+    // empty haystack this yields -1 and the loop below is skipped.
+    $offset = min((int)$offset, strlen($haystack) - 1);
+    for (; $offset >= 0; $offset--) {
+        if (in_array ($haystack[$offset], $needle)) {
+            return $offset;
+        }
+    }
+    return false;
+}
+
+/**
+ * Searches forwards from $offset for the first character accepted by $needle.
+ *
+ * @param string         $haystack String to scan
+ * @param array|callable $needle   Either a list of characters to look for, or
+ *                                 a callback that receives one character and
+ *                                 returns a truthy value on a match
+ * @param int            $offset   Position to start from, counted from the
+ *                                 start of the string
+ *
+ * @return int|false Position of the match; false when there is none or when
+ *                   $offset is negative
+ */
+function indexLeft ($haystack, $needle, $offset = 0)
+{
+    if ($offset < 0) {
+        return false;
+    }
+    $length = strlen($haystack);
+    for ($i = $offset; $i < $length; $i++) {
+        $ch = $haystack[$i];
+        if (is_callable($needle) && call_user_func ($needle, $ch)) {
+            return $i;
+        }
+        if (is_array ($needle) && in_array ($ch, $needle)) {
+            return $i;
+        }
+    }
+    return false;
+}
+
+/**
+ * Negation of ctype_alpha(); has the one-argument shape that indexLeft()
+ * accepts as a callable $needle.
+ *
+ * @param string $ch Text to test
+ *
+ * @return bool True when $ch is not made up solely of alphabetic characters
+ */
+function not_ctype_alpha ($ch)
+{
+    return ctype_alpha($ch) === false;
+}
+
+
+?>
--- a/core/search/index.php
+++ b/core/search/index.php
@ -0,0 +1,86 @@
+<?php
+
+require_once 'core/search/htmlhelper.php';
+require_once 'core/search/stemmer.php';
+require_once 'core/path.php';
+
+/**
+ * Builds an inverted word index over a set of files and writes it to disk.
+ */
+class Index
+{
+    const ARRAY_FILE = 0;
+    const ARRAY_TEXT = 1;
+
+    /** Largest value that fits the unsigned 16-bit "S" fields of the index file. */
+    const MAX_FIELD = 0xFFFF;
+
+    /** @var array Map of stemmed word => list of document numbers containing it */
+    public $index = array ();
+    /** @var array List of array(title, relative path, text), keyed by document number */
+    public $text  = array ();
+    /** @var int Number of documents processed so far; also the next document number */
+    protected $count = 0;
+
+    /**
+     * Returns the contents of the first <title> element, or "" when absent.
+     *
+     * @param string $content HTML source
+     *
+     * @return string
+     */
+    public function getTitle ($content) {
+        $title = "'<title[^>]*?>(.*?)</title>'si";
+        preg_match($title, $content, $matches);
+        if(isset($matches[1])) {
+            return $matches[1];
+        }
+        return "";
+    }
+
+    /**
+     * Reduces a word to the form stored in the index: lower-cased, then stemmed.
+     *
+     * @param string $word
+     *
+     * @return string
+     */
+    public function clean ($word)
+    {
+        return Stemmer::russian(strtolower($word));
+    }
+
+    /**
+     * Indexes every file that Path::getContentRec() returns for $files.
+     *
+     * @param string $base  Base directory handed to Path
+     * @param mixed  $files Passed through to Path::getContentRec()
+     */
+    public function process ($base, $files)
+    {
+        $path = new Path($base);
+        foreach ($path->getContentRec($files) as $file) {
+            $content = file_get_contents ($file);
+            if ($content === false) {
+                // Unreadable file: skip it rather than index an empty document.
+                continue;
+            }
+            $text  = stripText($content);
+            $title = pathinfo($file, PATHINFO_BASENAME);
+
+            foreach (tokenize($text) as $word) {
+                $preword = $this->clean($word);
+                if (isset($this->index[$preword])) {
+                    if ( ! in_array ($this->count, $this->index[$preword])) {
+                        $this->index[$preword] [] = $this->count;
+                    }
+                } elseif (strlen($preword) > 1) {
+                    // Stems shorter than two bytes are not indexed.
+                    $this->index[$preword] = array ($this->count);
+                }
+            }
+            $this->text [] = array ($title, $path->relPath ($file), $text);
+            $this->count ++;
+        }
+        ksort($this->index);
+    }
+
+    /**
+     * Writes the index to a binary file.
+     *
+     * Layout (every number is pack() code "S", unsigned 16-bit, machine byte order):
+     *   word count, text count,
+     *   per word: word length, posting count, word bytes, postings,
+     *   per text: title length, url length, text length, then the three strings.
+     *
+     * Strings longer than 65535 bytes are truncated so that the stored length
+     * always matches the bytes written. Counts and document numbers above
+     * 65535 still cannot be represented in this format.
+     *
+     * @param string $file Destination file name
+     *
+     * @return bool False when the file cannot be opened, true otherwise
+     */
+    public function saveData ($file)
+    {
+        $handle = fopen($file, "w");
+        if ($handle === false) {
+            return false;
+        }
+
+        fwrite ($handle, pack("SS", count($this->index), count($this->text)));
+        foreach ($this->index as $word => $postings) {
+            // Numeric-looking words come back from the array key as integers.
+            $word = $this->clip($word);
+            $args = array_merge(
+                array ("SSa*S*", strlen($word), count($postings), $word),
+                array_values($postings)
+            );
+            fwrite($handle, call_user_func_array ('pack', $args));
+        }
+
+        foreach ($this->text as $text) {
+            $title = $this->clip($text[0]);
+            $url   = $this->clip($text[1]);
+            $body  = $this->clip($text[2]);
+            fwrite($handle, pack("SSSa*a*a*",
+                strlen($title), strlen($url), strlen($body),
+                $title, $url, $body));
+        }
+
+        fclose($handle);
+        return true;
+    }
+
+    /**
+     * Casts to string and truncates to what a 16-bit length field can describe.
+     */
+    private function clip ($value)
+    {
+        $value = (string) $value;
+        if (strlen($value) > self::MAX_FIELD) {
+            return substr($value, 0, self::MAX_FIELD);
+        }
+        return $value;
+    }
+}
+
+?>
--- a/core/search/lexer.php
+++ b/core/search/lexer.php
@ -0,0 +1,93 @@
+<?php
+
+/**
+ * Splits a query string into tokens.
+ *
+ * Recognised tokens: words (runs of ctype_alpha() characters) and the single
+ * characters "~", "|", "(", ")" and "&". Spaces separate tokens; any other
+ * character is skipped.
+ */
+class Lexer
+{
+    const TOKEN_NOT    = 1;
+    const TOKEN_OR     = 2;
+    const TOKEN_LPAREN = 3;
+    const TOKEN_RPAREN = 4;
+    const TOKEN_AND    = 5;
+    const TOKEN_WORD   = 6;
+    const TOKEN_EOL    = 7;
+
+    /** @var string Query being tokenised */
+    protected $src = '';
+    /** @var int Byte position of the next unread character in $src */
+    private   $offset = 0;
+    /** @var array Current token as array(type, text); set by nextToken() */
+    public    $token;
+
+    public function __construct ()
+    {
+    }
+
+    /**
+     * Sets the string to tokenise and rewinds to its start.
+     *
+     * @param string $src
+     */
+    function setSource ($src)
+    {
+        $this->src = (string) $src;
+        // Rewind, so the same lexer instance can be reused for another query.
+        $this->offset = 0;
+    }
+
+    private function skipSpace ()
+    {
+        while (!$this->isEOL() && $this->getChar() == " ") {
+            $this->offset++;
+        }
+    }
+
+    private function getChar ()
+    {
+        return $this->src [$this->offset];
+    }
+
+    /**
+     * True once the whole source has been consumed.
+     */
+    private function isEOL () {
+        return $this->offset >= strlen($this->src);
+    }
+
+    /**
+     * Consumes exactly one character and returns its single-character token,
+     * or null when the character is not an operator or parenthesis.
+     */
+    private function easyToken () {
+        $ch = $this->getChar ();
+        $this->offset++;
+        switch ($ch) {
+            case '~': return array(self::TOKEN_NOT, $ch);
+            case '|': return array(self::TOKEN_OR, $ch);
+            case '(': return array(self::TOKEN_LPAREN, $ch);
+            case ')': return array(self::TOKEN_RPAREN, $ch);
+            case '&': return array(self::TOKEN_AND, $ch);
+        }
+        return null;
+    }
+
+    /**
+     * Returns the next token and advances past it.
+     *
+     * @return array array(type, text); array(TOKEN_EOL, "") at end of input
+     */
+    public function getToken ()
+    {
+        while (true) {
+            $this->skipSpace ();
+            if ($this->isEOL()) {
+                return array(self::TOKEN_EOL, "");
+            }
+            if (ctype_alpha($this->getChar())) {
+                $start = $this->offset;
+                while (!$this->isEOL() && ctype_alpha($this->getChar())) {
+                    $this->offset ++;
+                }
+                return array(self::TOKEN_WORD, substr ($this->src, $start, $this->offset-$start));
+            }
+            $token = $this->easyToken();
+            if ($token !== null) {
+                return $token;
+            }
+            // Unknown character: it has been consumed, look at the next one.
+        }
+    }
+
+    /**
+     * Reads the next token into $this->token.
+     */
+    public function nextToken ()
+    {
+        $this->token = $this->getToken();
+    }
+}
+
+?>
--- a/core/search/search.php
+++ b/core/search/search.php
@ -0,0 +1,98 @@
+<?php
+
+require_once 'core/search/lexer.php';
+require_once 'core/functions.php';
+
+/**
+ * Evaluates boolean queries against a word index.
+ *
+ * A query result is array('words' => list of stems, 'files' => list of
+ * document numbers). Operators, from loosest to tightest binding as wired in
+ * the constructor: "&" (intersection), "|" (union), "~" (difference), then a
+ * single word or a parenthesised sub-query.
+ */
+class Search
+{
+    private $lexer;
+    /** @var callable Receives a word, returns its array('words', 'files') result */
+    private $index;
+
+    // Callables that make up the parser chain. They were previously created
+    // as dynamic properties; declared public to keep them accessible as before.
+    public $op;
+    public $binary;
+    public $union;
+    public $intersection;
+    public $notQuery;
+    public $orQuery;
+    public $andQuery;
+
+    /**
+     * @param callable $index Word lookup callback
+     */
+    function __construct ($index)
+    {
+        $this->lexer = new Lexer();
+        $this->index = $index;
+
+        // NOTE(review): lcurry() comes from core/functions.php; from its use
+        // here it presumably binds the leading arguments of a callable - confirm.
+        $this->op = array ($this, 'Op');
+        $this->binary = array ($this, 'binaryOp');
+        $this->union = array ($this, 'union');
+        $this->intersection = lcurry($this->op, 'array_uintersect', $this->union);
+
+        $this->notQuery = lcurry ($this->binary, Lexer::TOKEN_NOT,
+            lcurry($this->op, 'array_udiff', 'array_udiff'), array ($this, 'easyQuery'));
+
+        $this->orQuery  = lcurry ($this->binary, Lexer::TOKEN_OR,
+            lcurry($this->op, $this->union, $this->union), $this->notQuery);
+
+        $this->andQuery = lcurry ($this->binary, Lexer::TOKEN_AND, $this->intersection, $this->orQuery);
+    }
+
+    /**
+     * Set union of two lists, without duplicates.
+     *
+     * @param array    $a
+     * @param array    $b
+     * @param callable $callback Unused; present so that union() has the same
+     *                           signature as array_udiff()/array_uintersect()
+     *
+     * @return array Re-indexed list
+     */
+    function union ($a, $b, $callback = null)
+    {
+        return array_values(array_unique(array_merge($a, $b)));
+    }
+
+    /**
+     * Loose equality test. No longer used as the set-operation callback
+     * (see compare()); kept for callers outside this class.
+     */
+    function Eq ($a, $b)
+    {
+        return $a == $b;
+    }
+
+    /**
+     * Ordering callback for array_udiff()/array_uintersect(), which require
+     * an integer below, equal to or above zero rather than a boolean.
+     *
+     * @return int
+     */
+    function compare ($a, $b)
+    {
+        if ($a == $b) {
+            return 0;
+        }
+        return ($a < $b) ? -1 : 1;
+    }
+
+    /**
+     * Combines two query results.
+     *
+     * @param callable $files Set operation applied to the 'files' lists
+     * @param callable $words Set operation applied to the 'words' lists
+     * @param array    $a     Left operand
+     * @param array    $b     Right operand
+     *
+     * @return array Combined result in the same shape
+     */
+    function Op ($files, $words, $a, $b) {
+        $compare = array ($this, 'compare');
+        return array (
+            'words' => call_user_func ($words, $a['words'], $b['words'], $compare),
+            'files' => call_user_func ($files, $a['files'], $b['files'], $compare)
+        );
+    }
+
+    /**
+     * Parses and evaluates a query string.
+     *
+     * @param string $source Query text
+     *
+     * @return array Query result
+     */
+    public function getQuery ($source)
+    {
+        $this->lexer->setSource ($source);
+        $this->lexer->nextToken();
+        return $this->topQuery();
+    }
+
+    /**
+     * Top-level rule: an and-expression, intersected with every parenthesised
+     * group that follows it directly without an operator.
+     */
+    function topQuery ()
+    {
+        $result = call_user_func ($this->andQuery);
+        while ($this->lexer->token[0] == Lexer::TOKEN_LPAREN) {
+            $result = call_user_func ($this->intersection, $result, call_user_func ($this->andQuery));
+        }
+        return $result;
+    }
+
+    /**
+     * Primary rule: a parenthesised sub-query, or a lookup of the current
+     * token's text through the index callback.
+     */
+    function easyQuery ()
+    {
+        if ($this->lexer->token[0] == Lexer::TOKEN_LPAREN) {
+            $this->lexer->nextToken ();
+            $result = $this->topQuery ();
+            // A missing ")" is tolerated.
+            if ($this->lexer->token[0] == Lexer::TOKEN_RPAREN) {
+                $this->lexer->nextToken ();
+            }
+            return $result;
+        }
+        $result = call_user_func ($this->index, $this->lexer->token[1]);
+        $this->lexer->nextToken ();
+        return $result;
+    }
+
+    /**
+     * Generic left-associative binary rule.
+     *
+     * @param int      $type Token type of the operator
+     * @param callable $op   Combines the left and right results when the operator is seen
+     * @param callable $next Parser for the operands (the next tighter rule)
+     *
+     * @return array Query result
+     */
+    function binaryOp ($type, $op, $next)
+    {
+         $result = call_user_func($next);
+         while ($this->lexer->token[0] == $type) {
+             $this->lexer->nextToken();
+             $result = call_user_func($op, $result, call_user_func ($next));
+         }
+         return $result;
+    }
+}
+
+?>
--- a/core/search/searcher.php
+++ b/core/search/searcher.php
@ -0,0 +1,102 @@
+<?php
+
+require_once 'core/search/search.php';
+require_once 'core/search/htmlhelper.php';
+require_once 'core/search/stemmer.php';
+
+/**
+ * Loads an index file written by Index::saveData() and answers queries on it.
+ */
+class Searcher {
+    /** @var array Map of stemmed word => list of document numbers */
+    /*  protected */ public $index = array ();
+    /** @var array List of array(title, url, text), keyed by document number */
+    protected $text = array ();
+    /** @var Search Query evaluator; looks words up through getWord() */
+    protected $search;
+
+    public function __construct ()
+    {
+        $this->search = new Search (array ($this, 'getWord'));
+    }
+
+    /**
+     * Reads an index file into memory.
+     *
+     * @param string $fileName Index file name
+     *
+     * @return bool False when the file cannot be opened or its 4-byte header
+     *              is incomplete, true otherwise
+     */
+    function setSource ($fileName)
+    {
+        $file = fopen($fileName, "r");
+        if ($file === false) {
+            return false;
+        }
+        $header = fread($file, 4);
+        if ($header === false || strlen($header) < 4) {
+            fclose($file);
+            return false;
+        }
+        $all = unpack("Swords/Stexts", $header);
+
+        for ($i = 0; $i < $all['words']; $i++) {
+            $size  = unpack("Sword/Sindex", fread($file, 4));
+            $word  = $this->readBytes($file, $size['word']);
+            // unpack("S*") yields a 1-based list of document numbers.
+            $index = ($size['index'] > 0)
+                ? unpack("S*", fread($file, $size['index'] * 2))
+                : array ();
+            $this->index[$word] = $index;
+        }
+
+        for ($i = 0; $i < $all['texts']; $i++) {
+            $size  = unpack("Stitle/Surl/Stext", fread($file, 6));
+            $title = $this->readBytes($file, $size['title']);
+            $url   = $this->readBytes($file, $size['url']);
+            $text  = $this->readBytes($file, $size['text']);
+            $this->text [] = array ($title, $url, $text);
+        }
+
+        fclose($file);
+        return true;
+    }
+
+    /**
+     * fread() wrapper that tolerates zero-length fields (an empty document
+     * text is stored with length 0, and fread() rejects a length of 0).
+     */
+    private function readBytes ($file, $length)
+    {
+        if ($length <= 0) {
+            return "";
+        }
+        return (string) fread($file, $length);
+    }
+
+    /**
+     * Looks a word up in the index.
+     *
+     * The word is lower-cased and stemmed the same way Index::clean() does
+     * when the index is built, so the two sides agree on the key.
+     *
+     * @param string $word
+     *
+     * @return array array('files' => document numbers, 'words' => array(stem)),
+     *               both lists empty when the word is unknown
+     */
+    public function getWord ($word)
+    {
+        $preword = Stemmer::russian(strtolower($word));
+        if (isset($this->index[$preword])) {
+            return array ('files' => $this->index[$preword], 'words' => array ($preword));
+        }
+        return array ('files' => array (), 'words' => array ());
+    }
+
+    /**
+     * Turns a query result into a list of documents with excerpts.
+     *
+     * @param array $query Result of Search::getQuery()
+     *
+     * @return array List of array('title', 'file', 'text')
+     */
+    function getResult (&$query)
+    {
+        $result = array ();
+        $words = $query['words'];
+        // The excerpt highlights the first word of the query; the list may be
+        // empty or may not start at key 0 after a set operation.
+        $first = reset($words);
+        if ($first === false) {
+            $first = "";
+        }
+        foreach ($query['files'] as $n) {
+            if (!isset($this->text[$n])) {
+                continue;
+            }
+            $result [] = array (
+                'title' => $this->text[$n][0],
+                'file' => $this->text[$n][1],
+                'text' => $this->getSlice ($first, $this->text[$n][2]));
+        }
+        return $result;
+    }
+
+    /**
+     * Returns an excerpt of $text around the first occurrence of $word, with
+     * the occurrence wrapped in a red <span>.
+     *
+     * The excerpt starts at most 100 bytes before the word, but not before
+     * the end of the previous sentence, and runs 100 bytes past the word.
+     * When the word does not occur literally, the first 200 bytes are
+     * returned without a highlight.
+     *
+     * NOTE(review): $text is inserted into the markup unescaped.
+     *
+     * @param string $word Word (stem) to highlight
+     * @param string $text Document text
+     *
+     * @return string HTML fragment
+     */
+    function getSlice ($word, $text)
+    {
+        $word = (string) $word;
+        $pos  = ($word === "") ? false : stripos($text, $word);
+        if ($pos === false) {
+            return (string) substr($text, 0, 200);
+        }
+        $dot   = indexRight($text, array ("."), $pos);
+        $start = ($dot === false) ? 0 : $dot + 1;
+        $offset = max($pos - 100, $start);
+        $real   = substr($text, $pos, strlen($word));
+        return substr($text, $offset, $pos - $offset)
+            . "<span style='color: red'>" . $real . "</span>" . substr ($text, $pos + strlen($word), 100);
+    }
+
+    /**
+     * Runs a query and returns the matching documents.
+     *
+     * @param string $query Query text
+     *
+     * @return array See getResult()
+     */
+    function search ($query)
+    {
+        $result = $this->search->getQuery($query);
+        return $this->getResult($result);
+    }
+}
+
+?>
--- a/core/search/stemmer.php
+++ b/core/search/stemmer.php
@ -0,0 +1,181 @@
+<?php
+/*
+*  PHP5 implementation of Martin Porter's stemming algorithm for Russian language.
+*  Written on a cold winter evening close to the end of 2005 by Dennis Kreminsky (etranger at etranger dot ru)
+*  Use the code freely, but don't hold me responsible if it breaks whatever it might break.
+*
+*/
+
+define ('CHAR_LENGTH', '1'); // Number of bytes per character assumed by the substr()/strlen() arithmetic below, which is
+                             // used instead of the (not supported by default) mb_* string functions.
+                             // Russian characters take 2 bytes in UTF-8 and 1 byte in WIN-1251; the value must match the
+                             // encoding this source file (and therefore its Cyrillic literals) is saved in.
+                             // NOTE(review): '1' suggests this copy is meant to be stored in a single-byte charset - confirm.
+//
+/**
+ * Porter (Snowball) stemmer for Russian.
+ *
+ * Works on byte strings: every character is assumed to occupy CHAR_LENGTH
+ * bytes, and words must be in the same encoding as the literals below. All
+ * ending lists are lower-case.
+ */
+class Stemmer {
+
+    /**
+     * Returns the stem of a word.
+     *
+     * The word is split after its first vowel; only the part behind the
+     * split (the RV region) is ever shortened.
+     *
+     * @param string $word
+     *
+     * @return string
+     */
+    static public function russian($word)
+    {
+        $a = self::rv($word);
+        $start = $a[0];
+        $rv = $a[1];
+        $rv = self::step1($rv);
+        $rv = self::step2($rv);
+        // R1/R2 are defined on the whole word, so step 3 also needs the prefix.
+        $rv = self::step3($rv, $start);
+        $rv = self::step4($rv);
+        return $start.$rv;
+    }
+
+    /**
+     * True when $ch is one of the vowels the algorithm recognises.
+     */
+    static private function is_vowel($ch)
+    {
+        return in_array($ch, array('à','å','è','î','ó','û','ý','þ','ÿ'), true);
+    }
+
+    /**
+     * Splits a word into array(prefix up to and including the first vowel,
+     * remainder). Without a vowel the remainder is empty.
+     */
+    static private function rv($word)
+    {
+        $word = (string) $word;
+        $length = strlen($word);
+        for ($i = 0; $i < $length; $i += CHAR_LENGTH) {
+            if (self::is_vowel(substr($word, $i, CHAR_LENGTH))) {
+                $split = $i + CHAR_LENGTH;
+                return array((string) substr($word, 0, $split), (string) substr($word, $split));
+            }
+        }
+        return array($word, '');
+    }
+
+    /**
+     * Returns the part of $text after the first non-vowel that follows a
+     * vowel, or '' when there is no such position. Applied once to the word
+     * it yields R1; applied to R1 it yields R2.
+     */
+    static private function region($text)
+    {
+        $seenVowel = false;
+        $length = strlen($text);
+        for ($i = 0; $i < $length; $i += CHAR_LENGTH) {
+            $isVowel = self::is_vowel(substr($text, $i, CHAR_LENGTH));
+            if ($seenVowel && !$isVowel) {
+                return (string) substr($text, $i + CHAR_LENGTH);
+            }
+            if ($isVowel) {
+                $seenVowel = true;
+            }
+        }
+        return '';
+    }
+
+    /**
+     * Removes every suffix of the list that the word ends with, in list
+     * order (so more than one may be removed).
+     *
+     * @param string $word
+     * @param array  $suffix_list
+     *
+     * @return string
+     */
+    static function substitute ($word, &$suffix_list)
+    {
+        foreach ($suffix_list as $suffix) {
+            if (self::has_suffix($word, $suffix)) {
+               $word = self::cut_suffix($word, $suffix);
+            }
+        }
+        return $word;
+    }
+
+    /**
+     * True when $word ends with $suffix.
+     */
+    static function has_suffix ($word, $suffix)
+    {
+        return substr($word, -(strlen($suffix))) === $suffix;
+    }
+
+    /**
+     * True when the character right before the trailing $suffix is one of the
+     * two letters that "group 1" endings must follow. Does not itself check
+     * that $word ends with $suffix.
+     */
+    static function has_aya ($word, $suffix)
+    {
+        $before = substr($word, -strlen($suffix)-CHAR_LENGTH, CHAR_LENGTH);
+        return ($before == 'à' || $before == 'ÿ');
+    }
+
+    /**
+     * Removes strlen($suffix) bytes from the end of $word.
+     */
+    static function cut_suffix ($word, $suffix)
+    {
+        return substr($word, 0, strlen($word) - strlen($suffix));
+    }
+
+    /**
+     * Removes one participle ending, if present. Group 2 endings are removed
+     * unconditionally; group 1 endings only when they follow a/ya.
+     */
+    static private function cut_participle($word)
+    {
+        foreach (array('èâø','ûâø','óþù') as $suffix) {
+            if (self::has_suffix($word, $suffix)) {
+                return self::cut_suffix($word, $suffix);
+            }
+        }
+        foreach (array('åì','íí','âø','þù','ù') as $suffix) {
+            if (self::has_suffix($word, $suffix) && self::has_aya ($word, $suffix)) {
+                return self::cut_suffix($word, $suffix);
+            }
+        }
+        return $word;
+    }
+
+    /**
+     * Step 1: remove a perfective gerund ending; otherwise remove a reflexive
+     * ending and then the first of an adjectival, verb or noun ending.
+     */
+    static private function step1($word)
+    {
+        // Perfective gerund, group 1 (must follow a/ya).
+        $perfective1 = array('â', 'âøè', 'âøèñü');
+        foreach ($perfective1 as $suffix) {
+            if (self::has_suffix($word, $suffix) && self::has_aya ($word, $suffix)) {
+                return self::cut_suffix($word, $suffix);
+            }
+        }
+
+        // Perfective gerund, group 2.
+        $perfective2 = array('èâ','èâøè','èâøèñü','ûâøè','ûâøèñü');
+        foreach ($perfective2 as $suffix) {
+            if (self::has_suffix($word, $suffix)) {
+                return self::cut_suffix($word, $suffix);
+            }
+        }
+
+        $reflexive = array('ñÿ', 'ñü');
+        $word = self::substitute($word, $reflexive);
+
+        // Adjectival: an adjective ending, optionally preceded by a participle ending.
+        $adjective = array('åå','èå','ûå','îå','èìè','ûìè','åé','èé','ûé','îé','åì','èì','ûì','îì','åãî','îãî','åìó','îìó','èõ','ûõ','óþ','þþ','àÿ','ÿÿ','îþ','åþ');
+        foreach ($adjective as $suffix) {
+            if (self::has_suffix($word, $suffix)) {
+                return self::cut_participle(self::cut_suffix($word, $suffix));
+            }
+        }
+
+        // Verb, group 1 (must follow a/ya).
+        $verb1 = array('ëà','íà','åòå','éòå','ëè','é','ë','åì','í','ëî','íî','åò','þò','íû','òü','åøü','ííî');
+        foreach ($verb1 as $suffix) {
+            if (self::has_suffix($word, $suffix) && self::has_aya ($word, $suffix)) {
+                return self::cut_suffix($word, $suffix);
+            }
+        }
+
+        // Verb, group 2.
+        $verb2 = array('èëà','ûëà','åíà','åéòå','óéòå','èòå','èëè','ûëè','åé','óé','èë','ûë','èì','ûì','åí','èëî','ûëî','åíî','ÿò','óåò','óþò','èò','ûò','åíû','èòü','ûòü','èøü','óþ','þ');
+        foreach ($verb2 as $suffix) {
+            if (self::has_suffix($word, $suffix)) {
+                return self::cut_suffix($word, $suffix);
+            }
+        }
+
+        $noun = array('à','åâ','îâ','èå','üå','å','èÿìè','ÿìè','àìè','åè','èè','è','èåé','åé','îé','èé','é','èÿì','ÿì','èåì','åì','àì','îì','î','ó','àõ','èÿõ','ÿõ','û','ü','èþ','üþ','þ','èÿ','üÿ','ÿ');
+        foreach ($noun as $suffix) {
+            if (self::has_suffix($word, $suffix)) {
+                return self::cut_suffix($word, $suffix);
+            }
+        }
+
+        return $word;
+    }
+
+    /**
+     * Step 2: remove one trailing instance of the letter tested below.
+     */
+    static private function step2($word)
+    {
+        if (substr($word,-CHAR_LENGTH,CHAR_LENGTH) == 'è')
+            $word = substr($word, 0, strlen($word)-CHAR_LENGTH);
+        return $word;
+    }
+
+    /**
+     * Step 3: remove a derivational ending when it lies entirely inside R2.
+     *
+     * @param string $word   RV part of the word after steps 1 and 2
+     * @param string $prefix Part of the word in front of RV; R1 and R2 are
+     *                       computed on $prefix . $word
+     *
+     * @return string
+     */
+    static private function step3($word, $prefix = '')
+    {
+        $r2 = self::region(self::region($prefix.$word));
+        $derivational = array('îñò', 'îñòü');
+        foreach ($derivational as $suffix) {
+            // R2 and $word are both tails of the same string, so an ending of
+            // $word lies inside R2 exactly when R2 is at least as long as it.
+            if (self::has_suffix($word, $suffix) && strlen($r2) >= strlen($suffix)) {
+                return self::cut_suffix($word, $suffix);
+            }
+        }
+        return $word;
+    }
+
+    /**
+     * Step 4: undouble a trailing double letter, or remove a superlative
+     * ending and then undouble; finally remove a trailing soft sign.
+     */
+    static private function step4($word)
+    {
+        if (substr($word,-CHAR_LENGTH*2)=='íí')
+            $word=substr($word, 0, strlen($word)-CHAR_LENGTH);
+        else
+        {
+            $superlative=array('åéø', 'åéøå');
+            foreach ($superlative as $suffix)
+                if (self::has_suffix($word, $suffix))
+                    $word = self::cut_suffix($word, $suffix);
+            if (substr($word,-CHAR_LENGTH*2) == 'íí')
+                $word = substr($word, 0, strlen($word) - CHAR_LENGTH);
+        }
+        // The algorithm describes the soft-sign removal as an "otherwise"
+        // case; it is applied unconditionally here, as in the original
+        // implementation, whose author could not think of a Russian word left
+        // ending in a soft sign after the branches above.
+        if (substr($word,-CHAR_LENGTH,CHAR_LENGTH) == 'ü')
+            $word=substr($word, 0, strlen($word)-CHAR_LENGTH);
+        return $word;
+    }
+}
+?>