Библиотека для cis, online, cms1
This commit is contained in:
commit
3c2e614d87
269 changed files with 39854 additions and 0 deletions
85
core/search/htmlhelper.php
Normal file
85
core/search/htmlhelper.php
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
<?php
|
||||
|
||||
/**
 * Extracts the plain text from an HTML document.
 *
 * Script blocks and tags are removed, whitespace that follows a line break is
 * collapsed, and a small fixed set of HTML entities is decoded. Entities with
 * codes above 160 are emitted as single bytes via chr(), not as UTF-8.
 *
 * @param string $document HTML source
 *
 * @return string Text with the markup removed (whatever preg_replace() returns)
 */
function stripText($document)
{
    // The patterns are applied in order, each one to the output of the
    // previous one. "&amp;" is therefore decoded last: decoding it first would
    // turn "&amp;lt;" into "&lt;" and then decode it a second time.
    $search = array(
        "'<script[^>]*?>.*?</script>'si" => "",       // strip out javascript
        "'<[\/\!]*?[^<>]*?>'si"          => "",       // strip out html tags
        "'([\r\n])[\s]+'"                => "\\1",    // strip out white space
        "'&(quot|#34|#034|#x22);'i"      => "\"",     // replace html entities
        "'&(lt|#60|#060|#x3c);'i"        => "<",
        "'&(gt|#62|#062|#x3e);'i"        => ">",
        "'&(nbsp|#160|#xa0);'i"          => " ",
        "'&(iexcl|#161);'i"              => chr(161),
        "'&(cent|#162);'i"               => chr(162),
        "'&(pound|#163);'i"              => chr(163),
        "'&(copy|#169);'i"               => chr(169),
        "'&(reg|#174);'i"                => chr(174),
        "'&(deg|#176);'i"                => chr(176),
        "'&(amp|#38|#038|#x26);'i"       => "&",      // added hexadecimal values
    );

    return preg_replace(array_keys($search), array_values($search), $document);
}
|
||||
|
||||
/**
 * Splits a text into an array of words.
 *
 * The text is cut at every run of non-word characters (PCRE \W). Empty
 * strings can appear in the result when the text starts or ends with such a
 * run.
 *
 * @param string $document Text to split
 *
 * @return array List of words
 */
function tokenize ($document)
{
    return preg_split("/[\W]+/", $document);
}
|
||||
|
||||
|
||||
/**
 * Searches backwards through a string for any one of a set of characters.
 *
 * The scan starts at $offset and moves towards the beginning of the string.
 *
 * @param string $haystack String to search in
 * @param array  $needle   Characters to look for
 * @param int    $offset   Offset from the start of the string at which the
 *                         backward scan begins; a falsy value (0, false,
 *                         null) means index 0
 *
 * @return int|false Position of the first match found, or false if none
 */
function indexRight ($haystack, $needle, $offset = 0)
{
    // Callers may pass the result of a failed strpos()/stripos() here.
    if ((bool)$offset === false) $offset = 0;

    // Never start beyond the last character: reading an undefined string
    // offset raises a warning. For an empty string this makes the start -1,
    // so the loop below does not run at all.
    $last = strlen($haystack) - 1;
    if ($offset > $last) $offset = $last;

    for ($i = $offset; $i >= 0; $i--) {
        if (in_array ($haystack[$i], $needle)) {
            return $i;
        }
    }
    return false;
}
|
||||
|
||||
/**
 * Searches forwards through a string for a matching character.
 *
 * @param string         $haystack String to search in
 * @param array|callable $needle   Either a list of characters to look for, or
 *                                 a callback that receives one character and
 *                                 returns a truthy value when it matches
 * @param int            $offset   Offset from the start of the string at
 *                                 which the scan begins
 *
 * @return int|false Position of the first match, or false if there is none
 *                   or the offset is negative
 */
function indexLeft ($haystack, $needle, $offset = 0)
{
    if ($offset < 0) {
        return false;
    }

    $length = strlen($haystack);
    $byCallback = is_callable($needle);
    $byList = is_array ($needle);

    for ($i = $offset; $i < $length; $i ++) {
        $char = $haystack[$i];
        // The callback is tried first; a callable array that rejects the
        // character is still checked as a plain list afterwards.
        if ($byCallback && call_user_func ($needle, $char)) {
            return $i;
        }
        if ($byList && in_array ($char, $needle)) {
            return $i;
        }
    }
    return false;
}
|
||||
|
||||
/**
 * Negation of ctype_alpha(), usable as a callback.
 *
 * @param string $ch Character (or string) to test
 *
 * @return bool True when $ch is not purely alphabetic
 */
function not_ctype_alpha ($ch)
{
    $isAlpha = ctype_alpha($ch);
    return $isAlpha === false;
}
|
||||
|
||||
|
||||
?>
|
||||
86
core/search/index.php
Normal file
86
core/search/index.php
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
<?php
|
||||
|
||||
require_once 'core/search/htmlhelper.php';
|
||||
require_once 'core/search/stemmer.php';
|
||||
require_once 'core/path.php';
|
||||
|
||||
/**
 * File indexing.
 *
 * Builds an inverted index (word stem => numbers of the documents containing
 * it) together with the list of indexed documents, and writes both to a
 * binary file.
 */
class Index
{
    const ARRAY_FILE = 0;
    const ARRAY_TEXT = 1;

    /** @var array Word stem => list of document numbers containing it */
    public $index = array ();
    /** @var array Document number => array (title, relative path, plain text) */
    public $text = array ();
    /** @var int Number assigned to the next document to be indexed */
    protected $count = 0;

    /**
     * Returns the contents of the first <title> element of an HTML document.
     *
     * @param string $content HTML source
     *
     * @return string Title text, or "" when there is no <title> element
     */
    function getTitle ($content) {
        $title = "'<title[^>]*?>(.*?)</title>'si";
        preg_match($title, $content, $matches);
        return isset($matches[1]) ? $matches[1] : "";
    }

    /**
     * Reduces a word to its stem: lower-cases it and runs the Russian stemmer.
     *
     * @param string $word
     *
     * @return string
     */
    function clean ($word)
    {
        return Stemmer::russian(strtolower($word));
    }

    /**
     * Indexes every file returned by Path::getContentRec().
     *
     * Each document gets the next sequential number; its base file name is
     * used as the title (getTitle() is not used here).
     *
     * @param string $base  Passed to the Path constructor
     * @param mixed  $files Passed to Path::getContentRec()
     */
    function process ($base, $files)
    {
        $path = new Path($base);
        // List of documents
        foreach ($path->getContentRec($files) as $file) {
            $content = file_get_contents ($file);
            $text = stripText($content);
            $title = pathinfo($file, PATHINFO_BASENAME);

            // List of words in the document
            foreach (tokenize($text) as $word) {
                $preword = $this->clean($word);
                if (isset($this->index[$preword])) {
                    if ( ! in_array ($this->count, $this->index[$preword])) {
                        $this->index[$preword] [] = $this->count;
                    }
                } elseif (strlen($preword) > 1) {
                    // Words shorter than 2 bytes are not recorded
                    $this->index[$preword] = array ($this->count);
                }
            }
            $this->text [] = array ($title, $path->relPath ($file), $text);
            $this->count ++;
        }
        ksort($this->index);
    }

    /**
     * Writes the index and the document list to a binary file.
     *
     * Layout (all numbers are pack() format "S", unsigned 16-bit, machine
     * byte order):
     *   word count, document count;
     *   per word: word length, number of documents, word bytes, document numbers;
     *   per document: title/path/text lengths, then the three strings.
     *
     * NOTE(review): lengths and counts above 65535 do not fit format "S", so a
     * document whose text is longer than that cannot be stored correctly.
     *
     * @param string $file Name of the file to write
     *
     * @return bool False when the file cannot be opened, otherwise the result
     *              of fclose()
     */
    function saveData ($file)
    {
        $handle = fopen($file, "wb");
        if ($handle === false) {
            return false;
        }

        // Number of words and texts
        fwrite ($handle, pack("SS", count($this->index), count($this->text)));

        foreach ($this->index as $word => $documents) {
            // Purely numeric words are turned into integer keys by PHP.
            $word = (string) $word;
            $arguments = array_merge(
                array ("SSa*S*", strlen($word), count($documents), $word),
                array_values($documents)
            );
            fwrite($handle, call_user_func_array ('pack', $arguments));
        }

        foreach ($this->text as $text) {
            fwrite($handle, pack("SSSa*a*a*",
                strlen($text[0]), strlen($text[1]), strlen($text[2])
                , $text[0], $text[1], $text[2]));
        }

        return fclose($handle);
    }
}
|
||||
|
||||
?>
|
||||
93
core/search/lexer.php
Normal file
93
core/search/lexer.php
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
<?php
|
||||
|
||||
/**
 * Splits a query string into tokens.
 *
 * A token is an array (type, text). Words are runs of characters accepted by
 * ctype_alpha(); "~", "|", "(", ")" and "&" are one-character tokens; spaces
 * are skipped; any other character is ignored.
 */
class Lexer
{
    const TOKEN_NOT = 1;
    const TOKEN_OR = 2;
    const TOKEN_LPAREN = 3;
    const TOKEN_RPAREN = 4;
    const TOKEN_AND = 5;
    const TOKEN_WORD = 6;
    const TOKEN_EOL = 7;

    /** @var string Query being tokenized */
    protected $src = '';
    /** @var int Position of the next unread byte in $src */
    private $offset = 0;
    /** @var array Token read by the last nextToken() call */
    public $token;

    public function __construct ()
    {
    }

    /**
     * Sets the string to tokenize and restarts reading from its beginning.
     *
     * @param string $src
     */
    function setSource ($src)
    {
        $this->src = $src;
        // Without this reset a reused lexer would continue at the old position.
        $this->offset = 0;
    }

    private function skipSpace ()
    {
        while (!$this->isEOL() && $this->getChar() == " ") {
            $this->offset++;
        }
    }

    private function getChar ()
    {
        return $this->src [$this->offset];
    }

    /**
     * Checks for the end of the string
     */
    private function isEOL () {
        return $this->offset >= strlen($this->src);
    }

    /**
     * One-character token.
     *
     * Consumes the current character and returns its token, or null when the
     * character is not one of the operators.
     */
    private function easyToken () {
        $ch = $this->getChar ();
        $this->offset++;
        switch ($ch) {
            case '~': return array(self::TOKEN_NOT, $ch);
            case '|': return array(self::TOKEN_OR, $ch);
            case '(': return array(self::TOKEN_LPAREN, $ch);
            case ')': return array(self::TOKEN_RPAREN, $ch);
            case '&': return array(self::TOKEN_AND, $ch);
        }
        return null;
    }

    /**
     * Returns the next token
     *
     * @return array (token type, token text); TOKEN_EOL with "" at the end
     */
    public function getToken ()
    {
        while (true) {
            $this->skipSpace ();
            if ($this->isEOL()) {
                return array(self::TOKEN_EOL, "");
            }
            if (ctype_alpha($this->getChar())) {
                $start = $this->offset;
                while (!$this->isEOL() && ctype_alpha($this->getChar())) {
                    $this->offset ++;
                }
                return array(self::TOKEN_WORD, substr ($this->src, $start, $this->offset-$start));
            }
            $token = $this->easyToken();
            if ($token !== null) {
                return $token;
            }
            // Unknown character: exactly that one character was consumed, so
            // nothing after the following token gets skipped.
        }
    }

    /**
     * Reads the next token into $this->token.
     */
    public function nextToken ()
    {
        $this->token = $this->getToken();
    }
}
|
||||
|
||||
?>
|
||||
98
core/search/search.php
Normal file
98
core/search/search.php
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
<?php
|
||||
|
||||
require_once 'core/search/lexer.php';
|
||||
require_once 'core/functions.php';
|
||||
|
||||
/**
 * Search in the index.
 *
 * Recursive-descent evaluator for query strings. A query result is an array
 * with two keys: 'files' (document numbers) and 'words' (matched stems).
 * The operator levels are built in the constructor as callables chained
 * andQuery -> orQuery -> notQuery -> easyQuery.
 */
class Search
{
    private $lexer;
    /** @var callable Receives a word, returns array ('files' => ..., 'words' => ...) */
    private $index;

    // Callables assembled in the constructor. They were created as dynamic
    // (hence public) properties before, so they stay public.
    public $op;
    public $binary;
    public $union;
    public $intersection;
    public $notQuery;
    public $orQuery;
    public $andQuery;

    /**
     * @param callable $index Word lookup, called with the text of a token
     */
    function __construct ($index)
    {
        $this->lexer = new Lexer();
        $this->index = $index;

        // lcurry() comes from core/functions.php; presumably it binds the
        // leading arguments of a callable - confirm against that file.
        $this->op = array ($this, 'Op');
        $this->binary = array ($this, 'binaryOp');
        $this->union = array ($this, 'union');
        $this->intersection = lcurry($this->op, 'array_uintersect', $this->union);

        $this->notQuery = lcurry ($this->binary, Lexer::TOKEN_NOT,
            lcurry($this->op, 'array_udiff', 'array_udiff'), array ($this, 'easyQuery'));

        $this->orQuery = lcurry ($this->binary, Lexer::TOKEN_OR,
            lcurry($this->op, $this->union, $this->union), $this->notQuery);

        $this->andQuery = lcurry ($this->binary, Lexer::TOKEN_AND, $this->intersection, $this->orQuery);
    }

    /**
     * Union of two lists without duplicates.
     *
     * @param array $a
     * @param array $b
     * @param mixed $callback Unused; present so the method can be called like
     *                        array_uintersect()/array_udiff()
     *
     * @return array Re-indexed list of the distinct values of $a and $b
     */
    function union ($a, $b, $callback)
    {
        // A document found by both operands must be listed only once.
        return array_values(array_unique(array_merge($a, $b)));
    }

    /**
     * Loose equality test.
     *
     * @return bool
     */
    function Eq ($a, $b)
    {
        return $a == $b;
    }

    /**
     * Three-way comparison for array_uintersect()/array_udiff(), which need
     * 0 for equal values and a negative/positive number otherwise. A boolean
     * equality test does not work there: it reports "equal" as 1 and
     * "different" as 0.
     *
     * @return int -1, 0 or 1
     */
    function compare ($a, $b)
    {
        if ($a == $b) {
            return 0;
        }
        return ($a < $b) ? -1 : 1;
    }

    /**
     * Combines two query results.
     *
     * @param callable $files Set operation applied to the 'files' lists
     * @param callable $words Set operation applied to the 'words' lists
     * @param array    $a     Left result
     * @param array    $b     Right result
     *
     * @return array Combined result with the keys 'words' and 'files'
     */
    function Op ($files, $words, $a, $b) {
        $compare = array ($this, 'compare');
        return array (
            'words' => call_user_func ($words, $a['words'], $b['words'], $compare),
            'files' => call_user_func ($files, $a['files'], $b['files'], $compare)
        );
    }

    /**
     * Parses and evaluates a query string.
     *
     * @param string $source Query
     *
     * @return array Result with the keys 'words' and 'files'
     */
    public function getQuery ($source)
    {
        $this->lexer->setSource ($source);
        $this->lexer->nextToken();
        return $this->topQuery();
    }

    /**
     * Evaluates an AND-level expression; further parenthesised groups that
     * follow it directly are intersected with the result.
     */
    function topQuery ()
    {
        $result = call_user_func ($this->andQuery);
        while ($this->lexer->token[0] == Lexer::TOKEN_LPAREN) {
            $result = call_user_func ($this->intersection, $result, call_user_func ($this->andQuery));
        }
        return $result;
    }

    /**
     * Evaluates a parenthesised sub-query, or looks the current token up in
     * the index.
     */
    function easyQuery ()
    {
        if ($this->lexer->token[0] == Lexer::TOKEN_LPAREN) {
            $this->lexer->nextToken ();
            $result = $this->topQuery ();
            if ($this->lexer->token[0] == Lexer::TOKEN_RPAREN) {
                $this->lexer->nextToken ();
            }
            return $result;
        }

        $result = call_user_func ($this->index, $this->lexer->token[1]);
        $this->lexer->nextToken ();
        return $result;
    }

    /**
     * Evaluates a left-associative chain "next (type next)*".
     *
     * @param int $type Token type of the operator
     * @param function $op Function applied to both operands when the token type matches
     * @param function $next Handler of the next (operand) level of the query
     */
    function binaryOp ($type, $op, $next)
    {
        $result = call_user_func($next);
        while ($this->lexer->token[0] == $type) {
            $this->lexer->nextToken();
            $result = call_user_func($op, $result, call_user_func ($next));
        }
        return $result;
    }
}
|
||||
|
||||
?>
|
||||
102
core/search/searcher.php
Normal file
102
core/search/searcher.php
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
<?php
|
||||
|
||||
require_once 'core/search/search.php';
|
||||
require_once 'core/search/htmlhelper.php';
|
||||
require_once 'core/search/stemmer.php';
|
||||
|
||||
/**
 * Loads an index file and answers search queries against it.
 */
class Searcher {
    /** @var array Word stem => document numbers (keys as returned by unpack()) */
    /* protected */ public $index = array ();
    /** @var array Document number => array (title, url, text) */
    protected $text = array ();
    protected $search;

    public function __construct ()
    {
        // Could the object pass a default method such as getWordStat instead??
        $this->search = new Search (array ($this, 'getWord'));
    }

    /**
     * Reads the contents of an index file
     *
     * The layout read here: word and document counts, then per word its
     * length, document count, bytes and document numbers, then per document
     * the title/url/text lengths followed by the three strings. All numbers
     * use unpack() format "S".
     *
     * @param string $fileName File name
     *
     * @return bool False when the file cannot be opened or ends early
     */
    function setSource ($fileName)
    {
        $file = fopen($fileName, "rb");
        if ($file === false) {
            return false;
        }
        $loaded = $this->load($file);
        fclose($file);
        return $loaded;
    }

    /**
     * Reads the index data from an open file handle.
     *
     * @return bool False as soon as a record is incomplete
     */
    private function load ($file)
    {
        $header = $this->readBytes($file, 4);
        if (strlen($header) < 4) {
            return false;
        }
        $all = unpack("Swords/Stexts", $header);

        for ($i = 0; $i < $all['words']; $i++) {
            $pos = $this->readBytes($file, 4);
            if (strlen($pos) < 4) {
                return false;
            }
            $size = unpack("Sword/Sindex", $pos);

            $word = $this->readBytes($file, $size['word']);
            $packed = $this->readBytes($file, $size['index'] * 2);
            if (strlen($word) < $size['word'] || strlen($packed) < $size['index'] * 2) {
                return false;
            }
            $this->index[$word] = ($packed === '') ? array () : unpack("S*", $packed);
        }

        for ($i = 0; $i < $all['texts']; $i++) {
            $pos = $this->readBytes($file, 6);
            if (strlen($pos) < 6) {
                return false;
            }
            $size = unpack("Stitle/Surl/Stext", $pos);

            $title = $this->readBytes($file, $size['title']);
            $url = $this->readBytes($file, $size['url']);
            $text = $this->readBytes($file, $size['text']);
            if (strlen($text) < $size['text']) {
                return false;
            }
            $this->text [] = array ($title, $url, $text);
        }
        return true;
    }

    /**
     * Reads up to $length bytes from a handle.
     *
     * fread() is never called with a length of 0 (an empty title or text),
     * which it does not accept.
     *
     * @return string The bytes read; shorter than $length at end of file
     */
    private function readBytes ($handle, $length)
    {
        $data = '';
        while (strlen($data) < $length && !feof($handle)) {
            $chunk = fread($handle, $length - strlen($data));
            if ($chunk === false || $chunk === '') {
                break;
            }
            $data .= $chunk;
        }
        return $data;
    }

    /**
     * For a word returns the list of files and the word itself.
     *
     * The word is lower-cased and stemmed, the same two steps Index::clean()
     * applies to indexed words.
     *
     * @param string $word
     *
     * @return array ('files' => document numbers, 'words' => array (stem)),
     *               both empty when the stem is not in the index
     */
    public function getWord ($word)
    {
        $preword = Stemmer::russian(strtolower($word));
        if (isset($this->index[$preword])) {
            return array ('files' => $this->index[$preword], 'words' => array ($preword));
        }
        return array ('files' => array (), 'words' => array ());
    }

    /**
     * List of the documents in which the word occurs
     *
     * @param array $query Query result with the keys 'words' and 'files'
     *
     * @return array One array ('title', 'file', 'text') per document; 'text'
     *               is an excerpt built around the first word of the query
     */
    function getResult (&$query)
    {
        $result = array ();
        $words = $query['words'];
        // The set operations may preserve keys, so key 0 need not exist.
        $first = empty($words) ? '' : reset($words);

        foreach ($query['files'] as $n) {
            $result [] = array (
                'title' => $this->text[$n][0],
                'file' => $this->text[$n][1],
                'text' => $this->getSlice ($first, $this->text[$n][2]));
        }
        return $result;
    }

    /**
     * Part of the document in which the word occurs
     *
     * The excerpt starts at the beginning of the sentence (after the last
     * "." before the word) but at most 100 bytes before the word, wraps the
     * word in a red <span>, and continues for up to 100 bytes after it.
     * When the word does not occur in the text, the first 100 bytes of the
     * text are returned unmarked.
     *
     * NOTE(review): the text is inserted into the markup unescaped.
     *
     * @param $word Word
     * @param $text Text containing the word
     *
     * @return string HTML fragment
     */
    function getSlice ($word, $text)
    {
        $word = (string) $word;
        $pos = ($word === '') ? false : stripos($text, $word);
        if ($pos === false) {
            return (string) substr($text, 0, 100);
        }

        $length = strlen($word);
        $dot = strrpos((string) substr($text, 0, $pos), '.');
        // Without a preceding period the sentence starts at offset 0.
        $sentence = ($dot === false) ? 0 : $dot + 1;
        $offset = max($pos - 100, $sentence);

        return substr($text, $offset, $pos - $offset)
            . "<span style='color: red'>" . substr($text, $pos, $length) . "</span>"
            . substr ($text, $pos + $length, 100);
    }

    /**
     * Search by query
     *
     * @param string $query Query string
     *
     * @return array See getResult()
     */
    function search ($query)
    {
        $result = $this->search->getQuery($query);
        return $this->getResult($result);
    }
}
|
||||
|
||||
?>
|
||||
181
core/search/stemmer.php
Normal file
181
core/search/stemmer.php
Normal file
|
|
@ -0,0 +1,181 @@
|
|||
<?php
|
||||
/*
|
||||
* PHP5 implementation of Martin Porter's stemming algorithm for Russian language.
|
||||
* Written on a cold winter evening close to the end of 2005 by Dennis Kreminsky (etranger at etranger dot ru)
|
||||
* Use the code freely, but don't hold me responsible if it breaks whatever it might break.
|
||||
*
|
||||
*/
|
||||
|
||||
define ('CHAR_LENGTH', '1'); // Width in bytes of one character in the encoding of this source file. Instead of the (not supported by default) mb_*
// string functions, the stemmer uses the standard ones and steps through words CHAR_LENGTH bytes at a time.
// The value '1' matches a single-byte charset such as WIN-1251. All Russian characters take 2 bytes in UTF-8,
// so if this source file is converted to UTF-8, change CHAR_LENGTH to '2'.
//
|
||||
/**
 * Porter-style stemmer for Russian words.
 *
 * Works on bytes: every character is assumed to be CHAR_LENGTH bytes wide,
 * and the suffix literals below are compared byte-for-byte with the input.
 */
class Stemmer {

    /** Characters treated as vowels when the word regions are located. */
    static private $vowels = array('à','å','è','î','ó','û','ý','þ','ÿ');

    /**
     * Returns the stem of a word.
     *
     * The word is split after its first vowel; the part before the split is
     * kept unchanged and steps 1-4 are applied to the rest.
     *
     * @param string $word
     *
     * @return string
     */
    static public function russian($word)
    {
        $a = self::rv($word);
        $start = $a[0];
        $rv = $a[1];
        $rv = self::step1($rv);
        $rv = self::step2($rv);
        $rv = self::step3($rv);
        $rv = self::step4($rv);
        return $start.$rv;
    }

    /**
     * Splits a word after its first vowel.
     *
     * @return array array (part up to and including the first vowel, rest);
     *               the rest is '' when the word contains no vowel
     */
    static private function rv($word)
    {
        $length = strlen($word);
        for ($i = 0; $i < $length; $i += CHAR_LENGTH) {
            if (in_array(substr($word, $i, CHAR_LENGTH), self::$vowels, true)) {
                $split = $i + CHAR_LENGTH;
                return array((string) substr($word, 0, $split), (string) substr($word, $split));
            }
        }
        return array($word, '');
    }

    /**
     * Returns the part of $word after the first non-vowel that follows a
     * vowel, or '' when there is no such position.
     *
     * @param string $word
     * @param bool   $seenVowel True when a vowel directly precedes $word
     */
    static private function region($word, $seenVowel = false)
    {
        $length = strlen($word);
        for ($i = 0; $i < $length; $i += CHAR_LENGTH) {
            if (in_array(substr($word, $i, CHAR_LENGTH), self::$vowels, true)) {
                $seenVowel = true;
            } elseif ($seenVowel) {
                return (string) substr($word, $i + CHAR_LENGTH);
            }
        }
        return '';
    }

    /**
     * Cuts the first suffix of the list that the word ends with.
     *
     * @param bool $requireAya Only accept a suffix that passes has_aya()
     *
     * @return string|null The shortened word, or null when nothing matched
     */
    static private function cut_first($word, $suffixes, $requireAya = false)
    {
        foreach ($suffixes as $suffix) {
            if (self::has_suffix($word, $suffix)
                && (!$requireAya || self::has_aya($word, $suffix))) {
                return self::cut_suffix($word, $suffix);
            }
        }
        return null;
    }

    /**
     * Removes, in list order, every suffix the word currently ends with.
     *
     * @param string $word
     * @param array  $suffix_list
     *
     * @return string
     */
    static function substitute ($word, &$suffix_list)
    {
        foreach ($suffix_list as $suffix) {
            if (self::has_suffix($word, $suffix)) {
                $word = self::cut_suffix($word, $suffix);
            }
        }
        return $word;
    }

    /**
     * Tells whether $word ends with $suffix.
     *
     * The comparison is a strict string comparison, so numeric-looking
     * strings are not compared as numbers.
     *
     * @return bool
     */
    static function has_suffix ($word, $suffix)
    {
        return (string) substr($word, -(strlen($suffix))) === (string) $suffix;
    }

    /**
     * Tells whether the character directly before the suffix is one of the
     * two characters required in front of some endings.
     *
     * @return bool
     */
    static function has_aya ($word, $suffix)
    {
        $before = substr($word, -strlen($suffix) - CHAR_LENGTH, CHAR_LENGTH);
        return ($before == 'à' || $before == 'ÿ');
    }

    /**
     * Removes the last strlen($suffix) bytes from $word.
     */
    static function cut_suffix ($word, $suffix)
    {
        return substr($word, 0, strlen($word) - strlen($suffix));
    }

    /**
     * Step 1: removes a perfective gerund ending; otherwise removes a
     * reflexive ending and then an adjectival, verb or noun ending.
     */
    static private function step1($word)
    {
        $perfective1 = array('â', 'âøè', 'âøèñü');
        $cut = self::cut_first($word, $perfective1, true);
        if ($cut !== null) {
            return $cut;
        }

        $perfective2 = array('èâ','èâøè','èâøèñü','ûâøè','ûâøèñü');
        $cut = self::cut_first($word, $perfective2);
        if ($cut !== null) {
            return $cut;
        }

        $reflexive = array('ñÿ', 'ñü');
        $word = self::substitute($word, $reflexive);

        $adjective = array('åå','èå','ûå','îå','èìè','ûìè','åé','èé','ûé','îé','åì','èì','ûì','îì','åãî','îãî','åìó','îìó','èõ','ûõ','óþ','þþ','àÿ','ÿÿ','îþ','åþ');
        // NOTE(review): the published algorithm applies the preceding-character
        // condition to the other participle group; the two lists look swapped
        // here. Left unchanged - confirm before altering.
        $participle2 = array('åì','íí','âø','þù','ù');
        $participle1 = array('èâø','ûâø','óþù');
        $cut = self::cut_first($word, $adjective);
        if ($cut !== null) {
            $word = $cut;
            foreach ($participle1 as $suffix) {
                if (self::has_suffix($word, $suffix) && self::has_aya ($word, $suffix)) {
                    $word = self::cut_suffix($word, $suffix);
                }
            }
            return self::substitute($word, $participle2);
        }

        $verb1 = array('ëà','íà','åòå','éòå','ëè','é','ë','åì','í','ëî','íî','åò','þò','íû','òü','åøü','ííî');
        $cut = self::cut_first($word, $verb1, true);
        if ($cut !== null) {
            return $cut;
        }

        $verb2 = array('èëà','ûëà','åíà','åéòå','óéòå','èòå','èëè','ûëè','åé','óé','èë','ûë','èì','ûì','åí','èëî','ûëî','åíî','ÿò','óåò','óþò','èò','ûò','åíû','èòü','ûòü','èøü','óþ','þ');
        $cut = self::cut_first($word, $verb2);
        if ($cut !== null) {
            return $cut;
        }

        $noun = array('à','åâ','îâ','èå','üå','å','èÿìè','ÿìè','àìè','åè','èè','è','èåé','åé','îé','èé','é','èÿì','ÿì','èåì','åì','àì','îì','î','ó','àõ','èÿõ','ÿõ','û','ü','èþ','üþ','þ','èÿ','üÿ','ÿ');
        $cut = self::cut_first($word, $noun);
        if ($cut !== null) {
            return $cut;
        }

        return $word;
    }

    /**
     * Step 2: removes one trailing character if it is the one tested below.
     */
    static private function step2($word)
    {
        if (substr($word,-CHAR_LENGTH,CHAR_LENGTH) == 'è')
            $word = substr($word, 0, strlen($word)-CHAR_LENGTH);
        return $word;
    }

    /**
     * Step 3: removes a derivational ending that lies entirely inside R2.
     *
     * $word is the part of the original word after its first vowel, so R1
     * starts after the first non-vowel of $word; R2 is the same kind of
     * region taken inside R1.
     */
    static private function step3($word)
    {
        $r1 = self::region($word, true);
        $r2 = self::region($r1);

        $derivational = array('îñò', 'îñòü');
        foreach ($derivational as $suffix) {
            if (self::has_suffix($r2, $suffix)) {
                // The ending is cut from the word itself, not from R2.
                return self::cut_suffix($word, $suffix);
            }
        }
        return $word;
    }

    /**
     * Step 4: undoubles a trailing double letter, or removes a superlative
     * ending and then undoubles; finally removes a trailing soft sign.
     */
    static private function step4($word)
    {
        if (substr($word,-CHAR_LENGTH*2) == 'íí') {
            $word = substr($word, 0, strlen($word)-CHAR_LENGTH);
        } else {
            $superlative = array('åéø', 'åéøå');
            $word = self::substitute($word, $superlative);
            if (substr($word,-CHAR_LENGTH*2) == 'íí')
                $word = substr($word, 0, strlen($word) - CHAR_LENGTH);
        }
        // The algorithm describes the soft-sign removal as an "otherwise"
        // case; the original author left it unguarded, reasoning that no
        // Russian word ends with a superlative or double-letter ending
        // followed by a soft sign.
        if (substr($word,-CHAR_LENGTH,CHAR_LENGTH) == 'ü')
            $word = substr($word, 0, strlen($word)-CHAR_LENGTH);
        return $word;
    }
}
|
||||
?>
|
||||
Loading…
Add table
Add a link
Reference in a new issue