Перекодировка в utf-8
This commit is contained in:
parent
43f501a5e2
commit
df2e65a670
80 changed files with 668 additions and 668 deletions
|
|
@ -27,7 +27,7 @@ class Stemmer {
|
|||
|
||||
static private function rv($word)
|
||||
{
|
||||
$vowels = array('à','å','è','î','ó','û','ý','þ','ÿ');
|
||||
$vowels = array('а','е','и','о','у','ы','э','ю','я');
|
||||
$flag = 0;
|
||||
$rv = '';
|
||||
$start = '';
|
||||
|
|
@ -59,7 +59,7 @@ class Stemmer {
|
|||
|
||||
static function has_aya ($word, $suffix)
|
||||
{
|
||||
return (substr($word,-strlen($suffix)-CHAR_LENGTH,CHAR_LENGTH)=='à' || substr($word,-strlen($suffix)-CHAR_LENGTH,CHAR_LENGTH)=='ÿ');
|
||||
return (substr($word,-strlen($suffix)-CHAR_LENGTH,CHAR_LENGTH)=='а' || substr($word,-strlen($suffix)-CHAR_LENGTH,CHAR_LENGTH)=='я');
|
||||
}
|
||||
|
||||
static function cut_suffix ($word, $suffix)
|
||||
|
|
@ -69,26 +69,26 @@ class Stemmer {
|
|||
|
||||
static private function step1($word)
|
||||
{
|
||||
$perfective1 = array('â', 'âøè', 'âøèñü');
|
||||
$perfective1 = array('в', 'вши', 'вшись');
|
||||
foreach ($perfective1 as $suffix) {
|
||||
if (self::has_suffix($word, $suffix) && self::has_aya ($word, $suffix)) {
|
||||
return self::cut_suffix($word, $suffix);
|
||||
}
|
||||
}
|
||||
|
||||
$perfective2 = array('èâ','èâøè','èâøèñü','ûâøè','ûâøèñü');
|
||||
$perfective2 = array('ив','ивши','ившись','ывши','ывшись');
|
||||
foreach ($perfective2 as $suffix) {
|
||||
if (self::has_suffix($word, $suffix)) {
|
||||
return self::cut_suffix($word, $suffix);
|
||||
}
|
||||
}
|
||||
|
||||
$reflexive = array('ñÿ', 'ñü');
|
||||
$reflexive = array('ся', 'сь');
|
||||
$word = self::substitute($word, $reflexive);
|
||||
|
||||
$adjective = array('åå','èå','ûå','îå','èìè','ûìè','åé','èé','ûé','îé','åì','èì','ûì','îì','åãî','îãî','åìó','îìó','èõ','ûõ','óþ','þþ','àÿ','ÿÿ','îþ','åþ');
|
||||
$participle2 = array('åì','íí','âø','þù','ù');
|
||||
$participle1 = array('èâø','ûâø','óþù');
|
||||
$adjective = array('ее','ие','ые','ое','ими','ыми','ей','ий','ый','ой','ем','им','ым','ом','его','ого','ему','ому','их','ых','ую','юю','ая','яя','ою','ею');
|
||||
$participle2 = array('ем','нн','вш','ющ','щ');
|
||||
$participle1 = array('ивш','ывш','ующ');
|
||||
foreach ($adjective as $suffix) {
|
||||
if (self::has_suffix($word, $suffix)) {
|
||||
$word = self::cut_suffix($word, $suffix);
|
||||
|
|
@ -101,17 +101,17 @@ class Stemmer {
|
|||
}
|
||||
}
|
||||
|
||||
$verb1 = array('ëà','íà','åòå','éòå','ëè','é','ë','åì','í','ëî','íî','åò','þò','íû','òü','åøü','ííî');
|
||||
$verb1 = array('ла','на','ете','йте','ли','й','л','ем','н','ло','но','ет','ют','ны','ть','ешь','нно');
|
||||
foreach ($verb1 as $suffix)
|
||||
if (self::has_suffix($word, $suffix) && self::has_aya ($word, $suffix))
|
||||
return self::cut_suffix($word, $suffix);
|
||||
|
||||
$verb2 = array('èëà','ûëà','åíà','åéòå','óéòå','èòå','èëè','ûëè','åé','óé','èë','ûë','èì','ûì','åí','èëî','ûëî','åíî','ÿò','óåò','óþò','èò','ûò','åíû','èòü','ûòü','èøü','óþ','þ');
|
||||
$verb2 = array('ила','ыла','ена','ейте','уйте','ите','или','ыли','ей','уй','ил','ыл','им','ым','ен','ило','ыло','ено','ят','ует','уют','ит','ыт','ены','ить','ыть','ишь','ую','ю');
|
||||
foreach ($verb2 as $suffix)
|
||||
if (self::has_suffix($word, $suffix))
|
||||
return self::cut_suffix($word, $suffix);
|
||||
|
||||
$noun = array('à','åâ','îâ','èå','üå','å','èÿìè','ÿìè','àìè','åè','èè','è','èåé','åé','îé','èé','é','èÿì','ÿì','èåì','åì','àì','îì','î','ó','àõ','èÿõ','ÿõ','û','ü','èþ','üþ','þ','èÿ','üÿ','ÿ');
|
||||
$noun = array('а','ев','ов','ие','ье','е','иями','ями','ами','еи','ии','и','ией','ей','ой','ий','й','иям','ям','ием','ем','ам','ом','о','у','ах','иях','ях','ы','ь','ию','ью','ю','ия','ья','я');
|
||||
foreach ($noun as $suffix) {
|
||||
if (self::has_suffix($word, $suffix))
|
||||
return self::cut_suffix($word, $suffix);
|
||||
|
|
@ -122,14 +122,14 @@ class Stemmer {
|
|||
|
||||
static private function step2($word)
|
||||
{
|
||||
if (substr($word,-CHAR_LENGTH,CHAR_LENGTH) == 'è')
|
||||
if (substr($word,-CHAR_LENGTH,CHAR_LENGTH) == 'и')
|
||||
$word = substr($word, 0, strlen($word)-CHAR_LENGTH);
|
||||
return $word;
|
||||
}
|
||||
|
||||
static private function step3($word)
|
||||
{
|
||||
$vowels = array('à','å','è','î','ó','û','ý','þ','ÿ');
|
||||
$vowels = array('а','е','и','о','у','ы','э','ю','я');
|
||||
$flag = 0;
|
||||
$r1 = '';
|
||||
$r2 = '';
|
||||
|
|
@ -152,7 +152,7 @@ class Stemmer {
|
|||
if ($flag=1 && array_search(substr($r1,$i,CHAR_LENGTH), $vowels) === false)
|
||||
$flag=2;
|
||||
}
|
||||
$derivational=array('îñò', 'îñòü');
|
||||
$derivational=array('ост', 'ость');
|
||||
foreach ($derivational as $suffix)
|
||||
if (substr($r2,-(strlen($suffix))) == $suffix)
|
||||
$word=substr($word, 0, strlen($r2)-strlen($suffix));
|
||||
|
|
@ -161,19 +161,19 @@ class Stemmer {
|
|||
|
||||
static private function step4($word)
|
||||
{
|
||||
if (substr($word,-CHAR_LENGTH*2)=='íí')
|
||||
if (substr($word,-CHAR_LENGTH*2)=='нн')
|
||||
$word=substr($word, 0, strlen($word)-CHAR_LENGTH);
|
||||
else
|
||||
{
|
||||
$superlative=array('åéø', 'åéøå');
|
||||
$superlative=array('ейш', 'ейше');
|
||||
foreach ($superlative as $suffix)
|
||||
if (substr($word,-(strlen($suffix))) == $suffix)
|
||||
$word = substr($word, 0, strlen($word) - strlen($suffix));
|
||||
if (substr($word,-CHAR_LENGTH*2) == 'íí')
|
||||
if (substr($word,-CHAR_LENGTH*2) == 'нн')
|
||||
$word = substr($word, 0, strlen($word) - CHAR_LENGTH);
|
||||
}
|
||||
// should there be a guard flag? can't think of a russian word that ends with åéøü or ííü anyways, though the algorithm states this is an "otherwise" case
|
||||
if (substr($word,-CHAR_LENGTH,CHAR_LENGTH) == 'ü')
|
||||
// Should there be a guard flag here? No Russian word seems to end in "ейшь" or "ннь" anyway, though the algorithm specifies this as an "otherwise" case.
|
||||
if (substr($word,-CHAR_LENGTH,CHAR_LENGTH) == 'ь')
|
||||
$word=substr($word, 0, strlen($word)-CHAR_LENGTH);
|
||||
return $word;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue