phplibrary/src/search/stemmer.php
2017-02-09 14:57:40 +03:00

180 lines
7.2 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
/*
* PHP5 implementation of Martin Porter's stemming algorithm for Russian language.
* Written on a cold winter evening close to the end of 2005 by Dennis Kreminsky (etranger at etranger dot ru)
* Use the code freely, but don't hold me responsible if it breaks whatever it might break.
*
*/
// Byte length of a single Russian letter in the encoding THIS source file is saved in:
// 2 for UTF-8, 1 for single-byte charsets such as WIN-1251. The stemmer works with the plain
// byte-oriented string functions (mb_* is not available by default), so this value has to
// match the encoding of the Cyrillic literals in this file. It is derived from such a literal
// instead of being hard-coded, so converting the file to another charset needs no manual
// adjustment. Words passed to the stemmer must use the same encoding as this file.
define('CHAR_LENGTH', strlen('а'));
/**
 * Porter (Snowball) stemmer for Russian words.
 *
 * All matching is done with byte-oriented string functions against the Cyrillic literals in
 * this file, so the words passed in must use the same encoding as this source file. Matching
 * is case-sensitive against lowercase literals, and the vowel list has no separate entry
 * for "ё". Words are walked in fixed-width steps of one Cyrillic letter, so the region
 * detection is only meaningful for words made of Cyrillic letters.
 */
class Stemmer {
    /** Vowels used to locate the RV, R1 and R2 regions. */
    private static $vowels = array('а','е','и','о','у','ы','э','ю','я');

    /** Perfective gerund endings that must follow "а" or "я". */
    private static $gerund1 = array('в', 'вши', 'вшись');
    /** Perfective gerund endings removed unconditionally. */
    private static $gerund2 = array('ив','ивши','ившись','ыв','ывши','ывшись');
    /** Reflexive endings. */
    private static $reflexive = array('ся', 'сь');
    /** Adjective endings. */
    private static $adjective = array('ее','ие','ые','ое','ими','ыми','ей','ий','ый','ой','ем','им','ым','ом','его','ого','ему','ому','их','ых','ую','юю','ая','яя','ою','ею');
    /** Participle endings that must follow "а" or "я" (longer ending listed before its own tail). */
    private static $participle1 = array('ем','нн','вш','ющ','щ');
    /** Participle endings removed unconditionally. */
    private static $participle2 = array('ивш','ывш','ующ');
    /** Verb endings that must follow "а" or "я". */
    private static $verb1 = array('ла','на','ете','йте','ли','й','л','ем','н','ло','но','ет','ют','ны','ть','ешь','нно');
    /** Verb endings removed unconditionally. */
    private static $verb2 = array('ила','ыла','ена','ейте','уйте','ите','или','ыли','ей','уй','ил','ыл','им','ым','ен','ило','ыло','ено','ят','ует','уют','ит','ыт','ены','ить','ыть','ишь','ую','ю');
    /** Noun endings; a longer ending always precedes any shorter ending that is its tail. */
    private static $noun = array('а','ев','ов','ие','ье','е','иями','ями','ами','еи','ии','и','ией','ей','ой','ий','й','иям','ям','ием','ем','ам','ом','о','у','ах','иях','ях','ы','ь','ию','ью','ю','ия','ья','я');

    /**
     * Returns the stem of a Russian word.
     *
     * @param string $word word in the same encoding as this source file
     * @return string the untouched part up to and including the first vowel, followed by
     *                the stemmed remainder (RV); a word without vowels is returned unchanged
     */
    public static function russian($word)
    {
        $word = (string) $word;
        list($start, $rv) = self::rv($word);
        // R1 and R2 are defined on the WHOLE word. Endings are only ever removed from the
        // tail, so the offset stays valid; it is converted to an offset inside RV here.
        $r1 = self::region_after($word, 0);
        $r2 = self::region_after($word, $r1) - strlen($start);
        $rv = self::step1($rv);
        $rv = self::step2($rv);
        $rv = self::step3($rv, $r2);
        $rv = self::step4($rv);
        return $start.$rv;
    }

    /**
     * Byte length of one Cyrillic letter in this file's encoding (2 in UTF-8, 1 in WIN-1251).
     * Derived from a literal so it can never disagree with the literals it is compared to.
     */
    private static function char_length()
    {
        return strlen('а');
    }

    /** Tells whether the letter starting at byte $offset is one of the vowels. */
    private static function is_vowel_at($word, $offset)
    {
        return in_array(substr($word, $offset, self::char_length()), self::$vowels, true);
    }

    /**
     * Splits a word around its first vowel.
     *
     * @return array array($start, $rv): $start runs up to and including the first vowel,
     *               $rv is everything after it; without a vowel $start is the whole word
     */
    private static function rv($word)
    {
        $step = self::char_length();
        $len = strlen($word);
        for ($i = 0; $i < $len; $i += $step) {
            if (self::is_vowel_at($word, $i)) {
                $cut = $i + $step;
                // cast: substr() yields false instead of '' at end of string before PHP 8
                return array(substr($word, 0, $cut), (string) substr($word, $cut));
            }
        }
        return array($word, '');
    }

    /**
     * Returns the byte offset just past the first non-vowel that follows a vowel, scanning
     * from byte $from; strlen($word) when there is none. Called with 0 it yields the start
     * of R1, called with the start of R1 it yields the start of R2.
     */
    private static function region_after($word, $from)
    {
        $step = self::char_length();
        $len = strlen($word);
        $seen_vowel = false;
        for ($i = $from; $i < $len; $i += $step) {
            if (self::is_vowel_at($word, $i)) {
                $seen_vowel = true;
            } elseif ($seen_vowel) {
                return min($i + $step, $len);
            }
        }
        return $len;
    }

    /**
     * Removes the first listed suffix the word ends with.
     *
     * @param string $word
     * @param array  $suffixes  candidates, tried in list order
     * @param bool   $needs_aya when true a suffix only counts if it follows "а" or "я"
     * @return string|null the shortened word, or null when nothing matched
     */
    private static function strip_first($word, $suffixes, $needs_aya = false)
    {
        foreach ($suffixes as $suffix) {
            if (!self::has_suffix($word, $suffix)) {
                continue;
            }
            if ($needs_aya && !self::has_aya($word, $suffix)) {
                continue;
            }
            return self::cut_suffix($word, $suffix);
        }
        return null;
    }

    /**
     * Walks the list once and removes every suffix the (progressively shortened) word ends
     * with at the moment that suffix is visited.
     *
     * @param string $word
     * @param array  $suffix_list
     * @return string
     */
    public static function substitute($word, $suffix_list)
    {
        foreach ($suffix_list as $suffix) {
            if (self::has_suffix($word, $suffix)) {
                $word = self::cut_suffix($word, $suffix);
            }
        }
        return $word;
    }

    /** Tells whether $word ends with $suffix (byte-wise comparison). */
    public static function has_suffix($word, $suffix)
    {
        $suffix = (string) $suffix;
        return (string) substr($word, -strlen($suffix)) === $suffix;
    }

    /**
     * Tells whether the letter right before the trailing $suffix is "а" or "я".
     * The word is assumed to end with $suffix; only lengths are used to find the letter.
     */
    public static function has_aya($word, $suffix)
    {
        $step = self::char_length();
        $at = strlen($word) - strlen($suffix) - $step;
        if ($at < 0) {
            // nothing in front of the suffix
            return false;
        }
        $before = substr($word, $at, $step);
        return $before === 'а' || $before === 'я';
    }

    /** Drops as many trailing bytes from $word as $suffix is long. */
    public static function cut_suffix($word, $suffix)
    {
        return (string) substr($word, 0, strlen($word) - strlen($suffix));
    }

    /**
     * Step 1: remove a perfective gerund ending and stop; otherwise remove a reflexive
     * ending and then the first of adjectival, verb or noun ending that is present.
     */
    private static function step1($word)
    {
        $stem = self::strip_first($word, self::$gerund1, true);
        if ($stem === null) {
            $stem = self::strip_first($word, self::$gerund2);
        }
        if ($stem !== null) {
            return $stem;
        }

        $stem = self::strip_first($word, self::$reflexive);
        if ($stem !== null) {
            $word = $stem;
        }

        // Adjectival = adjective ending, optionally preceded by ONE participle ending.
        $stem = self::strip_first($word, self::$adjective);
        if ($stem !== null) {
            // The unconditional endings are the longer ones, so they are tried first.
            $base = self::strip_first($stem, self::$participle2);
            if ($base === null) {
                $base = self::strip_first($stem, self::$participle1, true);
            }
            return $base !== null ? $base : $stem;
        }

        $stem = self::strip_first($word, self::$verb1, true);
        if ($stem === null) {
            $stem = self::strip_first($word, self::$verb2);
        }
        if ($stem !== null) {
            return $stem;
        }

        $stem = self::strip_first($word, self::$noun);
        return $stem !== null ? $stem : $word;
    }

    /** Step 2: drop a trailing "и". */
    private static function step2($word)
    {
        return self::has_suffix($word, 'и') ? self::cut_suffix($word, 'и') : $word;
    }

    /**
     * Step 3: remove a derivational ending ("ость" / "ост") that lies entirely inside R2.
     *
     * @param string $word     the RV part being stemmed
     * @param int    $r2_start byte offset inside $word at which R2 begins
     */
    private static function step3($word, $r2_start)
    {
        foreach (array('ость', 'ост') as $suffix) {
            if (self::has_suffix($word, $suffix)) {
                if (strlen($word) - strlen($suffix) >= $r2_start) {
                    return self::cut_suffix($word, $suffix);
                }
                break;
            }
        }
        return $word;
    }

    /**
     * Step 4: undouble a trailing "нн"; otherwise remove a superlative ending and then
     * undouble "нн"; otherwise remove a trailing soft sign. The three cases are exclusive.
     */
    private static function step4($word)
    {
        if (self::has_suffix($word, 'нн')) {
            return self::cut_suffix($word, 'н');
        }
        foreach (array('ейше', 'ейш') as $suffix) {
            if (self::has_suffix($word, $suffix)) {
                $word = self::cut_suffix($word, $suffix);
                if (self::has_suffix($word, 'нн')) {
                    $word = self::cut_suffix($word, 'н');
                }
                return $word;
            }
        }
        if (self::has_suffix($word, 'ь')) {
            $word = self::cut_suffix($word, 'ь');
        }
        return $word;
    }
}