244 lines
7.4 KiB
PHP
244 lines
7.4 KiB
PHP
<?php
|
|
|
|
function GetStem($word) {return GermanStemmer::stem($word);}
|
|
|
|
mb_internal_encoding("utf-8");
|
|
|
|
/**
|
|
|
|
* Copyright (c) 2013 Aris Buzachis (buzachis.aris@gmail.com)
|
|
*
|
|
* All rights reserved.
|
|
*
|
|
* This script is free software.
|
|
*
|
|
* DISCLAIMER:
|
|
*
|
|
* IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
|
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
|
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/**
|
|
* Takes a word and reduces it to its German stem using the Porter stemmer algorithm.
|
|
*
|
|
* References:
|
|
* - http://snowball.tartarus.org/algorithms/porter/stemmer.html
|
|
* - http://snowball.tartarus.org/algorithms/german/stemmer.html
|
|
*
|
|
* Usage:
|
|
* $stem = GermanStemmer::stem($word);
|
|
*
|
|
* @author Aris Buzachis <buzachis.aris@gmail.com>
|
|
* @author Pascal Landau <kontakt@myseosolution.de>
|
|
*/
|
|
|
|
class GermanStemmer
|
|
{
|
|
/**
|
|
* R1 and R2 regions (see the Porter algorithm)
|
|
*/
|
|
private static $R1;
|
|
private static $R2;
|
|
|
|
private static $cache = array();
|
|
|
|
private static $vowels = array('a','e','i','o','u','y','ä','ö','ü');
|
|
private static $s_ending = array('b','d','f','g','h','k','l','m','n','r','t');
|
|
private static $st_ending = array('b','d','f','g','h','k','l','m','n','t');
|
|
|
|
|
|
/**
|
|
* Gets the stem of $word.
|
|
* @param string $word
|
|
* @return string
|
|
*/
|
|
public static function stem($word) {
|
|
$word = mb_strtolower($word);
|
|
//check for invalid characters
|
|
preg_match("#.#u",$word);
|
|
if(preg_last_error() !== 0){
|
|
throw new \InvalidArgumentException("Word '$word' seems to be errornous. Error code from preg_last_error(): ".preg_last_error());
|
|
}
|
|
if (!isset(self::$cache[$word])) {
|
|
$result = self::getStem($word);
|
|
self::$cache[$word] = $result;
|
|
}
|
|
|
|
return self::$cache[$word];
|
|
}
|
|
|
|
/**
|
|
* @param $word
|
|
* @return string
|
|
*/
|
|
private static function getStem($word) {
|
|
$word = self::step0a($word);
|
|
$word = self::step1($word);
|
|
$word = self::step2($word);
|
|
$word = self::step3($word);
|
|
$word = self::step0b($word);
|
|
|
|
return $word;
|
|
}
|
|
|
|
/**
|
|
* Replaces to protect some characters
|
|
* @param string $word
|
|
* @return string mixed
|
|
*/
|
|
private static function step0a($word) {
|
|
$vstr = implode('',self::$vowels);
|
|
$word = preg_replace('#(['.$vstr.'])u(['.$vstr.'])#u', '$1U$2',$word);
|
|
$word = preg_replace('#(['.$vstr.'])y(['.$vstr.'])#u', '$1Y$2',$word);
|
|
|
|
return $word;
|
|
}
|
|
|
|
/**
|
|
* Undo the initial replaces
|
|
* @param string $word
|
|
* @return string
|
|
*/
|
|
private static function step0b($word) {
|
|
$word = str_replace(array('ä','ö','ü', 'U','Y'),array('a','o','u','u','y'),$word);
|
|
|
|
return $word;
|
|
}
|
|
|
|
private static function step1($word) {
|
|
$word = str_replace('ß','ss',$word);
|
|
|
|
self::getR($word);
|
|
|
|
$replaceCount = 0;
|
|
|
|
$arr = array('em','ern','er');
|
|
foreach ($arr as $s) {
|
|
self::$R1 = preg_replace('#'.$s.'$#u','',self::$R1,-1,$replaceCount);
|
|
if ($replaceCount > 0) {
|
|
$word = preg_replace('#'.$s.'$#u','',$word);
|
|
}
|
|
}
|
|
|
|
$arr = array('en','es','e');
|
|
foreach ($arr as $s) {
|
|
self::$R1 = preg_replace('#'.$s.'$#u','',self::$R1,-1,$replaceCount);
|
|
if ($replaceCount > 0) {
|
|
$word = preg_replace('#'.$s.'$#u','',$word);
|
|
$word = preg_replace('#niss$#u', 'nis', $word);
|
|
}
|
|
}
|
|
|
|
$word = preg_replace('/(['.implode('',self::$s_ending).'])s$/u','$1',$word);
|
|
|
|
return $word;
|
|
}
|
|
|
|
private static function step2($word) {
|
|
self::getR($word);
|
|
|
|
$replaceCount = 0;
|
|
|
|
$arr = array('est','er','en');
|
|
foreach ($arr as $s) {
|
|
self::$R1 = preg_replace('#'.$s.'$#u','',self::$R1,-1,$replaceCount);
|
|
if ($replaceCount > 0) {
|
|
$word = preg_replace('#'.$s.'$#u','',$word);
|
|
}
|
|
}
|
|
|
|
if (strpos(self::$R1,'st') !== false) {
|
|
self::$R1 = preg_replace('#st$#u','',self::$R1);
|
|
$word = preg_replace('#(...['.implode('',self::$st_ending).'])st$#u','$1',$word);
|
|
}
|
|
|
|
return $word;
|
|
}
|
|
|
|
private static function step3($word) {
|
|
self::getR($word);
|
|
|
|
$replaceCount = 0;
|
|
|
|
$arr = array('end', 'ung');
|
|
foreach ($arr as $s) {
|
|
if (preg_match('#'.$s.'$#u',self::$R2)) {
|
|
$word = preg_replace('#([^e])'.$s.'$#u','$1',$word, -1, $replaceCount);
|
|
if ($replaceCount > 0) {
|
|
self::$R2 = preg_replace('#'.$s.'$#u','',self::$R2,-1,$replaceCount);
|
|
}
|
|
}
|
|
}
|
|
|
|
$arr = array('isch', 'ik', 'ig');
|
|
foreach ($arr as $s) {
|
|
if (preg_match('#'.$s.'$#u',self::$R2)) {
|
|
$word = preg_replace('#([^e])'.$s.'$#u','$1',$word, -1, $replaceCount);
|
|
if ($replaceCount > 0) {
|
|
self::$R2 = preg_replace('#'.$s.'$#u','',self::$R2);
|
|
}
|
|
}
|
|
}
|
|
|
|
$arr = array('lich', 'heit');
|
|
foreach ($arr as $s) {
|
|
self::$R2 = preg_replace('#'.$s.'$#u','',self::$R2,-1,$replaceCount);
|
|
if ($replaceCount > 0) {
|
|
$word = preg_replace('#'.$s.'$#u','',$word);
|
|
} else {
|
|
if (preg_match('#'.$s.'$#u',self::$R1)) {
|
|
$word = preg_replace('#(er|en)'.$s.'$#u','$1',$word, -1, $replaceCount);
|
|
if ($replaceCount > 0) {
|
|
self::$R1 = preg_replace('#'.$s.'$#u','',self::$R1);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
$arr = array('keit');
|
|
foreach ($arr as $s) {
|
|
self::$R2 = preg_replace('#'.$s.'$#u','',self::$R2,-1,$replaceCount);
|
|
if ($replaceCount > 0) {
|
|
$word = preg_replace('#'.$s.'$#u','',$word);
|
|
}
|
|
}
|
|
|
|
return $word;
|
|
}
|
|
|
|
/**
|
|
* Find R1 and R2
|
|
* @param string $word
|
|
*/
|
|
private static function getR($word) {
|
|
self::$R1 = "";
|
|
self::$R2 = "";
|
|
|
|
$vowels = implode("", self::$vowels);
|
|
$vowelGroup = "[{$vowels}]";
|
|
$nonVowelGroup = "[^{$vowels}]";
|
|
// R1 is the region after the first non-vowel following a vowel, or is the null region at the end of the word if there is no such non-vowel.
|
|
$pattern = "#(?P<rest>.*?{$vowelGroup}{$nonVowelGroup})(?P<r>.*)#u";
|
|
if(preg_match($pattern, $word, $match)){
|
|
$rest = $match["rest"];
|
|
$r1 = $match["r"];
|
|
// [...], but then R1 is adjusted so that the region before it contains at least 3 letters.
|
|
$cutOff = 3 - mb_strlen($rest);
|
|
if($cutOff > 0){
|
|
$r1 = mb_substr($r1, $cutOff);
|
|
}
|
|
self::$R1 = $r1;
|
|
}
|
|
|
|
//R2 is the region after the first non-vowel following a vowel in R1, or is the null region at the end of the word if there is no such non-vowel.
|
|
if(preg_match($pattern, self::$R1, $match)){
|
|
self::$R2 = $match["r"];
|
|
}
|
|
}
|
|
}
|