Files
resourcespace/lib/stemming/de.php
2025-07-18 16:20:14 +07:00

244 lines
7.4 KiB
PHP

<?php
/**
 * Procedural entry point for stemming a single German word.
 *
 * Simply forwards to GermanStemmer::stem() and hands back its result.
 *
 * @param string $word Word to stem.
 * @return string Stem as produced by GermanStemmer::stem().
 */
function GetStem($word)
{
    $stem = GermanStemmer::stem($word);

    return $stem;
}
// Make UTF-8 the default encoding for the mb_* calls in this file that do not
// pass an explicit encoding argument (mb_strtolower, mb_strlen, mb_substr).
mb_internal_encoding("utf-8");
/**
* Copyright (c) 2013 Aris Buzachis (buzachis.aris@gmail.com)
*
* All rights reserved.
*
* This script is free software.
*
* DISCLAIMER:
*
* IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Takes a word and reduces it to its German stem using the Porter stemmer algorithm.
*
* References:
* - http://snowball.tartarus.org/algorithms/porter/stemmer.html
* - http://snowball.tartarus.org/algorithms/german/stemmer.html
*
* Usage:
* $stem = GermanStemmer::stem($word);
*
* @author Aris Buzachis <buzachis.aris@gmail.com>
* @author Pascal Landau <kontakt@myseosolution.de>
*/
class GermanStemmer
{
    /**
     * Region R1 of the word currently being processed; recomputed by getR().
     */
    private static $R1;

    /**
     * Region R2 of the word currently being processed; recomputed by getR().
     */
    private static $R2;

    /**
     * Stems computed so far, keyed by the lower-cased input word.
     */
    private static $cache = array();

    /**
     * Letters treated as vowels when locating R1/R2 and when protecting u/y.
     */
    private static $vowels = array('a','e','i','o','u','y','ä','ö','ü');

    /**
     * Letters after which a trailing "s" is stripped in step 1.
     */
    private static $s_ending = array('b','d','f','g','h','k','l','m','n','r','t');

    /**
     * Letters after which a trailing "st" is stripped in step 2.
     */
    private static $st_ending = array('b','d','f','g','h','k','l','m','n','t');

    /**
     * Returns the stem of $word, computing it at most once per distinct
     * lower-cased word.
     *
     * @param string $word
     * @return string
     * @throws \InvalidArgumentException when PCRE reports an error while
     *                                   probing the lower-cased word.
     */
    public static function stem($word)
    {
        $word = mb_strtolower($word);

        // Probe the word with a /u pattern; a non-zero preg_last_error()
        // afterwards means PCRE could not process it.
        preg_match("#.#u", $word);
        $pcreError = preg_last_error();
        if ($pcreError !== PREG_NO_ERROR) {
            throw new \InvalidArgumentException(
                "Word '$word' seems to be errornous. Error code from preg_last_error(): " . $pcreError
            );
        }

        if (isset(self::$cache[$word])) {
            return self::$cache[$word];
        }

        $stem = self::getStem($word);
        self::$cache[$word] = $stem;

        return $stem;
    }

    /**
     * Runs the stemming steps in their fixed order.
     *
     * @param string $word Lower-cased word.
     * @return string
     */
    private static function getStem($word)
    {
        $protected = self::step0a($word);
        $afterStep1 = self::step1($protected);
        $afterStep2 = self::step2($afterStep1);
        $afterStep3 = self::step3($afterStep2);

        return self::step0b($afterStep3);
    }

    /**
     * Upper-cases "u" and "y" when they stand between two vowels, so the
     * later steps do not treat them as vowels.
     *
     * @param string $word
     * @return string
     */
    private static function step0a($word)
    {
        $vowelClass = '[' . implode('', self::$vowels) . ']';

        foreach (array('u' => 'U', 'y' => 'Y') as $lower => $upper) {
            $word = preg_replace(
                '#(' . $vowelClass . ')' . $lower . '(' . $vowelClass . ')#u',
                '$1' . $upper . '$2',
                $word
            );
        }

        return $word;
    }

    /**
     * Reverts the protection applied by step0a() and strips the umlauts.
     *
     * @param string $word
     * @return string
     */
    private static function step0b($word)
    {
        $map = array(
            'ä' => 'a',
            'ö' => 'o',
            'ü' => 'u',
            'U' => 'u',
            'Y' => 'y',
        );

        return str_replace(array_keys($map), array_values($map), $word);
    }

    /**
     * Step 1: expands "ß" to "ss", strips em/ern/er and then en/es/e where
     * they end R1 (turning a resulting "niss" ending into "nis"), and finally
     * drops a trailing "s" that follows one of the $s_ending letters.
     *
     * @param string $word
     * @return string
     */
    private static function step1($word)
    {
        $word = str_replace('ß', 'ss', $word);
        self::getR($word);

        $word = self::removeR1Suffixes($word, array('em', 'ern', 'er'), false);
        $word = self::removeR1Suffixes($word, array('en', 'es', 'e'), true);

        $sClass = '[' . implode('', self::$s_ending) . ']';

        return preg_replace('/(' . $sClass . ')s$/u', '$1', $word);
    }

    /**
     * Step 2: strips est/er/en where they end R1, then, if R1 contains "st",
     * drops a trailing "st" that follows at least three characters plus one
     * of the $st_ending letters.
     *
     * @param string $word
     * @return string
     */
    private static function step2($word)
    {
        self::getR($word);

        $word = self::removeR1Suffixes($word, array('est', 'er', 'en'), false);

        if (strpos(self::$R1, 'st') === false) {
            return $word;
        }

        self::$R1 = preg_replace('#st$#u', '', self::$R1);
        $stClass = '[' . implode('', self::$st_ending) . ']';

        return preg_replace('#(...' . $stClass . ')st$#u', '$1', $word);
    }

    /**
     * Step 3: derivational suffixes (end, ung, isch, ik, ig, lich, heit, keit).
     *
     * @param string $word
     * @return string
     */
    private static function step3($word)
    {
        self::getR($word);
        $hits = 0;

        // These are only touched when R2 ends with them, and are only removed
        // from the word when the preceding character is not "e".
        foreach (array('end', 'ung', 'isch', 'ik', 'ig') as $suffix) {
            $atEnd = '#' . $suffix . '$#u';
            if (!preg_match($atEnd, self::$R2)) {
                continue;
            }
            $word = preg_replace('#([^e])' . $suffix . '$#u', '$1', $word, -1, $hits);
            if ($hits > 0) {
                self::$R2 = preg_replace($atEnd, '', self::$R2);
            }
        }

        // Removed outright when R2 ends with them; otherwise, when R1 ends
        // with them and the word has "er"/"en" in front, only the suffix goes.
        foreach (array('lich', 'heit') as $suffix) {
            $atEnd = '#' . $suffix . '$#u';
            self::$R2 = preg_replace($atEnd, '', self::$R2, -1, $hits);
            if ($hits > 0) {
                $word = preg_replace($atEnd, '', $word);
                continue;
            }
            if (preg_match($atEnd, self::$R1)) {
                $word = preg_replace('#(er|en)' . $suffix . '$#u', '$1', $word, -1, $hits);
                if ($hits > 0) {
                    self::$R1 = preg_replace($atEnd, '', self::$R1);
                }
            }
        }

        // "keit" is removed only when R2 ends with it.
        self::$R2 = preg_replace('#keit$#u', '', self::$R2, -1, $hits);
        if ($hits > 0) {
            $word = preg_replace('#keit$#u', '', $word);
        }

        return $word;
    }

    /**
     * For each suffix in order: strips it from the end of R1 and, when R1
     * actually ended with it, from the end of the word as well.
     *
     * @param string   $word
     * @param string[] $suffixes   Suffixes to try, in the given order.
     * @param bool     $repairNiss When true, a word left ending in "niss"
     *                             after a removal is shortened to "nis".
     * @return string
     */
    private static function removeR1Suffixes($word, $suffixes, $repairNiss)
    {
        foreach ($suffixes as $suffix) {
            $atEnd = '#' . $suffix . '$#u';
            $hits = 0;

            self::$R1 = preg_replace($atEnd, '', self::$R1, -1, $hits);
            if ($hits < 1) {
                continue;
            }

            $word = preg_replace($atEnd, '', $word);
            if ($repairNiss) {
                $word = preg_replace('#niss$#u', 'nis', $word);
            }
        }

        return $word;
    }

    /**
     * Computes the regions R1 and R2 of $word and stores them in the static
     * properties of the same name. Both are reset to "" first.
     *
     * @param string $word
     */
    private static function getR($word)
    {
        self::$R1 = '';
        self::$R2 = '';

        $vowels = implode('', self::$vowels);
        // "rest" is everything up to and including the first non-vowel that
        // follows a vowel; "r" is whatever comes after it.
        $pattern = '#(?P<rest>.*?[' . $vowels . '][^' . $vowels . '])(?P<r>.*)#u';

        $parts = array();
        if (preg_match($pattern, $word, $parts)) {
            $region = $parts['r'];
            // R1 is shifted right so that at least three letters precede it.
            $missing = 3 - mb_strlen($parts['rest']);
            self::$R1 = ($missing > 0) ? mb_substr($region, $missing) : $region;
        }

        // R2 is found by applying the same rule inside R1.
        if (preg_match($pattern, self::$R1, $parts)) {
            self::$R2 = $parts['r'];
        }
    }
}