/**
 * Builds a short excerpt of a text around the search words found in it and
 * can wrap the found words in <span class="search_found">…</span>.
 *
 * All regular expressions use the "u" modifier, so the text and the search
 * words are expected to be UTF-8.
 *
 * Note: a non-empty $maxlength / $hellipsis passed to extractText() is stored
 * on the instance and therefore also applies to later calls that omit it.
 */
class searchTextExtractor {
    /** Regex fragment: the previous character is not a letter, digit or underscore. */
    const WORD_START = '(?<![\pL\pN_])';
    /** Regex fragment: the next character is not a letter, digit or underscore. */
    const WORD_END = '(?![\pL\pN_])';

    /** Target excerpt length in characters. */
    public $maxlength = 160;
    /** Search words; countWords() removes the ones that do not occur in the text. */
    public $searchwords = array();
    /** Running counter incremented by the methods of this class before regex searches. */
    public $pregs = 0;
    /** The text currently being processed. */
    public $text = "";
    /** Marker inserted where the excerpt was cut. */
    public $hellipsis = " … ";
    /** Number of search words that occur in the text (set by countWords()). */
    public $wordnumber = 0;

    public function __construct() {
        // nothing to initialise
    }

    /**
     * Former PHP 4 style constructor. Kept as a plain no-op method so that
     * existing code calling it explicitly keeps working.
     */
    public function searchTextExtractor() {
        //constructor
    }

    /**
     * Returns an excerpt of $text around the given search words.
     *
     * @param string $text        Text to cut the excerpt from.
     * @param array  $searchwords Plain (not regex-quoted) words to look for.
     * @param int    $maxlength   Optional target length; ignored unless positive.
     * @param bool   $highlight   Wrap found words in <span class="search_found">.
     * @param string $hellipsis   Optional cut marker; ignored when empty.
     * @return string The whole text when it is shorter than maxlength + 15%,
     *                otherwise an excerpt.
     */
    public function extractText($text, $searchwords, $maxlength=null, $highlight = false,
    $hellipsis=null) {
        $this->text = (string) $text;

        // Keep only usable words: an empty word would turn the word-boundary
        // patterns into "match everywhere". Original keys are preserved.
        $this->searchwords = array();
        foreach ((array) $searchwords as $id => $word) {
            if (is_scalar($word) && (string) $word !== '') {
                $this->searchwords[$id] = (string) $word;
            }
        }

        // The length is interpolated into {0,n} quantifiers, so it must be an integer.
        if ((int) $maxlength > 0) {
            $this->maxlength = (int) $maxlength;
        }
        if ($hellipsis) {
            $this->hellipsis = $hellipsis;
        }

        if (mb_strlen($this->text, 'UTF-8') < $this->maxlength * 1.15) {
            // if text is shorter than maxlength+15% then just return it all
            $results = $this->text;
        } else {
            $this->countWords();
            switch ($this->wordnumber) {
                case 0:
                    $results = $this->extractBegining();
                    break;
                case 1:
                    $results = $this->extractSingleWord();
                    break;
                case 2:
                    $results = $this->extractTwoWords();
                    break;
                default:
                    $results = $this->extractManyWords();
                    break;
            }
        }

        if ($highlight) {
            $highlighted = $this->highlight($results);
            if (!empty($highlighted)) {
                $results = $highlighted;
            }
        }
        return $results;
    }

    /**
     * Drops the search words that do not occur in the text as whole words
     * (case-insensitively) and stores the number of remaining words.
     *
     * @return int Number of search words present in the text.
     */
    public function countWords() {
        foreach ($this->searchwords as $id => $word) {
            $this->pregs++;
            $pattern = '/' . self::WORD_START . $this->quoteWord($word) . self::WORD_END . '/iu';
            if (!preg_match($pattern, $this->text)) {
                unset($this->searchwords[$id]);
            }
        }
        return $this->wordnumber = count($this->searchwords);
    }

    /**
     * Returns the beginning of the text followed by the cut marker.
     *
     * Prefers to cut right after one of ". ; : ," and otherwise cuts in front
     * of a non-word character.
     *
     * @return string
     */
    public function extractBegining() {
        $this->pregs++;
        $max = max(0, (int) $this->maxlength);
        $pattern = '/^(?:.{0,' . $max . '}[\.;:,]|.{0,' . $max . '}' . self::WORD_END . ')/smiu';
        if (preg_match($pattern, $this->text, $matches)) {
            return $matches[0] . $this->hellipsis;
        }
        // The pattern did not match or could not be run: cut by length instead
        // of returning the bare cut marker.
        return mb_substr($this->text, 0, $max, 'UTF-8') . $this->hellipsis;
    }

    /**
     * Returns the chunk around the first occurrence of the first search word.
     *
     * The cut marker is omitted on a side where the chunk touches the start or
     * end of the text or a line break.
     *
     * @return string
     */
    public function extractSingleWord() {
        $word = $this->quoteWord(reset($this->searchwords));
        if ($word === '') {
            return $this->extractBegining();
        }
        $spacelength = (int) round($this->maxlength / 2);
        $this->pregs++;
        $pattern = '/(\W|^).{0,' . $spacelength . '}' . self::WORD_START . $word . self::WORD_END
            . '.{0,' . $spacelength . '}(\W|$)/smiu';
        if (!preg_match($pattern, $this->text, $matches)) {
            return $this->extractBegining();
        }

        $chunk = preg_quote($matches[0], '/');
        // Both checks look at the whole chunk: is it preceded / followed by a
        // text boundary or a line break?
        $noHellipL = (bool) preg_match('/(?:^|' . "\r|\n" . ')' . $chunk . '/u', $this->text);
        $noHellipR = (bool) preg_match('/' . $chunk . '(?:$|' . "\r|\n" . ')/u', $this->text);

        return ($noHellipL ? "" : $this->hellipsis) . $matches[0] . ($noHellipR ? "" : $this->hellipsis);
    }

    /**
     * Returns a chunk containing the first two search words, or, when they are
     * too far apart, two shorter chunks (one per word) joined by the cut marker.
     *
     * @return string
     */
    public function extractTwoWords() {
        // using optimized logic to find chunk containing both search words
        // it should be much faster than using many words search logic
        $word1 = $this->quoteWord(reset($this->searchwords));
        $word2 = $this->quoteWord(next($this->searchwords));
        if ($word1 === '') {
            return $this->extractBegining();
        }
        if ($word2 === '') {
            return $this->extractSingleWord();
        }

        $spacelength = (int) round($this->maxlength / 2.15);
        $gap = '.{0,' . $spacelength . '}';
        $this->pregs++;
        $pattern = '/(\W|^)(?:\w+\W+){3,7}(?:'
            . $word1 . self::WORD_END . $gap . self::WORD_START . $word2
            . '|'
            . $word2 . self::WORD_END . $gap . self::WORD_START . $word1
            . ')' . self::WORD_END . $gap . '(\W|$)/smiu';
        if (preg_match($pattern, $this->text, $matches)) {
            return ($matches[1] != "" ? $this->hellipsis : "") . $matches[0] . ($matches[2] != "" ? $this->hellipsis : "");
        }

        // No single chunk holds both words: take a shorter chunk around each.
        $spacelength = (int) round($spacelength / 2.5);
        $gap = '.{0,' . $spacelength . '}';
        $foundAny = false;

        $left = $this->hellipsis;
        $pattern = '/(\W|^)' . $gap . self::WORD_START . $word1 . self::WORD_END . $gap . '(?=\W)/smiu';
        if (preg_match($pattern, $this->text, $matches)) {
            $foundAny = true;
            $left = ($matches[1] != "" ? $this->hellipsis : "") . $matches[0] . $this->hellipsis;
        }

        $right = "";
        $pattern = '/(\W)' . $gap . self::WORD_START . $word2 . self::WORD_END . $gap . '(\W|$)/smiu';
        if (preg_match($pattern, $this->text, $matches)) {
            $foundAny = true;
            $right = $matches[0] . ($matches[2] != "" ? $this->hellipsis : "");
        }

        if (!$foundAny) {
            return $this->extractBegining();
        }
        return $left . $right;
    }

    /**
     * Tries to find one chunk containing as many of the search words as
     * possible; falls back to extractTwoWords() when no such chunk exists.
     *
     * @return string
     */
    public function extractManyWords() {
        $wordnumber = (int) $this->wordnumber;
        if ($wordnumber < 1) {
            // Nothing was counted, so the formulas below would divide by zero.
            return $this->extractTwoWords();
        }

        // try to find single text chunk containing all search words.
        $spacelength = (int) round($this->maxlength / ($wordnumber + 0.15 - ($wordnumber * .15)));
        $anyWord = self::WORD_START . '(?:' . join("|", $this->quotedWords()) . ')' . self::WORD_END;
        $this->pregs++;
        $pattern = '/(\s|^)(?:\w+\s+){3,7}(?:' . $anyWord . '.{0,' . $spacelength . '}){' . $wordnumber . '}(\s|$)/smiu';
        if (preg_match_all($pattern, $this->text, $matches)) {
            $maxwords = 0;
            $maxkey = null;
            foreach ($matches[0] as $key => $match) {
                $this->pregs++;
                preg_match_all('/' . $anyWord . '/iu', mb_strtolower($match, 'UTF-8'), $words);
                $found = isset($words[0]) ? $words[0] : array();
                // Distinct words weigh most; repeated hits add a small bonus.
                $wordcount = count(array_unique($found)) + count($found) / ($wordnumber * 1.6);
                if ($wordcount > $maxwords) {
                    $maxwords = $wordcount;
                    $maxkey = $key;
                }
                if ($wordcount >= $wordnumber) {
                    return ($matches[1][$key] != "" ? $this->hellipsis : "") . $match . ($matches[2][$key] != "" ? $this->hellipsis : "");
                }
            }
            // still here? okay, what was the maxwordcount per chunk?
            if ($maxwords > 1 && $maxkey !== null) {
                return ($matches[1][$maxkey] != "" ? $this->hellipsis : "") . $matches[0][$maxkey] . ($matches[2][$maxkey] != "" ? $this->hellipsis : "");
            }
        }
        // still here? Sadly, this means that single chunk was not found,
        // lets try to find two chunks containing as much words as possible
        // simpliest solution (should be fixed somehow later):
        // !!! Idea: do the following:
        // 1. Decrease maxlength twice
        // 2. Go through found chunks and try to find chunks containing several words.
        // 3. Try to get 2 chunks containing all words
        // Or... 2 chunks containing maximum words.
        return $this->extractTwoWords();
    }

    /**
     * Wraps every token that contains one of the search words in
     * <span class="search_found">…</span>.
     *
     * All words are handled in a single pass so that a search word can never
     * match inside markup inserted for another word. The text is not
     * HTML-escaped here.
     *
     * @param string $text
     * @return string The highlighted text, or $text unchanged when there is
     *                nothing to highlight or the replacement fails.
     */
    public function highlight($text) {
        $words = $this->quotedWords();
        if (!$words) {
            return $text;
        }
        $pattern = '/' . self::WORD_START . '([\w-]*(?:' . join('|', $words) . ')[\w-]*)' . self::WORD_END . '/iu';
        $highlighted = preg_replace($pattern, '<span class="search_found">\\1</span>', (string) $text);
        return $highlighted === null ? $text : $highlighted;
    }

    /**
     * Escapes a search word for literal use inside a "/"-delimited pattern.
     */
    private function quoteWord($word) {
        return preg_quote((string) $word, '/');
    }

    /**
     * Returns all non-empty search words, regex-quoted.
     */
    private function quotedWords() {
        $quoted = array();
        foreach ($this->searchwords as $word) {
            if (is_scalar($word) && (string) $word !== '') {
                $quoted[] = $this->quoteWord($word);
            }
        }
        return $quoted;
    }
}