100, // threshold for score of cluster
        'min_length' => 80, // minimum length of blocks
        'decay_factor' => 0.73, // decay factor for block scores
        'continuous_factor' => 1.62,
        'punctuation_weight' => 10,
        'punctuations' => '/([、。,.!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)/',
        'waste_expressions' => '/Copyright|All Rights Reserved/i',
        'debug'=> false
    );

    /**
     * Analyses the given HTML text, extracts body and title.
     *
     * The document is split into blocks at div/center/td tags (and at
     * <p class="posted|plugin-*"> tags). Each block is scored from its
     * link-free length and punctuation count; consecutive high-scoring
     * blocks are merged into clusters, and the best cluster (as ordered by
     * bdSort) is returned.
     *
     * @param string $html raw HTML document
     * @param array  $opt  option overrides, passed to setOpt()
     * @return array two elements: [0] the extracted body after stripTags(),
     *               [1] the title as returned by title()
     */
    public function analyze($html, $opt = array())
    {
        // Frameset or meta-refresh redirect pages carry no body of their own.
        if (preg_match('/<\/frameset>|<meta\s+http-equiv\s*=\s*["\']?refresh["\']?[^>]*url/i', $html)) {
            return array('', $this->title($html));
        }

        $this->setOpt($opt);

        // header & title
        // preg_split() always returns at least one element, so a closing
        // </head> was found only when there are two or more parts.
        $parts = preg_split('/<\/head\s*>/im', $html);
        if (1 < count($parts)) {
            $html = $parts[1];
            $title = $this->title($parts[0]);
        } else {
            $title = $this->title($html);
        }

        // Google AdSense Section Target
        $html = $this->section($html);

        // eliminate useless text
        $html = $this->eliminateUselessTags($html);
        // $this->hBlock

        // extract text blocks
        $factor = $continuous = 1.0;
        $body = '';
        $score = 0;
        $bodylist = array();
        $list = preg_split(
            '/<\/?(?:div|center|td)[^>]*>|<p\s*[^>]*class\s*=\s*[\"\']?(?:posted|plugin-\w+)[\'\"]?[^>]*>/',
            $html
        );

        foreach ($list as $block) {
            if (!$block) {
                continue;
            }
            $block = trim($block);
            if ($this->hasOnlyTags($block)) {
                continue;
            }

            // Once a cluster has started, each further candidate block
            // weakens the continuation bonus.
            if (0 < strlen($body)) {
                $continuous /= $this->opt['continuous_factor'];
            }

            // Blocks whose link-free text is too short are not scored.
            $notlinked = $this->eliminateLink($block);
            if (strlen($notlinked) < $this->opt['min_length']) {
                continue;
            }

            // Base score: length plus weighted punctuation term, decayed by
            // the position factor (later blocks count for less).
            // NOTE(review): count(preg_split()) is the number of matches + 1,
            // so the terms below are never zero and the "> 0" guard always
            // holds — confirm whether a plain match count was intended.
            $c = (strlen($notlinked)
                + count(preg_split($this->opt['punctuations'], $notlinked)) * $this->opt['punctuation_weight'])
                * $factor;
            $factor *= $this->opt['decay_factor'];

            // Penalise boilerplate expressions and Amazon affiliate links.
            $not_body_rate = count(preg_split($this->opt['waste_expressions'], $block))
                + count(preg_split('/amazon[a-z0-9\.\/\-\?&]+-22/i', $block)) / 2.0;
            $c *= ($not_body_rate > 0) ? pow(0.72, $not_body_rate) : 1;

            $c1 = $c * $continuous;
            if ($c1 > $this->opt['threshold']) {
                // Continues the current cluster.
                $body .= $block . "\n";
                $score += $c1;
                $continuous = $this->opt['continuous_factor'];
            } elseif ($c > $this->opt['threshold']) {
                // Strong enough on its own: close the current cluster and
                // start a new one with this block.
                $bodylist[] = array($body, $score);
                $body = $block . "\n";
                $score = $c;
                $continuous = $this->opt['continuous_factor'];
            }
        }

        $bodylist[] = array($body, $score);
        usort($bodylist, array($this, 'bdSort'));

        return array($this->stripTags($bodylist[0][0]), $title);
    }

    /**
     * Google AdSense Section Target
     *
     * Removes sections marked google_ad_section_start(weight=ignore). If any
     * google_ad_section_start marker remains, the document is reduced to the
     * marked sections only, joined by newlines; otherwise the HTML is
     * returned unchanged apart from the removed ignore-sections.
     *
     * @param string $html
     * @param string $sectionStylye not used by this method
     * @return string
     */
    protected function section($html, $sectionStylye = 'googlead')
    {
        $html = preg_replace(
            '/<!--\s*google_ad_section_start\(weight=ignore\)\s*-->.*?<!--\s*google_ad_section_end.*?-->/ms',
            '',
            $html
        );

        if (preg_match('/<!--\s*google_ad_section_start[^>]*-->/', $html)) {
            preg_match_all(
                '/<!--\s*google_ad_section_start[^>]*-->.*?<!--\s*google_ad_section_end.*?-->/ms',
                $html,
                $matches
            );
            $html = implode("\n", $matches[0]);
        }

        return $html;
    }

    /**
     * extract title
     *
     * @param string
     * @return string
     */
    public function title($html)
    {
        if (preg_match('/