urlextractor.php
<?
/**
 * Downloads a remote page and derives link-preview data from it: the page
 * title, a plain-text description and a list of thumbnail candidates.
 *
 * Text returned by this class is converted to windows-1251 (see
 * check_encoding()), so the byte-wise string functions used here are safe on it.
 */
class extracturl {
    /**
     * Body (HTML) of the page downloaded by detPageinfo(). Despite its name this
     * is the page content, not its address. Empty string when nothing was fetched.
     */
    protected $url = '';

    /**
     * Downloads $myurl with cURL and keeps the response body for the other methods.
     *
     * @param string $myurl Address of the page to fetch.
     * @return void
     */
    public function detPageinfo($myurl){
        // Reset first so a failed request never leaves stale or non-string data behind.
        $this->url = '';

        $ch = curl_init();
        if ($ch === false) {
            return;
        }
        $timeout = 5;
        curl_setopt($ch, CURLOPT_URL, $myurl);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
        curl_setopt($ch, CURLOPT_AUTOREFERER, true);
        curl_setopt($ch, CURLOPT_USERAGENT, "uni-web (http://uni-web.net/)");
        curl_setopt($ch, CURLOPT_REFERER, "http://uni-web.net/");
        // The address is caller-supplied: only allow web protocols so that
        // file://, gopher:// and friends cannot be used to read local resources.
        if (defined('CURLOPT_PROTOCOLS')) {
            curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
            curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        }

        // With CURLOPT_RETURNTRANSFER the body is returned (false on failure),
        // nothing is printed, so no output buffering is required.
        $body = curl_exec($ch);
        curl_close($ch);

        if (is_string($body)) {
            $this->url = $body;
        }
    }

    /**
     * Returns the content of the fetched page's <title> element.
     *
     * @return string Trimmed title in windows-1251, or '' when the page has none.
     */
    public function pagetitle(){
        $pagetitle = '';
        if (is_string($this->url)
            && preg_match('/<title\b[^>]*>(.*)<\/title>/ismU', $this->url, $matches)) {
            $pagetitle = trim($this->check_encoding($matches[1]));
        }
        return $pagetitle;
    }

    /**
     * Builds a plain-text description: the page's meta description followed by
     * text extracted from the fetched body, reduced to letters, digits, dots
     * and single spaces.
     *
     * @param string     $url   Address of the page; read again by get_meta_tags().
     * @param int|string $limit Maximum length in bytes; "" disables truncation.
     *                          When a limit is given "..." is always appended.
     * @return string
     */
    public function description($url, $limit){
        $openpage = is_string($this->url) ? $this->url : '';

        $description = '';
        // Best effort: get_meta_tags() warns and returns false when the address
        // cannot be opened, and the page may simply have no description tag.
        $meta = @get_meta_tags(trim($url));
        if (is_array($meta) && isset($meta['description'])) {
            // Convert first, then decode entities in the target charset so
            // decoded characters are not mixed in as bytes of another encoding.
            $description = $this->check_encoding($meta['description']);
            $description = html_entity_decode($description, ENT_QUOTES, 'cp1251');
            $description = htmlspecialchars_decode($description, ENT_QUOTES);
        }
        $description = ltrim($description);
        $description = $this->remoove_chars($description);

        $descr = $this->extract_text($openpage);
        $description = "$description $descr";

        if($limit != ""){
            $description = substr($description, 0, $limit) . "...";
        }
        return $this->normalize_text($description);
    }

    /**
     * Lists the page's images that are at least 50px wide as <li><img> markup.
     *
     * Each candidate is measured with getimagesize(), i.e. downloaded; this
     * needs URL-aware fopen wrappers to be enabled.
     *
     * @param string $url Address of the page, used to resolve relative image paths.
     * @return string Concatenated list items, '' when no image qualifies.
     */
    public function getImages($url){
        $images = '';
        if (!is_string($this->url) || $this->url === '') {
            return $images;
        }

        $regexp = '<img (?:.*?)src=(?:"|\'){1}(.*?)(?:"|\'){1}';
        $html = stripslashes(htmlspecialchars_decode($this->url));
        if (!preg_match_all("/$regexp/ism", $html, $matches, PREG_SET_ORDER)) {
            return $images;
        }

        $i = 0;
        foreach ($matches as $element) {
            $link = $this->resolve_image_url($element[1], $url);
            // The path comes from an untrusted page: only probe real web
            // addresses, never local paths or other stream wrappers.
            if (!preg_match('#^https?://#i', $link)) {
                continue;
            }
            $info = @getimagesize($link);
            if (!is_array($info) || $info[0] < 50 || !$info[2]) {
                continue;
            }
            $i++;
            // A single-byte charset keeps the escaping byte-transparent whatever
            // encoding the page used; only the HTML special characters change.
            $safe = htmlspecialchars($link, ENT_QUOTES, 'ISO-8859-1');
            $images .= "<li><img src=\"$safe\" class=\"thumbsimg\" alt=\"$safe\" id=\"image$i\" /></li>";
        }
        return $images;
    }

    /**
     * Turns an image reference found in a page into an absolute http(s) address.
     *
     * "../" segments are not collapsed; they are left for the server to resolve.
     *
     * @param string $link    Value of the src attribute.
     * @param string $pageUrl Address of the page containing the image.
     * @return string Absolute address, or '' when $link is empty or uses a
     *                scheme other than http/https.
     */
    private function resolve_image_url($link, $pageUrl){
        $link = trim($link);
        if ($link === '') {
            return '';
        }
        if (preg_match('#^https?://#i', $link)) {
            return $link;
        }

        $parts = parse_url(trim($pageUrl));
        if (!is_array($parts)) {
            $parts = array();
        }
        $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : 'http';

        // Protocol-relative reference: reuse the page's scheme.
        if (strncmp($link, '//', 2) === 0) {
            return $scheme . ':' . $link;
        }
        // Any other explicit scheme (data:, javascript:, file:, ...) is rejected.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $link)) {
            return '';
        }

        if (!empty($parts['host'])) {
            $base = $scheme . '://' . $parts['host'];
            if (isset($parts['port'])) {
                $base .= ':' . $parts['port'];
            }
            $path = isset($parts['path']) ? $parts['path'] : '/';
        } else {
            $base = 'http://' . $this->getDomain($pageUrl);
            $path = '/';
        }

        if ($link[0] === '/') {
            return $base . $link;
        }
        // Relative to the directory of the page.
        $slash = strrpos($path, '/');
        $dir = ($slash === false) ? '/' : substr($path, 0, $slash + 1);
        return $base . $dir . $link;
    }

    /**
     * Extracts the host name of an address without a leading "www.".
     *
     * Addresses without a scheme (e.g. "www.example.com/page") are reduced to
     * their last two host labels.
     *
     * @param string $myurl
     * @return string Host name, or "unknow" when none can be determined.
     */
    public function getDomain($myurl){
        $domain = '';

        $parts = parse_url(trim($myurl));
        if (is_array($parts) && !empty($parts['host'])) {
            // Use the host only: the scheme (http, https, ...) never belongs in
            // the result, and "www." is dropped only where it is a prefix.
            $domain = preg_replace('/^www\./i', '', $parts['host']);
        }

        if ($domain == '') {
            if (preg_match('/^(https?:\/\/)?([^\/]+)/i', $myurl, $matches)
                && preg_match('/[^\.\/]+\.[^\.\/]+$/', $matches[2], $hostMatch)) {
                $domain = $hostMatch[0];
            }
        }

        if ($domain == '') {
            $domain = "unknow";
        }
        return $domain;
    }

    /**
     * Converts UTF-8 text to windows-1251; text in any other encoding is
     * returned unchanged.
     *
     * @param string $that
     * @return string
     */
    public function check_encoding($that){
        if (!is_string($that)) {
            $that = is_scalar($that) ? (string)$that : '';
        }
        if ($that === '' || $this->detect_encoding($that) !== 'utf-8') {
            return $that;
        }

        // iconv() raises a notice and returns false when a character has no
        // windows-1251 equivalent; retry with transliteration before giving up.
        $converted = @iconv('UTF-8', 'windows-1251', $that);
        if ($converted === false) {
            $converted = @iconv('UTF-8', 'windows-1251//TRANSLIT', $that);
        }
        // Never hand false to callers: keep the original text if conversion fails.
        return ($converted === false) ? $that : $converted;
    }

    /**
     * Guesses the encoding of $string by checking which candidate charset
     * round-trips through iconv() unchanged.
     *
     * @param string $string
     * @return string|null 'utf-8', 'windows-1251', or null when neither fits.
     */
    public function detect_encoding($string) {
        static $list = array('utf-8', 'windows-1251');
        $string = (string)$string;
        foreach ($list as $item) {
            // Invalid input makes iconv() emit a notice and return false.
            $sample = @iconv($item, $item, $string);
            if ($sample !== false && $sample === $string) {
                return $item;
            }
        }
        return null;
    }

    /**
     * Removes superfluous characters: leftover entity names and common
     * boilerplate phrases are each replaced by a single space.
     *
     * @param string $text
     * @return string
     */
    public function remoove_chars($text){
        $inarray = array(
            "nbsp", "bdquo", "ldquo", "rdquo", "&quot;",
            "Размер на шрифта",
            // The same phrase as windows-1251 bytes, so it also matches converted
            // text when this file itself is stored as UTF-8.
            "\xD0\xE0\xE7\xEC\xE5\xF0 \xED\xE0 \xF8\xF0\xE8\xF4\xF2\xE0",
            "Powered by", "Copyright", "Theme", "Category", "Archives", "Posted by",
        );
        return str_replace($inarray, " ", $text);
    }

    /**
     * Keeps at most $numwords words of $input (always at least one), joined by
     * single spaces. Words are separated by spaces or newlines.
     *
     * @param string $input
     * @param int    $numwords
     * @param string $padding  Appended only when words were actually cut off.
     * @return string
     */
    public function truncateWords($input, $numwords, $padding="") {
        $words = preg_split('/[ \n]+/', (string)$input, -1, PREG_SPLIT_NO_EMPTY);
        if (!$words) {
            return '';
        }
        $keep = max(1, (int)$numwords);
        if (count($words) <= $keep) {
            return implode(' ', $words);
        }
        return implode(' ', array_slice($words, 0, $keep)) . $padding;
    }

    /**
     * Regex-based text extraction: drops links, scripts, styles and all
     * remaining tags, then returns the first 300 words in normalised form.
     *
     * @param string $openpage HTML to extract from.
     * @return string
     */
    public function extract_text_old($openpage){
        $file = $this->check_encoding($openpage);
        // The text is windows-1251 from here on, so entities are decoded as such.
        $file = html_entity_decode($file, ENT_QUOTES, 'cp1251');
        $file = htmlspecialchars_decode($file, ENT_QUOTES);

        $rules = array(
            array('@<a[^>]*?>.*?<\/a>@si', ' '),
            array('@<script[^>]*?>.*?<\/script>@si', ' '),
            array('@<style[^>]*?>.*?<\/style>@si', ''),
            array('/(<\/?)(\w+)([^>]*>)/is', ' '),
        );
        $output = $file;
        foreach ($rules as $rule) {
            $stripped = preg_replace($rule[0], $rule[1], $output);
            // preg_replace() yields null when PCRE gives up (e.g. backtrack
            // limit); keep what we have instead of losing the whole text.
            if ($stripped !== null) {
                $output = $stripped;
            }
        }

        // Second pass for double-encoded entities.
        $output = htmlspecialchars_decode($output, ENT_QUOTES);
        $output = html_entity_decode($output, ENT_QUOTES, 'cp1251');
        $output = strip_tags($output);
        $output = trim($output, " ");
        $output = $this->truncateWords($output, 300);
        return $this->normalize_text($output);
    }

    /**
     * Extracts the main text of a page with ContentExtractor when that library
     * is available, falling back to extract_text_old() when it is not or when
     * it finds nothing.
     *
     * @param string $openpage HTML to extract from.
     * @return string First 300 words in normalised form.
     */
    public function extract_text($openpage){
        $page = is_string($openpage) ? $openpage : '';

        $output = '';
        if ($this->load_content_extractor()) {
            $withoutLinks = preg_replace('@<a[^>]*?>.*?<\/a>@si', ' ', $page);
            if ($withoutLinks === null) {
                $withoutLinks = $page;
            }
            $extractor = new ContentExtractor();
            $output = $extractor->extract($withoutLinks);
            $output = $this->check_encoding($output);
            $output = html_entity_decode($output, ENT_QUOTES, 'cp1251');
            $output = $this->remoove_chars($output);
            $output = $this->truncateWords($output, 300);
            $output = strip_tags($output);
            $output = $this->normalize_text($output);
        }

        if (trim($output) == "") {
            $output = $this->extract_text_old($page);
        }
        return $output;
    }

    /**
     * Makes the ContentExtractor class available, loading its file at most once.
     *
     * @return bool True when the class can be instantiated.
     */
    private function load_content_extractor(){
        if (!class_exists('ContentExtractor', false)) {
            $file = 'content_extractor_distrib.php';
            $path = stream_resolve_include_path($file);
            if ($path === false && is_file(dirname(__FILE__) . '/' . $file)) {
                $path = dirname(__FILE__) . '/' . $file;
            }
            if ($path !== false) {
                include_once $path;
            }
        }
        return class_exists('ContentExtractor', false);
    }

    /**
     * Reduces windows-1251 text to Cyrillic/Latin letters, digits and dots
     * separated by single spaces. Hyphens are treated like spaces.
     *
     * @param string $text
     * @return string
     */
    private function normalize_text($text){
        $patterns = array(
            '/ /',
            // \xC0-\xFF is the Cyrillic alphabet in windows-1251; spelling the
            // bytes out keeps the pattern independent of this file's encoding.
            '/[^0-9a-zA-Z.\-\xC0-\xFF]/',
            '/-{2,}/',
        );
        $replacements = array('-', '', '-');
        $clean = preg_replace($patterns, $replacements, (string)$text);
        if ($clean === null) {
            return '';
        }
        return str_replace('-', ' ', $clean);
    }
}
?>