Bot that retrieves URL metadata and other info

From a given URL, this function retrieves the page title, meta description, keywords, favicon, and an array of 5 images to use for links. It calls file_get_contents and then does the rest of the work with regular expressions.

This function is included in the Mini Bots Class.

// Example usage: print the array returned by getLinksInfo() for a sample article URL.
print_r(getLinksInfo("http://www.rockit.it/articolo/825/nada-studio-report-quando-nasce-una-canzone"));

/**
 * Downloads a web page and extracts link-preview information from its HTML.
 *
 * The page is fetched with file_get_contents() and scanned with regular
 * expressions (no HTML parser is involved), so unusual markup may be missed.
 *
 * @param string $url Address of the page to inspect.
 * @return array Associative array with the keys, in this order:
 *               'keywords', 'description', 'title' (strings, "" when absent),
 *               'favicon' (absolute URL, or "" when no icon link is found),
 *               'images' (list of at most 5 absolute image URLs whose size,
 *               as reported by getRemoteFileSize(), exceeds 15000 bytes).
 *               When the page cannot be downloaded every field keeps its
 *               empty default.
 */
function getLinksInfo($url) {
	$data = array(
		'keywords'    => "",
		'description' => "",
		'title'       => "",
		'favicon'     => "",
		'images'      => array(),
	);

	$web_page = file_get_contents($url);
	// Nothing to parse if the download failed or the body is empty.
	if ($web_page === false || $web_page === "") return $data;

	// Title: only assign when a <title> element exists, otherwise keep "".
	if (preg_match('#<title([^>]*)?>(.*)</title>#Uis', $web_page, $title_match)) {
		$data['title'] = $title_match[2];
	}

	// Meta description / keywords. The (string) cast guards against attr()
	// handing back a non-string when the attribute is missing.
	preg_match_all('#<meta([^>]*)(.*)>#Uis', $web_page, $meta_array);
	foreach ($meta_array[0] as $meta_tag) {
		$meta_name = strtolower((string) attr($meta_tag, "name"));
		if ($meta_name == 'description') $data['description'] = attr($meta_tag, "content");
		if ($meta_name == 'keywords') $data['keywords'] = attr($meta_tag, "content");
	}

	// Favicon: accept the legacy rel="shortcut icon" as well as rel="icon".
	// As before, the last matching <link> wins.
	preg_match_all('#<link([^>]*)(.*)>#Uis', $web_page, $link_array);
	foreach ($link_array[0] as $link_tag) {
		$rel = strtolower((string) attr($link_tag, "rel"));
		if ($rel != 'shortcut icon' && $rel != 'icon') continue;
		$href = attr($link_tag, "href");
		if ($href) $data['favicon'] = makeabsolute($url, $href);
	}

	// Images: keep the first 5 that are large enough. The limit is tested
	// before each candidate so no size lookup is made once the list is full.
	$max_images = 5;
	preg_match_all('#<img([^>]*)(.*)/?>#Uis', $web_page, $imgs_array);
	$imgs = array();
	foreach ($imgs_array[0] as $img_tag) {
		if (count($imgs) >= $max_images) break;
		$src = attr($img_tag, "src");
		if (!$src) continue;
		$src = makeabsolute($url, $src);
		if (getRemoteFileSize($src) > 15000) $imgs[] = $src;
	}
	$data['images'] = $imgs;

	return $data;
}

Here is the output:

Array
(
    [keywords] => Nada
    [description] => (Nada e il compagno Gerri Manzoli, foto d archivio) Nada &egrave; al Naural HeadQuarter di Ferrara per la registrazione del suo ultimo album in studio, il ventitreesimo, un nuovo capitolo che segna un ulteriore punto nella sua carriera da musicista, iniziata da giovanissima alla fine dei 60. Il titolo non &egrave; stato ancora scelto, cos&igrave; come la data d uscita. Ma possiamo anticiparvi...
    [title] => Nada Studio report - Quando nasce una canzone
    [favicon] => http://www.rockit.it/favicon.ico
    [images] => Array
        (
            [0] => http://ww2.rockit.it/rockit/immagini/Nadain2.jpg
            [1] => http://ww2.rockit.it/rockit/immagini/NadaIn3.jpg
        )

)

And here are the helper functions it uses:

/**
 * Returns the value of an HTML attribute taken from a tag's source text.
 *
 * Only quoted values (single or double quotes) are recognised.
 *
 * @param string $s        Raw HTML of a tag, e.g. '<meta name="x" content="y">'.
 * @param string $attrname Attribute name to look up (matched case-insensitively).
 * @return string Value of the first matching attribute, or "" when it is not present.
 */
function attr($s,$attrname) {
	// (?<![\w:.\-]) stops a short name from matching inside a longer one
	// (e.g. "src" inside "data-src"). preg_quote() keeps the name literal.
	// The back-reference \1 requires the closing quote to be the same
	// character as the opening one, so a double-quoted value may contain
	// an apostrophe and vice versa.
	$pattern = '#(?<![\w:.\-])'.preg_quote($attrname, '#').'\s*=\s*(["\'])(.*?)\1#is';
	if (preg_match($pattern, $s, $x)) return $x[2];
	return "";
}


/**
 * Turns a link found on a page into an absolute URL.
 *
 * Dot segments ("./", "../") in relative links are kept as written rather
 * than being collapsed.
 *
 * @param string $url  Absolute URL of the page the link was found on.
 * @param string $link The link as written in the page: absolute, protocol-relative
 *                     ("//host/x"), root-relative ("/x") or relative ("x").
 * @return string The absolute URL. $link is returned unchanged when it already
 *                carries a scheme or when $url has no host to resolve against.
 */
function makeabsolute($url,$link) {
	// Anything that already starts with a scheme (http:, https:, data:, ...) is left alone.
	if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $link)) return $link;

	$p = parse_url($url);
	if ($p === false || !isset($p['host'])) return $link;

	$scheme = isset($p['scheme']) ? $p['scheme'] : "http";

	// Protocol-relative link: reuse the scheme of the page.
	if (strpos($link, "//") === 0) return $scheme.":".$link;

	$root = $scheme."://".$p['host'].(isset($p['port']) ? ":".$p['port'] : "");

	// Root-relative link.
	if (strpos($link, "/") === 0) return $root.$link;

	// Relative link: resolve against the directory part of the page path.
	// Working on the parsed path (not the whole URL) keeps the host and the
	// query string from being touched.
	$path = isset($p['path']) ? $p['path'] : "/";
	$dir = substr($path, 0, strrpos($path, "/") + 1);
	return $root.$dir.$link;
}

/**
 * Returns the size of a file, given either an http(s) URL or a local path.
 *
 * For URLs the size is taken from the Content-Length header of the final
 * response (after any redirects that get_headers() followed).
 *
 * @param string $url URL starting with "http", or a local file path.
 * @return int|false  Size in bytes. For URLs, 0 when the headers cannot be
 *                    fetched, the final response is not a 2xx, or no
 *                    Content-Length header is present. For local paths,
 *                    the result of filesize() (false on failure).
 */
function getRemoteFileSize($url) {
	// Local path: unchanged behaviour, errors suppressed and false returned on failure.
	if (substr($url,0,4) != 'http') return @filesize($url);

	$headers = get_headers($url, 1);
	if ($headers === false) return 0;
	$headers = array_change_key_case($headers, CASE_LOWER);

	// In associative mode the status lines sit under integer keys, one per
	// response in the redirect chain; the last one belongs to the final response.
	$status = "";
	foreach ($headers as $key => $value) {
		if (is_int($key)) $status = $value;
	}
	if (!preg_match('#^HTTP/\S+\s+2\d\d#i', $status)) return 0;

	if (!isset($headers['content-length'])) return 0;

	// A header repeated across redirects comes back as an array; the last
	// entry is the one sent with the final response.
	$length = $headers['content-length'];
	if (is_array($length)) $length = end($length);

	return (int) $length;
}
  1. PHP to get twitter infos and avatar
  2. PHP bot to grab meteo information from Google
  3. Parsing Flickr Feed with PHP tutorial
  4. Do spelling using google spell checker
  5. get MySpace events with a PHP function

One comment

  1. Pingback: Need a Website Crawler - Net Builders

Post a comment

You may use the following HTML:
<a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>