image-fetcher/image_finder.class.php

<?php

/**
 * Finds images from a given URL.
 *
 * The HTML at the URL is downloaded lazily (on the first call to load() or
 * get_images()), parsed with DOMDocument, and the source of every <img>
 * element is returned as an absolute URL.
 *
 * @author   Torleif Berger
 * @link     http://www.geekality.net/?p=1585
 * @license  http://creativecommons.org/licenses/by/3.0/
 */
class ImageFinder
{
	/** @var DOMDocument|null Parsed page; NULL until load() has run. */
	private $document;

	/** @var string URL of the page to scan. */
	private $url;

	/** @var string URL that relative image sources are resolved against. */
	private $base;


	/**
	 * Creates a new image finder object.
	 *
	 * Nothing is fetched here; the request happens on first use.
	 *
	 * @param string $url URL of the HTML page to scan for images.
	 */
	public function __construct($url)
	{
		// Store url
		$this->url = $url;
	}


	/**
	 * Loads the HTML from the url if not already done.
	 *
	 * Also determines the base URL: the first <base href="..."> of the
	 * document (resolved against the page URL, since the href itself may be
	 * relative), or the page URL when the document declares none.
	 *
	 * @return void
	 */
	public function load()
	{
		// Return if already loaded
		if($this->document instanceof DOMDocument)
			return;

		// Get the HTML document
		$this->document = self::get_document($this->url);

		// Get the base url, falling back to the page url
		$href = self::get_base($this->document);
		$this->base = ($href === NULL)
			? $this->url
			: self::make_absolute($href, $this->url);
	}


	/**
	 * Returns an array with all the images found.
	 *
	 * @return array List of array('src' => absolute url), one entry per
	 *               distinct source, in document order.
	 */
	public function get_images()
	{
		// Makes sure we're loaded
		$this->load();

		// Image collection array
		$images = array();

		// For all found img tags
		foreach($this->document->getElementsByTagName('img') as $img)
		{
			// Skip images without src. This has to be checked on the raw
			// attribute: make_absolute() turns an empty url into the base
			// url, which would hide the missing src.
			$src = trim($img->getAttribute('src'));
			if($src === '')
				continue;

			// Extract what we want
			$image = array
			(
				'src' => self::make_absolute($src, $this->base),
			);

			// Add to collection. Use src as key to prevent duplicates.
			$images[$image['src']] = $image;
		}

		// Return values
		return array_values($images);
	}


	/**
	 * Gets the html of a url and loads it up in a DOMDocument.
	 *
	 * A failed or empty response is not an error here: the returned
	 * document is simply empty and yields no images.
	 *
	 * @param  string $url URL to request.
	 * @return DOMDocument Parsed document (empty when nothing was received).
	 */
	private static function get_document($url)
	{
		// Set up and execute a request for the HTML
		$request = curl_init();
		curl_setopt_array($request, array
		(
			CURLOPT_URL => $url,

			CURLOPT_RETURNTRANSFER => TRUE,
			CURLOPT_HEADER => FALSE,

			CURLOPT_SSL_VERIFYPEER => TRUE,
			// NOTE(review): relative path, so presumably looked up relative
			// to the current working directory - confirm deployment layout.
			CURLOPT_CAINFO => 'cacert.pem',

			CURLOPT_FOLLOWLOCATION => TRUE,
			CURLOPT_MAXREDIRS => 10,
		));
		$response = curl_exec($request);
		curl_close($request);

		// Create DOM document
		$document = new DOMDocument();

		// Load response into document, if we got any
		if(is_string($response) && $response !== '')
		{
			// Real-world HTML is rarely valid; collect parser warnings
			// internally instead of emitting them, then put the libxml
			// setting back so the caller's configuration is not changed.
			$previous = libxml_use_internal_errors(TRUE);
			$document->loadHTML($response);
			libxml_clear_errors();
			libxml_use_internal_errors($previous);
		}

		return $document;
	}



	/**
	 * Tries to get the base tag href from the given document.
	 *
	 * @param  DOMDocument $document Document to inspect.
	 * @return string|null The first non-empty href found on a <base> tag,
	 *                     or NULL if there is none.
	 */
	private static function get_base(DOMDocument $document)
	{
		foreach($document->getElementsByTagName('base') as $tag)
		{
			// A <base> may only carry a target attribute; keep looking
			$href = trim($tag->getAttribute('href'));
			if($href !== '')
				return $href;
		}

		return NULL;
	}


	/**
	 * Makes sure a url is absolute.
	 *
	 * @param  string $url  Possibly relative url.
	 * @param  string $base Absolute url that $url is relative to.
	 * @return string The absolute url. $base is returned for an empty $url,
	 *                and $url is returned unchanged when $base has no
	 *                scheme and host to resolve against.
	 */
	private static function make_absolute($url, $base)
	{
		// Return base if no url
		if($url === NULL || $url === '') return $base;

		// Already absolute URL
		$scheme = parse_url($url, PHP_URL_SCHEME);
		if(is_string($scheme) && $scheme !== '') return $url;

		// Only containing anchor: replaces any anchor of the base
		if($url[0] === '#') return preg_replace('/#.*$/s', '', $base).$url;

		// Only containing query: replaces any query and anchor of the base
		if($url[0] === '?') return preg_replace('/[?#].*$/s', '', $base).$url;

		// Parse base URL; without scheme and host there is nothing to
		// resolve against
		$parts = parse_url($base);
		if( ! is_array($parts) || ! isset($parts['scheme'], $parts['host']))
			return $url;

		// Protocol-relative URL ('//host/path'): only the scheme is missing
		if(substr($url, 0, 2) === '//')
			return $parts['scheme'].':'.$url;

		// Host, including a non-default port if the base has one
		$authority = $parts['host'];
		if(isset($parts['port']))
			$authority .= ':'.$parts['port'];

		// Set query and anchor aside so path clean-up cannot touch them
		$suffix = '';
		$cut = strcspn($url, '?#');
		if($cut < strlen($url))
		{
			$suffix = substr($url, $cut);
			$url = substr($url, 0, $cut);
		}

		if($url[0] === '/')
		{
			// Relative url points to root
			$path = $url;
		}
		else
		{
			// Remove non-directory element from base path and append url
			$directory = isset($parts['path'])
				? preg_replace('#/[^/]*$#', '', $parts['path'])
				: '';
			$path = $directory.'/'.$url;
		}

		// Replace '//' or '/./' or '/foo/../' with '/', repeating until
		// nothing changes because one replacement can expose another
		$re = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#');
		do
		{
			$path = preg_replace($re, '/', $path, -1, $count);
		}
		while($count > 0);

		// Absolute URL is ready!
		return $parts['scheme'].'://'.$authority.$path.$suffix;
	}
}