PHP Classes

File: simple_crawler_example.php

Recommend this page to a friend!
  Classes of Jacek Lukasiewicz   Simple Page Crawler   simple_crawler_example.php   Download  
File: simple_crawler_example.php
Role: Example script
Content type: text/plain
Description: example
Class: Simple Page Crawler
Retrieve HTML pages and extract their elements
Author: By
Last change:
Date: 13 years ago
Size: 1,382 bytes
 

Contents

Class file image Download
<?php
/**
 * Example usage of the SimpleCrawler class library.
 *
 * Reads a start page, counts the words of its stripped body, then follows
 * the links found on that page one level deep and records the word counts
 * of every followed page under the URL of the link that led to it.
 */

require 'simple_crawler.classes.php';

$reader = new HtmlReader();

$page = 'http://falsztyn.boo.pl';
//$page = 'http://www.phpclasses.org';

//base url without trailing slash, so paths can be appended with exactly one '/'
$baseUrl = rtrim($page, '/');

//read content from url
$html = $reader->getPageContent($page);

//document content object
$htmlDoc = new HtmlDocument($html);

//document body part object
$body = $htmlDoc->getBody();

//objects array of page links
$links = $body->grabLinks();

//clean text version of document body object
$cleanBody = $body->getStrippedBody();

//counted words from cleaned document body (word=>count)
$words = new BodyWords();
$pageWords = $words->findWords($cleanBody->getContent());
$words->appendWords($pageWords);


//follow front page links with recursive=1
foreach ($links as $link) {
    //a bare '/' is skipped; presumably it points back at the front page
    //that was already counted above
    if ($link->url === '/') {
        continue;
    }

    //no follow external links: only links of type 1 are followed
    //NOTE(review): type 1 appears to mark internal links - confirm against
    //simple_crawler.classes.php; loose comparison is kept on purpose because
    //the type of this property is not visible from this script
    if ($link->type != 1) {
        continue;
    }

    if (preg_match('#^[a-z][a-z0-9+.\-]*://#i', $link->url) === 1) {
        //link already is an absolute url, prepending the base would break it
        $pageLink = $link->url;
    } else {
        //works for both '/path' and 'path' style links; plain concatenation
        //would glue 'path' directly onto the host name
        $pageLink = $baseUrl . '/' . ltrim($link->url, '/');
    }

    //same processing chain as for the front page
    $html = $reader->getPageContent($pageLink);
    $htmlDoc = new HtmlDocument($html);
    $body = $htmlDoc->getBody();
    $cleanBody = $body->getStrippedBody();

    //store the counts of this page under the url of its link
    $pageWords = $words->findWords($cleanBody->getContent());
    $words->appendWords($pageWords, $link->url);
}

//display words:count per page
print_r($words->getWords());
//here you may do something with this words

?>