<?
 
/*
 
Author: Alexey G. Piyanin (e-mail: drdrzlo at mail dot ru)
 
Date:   Jun 7 2006
 
Title:  Get Wikipedia page content
 
*/
 
include('SAXParser.php');
 
 
/**
 * Text callback registered with the parser (third argument of initFunc()).
 *
 * While the parser is inside a comment (as tracked by comment() through the
 * global $isComment), the trimmed text is compared against the two marker
 * strings:
 *   - "start content" switches the global $startContent flag on;
 *   - "end content", once $startContent is set, stores the current
 *     $commentPos in the global $endContent.
 * Text seen outside of a comment is ignored.
 *
 * @param string $str Text chunk handed over by the parser.
 */
function character($str)
{
    global $isComment, $startContent, $endContent, $commentPos;

    // Markers only count when they occur inside a comment.
    if (!$isComment) {
        return;
    }

    $text = trim($str);

    // Still looking for the opening marker.
    if (!$startContent) {
        if ($text === 'start content') {
            $startContent = true;
        }
        return;
    }

    // Opening marker already seen: watch for the closing one.
    if ($text === 'end content') {
        $endContent = $commentPos;
    }
}
 
 
/**
 * Comment callback registered with the parser (fourth argument of initFunc()).
 *
 * Copies its arguments into the globals $isComment and $commentPos so that
 * character() can consult them. In addition, the first time it is called with
 * a false $start after $startContent has been set, it stores $pos + 3 in
 * $beginContent (the +3 presumably skips the closing "-->" - confirm against
 * the positions reported by SAXParser.php).
 *
 * @param bool $start Flag passed by the parser; stored as $isComment.
 * @param int  $pos   Position passed by the parser; stored as $commentPos.
 */
function comment($start, $pos)
{
    global $isComment, $startContent, $commentPos, $beginContent;

    // Record the beginning of the content only once (while it is still 0).
    $notStart = !$start;
    if ($notStart && $startContent && $beginContent == 0) {
        $beginContent = $pos + 3;
    }

    // Publish the latest comment state for character().
    $commentPos = $pos;
    $isComment  = $start;
}
 
 
// Page whose content section is extracted and displayed below.
$URL = 'http://en.wikipedia.org/wiki/Kalimpong';

#---

// State shared with the character() and comment() callbacks through `global`.
$isComment    = false; // last $start flag handed to comment()
$commentPos   = 0;     // last $pos handed to comment()
$startContent = false; // becomes true once the "start content" comment text is seen

#---

// Offsets into $content that delimit the extracted section; 0 means "not set".
$beginContent = 0;
$endContent   = 0;

#---

$parser = new HTML_SAXParser();

// Only the last two handlers are used; the first two arguments are left empty
// (presumably the tag handlers - confirm against SAXParser.php).
$parser->initFunc('','','character','comment');

#---

// Load the whole page as a single string. file_get_contents() returns false
// (and raises a warning) when the fetch fails; fall back to an empty string so
// the parser and substr() below always receive a string.
$content = file_get_contents($URL);

if ($content === false) {
  $content = '';
}
 
?>
 
<html>

<body>

<center>Source page:<br><iframe src="<?=htmlspecialchars($URL, ENT_QUOTES)?>" width="600" height="400" ></iframe><br><br></center>

Content:<br>

<?php

// Run the parser over the fetched page; the character()/comment() callbacks
// fill in $beginContent and $endContent, which are used as substr() offsets.
$parser->parseString($content);

//----

// Print the section only when an end offset was recorded after the begin
// offset. Without this guard a missing "end content" marker leaves
// $endContent at 0, the length turns negative, and substr() would emit an
// arbitrary slice of the page instead of nothing.
if ($endContent > $beginContent) {
  echo substr($content, $beginContent, $endContent - $beginContent);
}

?>

</body></html>
 
 |