<?
 
/*
 
Author: Alexey G. Piyanin (e-mail: drdrzlo at mail dot ru)
 
Date:   Jun 7 2006
 
Title:  Get page part
 
*/
 
include('SAXParser.php');
 
 
/**
 * Start-tag callback registered with the SAX parser.
 *
 * Pushes the tag onto the global $stack (innermost element first) unless it
 * is listed in the global $t, then, once the "In the News" section has been
 * entered ($isBeginNews), records the href of each headline anchor.
 *
 * @param string $tag        Name of the tag being opened.
 * @param array  $attributes Attributes of the tag keyed by name; only 'href' is read.
 * @param mixed  $readSize   Not used by this handler.
 *
 * @return int|null -1 when a new table opens at the news-list level after at
 *                  least one headline was collected (presumably this tells the
 *                  parser to stop - confirm against SAXParser.php); null otherwise.
 */
function begin($tag,$attributes,$readSize){

  global $stack,$t,$isBeginNews,$news,$currentNewsIndex;

  // Tags listed in $t are never pushed, so they must not appear in the path.
  if (!in_array($tag,$t,true)) array_unshift($stack,$tag);

  if (!$isBeginNews) return;

  // Path from the current element up to the document root.
  $path = join('/',$stack);

  if ($tag=='a' && $path=='a/font/td/tr/table/td/tr/table/td/tr/table/font/center/body/html'){

    // An anchor may carry no href at all; store null instead of reading a missing key.
    $news[$currentNewsIndex]['href'] = isset($attributes['href']) ? $attributes['href'] : null;

  }elseif($currentNewsIndex>0 && $tag=='table' && $path=='table/font/td/tr/table/td/tr/table/td/tr/table/font/center/body/html'){

    // A sibling table after the collected headlines marks the end of the news list.
    return -1;

  }

}
 
 
/**
 * End-tag callback registered with the SAX parser.
 *
 * Advances the global $currentNewsIndex when a headline anchor inside the
 * "In the News" section closes, then unwinds the global $stack down to and
 * including the element being closed.
 *
 * @param string $tag      Name of the tag being closed.
 * @param mixed  $readSize Not used by this handler.
 *
 * @return void
 */
function endTag($tag,$readSize){

  global $stack,$isBeginNews,$currentNewsIndex;

  if ($isBeginNews && $tag=='a' && join('/',$stack)=='a/font/td/tr/table/td/tr/table/td/tr/table/font/center/body/html'){

    // The headline anchor is complete; the next one goes into a new entry.
    $currentNewsIndex++;

  }

  // Position of the nearest open element with this name (index 0 is the innermost).
  $depth = array_search($tag,$stack,true);

  // A closing tag with no matching open element is ignored; unwinding for it
  // would discard every open element and break all later path comparisons.
  if ($depth===false) return;

  // Drop the matched element together with any unclosed elements nested inside it.
  array_splice($stack,0,$depth+1);

}
 
 
/**
 * Character-data callback registered with the SAX parser.
 *
 * Until the section heading has been seen, it only watches for the text
 * "in the news" (compared case-insensitively) at the heading's position and
 * then sets the global $isBeginNews. Afterwards it stores text found directly
 * inside a headline anchor as the 'text' of the current $news entry.
 *
 * @param string $str Text content reported by the parser.
 *
 * @return void
 */
function character($str){

  global $stack,$isBeginNews,$news,$currentNewsIndex;

  // Path from the current element up to the document root.
  $currentPath = join('/',$stack);

  if ($isBeginNews){

    // Inside the section: keep only text that sits directly in a headline anchor.
    $headlinePath = 'a/font/td/tr/table/td/tr/table/td/tr/table/font/center/body/html';

    if ($currentPath==$headlinePath) $news[$currentNewsIndex]['text'] = $str;

    return;

  }

  // Not in the section yet: look for its heading at the expected position.
  $headingPath = 'font/a/b/td/tr/table/td/tr/table/td/tr/table/font/center/body/html';

  if ($currentPath==$headingPath && strtolower($str)=='in the news'){

    $isBeginNews = true; // begin "In the News" part

  }

}
 
 
// Page that is both shown in the iframe and handed to the parser.
$URL = 'http://yahoo.com';

// Tags that begin() does not push onto $stack.
$t = array(
  'br',
  'meta',
  'img',
  'spacer',
  'input',
  'base',
  'hr',
  'link',
);

// Currently open elements, innermost first (begin() prepends, endTag() removes from the front).
$stack = array();

// Collected entries; the callbacks fill in 'href' and 'text' per entry.
$news = array();

// Index of the $news entry currently being filled.
$currentNewsIndex = 0;

// Set to true by character() once the "In the News" heading has been seen.
$isBeginNews = false;

// Parser from SAXParser.php; the three names passed to initFunc() are the
// functions defined in this file.
$parser = new HTML_SAXParser();

$parser->initFunc('begin','endTag','character');
?>
 
<html>

<body>

<center>Source page:<br><iframe src="<?=$URL?>" width="600" height="400" ></iframe><br><br></center>

News list (part "In the News"):<br>

<?php
// Run the parser over the page; the registered callbacks fill $news.
$parser->parse($URL);

foreach($news as $row){

  // An entry can lack either key (an anchor without href, or without text).
  $href = isset($row['href']) ? $row['href'] : '';

  $text = isset($row['text']) ? $row['text'] : '';

  // Absolute http(s) links are used as they are; anything else is appended to
  // $URL exactly as before. Other schemes are deliberately not passed through.
  $link = preg_match('#^https?://#i',$href) ? $href : $URL.'/'.$href;

  // Both values come from the remote page, so they are escaped before output.
  // double_encode is off so entities already present in the scraped text are kept.
  // NOTE(review): assumes the scraped page is UTF-8; invalid bytes are substituted.
  $safeLink = htmlspecialchars($link,ENT_QUOTES|ENT_SUBSTITUTE,'UTF-8',false);

  $safeText = htmlspecialchars($text,ENT_QUOTES|ENT_SUBSTITUTE,'UTF-8',false);
?>

<a href="<?=$safeLink?>" target="_blank"><?=$safeText?></a><br>

<?php }?>

</body></html>
 
 |