* @license PHP License
* @package wb
* @subpackage Markup
*/
WBClass::load('WBMarkup_Handler');

/**
 * Tag scanner / Markup Scanner
 *
 * Provides a SAX-like, event-driven API for non-XML but tag-based markup.
 * This class only scans the XML-alike string or file and divides it into
 * character data, opening tags, closing tags and entities. Unlike a real
 * XML scanner/parser it does not validate anything. On the contrary, it tries
 * to walk through even corrupt content. Therefore it is useful to parse
 * "badly formatted old-school HTML" or any other type of tag-based markup
 * string.
 *
 * As said, the scanner is only capable of dividing documents, it does not know
 * anything about their purpose. It has a handler which implements all
 * necessary event functions (onStartElement, onCharacterData, etc.). This
 * handler is actually responsible for doing whatever it is designed to do -
 * usually transforming the document to this or that format.
 *
 * @version 0.3.0
 * @package wb
 * @subpackage Markup
 */
class WBMarkup_Scanner
{
    /**
     * Stack of split content to parse through
     *
     * Filled by scan() with the result of preg_split(): a flat list in which
     * every tag contributes five consecutive elements (see scan()).
     *
     * @var array
     */
    protected $contentStack = array();

    /**
     * Handler that implements the event callbacks
     * @var WBMarkup_Handler
     */
    protected $client = null;

    /**
     * Well known tags which are always empty (compared in lower case)
     * @var array
     */
    protected $alwaysEmpty = array( 'img', 'br', 'input', 'hr' );

    /**
     * Tell whether well known empty tags should be closed automatically
     * @var bool
     */
    protected $autoCloseEmpty = true;

    /**
     * The scanner automatically validates and corrects the tag nesting
     * @var bool
     */
    protected $autoTidy = true;

    /**
     * Used by tidyMarkup(). Every opened tag is stored on this stack and
     * removed again when its closing tag was scanned.
     *
     * @var array
     */
    protected $tagStack = array();

    /**
     * tidyMarkup() result: the closing tag may be reported to the handler
     */
    const TIDYVALID = 1;

    /**
     * tidyMarkup() result: the handler interrupted the scanner while
     * left-over tags were being closed
     */
    const TIDYABORTED = 2;

    /**
     * tidyMarkup() result: the closing tag has no opening tag and is skipped
     */
    const TIDYIGNORE = 3;

    /**
     * Who is capable of handling the upcoming events?
     *
     * The handler is the one that actually does something with found tags
     * and character data. The handler will be called during scan - hence it
     * must implement the WBMarkup_Handler interface. The interface is
     * currently not enforced here.
     *
     * @todo see whether a type-hint (or an instanceof check) can be used
     * @param WBMarkup_Handler $client
     * @return bool always true
     */
    public function setHandler($client)
    {
        $this->client = $client;
        return true;
    }

    /**
     * Convenience function to scan files instead of strings
     *
     * Loads the file and calls the string scanner.
     *
     * @param string $file
     * @return bool true if the scanner ran through, false if it was aborted
     *              or the file could not be read
     * @see scan()
     */
    public function scanFile($file)
    {
        $content = file_get_contents($file);

        // don't feed "false" to the handler when the file is unreadable
        if ($content === false) {
            return false;
        }

        return $this->scan($content);
    }

    /**
     * Scan string
     *
     * Scans the whole string and informs the handler about start elements,
     * end elements, entities and character data. Comments are removed
     * before scanning.
     *
     * The content is split with a regular expression into a flat list. Each
     * tag contributes five consecutive elements: the character data in front
     * of the tag, the whole tag including brackets, the closing indicator
     * ("/" or ""), the tag name and the attribute string.
     *
     * Example:
     *   My CDATA Content<br />more CDATA Content<img src="pic.jpg" />
     * becomes:
     *   'My CDATA Content',   '<br />',                '', 'br',  '/'
     *   'more CDATA Content', '<img src="pic.jpg" />', '', 'img', 'src="pic.jpg" /'
     *   ''                    (trailing character data)
     *
     * @param string $content
     * @return bool true if the scanner ran through, false if it was aborted
     * @throws WBException_Xml if a regular expression fails or tidyMarkup()
     *                         returns an unknown value
     */
    public function scan($content)
    {
        // open tags of a previous (possibly aborted) scan must not leak into this one
        $this->tagStack = array();

        if (!$this->client->onScanStart($content)) {
            return false;
        }

        // shortcut if there are no tags at all: only entities need to be found
        if (false === strpos($content, '<')) {
            if (!$this->parseCData($content)) {
                return false;
            }
            if (!$this->client->onScanComplete()) {
                return false;
            }
            return true;
        }

        // remove comments
        $content = preg_replace('/\<\!--(.*)--\>/Ums', '', $content);
        if ($content === null) {
            $this->throwPcreError('removing comments');
        }

        $regExp = '/(<(\/?)([\w:]+)[[:space:]]*([^>]*)>)/im';
        $split  = preg_split($regExp, $content, -1, PREG_SPLIT_DELIM_CAPTURE);
        if ($split === false) {
            $this->throwPcreError('splitting content into tags');
        }
        $this->contentStack = $split;

        // index of the trailing character data; everything before comes in groups of five
        $last = count($this->contentStack) - 1;

        for ($i = 0; $i < $last; $i += 5) {
            $cData = $this->contentStack[$i];
            // $this->contentStack[$i + 1] holds the full tag and is not needed here

            // end tag like </tag>
            $end = ($this->contentStack[$i + 2] === '/');

            // tags with namespace
            $ns  = false;
            $tag = $this->contentStack[$i + 3];
            if (strpos($tag, ':') !== false) {
                list($ns, $tag) = explode(':', $tag, 2);
            }

            // string holding attributes: src="img.gif" foo="bar"...
            $attributeString = $this->contentStack[$i + 4];

            // manage empty or auto-close tags like <br /> or <img>
            $empty = false;
            if (substr($attributeString, -1, 1) === '/') {
                $attributeString = substr($attributeString, 0, -1);
                $empty = true;
            } elseif ($this->autoCloseEmpty && in_array(strtolower($tag), $this->alwaysEmpty, true)) {
                $empty = true;
            }

            // character data in front of the tag
            if (strlen($cData)) {
                if (!$this->parseCData($cData)) {
                    return false;
                }
            }

            // start tag
            if (!$end) {
                $this->tagStack[] = $this->getTagName($ns, $tag);
                $attributes = $this->parseAttributes($attributeString);
                if (!$this->client->onStartElement($ns, $tag, $attributes, $empty)) {
                    return false;
                }
            }

            // end tag
            if ($end || $empty) {
                switch ($this->tidyMarkup($ns, $tag)) {
                    case self::TIDYVALID:
                        array_pop($this->tagStack);
                        if (!$this->client->onEndElement($ns, $tag, $empty)) {
                            return false;
                        }
                        break;

                    case self::TIDYIGNORE:
                        break;

                    case self::TIDYABORTED:
                        return false;

                    default:
                        WBClass::load('WBException_Xml');
                        throw new WBException_Xml('Unspecified return Value of validateMarkup', 1, __CLASS__ );
                }
            }
        }

        // parse leftover character data behind the last tag
        if (strlen($this->contentStack[$last])) {
            if (!$this->parseCData($this->contentStack[$last])) {
                return false;
            }
        }

        // we are done
        if (!$this->client->onScanComplete()) {
            return false;
        }
        return true;
    }

    /**
     * Report a failed regular expression
     *
     * PCRE functions return null/false on internal errors (e.g. exceeded
     * backtrack limit). Continuing would silently drop the whole document.
     *
     * NOTE(review): error code 2 is chosen to differ from the existing
     * code 1 - confirm against the project's exception code conventions.
     *
     * @param string $step description of what the scanner was doing
     * @throws WBException_Xml always
     */
    protected function throwPcreError($step)
    {
        WBClass::load('WBException_Xml');
        throw new WBException_Xml(
            'Regular expression failed while ' . $step . ' (PCRE error ' . preg_last_error() . ')',
            2,
            __CLASS__
        );
    }

    /**
     * Delivers the tag name, prefixed with its namespace if one is given
     *
     * @param string|bool $ns  the namespace, false or empty string for none
     * @param string      $tag the tag
     * @return string "ns:tag" or just "tag"
     */
    protected function getTagName($ns, $tag)
    {
        // $ns is false when the tag has no namespace; (string) false is ''
        if (strlen((string) $ns) > 0) {
            return "$ns:$tag";
        }

        return $tag;
    }

    /**
     * Splits a tag name built by getTagName() back into namespace and tag
     *
     * Only the first colon separates the namespace, just like scan() does.
     * Both parameters are only modified if a namespace was found.
     *
     * @param string      $tag in: full tag name, out: tag without namespace
     * @param string|bool $ns  out: the namespace
     * @return bool true = there is a namespace as well
     *              false = there is no namespace
     */
    protected function splitTagName(&$tag, &$ns)
    {
        $splitted = explode(':', $tag, 2);
        if (count($splitted) == 2) {
            $ns  = $splitted[0];
            $tag = $splitted[1];
            return true;
        }

        return false;
    }

    /**
     * Validates the markup read so far against a closing tag
     *
     * If auto tidy is enabled, this function checks every time a closing tag
     * was read that the document structure is still valid. If the closing
     * tag does not match the innermost open tag, all tags opened after the
     * matching one are closed automatically and reported to the handler.
     * The matching tag itself is left on the stack - the caller removes and
     * reports it.
     *
     * A closing tag that arrives while no tag is open at all is passed
     * through as valid.
     *
     * @param string|bool $ns  the namespace
     * @param string      $tag the tag
     * @return int TIDYVALID   = the scanner is not set to validate or the closing tag may be reported
     *             TIDYIGNORE  = this closing tag has not been opened before => ignore this tag
     *             TIDYABORTED = the handler interrupted the scanner while left-over tags were closed
     */
    protected function tidyMarkup($ns, $tag)
    {
        if (!$this->autoTidy) {
            return self::TIDYVALID;
        }

        $cnt  = count($this->tagStack);
        $name = $this->getTagName($ns, $tag);

        // the tag order is valid => we don't have to do a thing
        if ($cnt == 0 || $this->tagStack[$cnt - 1] === $name) {
            return self::TIDYVALID;
        }

        // seek the innermost open tag of that name (the top was checked above)
        $found = false;
        for ($pos = $cnt - 2; $pos >= 0; --$pos) {
            if ($this->tagStack[$pos] === $name) {
                $found = $pos;
                break;
            }
        }

        // not found, ignore it
        if ($found === false) {
            return self::TIDYIGNORE;
        }

        // close all tags that were opened after the matching one
        while (count($this->tagStack) - 1 > $found) {
            $openTag = array_pop($this->tagStack);
            // same "no namespace" marker as scan() uses
            $openNs  = false;
            $this->splitTagName($openTag, $openNs);

            /*
             * Tell the client that we're closing this previous tag for him
             * and be sure that he doesn't interrupt you
             */
            if (!$this->client->onEndElement($openNs, $openTag, false)) {
                return self::TIDYABORTED;
            }
        }

        return self::TIDYVALID;
    }

    /**
     * Parses character data for possible entities
     *
     * This function splits the character data with another regular
     * expression. Found entities like "&nbsp;" are reported to the client
     * via onEntityElement(), everything else via onCharacterData(). As
     * always the client decides whether to continue or stop the scanner.
     *
     * @param string $cData
     * @return bool true to continue, false to stop scanner
     */
    protected function parseCData(&$cData)
    {
        // find entities
        $cDataStack = preg_split('/&(#?)([\w]+);/u', $cData, -1, PREG_SPLIT_DELIM_CAPTURE);

        // the "u" modifier rejects invalid UTF-8; retry byte-wise to walk through corrupt content
        if ($cDataStack === false) {
            $cDataStack = preg_split('/&(#?)([\w]+);/', $cData, -1, PREG_SPLIT_DELIM_CAPTURE);
        }

        // still no luck: hand over the text untouched rather than dropping it
        if ($cDataStack === false) {
            if (!$this->client->onCharacterData($cData)) {
                return false;
            }
            return true;
        }

        // index of the trailing text; everything before comes in groups of three
        $last = count($cDataStack) - 1;

        for ($i = 0; $i < $last; $i += 3) {
            // simple character data in front of the entity
            $text = $cDataStack[$i];

            // numeric entity, e.g. &#196;
            $unicode = ($cDataStack[$i + 1] === '#');

            // entity name or number
            $entity = $cDataStack[$i + 2];

            if (strlen($text)) {
                if (!$this->client->onCharacterData($text)) {
                    return false;
                }
            }

            if (strlen($entity)) {
                if (!$this->client->onEntityElement($entity, $unicode)) {
                    return false;
                }
            }
        }

        // process leftover character data
        if (strlen($cDataStack[$last])) {
            if (!$this->client->onCharacterData($cDataStack[$last])) {
                return false;
            }
        }

        return true;
    }

    /**
     * Converts an attribute string to an array
     *
     * Parses the attribute string and returns an array which maps attribute
     * names to their values.
     *
     * The attribute string is something like: src="pic.jpg" width="100px"
     *
     * Quoted attributes (double or single quotes) are parsed first; their
     * values are returned without the quotes and with backslash escapes left
     * untouched. If a quoted attribute occurs more than once, the last one
     * wins. Whatever remains is split on whitespace and treated as unquoted
     * (name=value) or binary (name only, value 1) attributes; these never
     * overwrite an attribute that already exists.
     *
     * @param string $string contains attribute string
     * @return array attribute list
     */
    protected function parseAttributes($string)
    {
        $string     = trim($string);
        $attributes = array();
        if ($string === '') {
            return $attributes;
        }

        // name="value" or name='value'; backslash-escaped characters may appear inside the value
        $quoted = '/([a-zA-Z_0-9:.-]+)=(?:"((?:\\\\.|[^"\\\\])*)"|\'((?:\\\\.|[^\'\\\\])*)\')/';

        // parse properly quoted attributes
        $match = array();
        if (preg_match_all($quoted, $string, $match, PREG_SET_ORDER)) {
            foreach ($match as $m) {
                // group 3 is only filled for single-quoted values
                $attributes[$m[1]] = (isset($m[3]) && $m[3] !== '') ? $m[3] : $m[2];
            }

            // what is left over may still hold unquoted or binary attributes
            $string = preg_replace($quoted, ' ', $string);
            if ($string === null) {
                $string = '';
            }
        }

        // try to parse non-quoted attributes
        $list = preg_split('/\s+/', $string, -1, PREG_SPLIT_NO_EMPTY);
        if ($list === false) {
            $list = array();
        }

        foreach ($list as $token) {
            if (strpos($token, '=') !== false) {
                // key-value
                list($name, $value) = explode('=', $token, 2);
            } else {
                // binary attribute
                $name  = $token;
                $value = 1;
            }

            // only named attributes that do not exist yet
            if ($name === '' || array_key_exists($name, $attributes)) {
                continue;
            }
            $attributes[$name] = $value;
        }

        return $attributes;
    }
}