* @license PHP License
 * @package wb
 * @subpackage Markup
 */

WBClass::load('WBMarkup_Handler');

/**
 * Tag scanner / Markup Scanner
 *
 * Provides a SAX-like, event-driven API for non-XML but tag-based markup.
 * This class only scans the XML-alike string or file and divides it into
 * character data, opening tags, closing tags and entities. Unlike a real
 * XML scanner/parser it does not validate anything. On the contrary, it tries
 * to walk through even corrupt content. Therefore it is useful to parse
 * "badly formatted old-school HTML" or any other type of tag-based markup
 * string.
 *
 * As said, the scanner is only capable of dividing documents, it does not know
 * anything about their purpose. It relies on a handler which implements all
 * necessary event functions (onStartElement, onCharacterData, etc.). This
 * handler is actually responsible for doing whatever it is designed to do -
 * usually transforming the document to this or that format.
 *
 * @version 0.4.0
 * @package wb
 * @subpackage Markup
 */
class WBMarkup_Scanner
{
    /**
     * Stack of split content to parse through
     * @var array
     */
    protected $contentStack = array();

    /**
     * Handler that implements the event callbacks
     * @var WBMarkup_Handler
     */
    protected $client = null;

    /**
     * Well known tags which are always empty
     * @var array
     */
    protected $alwaysEmpty = array(
        'img',
        'br',
        'input',
        'hr'
    );

    /**
     * Tell whether well known empty tags should be closed automatically
     * @var bool
     */
    protected $autoCloseEmpty = true;

    /**
     * The scanner automatically validates and corrects the tag nesting
     * @var bool
     */
    protected $autoTidy = true;

    /**
     * Used by tidyMarkup(). Every opened tag is stored on this stack and
     * removed again when its closing tag was scanned
     *
     * @var array
     */
    protected $tagStack = array();

    /**
     * tidyMarkup() result: nesting is fine (or tidying is switched off),
     * the end tag may be reported
     */
    const TIDYVALID = 1;

    /**
     * tidyMarkup() result: the handler interrupted the scanner while
     * left-over tags were being closed
     */
    const TIDYABORTED = 2;

    /**
     * tidyMarkup() result: the closing tag was never opened and must be
     * ignored
     */
    const TIDYIGNORE = 3;

    /**
     * Maximum content length, 0 switches the limit off
     * @var int
     */
    private $maxContentLength = 0;

    /**
     * Current position within the content
     *
     * Actually, this is the character data position. It is set to -1 as soon
     * as the maximum content length was reached.
     * @var int
     */
    private $contentPos = 0;

    /**
     * Suffix appended if content is too long
     * @var string
     */
    private $content2LongSuffix = ' ...';

    /**
     * Set maximum content length
     *
     * @param int $bytes maximum length of character data, 0 for no limit
     */
    public function setMaxContentLength($bytes = 0)
    {
        $this->maxContentLength = $bytes;
    }

    /**
     * Who is capable of handling the upcoming events?
     *
     * The handler is the one that actually does something with found tags
     * and character data. The handler will be called during scan - hence it
     * must implement the WBMarkup_Handler interface
     *
     * @todo see whether type-hint can be used
     * @param WBMarkup_Handler $client
     * @return bool true on success
     */
    public function setHandler($client)
    {
        /*
        if (!is_subclass_of($client , 'WBMarkup_Handler')) {
            WBClass::load('WBException_Class');
            throw new WBException_Class('Handler must implement interface "WBMarkup_Handler"', 1, __CLASS__);
        }
        */
        $this->client = $client;
        return true;
    }

    /**
     * Convenience function to scan files instead of strings
     *
     * Load file and call string scanner, piece of cake.
     *
     * @param string $file
     * @return bool true if scanner runs through, false if it was aborted.
     * @see scan()
     */
    public function scanFile($file)
    {
        return $this->scan(file_get_contents($file));
    }

    /**
     * Scan string
     *
     * Scans the whole string and informs the handler about start elements,
     * end elements, entities and character data.
     *
     * @param string $content
     * @return bool true if scanner runs through, false if it was aborted.
     * @throws WBException_Xml if tidyMarkup() returns an unknown value
     */
    public function scan($content)
    {
        if (!$this->client->onScanStart($content)) {
            return false;
        }

        // start from a clean state - an aborted scan may have left open tags behind
        $this->contentPos = 0;
        $this->tagStack   = array();

        // shortcut if there are no tags at all
        // (note: entities are not reported separately on this path)
        if (false === strpos($content, '<')) {
            $content = $this->cData2cData($content);
            if (!$this->client->onCharacterData($content)) {
                return false;
            }
            if (!$this->client->onScanComplete()) {
                return false;
            }
            return true;
        }

        // remove comments, keep the content untouched if PCRE gives up (returns null)
        $stripped = preg_replace('/\<\!--(.*)--\>/Ums', '', $content);
        if (null !== $stripped) {
            $content = $stripped;
        }

        /*
         * First step is splitting the given content with our regular expression.
         * After that the whole content is organized in a flat array. One record
         * is always five elements long: the CDATA in front of the tag, the whole
         * tag including brackets, the indicator for a closing tag, the tag name
         * and the attribute string of that tag.
         *
         * Example:
         *   <tag>My CDATA Content<br />more CDATA Content<img src="pic.jpg" />
         * becomes:
         *   '',                   '<tag>',                 '', 'tag', ''
         *   'My CDATA Content',   '<br />',                '', 'br',  '/'
         *   'more CDATA Content', '<img src="pic.jpg" />', '', 'img', 'src="pic.jpg" /'
         * followed by one trailing element holding the CDATA after the last tag.
         */
        $regExp = '/(<(\/?)([\w:]+)[[:space:]]*([^>]*)>)/im';
        $this->contentStack = preg_split($regExp, $content, -1, PREG_SPLIT_DELIM_CAPTURE);
        if (false === $this->contentStack) {
            // PCRE failure: treat everything as trailing character data
            $this->contentStack = array($content);
        }

        $cnt       = count($this->contentStack) - 1;
        $i         = 0;
        $truncated = false;

        // walk through content
        while ($i < $cnt) {
            $cData = $this->contentStack[$i++];

            // skip the complete tag string, it is not needed
            ++$i;

            // end tag like </tag>
            $end = ('/' === $this->contentStack[$i++]);

            // tags with namespace
            $ns  = false;
            $tag = $this->contentStack[$i++];
            if (false !== strpos($tag, ':')) {
                list($ns, $tag) = explode(':', $tag, 2);
            }

            // string holding attributes: src="img.gif" foo="bar"...
            $attributeString = $this->contentStack[$i++];

            // manage empty or auto-close tags like <br /> or <img>
            $empty = false;
            if ('/' === substr($attributeString, -1, 1)) {
                $attributeString = substr($attributeString, 0, -1);
                $empty = true;
            } elseif ($this->autoCloseEmpty && in_array(strtolower($tag), $this->alwaysEmpty, true)) {
                $empty = true;
            }

            // character data
            if (0 < strlen($cData)) {
                if (!$this->parseCData($cData)) {
                    return false;
                }
            }

            // reached max length: close whatever is still open and stop
            if (-1 === $this->contentPos) {
                while (!empty($this->tagStack)) {
                    $openNs  = '';
                    $openTag = array_pop($this->tagStack);
                    $this->splitTagName($openTag, $openNs);
                    $this->client->onEndElement($openNs, $openTag, false);
                }
                $truncated = true;
                break;
            }

            // start tag
            if (!$end) {
                $this->tagStack[] = $this->getTagName($ns, $tag);
                $attributes = $this->parseAttributes($attributeString);
                if (!$this->client->onStartElement($ns, $tag, $attributes, $empty)) {
                    return false;
                }
            }

            // end tag
            if ($end || $empty) {
                switch ($this->tidyMarkup($ns, $tag)) {
                    case self::TIDYVALID:
                        array_pop($this->tagStack);
                        if (!$this->client->onEndElement($ns, $tag, $empty)) {
                            return false;
                        }
                        break;

                    case self::TIDYIGNORE:
                        break;

                    case self::TIDYABORTED:
                        return false;

                    default:
                        WBClass::load('WBException_Xml');
                        throw new WBException_Xml('Unspecified return Value of validateMarkup', 1, __CLASS__);
                }
            }
        }

        /*
         * Parse leftover CData. Nothing must be reported once the content was
         * cut off. parseCData() does the length accounting itself, so the text
         * is handed over as is - otherwise its length would be counted twice.
         */
        if (!$truncated && 0 < strlen($this->contentStack[$i])) {
            if (!$this->parseCData($this->contentStack[$i])) {
                return false;
            }
        }

        // we are done
        if (!$this->client->onScanComplete()) {
            return false;
        }
        return true;
    }

    /**
     * Delivers the tag name prefixed with its namespace, if there is one
     *
     * @param string|bool $ns the namespace, empty or false if there is none
     * @param string $tag the tag
     * @return string "ns:tag" or just "tag"
     */
    protected function getTagName($ns, $tag)
    {
        $ns = (string) $ns;
        if ('' !== $ns) {
            return sprintf('%s:%s', $ns, $tag);
        }
        return $tag;
    }

    /**
     * Splits a full tag name, as built by getTagName(), into namespace and tag
     *
     * Both parameters are passed by reference and only changed if the name
     * actually contains a namespace. Only the first colon separates the
     * namespace, the same way scan() splits tag names.
     *
     * @param string $tag full tag name in, plain tag name out
     * @param string $ns receives the namespace
     * @return bool true  = yes there is a namespace as well
     *              false = there is no namespace
     */
    protected function splitTagName(&$tag, &$ns)
    {
        $splitted = explode(':', $tag, 2);
        if (2 == count($splitted)) {
            $ns  = $splitted[0];
            $tag = $splitted[1];
            return true;
        }
        return false;
    }

    /**
     * Used to validate the read markup content by passing the closing tag with its namespace.
     *
     * If wanted, this function checks every time a closing tag was read
     * that the document structure is still valid. If not, it will be corrected
     * automatically: every tag opened after the matching start tag is closed
     * and reported to the WBMarkup_Handler. The matching tag itself is left on
     * the stack - the caller removes and reports it.
     *
     * @param string|bool $ns the namespace
     * @param string $tag the tag
     * @return int TIDYVALID   = the scanner is not set to validate or the doc is valid
     *             TIDYIGNORE  = this closing tag has not been opened before => ignore this tag
     *             TIDYABORTED = the content was not valid, but during the validation the
     *                           WBMarkup_Handler interrupted the scanner
     */
    protected function tidyMarkup($ns, $tag)
    {
        if (!$this->autoTidy) {
            return self::TIDYVALID;
        }

        $cnt  = count($this->tagStack);
        $name = $this->getTagName($ns, $tag);

        // the tag order is valid => we don't have to do a thing
        if (0 === $cnt || $this->tagStack[$cnt - 1] === $name) {
            return self::TIDYVALID;
        }

        // seek the innermost opened tag of that name, hence search top down
        $found = false;
        for ($pos = $cnt - 1; $pos >= 0; --$pos) {
            if ($this->tagStack[$pos] === $name) {
                $found = $pos;
                break;
            }
        }

        // not found, ignore it
        if (false === $found) {
            return self::TIDYIGNORE;
        }

        // close all tags opened after the matching one, but keep the match itself
        while ($cnt - 1 > $found) {
            $openNs  = '';
            $openTag = array_pop($this->tagStack);
            $this->splitTagName($openTag, $openNs);

            /*
             * Tell the client that we're closing this previous tag for him
             * and be sure that he doesn't interrupt us
             */
            if (!$this->client->onEndElement($openNs, $openTag, false)) {
                return self::TIDYABORTED;
            }
            --$cnt;
        }

        return self::TIDYVALID;
    }

    /**
     * Parses cData for possible entities
     *
     * This function will parse the cData with another regular expression.
     * Found entities like "&nbsp;" are reported to the client. As always, the
     * client decides whether to continue or stop the scanner. Once the maximum
     * content length is reached, the rest of the given cData is dropped.
     *
     * @param string $cData
     * @return bool true to continue, false to stop scanner
     */
    protected function parseCData(&$cData)
    {
        // find entities
        $regExp = '/&(#?)([\w]+);/u';
        $cDataStack = preg_split($regExp, $cData, -1, PREG_SPLIT_DELIM_CAPTURE);
        if (false === $cDataStack) {
            // the "u" modifier rejects invalid UTF-8, retry byte-wise
            $cDataStack = preg_split('/&(#?)([\w]+);/', $cData, -1, PREG_SPLIT_DELIM_CAPTURE);
        }
        if (false === $cDataStack) {
            $cDataStack = array($cData);
        }

        $cnt = count($cDataStack) - 1;
        $i   = 0;

        // scan content: records of text, numeric indicator, entity name
        while ($i < $cnt) {
            // simple character data
            $text      = $this->cData2cData($cDataStack[$i++]);
            $truncated = (-1 === $this->contentPos);

            // numeric ("unicode") entity, e.g. &#196;
            $unicode = ('#' == $cDataStack[$i++]);

            // entity
            $entity = $cDataStack[$i++];

            // simple text
            if (0 < strlen($text)) {
                if (!$this->client->onCharacterData($text)) {
                    return false;
                }
            }

            // content was cut off, drop everything behind
            if ($truncated) {
                return true;
            }

            // found entity? It counts as a single character
            if (0 < strlen($entity)) {
                ++$this->contentPos;
                if (!$this->client->onEntityElement($entity, $unicode)) {
                    return false;
                }
            }
        }

        // process leftover cdata
        if (0 < strlen($cDataStack[$i])) {
            $text = $this->cData2cData($cDataStack[$i]);
            if (!$this->client->onCharacterData($text)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Text 2 Text
     *
     * Collect text until max length is reached. White space is not counted.
     * The word that crosses the limit is kept as a whole, followed by the
     * "too long" suffix; the content position is set to -1 to mark the cut.
     *
     * @see maxContentLength
     * @param string $text
     * @return string the text, possibly shortened
     */
    private function cData2cData($text)
    {
        $tmp   = array();
        $token = preg_split('/(\\s+)/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
        foreach ($token as $t) {
            // simply add white spaces
            if (preg_match('/\\s/', $t)) {
                $tmp[] = $t;
                continue;
            }

            $this->contentPos += strlen($t);
            $tmp[] = $t;

            if (0 < $this->maxContentLength && $this->maxContentLength <= $this->contentPos) {
                $tmp[] = $this->content2LongSuffix;
                $this->contentPos = -1;
                break;
            }
        }

        return implode('', $tmp);
    }

    /**
     * Convert attribute string to array
     *
     * Parses the attribute string and returns an array which contains the
     * names and their values.
     *
     * The attribute string is something like: src="pic.jpg" width="100px"
     * If no properly quoted attribute is found, the string is split at spaces:
     * "key=value" becomes a key-value pair, a bare word becomes a binary
     * attribute with value 1.
     *
     * @param string $string contains attribute string
     * @return array attribute list
     */
    protected function parseAttributes($string)
    {
        $string     = trim($string);
        $attributes = array();
        if (empty($string)) {
            return $attributes;
        }

        // parse properly quoted attributes
        $match = array();
        if (preg_match_all('/([a-z:A-Z_0-9]+)="((?:\\\.|[^"\\\])*)"/U', $string, $match)) {
            foreach ($match[1] as $idx => $name) {
                $attributes[$name] = $match[2][$idx];
            }
            return $attributes;
        }

        // try to parse non-quoted attributes
        $list = array_map('trim', explode(' ', $string));
        foreach ($list as $l) {
            if (empty($l)) {
                continue;
            }

            if (strstr($l, '=')) {
                // key-value
                $l = explode('=', $l, 2);
            } else {
                // binary attribute
                $l = array($l, 1);
            }

            // only if attribute does not exist
            if (isset($attributes[$l[0]])) {
                continue;
            }
            $attributes[$l[0]] = $l[1];
        }

        return $attributes;
    }
}