<?php
/**
 * Tag scanner / Markup Scanner
 *
 * $Id$
 *
 * @author gERD Schaufelberger <gerd@php-tools.net>
 * @license PHP License
 * @package wb
 * @subpackage Markup
 */

WBClass::load('WBMarkup_Handler');

/**
 * Tag scanner / Markup Scanner
 *
 * Provides a SAX-like, event-driven API for non-XML but tag-based markup.
 * This class only scans the XML-like string or file and divides it into
 * character data, opening tags, closing tags and entities. Unlike a real
 * XML scanner/parser it does not validate anything. On the contrary, it tries
 * to walk through even corrupt content. Therefore it is useful to parse
 * "badly formatted old-school HTML" or any other type of tag-based markup
 * string.
 *
 * The scanner is only capable of dividing documents, it does not know
 * anything about their purpose. It relies on a handler which implements all
 * necessary event functions (onStartElement, onCharacterData, etc.). This
 * handler is responsible for doing whatever it is designed to do - usually
 * transforming the document to this or that format.
 *
 * Every handler callback returns a boolean: true to continue scanning,
 * false to abort the scan.
 *
 * @version 0.3.0
 * @package wb
 * @subpackage Markup
 */
class WBMarkup_Scanner
{

    /**
     * Pieces of the split content the current scan walks through
     * @var array
     */
    protected $contentStack     = array();

    /**
     * Handler that implements the event callbacks
     * @var WBMarkup_Handler
     */
    protected $client   = null;

    /**
     * Well known tags which are always empty (lower case)
     * @var array
     */
    protected $alwaysEmpty    = array(
                                        'img',
                                        'br',
                                        'input',
                                        'hr'
                                    );

    /**
     * Tell whether well known empty tags should be closed automatically
     * @var bool
     */
    protected $autoCloseEmpty   = true;

    /**
     * Whether the scanner automatically corrects the tag nesting
     * @var bool
     */
    protected $autoTidy  = true;

    /**
     * Names (including namespace prefix) of all currently open tags.
     *
     * Every opened tag is pushed on this stack and removed again when its
     * closing tag was scanned. Used by tidyMarkup().
     *
     * @var array
     */
    protected $tagStack = array();

    /**
     * tidyMarkup() result: the closing tag may be reported to the handler
     */
    const TIDYVALID     = 1;

    /**
     * tidyMarkup() result: the handler aborted while tags were auto-closed
     */
    const TIDYABORTED   = 2;

    /**
     * tidyMarkup() result: the closing tag has no open counterpart, skip it
     */
    const TIDYIGNORE    = 3;

    /**
     * Who is capable to handle the upcoming events?
     *
     * The handler is the one that actually does something with found tags
     * and character data. The handler will be called during scan - hence it
     * must implement the WBMarkup_Handler interface
     *
     * @todo see whether type-hint can be used
     * @param WBMarkup_Handler $client
     * @return bool true on success
     */
    public function setHandler($client)
    {
        $this->client   =   $client;
        return true;
    }

    /**
     * Convenience function to scan files instead of strings
     *
     * Load file and call string scanner, piece of cake.
     *
     * @param string $file
     * @return bool true if scanner runs through, false if it was aborted
     *              or the file could not be read.
     * @see scan()
     */
    public function scanFile( $file )
    {
        $content    =   file_get_contents( $file );

        // an unreadable file must not be scanned as if it were boolean false
        if( false === $content ) {
            return false;
        }

        return $this->scan( $content );
    }

    /**
     * Scan string
     *
     * Scans whole string and informs the handler about start elements,
     * end elements, entities and character data.
     *
     * @param string $content
     * @return bool true if scanner runs through, false if it was aborted.
     * @throws WBException_Xml if tidyMarkup() returns an unknown value
     */
    public function scan( $content )
    {
        // each scan is independent: forget tags left open by a previous run
        $this->tagStack     =   array();
        $this->contentStack =   array();

        if( !$this->client->onScanStart( $content ) ) {
            return false;
        }

        // shortcut if there are no tags at all - entities still get reported
        if( false === strpos( $content, '<' ) ) {
            if( !$this->parseCData( $content ) ) {
                return false;
            }
            return (bool) $this->client->onScanComplete();
        }

        // remove comments; on PCRE failure (null) keep scanning the raw content
        $stripped   =   preg_replace( '/\<\!--(.*)--\>/Ums', '', $content );
        if( null !== $stripped ) {
            $content    =   $stripped;
        }

       /*
        * Split the content with the tag pattern. Afterwards the content is
        * organized as a flat list in which every tag contributes five items:
        * preceding character data, the whole tag including brackets, the
        * closing-tag indicator, the tag name and the attribute string.
        * One trailing item holds the character data after the last tag.
        *
        * Example:
        * <tag>My CDATA Content<br />more CDATA Content<img src="pic.jpg" />
        * becomes:
        * '', '<tag>', '', 'tag', ''
        * 'My CDATA Content', '<br />', '', 'br', '/'
        * 'more CDATA Content', '<img src="pic.jpg" />', '', 'img', 'src="pic.jpg" /'
        * ''
        */
        $regExp     =   '/(<(\/?)([\w:]+)[[:space:]]*([^>]*)>)/im';
        $pieces     =   preg_split( $regExp, $content, -1, PREG_SPLIT_DELIM_CAPTURE );
        if( false === $pieces ) {
            // could not split: treat everything as character data
            $pieces =   array( $content );
        }
        $this->contentStack =   $pieces;

        // index of the trailing character data item
        $last   =   count( $this->contentStack ) - 1;

        // walk through content, one tag (five items) per iteration
        for( $i = 0; $i < $last; $i += 5 ) {

            $cData              =   $this->contentStack[$i];
            // $this->contentStack[$i + 1] holds the complete tag, not needed here

            // end tag like </table>
            $end                =   ( $this->contentStack[$i + 2] === '/' );

            // tags with namespace <foo:bar>
            $ns                 =   false;
            $tag                =   $this->contentStack[$i + 3];
            if( strpos( $tag, ':' ) !== false ) {
                list( $ns, $tag )   =   explode( ':', $tag, 2 );
            }

            // string holding attributes: src="img.gif" foo="bar"...
            $attributeString    =   $this->contentStack[$i + 4];

            // manage empty or autoclose tags like <img /> or <img>
            $empty              =   false;
            if( substr( $attributeString, -1, 1 ) === '/' ) {
                $attributeString    =   substr( $attributeString, 0, -1 );
                $empty              =   true;
            } else if( $this->autoCloseEmpty && in_array( strtolower( $tag ), $this->alwaysEmpty, true ) ) {
                $empty              =   true;
            }

            // character data in front of the tag
            if( '' !== $cData && !$this->parseCData( $cData ) ) {
                return false;
            }

            // start tag
            if( !$end ) {
                $this->tagStack[]   =   $this->getTagName( $ns, $tag );
                $attributes         =   $this->parseAttributes( $attributeString );

                if( !$this->client->onStartElement( $ns, $tag, $attributes, $empty ) ) {
                    return false;
                }
            }

            // end tag
            if( $end || $empty ) {
                switch( $this->tidyMarkup( $ns, $tag ) ) {

                    case self::TIDYVALID:
                        // the matching open tag is on top of the stack now
                        array_pop( $this->tagStack );
                        if( !$this->client->onEndElement( $ns, $tag, $empty ) ) {
                            return false;
                        }
                        break;

                    case self::TIDYIGNORE:
                        break;

                    case self::TIDYABORTED:
                        return false;

                    default:
                        WBClass::load('WBException_Xml');
                        throw new WBException_Xml('Unspecified return Value of validateMarkup', 1, __CLASS__ );
                }
            }
        }

        // parse leftover character data behind the last tag
        if( '' !== $this->contentStack[$last] ) {
            if( !$this->parseCData( $this->contentStack[$last] ) ) {
                return false;
            }
        }

        // we are done
        return (bool) $this->client->onScanComplete();
    }

    /**
     * Delivers the tag name, prefixed with its namespace if there is one
     *
     * @param string|bool $ns the namespace, false or empty string for none
     * @param string $tag the tag
     * @return string "ns:tag" or just "tag"
     */
    protected function getTagName( $ns, $tag )
    {
        // false, null and '' all mean "no namespace"
        if( '' !== (string) $ns ) {
            return "$ns:$tag";
        }

        return $tag;
    }

    /**
     * Splits a name like "ns:tag" into namespace and tag
     *
     * Counterpart of getTagName(). Only the first colon separates the
     * namespace, just like scan() does it. If the name carries no namespace
     * both arguments are left untouched.
     *
     * @param string $tag in: full tag name, out: tag without namespace
     * @param string $ns  out: the namespace, if any
     * @return bool true = yes there is a namespace as well
     *              false = there is no namespace
     */
    protected function splitTagName( &$tag, &$ns )
    {
        $splitted = explode( ':', $tag, 2 );
        if( count( $splitted ) == 2 ) {
            $ns  = $splitted[0];
            $tag = $splitted[1];
            return true;
        }

        return false;
    }

    /**
     * Validates the nesting when a closing tag was read
     *
     * If auto tidy is switched on, this function checks that the closing tag
     * matches an open tag. Tags that were opened after the matching tag and
     * have not been closed yet are closed on behalf of the document and
     * reported to the handler. The matching tag itself is left on the stack,
     * the caller pops and reports it.
     *
     * @param string|bool $ns the namespace
     * @param string $tag the tag
     * @return int TIDYVALID   = the scanner is not set to validate or the matching tag is on top of the stack
     *             TIDYIGNORE  = this closing tag has not been opened before => ignore this tag
     *             TIDYABORTED = the handler interrupted the scanner while left over tags were closed
     */
    protected function tidyMarkup($ns, $tag)
    {
        if (!$this->autoTidy) {
            return self::TIDYVALID;
        }

        $name   = $this->getTagName($ns, $tag);
        $top    = count($this->tagStack) - 1;

        // look for the innermost open tag of that name
        $found  = false;
        for ($i = $top; $i >= 0; --$i) {
            if ($this->tagStack[$i] === $name) {
                $found = $i;
                break;
            }
        }

        // never opened (or nothing open at all), ignore it
        if ($found === false) {
            return self::TIDYIGNORE;
        }

        // close all left over tags above the matching one
        while ($top > $found) {
            $openTag = array_pop($this->tagStack);
            // use false for "no namespace", like scan() does for all other events
            $openNs  = false;
            $this->splitTagName($openTag, $openNs);

            /*
             * Tell the client that we're closing this previous tag for him
             * and be sure that he doesn't interrupt you
             */
            if (!$this->client->onEndElement($openNs, $openTag, false)) {
                return self::TIDYABORTED;
            }

            --$top;
        }

        return self::TIDYVALID;
    }

    /**
     * Parses character data for possible entities
     *
     * Splits the character data at entities like "&nbsp;" or "&#196;".
     * Text is reported with onCharacterData(), entities are reported with
     * onEntityElement(). As always the client decides whether to continue
     * or stop the scanner.
     *
     * @param string $cData
     * @return bool true to continue, false to stop scanner
     */
    protected function parseCData( &$cData )
    {
        // find entities
        $cDataStack =   preg_split( '/&(#?)([\w]+);/u', $cData, -1, PREG_SPLIT_DELIM_CAPTURE );

        // the "u" modifier fails on invalid UTF-8, retry byte-wise
        if( false === $cDataStack ) {
            $cDataStack =   preg_split( '/&(#?)([\w]+);/', $cData, -1, PREG_SPLIT_DELIM_CAPTURE );
        }

        // still no luck: hand over the data as it is
        if( false === $cDataStack ) {
            $cDataStack =   array( $cData );
        }

        // index of the trailing text item
        $last       =   count( $cDataStack ) - 1;

        // every entity contributes three items: text, "#" indicator, name
        for( $i = 0; $i < $last; $i += 3 ) {

            // simple character data
            $text       =   $cDataStack[$i];

            // numeric character reference, e.g. &#196;
            $unicode    =   ( $cDataStack[$i + 1] === '#' );

            // entity
            $entity     =   $cDataStack[$i + 2];

            // simple text
            if( '' !== $text ) {
                if( !$this->client->onCharacterData( $text ) ) {
                    return false;
                }
            }

            // found entity?
            if( '' !== $entity ) {
                if( !$this->client->onEntityElement( $entity, $unicode ) ) {
                    return false;
                }
            }
        }

        // process leftover cdata
        if( '' !== $cDataStack[$last] ) {
            if( !$this->client->onCharacterData( $cDataStack[$last] ) ) {
                return false;
            }
        }

        return true;
    }

    /**
     * Converts attribute string to array
     *
     * Parses the attribute string and returns an array which maps attribute
     * names to their values.
     *
     * The attributes string is something like: src="pic.jpg" width="100px"
     *
     * Supported notations, which may be mixed within one string:
     *  - name="value" and name='value' (backslash escapes are kept as they are)
     *  - name=value
     *  - name (binary attribute, the value becomes integer 1)
     *
     * If a name shows up more than once, a quoted value replaces earlier
     * ones whereas unquoted and binary attributes never overwrite.
     *
     * @param string $string contains attribute string
     * @return array attribute list
     */
    protected function parseAttributes( $string )
    {
        $string     =   trim($string);
        $attributes =   array();
        if ('' === $string) {
            return $attributes;
        }

        /*
         * Sub patterns: 1 = name, 2 = "=" if a value follows,
         * 3 = quote character, 4 = quoted value, 5 = unquoted value
         */
        $regExp     =   '/([^\s=\/"\'<>]+)(?:\s*(=)\s*(?:(["\'])((?:\\\\.|(?!\3)[^\\\\])*)\3|(\S*)))?/';
        $matches    =   array();
        if (!preg_match_all($regExp, $string, $matches, PREG_SET_ORDER)) {
            return $attributes;
        }

        foreach ($matches as $m) {
            $name   =   $m[1];

            // properly quoted attribute
            if (isset($m[3]) && '' !== $m[3]) {
                $attributes[$name]  =   $m[4];
                continue;
            }

            // only if attribute does not exist
            if (array_key_exists($name, $attributes)) {
                continue;
            }

            // key-value without quotes
            if (isset($m[2]) && '=' === $m[2]) {
                $attributes[$name]  =   isset($m[5]) ? $m[5] : '';
                continue;
            }

            // binary attribute
            $attributes[$name]  =   1;
        }

        return $attributes;
    }
}