<?php
/**
 * Tag scanner / Markup Scanner
 *
 * $Id$
 *
 * @author gERD Schaufelberger <gerd@php-tools.net>
 * @license PHP License
 * @package wb
 * @subpackage Markup
 */

WBClass::load('WBMarkup_Handler');

/**
 * Tag scanner / Markup Scanner
 *
 * Provides a SAX-like, event-driven API for non-XML but tag-based code.
 * This class only scans the XML-like string or file and divides it into
 * character data, opening tags, closing tags and entities. Unlike a real
 * XML scanner/parser it does not validate anything. On the contrary, it tries
 * to walk through even corrupt content. Therefore it is useful to parse
 * "badly formatted old-school HTML" or any other type of tag-based markup
 * string.
 *
 * As said, the scanner is only capable of dividing documents; it does not know
 * anything about their purpose. Instead it has a handler which implements all
 * necessary event functions (onStartElement, onCharacterData, etc.). This
 * handler is actually responsible for doing whatever it is designed to do -
 * usually transforming the document into this or that format.
 *
 * @version 0.4.0
 * @package wb
 * @subpackage Markup
 */
class WBMarkup_Scanner
{

    /**
     * Result of splitting the scanned content: repeating groups of five
     * entries (character data, full tag, closing slash, tag name, attribute
     * string) followed by one trailing character data entry.
     * @var array
     */
    protected $contentStack     = array();

    /**
     * Handler that implements the event callbacks (onScanStart,
     * onStartElement, onEndElement, onCharacterData, onEntityElement,
     * onScanComplete)
     * @var WBMarkup_Handler
     */
    protected $client   = null;

    /**
     * Well known tags which are always empty
     * @var array
     */
    protected $alwaysEmpty    = array(
                                        'img',
                                        'br',
                                        'input',
                                        'hr'
                                    );

    /**
     * Tell whether well known empty tags should be closed automatically
     * @var bool
     */
    protected $autoCloseEmpty   = true;

    /**
     * Tell whether the scanner corrects the nesting of closing tags
     * @var bool
     */
    protected $autoTidy  = true;

    /**
     * Names (including namespace prefix) of all tags that are currently open.
     * Every start tag is pushed, every reported end tag is popped.
     *
     * @var array
     */
    protected $tagStack = array();

    /**
     * tidyMarkup(): the closing tag may be reported to the handler
     */
    const TIDYVALID     = 1;

    /**
     * tidyMarkup(): the handler stopped the scanner while left over tags
     * were closed
     */
    const TIDYABORTED   = 2;

    /**
     * tidyMarkup(): the closing tag was never opened and must be skipped
     */
    const TIDYIGNORE    = 3;

    /**
     * Maximum content length, 0 means unlimited
     * @var int
     */
    private $maxContentLength   =   0;

    /**
     * Number of character data bytes (whitespace excluded, one per entity)
     * seen in the current scan; -1 once the maximum length was reached.
     * @var int
     */
    private $contentPos =   0;

    /**
     * Suffix appended to the character data when the content is too long
     * @var string
     */
    private $content2LongSuffix =   ' ...';

    /**
     * Set maximum content length
     *
     * @param int $bytes maximum number of non-whitespace bytes, 0 = unlimited
     */
    public function setMaxContentLength($bytes = 0)
    {
        $this->maxContentLength =   $bytes;
    }

    /**
     * Who is capable to handle the upcoming events?
     *
     * The handler is the one that actually does something with found tags
     * and character data. The handler will be called during scan - hence it
     * must provide the callbacks of the WBMarkup_Handler interface.
     *
     * @todo see whether type-hint can be used
     * @param WBMarkup_Handler $client
     * @return bool true on success
     */
    public function setHandler($client)
    {
        // the handler is intentionally not type checked yet, see @todo
        $this->client   =   $client;
        return true;
    }

    /**
     * Convenience function to scan files instead of strings
     *
     * @param string $file
     * @return bool true if scanner runs through, false if it was aborted
     *              or the file could not be read
     * @see scan()
     */
    public function scanFile($file)
    {
        $content    =   file_get_contents($file);
        if (false === $content) {
            return false;
        }
        return $this->scan($content);
    }

    /**
     * Scan string
     *
     * Scans the whole string and informs the handler about start elements,
     * end elements, character data and entities. Comments are removed.
     * When the maximum content length is reached, all open tags are closed
     * and the rest of the document is dropped.
     *
     * @param string $content
     * @return bool true if scanner runs through, false if it was aborted.
     */
    public function scan($content)
    {
        if (!$this->client->onScanStart($content)) {
            return false;
        }

        // every scan starts with a clean state
        $content            =   (string) $content;
        $this->contentPos   =   0;
        $this->tagStack     =   array();

        // shortcut if there are no tags at all - entities are still reported
        if (false === strpos($content, '<')) {
            if (!$this->parseCData($content)) {
                return false;
            }
            if (!$this->client->onScanComplete()) {
                return false;
            }
            return true;
        }

        // remove comments, keep the content as it is if the expression fails
        $stripped   =   preg_replace('/\<\!--(.*)--\>/Ums', '', $content);
        if (null !== $stripped) {
            $content    =   $stripped;
        }

       /*
        * First step is splitting the given content with our regular expression.
        * After that the whole content is organized as a flat array. One group
        * is always five entries long: CDATA, the whole tag including brackets,
        * indicator for closing tag, the tag name, attributes of that tag
        *
        * Example:
        * <tag>My CDATA Content<br />more CDATA Content<img src="pic.jpg" />
        * becomes:
        * '', <tag>, '', tag, ''
        * 'My CDATA Content', '<br />'  , '', 'br', '/'
        * 'more CDATA Content', '<img src="pic.jpg" />', '', 'img', 'src="pic.jpg" /'
        */
        $regExp           = '/(<(\/?)([\w:]+)[[:space:]]*([^>]*)>)/im';

        $this->contentStack = preg_split($regExp, $content, -1, PREG_SPLIT_DELIM_CAPTURE);
        if (false === $this->contentStack) {
            // splitting failed: treat everything as character data
            $this->contentStack = array($content);
        }
        $cnt        =   count($this->contentStack) - 1;
        $i          =   0;
        $truncated  =   false;

        // walk through content
        while ($i < $cnt) {

            $cData      = $this->contentStack[$i++];

            // the full tag text is not needed
            $i++;

            // end tag like </table>
            $end        = ('/' === $this->contentStack[$i++]);

            // tags with namespace <foo:bar>
            $ns         = false;
            $tag        = $this->contentStack[$i++];
            if (false !== strpos($tag, ':')) {
                list($ns, $tag) = explode(':', $tag, 2);
            }

            // string holding attributes: src="img.gif" foo="bar"...
            $attributeString    = $this->contentStack[$i++];

            // manage empty or autoclose tags like <img /> or <img>
            $empty      = false;
            if ('/' === substr($attributeString, -1, 1)) {
                $attributeString    =   substr($attributeString, 0, -1);
                $empty      = true;
            } else if ($this->autoCloseEmpty && in_array(strtolower($tag), $this->alwaysEmpty, true)) {
                $empty      = true;
            }

            // character data in front of the tag
            if (0 < strlen($cData)) {
                if (!$this->parseCData($cData)) {
                    return false;
                }
            }

            // reached max length: close everything and drop the rest
            if (-1 == $this->contentPos) {
                while (!empty($this->tagStack)) {
                    $openNs     =   '';
                    $openTag    =   array_pop($this->tagStack);
                    $this->splitTagName($openTag, $openNs);
                    $this->client->onEndElement($openNs, $openTag, false);
                }
                $truncated  =   true;
                break;
            }

            // start tag
            if (!$end) {
                $this->tagStack[]   =   $this->getTagName($ns, $tag);
                $attributes         =   $this->parseAttributes($attributeString);

                if (!$this->client->onStartElement($ns, $tag, $attributes, $empty)) {
                    return false;
                }
            }

            // end tag
            if ($end || $empty) {
                switch ($this->tidyMarkup($ns, $tag)) {

                    case self::TIDYVALID:
                        array_pop($this->tagStack);
                        if (!$this->client->onEndElement($ns, $tag, $empty)) {
                            return false;
                        }
                        break;

                    case self::TIDYIGNORE:
                        break;

                    case self::TIDYABORTED:
                        return false;

                    default:
                        WBClass::load('WBException_Xml');
                        throw new WBException_Xml('Unspecified return Value of validateMarkup', 1, __CLASS__ );
                }
            }
        }

        // parse leftover CData - parseCData() does the length accounting
        if (!$truncated && 0 < strlen($this->contentStack[$i])) {
            $text   =   $this->contentStack[$i];
            if (!$this->parseCData($text)) {
                return false;
            }
        }

        // we are done
        if (!$this->client->onScanComplete()) {
            return false;
        }
        return true;
    }

    /**
     * Build the full tag name: "namespace:tag", or just "tag" when there is
     * no namespace
     *
     * @param string|bool $ns the namespace, false or '' if there is none
     * @param string $tag the tag
     * @return string
     */
    protected function getTagName($ns, $tag)
    {
        if ('' !== (string) $ns) {
            return $ns . ':' . $tag;
        }
        return $tag;
    }

    /**
     * Split a full tag name into namespace and tag
     *
     * Counterpart of getTagName(): everything in front of the first colon is
     * the namespace. Both parameters are only changed if there is a colon.
     *
     * @param string $tag in: full tag name, out: tag without namespace
     * @param string $ns out: the namespace
     * @return bool true = there is a namespace, false = there is none
     */
    protected function splitTagName(&$tag, &$ns)
    {
        $parts  =   explode(':', $tag, 2);
        if (2 === count($parts)) {
            $ns     =   $parts[0];
            $tag    =   $parts[1];
            return true;
        }

        return false;
    }

    /**
     * Check a closing tag against the stack of open tags
     *
     * If tidying is switched on, the innermost open tag with the same name is
     * looked up. All tags opened after it are closed automatically and
     * reported to the handler. The matching tag itself stays on the stack,
     * the caller pops and reports it.
     *
     * @param string|bool $ns the namespace
     * @param string $tag the tag
     * @return int TIDYVALID = the scanner is not set to tidy, or the closing tag may be reported
     *           TIDYIGNORE = this closing tag has not been opened before => ignore this tag
     *           TIDYABORTED = the handler interrupted the scanner while left over tags were closed
     */
    protected function tidyMarkup($ns, $tag)
    {
        if (!$this->autoTidy) {
            return self::TIDYVALID;
        }

        $name   =   $this->getTagName($ns, $tag);
        $cnt    =   count($this->tagStack);

        // look for the innermost open tag of that name
        $found  =   false;
        for ($pos = $cnt - 1; $pos >= 0; --$pos) {
            if ($this->tagStack[$pos] === $name) {
                $found  =   $pos;
                break;
            }
        }

        // never opened (or nothing open at all), ignore it
        if (false === $found) {
            return self::TIDYIGNORE;
        }

        // close all tags opened after the matching one
        while ($cnt - 1 > $found) {
            $openNs     =   '';
            $openTag    =   array_pop($this->tagStack);
            $this->splitTagName($openTag, $openNs);

            /*
             * Tell the client that we're closing this previous tag for him
             * and be sure that he doesn't interrupt you
             */
            if (!$this->client->onEndElement($openNs, $openTag, false)) {
                return self::TIDYABORTED;
            }

            --$cnt;
        }

        return self::TIDYVALID;
    }

    /**
     * Parses cData for possible entities
     *
     * Splits the character data at entities like "&nbsp;" or "&#196;" and
     * reports text and entities to the handler. Stops silently as soon as the
     * maximum content length is reached. As always, the handler decides
     * whether to continue or stop the scanner.
     *
     * @param string $cData
     * @return bool true to continue, false to stop scanner
     */
    protected function parseCData(&$cData)
    {
        // find entities
        $cDataStack =   preg_split('/&(#?)([\w]+);/u', $cData, -1, PREG_SPLIT_DELIM_CAPTURE);
        if (false === $cDataStack) {
            // most likely not valid UTF-8: retry byte-wise
            $cDataStack =   preg_split('/&(#?)([\w]+);/', $cData, -1, PREG_SPLIT_DELIM_CAPTURE);
        }
        if (false === $cDataStack) {
            $cDataStack =   array($cData);
        }
        $cnt        =   count($cDataStack) - 1;
        $i          =   0;

        // scan content: groups of text, unicode marker, entity name
        while ($i < $cnt) {

            // simple character data
            $text       = $this->cData2cData($cDataStack[$i++]);

            // is Unicode, e.g. &#196;
            $unicode    = ('#' === $cDataStack[$i++]);

            // entity
            $entity     = $cDataStack[$i++];

            // simple text
            if (0 < strlen($text)) {
                if (!$this->client->onCharacterData($text)) {
                    return false;
                }
            }

            // max length reached within this text: drop the rest
            if (-1 == $this->contentPos) {
                return true;
            }

            // found entity? each entity counts as one character
            if (0 < strlen($entity)) {
                ++$this->contentPos;
                if (!$this->client->onEntityElement($entity, $unicode)) {
                    return false;
                }
            }
        }

        // process leftover cdata
        if (0 < strlen($cDataStack[$i])) {
            $text = $this->cData2cData($cDataStack[$i]);
            if (!$this->client->onCharacterData($text)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Text 2 Text
     *
     * Collects text until the maximum length is reached. Whitespace is not
     * counted. The word that reaches the limit is kept as a whole, followed
     * by the suffix; everything after it is dropped and the content position
     * is set to -1.
     *
     * @see maxContentLength
     * @param string $text
     * @return string
     */
    private function cData2cData($text)
    {
        $tmp    =   array();
        $token  =   preg_split('/(\\s+)/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
        foreach ($token as $t) {

            // simply add white spaces
            if (preg_match('/\\s/', $t)) {
                $tmp[]  =   $t;
                continue;
            }

            $this->contentPos   +=  strlen($t);

            if (0 < $this->maxContentLength && $this->maxContentLength <= $this->contentPos) {
                $tmp[]  =   $t;
                $tmp[]  =   $this->content2LongSuffix;

                $this->contentPos   =   -1;
                break;
            }

            $tmp[]  =   $t;
        }

        return implode('', $tmp);
    }

    /**
     * Convert attribute string to array
     *
     * The attribute string is something like: src="pic.jpg" width="100px"
     *
     * If at least one properly double-quoted attribute is found, only quoted
     * attributes are returned. Otherwise the string is split at whitespace
     * into key=value pairs; tokens without "=" are binary attributes with
     * the value 1, and the first occurrence of a name wins.
     *
     * @param string $string contains attribute string
     * @return array attribute list, name => value
     */
    protected function parseAttributes($string)
    {
        $string     =   trim($string);
        $attributes =   array();
        if ('' === $string) {
            return $attributes;
        }

        // parse properly quoted attributes, names may contain ":" and "-"
        $match      =   array();
        if (preg_match_all('/([a-z:A-Z_0-9\-]+)="((?:\\\.|[^"\\\])*)"/U', $string, $match)) {
            foreach ($match[1] as $idx => $name) {
                $attributes[$name]  =   $match[2][$idx];
            }
            return $attributes;
        }

        // try to parse non-quoted attributes
        $tokens =   preg_split('/\s+/', $string, -1, PREG_SPLIT_NO_EMPTY);
        foreach ($tokens as $token) {
            if (false !== strpos($token, '=')) {
                // key-value
                $pair   =   explode('=', $token, 2);
            } else {
                // binary attribute
                $pair   =   array($token, 1);
            }

            // only if attribute does not exist
            if (isset($attributes[$pair[0]])) {
                continue;
            }
            $attributes[$pair[0]]   =   $pair[1];
        }

        return $attributes;
    }
}