 * @license PHP License
 * @package WB
 * @subpackage Wikipedia
 */

WBClass::load( 'WBCache', 'WBStream', 'WBClock', 'WBLog' );

/**
 * Manage Wikipedia index
 *
 * Download and search the Wikipedia page index. This class creates local
 * copies of the Wikipedia page index, which makes it possible to search
 * Wikipedia articles quickly - quite useful for creating links to Wikipedia
 * and the like.
 *
 * As Wikipedia is available in languages other than English as well, there
 * is an index for each language (en, de, fr).
 *
 * For configuration values see {@link wikipedia.xml}. There is also a script
 * (which uses this class) that updates the local copy of the Wikipedia index
 * on a regular basis.
 *
 * @version 0.1.1
 * @package WB
 * @subpackage Wikipedia
 */
class WBWikipedia_Index {

    /**
     * SQL connector
     * @var WBDatasource_SQL
     */
    protected $sql;

    /**
     * Logger
     * @var WBLog
     */
    protected $log;

    /**
     * configuration
     * @var WBConfig
     */
    protected $config;

    /**
     * force flag
     * @var bool
     */
    protected $force = false;

    /**
     * prefix string for used tables
     * @var string
     */
    protected $pre = 'wp';

    /**
     * current language
     * @var string
     */
    protected $lang = 'en';

    /**
     * start up
     *
     * Start SQL connection and load configuration
     */
    public function __construct() {
        $this->log = WBLog::start( __CLASS__ );
        $this->sql = WBClass::create( 'WBDatasource_SQL' );

        $this->config = WBClass::create( 'WBConfig' );
        $this->config->load( 'wikipedia' );
        $this->pre = $this->config->get( 'table/prefix', $this->pre );
    }

    /**
     * use language
     *
     * @param string $lang
     * @return bool true on success
     * @throws WBException_Config
     * @throws WBException_Argument
     */
    public function setLang( $lang ) {
        $langs = $this->config->get( 'index' );
        if( !is_array( $langs ) ) {
            WBClass::load( 'WBException_Config' );
            throw new WBException_Config( 'Config "wikipedia.xml" does not know any indices.', 1, __CLASS__ );
        }
        if( !in_array( $lang, $langs ) ) {
            WBClass::load( 'WBException_Argument' );
            throw new WBException_Argument( 'Argument "' . $lang . '" does not match values in config "wikipedia.xml"', 2, __CLASS__ );
        }

        $this->lang = $lang;
        return true;
    }

    /**
     * find articles by title
     *
     * Search the local index for page titles matching the search string and
     * append the matching rows to $list.
     *
     * @param string $search search string
     * @param array $list found pages (appended)
     * @return int total number of matching titles
     */
    public function find( $search, &$list ) {
        $lang = $this->lang;
        $pre  = $this->pre;

        $search = str_replace( ' ', '_', $search );
        $sFor = $this->sql->quote( $search, true );
        $sRev = $this->sql->quote( strrev( $search ), true );

        $this->log->debug( array(
            'action' => 'find',
            'lang'   => $lang,
            'search' => $search
        ) );

        // NOTE: the body of this query was lost in the source. The statement
        // below is a reconstruction: it assumes the columns created in
        // prepareTable(), that quote() returns the escaped string without
        // surrounding quotes, and an arbitrary result page size of 30.
        $sql = <<<SQL
SELECT SQL_CALC_FOUND_ROWS `title`, `latest`, `touched`, `len`
FROM `{$pre}wikipedia_{$lang}_page`
WHERE `title` LIKE '{$sFor}%'
   OR REVERSE( `title` ) LIKE '{$sRev}%'
LIMIT 30
SQL;

        $res = $this->sql->query( $sql );
        while( $l = $res->fetch_assoc() ) {
            $list[] = $l;
        }
        $res->free_result();

        $foundRes = $this->sql->query( 'SELECT FOUND_ROWS()' );
        $found = $foundRes->fetch_array( MYSQLI_NUM );
        return $found[0];
    }

    /**
     * set force flag
     *
     * Force the update process, even if it is not required. Don't use this
     * unless you really know what you are doing! See the description of
     * {@link update()} and wait and wait and wait...
     *
     * @see updateAll()
     * @see update()
     * @param bool $force
     * @return bool current state of the force flag
     */
    public function setForce( $force = null ) {
        if( is_bool( $force ) ) {
            $this->force = $force;
        }
        return $this->force;
    }
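    /*
     * A minimal usage sketch for the search API above, assuming the result
     * columns created by the reconstructed prepareTable() below:
     *
     *   $index = new WBWikipedia_Index();
     *   $index->setLang( 'de' );
     *   $list  = array();
     *   $found = $index->find( 'Albert Einstein', $list );
     *   // $found holds the total number of matching titles,
     *   // $list the rows of the current result page ('title', 'latest', ...)
     */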
    /**
     * update index for all languages
     *
     * Call {@link update()} for all languages this class is configured for.
     *
     * Usually, this is what your cronjob is supposed to call. :-)
     *
     * @see update()
     * @return bool true on success
     */
    public function updateAll() {
        $langs = $this->config->get( 'index' );
        if( !is_array( $langs ) ) {
            return false;
        }

        foreach( $langs as $lang ) {
            $this->setLang( $lang );
            $this->update();
        }
        return true;
    }

    /**
     * Update index for one language
     *
     * Template method that manages the update steps:
     *
     * - fetch download URL
     * - check whether an update is required
     * - download SQL dump from Wikipedia
     * - prepare table
     * - source SQL dump into a temporary table
     * - copy relevant data to the local table
     * - clean up
     *
     * At first, it tries to fetch the URL of the downloadable file. In case
     * the URL hasn't changed, you are lucky and no update is required.
     * Otherwise you have to wait...
     *
     * Please note that this process is very time consuming - simply because
     * of the huge numbers involved. The biggest Wikipedia - the English one -
     * has more than 2.3 million articles. The download file is about
     * 340 MBytes. Depending on your bandwidth the download itself may take
     * several hours.
     *
     * The fetched file unzips to 1.1 GByte. That's quite a lot of SQL INSERT
     * statements. Feeding your database with this dump takes around 5
     * minutes.
     *
     * Then the relevant columns are copied to another (the production) table,
     * which is quite fast (less than 2 minutes). Well, as the indices were
     * switched off, the database server still has to build the keys. This
     * takes ages.
     *
     * @see lookupUpdate()
     * @see download()
     * @see prepareTablePage()
     * @see sourceSQLDump()
     * @see prepareTable()
     * @see fillTable()
     * @see finishTable()
     * @see removeTablePage()
     * @return bool true in case the update was done
     */
    public function update() {
        $lang = $this->lang;
        $log = array(
            'action'  => 'lookup',
            'lang'    => $lang,
            'elapsed' => 0
        );

        // look for URL
        $start = WBClock::now();
        $url = $this->lookupUpdate();
        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->notice( $log );

        if( empty( $url ) ) {
            $log['action'] = 'failed-lookup';
            $log['elapsed'] = WBClock::stop( $start ) . 'ms';
            $this->log->notice( $log );
            return false;
        }

        // load cached list
        $start = WBClock::now();
        $cache = WBCache::get( __CLASS__ . '.lookup' );
        if( !is_array( $cache ) ) {
            $cache = array();
        }

        // check whether this url is cached
        if( isset( $cache[$lang] ) && $cache[$lang] == $url ) {
            $log['action'] = 'cached';
            $log['elapsed'] = WBClock::stop( $start ) . 'ms';
            if( $this->force ) {
                $log['action'] = 'forced';
                $this->log->notice( $log );
            } else {
                $this->log->notice( $log );
                return false;
            }
        }

        $sqlFile = $this->download( $url );
        $log['action'] = 'download';
        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->notice( $log );

        $start = WBClock::now();
        $this->prepareTablePage();
        $log['action'] = 'prepareTablePage';
        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->notice( $log );

        $start = WBClock::now();
        $this->sourceSQLDump( $sqlFile );
        $log['action'] = 'sourceSQLDump';
        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->notice( $log );

        $start = WBClock::now();
        $this->prepareTable();
        $log['action'] = 'prepareTable';
        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->notice( $log );

        $start = WBClock::now();
        $this->fillTable();
        $log['action'] = 'fillTable';
        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->notice( $log );

        $start = WBClock::now();
        $this->finishTable();
        $log['action'] = 'finishTable';
        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->notice( $log );

        $start = WBClock::now();
        $this->removeTablePage();
        $log['action'] = 'removeTablePage';
        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->notice( $log );

        // update cache
        $cache[$lang] = $url;
        WBCache::set( __CLASS__ . '.lookup', $cache );

        // remove dump
        unlink( $sqlFile );
        return true;
    }
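    /*
     * A minimal cronjob sketch, assuming a small wrapper script around this
     * class (the script itself is not part of this file):
     *
     *   $index = new WBWikipedia_Index();
     *   // $index->setForce( true );   // only for manual re-runs, see setForce()
     *   $index->updateAll();
     */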
    /**
     * See whether an update is needed
     *
     * Check the RSS feed for new versions.
     *
     * @return string|null update URL or null if no update URL was found
     */
    public function lookupUpdate() {
        $lang = $this->lang;
        $rssUrl = sprintf( 'http://dumps.wikimedia.org/%swiki/latest/%swiki-latest-page.sql.gz-rss.xml', $lang, $lang );

        $tmpFile = tempnam( WBParam::get( 'wb/dir/base' ) . '/var/tmp', $this->pre );
        chmod( $tmpFile, 0666 );

        $log = array(
            'action'  => 'rss',
            'lang'    => $lang,
            'elapsed' => 0,
            'url'     => $rssUrl
        );

        $start = WBClock::now();
        $dl  = WBStream::open( $rssUrl, 'r' );
        $tmp = WBStream::open( $tmpFile, 'w' );
        while( !feof( $dl ) ) {
            $buffer = fread( $dl, 4096 );
            fwrite( $tmp, $buffer );
        }
        fclose( $dl );
        fclose( $tmp );

        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->debug( $log );

        WBClass::load( 'XML_RSS' );
        $rss = new XML_RSS( $tmpFile );
        $rss->parse();
        unlink( $tmpFile );

        $url = null;
        foreach( $rss->getItems() as $item ) {
            if( !preg_match( '/href=\"([^\"]+)\"/', $item['description'], $match ) ) {
                break;
            }
            $url = $match[1];
            // only the first one
            break;
        }

        return $url;
    }

    /**
     * Download SQL dump from URL
     *
     * As we download from Wikipedia, those dumps are quite large, even
     * (g)zipped. E.g. the German one has more than 60 MByte, the English one
     * over 330 MByte.
     *
     * This also removes the head of the downloaded file. Because each dump
     * starts with "CREATE TABLE" and adds a bunch of keys, the following
     * INSERT statements are quite slow. Hence the file will be chopped where
     * the actual dump starts.
     *
     * @param string $url typically "http://download.wikimedia.org/dewiki/20080320/dewiki-20080320-page.sql.gz"
     * @return string $file name of temporary file
     */
    protected function download( $url ) {
        $tmpFile = tempnam( WBParam::get( 'wb/dir/base' ) . '/var/tmp', $this->pre );
        chmod( $tmpFile, 0666 );

        // download
        $dl = fopen( $url, 'r' );
        if( !$dl ) {
            WBClass::load( 'WBException_File' );
            throw new WBException_File( 'Failed to open download "' . $url . '"', 2, __CLASS__ );
        }
        $tmp = fopen( $tmpFile . '.gz', 'w' );
        while( !feof( $dl ) ) {
            $buffer = fread( $dl, 4096 );
            fwrite( $tmp, $buffer );
        }
        fclose( $dl );
        fclose( $tmp );

        // unpack and remove CREATE statement
        $zip = gzopen( $tmpFile . '.gz', 'r' );
        $tmp = fopen( $tmpFile, 'w' );

        // skip head with "CREATE TABLE" up to the start of the actual dump
        while( !feof( $zip ) ) {
            $buffer = gzgets( $zip, 4096 );
            if( strncmp( $buffer, '-- Dumping data for table `page`', 32 ) == 0 ) {
                break;
            }
        }

        // just save the rest of the file
        while( !feof( $zip ) ) {
            $buffer = gzread( $zip, 8192 );
            fwrite( $tmp, $buffer );
        }
        fclose( $zip );
        fclose( $tmp );
        unlink( $tmpFile . '.gz' );

        return $tmpFile;
    }
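    /*
     * For reference: once download() has chopped off the header, the dump
     * consists essentially of extended INSERT statements (the values below
     * are made up):
     *
     *   INSERT INTO `page` VALUES (12,0,'Albert_Einstein','',...),(25,0,...);
     *
     * This is why prepareTablePage() has to recreate Wikipedia's `page` table
     * with exactly the column count the dump expects, and why sourceSQLDump()
     * can simply pipe the file into the mysql client.
     */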
    /**
     * Create temporary table
     *
     * To source the dump, a temporary table is needed that suits the INSERT
     * statements from Wikipedia's SQL dump. No keys nor unique constraints.
     *
     * @return bool true on success
     */
    protected function prepareTablePage() {
        // remove old page table
        $res = $this->sql->query( 'DROP TABLE IF EXISTS `page`' );

        // NOTE: the body of this CREATE statement was lost in the source;
        // the column list below is a reconstruction of the MediaWiki `page`
        // table as shipped in the dumps of that time and may need to be
        // adjusted whenever the dump format changes.
        $sql = <<<SQL
CREATE TABLE `page` (
    `page_id` int(8) unsigned NOT NULL,
    `page_namespace` int(11) NOT NULL,
    `page_title` varchar(255) character set utf8 collate utf8_bin NOT NULL,
    `page_restrictions` tinyblob NOT NULL,
    `page_counter` bigint(20) unsigned NOT NULL,
    `page_is_redirect` tinyint(1) unsigned NOT NULL,
    `page_is_new` tinyint(1) unsigned NOT NULL,
    `page_random` double unsigned NOT NULL,
    `page_touched` varchar(14) NOT NULL,
    `page_latest` int(8) unsigned NOT NULL,
    `page_len` int(8) unsigned NOT NULL,
    `page_no_title_convert` tinyint(1) NOT NULL default '0'
) ENGINE=MyISAM DEFAULT CHARSET=utf8
SQL;
        $res = $this->sql->query( $sql );

        // mangle table for some languages
        if( $this->lang == 'en' ) {
            $res = $this->sql->query( 'ALTER TABLE `page` DROP COLUMN `page_no_title_convert`' );
        }

        $res = $this->sql->query( 'ALTER TABLE `page` DISABLE KEYS' );
        return true;
    }

    /**
     * Remove temporary table
     *
     * This table is very large, therefore we need to get rid of it as soon
     * as possible.
     *
     * @return bool true on success
     */
    protected function removeTablePage() {
        $res = $this->sql->query( 'DROP TABLE IF EXISTS `page`' );
        return true;
    }

    /**
     * Create index table for language
     *
     * As we want to search articles by title, we just need enough data to
     * create links to the Wikipedia web site:
     * - title
     * - latest
     *
     * In addition, "touched" and "len" are stored as well.
     *
     * @link http://en.wikipedia.org/wiki/Wikipedia:Citing_Wikipedia#Citation_to_Wikipedia
     * @link http://de.wikipedia.org/wiki/Zitieren_von_Internetquellen#Zitierrichtlinien
     *
     * @return bool true on success
     */
    protected function prepareTable() {
        $lang = $this->lang;
        $res = $this->sql->query( sprintf( 'DROP TABLE IF EXISTS `%swikipedia_%s_page_tmp`', $this->pre, $lang ) );

        $pre = $this->pre;
        // NOTE: the body of this CREATE statement was lost in the source;
        // the reconstruction keeps only the columns mentioned above plus a
        // plain key on the title for prefix searches.
        $sql = <<<SQL
CREATE TABLE `{$pre}wikipedia_{$lang}_page_tmp` (
    `title` varchar(255) character set utf8 collate utf8_bin NOT NULL,
    `latest` int(8) unsigned NOT NULL,
    `touched` varchar(14) NOT NULL,
    `len` int(8) unsigned NOT NULL,
    KEY `title` (`title`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8
SQL;
        $res = $this->sql->query( $sql );

        $res = $this->sql->query( sprintf( 'ALTER TABLE `%swikipedia_%s_page_tmp` DISABLE KEYS', $this->pre, $lang ) );
        return true;
    }

    /**
     * Fill table
     *
     * Copy relevant data from the temporary table
     *
     * @return bool true on success
     */
    protected function fillTable() {
        $lang = $this->lang;
        $pre = $this->pre;

        // NOTE: the body of this INSERT statement was lost in the source;
        // the reconstruction copies the columns created in prepareTable()
        // and assumes that only articles (namespace 0) are of interest.
        $sql = <<<SQL
INSERT INTO `{$pre}wikipedia_{$lang}_page_tmp` ( `title`, `latest`, `touched`, `len` )
SELECT `page_title`, `page_latest`, `page_touched`, `page_len`
FROM `page`
WHERE `page_namespace` = 0
SQL;
        $res = $this->sql->query( $sql );
        return true;
    }

    /**
     * Polish table
     *
     * Switch keys back on and replace the production table.
     *
     * @return bool true on success
     */
    protected function finishTable() {
        $lang = $this->lang;
        $res = $this->sql->query( sprintf( 'ALTER TABLE `%swikipedia_%s_page_tmp` ENABLE KEYS', $this->pre, $lang ) );
        $res = $this->sql->query( sprintf( 'DROP TABLE IF EXISTS `%swikipedia_%s_page`', $this->pre, $lang ) );
        $res = $this->sql->query( sprintf( 'RENAME TABLE `%swikipedia_%s_page_tmp` TO `%swikipedia_%s_page`', $this->pre, $lang, $this->pre, $lang ) );
        return true;
    }

    /**
     * Feed DB with dump
     *
     * This usually takes quite some time. Depending on your server's setup
     * it can easily take a few minutes. E.g. sourcing the English dump that
     * contained 12 000 000 rows took 5 minutes.
     *
     * @param string $dump path to file holding the SQL dump
     * @return bool true on success
     */
    protected function sourceSQLDump( $dump ) {
        $conf = WBClass::create( 'WBConfig' );
        $conf->load( 'config' );

        $cmd = 'mysql -u %s --password="%s" %s <%s';
        $cmd = sprintf( $cmd, $conf->get( 'db/user' ), $conf->get( 'db/password' ), $conf->get( 'db/database' ), $dump );
        exec( $cmd, $out, $ret );
        return true;
    }
}
?>