 * @license PHP License
 * @package WB
 * @subpackage Wikipedia
 */

WBClass::load( 'WBCache', 'WBStream', 'WBClock', 'WBLog' );

/**
 * Manage Wikipedia index
 *
 * Download and search the Wikipedia page index. This class creates local
 * copies of the Wikipedia page index, which makes it possible to search
 * Wikipedia articles quickly - quite useful for creating links to Wikipedia
 * and the like.
 *
 * As Wikipedia is available in languages other than English as well, there
 * is an index for each language (en, de, fr).
 *
 * For configuration values see {@link wikipedia.xml}. There is also a script
 * (which uses this class) that updates the local copy of the Wikipedia index
 * on a regular basis.
 *
 * @version 0.1.1
 * @package WB
 * @subpackage Wikipedia
 */
class WBWikipedia_Index {

    /**
     * SQL connector
     * @var WBDatasource_SQL
     */
    protected $sql;

    /**
     * Logger
     * @var WBLog
     */
    protected $log;

    /**
     * configuration
     * @var WBConfig
     */
    protected $config;

    /**
     * force flag
     * @var bool
     */
    protected $force = false;

    /**
     * prefix string for used tables
     * @var string
     */
    protected $pre = 'wp';

    /**
     * current language
     * @var string
     */
    protected $lang = 'en';

    /**
     * start up
     *
     * Start SQL connection and load configuration
     */
    public function __construct() {
        $this->log = WBLog::start( __CLASS__ );
        $this->sql = WBClass::create( 'WBDatasource_SQL' );

        $this->config = WBClass::create( 'WBConfig' );
        $this->config->load( 'wikipedia' );
        $this->pre = $this->config->get( 'table/prefix', $this->pre );
    }

    /**
     * use language
     *
     * @param string $lang
     * @return bool true on success
     * @throws WBException_Config
     * @throws WBException_Argument
     */
    public function setLang( $lang ) {
        $langs = $this->config->get( 'index' );
        if( !is_array( $langs ) ) {
            WBClass::load( 'WBException_Config' );
            throw new WBException_Config( 'Config "wikipedia.xml" does not know any indices.', 1, __CLASS__ );
        }
        if( !in_array( $lang, $langs ) ) {
            WBClass::load( 'WBException_Argument' );
            throw new WBException_Argument( 'Argument "' . $lang . '" does not match values in config "wikipedia.xml"', 2, __CLASS__ );
        }

        $this->lang = $lang;
        return true;
    }

    /**
     * find articles by title
     *
     * Search the local index for page titles matching the search string and
     * append the matching rows to $list.
     *
     * @param string $search search string
     * @param array $list found pages (appended)
     * @return int total number of matching titles
     */
    public function find( $search, &$list ) {
        $lang = $this->lang;
        $pre  = $this->pre;

        $search = str_replace( ' ', '_', $search );
        $sFor = $this->sql->quote( $search, true );
        $sRev = $this->sql->quote( strrev( $search ), true );

        $this->log->debug( array(
            'action' => 'find',
            'lang'   => $lang,
            'search' => $search
        ) );

        // NOTE: the body of this query was lost in the source. The statement
        // below is a reconstruction: it assumes the columns created in
        // prepareTable(), that quote() returns the escaped string without
        // surrounding quotes, and an arbitrary result page size of 30.
        $sql = <<<SQL
SELECT SQL_CALC_FOUND_ROWS `title`, `latest`, `touched`, `len`
FROM `{$pre}wikipedia_{$lang}_page`
WHERE `title` LIKE '{$sFor}%'
   OR REVERSE( `title` ) LIKE '{$sRev}%'
LIMIT 30
SQL;

        $res = $this->sql->query( $sql );
        while( $l = $res->fetch_assoc() ) {
            $list[] = $l;
        }
        $res->free_result();

        $foundRes = $this->sql->query( 'SELECT FOUND_ROWS()' );
        $found = $foundRes->fetch_array( MYSQLI_NUM );
        return $found[0];
    }

    /**
     * set force flag
     *
     * Force the update process, even if it is not required. Don't use this
     * unless you really know what you are doing! See the description of
     * {@link update()} and wait and wait and wait...
     *
     * @see updateAll()
     * @see update()
     * @param bool $force
     * @return bool current state of the force flag
     */
    public function setForce( $force = null ) {
        if( is_bool( $force ) ) {
            $this->force = $force;
        }
        return $this->force;
    }
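    /*
     * A minimal usage sketch for the search API above, assuming the result
     * columns created by the reconstructed prepareTable() below:
     *
     *   $index = new WBWikipedia_Index();
     *   $index->setLang( 'de' );
     *   $list  = array();
     *   $found = $index->find( 'Albert Einstein', $list );
     *   // $found holds the total number of matching titles,
     *   // $list the rows of the current result page ('title', 'latest', ...)
     */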
    /**
     * update index for all languages
     *
     * Call {@link update()} for all languages this class is configured for.
     *
     * Usually, this is what your cronjob is supposed to call. :-)
     *
     * @see update()
     * @return bool true on success
     */
    public function updateAll() {
        $langs = $this->config->get( 'index' );
        if( !is_array( $langs ) ) {
            return false;
        }

        foreach( $langs as $lang ) {
            $this->setLang( $lang );
            $this->update();
        }
        return true;
    }

    /**
     * Update index for one language
     *
     * Template method that manages the update steps:
     *
     * - fetch download URL
     * - check whether an update is required
     * - download SQL dump from Wikipedia
     * - prepare table
     * - source SQL dump into a temporary table
     * - copy relevant data to the local table
     * - clean up
     *
     * At first, it tries to fetch the URL of the downloadable file. In case
     * the URL hasn't changed, you are lucky and no update is required.
     * Otherwise you have to wait...
     *
     * Please note that this process is very time consuming - simply because
     * of the huge numbers involved. The biggest Wikipedia - the English one -
     * has more than 2.3 million articles. The download file is about
     * 340 MBytes. Depending on your bandwidth the download itself may take
     * several hours.
     *
     * The fetched file unzips to 1.1 GByte. That's quite a lot of SQL INSERT
     * statements. Feeding your database with this dump takes around 5
     * minutes.
     *
     * Then the relevant columns are copied to another (the production) table,
     * which is quite fast (less than 2 minutes). Well, as the indices were
     * switched off, the database server still has to build the keys. This
     * takes ages.
     *
     * @see lookupUpdate()
     * @see download()
     * @see prepareTablePage()
     * @see sourceSQLDump()
     * @see prepareTable()
     * @see fillTable()
     * @see finishTable()
     * @see removeTablePage()
     * @return bool true in case the update was done
     */
    public function update() {
        $lang = $this->lang;
        $log = array(
            'action'  => 'lookup',
            'lang'    => $lang,
            'elapsed' => 0
        );

        // look for URL
        $start = WBClock::now();
        $url = $this->lookupUpdate();
        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->notice( $log );

        if( empty( $url ) ) {
            $log['action'] = 'failed-lookup';
            $log['elapsed'] = WBClock::stop( $start ) . 'ms';
            $this->log->notice( $log );
            return false;
        }

        // load cached list
        $start = WBClock::now();
        $cache = WBCache::get( __CLASS__ . '.lookup' );
        if( !is_array( $cache ) ) {
            $cache = array();
        }

        // check whether this url is cached
        if( isset( $cache[$lang] ) && $cache[$lang] == $url ) {
            $log['action'] = 'cached';
            $log['elapsed'] = WBClock::stop( $start ) . 'ms';
            if( $this->force ) {
                $log['action'] = 'forced';
                $this->log->notice( $log );
            } else {
                $this->log->notice( $log );
                return false;
            }
        }

        $sqlFile = $this->download( $url );
        $log['action'] = 'download';
        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->notice( $log );

        $start = WBClock::now();
        $this->prepareTablePage();
        $log['action'] = 'prepareTablePage';
        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->notice( $log );

        $start = WBClock::now();
        $this->sourceSQLDump( $sqlFile );
        $log['action'] = 'sourceSQLDump';
        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->notice( $log );

        $start = WBClock::now();
        $this->prepareTable();
        $log['action'] = 'prepareTable';
        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->notice( $log );

        $start = WBClock::now();
        $this->fillTable();
        $log['action'] = 'fillTable';
        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->notice( $log );

        $start = WBClock::now();
        $this->finishTable();
        $log['action'] = 'finishTable';
        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->notice( $log );

        $start = WBClock::now();
        $this->removeTablePage();
        $log['action'] = 'removeTablePage';
        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->notice( $log );

        // update cache
        $cache[$lang] = $url;
        WBCache::set( __CLASS__ . '.lookup', $cache );

        // remove dump
        unlink( $sqlFile );
        return true;
    }
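    /*
     * A minimal cronjob sketch, assuming a small wrapper script around this
     * class (the script itself is not part of this file):
     *
     *   $index = new WBWikipedia_Index();
     *   // $index->setForce( true );   // only for manual re-runs, see setForce()
     *   $index->updateAll();
     */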
    /**
     * See whether an update is needed
     *
     * Check the RSS feed for new versions.
     *
     * @return string|null update URL or null if no update URL was found
     */
    public function lookupUpdate() {
        $lang = $this->lang;
        $rssUrl = sprintf( 'http://dumps.wikimedia.org/%swiki/latest/%swiki-latest-page.sql.gz-rss.xml', $lang, $lang );

        $tmpFile = tempnam( WBParam::get( 'wb/dir/base' ) . '/var/tmp', $this->pre );
        chmod( $tmpFile, 0666 );

        $log = array(
            'action'  => 'rss',
            'lang'    => $lang,
            'elapsed' => 0,
            'url'     => $rssUrl
        );

        $start = WBClock::now();
        $dl  = WBStream::open( $rssUrl, 'r' );
        $tmp = WBStream::open( $tmpFile, 'w' );
        while( !feof( $dl ) ) {
            $buffer = fread( $dl, 4096 );
            fwrite( $tmp, $buffer );
        }
        fclose( $dl );
        fclose( $tmp );

        $log['elapsed'] = WBClock::stop( $start ) . 'ms';
        $this->log->debug( $log );

        WBClass::load( 'XML_RSS' );
        $rss = new XML_RSS( $tmpFile );
        $rss->parse();
        unlink( $tmpFile );

        $url = null;
        foreach( $rss->getItems() as $item ) {
            if( !preg_match( '/href=\"([^\"]+)\"/', $item['description'], $match ) ) {
                break;
            }
            $url = $match[1];
            // only the first one
            break;
        }

        return $url;
    }

    /**
     * Download SQL dump from URL
     *
     * As we download from Wikipedia, those dumps are quite large, even
     * (g)zipped. E.g. the German one has more than 60 MByte, the English one
     * over 330 MByte.
     *
     * This also removes the head of the downloaded file. Because each dump
     * starts with "CREATE TABLE" and adds a bunch of keys, the following
     * INSERT statements are quite slow. Hence the file will be chopped where
     * the actual dump starts.
     *
     * @param string $url typically "http://download.wikimedia.org/dewiki/20080320/dewiki-20080320-page.sql.gz"
     * @return string $file name of temporary file
     */
    protected function download( $url ) {
        $tmpFile = tempnam( WBParam::get( 'wb/dir/base' ) . '/var/tmp', $this->pre );
        chmod( $tmpFile, 0666 );

        // download
        $dl = fopen( $url, 'r' );
        if( !$dl ) {
            WBClass::load( 'WBException_File' );
            throw new WBException_File( 'Failed to open download "' . $url . '"', 2, __CLASS__ );
        }
        $tmp = fopen( $tmpFile . '.gz', 'w' );
        while( !feof( $dl ) ) {
            $buffer = fread( $dl, 4096 );
            fwrite( $tmp, $buffer );
        }
        fclose( $dl );
        fclose( $tmp );

        // unpack and remove CREATE statement
        $zip = gzopen( $tmpFile . '.gz', 'r' );
        $tmp = fopen( $tmpFile, 'w' );

        // skip head with "CREATE TABLE" up to the start of the actual dump
        while( !feof( $zip ) ) {
            $buffer = gzgets( $zip, 4096 );
            if( strncmp( $buffer, '-- Dumping data for table `page`', 32 ) == 0 ) {
                break;
            }
        }

        // just save the rest of the file
        while( !feof( $zip ) ) {
            $buffer = gzread( $zip, 8192 );
            fwrite( $tmp, $buffer );
        }
        fclose( $zip );
        fclose( $tmp );
        unlink( $tmpFile . '.gz' );

        return $tmpFile;
    }
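    /*
     * For reference: once download() has chopped off the header, the dump
     * consists essentially of extended INSERT statements (the values below
     * are made up):
     *
     *   INSERT INTO `page` VALUES (12,0,'Albert_Einstein','',...),(25,0,...);
     *
     * This is why prepareTablePage() has to recreate Wikipedia's `page` table
     * with exactly the column count the dump expects, and why sourceSQLDump()
     * can simply pipe the file into the mysql client.
     */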
    /**
     * Create temporary table
     *
     * To source the dump, a temporary table is needed that suits the INSERT
     * statements from Wikipedia's SQL dump. No keys nor unique constraints.
     *
     * @return bool true on success
     */
    protected function prepareTablePage() {
        // remove old page table
        $res = $this->sql->query( 'DROP TABLE IF EXISTS `page`' );

        // NOTE: the body of this CREATE statement was lost in the source;
        // the column list below is a reconstruction of the MediaWiki `page`
        // table as shipped in the dumps of that time and may need to be
        // adjusted whenever the dump format changes.
        $sql = <<<SQL
CREATE TABLE `page` (
    `page_id` int(8) unsigned NOT NULL,
    `page_namespace` int(11) NOT NULL,
    `page_title` varchar(255) character set utf8 collate utf8_bin NOT NULL,
    `page_restrictions` tinyblob NOT NULL,
    `page_counter` bigint(20) unsigned NOT NULL,
    `page_is_redirect` tinyint(1) unsigned NOT NULL,
    `page_is_new` tinyint(1) unsigned NOT NULL,
    `page_random` double unsigned NOT NULL,
    `page_touched` varchar(14) NOT NULL,
    `page_latest` int(8) unsigned NOT NULL,
    `page_len` int(8) unsigned NOT NULL,
    `page_no_title_convert` tinyint(1) NOT NULL default '0'
) ENGINE=MyISAM DEFAULT CHARSET=utf8
SQL;
        $res = $this->sql->query( $sql );

        // mangle table for some languages
        if( $this->lang == 'en' ) {
            $res = $this->sql->query( 'ALTER TABLE `page` DROP COLUMN `page_no_title_convert`' );
        }

        $res = $this->sql->query( 'ALTER TABLE `page` DISABLE KEYS' );
        return true;
    }

    /**
     * Remove temporary table
     *
     * This table is very large, therefore we need to get rid of it as soon
     * as possible.
     *
     * @return bool true on success
     */
    protected function removeTablePage() {
        $res = $this->sql->query( 'DROP TABLE IF EXISTS `page`' );
        return true;
    }

    /**
     * Create index table for language
     *
     * As we want to search articles by title, we just need enough data to
     * create links to the Wikipedia web site:
     * - title
     * - latest
     *
     * In addition, "touched" and "len" are stored as well.
     *
     * @link http://en.wikipedia.org/wiki/Wikipedia:Citing_Wikipedia#Citation_to_Wikipedia
     * @link http://de.wikipedia.org/wiki/Zitieren_von_Internetquellen#Zitierrichtlinien
     *
     * @return bool true on success
     */
    protected function prepareTable() {
        $lang = $this->lang;
        $res = $this->sql->query( sprintf( 'DROP TABLE IF EXISTS `%swikipedia_%s_page_tmp`', $this->pre, $lang ) );

        $pre = $this->pre;
        // NOTE: the body of this CREATE statement was lost in the source;
        // the reconstruction keeps only the columns mentioned above plus a
        // plain key on the title for prefix searches.
        $sql = <<<SQL
CREATE TABLE `{$pre}wikipedia_{$lang}_page_tmp` (
    `title` varchar(255) character set utf8 collate utf8_bin NOT NULL,
    `latest` int(8) unsigned NOT NULL,
    `touched` varchar(14) NOT NULL,
    `len` int(8) unsigned NOT NULL,
    KEY `title` (`title`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8
SQL;
        $res = $this->sql->query( $sql );

        $res = $this->sql->query( sprintf( 'ALTER TABLE `%swikipedia_%s_page_tmp` DISABLE KEYS', $this->pre, $lang ) );
        return true;
    }

    /**
     * Fill table
     *
     * Copy relevant data from the temporary table
     *
     * @return bool true on success
     */
    protected function fillTable() {
        $lang = $this->lang;
        $pre = $this->pre;

        // NOTE: the body of this INSERT statement was lost in the source;
        // the reconstruction copies the columns created in prepareTable()
        // and assumes that only articles (namespace 0) are of interest.
        $sql = <<<SQL
INSERT INTO `{$pre}wikipedia_{$lang}_page_tmp` ( `title`, `latest`, `touched`, `len` )
SELECT `page_title`, `page_latest`, `page_touched`, `page_len`
FROM `page`
WHERE `page_namespace` = 0
SQL;
        $res = $this->sql->query( $sql );
        return true;
    }

    /**
     * Polish table
     *
     * Switch keys back on and replace the production table.
     *
     * @return bool true on success
     */
    protected function finishTable() {
        $lang = $this->lang;
        $res = $this->sql->query( sprintf( 'ALTER TABLE `%swikipedia_%s_page_tmp` ENABLE KEYS', $this->pre, $lang ) );
        $res = $this->sql->query( sprintf( 'DROP TABLE IF EXISTS `%swikipedia_%s_page`', $this->pre, $lang ) );
        $res = $this->sql->query( sprintf( 'RENAME TABLE `%swikipedia_%s_page_tmp` TO `%swikipedia_%s_page`', $this->pre, $lang, $this->pre, $lang ) );
        return true;
    }

    /**
     * Feed DB with dump
     *
     * This usually takes quite some time. Depending on your server's setup
     * it can easily take a few minutes. E.g. sourcing the English dump that
     * contained 12 000 000 rows took 5 minutes.
     *
     * @param string $dump path to file holding the SQL dump
     * @return bool true on success
     */
    protected function sourceSQLDump( $dump ) {
        $conf = WBClass::create( 'WBConfig' );
        $conf->load( 'config' );

        $cmd = 'mysql -u %s --password="%s" %s <%s';
        $cmd = sprintf( $cmd, $conf->get( 'db/user' ), $conf->get( 'db/password' ), $conf->get( 'db/database' ), $dump );
        exec( $cmd, $out, $ret );
        return true;
    }
}
?>