* @license PHP License
 * @package WB
 * @subpackage base
 */

WBClass::load('WBDictionary', 'WBString');

/**
 * Dictionary: URL
 *
 * Manage URLs. A "word" of this dictionary is a URL, an e-mail address or a
 * javascript call. Each word is stored split into protocol, host and path.
 *
 * @version 0.2.3
 * @package WB
 * @subpackage base
 */
class WBDictionary_URL extends WBDictionary
{
    /**
     * Table access
     * @var WBDatasource_Table
     */
    protected $table;

    /**
     * additional data to populate record
     *
     * This array will be extended by primary key of vfsfile
     *
     * @see init()
     * @var array
     */
    protected $mergeFields = array(
        'title',
        'description',
        'headlines'
    );

    /**
     * Record cache shared by all instances, keyed by getCacheId()
     *
     * @see getCacheId()
     * @var array
     */
    static private $wordCache = array();

    /**
     * 2nd constructor
     *
     * Creates the table accessor and appends the primary key of table
     * vfsfile to the list of merge fields.
     */
    public function init()
    {
        $this->table = WBClass::create('WBDatasource_Table');
        $this->mergeFields[] = $this->table->getIdentifier('vfsfile');
    }

    /**
     * find entry in dictionary
     *
     * Actual look up of data in dictionary. Results are cached, a cached
     * record is returned without asking the table again.
     *
     * @param array $data exploded word: protocol, host and path
     * @return array|null record or null if there is no matching entry
     */
    protected function find($data)
    {
        $cacheId = $this->getCacheId($data);
        $cache   = $this->getCache($cacheId);
        if (!empty($cache)) {
            return $cache;
        }

        // every field of the exploded word must match
        $clause = array();
        foreach ($data as $field => $value) {
            $clause[] = array(
                'field' => $field,
                'value' => $value
            );
        }

        $row = $this->table->get('url', null, null, $clause);
        // guard before count(): counting a non-countable result is an error in recent PHP
        if (!$row || 0 == count($row)) {
            return null;
        }

        $this->setCache($cacheId, $row[0]);
        return $row[0];
    }

    /**
     * load record by id
     *
     * Restore dictionary object from table and rebuild the word (URL)
     * from protocol, host and path.
     *
     * @param string $id
     * @throws WBException_Argument if there is not exactly one record for id
     */
    public function load($id)
    {
        // already loaded
        if ($this->id == $id) {
            return;
        }

        $data = $this->table->get('url', $id);
        if (!$data || count($data) != 1) {
            WBClass::load('WBException_Argument');
            throw new WBException_Argument('Failed to load dictionary record URL by id ' . $id . '!', 1, __CLASS__);
        }

        $data       = $data[0];
        $this->data = $data;
        $this->id   = $data['id'];
        $this->word = $data['path'];

        switch ($data['protocol']) {
            case 'mailto':
                $this->word = sprintf('%s@%s', $data['path'], $data['host']);
                break;

            case 'self':
                // local URLs: the path is the word
                break;

            case 'javascript':
                $this->word = sprintf('javascript:%s', $data['path']);
                break;

            default:
                $this->word = sprintf('%s://%s%s', $data['protocol'], $data['host'], $data['path']);
                break;
        }

        $cacheId = $this->getCacheId($data);
        $this->setCache($cacheId, $data);
    }

    /**
     * extract dictionary data from word
     *
     * Well, in URL dictionary, a word is a URL. This method extracts
     * protocol, hostname and path from URL. URLs also may be e-mail
     * addresses or javascript calls.
     *
     * @see include/WB/WBDictionary#explode()
     * @param string $word
     * @return array protocol, host and path
     */
    protected function explode($word)
    {
        $word = trim($word);
        if (empty($word)) {
            return $this->explodeEmpty();
        }

        if (0 == strncmp('javascript:', $word, 11)) {
            return $this->explodeJavascript($word);
        }

        if ($this->isEMail($word)) {
            return $this->explodeEMail($word);
        }

        return $this->explodeUrl($word);
    }

    /**
     * Tell whether word should be treated as e-mail address
     *
     * A word containing "@" is an e-mail address unless it is an absolute
     * URL (scheme://...) or a local path (/...). Those may contain "@" as
     * part of credentials or path and must be exploded as URLs.
     *
     * @param string $word
     * @return bool
     */
    private function isEMail($word)
    {
        if (false === strpos($word, '@')) {
            return false;
        }

        return !preg_match('#^(?:\\w+://|/)#', $word);
    }

    /**
     * Convert empty string to URL data
     *
     * @return array
     */
    private function explodeEmpty()
    {
        return array(
            'protocol' => '',
            'host'     => '',
            'path'     => ''
        );
    }

    /**
     * Convert javascript call to URL data
     *
     * @param string $word word starting with "javascript:"
     * @return array
     */
    private function explodeJavascript($word)
    {
        // strip leading "javascript:" (11 characters)
        return array(
            'protocol' => 'javascript',
            'host'     => '',
            'path'     => substr($word, 11)
        );
    }

    /**
     * convert e-mail address to URL data
     *
     * Everything in front of the first "@" becomes the path, the rest
     * becomes the host.
     *
     * @todo validate e-mail address properly
     * @param string $word
     * @return array
     */
    private function explodeEMail($word)
    {
        $parts = explode('@', $word, 2);

        return array(
            'protocol' => 'mailto',
            'host'     => $parts[1],
            'path'     => $parts[0]
        );
    }

    /**
     * explode standard URLs
     *
     * Super placeholders are replaced first. Local URLs (starting with
     * slash) are prefixed with protocol and server, "www." URLs with
     * "http://". URLs pointing to this site are handed over to
     * explodeServiceUrl(). Words that don't look like a URL at all are
     * stored with protocol "string".
     *
     * @todo check for username and password in URL
     * @see explodeServiceUrl()
     * @param string $word
     * @return array
     */
    private function explodeUrl($word)
    {
        $word = WBString::replaceSuperPlaceholders($word);

        // prepend SERVER to local URLs; strncmp() is safe even if word became empty
        if (0 == strncmp('/', $word, 1)) {
            $word = WBString::replaceSuperPlaceholders('[[PROTOCOL]]://[[SERVER]]') . $word;
        }

        // lazy www.-url
        if (0 == strncmp('www.', $word, 4)) {
            $word = 'http://' . $word;
        }

        // URL points to this site?
        $self = WBString::replaceSuperPlaceholders('[[PROTOCOL]]://[[SERVER]][[DOCROOT]]');
        if (0 == strncmp($self, $word, strlen($self))) {
            return $this->explodeServiceUrl(substr($word, strlen($self)));
        }

        if (!preg_match('|^(\\w+)://([^/]*)|', $word, $match)) {
            // not a URL: keep the whole word as path
            return array(
                'protocol' => 'string',
                'host'     => '',
                'path'     => $word
            );
        }

        // the pattern is anchored, hence the full match is "protocol://host"
        $path = substr($word, strlen($match[0]));
        if (empty($path)) {
            $path = '/';
        }

        return array(
            'protocol' => $match[1],
            'host'     => $match[2],
            'path'     => $path
        );
    }

    /**
     * explode rest of path from URL for "local" services
     *
     * Local services are defined in main configuration in section service
     * and mapped to strings like [[SERVICE_HTML]]. This method tries to
     * find any [[SERVICE_ path and replaces the URL path with the
     * corresponding super placeholder. Therefore no real URLs or paths must
     * be saved in URL dictionary for well known services.
     *
     * @param string $word path relative to docroot
     * @return array
     */
    protected function explodeServiceUrl($word)
    {
        $url = array(
            'protocol' => 'self',
            'host'     => '',
            'path'     => '/'
        );
        if (empty($word)) {
            return $url;
        }

        $sph = WBString::getSuperPlaceHolder();

        // docroot must be removed
        $docroot    = $sph['map']['docroot'];
        $docrootLen = strlen($docroot);

        $found = false;
        foreach ($sph['replace'] as $i => $r) {
            // only service placeholders are of interest
            if (strncmp($sph['search'][$i], '[[SERVICE_', 10) != 0) {
                continue;
            }

            // is URL within docroot?
            if (strncmp($r, $docroot, $docrootLen) != 0) {
                continue;
            }
            $r  = substr($r, $docrootLen);
            $rt = $r;

            // the base URL also works without slash at the end
            if (strlen($word) < strlen($r)) {
                $rt = rtrim($rt, '/');
            }
            if (strncmp($rt, $word, strlen($rt)) != 0) {
                continue;
            }

            // remove actual path and use placeholder
            $word  = $sph['search'][$i] . substr($word, strlen($r));
            $found = true;
            break;
        }

        // try SELF_NO_LANG if SERVICE_ failed
        if (!$found) {
            $r = substr($sph['map']['self_no_lang'], $docrootLen);
            if (strncmp($r, $word, strlen($r)) == 0) {
                $word = '[[SELF_NO_LANG]]' . substr($word, strlen($r));
            }
        }

        $url['path'] = $word;
        return $url;
    }

    /**
     * add record to dictionary
     *
     * Saves a new record in table url and stores it in cache.
     *
     * @see include/WB/WBDictionary#insert()
     * @param array $data record to save
     * @return string record id
     */
    protected function insert($data)
    {
        // cache id is built from protocol, host and path only
        $cacheId = $this->getCacheId($data);

        $id         = $this->table->save('url', '__new', $data);
        $data['id'] = $id;
        $this->setCache($cacheId, $data);

        return $id;
    }

    /**
     * save additional data to dictionary record
     *
     * This is used to populate the dictionary record
     *
     * @see populate()
     * @see $mergeFields
     * @param array $save
     * @return array
     */
    protected function merge($save)
    {
        $this->table->save('url', $this->id, $save);
        return $save;
    }

    /**
     * automatic populate function
     *
     * Surf to URL and extract title as well as headlines (h1 - h4).
     * Afterwards the URL is scheduled for taking a screen shot. Only
     * http and https URLs are fetched.
     *
     * @param string $id optional record id to load first
     * @see include/WB/WBDictionary#autoPopulate()
     */
    public function autoPopulate($id = null)
    {
        if ($id) {
            $this->load($id);
        }

        $tmp = null;
        switch ($this->data['protocol']) {
            case 'self':
                // @todo build proper URL - until then local URLs are not fetched
                return;

            case 'http':
            case 'https':
                $tmp = $this->download($this->word);
                break;

            default:
                // don't do anything for other protocols
                break;
        }

        if (!$tmp) {
            return;
        }

        $mime = $tmp->getMimeType();
        if ($mime[0] != 'text' && $mime[1] != 'xml') {
            return;
        }

        // treat unreadable file like an empty document
        $cnt = file_get_contents($tmp->realpath());
        if (false === $cnt) {
            $cnt = '';
        }

        // title is always saved, it is empty if document has none
        $save = array(
            'title' => ''
        );
        if (preg_match('#<title[^>]*>(.*)</title>#isU', $cnt, $match)) {
            $save['title'] = trim($match[1]);
        }

        // ungreedy match up to the corresponding closing tag
        if (preg_match_all('#<(h1|h2|h3|h4)[^>]*>(.*)</\\1>#isU', $cnt, $matches, PREG_SET_ORDER)) {
            $blurb = array();
            foreach ($matches as $match) {
                $blurb[] = str_replace("\n", ' ', strip_tags($match[2]));
            }
            $save['headlines'] = implode("\n", $blurb);
        }

        $this->populate($save);

        // schedule for taking screen shot of web site
        $queue = array(
            $this->table->getIdentifier('urlscreenshotqueue') => $this->id
        );
        $this->table->save('urlscreenshotqueue', '__new', $queue);
    }

    /**
     * download to temporary file
     *
     * Copies the content found at URL to a temporary file.
     *
     * @param string $url
     * @return WBFile|null temporary file or null if URL or file could not be opened
     */
    protected function download($url)
    {
        $tmp = WBClass::create('WBFile');
        $tmp->tempnam('url');

        WBClass::load('WBStream');
        $fin = WBStream::open($url, 'r');
        if (!$fin) {
            // without this check, feof() would never report the end of stream
            return null;
        }

        $fout = fopen($tmp->realpath(), 'w');
        if (!$fout) {
            fclose($fin);
            return null;
        }

        while (!feof($fin)) {
            $buffer = fgets($fin, 8192);
            if (false === $buffer) {
                // end of stream or read error
                break;
            }
            fputs($fout, $buffer, strlen($buffer));
        }

        fclose($fin);
        fclose($fout);

        return $tmp;
    }

    /**
     * Build cacheId from data
     *
     * @param array $data record or exploded word, must provide protocol, host and path
     * @return string
     */
    private function getCacheId($data)
    {
        return sprintf('%s://%s/%s', $data['protocol'], $data['host'], $data['path']);
    }

    /**
     * Get cached data
     *
     * @param string $cacheId
     * @return array|null cached record or null if there is none
     */
    private function getCache($cacheId)
    {
        if (isset(self::$wordCache[$cacheId])) {
            return self::$wordCache[$cacheId];
        }
        return null;
    }

    /**
     * store in cache
     *
     * @param string $cacheId
     * @param array $data
     */
    private function setCache($cacheId, $data)
    {
        self::$wordCache[$cacheId] = $data;
    }
}