[ Index ]

PHP Cross Reference of Joomla 2.5.4 DE

title

Body

[close]

/administrator/components/com_finder/helpers/indexer/ -> helper.php (source)

   1  <?php
   2  /**
   3   * @package     Joomla.Administrator
   4   * @subpackage  com_finder
   5   *
   6   * @copyright   Copyright (C) 2005 - 2012 Open Source Matters, Inc. All rights reserved.
   7   * @license     GNU General Public License version 2 or later; see LICENSE
   8   */
   9  
  10  defined('_JEXEC') or die;
  11  
  12  // Register dependent classes.
  13  JLoader::register('FinderIndexerStemmer', dirname(__FILE__) . '/stemmer.php');
  14  JLoader::register('FinderIndexerToken', dirname(__FILE__) . '/token.php');
  15  
  16  /**
  17   * Helper class for the Finder indexer package.
  18   *
  19   * @package     Joomla.Administrator
  20   * @subpackage  com_finder
  21   * @since       2.5
  22   */
  23  class FinderIndexerHelper
  24  {
  25      /**
  26       * The token stemmer object. The stemmer is set by whatever class
  27       * wishes to use it but it must be an instance of FinderIndexerStemmer.
  28       *
  29       * @var        FinderIndexerStemmer
  30       * @since    2.5
  31       */
  32      public static $stemmer;
  33  
  34      /**
  35       * Method to parse input into plain text.
  36       *
  37       * @param   string  $input   The raw input.
  38       * @param   string  $format  The format of the input. [optional]
  39       *
  40       * @return  string  The parsed input.
  41       *
  42       * @since   2.5
  43       * @throws  Exception on invalid parser.
  44       */
  45  	public static function parse($input, $format = 'html')
  46      {
  47          // Get a parser for the specified format and parse the input.
  48          return FinderIndexerParser::getInstance($format)->parse($input);
  49      }
  50  
  51      /**
  52       * Method to tokenize a text string.
  53       *
  54       * @param   string   $input   The input to tokenize.
  55       * @param   string   $lang    The language of the input.
  56       * @param   boolean  $phrase  Flag to indicate whether input could be a phrase. [optional]
  57       *
  58       * @return  array  An array of FinderIndexerToken objects.
  59       *
  60       * @since   2.5
  61       */
  62  	public static function tokenize($input, $lang, $phrase = false)
  63      {
  64          static $cache;
  65          $store = JString::strlen($input) < 128 ? md5($input . '::' . $lang . '::' . $phrase) : null;
  66  
  67          // Check if the string has been tokenized already.
  68          if ($store && isset($cache[$store]))
  69          {
  70              return $cache[$store];
  71          }
  72  
  73          $tokens = array();
  74          $terms = array();
  75          $quotes = html_entity_decode('&#8216;&#8217;&#39;', ENT_QUOTES, 'UTF-8');
  76  
  77          // Get the simple language key.
  78          $lang = FinderIndexerHelper::getPrimaryLanguage($lang);
  79  
  80          /*
  81           * Parsing the string input into terms is a multi-step process.
  82           *
  83           * Regexes:
  84           *    1. Remove everything except letters, numbers, quotes, apostrophe, plus, dash, period, and comma.
  85           *    2. Remove plus, dash, period, and comma characters located before letter characters.
  86           *  3. Remove plus, dash, period, and comma characters located after other characters.
  87           *  4. Remove plus, period, and comma characters enclosed in alphabetical characters. Ungreedy.
  88           *  5. Remove orphaned apostrophe, plus, dash, period, and comma characters.
  89           *  6. Remove orphaned quote characters.
  90           *  7. Replace the assorted single quotation marks with the ASCII standard single quotation.
  91           *  8. Remove multiple space characters and replaces with a single space.
  92           */
  93          $input = JString::strtolower($input);
  94          $input = preg_replace('#[^\pL\pM\pN\p{Pi}\p{Pf}\'+-.,]+#mui', ' ', $input);
  95          $input = preg_replace('#(^|\s)[+-.,]+([\pL\pM]+)#mui', ' $1', $input);
  96          $input = preg_replace('#([\pL\pM\pN]+)[+-.,]+(\s|$)#mui', '$1 ', $input);
  97          $input = preg_replace('#([\pL\pM]+)[+.,]+([\pL\pM]+)#muiU', '$1 $2', $input); // Ungreedy
  98          $input = preg_replace('#(^|\s)[\'+-.,]+(\s|$)#mui', ' ', $input);
  99          $input = preg_replace('#(^|\s)[\p{Pi}\p{Pf}]+(\s|$)#mui', ' ', $input);
 100          $input = preg_replace('#[' . $quotes . ']+#mui', '\'', $input);
 101          $input = preg_replace('#\s+#mui', ' ', $input);
 102          $input = JString::trim($input);
 103  
 104          // Explode the normalized string to get the terms.
 105          $terms = explode(' ', $input);
 106  
 107          /*
 108           * If we have Unicode support and are dealing with Chinese text, Chinese
 109           * has to be handled specially because there are not necessarily any spaces
 110           * between the "words". So, we have to test if the words belong to the Chinese
 111           * character set and if so, explode them into single glyphs or "words".
 112           */
 113          if ($lang === 'zh')
 114          {
 115              // Iterate through the terms and test if they contain Chinese.
 116              for ($i = 0, $n = count($terms); $i < $n; $i++)
 117              {
 118                  $charMatches = array();
 119                  $charCount = preg_match_all('#[\p{Han}]#mui', $terms[$i], $charMatches);
 120  
 121                  // Split apart any groups of Chinese characters.
 122                  for ($j = 0; $j < $charCount; $j++)
 123                  {
 124                      $tSplit = JString::str_ireplace($charMatches[0][$j], '', $terms[$i], false);
 125                      if (!empty($tSplit))
 126                      {
 127                          $terms[$i] = $tSplit;
 128                      }
 129                      else
 130                      {
 131                          unset($terms[$i]);
 132                      }
 133  
 134                      $terms[] = $charMatches[0][$j];
 135                  }
 136              }
 137  
 138              // Reset array keys.
 139              $terms = array_values($terms);
 140          }
 141  
 142          /*
 143           * If we have to handle the input as a phrase, that means we don't
 144           * tokenize the individual terms and we do not create the two and three
 145           * term combinations. The phrase must contain more than one word!
 146           */
 147          if ($phrase === true && count($terms) > 1)
 148          {
 149              // Create tokens from the phrase.
 150              $tokens[] = new FinderIndexerToken($terms, $lang);
 151          }
 152          else
 153          {
 154              // Create tokens from the terms.
 155              for ($i = 0, $n = count($terms); $i < $n; $i++)
 156              {
 157                  $tokens[] = new FinderIndexerToken($terms[$i], $lang);
 158              }
 159  
 160              // Create two and three word phrase tokens from the individual words.
 161              for ($i = 0, $n = count($tokens); $i < $n; $i++)
 162              {
 163                  // Setup the phrase positions.
 164                  $i2 = $i + 1;
 165                  $i3 = $i + 2;
 166  
 167                  // Create the two word phrase.
 168                  if ($i2 < $n && isset($tokens[$i2]))
 169                  {
 170                      // Tokenize the two word phrase.
 171                      $token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term), $lang, $lang === 'zh' ? '' : ' ');
 172                      $token->derived = true;
 173  
 174                      // Add the token to the stack.
 175                      $tokens[] = $token;
 176                  }
 177  
 178                  // Create the three word phrase.
 179                  if ($i3 < $n && isset($tokens[$i3]))
 180                  {
 181                      // Tokenize the three word phrase.
 182                      $token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term, $tokens[$i3]->term), $lang, $lang === 'zh' ? '' : ' ');
 183                      $token->derived = true;
 184  
 185                      // Add the token to the stack.
 186                      $tokens[] = $token;
 187                  }
 188              }
 189          }
 190  
 191          if ($store)
 192          {
 193              $cache[$store] = count($tokens) > 1 ? $tokens : array_shift($tokens);
 194              return $cache[$store];
 195          }
 196          else
 197          {
 198              return count($tokens) > 1 ? $tokens : array_shift($tokens);
 199          }
 200      }
 201  
 202      /**
 203       * Method to get the base word of a token. This method uses the public
 204       * {@link FinderIndexerHelper::$stemmer} object if it is set. If no stemmer is set,
 205       * the original token is returned.
 206       *
 207       * @param   string  $token  The token to stem.
 208       * @param   string  $lang   The language of the token.
 209       *
 210       * @return  string  The root token.
 211       *
 212       * @since   2.5
 213       */
 214  	public static function stem($token, $lang)
 215      {
 216          // Trim apostrophes at either end of the token.
 217          $token = JString::trim($token, '\'');
 218  
 219          // Trim everything after any apostrophe in the token.
 220          if (($pos = JString::strpos($token, '\'')) !== false)
 221          {
 222              $token = JString::substr($token, 0, $pos);
 223          }
 224  
 225          // Stem the token if we have a valid stemmer to use.
 226          if (self::$stemmer instanceof FinderIndexerStemmer)
 227          {
 228              return self::$stemmer->stem($token, $lang);
 229          }
 230          else
 231          {
 232              return $token;
 233          }
 234      }
 235  
 236      /**
 237       * Method to add a content type to the database.
 238       *
 239       * @param   string  $title  The type of content. For example: PDF
 240       * @param   string  $mime   The mime type of the content. For example: PDF [optional]
 241       *
 242       * @return  integer  The id of the content type.
 243       *
 244       * @since   2.5
 245       * @throws  Exception on database error.
 246       */
 247  	public static function addContentType($title, $mime = null)
 248      {
 249          static $types;
 250  
 251          $db = JFactory::getDBO();
 252          $query = $db->getQuery(true);
 253  
 254          // Check if the types are loaded.
 255          if (empty($types))
 256          {
 257              // Build the query to get the types.
 258              $query->select('*');
 259              $query->from($db->quoteName('#__finder_types'));
 260  
 261              // Get the types.
 262              $db->setQuery($query);
 263              $types = $db->loadObjectList('title');
 264  
 265              // Check for a database error.
 266              if ($db->getErrorNum())
 267              {
 268                  // Throw database error exception.
 269                  throw new Exception($db->getErrorMsg(), 500);
 270              }
 271          }
 272  
 273          // Check if the type already exists.
 274          if (isset($types[$title]))
 275          {
 276              return (int) $types[$title]->id;
 277          }
 278  
 279          // Add the type.
 280          $query->clear();
 281          $query->insert($db->quoteName('#__finder_types'));
 282          $query->columns(array($db->quoteName('title'), $db->quoteName('mime')));
 283          $query->values($db->quote($title) . ', ' . $db->quote($mime));
 284          $db->setQuery($query);
 285          $db->query();
 286  
 287          // Check for a database error.
 288          if ($db->getErrorNum())
 289          {
 290              // Throw database error exception.
 291              throw new Exception($db->getErrorMsg(), 500);
 292          }
 293  
 294          // Return the new id.
 295          return (int) $db->insertid();
 296      }
 297  
 298      /**
 299       * Method to check if a token is common in a language.
 300       *
 301       * @param   string  $token  The token to test.
 302       * @param   string  $lang   The language to reference.
 303       *
 304       * @return  boolean  True if common, false otherwise.
 305       *
 306       * @since   2.5
 307       */
 308  	public static function isCommon($token, $lang)
 309      {
 310          static $data;
 311  
 312          // Load the common tokens for the language if necessary.
 313          if (!isset($data[$lang]))
 314          {
 315              $data[$lang] = FinderIndexerHelper::getCommonWords($lang);
 316          }
 317  
 318          // Check if the token is in the common array.
 319          if (in_array($token, $data[$lang]))
 320          {
 321              return true;
 322          }
 323          else
 324          {
 325              return false;
 326          }
 327      }
 328  
 329      /**
 330       * Method to get an array of common terms for a language.
 331       *
 332       * @param   string  $lang  The language to use.
 333       *
 334       * @return  array  Array of common terms.
 335       *
 336       * @since   2.5
 337       * @throws  Exception on database error.
 338       */
 339  	public static function getCommonWords($lang)
 340      {
 341          $db = JFactory::getDBO();
 342  
 343          // Create the query to load all the common terms for the language.
 344          $query = $db->getQuery(true);
 345          $query->select($db->quoteName('term'));
 346          $query->from($db->quoteName('#__finder_terms_common'));
 347          $query->where($db->quoteName('language') . ' = ' . $db->quote($lang));
 348  
 349          // Load all of the common terms for the language.
 350          $db->setQuery($query);
 351          $results = $db->loadColumn();
 352  
 353          // Check for a database error.
 354          if ($db->getErrorNum())
 355          {
 356              // Throw database error exception.
 357              throw new Exception($db->getErrorMsg(), 500);
 358          }
 359  
 360          return $results;
 361      }
 362  
 363      /**
 364       * Method to get the default language for the site.
 365       *
 366       * @return  string  The default language string.
 367       *
 368       * @since   2.5
 369       */
 370  	public static function getDefaultLanguage()
 371      {
 372          static $lang;
 373  
 374          // Get the default language.
 375          if (empty($lang))
 376          {
 377              $lang = JComponentHelper::getParams('com_languages')->get('site', 'en-GB');
 378          }
 379  
 380          return $lang;
 381      }
 382  
 383      /**
 384       * Method to parse a language/locale key and return a simple language string.
 385       *
 386       * @param   string  $lang  The language/locale key. For example: en-GB
 387       *
 388       * @return  string  The simple language string. For example: en
 389       *
 390       * @since   2.5
 391       */
 392  	public static function getPrimaryLanguage($lang)
 393      {
 394          static $data;
 395  
 396          // Only parse the identifier if necessary.
 397          if (!isset($data[$lang]))
 398          {
 399              if (is_callable(array('Locale', 'getPrimaryLanguage')))
 400              {
 401                  // Get the language key using the Locale package.
 402                  $data[$lang] = Locale::getPrimaryLanguage($lang);
 403              }
 404              else
 405              {
 406                  // Get the language key using string position.
 407                  $data[$lang] = JString::substr($lang, 0, JString::strpos($lang, '-'));
 408              }
 409          }
 410  
 411          return $data[$lang];
 412      }
 413  
 414      /**
 415       * Method to get the path (SEF route) for a content item.
 416       *
 417       * @param   string  $url  The non-SEF route to the content item.
 418       *
 419       * @return  string  The path for the content item.
 420       *
 421       * @since   2.5
 422       */
 423  	public static function getContentPath($url)
 424      {
 425          static $router;
 426  
 427          // Only get the router once.
 428          if (!($router instanceof JRouter))
 429          {
 430              jimport('joomla.application.router');
 431              include_once  JPATH_SITE . '/includes/application.php';
 432  
 433              // Get and configure the site router.
 434              $config = JFactory::getConfig();
 435              $router = JRouter::getInstance('site');
 436              $router->setMode($config->get('sef', 1));
 437          }
 438  
 439          // Build the relative route.
 440          $uri = $router->build($url);
 441          $route = $uri->toString(array('path', 'query', 'fragment'));
 442          $route = str_replace(JURI::base(true) . '/', '', $route);
 443  
 444          return $route;
 445      }
 446  
 447      /**
 448       * Method to get extra data for a content before being indexed. This is how
 449       * we add Comments, Tags, Labels, etc. that should be available to Finder.
 450       *
 451       * @param   FinderIndexerResult  &$item  The item to index as an FinderIndexerResult object.
 452       *
 453       * @return  boolean  True on success, false on failure.
 454       *
 455       * @since   2.5
 456       * @throws  Exception on database error.
 457       */
 458  	public static function getContentExtras(FinderIndexerResult &$item)
 459      {
 460          // Get the event dispatcher.
 461          $dispatcher = JDispatcher::getInstance();
 462  
 463          // Load the finder plugin group.
 464          JPluginHelper::importPlugin('finder');
 465  
 466          try
 467          {
 468              // Trigger the event.
 469              $results = $dispatcher->trigger('onPrepareFinderContent', array(&$item));
 470  
 471              // Check the returned results. This is for plugins that don't throw
 472              // exceptions when they encounter serious errors.
 473              if (in_array(false, $results))
 474              {
 475                  throw new Exception($dispatcher->getError(), 500);
 476              }
 477          }
 478          catch (Exception $e)
 479          {
 480              // Handle a caught exception.
 481              throw $e;
 482          }
 483  
 484          return true;
 485      }
 486  
 487      /**
 488       * Method to process content text using the onContentPrepare event trigger.
 489       *
 490       * @param   string     $text    The content to process.
 491       * @param   JRegistry  $params  The parameters object. [optional]
 492       *
 493       * @return  string  The processed content.
 494       *
 495       * @since   2.5
 496       */
 497  	public static function prepareContent($text, $params = null)
 498      {
 499          static $loaded;
 500  
 501          // Get the dispatcher.
 502          $dispatcher = JDispatcher::getInstance();
 503  
 504          // Load the content plugins if necessary and remove any problematic ones.
 505          if (empty($loaded))
 506          {
 507              JPluginHelper::importPlugin('content');
 508              $loaded = true;
 509  
 510              // Create an array of problematic plugins
 511              $conflicts = array('plgContentEmailCloak', 'plgContentLoadmodule');
 512  
 513              // Check if we can access the observers
 514              if (isset($dispatcher->_observers))
 515              {
 516                  // Remove problematic plugins.
 517                  foreach ($dispatcher->_observers as $key => $handler)
 518                  {
 519                      // Remove any function based event handlers that conflict with Finder.
 520                      if (is_array($handler) && isset($handler['handler']) && in_array($handler['handler'], $conflicts))
 521                      {
 522                          unset($dispatcher->_observers[$key]);
 523                      }
 524                      // Remove any object based event handlers that conflict with Finder.
 525                      elseif (is_object($handler) && method_exists($handler, 'update') && in_array(get_class($handler), $conflicts))
 526                      {
 527                          unset($dispatcher->_observers[$key]);
 528                      }
 529                  }
 530              }
 531          }
 532  
 533          // Instantiate the parameter object if necessary.
 534          if (!($params instanceof JRegistry))
 535          {
 536              $registry = new JRegistry;
 537              $registry->loadString($params);
 538              $params = $registry;
 539          }
 540  
 541          // Create a mock content object.
 542          $content = JTable::getInstance('Content');
 543          $content->text = $text;
 544  
 545          // Fire the onContentPrepare event.
 546          $dispatcher->trigger('onContentPrepare', array('com_finder.indexer', &$content, &$params, 0));
 547  
 548          return $content->text;
 549      }
 550  }


Generated: Tue Apr 3 11:40:28 2012 Cross-referenced by PHPXref 0.7.1