[ Index ]
 

Code source de Typo3 4.1.3

Accédez au Source d'autres logiciels libres

Classes | Fonctions | Variables | Constantes | Tables

title

Body

[fermer]

/typo3/sysext/indexed_search/ -> class.external_parser.php (source)

   1  <?php
   2  /***************************************************************
   3  *  Copyright notice
   4  *
   5  *  (c) 2001-2006 Kasper Skaarhoj (kasperYYYY@typo3.com)
   6  *  All rights reserved
   7  *
   8  *  This script is part of the TYPO3 project. The TYPO3 project is
   9  *  free software; you can redistribute it and/or modify
  10  *  it under the terms of the GNU General Public License as published by
  11  *  the Free Software Foundation; either version 2 of the License, or
  12  *  (at your option) any later version.
  13  *
  14  *  The GNU General Public License can be found at
  15  *  http://www.gnu.org/copyleft/gpl.html.
  16  *  A copy is found in the textfile GPL.txt and important notices to the license
  17  *  from the author is found in LICENSE.txt distributed with these scripts.
  18  *
  19  *
  20  *  This script is distributed in the hope that it will be useful,
  21  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  22  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  23  *  GNU General Public License for more details.
  24  *
  25  *  This copyright notice MUST APPEAR in all copies of the script!
  26  ***************************************************************/
  27  /**
  28   * External standard parsers for indexed_search
  29   *
  30   * @author    Kasper Skårhøj <kasperYYYY@typo3.com>
  31   * @coauthor    Olivier Simah <noname_paris@yahoo.fr>
  32   */
  33  /**
  34   * [CLASS/FUNCTION INDEX of SCRIPT]
  35   *
  36   *
  37   *
  38   *   75: class tx_indexed_search_extparse
  39   *   94:     function initParser($extension)
  40   *  214:     function softInit($extension)
  41   *  247:     function searchTypeMediaTitle($extension)
  42   *  323:     function isMultiplePageExtension($extension)
  43   *
  44   *              SECTION: Reading documents (for parsing)
  45   *  354:     function readFileContent($ext,$absFile,$cPKey)
  46   *  521:     function fileContentParts($ext,$absFile)
  47   *  560:     function splitPdfInfo($pdfInfoArray)
  48   *  579:     function removeEndJunk($string)
  49   *
  50   *              SECTION: Backend analyzer
  51   *  606:     function getIcon($extension)
  52   *
  53   * TOTAL FUNCTIONS: 9
  54   * (This index is automatically created/updated by the extension "extdeveval")
  55   *
  56   */
  57  
  58  
  59  
  60  
  61  
  62  
  63  
  64  
  65  
  66  
  67  /**
  68   * External standard parsers for indexed_search
  69   * MUST RETURN utf-8 content!
  70   *
  71   * @author    Kasper Skaarhoj <kasperYYYY@typo3.com>
  72   * @package TYPO3
  73   * @subpackage tx_indexedsearch
  74   */
  75  class tx_indexed_search_extparse {
  76  
  77          // PDF sectioning mode. initParser() replaces this default with the "pdf_mode" value of the extension configuration.
  78      var $pdf_mode = -20;    // Positive value: upper limit for the number of pages per section; fileContentParts() makes ceil(pages/value) sections of even size, eg. "5" on a 10-page file gives 1-5,6-10. Zero or negative value: abs(value) is the number of sections, limited to the range 1..pages, eg. "-3" on a 10-page file gives 1-3,4-6,7-10 (zero is expected to give one single section, assuming t3lib_div::intInRange() clamps to the lower bound).
  79  
  80          // These arrays are filled by initParser():
  81      var $app = array();    // Tool name ("pdfinfo", "pdftotext", "catdoc", ...) => path of the executable
  82      var $ext2itemtype_map = array();    // File extension => item type ("htm" maps to "html", "jpg" to "jpeg", everything else to itself)
  83      var $supportedExtensions = array();    // File extension => TRUE for every extension that was initialized successfully
  84  
  85      var $pObj;        // Reference to parent object (indexer class). Not set inside this class; used here for logging, charset conversion and content splitting.
  86  
  87  
  88      /**
  89       * Initialize external parser for parsing content.
  90       *
  91       * @param    string        File extension
  92       * @return    boolean        Returns true if extension is supported/enabled, otherwise false.
  93       */
  94  	function initParser($extension)    {
  95  
  96              // Then read indexer-config and set if appropriate:
  97          $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
  98  
  99              // If windows, apply extension to tool name:
 100          $exe = (TYPO3_OS == 'WIN') ? '.exe' : ''; // lg
 101          $extOK = FALSE;
 102          $mainExtension = '';
 103  
 104              // Ignore extensions
 105          $ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']),1);
 106          if (in_array($extension, $ignoreExtensions))    {
 107              $this->pObj->log_setTSlogMessage('Extension "'.$extension.'" was set to be ignored.',1);
 108              return FALSE;
 109          }
 110  
 111              // Switch on file extension:
 112          switch($extension)    {
 113              case 'pdf':
 114                      // PDF
 115                  if ($indexerConfig['pdftools'])    {
 116                      $pdfPath = ereg_replace("\/$",'',$indexerConfig['pdftools']).'/';
 117                      if (ini_get('safe_mode') || (@is_file($pdfPath.'pdftotext'.$exe) && @is_file($pdfPath.'pdfinfo'.$exe)))    {
 118                          $this->app['pdfinfo'] = $pdfPath.'pdfinfo'.$exe;
 119                          $this->app['pdftotext'] = $pdfPath.'pdftotext'.$exe;
 120                              // PDF mode:
 121                          $this->pdf_mode = t3lib_div::intInRange($indexerConfig['pdf_mode'],-100,100);
 122                          $extOK = TRUE;
 123                      } else $this->pObj->log_setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3);
 124                  } else $this->pObj->log_setTSlogMessage('PDF tools disabled',1);
 125              break;
 126              case 'doc':
 127                      // Catdoc
 128                  if ($indexerConfig['catdoc'])    {
 129                      $catdocPath = ereg_replace("\/$",'',$indexerConfig['catdoc']).'/';
 130                      if (ini_get('safe_mode') || @is_file($catdocPath.'catdoc'.$exe))    {
 131                          $this->app['catdoc'] = $catdocPath.'catdoc'.$exe;
 132                          $extOK = TRUE;
 133                      } else $this->pObj->log_setTSlogMessage("'catdoc' tool for reading Word-files was not found in path '".$catdocPath."catdoc'",3);
 134                  } else $this->pObj->log_setTSlogMessage('catdoc tools (Word-files) disabled',1);
 135              break;
 136              case 'pps':        // MS PowerPoint(?)
 137              case 'ppt':        // MS PowerPoint
 138                      // ppthtml
 139                  if ($indexerConfig['ppthtml'])    {
 140                      $ppthtmlPath = ereg_replace('\/$','',$indexerConfig['ppthtml']).'/';
 141                      if (ini_get('safe_mode') || @is_file($ppthtmlPath.'ppthtml'.$exe)){
 142                          $this->app['ppthtml'] = $ppthtmlPath.'ppthtml'.$exe;
 143                          $extOK = TRUE;
 144                      } else $this->pObj->log_setTSlogMessage("'ppthtml' tool for reading Powerpoint-files was not found in path '".$ppthtmlPath."ppthtml'",3);
 145                  } else $this->pObj->log_setTSlogMessage('ppthtml tools (Powerpoint-files) disabled',1);
 146              break;
 147              case 'xls':        // MS Excel
 148                      // Xlhtml
 149                  if ($indexerConfig['xlhtml'])    {
 150                      $xlhtmlPath = ereg_replace('\/$','',$indexerConfig['xlhtml']).'/';
 151                      if (ini_get('safe_mode') || @is_file($xlhtmlPath.'xlhtml'.$exe)){
 152                          $this->app['xlhtml'] = $xlhtmlPath.'xlhtml'.$exe;
 153                          $extOK = TRUE;
 154                      } else $this->pObj->log_setTSlogMessage("'xlhtml' tool for reading Excel-files was not found in path '".$xlhtmlPath."xlhtml'",3);
 155                  } else $this->pObj->log_setTSlogMessage('xlhtml tools (Excel-files) disabled',1);
 156              break;
 157              case 'sxc':        // Open Office Calc.
 158              case 'sxi':        // Open Office Impress
 159              case 'sxw':        // Open Office Writer
 160              case 'ods':        // Oasis OpenDocument Spreadsheet
 161              case 'odp':        // Oasis OpenDocument Presentation
 162              case 'odt':        // Oasis OpenDocument Text
 163                  if ($indexerConfig['unzip'])    {
 164                      $unzipPath = preg_replace('/\/$/','',$indexerConfig['unzip']).'/';
 165                      if (ini_get('safe_mode') || @is_file($unzipPath.'unzip'.$exe))    {
 166                          $this->app['unzip'] = $unzipPath.'unzip'.$exe;
 167                          $extOK = TRUE;
 168                      } else $this->pObj->log_setTSlogMessage("'unzip' tool for reading OpenOffice.org-files was not found in path '".$unzipPath."unzip'",3);
 169                  } else $this->pObj->log_setTSlogMessage('unzip tool (OpenOffice.org-files) disabled',1);
 170              break;
 171              case 'rtf':
 172                      // Catdoc
 173                  if ($indexerConfig['unrtf'])    {
 174                      $unrtfPath = ereg_replace("\/$",'',$indexerConfig['unrtf']).'/';
 175                      if (ini_get('safe_mode') || @is_file($unrtfPath.'unrtf'.$exe))    {
 176                          $this->app['unrtf'] = $unrtfPath.'unrtf'.$exe;
 177                          $extOK = TRUE;
 178                      } else $this->pObj->log_setTSlogMessage("'unrtf' tool for reading RTF-files was not found in path '".$unrtfPath."unrtf'",3);
 179                  } else $this->pObj->log_setTSlogMessage('unrtf tool (RTF-files) disabled',1);
 180              break;
 181              case 'txt':        // Raw text
 182              case 'csv':        // Raw text
 183              case 'xml':        // PHP strip-tags()
 184              case 'tif':        // PHP EXIF
 185                  $extOK = TRUE;
 186              break;
 187              case 'html':    // PHP strip-tags()
 188              case 'htm':        // PHP strip-tags()
 189                  $extOK = TRUE;
 190                  $mainExtension = 'html';    // making "html" the common "item_type"
 191              break;
 192              case 'jpg':        // PHP EXIF
 193              case 'jpeg':    // PHP EXIF
 194                  $extOK = TRUE;
 195                  $mainExtension = 'jpeg';    // making "jpeg" the common item_type
 196              break;
 197          }
 198  
 199              // If extension was OK:
 200          if ($extOK)    {
 201              $this->supportedExtensions[$extension] = TRUE;
 202              $this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
 203              return TRUE;
 204          }
 205      }
 206  
 207      /**
 208       * Initialize external parser for backend modules
 209       * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
 210       *
 211       * @param    string        File extension to initialize for.
 212       * @return    boolean        Returns true if the extension is supported and enabled, otherwise false.
 213       */
 214  	function softInit($extension)    {
 215          switch($extension)    {
 216              case 'pdf':        // PDF
 217              case 'doc':        // MS Word files
 218              case 'pps':        // MS PowerPoint
 219              case 'ppt':        // MS PowerPoint
 220              case 'xls':        // MS Excel
 221              case 'sxc':        // Open Office Calc.
 222              case 'sxi':        // Open Office Impress
 223              case 'sxw':        // Open Office Writer
 224              case 'ods':        // Oasis OpenDocument Spreadsheet
 225              case 'odp':        // Oasis OpenDocument Presentation
 226              case 'odt':        // Oasis OpenDocument Text
 227              case 'rtf':        // RTF documents
 228              case 'txt':        // ASCII Text documents
 229              case 'html':    // HTML
 230              case 'htm':        // HTML
 231              case 'csv':        // Comma Separated Values
 232              case 'xml':        // Generic XML
 233              case 'jpg':        // Jpeg images (EXIF comment)
 234              case 'jpeg':    // Jpeg images (EXIF comment)
 235              case 'tif':        // TIF images (EXIF comment)
 236                  return TRUE;
 237              break;
 238          }
 239      }
 240  
 241      /**
 242       * Return title of entry in media type selector box.
 243       *
 244       * @param    string        File extension
 245       * @return    string        String with label value of entry in media type search selector box (frontend plugin).
 246       */
 247  	function searchTypeMediaTitle($extension)    {
 248  
 249              // Read indexer-config
 250          $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
 251  
 252              // Ignore extensions
 253          $ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']),1);
 254          if (in_array($extension, $ignoreExtensions))    {
 255              return FALSE;
 256          }
 257  
 258              // Switch on file extension:
 259          switch($extension)    {
 260              case 'pdf':
 261                      // PDF
 262                  if ($indexerConfig['pdftools'])    {
 263                      return 'PDF';
 264                  }
 265              break;
 266              case 'doc':
 267                      // Catdoc
 268                  if ($indexerConfig['catdoc'])    {
 269                      return 'MS Word';
 270                  }
 271              break;
 272              case 'pps':        // MS PowerPoint(?)
 273              case 'ppt':        // MS PowerPoint
 274                      // ppthtml
 275                  if ($indexerConfig['ppthtml'])    {
 276                      return 'MS Powerpoint';
 277                  }
 278              break;
 279              case 'xls':        // MS Excel
 280                      // Xlhtml
 281                  if ($indexerConfig['xlhtml'])    {
 282                      return 'MS Excel';
 283                  }
 284              break;
 285              case 'sxc':        // Open Office Calc.
 286              case 'sxi':        // Open Office Impress
 287              case 'sxw':        // Open Office Writer
 288              case 'ods':        // Oasis OpenDocument Spreadsheet
 289              case 'odp':        // Oasis OpenDocument Presentation
 290              case 'odt':        // Oasis OpenDocument Text
 291                  if ($indexerConfig['unzip'])    {
 292                      return 'Open Office';
 293                  }
 294              break;
 295              case 'rtf':
 296                      // Catdoc
 297                  if ($indexerConfig['unrtf'])    {
 298                      return 'RTF';
 299                  }
 300              break;
 301              case 'html':    // PHP strip-tags()
 302              case 'jpeg':    // PHP EXIF
 303              case 'txt':        // Raw text
 304              case 'csv':        // Raw text
 305              case 'xml':        // PHP strip-tags()
 306              case 'tif':        // PHP EXIF
 307                  return strtoupper($extension);
 308              break;
 309                  // NO entry (duplicates or blank):
 310              case 'htm':        // PHP strip-tags()
 311              case 'jpg':        // PHP EXIF
 312              default:
 313              break;
 314          }
 315      }
 316  
 317      /**
 318       * Returns true if the input extension (item_type) is a potentially a multi-page extension
 319       *
 320       * @param    string        Extension / item_type string
 321       * @return    boolean        Return true if multi-page
 322       */
 323  	function isMultiplePageExtension($extension)    {
 324              // Switch on file extension:
 325          switch((string)$extension)    {
 326              case 'pdf':
 327                  return TRUE;
 328              break;
 329          }
 330      }
 331  
 332  
 333  
 334  
 335  
 336  
 337  
 338  
 339  
 340      /************************
 341       *
 342       * Reading documents (for parsing)
 343       *
 344       ************************/
 345  
 346      /**
 347       * Reads the content of an external file being indexed.
 348       *
 349       * @param    string        File extension, eg. "pdf", "doc" etc.
 350       * @param    string        Absolute filename of file (must exist and be validated OK before calling function)
 351       * @param    string        Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 352       * @return    array        Standard content array (title, description, keywords, body keys)
 353       */
 354  	function readFileContent($ext,$absFile,$cPKey)    {
 355          unset($contentArr);
 356  
 357              // Return immediately if initialization didn't set support up:
 358          if (!$this->supportedExtensions[$ext])    return FALSE;
 359  
 360              // Switch by file extension
 361          switch ($ext)    {
 362              case 'pdf':
 363                  if ($this->app['pdfinfo'])    {
 364                          // Getting pdf-info:
 365                      $cmd = $this->app['pdfinfo'].' "'.$absFile.'"';
 366                      exec($cmd,$res);
 367                      $pdfInfo = $this->splitPdfInfo($res);
 368                      unset($res);
 369                      if (intval($pdfInfo['pages']))    {
 370                          list($low,$high) = explode('-',$cPKey);
 371  
 372                              // Get pdf content:
 373                          $tempFileName = t3lib_div::tempnam('Typo3_indexer');        // Create temporary name
 374                          @unlink ($tempFileName);    // Delete if exists, just to be safe.
 375                          $cmd = $this->app['pdftotext'].' -f '.$low.' -l '.$high.' -enc UTF-8 -q "'.$absFile.'" '.$tempFileName;
 376                          exec($cmd);
 377                          if (@is_file($tempFileName))    {
 378                              $content = t3lib_div::getUrl($tempFileName);
 379                              unlink($tempFileName);
 380                          } else {
 381                              $this->pObj->log_setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
 382                          }
 383                          if (strlen($content))    {
 384                              $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
 385                          }
 386                      }
 387                  }
 388              break;
 389              case 'doc':
 390                  if ($this->app['catdoc'])    {
 391                      $cmd = $this->app['catdoc'].' -d utf-8 "'.$absFile.'"';
 392                      exec($cmd,$res);
 393                      $content = implode(chr(10),$res);
 394                      unset($res);
 395                      $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
 396                  }
 397              break;
 398              case 'pps':
 399              case 'ppt':
 400                  if ($this->app['ppthtml'])    {
 401                      $cmd = $this->app['ppthtml'].' "'.$absFile.'"';
 402                      exec($cmd,$res);
 403                      $content = implode(chr(10),$res);
 404                      unset($res);
 405                      $content = $this->pObj->convertHTMLToUtf8($content);
 406                      $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
 407                      $contentArr['title'] = basename($absFile);    // Make sure the title doesn't expose the absolute path!
 408                  }
 409              break;
 410              case 'xls':
 411                  if ($this->app['xlhtml'])    {
 412                      $cmd = $this->app['xlhtml'].' -nc -te "'.$absFile.'"';
 413                      exec($cmd,$res);
 414                      $content = implode(chr(10),$res);
 415                      unset($res);
 416                      $content = $this->pObj->convertHTMLToUtf8($content);
 417                      $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
 418                      $contentArr['title'] = basename($absFile);    // Make sure the title doesn't expose the absolute path!
 419                  }
 420              break;
 421              case 'sxi':
 422              case 'sxc':
 423              case 'sxw':
 424              case 'ods':
 425              case 'odp':
 426              case 'odt':
 427                  if ($this->app['unzip'])    {
 428                          // Read content.xml:
 429                      $cmd = $this->app['unzip'].' -p "'.$absFile.'" content.xml';
 430                      exec($cmd,$res);
 431                      $content_xml = implode(chr(10),$res);
 432                      unset($res);
 433  
 434                          // Read meta.xml:
 435                      $cmd = $this->app['unzip'].' -p "'.$absFile.'" meta.xml';
 436                      exec($cmd, $res);
 437                      $meta_xml = implode(chr(10),$res);
 438                      unset($res);
 439  
 440                      $utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml)));
 441                      $contentArr = $this->pObj->splitRegularContent($utf8_content);
 442                      $contentArr['title'] = basename($absFile);    // Make sure the title doesn't expose the absolute path!
 443  
 444                          // Meta information
 445                      $metaContent = t3lib_div::xml2tree($meta_xml);
 446                      $metaContent = $metaContent['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
 447                      if (is_array($metaContent))    {
 448                          $contentArr['title'] = $metaContent['dc:title'][0]['values'][0] ? $metaContent['dc:title'][0]['values'][0] : $contentArr['title'];
 449                          $contentArr['description'] = $metaContent['dc:subject'][0]['values'][0].' '.$metaContent['dc:description'][0]['values'][0];
 450  
 451                              // Keywords collected:
 452                          if (is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword']))    {
 453                              foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat)    {
 454                                  $contentArr['keywords'].= $kwDat['values'][0].' ';
 455                              }
 456                          }
 457                      }
 458                  }
 459              break;
 460              case 'rtf':
 461                  if ($this->app['unrtf'])    {
 462                      $cmd = $this->app['unrtf'].' "'.$absFile.'"';
 463                      exec($cmd,$res);
 464                      $fileContent = implode(chr(10),$res);
 465                      unset($res);
 466                      $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
 467                      $contentArr = $this->pObj->splitHTMLContent($fileContent);
 468                  }
 469              break;
 470              case 'txt':
 471              case 'csv':        // Raw text
 472                  $content = t3lib_div::getUrl($absFile);
 473                      // TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
 474                  $content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
 475                  $contentArr = $this->pObj->splitRegularContent($content);
 476                  $contentArr['title'] = basename($absFile);    // Make sure the title doesn't expose the absolute path!
 477              break;
 478              case 'html':
 479              case 'htm':
 480                  $fileContent = t3lib_div::getUrl($absFile);
 481                  $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
 482                  $contentArr = $this->pObj->splitHTMLContent($fileContent);
 483              break;
 484              case 'xml':        // PHP strip-tags()
 485                  $fileContent = t3lib_div::getUrl($absFile);
 486  
 487                      // Finding charset:
 488                  eregi('^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']',substr($fileContent,0,200),$reg);
 489                  $charset = $reg[1] ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';
 490  
 491                      // Converting content:
 492                  $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<',' <',$fileContent)), $charset);
 493                  $contentArr = $this->pObj->splitRegularContent($fileContent);
 494                  $contentArr['title'] = basename($absFile);    // Make sure the title doesn't expose the absolute path!
 495              break;
 496              case 'jpg':        // PHP EXIF
 497              case 'jpeg':    // PHP EXIF
 498              case 'tif':        // PHP EXIF
 499                  if (function_exists('exif_read_data'))    {
 500                      $exif = exif_read_data($absFile, 'IFD0');
 501                  } else {
 502                      $exif = FALSE;
 503                  }
 504  
 505                  if ($exif)    {
 506                      $comment = trim($exif['COMMENT'][0].' '.$exif['ImageDescription']);    // The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
 507                  } else {
 508                      $comment = '';
 509                  }
 510                  $contentArr = $this->pObj->splitRegularContent($comment);
 511                  $contentArr['title'] = basename($absFile);    // Make sure the title doesn't expose the absolute path!
 512              break;
 513              default:
 514                  return false;
 515              break;
 516          }
 517              // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
 518          if (is_array($contentArr) && !$contentArr['title'])    {
 519              $contentArr['title'] = str_replace('_',' ',basename($absFile));    // Substituting "_" for " " because many filenames may have this instead of a space char.
 520          }
 521  
 522          return $contentArr;
 523      }
 524  
 525      /**
 526       * Creates an array with pointers to divisions of document.
 527       * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
 528       *
 529       * @param    string        File extension
 530       * @param    string        Absolute filename (must exist and be validated OK before calling function)
 531       * @return    array        Array of pointers to sections that the document should be divided into
 532       */
 533  	function fileContentParts($ext,$absFile)    {
 534          $cParts = array(0);
 535          switch ($ext)    {
 536              case 'pdf':
 537                      // Getting pdf-info:
 538                  $cmd = $this->app['pdfinfo'].' "'.$absFile.'"';
 539                  exec($cmd,$res);
 540                  $pdfInfo = $this->splitPdfInfo($res);
 541                  unset($res);
 542  
 543                  if (intval($pdfInfo['pages']))    {
 544                      $cParts = array();
 545  
 546                          // Calculate mode
 547                      if ($this->pdf_mode>0)    {
 548                          $iter = ceil($pdfInfo['pages']/$this->pdf_mode);
 549                      } else {
 550                          $iter = t3lib_div::intInRange(abs($this->pdf_mode),1,$pdfInfo['pages']);
 551                      }
 552  
 553                          // Traverse and create intervals.
 554                      for ($a=0;$a<$iter;$a++)    {
 555                          $low = floor($a*($pdfInfo['pages']/$iter))+1;
 556                          $high = floor(($a+1)*($pdfInfo['pages']/$iter));
 557                          $cParts[] = $low.'-'.$high;
 558                      }
 559                  }
 560              break;
 561          }
 562          return $cParts;
 563      }
 564  
 565      /**
 566       * Analysing PDF info into a useable format.
 567       *
 568       * @param    array        Array of PDF content, coming from the pdfinfo tool
 569       * @return    array        Result array
 570       * @access private
 571       * @see fileContentParts()
 572       */
 573  	function splitPdfInfo($pdfInfoArray)    {
 574          $res = array();
 575          if (is_array($pdfInfoArray))    {
 576              foreach($pdfInfoArray as $line)    {
 577                  $parts = explode(':',$line,2);
 578                  if (count($parts)>1 && trim($parts[0]))    {
 579                      $res[strtolower(trim($parts[0]))] = trim($parts[1]);
 580                  }
 581              }
 582          }
 583          return $res;
 584      }
 585  
 586      /**
 587       * Removes some strange char(12) characters and line breaks that then to occur in the end of the string from external files.
 588       *
 589       * @param    string        String to clean up
 590       * @return    string        String
 591       */
 592  	function removeEndJunk($string)    {
 593          return trim(ereg_replace('['.chr(10).chr(12).']*$','',$string));
 594      }
 595  
 596  
 597  
 598  
 599  
 600  
 601  
 602  
 603  
 604  
 605  
 606  
 607      /************************
 608       *
 609       * Backend analyzer
 610       *
 611       ************************/
 612  
 613      /**
 614       * Return icon for file extension
 615       *
 616       * @param    string        File extension, lowercase.
 617       * @return    string        Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
 618       */
 619  	function getIcon($extension)    {
 620          if ($extension=='htm')    $extension = 'html';
 621          if ($extension=='jpeg')    $extension = 'jpg';
 622          return 'EXT:indexed_search/pi/res/'.$extension.'.gif';
 623      }
 624  }
 625  
 626  if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php'])    {    // Only if the TYPO3_MODE constant is defined and an XCLASS file is registered for this script under that mode
 627      include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);    // Include the registered file (once)
 628  }
 629  ?>


Généré le : Sun Nov 25 17:13:16 2007 par Balluche grâce à PHPXref 0.7
  Clicky Web Analytics