当前位置：首页 → 问答吧 → zend_search_lucene搜索英文的BUG

zend_search_lucene搜索英文的BUG

时间：2011-08-24

来源：互联网

用 Zend_Search_Lucene 搜索英文时，有时模糊搜索搜不出结果。
例如，数据库中有数据 (zz12, zz13, 100pp1, 100pp2)，
建立索引之后，
搜索 zz 搜不到任何结果，
只有搜索 zz12 这样完整的词才可以。
PHP code


<?php
/**
 * Lucene analyzer for mixed Chinese / ASCII text.
 *
 * Runs of ASCII letters and digits become one token each. Runs of non-ASCII
 * (multi-byte) characters are emitted as overlapping two-character tokens
 * (bigrams); a multi-byte character with no adjacent multi-byte character is
 * never emitted on its own.
 *
 * The input is assumed to be UTF-8. Token offsets are byte offsets into the
 * input as rewritten by reset(), not into the text originally supplied.
 */
class CnLuceneAnalyzer extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Current byte offset into $this->_input.
     *
     * @var int
     */
    private $_position;

    /**
     * Stop words that reset() replaces with a single space.
     *
     * @var array
     */
    private $_cnStopWords = array();

    /**
     * Set the stop words that reset() blanks out of the input.
     *
     * @param array $cnStopWords Strings to replace with a space.
     * @return void
     */
    public function setCnStopWords($cnStopWords)
    {
        $this->_cnStopWords = $cnStopWords;
    }

    /**
     * Reset token stream
     *
     * Rewinds the cursor, deletes punctuation from the input and replaces
     * every configured stop word with a space.
     *
     * @return void
     */
    public function reset()
    {
        $this->_position = 0;

        // Nothing to clean when no input is set; nextToken() handles null itself.
        if ($this->_input === null) {
            return;
        }

        // ASCII punctuation followed by full-width / CJK punctuation.
        $search = array(
            ",", "/", "\\", ".", ";", ":", "\"", "!", "~", "`", "^", "(", ")", "?", "-", "'", "<", ">", "$", "&", "%", "#", "@", "+", "=", "{", "}", "[", "]",
            "：", "）", "（", "．", "。", "，", "！", "；", "“", "”", "‘", "’", "〔", "〕", "、", "—", "　", "《", "》", "－", "…", "【", "】", "？", "￥"
        );

        // NOTE(review): punctuation is deleted, not replaced by a space, so the
        // text on both sides is joined ("zz12,zz13" becomes "zz12zz13" and is
        // tokenized as one term). Kept as-is because changing it would alter
        // existing indexes; confirm whether a separator was intended.
        $this->_input = str_replace($search, '', $this->_input);
        $this->_input = str_replace($this->_cnStopWords, ' ', $this->_input);
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        $len = strlen($this->_input);

        while ($this->_position < $len) {
            // Skip leading spaces.
            while ($this->_position < $len && $this->_input[$this->_position] === ' ') {
                $this->_position++;
            }

            // Only spaces were left: stop here instead of reading past the
            // end of the string.
            if ($this->_position >= $len) {
                break;
            }

            $termStartPosition = $this->_position;
            // Byte offset of the second character of a bigram; stays null
            // unless a two-character multi-byte token is produced.
            $secondCharStart = null;

            if (ord($this->_input[$this->_position]) > 127) {
                // Multi-byte run: consume at most two characters.
                $charCount = 0;
                while ($this->_position < $len && ord($this->_input[$this->_position]) > 127) {
                    if ($charCount === 1) {
                        $secondCharStart = $this->_position;
                    }

                    $width = $this->_utf8SequenceLength($this->_input[$this->_position]);
                    // Clamp so a truncated trailing sequence cannot push the
                    // cursor beyond the input.
                    $this->_position = min($len, $this->_position + $width);
                    $charCount++;

                    if ($charCount === 2) {
                        break;
                    }
                }

                if ($charCount === 1) {
                    // A lone multi-byte character has nothing to pair with
                    // and is skipped.
                    continue;
                }
            } else {
                // ASCII run: consume consecutive letters and digits.
                while ($this->_position < $len && ctype_alnum($this->_input[$this->_position])) {
                    $this->_position++;
                }
            }

            if ($this->_position === $termStartPosition) {
                // ASCII byte that is neither a space nor alphanumeric: step over it.
                $this->_position++;
                continue;
            }

            $termEndPosition = $this->_position;
            $token = new Zend_Search_Lucene_Analysis_Token(
                substr($this->_input, $termStartPosition, $termEndPosition - $termStartPosition),
                $termStartPosition,
                $termEndPosition
            );

            if ($secondCharStart !== null) {
                // Rewind to the second character so the next bigram overlaps
                // this one.
                $this->_position = $secondCharStart;
            }

            // normalize() may return null for a token it drops; in that case
            // keep scanning for the next one.
            $token = $this->normalize($token);
            if ($token !== null) {
                return $token;
            }
        }

        return null;
    }

    /**
     * Byte length of the UTF-8 sequence introduced by the given lead byte.
     *
     * @param string $leadByte Single byte taken from the input.
     * @return int 1 to 4; ASCII and stray continuation bytes count as 1.
     */
    private function _utf8SequenceLength($leadByte)
    {
        $code = ord($leadByte);

        if ($code >= 0xF0) {
            return 4;
        }
        if ($code >= 0xE0) {
            return 3;
        }
        if ($code >= 0xC0) {
            return 2;
        }

        return 1;
    }
}
?>