From be4ccd29e94abc47480a4ae1d01d9119aa6a2c46 Mon Sep 17 00:00:00 2001 From: hetao Date: Tue, 25 Apr 2017 11:35:08 +0800 Subject: [PATCH 1/4] support chinese segmentation use scws --- config/config.h.in | 3 + configure | 48 +++++ configure.ac | 22 +++ src/Makefile.am | 14 +- src/Makefile.in | 10 +- src/sphinx.cpp | 424 +++++++++++++++++++++++++++++++++++++++++++- src/sphinx.h | 13 ++ src/sphinxutils.cpp | 16 +- src/sphinxutils.h | 3 + 9 files changed, 543 insertions(+), 10 deletions(-) diff --git a/config/config.h.in b/config/config.h.in index 901d15839..de0217e41 100644 --- a/config/config.h.in +++ b/config/config.h.in @@ -294,6 +294,9 @@ /* RLP library support */ #undef USE_RLP +/* SCWS library support */ +#undef USE_SCWS + /* define to use POSIX Syslog for logging */ #undef USE_SYSLOG diff --git a/configure b/configure index b552f8c26..00bb47da8 100755 --- a/configure +++ b/configure @@ -608,6 +608,8 @@ LTLIBOBJS CONFDIR USE_RLP_FALSE USE_RLP_TRUE +USE_SCWS_FALSE +USE_SCWS_TRUE USE_RE2_FALSE USE_RE2_TRUE LIBRE2_PATH @@ -653,6 +655,7 @@ DEPDIR OBJEXT EXEEXT ac_ct_CC +ac_cv_use_scws CPPFLAGS LDFLAGS CFLAGS @@ -750,6 +753,7 @@ with_re2 with_re2_includes with_re2_libs with_rlp +with_scws with_iconv with_unixodbc enable_mem_override @@ -1424,6 +1428,8 @@ Optional Packages: --with-re2-libs path to RE2 libraries --with-rlp compile with RLP library support (default is disabled) + --with-scws compile with scws library support (default is + disabled) --with-iconv compile with iconv support (default is autodetect) --with-unixodbc compile with UnixODBC support (default is autodetect) @@ -8296,6 +8302,44 @@ fi +# Check whether --with-scws was given. +if test "${with_scws+set}" = set; then : + withval=$with_scws; ac_cv_use_scws=$withval +else + ac_cv_use_scws=no + +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to compile with scws library support" >&5 +$as_echo_n "checking whether to compile with scws library support... 
" >&6; } +if test x$ac_cv_use_scws != xno; then + if test -d $withval && test -f $withval/include/scws/scws.h; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "#define USE_SCWS 1" >>confdefs.h + + else + as_fn_error $? "missing SCWS sources from libscws" "$LINENO" 5 + fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + +$as_echo "#define USE_SCWS 0" >>confdefs.h + +fi + if test x$ac_cv_use_scws != xno; then + USE_SCWS_TRUE= + USE_SCWS_FALSE='#' +else + USE_SCWS_TRUE='#' + USE_SCWS_FALSE= +fi + + + got_expat=0 dl_expat=0 @@ -9176,6 +9220,10 @@ if test -z "${USE_RLP_TRUE}" && test -z "${USE_RLP_FALSE}"; then as_fn_error $? "conditional \"USE_RLP\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${USE_SCWS_TRUE}" && test -z "${USE_SCWS_FALSE}"; then + as_fn_error $? "conditional \"USE_SCWS\" was never defined. +Usually this means the macro was only invoked conditionally." 
"$LINENO" 5 +fi : "${CONFIG_STATUS=./config.status}" ac_write_fail=0 diff --git a/configure.ac b/configure.ac index 39624407c..5c53c91db 100644 --- a/configure.ac +++ b/configure.ac @@ -564,6 +564,28 @@ fi AM_CONDITIONAL(USE_RLP, test x$ac_cv_use_rlp != xno) +dnl --- + +AC_ARG_WITH([scws], + AC_HELP_STRING([--with-scws], [compile with scws library support (default is disabled)]), + [ac_cv_use_scws=$withval], [ac_cv_use_scws=no] +) + +AC_MSG_CHECKING([whether to compile with scws library support]) +if test x$ac_cv_use_scws != xno; then + if test -d $ac_cv_use_scws && test -f $ac_cv_use_scws/include/scws/scws.h; then + AC_MSG_RESULT([yes]) + AC_DEFINE(USE_SCWS, 1, [scws library support]) + else + AC_MSG_ERROR([missing scws sources from libscws]) + fi +else + AC_MSG_RESULT([no]) + AC_DEFINE(USE_SCWS, 0, [scws library support]) +fi +AM_CONDITIONAL(USE_SCWS, test x$ac_cv_use_scws != xno) + + dnl --- got_expat=0 diff --git a/src/Makefile.am b/src/Makefile.am index c7ae07cb3..4666ff430 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -4,7 +4,7 @@ SRC_SPHINX = sphinx.cpp sphinxexcerpt.cpp sphinxquery.cpp \ sphinxsearch.cpp sphinxrt.cpp sphinxjson.cpp sphinxudf.c sphinxaot.cpp sphinxplugin.cpp sphinxqcache.cpp \ sphinxrlp.cpp -ARFLAGS = crU +ARFLAGS = cr noinst_LIBRARIES = libsphinx.a libsphinx_a_SOURCES = $(SRC_SPHINX) @@ -31,6 +31,14 @@ RLP_LIBS = RLP_INC = endif -AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\"" -COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) +if USE_SCWS +SCWS_LIBS = @ac_cv_use_scws@/lib/libscws.a +SCWS_INC = -I@ac_cv_use_scws@/include +else +SCWS_LIBS = +SCWS_INC = +endif + +AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) $(SCWS_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\"" +COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(SCWS_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(SCWS_LIBS) LDADD = 
$(COMMON_LIBS) diff --git a/src/Makefile.in b/src/Makefile.in index e1540d10d..1e0043bc8 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -251,7 +251,7 @@ SRC_SPHINX = sphinx.cpp sphinxexcerpt.cpp sphinxquery.cpp \ sphinxsearch.cpp sphinxrt.cpp sphinxjson.cpp sphinxudf.c sphinxaot.cpp sphinxplugin.cpp sphinxqcache.cpp \ sphinxrlp.cpp -ARFLAGS = crU +ARFLAGS = cr noinst_LIBRARIES = libsphinx.a libsphinx_a_SOURCES = $(SRC_SPHINX) indexer_SOURCES = indexer.cpp @@ -265,8 +265,12 @@ BUILT_SOURCES = extract-version @USE_RLP_TRUE@RLP_LIBS = -L$(top_srcdir)/rlp/lib/amd64-glibc25-gcc42 -lbtrlpc -lbtrlpcore -lbtutils @USE_RLP_FALSE@RLP_INC = @USE_RLP_TRUE@RLP_INC = -I$(top_srcdir)/rlp/rlp/include -I$(top_srcdir)/rlp/utilities/include -D_REENTRANT -AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\"" -COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) +@USE_SCWS_FALSE@SCWS_LIBS = +@USE_SCWS_TRUE@SCWS_LIBS = @ac_cv_use_scws@/lib/libscws.a +@USE_SCWS_FALSE@SCWS_INC = +@USE_SCWS_TRUE@SCWS_INC = -I@ac_cv_use_scws@/include +AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) $(SCWS_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\"" +COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(SCWS_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(SCWS_LIBS) LDADD = $(COMMON_LIBS) all: $(BUILT_SOURCES) $(MAKE) $(AM_MAKEFLAGS) all-am diff --git a/src/sphinx.cpp b/src/sphinx.cpp index bd95ded2a..474cfc169 100644 --- a/src/sphinx.cpp +++ b/src/sphinx.cpp @@ -2442,6 +2442,53 @@ class CSphTokenizer_UTF8Ngram : public CSphTokenizer_UTF8 }; +/// SCWS tokenizer +#if USE_SCWS +scws_t scws_global; +int scws_config_set=false; + +template < bool IS_QUERY > +class CSphTokenizer_SCWS : public CSphTokenizerBase2 +{ +public: + CSphTokenizer_SCWS (); + ~CSphTokenizer_SCWS (); + virtual void SetBuffer ( const BYTE * sBuffer, int iLength ); + virtual BYTE * GetToken (); + 
virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const; + virtual void Setup ( const CSphTokenizerSettings & tSettings ){ + CSphTokenizerBase2::Setup ( tSettings ); + if(scws_config_set==false){ + scws_config_set=true; + if ( !tSettings.m_scwsDict.IsEmpty () ) + { + scws_set_dict(scws_global, tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM); + } + if ( !tSettings.m_scwsRule.IsEmpty ()) + { + scws_set_rule(scws_global, tSettings.m_scwsDict.cstr ()); + } + scws_set_charset(scws_global, "utf8"); + scws_set_ignore(scws_global, true); + + + if ( tSettings.m_scwsMulti) + { + scws_set_multi(scws_global, tSettings.m_scwsMulti << 12); + }else{ + scws_set_multi(scws_global, 0); + } + } + scws_source = scws_fork(scws_global); + } + virtual int GetCodepointLength ( int iCode ) const; + virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); } + const BYTE * m_pText; + scws_res_t res,cur; + scws_t scws_source; +}; +#endif + struct CSphNormalForm { CSphString m_sForm; @@ -2781,6 +2828,13 @@ ISphTokenizer * sphCreateUTF8NgramTokenizer () return new CSphTokenizer_UTF8Ngram (); } +#if USE_SCWS +ISphTokenizer * sphCreateUTF8SCWSTokenizer () +{ + return new CSphTokenizer_SCWS (); +} +#endif + ///////////////////////////////////////////////////////////////////////////// enum @@ -3354,8 +3408,14 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett return true; tSettings.m_iType = tReader.GetByte (); - if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM ) - { + + if ( + tSettings.m_iType!=TOKENIZER_UTF8 + && tSettings.m_iType!=TOKENIZER_NGRAM +#if USE_SCWS + && tSettings.m_iType!=TOKENIZER_SCWS +#endif + ){ sWarning = "can't load an old index with SBCS tokenizer"; return false; } @@ -3386,6 +3446,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett tSettings.m_sBlendChars = tReader.GetString (); if ( uVersion>=24 ) 
tSettings.m_sBlendMode = tReader.GetString(); +#if USE_SCWS + tSettings.m_scwsMulti= tReader.GetDword(); + tSettings.m_scwsDict= tReader.GetString(); + tSettings.m_scwsRule= tReader.GetString(); +#endif return true; } @@ -3415,6 +3480,11 @@ void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, i tWriter.PutString ( tSettings.m_sNgramChars.cstr () ); tWriter.PutString ( tSettings.m_sBlendChars.cstr () ); tWriter.PutString ( tSettings.m_sBlendMode.cstr () ); +#if USE_SCWS + tWriter.PutDword( tSettings.m_scwsMulti); + tWriter.PutString ( tSettings.m_scwsDict.cstr()) ; + tWriter.PutString ( tSettings.m_scwsRule.cstr()); +#endif } @@ -3682,10 +3752,13 @@ void ISphTokenizer::Setup ( const CSphTokenizerSettings & tSettings ) ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings, const CSphEmbeddedFiles * pFiles, CSphString & sError ) { CSphScopedPtr pTokenizer ( NULL ); - + switch ( tSettings.m_iType ) { case TOKENIZER_UTF8: pTokenizer = sphCreateUTF8Tokenizer (); break; +#if USE_SCWS + case TOKENIZER_SCWS: pTokenizer = sphCreateUTF8SCWSTokenizer (); break; +#endif case TOKENIZER_NGRAM: pTokenizer = sphCreateUTF8NgramTokenizer (); break; default: sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType ); @@ -5209,6 +5282,351 @@ BYTE * CSphTokenizer_UTF8Ngram::GetToken () return CSphTokenizer_UTF8::GetToken (); } + +///////////////////////////////////////////////////////////////////////////// + +#if USE_SCWS + + + +template < bool IS_QUERY > +CSphTokenizer_SCWS::CSphTokenizer_SCWS () +{ + + CSphString sTmp; + SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp ); + m_bHasBlend = false; + if(scws_global==NULL) scws_global = scws_new(); +} +template < bool IS_QUERY > +CSphTokenizer_SCWS::~CSphTokenizer_SCWS () +{ + scws_free(scws_source); +} + + +template < bool IS_QUERY > +void CSphTokenizer_SCWS::SetBuffer ( const BYTE * sBuffer, int iLength ) +{ + // check that old one is over and that 
new length is sane + assert ( iLength>=0 ); + + // set buffer + m_pTokenStart = m_pTokenEnd = NULL; + m_pBlendStart = m_pBlendEnd = NULL; + + m_pText = m_pBuffer = sBuffer; + m_pBufferMax = sBuffer + iLength; + m_pCur = sBuffer; + + m_iOvershortCount = 0; + m_bBoundary = m_bTokenBoundary = false; + + res = cur = NULL; + scws_send_text(scws_source, (char*)m_pText, iLength); +} + + +template < bool IS_QUERY > +BYTE * CSphTokenizer_SCWS::GetToken () +{ + m_bWasSpecial = false; + m_bBlended = false; + m_iOvershortCount = 0; + m_bTokenBoundary = false; + m_bWasSynonym = false; + if( m_bHasBlend) + { + BYTE * pVar = GetBlendedVariant (); + if ( pVar ) + return pVar; + m_bBlendedPart = ( m_pBlendEnd!=NULL ); + } + + bool bGotNonToken = ( !IS_QUERY || m_bPhrase ); // only do this in query mode, never in indexing mode, never within phrases + bool bGotSoft = false; // hey Beavis he said soft huh huhhuh + + m_pTokenStart = NULL; + for ( ;; ) + { + // get next codepoint + const BYTE * const pCur = m_pCur; // to redo special char, if there's a token already + + if(cur !=NULL){ + memcpy(m_sAccum, m_pText + cur->off, cur->len); + m_sAccum[cur->len]='\0'; + sphColumnToLowercase ( (char *)( m_sAccum ) ); + cur = cur->next; + return m_sAccum; + } + m_pText = m_pCur; + + + int iCodePoint; + int iCode; + if ( pCuroff, cur->len); + m_sAccum[cur->len]='\0'; + sphColumnToLowercase ( (char *)( m_sAccum ) ); + + m_pTokenStart = m_pText + cur->off; + m_pCur = m_pText + cur->off + cur->len; + m_pTokenEnd = m_pCur; + + cur = cur->next; + if(cur == NULL){ + m_iLastTokenLen = 0; + m_iAccum = 0; + scws_free_result(res); + } + return m_sAccum; + } + } +} + +template < bool IS_QUERY > +ISphTokenizer * CSphTokenizer_SCWS::Clone ( ESphTokenizerClone eMode ) const +{ + if ( eMode!=SPH_CLONE_INDEX ) { + CSphTokenizer_SCWS *pClone = new CSphTokenizer_SCWS(); + pClone->CloneBase ( this, eMode ); + pClone->Setup(m_tSettings); + return pClone; + } else { + CSphTokenizer_SCWS *pClone = new 
CSphTokenizer_SCWS(); + pClone->CloneBase ( this, eMode ); + pClone->Setup(m_tSettings); + return pClone; + } +} + + +template < bool IS_QUERY > +int CSphTokenizer_SCWS::GetCodepointLength ( int iCode ) const +{ + if ( iCode<128 ) + return 1; + + int iBytes = 0; + while ( iCode & 0x80 ) + { + iBytes++; + iCode <<= 1; + } + + assert ( iBytes>=2 && iBytes<=4 ); + return iBytes; +} +#endif ////////////////////////////////////////////////////////////////////////// CSphMultiformTokenizer::CSphMultiformTokenizer ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer ) diff --git a/src/sphinx.h b/src/sphinx.h index da4d6e570..388a63693 100644 --- a/src/sphinx.h +++ b/src/sphinx.h @@ -66,6 +66,10 @@ #include #endif +#if USE_SCWS +#include +#endif + #if USE_WINDOWS typedef __int64 SphOffset_t; #define STDOUT_FILENO fileno(stdout) @@ -449,6 +453,7 @@ class CSphLowercaser int m_iChunks; ///< how much chunks are actually allocated int * m_pData; ///< chunks themselves +public: int * m_pChunk [ CHUNK_COUNT ]; ///< pointers to non-empty chunks }; @@ -496,6 +501,14 @@ struct CSphTokenizerSettings CSphString m_sNgramChars; CSphString m_sBlendChars; CSphString m_sBlendMode; + + + #if USE_SCWS + CSphString m_scwsDict; + CSphString m_scwsRule; + int m_scwsMulti; +#endif + CSphString m_sIndexingPlugin; ///< this tokenizer wants an external plugin to process its raw output CSphTokenizerSettings (); diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp index 883eaa614..459eb1912 100644 --- a/src/sphinxutils.cpp +++ b/src/sphinxutils.cpp @@ -591,6 +591,12 @@ static KeyDesc_t g_dKeysIndex[] = { "rlp_context", 0, NULL }, { "ondisk_attrs", 0, NULL }, { "index_token_filter", 0, NULL }, +#if USE_SCWS + { "scws", 0, NULL }, + { "scws_dict", 0, NULL }, + { "scws_rule", 0, NULL }, + { "scws_multi", 0, NULL }, +#endif { NULL, 0, NULL } }; @@ -1274,7 +1280,15 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings else sphWarning ( "ngram_chars 
specified, but ngram_len=0; IGNORED" ); } - +#if USE_SCWS + if ( hIndex ( "scws" ) ) + { + tSettings.m_iType = TOKENIZER_SCWS; + tSettings.m_scwsDict = hIndex.GetStr ( "scws_dict" ); + tSettings.m_scwsRule = hIndex.GetStr ( "scws_rule" ); + tSettings.m_scwsMulti = hIndex.GetInt ( "scws_multi",0 ); + } +#endif tSettings.m_sCaseFolding = hIndex.GetStr ( "charset_table" ); tSettings.m_iMinWordLen = Max ( hIndex.GetInt ( "min_word_len", 1 ), 1 ); tSettings.m_sNgramChars = hIndex.GetStr ( "ngram_chars" ); diff --git a/src/sphinxutils.h b/src/sphinxutils.h index c56ca0fa2..3fdfd3820 100644 --- a/src/sphinxutils.h +++ b/src/sphinxutils.h @@ -147,6 +147,9 @@ enum // where was TOKENIZER_SBCS=1 once TOKENIZER_UTF8 = 2, TOKENIZER_NGRAM = 3 +#if USE_SCWS + ,TOKENIZER_SCWS = 4 +#endif }; /// load config file From 82fe60dc3fe35726c6fe99bdb29f43c2cbf07ecf Mon Sep 17 00:00:00 2001 From: hetao Date: Thu, 27 Apr 2017 16:30:06 +0800 Subject: [PATCH 2/4] fix memory leak --- src/sphinx.cpp | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/sphinx.cpp b/src/sphinx.cpp index 474cfc169..280531d02 100644 --- a/src/sphinx.cpp +++ b/src/sphinx.cpp @@ -5296,7 +5296,9 @@ CSphTokenizer_SCWS::CSphTokenizer_SCWS () CSphString sTmp; SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp ); m_bHasBlend = false; - if(scws_global==NULL) scws_global = scws_new(); + if(scws_global==NULL) { + scws_global = scws_new(); + } } template < bool IS_QUERY > CSphTokenizer_SCWS::~CSphTokenizer_SCWS () @@ -5308,8 +5310,8 @@ CSphTokenizer_SCWS::~CSphTokenizer_SCWS () template < bool IS_QUERY > void CSphTokenizer_SCWS::SetBuffer ( const BYTE * sBuffer, int iLength ) { - // check that old one is over and that new length is sane - assert ( iLength>=0 ); + // check that old one is over and that new length is sane + assert ( iLength>=0 ); // set buffer m_pTokenStart = m_pTokenEnd = NULL; @@ -5321,9 +5323,9 @@ void CSphTokenizer_SCWS::SetBuffer ( const BYTE * sBuffer, int 
iLength m_iOvershortCount = 0; m_bBoundary = m_bTokenBoundary = false; - + res = cur = NULL; - scws_send_text(scws_source, (char*)m_pText, iLength); + scws_send_text(scws_source, (char*)m_pText, iLength); } @@ -5353,11 +5355,18 @@ BYTE * CSphTokenizer_SCWS::GetToken () const BYTE * const pCur = m_pCur; // to redo special char, if there's a token already if(cur !=NULL){ - memcpy(m_sAccum, m_pText + cur->off, cur->len); - m_sAccum[cur->len]='\0'; - sphColumnToLowercase ( (char *)( m_sAccum ) ); cur = cur->next; - return m_sAccum; + if(cur != NULL){ + memcpy(m_sAccum, m_pText + cur->off, cur->len); + m_sAccum[cur->len]='\0'; + sphColumnToLowercase ( (char *)( m_sAccum ) ); + return m_sAccum; + }else{ + m_iLastTokenLen = 0; + m_iAccum = 0; + scws_free_result(res); + } + } m_pText = m_pCur; @@ -5582,12 +5591,6 @@ BYTE * CSphTokenizer_SCWS::GetToken () m_pCur = m_pText + cur->off + cur->len; m_pTokenEnd = m_pCur; - cur = cur->next; - if(cur == NULL){ - m_iLastTokenLen = 0; - m_iAccum = 0; - scws_free_result(res); - } return m_sAccum; } } From 8a4bc2c51ec84332857b98066a563a063c8c2b6b Mon Sep 17 00:00:00 2001 From: hetao Date: Thu, 27 Apr 2017 18:06:33 +0800 Subject: [PATCH 3/4] increase index speed --- src/sphinx.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/sphinx.cpp b/src/sphinx.cpp index 280531d02..b75d006d8 100644 --- a/src/sphinx.cpp +++ b/src/sphinx.cpp @@ -5345,6 +5345,25 @@ BYTE * CSphTokenizer_SCWS::GetToken () m_bBlendedPart = ( m_pBlendEnd!=NULL ); } + if(!IS_QUERY){ + if(cur == NULL){ + res = (cur = scws_get_result(scws_source)); + if(cur == NULL){ + return NULL; + } + } + memcpy(m_sAccum, m_pText + cur->off, cur->len); + m_sAccum[cur->len]='\0'; + sphColumnToLowercase ( (char *)( m_sAccum ) ); + m_iLastTokenLen = 0; + m_iAccum = 0; + cur = cur->next; + if(cur == NULL){ + scws_free_result(res); + } + return m_sAccum; + } + bool bGotNonToken = ( !IS_QUERY || m_bPhrase ); // only do this in query mode, never in indexing mode, 
never within phrases bool bGotSoft = false; // hey Beavis he said soft huh huhhuh From 866eff1a84aa034dbf7cb1cdcf5b34bfe368855f Mon Sep 17 00:00:00 2001 From: hetao Date: Fri, 28 Apr 2017 14:24:00 +0800 Subject: [PATCH 4/4] support multi dict and xdb format --- src/sphinx.cpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/sphinx.cpp b/src/sphinx.cpp index b75d006d8..9a29b75c8 100644 --- a/src/sphinx.cpp +++ b/src/sphinx.cpp @@ -2462,11 +2462,27 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2 scws_config_set=true; if ( !tSettings.m_scwsDict.IsEmpty () ) { - scws_set_dict(scws_global, tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM); + + int mode,ret; + + CSphVector<CSphString> dicts; + + sphSplit ( dicts, tSettings.m_scwsDict.cstr() ," \t,;"); + ARRAY_FOREACH ( i, dicts) + { + mode = SCWS_XDICT_MEM | SCWS_XDICT_XDB; + if (!dicts[i].Ends(".xdb")){ + mode |= SCWS_XDICT_TXT; + } + ret = scws_add_dict(scws_global, dicts[i].cstr (), mode); + sphInfo("scws set dict [%s], mode [%d], ret [%d]",dicts[i].cstr (),mode,ret); + } + } if ( !tSettings.m_scwsRule.IsEmpty ()) { - scws_set_rule(scws_global, tSettings.m_scwsDict.cstr ()); + scws_set_rule(scws_global, tSettings.m_scwsRule.cstr ()); + sphInfo("scws set rule [%s]",tSettings.m_scwsRule.cstr ()); } scws_set_charset(scws_global, "utf8"); scws_set_ignore(scws_global, true); @@ -2475,6 +2491,7 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2 if ( tSettings.m_scwsMulti) { scws_set_multi(scws_global, tSettings.m_scwsMulti << 12); + sphInfo("scws set multi[%d]",tSettings.m_scwsMulti); }else{ scws_set_multi(scws_global, 0); }