From be4ccd29e94abc47480a4ae1d01d9119aa6a2c46 Mon Sep 17 00:00:00 2001
From: hetao <hetao@hetao.name>
Date: Tue, 25 Apr 2017 11:35:08 +0800
Subject: [PATCH 1/4] support chinese segmentation use scws

---
 config/config.h.in  |   3 +
 configure           |  48 +++++
 configure.ac        |  22 +++
 src/Makefile.am     |  14 +-
 src/Makefile.in     |  10 +-
 src/sphinx.cpp      | 424 +++++++++++++++++++++++++++++++++++++++++++-
 src/sphinx.h        |  13 ++
 src/sphinxutils.cpp |  16 +-
 src/sphinxutils.h   |   3 +
 9 files changed, 543 insertions(+), 10 deletions(-)

diff --git a/config/config.h.in b/config/config.h.in
index 901d15839..de0217e41 100644
--- a/config/config.h.in
+++ b/config/config.h.in
@@ -294,6 +294,9 @@
 /* RLP library support */
 #undef USE_RLP
 
+/* SCWS library support */
+#undef USE_SCWS
+
 /* define to use POSIX Syslog for logging */
 #undef USE_SYSLOG
 
diff --git a/configure b/configure
index b552f8c26..00bb47da8 100755
--- a/configure
+++ b/configure
@@ -608,6 +608,8 @@ LTLIBOBJS
 CONFDIR
 USE_RLP_FALSE
 USE_RLP_TRUE
+USE_SCWS_FALSE
+USE_SCWS_TRUE
 USE_RE2_FALSE
 USE_RE2_TRUE
 LIBRE2_PATH
@@ -653,6 +655,7 @@ DEPDIR
 OBJEXT
 EXEEXT
 ac_ct_CC
+ac_cv_use_scws
 CPPFLAGS
 LDFLAGS
 CFLAGS
@@ -750,6 +753,7 @@ with_re2
 with_re2_includes
 with_re2_libs
 with_rlp
+with_scws
 with_iconv
 with_unixodbc
 enable_mem_override
@@ -1424,6 +1428,8 @@ Optional Packages:
   --with-re2-libs         path to RE2 libraries
   --with-rlp              compile with RLP library support (default is
                           disabled)
+  --with-scws             compile with scws library support (default is
+                          disabled)
   --with-iconv            compile with iconv support (default is autodetect)
   --with-unixodbc         compile with UnixODBC support (default is
                           autodetect)
@@ -8296,6 +8302,44 @@ fi
 
 
 
+# Check whether --with-scws was given.
+if test "${with_scws+set}" = set; then :
+  withval=$with_scws; ac_cv_use_scws=$withval
+else
+  ac_cv_use_scws=no
+
+fi
+
+# Probe the SCWS prefix through ac_cv_use_scws, the variable configure.ac tests, rather than the transient $withval.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to compile with scws library support" >&5
+$as_echo_n "checking whether to compile with scws library support... " >&6; }
+if test x$ac_cv_use_scws != xno; then
+	if test -d "$ac_cv_use_scws" && test -f "$ac_cv_use_scws/include/scws/scws.h"; then
+		{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define USE_SCWS 1" >>confdefs.h
+
+	else
+		as_fn_error $? "missing SCWS sources from libscws" "$LINENO" 5
+	fi
+else
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+$as_echo "#define USE_SCWS 0" >>confdefs.h
+
+fi
+ if test x$ac_cv_use_scws != xno; then
+  USE_SCWS_TRUE=
+  USE_SCWS_FALSE='#'
+else
+  USE_SCWS_TRUE='#'
+  USE_SCWS_FALSE=
+fi
+
+
+
 
 got_expat=0
 dl_expat=0
@@ -9176,6 +9220,10 @@ if test -z "${USE_RLP_TRUE}" && test -z "${USE_RLP_FALSE}"; then
   as_fn_error $? "conditional \"USE_RLP\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${USE_SCWS_TRUE}" && test -z "${USE_SCWS_FALSE}"; then
+  as_fn_error $? "conditional \"USE_SCWS\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 
 : "${CONFIG_STATUS=./config.status}"
 ac_write_fail=0
diff --git a/configure.ac b/configure.ac
index 39624407c..5c53c91db 100644
--- a/configure.ac
+++ b/configure.ac
@@ -564,6 +564,28 @@ fi
 AM_CONDITIONAL(USE_RLP, test x$ac_cv_use_rlp != xno)
 
 
+dnl ---
+dnl SCWS: --with-scws=PREFIX, where PREFIX holds include/scws/scws.h (checked here) and lib/libscws.a (linked by src/Makefile.am)
+AC_ARG_WITH([scws],
+	AC_HELP_STRING([--with-scws], [compile with scws library support (default is disabled)]),
+	[ac_cv_use_scws=$withval], [ac_cv_use_scws=no]
+)
+
+AC_MSG_CHECKING([whether to compile with scws library support])
+if test x$ac_cv_use_scws != xno; then
+	if test -d "$ac_cv_use_scws" && test -f "$ac_cv_use_scws/include/scws/scws.h"; then
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(USE_SCWS, 1, [scws library support])
+	else
+		AC_MSG_ERROR([missing scws sources from libscws])
+	fi
+else
+	AC_MSG_RESULT([no])
+	AC_DEFINE(USE_SCWS, 0, [scws library support])
+fi
+AM_CONDITIONAL(USE_SCWS, test x$ac_cv_use_scws != xno)
+AC_SUBST([ac_cv_use_scws])
+
 dnl ---
 
 got_expat=0
diff --git a/src/Makefile.am b/src/Makefile.am
index c7ae07cb3..4666ff430 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -4,7 +4,7 @@ SRC_SPHINX = sphinx.cpp sphinxexcerpt.cpp sphinxquery.cpp \
 	sphinxsearch.cpp sphinxrt.cpp sphinxjson.cpp sphinxudf.c sphinxaot.cpp sphinxplugin.cpp sphinxqcache.cpp \
 	sphinxrlp.cpp
 
-ARFLAGS = crU
+ARFLAGS = cr
 noinst_LIBRARIES = libsphinx.a
 libsphinx_a_SOURCES = $(SRC_SPHINX)
 
@@ -31,6 +31,14 @@ RLP_LIBS =
 RLP_INC =
 endif
 
-AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\""
-COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS)
+if USE_SCWS
+SCWS_LIBS = @ac_cv_use_scws@/lib/libscws.a
+SCWS_INC = -I@ac_cv_use_scws@/include
+else
+SCWS_LIBS =
+SCWS_INC =
+endif
+## SCWS_LIBS/SCWS_INC are empty unless configured --with-scws=PREFIX; COMMON_LIBS mirrors the list in Makefile.in
+AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) $(SCWS_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\""
+COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(SCWS_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(SCWS_LIBS)
 LDADD = $(COMMON_LIBS)
diff --git a/src/Makefile.in b/src/Makefile.in
index e1540d10d..1e0043bc8 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -251,7 +251,7 @@ SRC_SPHINX = sphinx.cpp sphinxexcerpt.cpp sphinxquery.cpp \
 	sphinxsearch.cpp sphinxrt.cpp sphinxjson.cpp sphinxudf.c sphinxaot.cpp sphinxplugin.cpp sphinxqcache.cpp \
 	sphinxrlp.cpp
 
-ARFLAGS = crU
+ARFLAGS = cr
 noinst_LIBRARIES = libsphinx.a
 libsphinx_a_SOURCES = $(SRC_SPHINX)
 indexer_SOURCES = indexer.cpp
@@ -265,8 +265,12 @@ BUILT_SOURCES = extract-version
 @USE_RLP_TRUE@RLP_LIBS = -L$(top_srcdir)/rlp/lib/amd64-glibc25-gcc42 -lbtrlpc -lbtrlpcore -lbtutils
 @USE_RLP_FALSE@RLP_INC = 
 @USE_RLP_TRUE@RLP_INC = -I$(top_srcdir)/rlp/rlp/include -I$(top_srcdir)/rlp/utilities/include -D_REENTRANT
-AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\""
-COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS)
+@USE_SCWS_FALSE@SCWS_LIBS = 
+@USE_SCWS_TRUE@SCWS_LIBS = @ac_cv_use_scws@/lib/libscws.a
+@USE_SCWS_FALSE@SCWS_INC = 
+@USE_SCWS_TRUE@SCWS_INC = -I@ac_cv_use_scws@/include
+AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) $(SCWS_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\""
+COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(SCWS_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(SCWS_LIBS)
 LDADD = $(COMMON_LIBS)
 all: $(BUILT_SOURCES)
 	$(MAKE) $(AM_MAKEFLAGS) all-am
diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index bd95ded2a..474cfc169 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -2442,6 +2442,53 @@ class CSphTokenizer_UTF8Ngram : public CSphTokenizer_UTF8<IS_QUERY>
 };
 
 
+/// SCWS tokenizer
+#if USE_SCWS
+scws_t scws_global; 
+int scws_config_set=false;
+
+template < bool IS_QUERY >
+class CSphTokenizer_SCWS : public CSphTokenizerBase2
+{
+public:
+                                                    CSphTokenizer_SCWS ();
+                                                    ~CSphTokenizer_SCWS ();
+        virtual void                SetBuffer ( const BYTE * sBuffer, int iLength );
+        virtual BYTE *              GetToken ();
+        virtual ISphTokenizer *     Clone ( ESphTokenizerClone eMode ) const;
+	virtual void		Setup ( const CSphTokenizerSettings & tSettings ){
+		CSphTokenizerBase2::Setup ( tSettings ); 
+		if(scws_config_set==false){
+			scws_config_set=true;
+			if ( !tSettings.m_scwsDict.IsEmpty ()  )
+			{ 
+				scws_set_dict(scws_global, tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM);
+			}
+			if ( !tSettings.m_scwsRule.IsEmpty ())
+			{ 
+				scws_set_rule(scws_global, tSettings.m_scwsDict.cstr ());
+			}
+			scws_set_charset(scws_global, "utf8");
+			scws_set_ignore(scws_global, true);
+
+
+			if ( tSettings.m_scwsMulti)
+			{ 
+				scws_set_multi(scws_global, tSettings.m_scwsMulti << 12);
+			}else{
+				scws_set_multi(scws_global, 0);
+			}
+		}
+		scws_source = scws_fork(scws_global);
+	}
+        virtual int                 GetCodepointLength ( int iCode ) const;
+        virtual int                 GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); }
+	const BYTE * m_pText;
+	scws_res_t res,cur;
+	scws_t scws_source; 
+};
+#endif
+
 struct CSphNormalForm
 {
 	CSphString				m_sForm;
@@ -2781,6 +2828,13 @@ ISphTokenizer * sphCreateUTF8NgramTokenizer ()
 	return new CSphTokenizer_UTF8Ngram<false> ();
 }
 
+#if USE_SCWS
+ISphTokenizer * sphCreateUTF8SCWSTokenizer ()
+{
+        return new CSphTokenizer_SCWS<false> ();
+}
+#endif
+
 /////////////////////////////////////////////////////////////////////////////
 
 enum
@@ -3354,8 +3408,14 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
 		return true;
 
 	tSettings.m_iType = tReader.GetByte ();
-	if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM )
-	{
+
+        if ( 
+                tSettings.m_iType!=TOKENIZER_UTF8 
+                && tSettings.m_iType!=TOKENIZER_NGRAM  
+#if USE_SCWS
+                && tSettings.m_iType!=TOKENIZER_SCWS
+#endif
+        ){
 		sWarning = "can't load an old index with SBCS tokenizer";
 		return false;
 	}
@@ -3386,6 +3446,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
 		tSettings.m_sBlendChars = tReader.GetString ();
 	if ( uVersion>=24 )
 		tSettings.m_sBlendMode = tReader.GetString();
+#if USE_SCWS
+	tSettings.m_scwsMulti= tReader.GetDword();
+	tSettings.m_scwsDict= tReader.GetString();
+	tSettings.m_scwsRule= tReader.GetString();
+#endif
 
 	return true;
 }
@@ -3415,6 +3480,11 @@ void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, i
 	tWriter.PutString ( tSettings.m_sNgramChars.cstr () );
 	tWriter.PutString ( tSettings.m_sBlendChars.cstr () );
 	tWriter.PutString ( tSettings.m_sBlendMode.cstr () );
+#if USE_SCWS
+	tWriter.PutDword( tSettings.m_scwsMulti);
+	tWriter.PutString ( tSettings.m_scwsDict.cstr()) ;
+	tWriter.PutString ( tSettings.m_scwsRule.cstr());
+#endif
 }
 
 
@@ -3682,10 +3752,13 @@ void ISphTokenizer::Setup ( const CSphTokenizerSettings & tSettings )
 ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings, const CSphEmbeddedFiles * pFiles, CSphString & sError )
 {
 	CSphScopedPtr<ISphTokenizer> pTokenizer ( NULL );
-
+        
 	switch ( tSettings.m_iType )
 	{
 		case TOKENIZER_UTF8:	pTokenizer = sphCreateUTF8Tokenizer (); break;
+#if USE_SCWS
+		case TOKENIZER_SCWS:	pTokenizer = sphCreateUTF8SCWSTokenizer (); break;
+#endif               
 		case TOKENIZER_NGRAM:	pTokenizer = sphCreateUTF8NgramTokenizer (); break;
 		default:
 			sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType );
@@ -5209,6 +5282,351 @@ BYTE * CSphTokenizer_UTF8Ngram<IS_QUERY>::GetToken ()
 	return CSphTokenizer_UTF8<IS_QUERY>::GetToken ();
 }
 
+
+/////////////////////////////////////////////////////////////////////////////
+
+#if USE_SCWS
+
+/// Construct with the SPHINX_DEFAULT_UTF8_TABLE case folding and blending off; create the shared scws_global on first use.
+/// The per-tokenizer handle scws_source is only forked in Setup(), so it and the result cursors start out as NULL.
+template < bool IS_QUERY >
+CSphTokenizer_SCWS<IS_QUERY>::CSphTokenizer_SCWS ()
+{
+	scws_source = NULL; res = cur = NULL;
+	CSphString sTmp;
+	SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp );
+	m_bHasBlend = false;
+	if(scws_global==NULL) scws_global = scws_new();
+}
+template < bool IS_QUERY >
+CSphTokenizer_SCWS<IS_QUERY>::~CSphTokenizer_SCWS ()
+{
+	if ( scws_source ) scws_free(scws_source); // stays NULL when Setup() never forked a handle
+}
+
+
+template < bool IS_QUERY >
+void CSphTokenizer_SCWS<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength )
+{
+        // check that old one is over and that new length is sane
+        assert ( iLength>=0 );
+
+	// set buffer
+	m_pTokenStart = m_pTokenEnd = NULL;
+	m_pBlendStart = m_pBlendEnd = NULL;
+
+	m_pText = m_pBuffer = sBuffer;
+	m_pBufferMax = sBuffer + iLength;
+	m_pCur = sBuffer;
+
+	m_iOvershortCount = 0;
+	m_bBoundary = m_bTokenBoundary = false;
+        
+	res = cur = NULL;
+        scws_send_text(scws_source, (char*)m_pText, iLength);
+}
+
+
+template < bool IS_QUERY >
+BYTE * CSphTokenizer_SCWS<IS_QUERY>::GetToken ()
+{
+	m_bWasSpecial = false;
+	m_bBlended = false;
+	m_iOvershortCount = 0;
+	m_bTokenBoundary = false;
+	m_bWasSynonym = false;
+	if( m_bHasBlend)
+	{
+		BYTE * pVar = GetBlendedVariant ();
+		if ( pVar )
+			return pVar;
+		m_bBlendedPart = ( m_pBlendEnd!=NULL );
+	}
+
+	bool bGotNonToken = ( !IS_QUERY || m_bPhrase ); // only do this in query mode, never in indexing mode, never within phrases
+	bool bGotSoft = false; // hey Beavis he said soft huh huhhuh
+
+	m_pTokenStart = NULL;
+	for ( ;; )
+	{
+		// get next codepoint
+		const BYTE * const pCur = m_pCur; // to redo special char, if there's a token already
+
+		if(cur !=NULL){
+			memcpy(m_sAccum, m_pText + cur->off, cur->len);
+			m_sAccum[cur->len]='\0';
+			sphColumnToLowercase ( (char *)( m_sAccum ) );
+			cur = cur->next;
+			return m_sAccum;
+		}
+		m_pText = m_pCur;
+
+
+		int iCodePoint;
+		int iCode;
+		if ( pCur<m_pBufferMax && *pCur<128 )
+		{
+			iCodePoint = *m_pCur++;
+			iCode = m_tLC.m_pChunk[0][iCodePoint];
+		} else
+		{
+			iCodePoint = GetCodepoint(); // advances m_pCur
+			iCode = m_tLC.ToLower ( iCodePoint );
+		}
+
+		// handle escaping
+		bool bWasEscaped = ( IS_QUERY && iCodePoint=='\\' ); // whether current codepoint was escaped
+		if ( bWasEscaped )
+		{
+			iCodePoint = GetCodepoint();
+			iCode = m_tLC.ToLower ( iCodePoint );
+			if ( !Special2Simple ( iCode ) )
+				iCode = 0;
+		}
+		// handle eof
+		if ( iCode<0 )
+		{
+			FlushAccum ();
+
+			// suddenly, exceptions
+			if ( m_pExc && m_pTokenStart && CheckException ( m_pTokenStart, pCur, IS_QUERY ) )
+				return m_sAccum;
+
+			// skip trailing short word
+			if ( m_iLastTokenLen<m_tSettings.m_iMinWordLen )
+			{
+				if ( !m_bShortTokenFilter || !ShortTokenFilter ( m_sAccum, m_iLastTokenLen ) )
+				{
+					if ( m_iLastTokenLen )
+						m_iOvershortCount++;
+					m_iLastTokenLen = 0;
+
+					if( m_bHasBlend)
+						BlendAdjust ( pCur );
+					return NULL;
+				}
+			}
+
+			// keep token end here as BlendAdjust might change m_pCur
+			m_pTokenEnd = m_pCur;
+			if( m_bHasBlend&& !BlendAdjust ( pCur ) )
+				return NULL;
+			if( m_bHasBlend&& m_bBlended )
+				return GetBlendedVariant();
+
+			// return trailing word
+			return m_sAccum;
+		}
+
+		// handle all the flags..
+		if_const ( IS_QUERY )
+			iCode = CodepointArbitrationQ ( iCode, bWasEscaped, *m_pCur );
+		else if ( m_bDetectSentences )
+			iCode = CodepointArbitrationI ( iCode );
+
+		// handle ignored chars
+		if ( iCode & FLAG_CODEPOINT_IGNORE ){
+			continue;
+		}
+
+		// handle blended characters
+		if( m_bHasBlend&& ( iCode & FLAG_CODEPOINT_BLEND ) )
+		{
+			if ( m_pBlendEnd )
+				iCode = 0;
+			else
+			{
+				m_bBlended = true;
+				m_pBlendStart = m_iAccum ? m_pTokenStart : pCur;
+			}
+		}
+
+		// handle soft-whitespace-only tokens
+		if ( !bGotNonToken && !m_iAccum )
+		{
+			if ( !bGotSoft )
+			{
+				// detect opening soft whitespace
+				if ( ( iCode==0 && !IsWhitespace ( iCodePoint ) && !IsPunctuation ( iCodePoint ) )
+						|| ( ( iCode & FLAG_CODEPOINT_BLEND ) && !m_iAccum ) )
+				{
+					bGotSoft = true;
+				}
+			} else
+			{
+				// detect closing hard whitespace or special
+				// (if there was anything meaningful in the meantime, we must never get past the outer if!)
+				if ( IsWhitespace ( iCodePoint ) || ( iCode & FLAG_CODEPOINT_SPECIAL ) )
+				{
+					m_iOvershortCount++;
+					bGotNonToken = true;
+				}
+			}
+		}
+
+		// handle whitespace and boundary
+		if ( m_bBoundary && ( iCode==0 ) )
+		{
+			m_bTokenBoundary = true;
+			m_iBoundaryOffset = pCur - m_pBuffer - 1;
+		}
+		m_bBoundary = ( iCode & FLAG_CODEPOINT_BOUNDARY )!=0;
+
+		// handle separator (aka, most likely a token!)
+		if ( iCode==0 || m_bBoundary )
+		{
+			FlushAccum ();
+
+			// suddenly, exceptions
+			if ( m_pExc && CheckException ( m_pTokenStart ? m_pTokenStart : pCur, pCur, IS_QUERY ) ){
+				return m_sAccum;
+			}
+
+			if( m_bHasBlend&& !BlendAdjust ( pCur ) ){
+				continue;
+			}
+
+
+			if ( m_iLastTokenLen<m_tSettings.m_iMinWordLen
+					&& !( m_bShortTokenFilter && ShortTokenFilter ( m_sAccum, m_iLastTokenLen ) ) )
+			{
+				if ( m_iLastTokenLen )
+					m_iOvershortCount++;
+				continue;
+			} else
+			{
+				m_pTokenEnd = pCur;
+				if( m_bHasBlend&& m_bBlended ){
+					return GetBlendedVariant();
+				}
+				return m_sAccum;
+			}
+		}
+
+		// handle specials
+		if ( iCode & FLAG_CODEPOINT_SPECIAL )
+		{
+			// skip short words preceding specials
+			if ( m_iAccum<m_tSettings.m_iMinWordLen )
+			{
+				m_sAccum[m_iAccum] = '\0';
+
+				if ( !m_bShortTokenFilter || !ShortTokenFilter ( m_sAccum, m_iAccum ) )
+				{
+					if ( m_iAccum )
+						m_iOvershortCount++;
+
+					FlushAccum ();
+				}
+			}
+
+			if ( m_iAccum==0 )
+			{
+				m_bNonBlended = m_bNonBlended || ( !( iCode & FLAG_CODEPOINT_BLEND ) && !( iCode & FLAG_CODEPOINT_SPECIAL ) );
+				m_bWasSpecial = !( iCode & FLAG_CODEPOINT_NGRAM );
+				m_pTokenStart = pCur;
+				m_pTokenEnd = m_pCur;
+				AccumCodepoint ( iCode & MASK_CODEPOINT ); // handle special as a standalone token
+			} else
+			{
+				m_pCur = pCur; // we need to flush current accum and then redo special char again
+				m_pTokenEnd = pCur;
+			}
+			FlushAccum ();
+
+			// suddenly, exceptions
+			if ( m_pExc && CheckException ( m_pTokenStart, pCur, IS_QUERY ) ){
+				return m_sAccum;
+			}
+			if( m_bHasBlend)
+			{
+				if ( !BlendAdjust ( pCur ) )
+					continue;
+				if ( m_bBlended )
+					return GetBlendedVariant();
+			}
+
+			return m_sAccum;
+		}
+
+		if ( m_iAccum==0 )
+			m_pTokenStart = pCur;
+
+		// tricky bit
+		// heading modifiers must not (!) affected blended status
+		// eg. we want stuff like '=-' (w/o apostrophes) thrown away when pure_blend is on
+
+		if( m_bHasBlend)
+			if_const (!( IS_QUERY && !m_iAccum && sphIsModifier ( iCode & MASK_CODEPOINT ) ) )
+				m_bNonBlended = m_bNonBlended || !( iCode & FLAG_CODEPOINT_BLEND );
+		// just accumulate
+		// manual inlining of utf8 encoder gives us a few extra percent
+		// which is important here, this is a hotspot
+		if ( m_iAccum<SPH_MAX_WORD_LEN && ( m_pAccum-m_sAccum+SPH_MAX_UTF8_BYTES<=(int)sizeof(m_sAccum) ) )
+		{
+			iCode &= MASK_CODEPOINT;
+			m_iAccum++;
+			// pass SCWS the bytes left up to m_pBufferMax; the buffer length comes from SetBuffer(), a NUL terminator is not guaranteed
+			scws_send_text(scws_source, (char*)m_pText, (int)( m_pBufferMax-m_pText ));
+			res = (cur = scws_get_result(scws_source)); // res keeps the list head for scws_free_result(), cur walks the words
+			if(cur == NULL){
+				FlushAccum();
+				return NULL;
+			}
+			// clamp the copy: cur->len is chosen by SCWS while m_sAccum is a fixed-size buffer
+			const int iLen = ( (int)cur->len<(int)sizeof(m_sAccum) ) ? (int)cur->len : (int)sizeof(m_sAccum)-1;
+			memcpy(m_sAccum, pCur+cur->off, iLen);
+			m_sAccum[iLen]='\0';
+			sphColumnToLowercase ( (char *)( m_sAccum ) );
+			m_pTokenStart = m_pText + cur->off;
+			m_pCur = m_pText + cur->off + cur->len;
+			m_pTokenEnd = m_pCur;
+
+			cur = cur->next;
+			if(cur == NULL){
+				m_iLastTokenLen = 0;
+				m_iAccum = 0;
+				scws_free_result(res);
+			}
+			return m_sAccum;
+		}
+	}
+}
+
+template < bool IS_QUERY >
+ISphTokenizer * CSphTokenizer_SCWS<IS_QUERY>::Clone ( ESphTokenizerClone eMode ) const
+{
+        if ( eMode!=SPH_CLONE_INDEX ) {
+                CSphTokenizer_SCWS<true> *pClone = new CSphTokenizer_SCWS<true>();
+                pClone->CloneBase ( this, eMode );
+		pClone->Setup(m_tSettings);
+                return pClone;
+        } else {
+                CSphTokenizer_SCWS<false> *pClone = new CSphTokenizer_SCWS<false>();
+                pClone->CloneBase ( this, eMode );
+		pClone->Setup(m_tSettings);
+                return pClone;
+        }
+}
+
+
+template < bool IS_QUERY >
+int CSphTokenizer_SCWS<IS_QUERY>::GetCodepointLength ( int iCode ) const
+{
+        if ( iCode<128 )
+                return 1;
+
+        int iBytes = 0;
+        while ( iCode & 0x80 )
+        {
+                iBytes++;
+                iCode <<= 1;
+        }
+
+        assert ( iBytes>=2 && iBytes<=4 );
+        return iBytes;
+}
+#endif
 //////////////////////////////////////////////////////////////////////////
 
 CSphMultiformTokenizer::CSphMultiformTokenizer ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer )
diff --git a/src/sphinx.h b/src/sphinx.h
index da4d6e570..388a63693 100644
--- a/src/sphinx.h
+++ b/src/sphinx.h
@@ -66,6 +66,10 @@
 #include <mysql.h>
 #endif
 
+#if USE_SCWS
+#include <scws/scws.h>
+#endif
+
 #if USE_WINDOWS
 typedef __int64				SphOffset_t;
 #define STDOUT_FILENO		fileno(stdout)
@@ -449,6 +453,7 @@ class CSphLowercaser
 
 	int					m_iChunks;					///< how much chunks are actually allocated
 	int *				m_pData;					///< chunks themselves
+public:
 	int *				m_pChunk [ CHUNK_COUNT ];	///< pointers to non-empty chunks
 };
 
@@ -496,6 +501,14 @@ struct CSphTokenizerSettings
 	CSphString			m_sNgramChars;
 	CSphString			m_sBlendChars;
 	CSphString			m_sBlendMode;
+        
+
+ #if USE_SCWS       
+        CSphString                      m_scwsDict;
+        CSphString                      m_scwsRule;
+        int                      m_scwsMulti;
+#endif      
+
 	CSphString			m_sIndexingPlugin;	///< this tokenizer wants an external plugin to process its raw output
 
 						CSphTokenizerSettings ();
diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp
index 883eaa614..459eb1912 100644
--- a/src/sphinxutils.cpp
+++ b/src/sphinxutils.cpp
@@ -591,6 +591,12 @@ static KeyDesc_t g_dKeysIndex[] =
 	{ "rlp_context",			0, NULL },
 	{ "ondisk_attrs",			0, NULL },
 	{ "index_token_filter",		0, NULL },
+#if USE_SCWS
+	{ "scws",		0, NULL },
+	{ "scws_dict",		0, NULL },
+	{ "scws_rule",		0, NULL },
+	{ "scws_multi",		0, NULL },
+#endif
 	{ NULL,						0, NULL }
 };
 
@@ -1274,7 +1280,15 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
 		else
 			sphWarning ( "ngram_chars specified, but ngram_len=0; IGNORED" );
 	}
-
+#if USE_SCWS
+	tSettings.m_scwsMulti = hIndex.GetInt ( "scws_multi",0 ); // read even without "scws": the value is saved with every index, so give it a defined default
+	if ( hIndex ( "scws" ) ) // presence of the "scws" key selects the SCWS tokenizer
+	{
+		tSettings.m_iType = TOKENIZER_SCWS;
+		tSettings.m_scwsDict = hIndex.GetStr ( "scws_dict" );
+		tSettings.m_scwsRule = hIndex.GetStr ( "scws_rule" );
+	}
+#endif
 	tSettings.m_sCaseFolding = hIndex.GetStr ( "charset_table" );
 	tSettings.m_iMinWordLen = Max ( hIndex.GetInt ( "min_word_len", 1 ), 1 );
 	tSettings.m_sNgramChars = hIndex.GetStr ( "ngram_chars" );
diff --git a/src/sphinxutils.h b/src/sphinxutils.h
index c56ca0fa2..3fdfd3820 100644
--- a/src/sphinxutils.h
+++ b/src/sphinxutils.h
@@ -147,6 +147,9 @@ enum
 	// where was TOKENIZER_SBCS=1 once
 	TOKENIZER_UTF8		= 2,
 	TOKENIZER_NGRAM	= 3
+#if USE_SCWS
+	,TOKENIZER_SCWS	= 4
+#endif
 };
 
 /// load config file

From 82fe60dc3fe35726c6fe99bdb29f43c2cbf07ecf Mon Sep 17 00:00:00 2001
From: hetao <hetao@talkweb.com.cn>
Date: Thu, 27 Apr 2017 16:30:06 +0800
Subject: [PATCH 2/4] fix memory leak

---
 src/sphinx.cpp | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index 474cfc169..280531d02 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -5296,7 +5296,9 @@ CSphTokenizer_SCWS<IS_QUERY>::CSphTokenizer_SCWS ()
 	CSphString sTmp;
 	SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp );
 	m_bHasBlend = false;
-	if(scws_global==NULL) scws_global = scws_new();
+	if(scws_global==NULL) {
+		scws_global = scws_new();
+	}
 }
 template < bool IS_QUERY >
 CSphTokenizer_SCWS<IS_QUERY>::~CSphTokenizer_SCWS ()
@@ -5308,8 +5310,8 @@ CSphTokenizer_SCWS<IS_QUERY>::~CSphTokenizer_SCWS ()
 template < bool IS_QUERY >
 void CSphTokenizer_SCWS<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength )
 {
-        // check that old one is over and that new length is sane
-        assert ( iLength>=0 );
+	// check that old one is over and that new length is sane
+	assert ( iLength>=0 );
 
 	// set buffer
 	m_pTokenStart = m_pTokenEnd = NULL;
@@ -5321,9 +5323,9 @@ void CSphTokenizer_SCWS<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength
 
 	m_iOvershortCount = 0;
 	m_bBoundary = m_bTokenBoundary = false;
-        
+
 	res = cur = NULL;
-        scws_send_text(scws_source, (char*)m_pText, iLength);
+	scws_send_text(scws_source, (char*)m_pText, iLength);
 }
 
 
@@ -5353,11 +5355,18 @@ BYTE * CSphTokenizer_SCWS<IS_QUERY>::GetToken ()
 		const BYTE * const pCur = m_pCur; // to redo special char, if there's a token already
 
 		if(cur !=NULL){
-			memcpy(m_sAccum, m_pText + cur->off, cur->len);
-			m_sAccum[cur->len]='\0';
-			sphColumnToLowercase ( (char *)( m_sAccum ) );
 			cur = cur->next;
-			return m_sAccum;
+			if(cur != NULL){
+				const int iLen = ( (int)cur->len<(int)sizeof(m_sAccum) ) ? (int)cur->len : (int)sizeof(m_sAccum)-1; // clamp: m_sAccum is fixed-size, cur->len is chosen by SCWS
+				memcpy(m_sAccum, m_pText + cur->off, iLen);
+				m_sAccum[iLen]='\0';
+				sphColumnToLowercase ( (char *)( m_sAccum ) );
+				return m_sAccum;
+			}else{
+				m_iLastTokenLen = 0;
+				m_iAccum = 0;
+				scws_free_result(res); // list exhausted; fall through to the per-codepoint scanning below
+			}
 		}
 		m_pText = m_pCur;
 
@@ -5582,12 +5591,6 @@ BYTE * CSphTokenizer_SCWS<IS_QUERY>::GetToken ()
 			m_pCur = m_pText + cur->off + cur->len;
 			m_pTokenEnd = m_pCur;
 
-			cur = cur->next;
-			if(cur == NULL){
-				m_iLastTokenLen = 0;
-				m_iAccum = 0;
-				scws_free_result(res);
-			}
 			return m_sAccum;
 		}
 	}

From 8a4bc2c51ec84332857b98066a563a063c8c2b6b Mon Sep 17 00:00:00 2001
From: hetao <hetao@talkweb.com.cn>
Date: Thu, 27 Apr 2017 18:06:33 +0800
Subject: [PATCH 3/4] increase index speed

---
 src/sphinx.cpp | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index 280531d02..b75d006d8 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -5345,6 +5345,25 @@ BYTE * CSphTokenizer_SCWS<IS_QUERY>::GetToken ()
 		m_bBlendedPart = ( m_pBlendEnd!=NULL );
 	}
 
+	if(!IS_QUERY){
+		if(cur == NULL){
+			res = (cur = scws_get_result(scws_source));
+			if(cur == NULL)
+				return NULL; // SCWS has no further words for the text sent in SetBuffer()
+		}
+		// indexing mode: emit SCWS words directly; clamp the copy because cur->len is chosen by SCWS and m_sAccum is fixed-size
+		const int iLen = ( (int)cur->len<(int)sizeof(m_sAccum) ) ? (int)cur->len : (int)sizeof(m_sAccum)-1;
+		memcpy(m_sAccum, m_pText + cur->off, iLen);
+		m_sAccum[iLen]='\0';
+		sphColumnToLowercase ( (char *)( m_sAccum ) );
+		m_iLastTokenLen = 0;
+		m_iAccum = 0;
+		cur = cur->next;
+		if(cur == NULL)
+			scws_free_result(res); // whole list consumed; the next call asks SCWS for another one
+		return m_sAccum;
+	}
+
 	bool bGotNonToken = ( !IS_QUERY || m_bPhrase ); // only do this in query mode, never in indexing mode, never within phrases
 	bool bGotSoft = false; // hey Beavis he said soft huh huhhuh
 

From 866eff1a84aa034dbf7cb1cdcf5b34bfe368855f Mon Sep 17 00:00:00 2001
From: hetao <hetao@hetao.name>
Date: Fri, 28 Apr 2017 14:24:00 +0800
Subject: [PATCH 4/4] support multi dict and xdb format

---
 src/sphinx.cpp | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index b75d006d8..9a29b75c8 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -2462,11 +2462,27 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2
 			scws_config_set=true;
 			if ( !tSettings.m_scwsDict.IsEmpty ()  )
 			{ 
-				scws_set_dict(scws_global, tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM);
+				// scws_dict may name several dictionaries, separated by spaces, tabs, commas or semicolons
+				CSphVector<CSphString> dicts;
+				sphSplit ( dicts, tSettings.m_scwsDict.cstr() ," \t,;");
+				ARRAY_FOREACH ( i, dicts)
+				{
+					// files not ending in ".xdb" additionally get the plain-text dictionary flag
+					int mode = SCWS_XDICT_MEM | SCWS_XDICT_XDB;
+					if (!dicts[i].Ends(".xdb"))
+						mode |= SCWS_XDICT_TXT;
+
+					const int ret = scws_add_dict(scws_global, dicts[i].cstr (), mode);
+					sphInfo("scws set dict [%s], mode [%d], ret [%d]",dicts[i].cstr (),mode,ret);
+					if ( ret<0 ) // a dictionary that failed to load must not pass as a mere info line
+						sphWarning ( "scws failed to load dict [%s]", dicts[i].cstr () );
+				}
+
 			}
 			if ( !tSettings.m_scwsRule.IsEmpty ())
 			{ 
-				scws_set_rule(scws_global, tSettings.m_scwsDict.cstr ());
+				scws_set_rule(scws_global, tSettings.m_scwsRule.cstr ());
+				sphInfo("scws set rule [%s]",tSettings.m_scwsRule.cstr ());
 			}
 			scws_set_charset(scws_global, "utf8");
 			scws_set_ignore(scws_global, true);
@@ -2475,6 +2491,7 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2
 			if ( tSettings.m_scwsMulti)
 			{ 
 				scws_set_multi(scws_global, tSettings.m_scwsMulti << 12);
+				sphInfo("scws set muliti[%d]",tSettings.m_scwsMulti);
 			}else{
 				scws_set_multi(scws_global, 0);
 			}