Skip to content

Commit

Permalink
Merge branch 'ajay/fts_special' into 'master'
Browse files Browse the repository at this point in the history
Simplify handling of FTS special chars

See merge request Plasticity/magnitude!3
  • Loading branch information
AjayP13 committed Mar 3, 2018
2 parents 29e39e3 + 1792940 commit 7283948
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 48 deletions.
21 changes: 9 additions & 12 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,6 @@ before_script:
- apt-get install python-dev -y
- apt-get install python3-dev -y
- apt-get install openssh-server -y
# Add GitLab SSH private deploy key
- eval $(ssh-agent -s)
- tmpfile=$(mktemp ~/pk.XXXXXX)
- echo "$SSH_PRIVATE_KEY" > $tmpfile
- ssh-add $tmpfile
- rm $tmpfile
# Set up SSH configuration
- mkdir -p ~/.ssh
- echo -e "Host *\n\tStrictHostKeyChecking no\n\n" > ~/.ssh/config
Expand All @@ -51,9 +45,6 @@ Test Python 2:
- ls
- python2 -m tests.tests -i GoogleNews-vectors-negative300.magnitude -s GoogleNews-vectors-negative300.subword.magnitude -a GoogleNews-vectors-negative300.approx.magnitude -- -v

only:
- master

Test Python 3:
stage: Test Python 3
script:
Expand All @@ -73,12 +64,16 @@ Test Python 3:
- ls
- python3 -m tests.tests -i GoogleNews-vectors-negative300.magnitude -s GoogleNews-vectors-negative300.subword.magnitude -a GoogleNews-vectors-negative300.approx.magnitude -- -v

only:
- master

Deploy to PyPI:
stage: Deploy to PyPI
script:
# Add GitLab SSH private deploy key
- eval $(ssh-agent -s)
- tmpfile=$(mktemp ~/pk.XXXXXX)
- echo "$SSH_PRIVATE_KEY" > $tmpfile
- ssh-add $tmpfile
- rm $tmpfile
# Tag the release on GitLab
- rm -rf ../tagger
- mkdir -p ../tagger
- cd ../tagger
Expand All @@ -91,7 +86,9 @@ Deploy to PyPI:
- cd $CI_PROJECT_DIR
- rm -rf ../tagger
- sleep 60 # Wait for GitLab to mirror to GitHub
# Create a release on GitHub
- curl -u plasticity-admin:$GITHUB_TOKEN -d "{\"tag_name\":\"$(python setup.py -V)\", \"name\":\"Release $(python setup.py -V)\"}" -H "Content-Type:"" application/json" -X POST https://api.github.com/repos/plasticityai/$CI_PROJECT_NAME/releases
# Upload to PyPI
- envsubst < deployment/.pypirc > ~/.pypirc
- chmod 600 ~/.pypirc
- python setup.py sdist upload -r pypitest
Expand Down
54 changes: 19 additions & 35 deletions pymagnitude/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,13 @@ def _oov_key_t(self, key):

def _db_query_similar_keys_vector(self, key, orig_key, topn = 3):
"""Finds similar keys in the database and gets the mean vector."""
# Escape text for embedding inside a single-quoted SQL string literal by
# doubling every single quote. Used below for the one-character prefix and
# suffix that are formatted into the LIKE '...%' / LIKE '%...' clauses.
# Only the quote character is handled: the LIKE wildcards % and _ are
# returned unchanged.
def _sql_escape_single(s):
return s.replace("'", "''")

# Prepare one n-gram for use inside a double-quoted phrase of the
# `char_ngrams MATCH ?` parameter. Two steps, in this order:
#   1. every character found in Magnitude.FTS_SPECIAL gets a single
#      backslash placed in front of it ("\\" is one backslash character);
#   2. every double quote in the result is doubled.
# Callers wrap the returned text in "..." and join the phrases with ' OR '.
# NOTE(review): Magnitude.FTS_SPECIAL is defined outside this view, and
# whether the FTS tokenizer/query parser treats the backslash as an escape
# is not shown here -- confirm against the SQLite FTS documentation.
def _sql_escape_fts(s):
return ''.join("\\"+c if c in Magnitude.FTS_SPECIAL
else c for c in s).replace('"', '""')

if self.subword:
current_subword_start = self.subword_end
BOW_length = len(Magnitude.BOW)
Expand All @@ -413,21 +420,21 @@ def _db_query_similar_keys_vector(self, key, orig_key, topn = 3):
exact_match = []
if true_key_len <= 6:
beginning_and_end_clause = """
magnitude.key LIKE "{0}%"
magnitude.key LIKE '{0}%'
AND LENGTH(magnitude.key) <= {2} DESC,
magnitude.key LIKE "%{1}"
magnitude.key LIKE '%{1}'
AND LENGTH(magnitude.key) <= {2} DESC,"""
beginning_and_end_clause = beginning_and_end_clause.format(
key[BOW_length:BOW_length+1].replace("'", "''"),
key[-EOW_length-1:-EOW_length].replace("'", "''"),
_sql_escape_single(key[BOW_length:BOW_length+1]),
_sql_escape_single(key[-EOW_length-1:-EOW_length]),
str(true_key_len))
if true_key_len <= 5 and key_shrunk != key:
exact_match = list(char_ngrams(
key_shrunk, true_key_len, true_key_len))
search_query = """
SELECT magnitude.*
FROM magnitude_subword, magnitude
WHERE char_ngrams {0}
WHERE char_ngrams MATCH ?
AND magnitude.rowid = magnitude_subword.rowid
ORDER BY
(
Expand All @@ -439,44 +446,21 @@ def _db_query_similar_keys_vector(self, key, orig_key, topn = 3):
LIMIT ?;
"""
if len(exact_match) > 0:
# Handle fts3 special characters
if any((c in Magnitude.FTS_SPECIAL)
for c in ''.join(exact_match)):
q = search_query.format(
'IN (' + ', '.join('?' * len(exact_match)) + ')')
params = exact_match + [topn]
else:
q = search_query.format('MATCH ?')
params = (' OR '.join('"{0}"'.format(
e.replace('"', '""')) for e in exact_match), topn)
results = self._db().execute(q, params).fetchall()
params = (' OR '.join('"{0}"'.format(_sql_escape_fts(e))
for e in exact_match), topn)
results = self._db().execute(search_query, params).fetchall()
else:
results = []
if len(results) == 0:
while (len(results) < topn and
current_subword_start >= self.subword_start):
ngrams = list(char_ngrams(
key, current_subword_start, self.subword_end))
# Handle fts3 special characters
if any((c in Magnitude.FTS_SPECIAL)
for c in ''.join(ngrams)):
q = search_query.format(
'IN (' + ', '.join('?' * len(ngrams)) + ')')
params = ngrams + [topn]
else:
q = search_query.format('MATCH ?')
params = (' OR '.join('"{0}"'.format(
n.replace('"', '""')) for n in ngrams), topn)
results = self._db().execute(q, params).fetchall()
params = (' OR '.join('"{0}"'.format(_sql_escape_fts(n))
for n in ngrams), topn)
results = self._db().execute(search_query,
params).fetchall()
current_subword_start -= 1
# if current_subword_start > self.subword_start:
# results = self._db().execute(search_query,
# ('(' + ') AND ('.join([
# ' OR '.join(char_ngrams(key,
# current_subword_start, self.subword_end)),
# ' OR '.join(char_ngrams(key,
# self.subword_start, self.subword_start))
# ]) + ')', topn)).fetchall()
else:
results = self._db().execute(
"""
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
setup(
name='pymagnitude',
packages=find_packages(exclude=['tests', 'tests.*']),
version='0.1.5',
version='0.1.6',
description='A fast, efficient universal vector embedding utility package.',
long_description="""
About
Expand Down

0 comments on commit 7283948

Please sign in to comment.