texar-pytorch/tests/data/embedding_test.py at 1e4416fc0ba8838e7bc3352dd9f5b157d2fc2ff7 · asyml/texar-pytorch · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# Copyright 2019 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Unit tests for embedding related operations.
"""

import sys
import tempfile
import unittest

import numpy as np

from texar.torch.data import embedding

# True when the running interpreter's major version is 3.
Py3 = sys.version_info.major == 3


class EmbeddingTest(unittest.TestCase):
    """Tests embedding related operations.

    The loader tests write a tiny embedding file (one ASCII and one
    non-ASCII token) to a temporary file, load it through the matching
    function in ``embedding`` and check the returned matrix.
    """

    def test_load_glove(self):
        """Tests the load_glove function.
        """
        word_vec_lines = ["word 1.2 3.4 5.6", "词 1. 3. 5."]
        vocab = {"word": 0, "词": 1}
        word_vecs = np.zeros([2, 3])

        # The encoding is pinned to UTF-8 so that writing the non-ASCII token
        # does not depend on the locale's default encoding.
        # NOTE(review): presumably load_glove decodes the file as UTF-8 --
        # confirm against the loader.
        # The context manager closes (and thereby removes) the temporary file
        # even if loading raises.
        with tempfile.NamedTemporaryFile(
                mode="w+", encoding="utf-8") as glove_file:
            glove_file.write('\n'.join(word_vec_lines))
            # Flush so the loader, which re-opens the file by name, sees the
            # complete content.
            glove_file.flush()
            word_vecs = embedding.load_glove(
                glove_file.name, vocab, word_vecs)

        self.assertEqual(word_vecs.shape[0], 2)
        self.assertEqual(word_vecs.shape[1], 3)
        np.testing.assert_array_equal(word_vecs[0], [1.2, 3.4, 5.6])
        np.testing.assert_array_equal(word_vecs[1], [1., 3., 5.])

    def test_load_word2vec(self):
        """Tests the load_word2vec function.
        """
        header = "2 3"
        words = ["word", "词"]
        vec = np.array([1.2, 3.4, 5.6], dtype='float32')
        vocab = {"word": 0, "词": 1}
        word_vecs = np.zeros([2, 3])

        # File layout written here: a text header line, then for every word
        # the UTF-8 token, a space, the raw float32 bytes of ``vec`` and a
        # newline.
        with tempfile.NamedTemporaryFile() as w2v_file:
            w2v_file.write((header + "\n").encode('utf-8'))
            for word in words:
                w2v_file.write((word + " ").encode('utf-8'))
                # ndarray.tostring() was deprecated in NumPy 1.19 and removed
                # in NumPy 2.0; tobytes() yields the identical byte string.
                w2v_file.write(vec.tobytes() + b'\n')
            # Flush so the loader, which re-opens the file by name, sees the
            # complete content.
            w2v_file.flush()
            word_vecs = embedding.load_word2vec(
                w2v_file.name, vocab, word_vecs)

        self.assertEqual(word_vecs.shape[0], 2)
        self.assertEqual(word_vecs.shape[1], 3)
        # Both words were written with the same vector.
        np.testing.assert_array_equal(word_vecs[0], vec)
        np.testing.assert_array_equal(word_vecs[1], vec)

    def test_embedding(self):
        """Tests :class:`texar.torch.data.embedding.Embedding`.
        """
        vocab = {"word": 0, "词": 1}
        emb = embedding.Embedding(vocab)
        # One embedding vector is expected per vocabulary entry.
        self.assertEqual(len(emb.word_vecs), len(vocab))


# Run the test cases defined in this module when the file is executed directly
# as a script (not when it is imported).
if __name__ == "__main__":
    unittest.main()