Skip to content

Commit 8b0e8c2

Browse files
authored
Reducing size (blugelabs#8)
* feat: add trunk compress for storedFields * feat: use zstd replace snappy * feat: compress docValue * feat: reback the docValue compress * feat: packed docNum and Offset for docValue * doc: update go.mod * feat: packed numeric of posting list * feat: compress numeric of posting list * feat: packed numeric of posting list * feat: compress intcoder * feat: run optimize on bitmap * feat: optimize document values chunk * doc: add author * style: change implement * fix: tests * test: add test for document coder * style: format code * update trunk to chunk * update sort of Authors * rename BufferSize to Size and remove Close method * update version * fix panic when search memory * rename variables
1 parent 09719ef commit 8b0e8c2

20 files changed

+575
-182
lines changed

AUTHORS

+1
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@
77
#
88
# Please keep the list sorted.
99

10+
Hengfei Yang <[email protected]>
1011
Marty Schoch <[email protected]>

contentcoder.go

+10-5
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@ import (
1818
"bytes"
1919
"encoding/binary"
2020
"io"
21-
22-
"github.com/golang/snappy"
2321
)
2422

2523
var termSeparator byte = 0xff
@@ -39,7 +37,7 @@ type chunkedContentCoder struct {
3937

4038
chunkMeta []metaData
4139

42-
compressed []byte // temp buf for snappy compression
40+
compressed []byte // temp buf for compression
4341
}
4442

4543
// metaData represents the data information inside a
@@ -107,18 +105,25 @@ func (c *chunkedContentCoder) flushContents() error {
107105
}
108106

109107
// write out the metaData slice
108+
diffDocNum := uint64(0)
109+
diffDvOffset := uint64(0)
110110
for _, meta := range c.chunkMeta {
111-
err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset)
111+
err = writeUvarints(&c.chunkMetaBuf, meta.DocNum-diffDocNum, meta.DocDvOffset-diffDvOffset)
112112
if err != nil {
113113
return err
114114
}
115+
diffDocNum = meta.DocNum
116+
diffDvOffset = meta.DocDvOffset
115117
}
116118

117119
// write the metadata to final data
118120
metaData := c.chunkMetaBuf.Bytes()
119121
c.final = append(c.final, c.chunkMetaBuf.Bytes()...)
120122
// write the compressed data to the final data
121-
c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes())
123+
c.compressed, err = ZSTDCompress(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes(), ZSTDCompressionLevel)
124+
if err != nil {
125+
return err
126+
}
122127
c.final = append(c.final, c.compressed...)
123128

124129
c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData))

contentcoder_test.go

+16-12
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,12 @@ func TestChunkedContentCoder(t *testing.T) {
3333
docNums: []uint64{0},
3434
vals: [][]byte{[]byte("bluge")},
3535
// 1 chunk, chunk-0 length 11(b), value
36-
expected: []byte{0x1, 0x0, 0x5, 0x5, 0x10, 'b', 'l', 'u', 'g', 'e',
37-
0xa,
38-
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1,
39-
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1},
36+
expected: []byte{
37+
0x1, 0x0, 0x5, 0x28, 0xb5, 0x2f, 0xfd, 0x4, 0x0, 0x29, 0x0, 0x0,
38+
'b', 'l', 'u', 'g', 'e',
39+
0x7e, 0xde, 0xed, 0x4a, 0x15, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
40+
0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1,
41+
},
4042
},
4143
{
4244
maxDocNum: 1,
@@ -47,11 +49,13 @@ func TestChunkedContentCoder(t *testing.T) {
4749
[]byte("scorch"),
4850
},
4951

50-
expected: []byte{0x1, 0x0, 0x6, 0x6, 0x14, 0x75, 0x70, 0x73, 0x69, 0x64,
51-
0x65, 0x1, 0x1, 0x6, 0x6, 0x14, 0x73, 0x63, 0x6f, 0x72, 0x63, 0x68,
52-
0xb, 0x16,
53-
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2,
54-
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2},
52+
expected: []byte{
53+
0x1, 0x0, 0x6, 0x28, 0xb5, 0x2f, 0xfd, 0x4, 0x0, 0x31, 0x0, 0x0,
54+
0x75, 0x70, 0x73, 0x69, 0x64, 0x65, 0x35, 0x89, 0x5a, 0xd,
55+
0x1, 0x1, 0x6, 0x28, 0xb5, 0x2f, 0xfd, 0x4, 0x0, 0x31, 0x0, 0x0,
56+
0x73, 0x63, 0x6f, 0x72, 0x63, 0x68, 0xc4, 0x46, 0x89, 0x39, 0x16, 0x2c,
57+
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2,
58+
},
5559
},
5660
}
5761

@@ -61,7 +65,7 @@ func TestChunkedContentCoder(t *testing.T) {
6165
for i, docNum := range test.docNums {
6266
err := cic.Add(docNum, test.vals[i])
6367
if err != nil {
64-
t.Fatalf("error adding to intcoder: %v", err)
68+
t.Fatalf("error adding to contentcoder: %v", err)
6569
}
6670
}
6771
_ = cic.Close()
@@ -98,11 +102,11 @@ func TestChunkedContentCoders(t *testing.T) {
98102
for i, docNum := range docNums {
99103
err := cic1.Add(docNum, vals[i])
100104
if err != nil {
101-
t.Fatalf("error adding to intcoder: %v", err)
105+
t.Fatalf("error adding to contentcoder: %v", err)
102106
}
103107
err = cic2.Add(docNum, vals[i])
104108
if err != nil {
105-
t.Fatalf("error adding to intcoder: %v", err)
109+
t.Fatalf("error adding to contentcoder: %v", err)
106110
}
107111
}
108112
_ = cic1.Close()

documentcoder.go

+139
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
package ice
2+
3+
import (
4+
"bytes"
5+
"encoding/binary"
6+
"io"
7+
)
8+
9+
const defaultDocumentChunkSize uint32 = 128
10+
11+
// chunkedDocumentCoder buffers document records, compresses them with ZSTD
// in groups ("chunks") of chunkSize documents, writes the compressed chunks
// to w, and records the cumulative compressed offset at every chunk boundary.
type chunkedDocumentCoder struct {
	chunkSize  uint64        // number of documents per compressed chunk
	w          io.Writer     // destination for compressed chunks and the offset table
	buf        *bytes.Buffer // uncompressed meta+data accumulated for the current chunk
	metaBuf    []byte        // scratch buffer for uvarint encoding
	n          uint64        // count of documents added so far
	bytes      uint64        // total compressed bytes written to w
	compressed []byte        // reusable output buffer for ZSTD compression
	offsets    []uint64      // cumulative compressed offset at each chunk boundary
}
21+
22+
func newChunkedDocumentCoder(chunkSize uint64, w io.Writer) *chunkedDocumentCoder {
23+
c := &chunkedDocumentCoder{
24+
chunkSize: chunkSize,
25+
w: w,
26+
}
27+
c.buf = bytes.NewBuffer(nil)
28+
c.metaBuf = make([]byte, binary.MaxVarintLen64)
29+
c.offsets = append(c.offsets, 0)
30+
return c
31+
}
32+
33+
func (c *chunkedDocumentCoder) Add(docNum uint64, meta, data []byte) (int, error) {
34+
var wn, n int
35+
var err error
36+
n = binary.PutUvarint(c.metaBuf, uint64(len(meta)))
37+
if n, err = c.writeToBuf(c.metaBuf[:n]); err != nil {
38+
return 0, err
39+
}
40+
wn += n
41+
n = binary.PutUvarint(c.metaBuf, uint64(len(data)))
42+
if n, err = c.writeToBuf(c.metaBuf[:n]); err != nil {
43+
return 0, err
44+
}
45+
wn += n
46+
if n, err = c.writeToBuf(meta); err != nil {
47+
return 0, err
48+
}
49+
wn += n
50+
if n, err = c.writeToBuf(data); err != nil {
51+
return 0, err
52+
}
53+
wn += n
54+
55+
return wn, c.newLine()
56+
}
57+
58+
func (c *chunkedDocumentCoder) writeToBuf(data []byte) (int, error) {
59+
return c.buf.Write(data)
60+
}
61+
62+
func (c *chunkedDocumentCoder) newLine() error {
63+
c.n++
64+
if c.n%c.chunkSize != 0 {
65+
return nil
66+
}
67+
return c.flush()
68+
}
69+
70+
func (c *chunkedDocumentCoder) flush() error {
71+
if c.buf.Len() > 0 {
72+
var err error
73+
c.compressed, err = ZSTDCompress(c.compressed[:cap(c.compressed)], c.buf.Bytes(), ZSTDCompressionLevel)
74+
if err != nil {
75+
return err
76+
}
77+
n, err := c.w.Write(c.compressed)
78+
if err != nil {
79+
return err
80+
}
81+
c.bytes += uint64(n)
82+
c.buf.Reset()
83+
}
84+
c.offsets = append(c.offsets, c.bytes)
85+
return nil
86+
}
87+
88+
func (c *chunkedDocumentCoder) Write() error {
89+
// flush first
90+
if err := c.flush(); err != nil {
91+
return err
92+
}
93+
var err error
94+
var wn, n int
95+
// write chunk offsets
96+
for _, offset := range c.offsets {
97+
n = binary.PutUvarint(c.metaBuf, offset)
98+
if _, err = c.w.Write(c.metaBuf[:n]); err != nil {
99+
return err
100+
}
101+
wn += n
102+
}
103+
// write chunk offset length
104+
err = binary.Write(c.w, binary.BigEndian, uint32(wn))
105+
if err != nil {
106+
return err
107+
}
108+
// write chunk num
109+
err = binary.Write(c.w, binary.BigEndian, uint32(len(c.offsets)))
110+
if err != nil {
111+
return err
112+
}
113+
return nil
114+
}
115+
116+
func (c *chunkedDocumentCoder) Reset() {
117+
c.compressed = c.compressed[:0]
118+
c.offsets = c.offsets[:0]
119+
c.n = 0
120+
c.bytes = 0
121+
c.buf.Reset()
122+
}
123+
124+
// Size returns the number of uncompressed bytes buffered for the current
// (not yet flushed) chunk.
func (c *chunkedDocumentCoder) Size() uint64 {
	return uint64(c.buf.Len())
}
128+
129+
// Len returns the number of entries in the offset table — one more than the
// number of flushed chunks, because the constructor seeds a leading 0 offset.
func (c *chunkedDocumentCoder) Len() int {
	return len(c.offsets)
}
133+
134+
// Offsets returns a copy of the chunk offset table, so callers cannot
// mutate the coder's internal slice. (The previous comment was copy-pasted
// from Len and described the wrong method.)
func (c *chunkedDocumentCoder) Offsets() []uint64 {
	m := make([]uint64, 0, len(c.offsets))
	m = append(m, c.offsets...)
	return m
}

documentcoder_test.go

+124
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
package ice
2+
3+
import (
4+
"bytes"
5+
"testing"
6+
)
7+
8+
// TestChunkedDocumentCoder verifies the exact on-disk encoding produced by
// chunkedDocumentCoder: ZSTD-compressed chunk bodies followed by the
// uvarint offset table and the two big-endian uint32 trailers, plus the
// expected offset-table length reported by Len().
func TestChunkedDocumentCoder(t *testing.T) {
	tests := []struct {
		chunkSize        uint64
		docNums          []uint64
		metas            [][]byte
		datas            [][]byte
		expected         []byte // golden bytes: compressed chunks + offsets + trailers
		expectedChunkNum int    // expected Len(): offset entries, not documents
	}{
		{
			chunkSize: 1,
			docNums:   []uint64{0},
			metas:     [][]byte{{0}},
			datas:     [][]byte{[]byte("bluge")},
			expected: []byte{
				0x28, 0xb5, 0x2f, 0xfd, 0x4, 0x0, 0x41,
				0x0, 0x0, 0x1, 0x5, 0x0, 0x62, 0x6c, 0x75, 0x67, 0x65, 0x2b, 0x30, 0x97, 0x33, 0x0, 0x15, 0x15,
				0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x3,
			},
			expectedChunkNum: 3, // left, chunk, right
		},
		{
			chunkSize: 1,
			docNums:   []uint64{0, 1},
			metas:     [][]byte{{0}, {1}},
			datas:     [][]byte{[]byte("upside"), []byte("scorch")},
			expected: []byte{
				0x28, 0xb5, 0x2f, 0xfd, 0x4, 0x0, 0x49,
				0x0, 0x0, 0x1, 0x6, 0x0, 0x75, 0x70, 0x73, 0x69, 0x64, 0x65,
				0x36, 0x6e, 0x7e, 0x39, 0x28, 0xb5, 0x2f, 0xfd, 0x4, 0x0, 0x49,
				0x0, 0x0, 0x1, 0x6, 0x1, 0x73, 0x63, 0x6f, 0x72, 0x63, 0x68,
				0x8f, 0x83, 0xa3, 0x37, 0x0, 0x16, 0x2c, 0x2c,
				0x0, 0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x4,
			},
			expectedChunkNum: 4, // left, chunk, chunk, right
		},
	}

	for _, test := range tests {
		var actual bytes.Buffer
		cic := newChunkedDocumentCoder(test.chunkSize, &actual)
		for i, docNum := range test.docNums {
			_, err := cic.Add(docNum, test.metas[i], test.datas[i])
			if err != nil {
				t.Fatalf("error adding to documentcoder: %v", err)
			}
		}
		// Write flushes the final chunk and emits the offset table.
		err := cic.Write()
		if err != nil {
			t.Fatalf("error writing: %v", err)
		}
		if !bytes.Equal(test.expected, actual.Bytes()) {
			t.Errorf("got:%s, expected:%s", actual.String(), string(test.expected))
		}
		if test.expectedChunkNum != cic.Len() {
			t.Errorf("got:%d, expected:%d", cic.Len(), test.expectedChunkNum)
		}
	}
}
67+
68+
// TestChunkedDocumentCoders feeds two independent coders identical input
// and checks that their output is byte-identical and that both report the
// same offset-table length — i.e. the encoding is deterministic.
func TestChunkedDocumentCoders(t *testing.T) {
	chunkSize := uint64(2)
	docNums := []uint64{0, 1, 2, 3, 4, 5}
	metas := [][]byte{
		{0},
		{1},
		{2},
		{3},
		{4},
		{5},
	}
	datas := [][]byte{
		[]byte("scorch"),
		[]byte("does"),
		[]byte("better"),
		[]byte("than"),
		[]byte("upside"),
		[]byte("down"),
	}
	chunkNum := 5 // left, chunk, chunk, chunk, right

	var actual1, actual2 bytes.Buffer
	// first chunkedDocumentCoder
	cic1 := newChunkedDocumentCoder(chunkSize, &actual1)
	// second coder, fed identically, to cross-check deterministic output
	cic2 := newChunkedDocumentCoder(chunkSize, &actual2)

	for i, docNum := range docNums {
		_, err := cic1.Add(docNum, metas[i], datas[i])
		if err != nil {
			t.Fatalf("error adding to documentcoder: %v", err)
		}
		_, err = cic2.Add(docNum, metas[i], datas[i])
		if err != nil {
			t.Fatalf("error adding to documentcoder: %v", err)
		}
	}

	err := cic1.Write()
	if err != nil {
		t.Fatalf("error writing: %v", err)
	}
	err = cic2.Write()
	if err != nil {
		t.Fatalf("error writing: %v", err)
	}

	if !bytes.Equal(actual1.Bytes(), actual2.Bytes()) {
		t.Errorf("%s != %s", actual1.String(), actual2.String())
	}
	if chunkNum != cic1.Len() {
		t.Errorf("got:%d, expected:%d", cic1.Len(), chunkNum)
	}
	if chunkNum != cic2.Len() {
		t.Errorf("got:%d, expected:%d", cic2.Len(), chunkNum)
	}
}

0 commit comments

Comments
 (0)