Skip to content

Commit c25c04e

Browse files
committed
More speed
1 parent 4a8bd46 commit c25c04e

File tree

2 files changed

+170
-12
lines changed

2 files changed

+170
-12
lines changed

pinecone/grpc/utils.py

Lines changed: 66 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from __future__ import annotations
22

33
from typing import Any, TYPE_CHECKING
4-
from google.protobuf import json_format
54
from google.protobuf.message import Message
65

76
import uuid
@@ -56,6 +55,61 @@ def dict_to_proto_struct(d: dict | None) -> "Struct":
5655
return s
5756

5857

58+
def _struct_to_dict(struct: "Struct") -> dict[str, Any]:
59+
"""Convert a protobuf Struct to dict by directly accessing fields.
60+
61+
This optimized version is ~2x faster than json_format.MessageToDict
62+
by avoiding JSON serialization/deserialization overhead.
63+
64+
Args:
65+
struct: A protobuf Struct message.
66+
67+
Returns:
68+
Dictionary representation of the Struct.
69+
"""
70+
71+
result: dict[str, Any] = {}
72+
for key, value in struct.fields.items():
73+
# Directly access the Value fields based on which one is set
74+
if value.HasField("null_value"):
75+
result[key] = None
76+
elif value.HasField("number_value"):
77+
result[key] = value.number_value
78+
elif value.HasField("string_value"):
79+
result[key] = value.string_value
80+
elif value.HasField("bool_value"):
81+
result[key] = value.bool_value
82+
elif value.HasField("struct_value"):
83+
result[key] = _struct_to_dict(value.struct_value)
84+
elif value.HasField("list_value"):
85+
# Convert ListValue to Python list
86+
list_result = []
87+
for item in value.list_value.values:
88+
if item.HasField("null_value"):
89+
list_result.append(None)
90+
elif item.HasField("number_value"):
91+
list_result.append(item.number_value)
92+
elif item.HasField("string_value"):
93+
list_result.append(item.string_value)
94+
elif item.HasField("bool_value"):
95+
list_result.append(item.bool_value)
96+
elif item.HasField("struct_value"):
97+
list_result.append(_struct_to_dict(item.struct_value))
98+
elif item.HasField("list_value"):
99+
# Nested lists
100+
nested_list = []
101+
for nested_item in item.list_value.values:
102+
if nested_item.HasField("number_value"):
103+
nested_list.append(nested_item.number_value)
104+
elif nested_item.HasField("string_value"):
105+
nested_list.append(nested_item.string_value)
106+
elif nested_item.HasField("bool_value"):
107+
nested_list.append(nested_item.bool_value)
108+
list_result.append(nested_list)
109+
result[key] = list_result
110+
return result
111+
112+
59113
def parse_sparse_values(sparse_values: dict | None) -> SparseValues:
60114
from typing import cast
61115

@@ -76,33 +130,33 @@ def parse_fetch_response(
76130
"""
77131
# Extract response info from initial metadata
78132
from pinecone.utils.response_info import extract_response_info
133+
from pinecone.db_data.dataclasses import SparseValues
79134

80135
metadata = initial_metadata or {}
81136
response_info = extract_response_info(metadata)
82137

83138
# Directly access protobuf fields instead of converting entire message to dict
139+
vectors = response.vectors
84140
vd = {}
85141
# namespace is a required string field, so it will always have a value (default empty string)
86142
namespace = response.namespace
87143

88144
# Iterate over vectors map directly
89-
for vec_id, vec in response.vectors.items():
145+
for vec_id, vec in vectors.items():
90146
# Convert vector.values (RepeatedScalarFieldContainer) to list
91147
values = list(vec.values) if vec.values else []
92148

93149
# Handle sparse_values if present (check if field is set and not empty)
94150
parsed_sparse = None
95151
if vec.HasField("sparse_values") and vec.sparse_values:
96-
from pinecone.db_data.dataclasses import SparseValues
97-
98152
parsed_sparse = SparseValues(
99153
indices=list(vec.sparse_values.indices), values=list(vec.sparse_values.values)
100154
)
101155

102-
# Convert metadata Struct to dict only when needed
156+
# Convert metadata Struct to dict only when needed using optimized conversion
103157
metadata_dict = None
104158
if vec.HasField("metadata") and vec.metadata:
105-
metadata_dict = json_format.MessageToDict(vec.metadata)
159+
metadata_dict = _struct_to_dict(vec.metadata)
106160

107161
vd[vec_id] = Vector(
108162
id=vec.id, values=values, sparse_values=parsed_sparse, metadata=metadata_dict
@@ -152,10 +206,10 @@ def parse_fetch_by_metadata_response(
152206
}
153207
)
154208

155-
# Convert metadata Struct to dict only when needed
209+
# Convert metadata Struct to dict only when needed using optimized conversion
156210
metadata_dict = None
157211
if vec.HasField("metadata") and vec.metadata:
158-
metadata_dict = json_format.MessageToDict(vec.metadata)
212+
metadata_dict = _struct_to_dict(vec.metadata)
159213

160214
vd[vec_id] = _Vector(
161215
id=vec.id,
@@ -289,9 +343,9 @@ def query_response_to_dict(response: "ProtoQueryResponse") -> dict[str, Any]:
289343
"values": list(match.sparse_values.values),
290344
}
291345

292-
# Convert metadata if present
346+
# Convert metadata if present using optimized conversion
293347
if match.HasField("metadata") and match.metadata:
294-
match_dict["metadata"] = json_format.MessageToDict(match.metadata)
348+
match_dict["metadata"] = _struct_to_dict(match.metadata)
295349

296350
result["matches"].append(match_dict)
297351

@@ -342,10 +396,10 @@ def parse_query_response(
342396
indices=list(match.sparse_values.indices), values=list(match.sparse_values.values)
343397
)
344398

345-
# Convert metadata Struct to dict only when needed
399+
# Convert metadata Struct to dict only when needed using optimized conversion
346400
metadata_dict = None
347401
if match.HasField("metadata") and match.metadata:
348-
metadata_dict = json_format.MessageToDict(match.metadata)
402+
metadata_dict = _struct_to_dict(match.metadata)
349403

350404
sc = ScoredVector(
351405
id=match.id,
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
"""Performance tests for parse_fetch_response optimizations.
2+
3+
This test measures the performance impact of optimizations to parse_fetch_response,
4+
specifically the _struct_to_dict optimization vs json_format.MessageToDict.
5+
"""
6+
7+
import random
8+
import pytest
9+
from google.protobuf import struct_pb2
10+
11+
from pinecone.core.grpc.protos.db_data_2025_10_pb2 import FetchResponse, Vector, Usage
12+
from pinecone.grpc.utils import parse_fetch_response, _struct_to_dict
13+
from google.protobuf import json_format
14+
15+
16+
def create_vector_with_metadata(id: str, dimension: int, metadata_size: int = 2) -> Vector:
    """Build a Vector protobuf message carrying random values and metadata.

    For every index below ``metadata_size`` a string entry is added; indices
    divisible by 3 also get a numeric entry, and indices with remainder 1 get
    a boolean entry.
    """
    embedding = [random.random() for _ in range(dimension)]

    # Assemble the metadata as a plain dict first, then load it into a Struct.
    fields = {}
    for idx in range(metadata_size):
        fields[f"key_{idx}"] = f"value_{random.randint(1, 100)}"
        remainder = idx % 3
        if remainder == 0:
            fields[f"num_{idx}"] = random.random()
        elif remainder == 1:
            fields[f"bool_{idx}"] = random.choice([True, False])

    payload = struct_pb2.Struct()
    payload.update(fields)

    return Vector(id=id, values=embedding, metadata=payload)
32+
33+
34+
def create_fetch_response_with_metadata(
    num_vectors: int, dimension: int, metadata_size: int = 2
) -> FetchResponse:
    """Build a FetchResponse holding ``num_vectors`` vectors, each with metadata.

    Vectors are keyed (and identified) as ``vec_0`` .. ``vec_{num_vectors - 1}``;
    the usage record reports one read unit per vector.
    """
    vector_ids = (f"vec_{i}" for i in range(num_vectors))
    by_id = {
        vector_id: create_vector_with_metadata(vector_id, dimension, metadata_size)
        for vector_id in vector_ids
    }
    usage = Usage(read_units=num_vectors)
    return FetchResponse(vectors=by_id, namespace="test_namespace", usage=usage)
46+
47+
48+
class TestFetchResponseOptimization:
    """Benchmarks covering parse_fetch_response and its metadata conversion."""

    @pytest.mark.parametrize(
        "num_vectors,dimension,metadata_size",
        # Same six cases as a literal table: 10/100/1000 vectors x 2/10 metadata fields.
        [(count, 128, size) for count in (10, 100, 1000) for size in (2, 10)],
    )
    def test_parse_fetch_response_with_metadata(
        self, benchmark, num_vectors, dimension, metadata_size
    ):
        """Time parse_fetch_response on responses whose vectors carry metadata."""
        fetch_response = create_fetch_response_with_metadata(
            num_vectors, dimension, metadata_size
        )
        benchmark(parse_fetch_response, fetch_response, None)

    def test_struct_to_dict_vs_message_to_dict(self, benchmark):
        """Time _struct_to_dict and check it agrees with json_format.MessageToDict."""
        # One field of each value kind exercised by the converter.
        sample = {
            "string_field": "test_value",
            "number_field": 123.456,
            "bool_field": True,
            "list_field": [1, 2, 3, "four", 5.0],
            "nested": {"inner": "value", "num": 42},
        }
        struct = struct_pb2.Struct()
        struct.update(sample)

        # The benchmark fixture returns the benchmarked call's result.
        result_optimized = benchmark(_struct_to_dict, struct)

        # Reference output from the standard converter.
        result_standard = json_format.MessageToDict(struct)
        assert result_optimized == result_standard, "Results don't match!"

    @pytest.mark.parametrize("num_fields", [1, 5, 10, 20, 50])
    def test_struct_to_dict_scaling(self, benchmark, num_fields):
        """Time _struct_to_dict as the number of metadata fields grows."""
        fields = {}
        for idx in range(num_fields):
            fields[f"key_{idx}"] = f"value_{idx}"
            if idx % 2:
                continue
            fields[f"num_{idx}"] = float(idx)

        struct = struct_pb2.Struct()
        struct.update(fields)

        converted = benchmark(_struct_to_dict, struct)

        # One string entry per index, plus one numeric entry per even index
        # (0, 2, 4, ...), i.e. (num_fields + 1) // 2 of them.
        even_count = (num_fields + 1) // 2
        assert len(converted) == num_fields + even_count

0 commit comments

Comments
 (0)