Skip to content

Commit 7711eb1

Browse files
committed
Retry on mongos operation failures
1 parent 2e2b5a1 commit 7711eb1

File tree

5 files changed

+167
-8
lines changed

5 files changed

+167
-8
lines changed

lib/mongo/client.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,10 @@ def hash
151151
# @option options [ Logger ] :logger A custom logger if desired.
152152
# @option options [ true, false ] :truncate_logs Whether to truncate the
153153
# logs at the default 250 characters.
154+
# @option options [ Integer ] :max_read_retries The maximum number of read
155+
# retries on mongos query failures.
156+
# @option options [ Float ] :read_retry_interval The interval, in seconds,
157+
# in which reads on a mongos are retried.
154158
#
155159
# @since 2.0.0
156160
def initialize(addresses_or_uri, options = Options::Redacted.new)

lib/mongo/cluster.rb

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,16 @@ class Cluster
2525
include Event::Subscriber
2626
include Loggable
2727

28+
# The default number of mongos read retries.
29+
#
30+
# @since 2.1.1
31+
MAX_READ_RETRIES = 1
32+
33+
# The default mongos read retry interval, in seconds.
34+
#
35+
# @since 2.1.1
36+
READ_RETRY_INTERVAL = 5
37+
2838
# @return [ Hash ] The options hash.
2939
attr_reader :options
3040

@@ -143,6 +153,32 @@ def elect_primary!(description)
143153
@topology = topology.elect_primary(description, servers_list)
144154
end
145155

156+
# Get the maximum number of times the cluster can retry a read operation on
157+
# a mongos.
158+
#
159+
# @example Get the max read retries.
160+
# cluster.max_read_retries
161+
#
162+
# @return [ Integer ] The maximum retries.
163+
#
164+
# @since 2.1.1
165+
def max_read_retries
166+
options[:max_read_retries] || MAX_READ_RETRIES
167+
end
168+
169+
# Get the interval, in seconds, in which a mongos read operation is
170+
# retried.
171+
#
172+
# @example Get the read retry interval.
173+
# cluster.read_retry_interval
174+
#
175+
# @return [ Float ] The interval.
176+
#
177+
# @since 2.1.1
178+
def read_retry_interval
179+
options[:read_retry_interval] || READ_RETRY_INTERVAL
180+
end
181+
146182
# Notify the cluster that a standalone server was discovered so that the
147183
# topology can be updated accordingly.
148184
#

lib/mongo/error/operation_failure.rb

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,35 @@ class Error
1818
# Raised when an operation fails for some reason.
1919
#
2020
# @since 2.0.0
21-
class OperationFailure < Error; end
21+
class OperationFailure < Error
22+
23+
# These are magic error messages that could indicate a cluster
24+
# reconfiguration behind a mongos. We cannot check error codes as they
25+
# change between versions, for example 15988 which has 2 completely
26+
# different meanings between 2.4 and 3.0.
27+
#
28+
# @since 2.1.1
29+
RETRY_MESSAGES = [
30+
'transport error',
31+
'socket exception',
32+
"can't connect",
33+
'no master',
34+
'not master',
35+
'connect failed',
36+
'error querying'
37+
].freeze
38+
39+
# Can the operation that caused the error be retried?
40+
#
41+
# @example Is the error retryable?
42+
# error.retryable?
43+
#
44+
# @return [ true, false ] If the error is retryable.
45+
#
46+
# @since 2.1.1
47+
def retryable?
48+
RETRY_MESSAGES.any?{ |m| message.include?(m) }
49+
end
50+
end
2251
end
2352
end

lib/mongo/retryable.rb

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,28 +26,44 @@ module Retryable
2626

2727
# Execute a read operation with a retry.
2828
#
29+
# @api private
30+
#
2931
# @example Execute the read.
3032
# read_with_retry do
3133
# ...
3234
# end
3335
#
3436
# @note This retries read operations on socket errors and, when connected to a mongos, on retryable operation failures.
3537
#
38+
# @param [ Integer ] attempt The retry attempt count - for internal use.
3639
# @param [ Proc ] block The block to execute.
3740
#
3841
# @return [ Result ] The result of the operation.
3942
#
4043
# @since 2.1.0
41-
def read_with_retry(&block)
44+
def read_with_retry(attempt = 0, &block)
  # +attempt+ counts the retries already performed for operation failures;
  # external callers leave it at the default of 0.
  block.call
rescue Error::SocketError, Error::SocketTimeoutError
  retry_operation(&block)
rescue Error::OperationFailure => e
  # Only a retryable failure on a sharded cluster (mongos) is retried, and
  # only while the retry budget is not exhausted. Every other case re-raises
  # so the caller sees the failure instead of a silent nil result.
  raise e unless cluster.sharded? && e.retryable? && attempt < max_read_retries

  # We don't scan the cluster in this case as mongos always returns ready
  # after a ping, no matter what the state behind it is.
  sleep(read_retry_interval)

  # Count upwards so that the recursion is bounded by max_read_retries.
  read_with_retry(attempt + 1, &block)
end
4862

4963
# Execute a write operation with a retry.
5064
#
65+
# @api private
66+
#
5167
# @example Execute the write.
5268
# write_with_retry do
5369
# ...

spec/mongo/retryable_spec.rb

Lines changed: 80 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@ def initialize(operation, cluster)
1414
@cluster = cluster
1515
end
1616

17+
def max_read_retries
18+
cluster.max_read_retries
19+
end
20+
21+
def read_retry_interval
22+
cluster.read_retry_interval
23+
end
24+
1725
def read
1826
read_with_retry do
1927
operation.execute
@@ -81,14 +89,80 @@ def write
8189

8290
context 'when an operation failure occurs' do
8391

84-
before do
85-
expect(operation).to receive(:execute).and_raise(Mongo::Error::OperationFailure).ordered
92+
context 'when the cluster is not a mongos' do
93+
94+
before do
95+
expect(operation).to receive(:execute).and_raise(Mongo::Error::OperationFailure).ordered
96+
expect(cluster).to receive(:sharded?).and_return(false)
97+
end
98+
99+
it 'raises an exception' do
100+
expect {
101+
retryable.read
102+
}.to raise_error(Mongo::Error::OperationFailure)
103+
end
86104
end
87105

88-
it 'raises an exception' do
89-
expect {
90-
retryable.read
91-
}.to raise_error(Mongo::Error::OperationFailure)
106+
context 'when the cluster is a mongos' do
107+
108+
context 'when the operation failure is not retryable' do
109+
110+
let(:error) do
111+
Mongo::Error::OperationFailure.new('not authorized')
112+
end
113+
114+
before do
115+
expect(operation).to receive(:execute).and_raise(error).ordered
116+
expect(cluster).to receive(:sharded?).and_return(true)
117+
end
118+
119+
it 'raises the exception' do
120+
expect {
121+
retryable.read
122+
}.to raise_error(Mongo::Error::OperationFailure)
123+
end
124+
end
125+
126+
context 'when the operation failure is retryable' do
127+
128+
let(:error) do
129+
Mongo::Error::OperationFailure.new('no master')
130+
end
131+
132+
context 'when the retry succeeds' do
133+
134+
before do
135+
expect(operation).to receive(:execute).and_raise(error).ordered
136+
expect(cluster).to receive(:sharded?).and_return(true)
137+
expect(cluster).to receive(:max_read_retries).and_return(1).ordered
138+
expect(cluster).to receive(:read_retry_interval).and_return(0.1).ordered
139+
expect(operation).to receive(:execute).and_return(true).ordered
140+
end
141+
142+
it 'returns the result' do
143+
expect(retryable.read).to be true
144+
end
145+
end
146+
147+
context 'when the retry fails once and then succeeds' do

  # The operation fails twice before succeeding, which takes two retries, so
  # the stubbed limit has to allow two. The expectations are ordered to pin
  # the sequence: execute, limit check, sleep interval, execute again.
  before do
    expect(operation).to receive(:execute).and_raise(error).ordered
    expect(cluster).to receive(:sharded?).and_return(true)
    expect(cluster).to receive(:max_read_retries).and_return(2).ordered
    expect(cluster).to receive(:read_retry_interval).and_return(0.1).ordered
    expect(operation).to receive(:execute).and_raise(error).ordered
    expect(cluster).to receive(:sharded?).and_return(true)
    expect(cluster).to receive(:max_read_retries).and_return(2).ordered
    expect(cluster).to receive(:read_retry_interval).and_return(0.1).ordered
    expect(operation).to receive(:execute).and_return(true).ordered
  end

  it 'returns the result' do
    expect(retryable.read).to be true
  end
end
165+
end
92166
end
93167
end
94168
end

0 commit comments

Comments
 (0)