Skip to content

Commit c0f24b9

Browse files
authored
Merge pull request #3402 from AElfProject/feature/new-stable-stream
improve grpc stream stability and performance
2 parents 841eedc + 5b3fb96 commit c0f24b9

18 files changed

Lines changed: 471 additions & 137 deletions

src/AElf.OS.Core/AElf.OS.Core.csproj

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
<Project Sdk="Microsoft.NET.Sdk">
2-
<Import Project="..\..\common.props"/>
2+
<Import Project="..\..\common.props" />
33
<PropertyGroup>
44
<TargetFramework>net6.0</TargetFramework>
55
<RootNamespace>AElf.OS</RootNamespace>
@@ -8,8 +8,8 @@
88
<Description>Core module for the OS layer.</Description>
99
</PropertyGroup>
1010
<ItemGroup>
11-
<ProjectReference Include="..\AElf.Kernel.Node\AElf.Kernel.Node.csproj"/>
12-
<ProjectReference Include="..\AElf.Kernel.Token\AElf.Kernel.Token.csproj"/>
11+
<ProjectReference Include="..\AElf.Kernel.Node\AElf.Kernel.Node.csproj" />
12+
<ProjectReference Include="..\AElf.Kernel.Token\AElf.Kernel.Token.csproj" />
1313
</ItemGroup>
1414
<ItemGroup>
1515
<CommonMessage Include="..\..\protobuf\network_types.proto">

src/AElf.OS.Core/Network/Events/StreamMessageReceivedEvent.cs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,16 @@ namespace AElf.OS.Network.Events;
44

55
public class StreamMessageReceivedEvent
66
{
7-
public StreamMessageReceivedEvent(ByteString message, string clientPubkey)
7+
public StreamMessageReceivedEvent(ByteString message, string clientPubkey, string requestId)
88
{
99
Message = message;
1010
ClientPubkey = clientPubkey;
11+
RequestId = requestId;
1112
}
1213

1314
public ByteString Message { get; }
1415

1516
public string ClientPubkey { get; }
17+
18+
public string RequestId { get; }
1619
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
using AElf.OS.Network.Application;
2+
using AElf.OS.Network.Infrastructure;
3+
4+
namespace AElf.OS.Network.Events;
5+
6+
/// <summary>
/// Local event carrying a <see cref="NetworkException"/> together with the stream peer
/// it was raised for, so that subscribers can react to a failing stream.
/// </summary>
public class StreamPeerExceptionEvent
{
    /// <summary>
    /// Creates the event for the given failure and the peer it relates to.
    /// </summary>
    /// <param name="exception">The network exception that was observed.</param>
    /// <param name="peer">The peer the exception is associated with.</param>
    public StreamPeerExceptionEvent(NetworkException exception, IPeer peer) =>
        (Exception, Peer) = (exception, peer);

    /// <summary>The network exception that was observed.</summary>
    public NetworkException Exception { get; }

    /// <summary>The peer the exception is associated with.</summary>
    public IPeer Peer { get; }
}

src/AElf.OS.Network.Grpc/Connection/PeerDialer.cs

Lines changed: 33 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,7 @@ public class PeerDialer : IPeerDialer
3333
{
3434
private readonly IAccountService _accountService;
3535
private readonly IHandshakeProvider _handshakeProvider;
36-
private KeyCertificatePair _clientKeyCertificatePair;
37-
private IStreamTaskResourcePool _streamTaskResourcePool;
36+
private readonly IStreamTaskResourcePool _streamTaskResourcePool;
3837
public ILocalEventBus EventBus { get; set; }
3938

4039
public PeerDialer(IAccountService accountService,
@@ -46,8 +45,6 @@ public PeerDialer(IAccountService accountService,
4645
EventBus = NullLocalEventBus.Instance;
4746

4847
Logger = NullLogger<PeerDialer>.Instance;
49-
50-
CreateClientKeyCertificatePair();
5148
}
5249

5350
private NetworkOptions NetworkOptions => NetworkOptionsSnapshot.Value;
@@ -142,7 +139,17 @@ public async Task<GrpcPeer> DialBackPeerByStreamAsync(DnsEndPoint remoteEndpoint
142139
};
143140
Logger.LogWarning("DialBackPeerByStreamAsync meta={meta}", meta);
144141
var peer = new GrpcStreamBackPeer(remoteEndpoint, info, responseStream, _streamTaskResourcePool, meta);
145-
142+
peer.SetStreamSendCallBack(async (ex, streamMessage, callTimes) =>
143+
{
144+
if (ex == null)
145+
Logger.LogDebug("streamRequest write success {times}-{requestId}-{messageType}-{this}-{latency}", callTimes, streamMessage.RequestId, streamMessage.MessageType, peer,
146+
CommonHelper.GetRequestLatency(streamMessage.RequestId));
147+
else
148+
{
149+
Logger.LogError(ex, "streamRequest write fail, {requestId}-{messageType}-{this}", streamMessage.RequestId, streamMessage.MessageType, peer);
150+
await EventBus.PublishAsync(new StreamPeerExceptionEvent(ex, peer), false);
151+
}
152+
});
146153
peer.UpdateLastReceivedHandshake(handshake);
147154

148155
return peer;
@@ -191,11 +198,6 @@ public async Task<GrpcPeer> DialBackPeerAsync(DnsEndPoint remoteEndpoint, Handsh
191198
return peer;
192199
}
193200

194-
private void CreateClientKeyCertificatePair()
195-
{
196-
_clientKeyCertificatePair = TlsHelper.GenerateKeyCertificatePair();
197-
}
198-
199201
/// <summary>
200202
/// Calls the server side DoHandshake RPC method, in order to establish a 2-way connection.
201203
/// </summary>
@@ -245,6 +247,17 @@ private async Task<GrpcStreamPeer> DailStreamPeerAsync(GrpcClient client, DnsEnd
245247
{ GrpcConstants.PubkeyMetadataKey, nodePubkey },
246248
{ GrpcConstants.PeerInfoMetadataKey, connectionInfo.ToString() }
247249
});
250+
streamPeer.SetStreamSendCallBack(async (ex, streamMessage, callTimes) =>
251+
{
252+
if (ex == null)
253+
Logger.LogDebug("streamRequest write success {times}-{requestId}-{messageType}-{this}-{latency}", callTimes, streamMessage.RequestId, streamMessage.MessageType, streamPeer,
254+
CommonHelper.GetRequestLatency(streamMessage.RequestId));
255+
else
256+
{
257+
Logger.LogError(ex, "streamRequest write fail, {requestId}-{messageType}-{this}", streamMessage.RequestId, streamMessage.MessageType, streamPeer);
258+
await EventBus.PublishAsync(new StreamPeerExceptionEvent(ex, streamPeer), false);
259+
}
260+
});
248261
var success = await BuildStreamForPeerAsync(streamPeer, call);
249262
return success ? streamPeer : null;
250263
}
@@ -266,12 +279,17 @@ public async Task<bool> BuildStreamForPeerAsync(GrpcStreamPeer streamPeer, Async
266279
{
267280
try
268281
{
269-
await call.ResponseStream.ForEachAsync(async req => await
270-
EventBus.PublishAsync(new StreamMessageReceivedEvent(req.ToByteString(), streamPeer.Info.Pubkey), false));
282+
await call.ResponseStream.ForEachAsync(async req =>
283+
{
284+
Logger.LogDebug("listenReceive request={requestId} {streamType}-{messageType} latency={latency}", req.RequestId, req.StreamType, req.MessageType, CommonHelper.GetRequestLatency(req.RequestId));
285+
await EventBus.PublishAsync(new StreamMessageReceivedEvent(req.ToByteString(), streamPeer.Info.Pubkey, req.RequestId), false);
286+
});
271287
Logger.LogWarning("listen end and complete {remoteEndPoint}", streamPeer.RemoteEndpoint.ToString());
272288
}
273289
catch (Exception e)
274290
{
291+
if (e is RpcException exception)
292+
await EventBus.PublishAsync(new StreamPeerExceptionEvent(streamPeer.HandleRpcException(exception, "listen err {remoteEndPoint}"), streamPeer));
275293
Logger.LogError(e, "listen err {remoteEndPoint}", streamPeer.RemoteEndpoint.ToString());
276294
}
277295
}, tokenSource.Token);
@@ -329,8 +347,9 @@ private async Task<GrpcClient> CreateClientAsync(DnsEndPoint remoteEndpoint)
329347
return null;
330348

331349
Logger.LogDebug($"Upgrading connection to TLS: {certificate}.");
350+
var clientKeyCertificatePair = TlsHelper.GenerateKeyCertificatePair();
332351
ChannelCredentials credentials =
333-
new SslCredentials(TlsHelper.ObjectToPem(certificate), _clientKeyCertificatePair);
352+
new SslCredentials(TlsHelper.ObjectToPem(certificate), clientKeyCertificatePair);
334353

335354
var channel = new Channel(remoteEndpoint.ToString(), credentials, new List<ChannelOption>
336355
{
@@ -340,7 +359,7 @@ private async Task<GrpcClient> CreateClientAsync(DnsEndPoint remoteEndpoint)
340359
new(GrpcConstants.GrpcArgKeepalivePermitWithoutCalls, GrpcConstants.GrpcArgKeepalivePermitWithoutCallsOpen),
341360
new(GrpcConstants.GrpcArgHttp2MaxPingsWithoutData, GrpcConstants.GrpcArgHttp2MaxPingsWithoutDataVal),
342361
new(GrpcConstants.GrpcArgKeepaliveTimeoutMs, GrpcConstants.GrpcArgKeepaliveTimeoutMsVal),
343-
new(GrpcConstants.GrpcArgKeepaliveTimeMs, GrpcConstants.GrpcArgKeepaliveTimeMsVal)
362+
new(GrpcConstants.GrpcArgKeepaliveTimeMs, GrpcConstants.GrpcArgKeepaliveTimeMsVal),
344363
});
345364

346365
var nodePubkey = AsyncHelper.RunSync(() => _accountService.GetPublicKeyAsync()).ToHex();

src/AElf.OS.Network.Grpc/GrpcConstants.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,13 @@ public static class GrpcConstants
1313
public const string GrpcArgHttp2MaxPingsWithoutData = "grpc.http2_max_pings_without_data";
1414
public const string GrpcArgKeepaliveTimeoutMs = "grpc.keepalive_timeout_ms";
1515
public const string GrpcArgKeepaliveTimeMs = "grpc.keepalive_time_ms";
16+
// public const string GrpcArgHttp2WriteBufferSize = "grpc.http2.write_buffer_size";
1617

1718
public const int GrpcArgKeepalivePermitWithoutCallsOpen = 1;
1819
public const int GrpcArgHttp2MaxPingsWithoutDataVal = 0;
1920
public const int GrpcArgKeepaliveTimeoutMsVal = 60 * 1000;
2021
public const int GrpcArgKeepaliveTimeMsVal = 2 * 60 * 60 * 1000;
22+
// public const int GrpcArgHttp2WriteBufferSizeVal = 6 * 1024;
2123

2224
public const string GrpcGzipConst = "gzip";
2325

src/AElf.OS.Network.Grpc/GrpcNetworkModule.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ public override void ConfigureServices(ServiceConfigurationContext context)
1616
context.Services.AddSingleton<PeerService.PeerServiceBase, GrpcServerService>();
1717

1818
// Internal dependencies
19-
context.Services.AddTransient<IPeerDialer, PeerDialer>();
19+
context.Services.AddSingleton<IPeerDialer, PeerDialer>();
2020
context.Services.AddSingleton<GrpcServerService>();
2121

2222
context.Services.AddSingleton<AuthInterceptor>();

src/AElf.OS.Network.Grpc/Helpers/CommonHelper.cs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,13 @@ public static string GenerateRequestId()
1212
return timeMs.ToString() + '_' + guid;
1313
}
1414

15+
/// <summary>
/// Returns the number of milliseconds between the timestamp embedded in
/// <paramref name="requestId"/> and the current UTC time.
/// </summary>
/// <param name="requestId">
/// A request id of the form <c>{timestamp}_{suffix}</c>, i.e. exactly two parts separated by a
/// single underscore, with the first part parseable as a <see cref="long"/>.
/// NOTE(review): the timestamp is presumably Unix time in milliseconds, as produced by
/// GenerateRequestId — confirm against that method.
/// </param>
/// <returns>
/// The elapsed time (now minus the embedded timestamp), or <c>-1</c> when the id is null,
/// empty, or not in the expected format.
/// </returns>
public static long GetRequestLatency(string requestId)
{
    // A missing id cannot be timed: report "unknown" (-1) instead of throwing,
    // matching how malformed ids are already handled below.
    if (string.IsNullOrEmpty(requestId)) return -1;

    var parts = requestId.Split('_');
    if (parts.Length != 2) return -1;

    return long.TryParse(parts[0], out var start)
        ? DateTimeOffset.UtcNow.ToUnixTimeMilliseconds() - start
        : -1;
}
21+
1522
public static bool GreaterThanSupportStreamMinVersion(this string version, string minVersion)
1623
{
1724
return Version.Parse(version).CompareTo(Version.Parse(minVersion)) >= 0;

src/AElf.OS.Network.Grpc/Peer/GrpcPeer.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,10 @@ namespace AElf.OS.Network.Grpc;
2525
public class GrpcPeer : IPeer
2626
{
2727
private const int MaxMetricsPerMethod = 100;
28-
protected const int BlockRequestTimeout = 700;
29-
protected const int CheckHealthTimeout = 1000;
28+
protected const int BlockRequestTimeout = 2000;
29+
protected const int CheckHealthTimeout = 2000;
3030
protected const int BlocksRequestTimeout = 5000;
31-
protected const int GetNodesTimeout = 500;
31+
protected const int GetNodesTimeout = 2000;
3232
protected const int UpdateHandshakeTimeout = 3000;
3333
protected const int StreamRecoveryWaitTime = 500;
3434

@@ -394,7 +394,7 @@ protected virtual void RecordMetric(GrpcRequest grpcRequest, Timestamp requestSt
394394
/// This method handles the case where the peer is potentially down. If the Rpc call
395395
/// put the channel in TransientFailure or Connecting, we give the connection a certain time to recover.
396396
/// </summary>
397-
protected virtual NetworkException HandleRpcException(RpcException exception, string errorMessage)
397+
public virtual NetworkException HandleRpcException(RpcException exception, string errorMessage)
398398
{
399399
var message = $"Failed request to {this}: {errorMessage}";
400400
var type = NetworkExceptionType.Rpc;

src/AElf.OS.Network.Grpc/Peer/GrpcStreamBackPeer.cs

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
using System;
22
using System.Collections.Generic;
3+
using System.Linq;
34
using System.Net;
45
using System.Threading.Tasks;
56
using AElf.OS.Network.Application;
7+
using AElf.OS.Network.Grpc.Helpers;
68
using AElf.OS.Network.Protocol.Types;
9+
using AElf.Types;
710
using Grpc.Core;
811

912
namespace AElf.OS.Network.Grpc;
@@ -22,13 +25,35 @@ public GrpcStreamBackPeer(DnsEndPoint remoteEndpoint, PeerConnectionInfo peerCon
2225

2326
public override async Task CheckHealthAsync()
2427
{
25-
var request = new GrpcRequest { ErrorMessage = "Check health failed." };
28+
var requestId = CommonHelper.GenerateRequestId();
29+
var request = new GrpcRequest { ErrorMessage = $"Check health failed.requestId={requestId}" };
2630

2731
var data = new Metadata
2832
{
2933
{ GrpcConstants.TimeoutMetadataKey, CheckHealthTimeout.ToString() },
3034
};
31-
await RequestAsync(() => StreamRequestAsync(MessageType.HealthCheck, new HealthCheckRequest(), data), request);
35+
await RequestAsync(() => StreamRequestAsync(MessageType.HealthCheck, new HealthCheckRequest(), data, requestId), request);
36+
}
37+
38+
/// <summary>
/// Requests blocks from the peer over the stream by sending a <c>BlocksRequest</c> whose
/// <c>PreviousBlockHash</c> is <paramref name="firstHash"/> and whose <c>Count</c> is
/// <paramref name="count"/>.
/// </summary>
/// <param name="firstHash">Hash placed in the request's <c>PreviousBlockHash</c> field.</param>
/// <param name="count">Value placed in the request's <c>Count</c> field.</param>
/// <returns>
/// The blocks parsed from the reply, or an empty list when the request yields no reply.
/// </returns>
public override async Task<List<BlockWithTransactions>> GetBlocksAsync(Hash firstHash, int count)
{
    var blocksRequest = new BlocksRequest { PreviousBlockHash = firstHash, Count = count };
    var blockInfo = $"{{ first: {firstHash}, count: {count} }}";

    // The request id is embedded in the error message so a failure can be matched to stream logs.
    var requestId = CommonHelper.GenerateRequestId();
    var grpcRequest = new GrpcRequest
    {
        ErrorMessage = $"Get blocks for {blockInfo} failed.requestId={requestId}",
        MetricName = nameof(MetricNames.GetBlocks),
        MetricInfo = $"Get blocks for {blockInfo}"
    };

    // The timeout travels with the call as metadata.
    var metadata = new Metadata
    {
        { GrpcConstants.TimeoutMetadataKey, BlocksRequestTimeout.ToString() },
    };

    var reply = await RequestAsync(
        () => StreamRequestAsync(MessageType.RequestBlocks, blocksRequest, metadata, requestId),
        grpcRequest);

    if (reply == null)
        return new List<BlockWithTransactions>();

    return BlockList.Parser.ParseFrom(reply.Message).Blocks.ToList();
}
3358

3459
public override async Task DisconnectAsync(bool gracefulDisconnect)
@@ -57,10 +82,10 @@ public override Task<bool> TryRecoverAsync()
5782
}
5883

5984

60-
protected override NetworkException HandleRpcException(RpcException exception, string errorMessage)
85+
public override NetworkException HandleRpcException(RpcException exception, string errorMessage)
6186
{
6287
var message = $"Failed request to {this}: {errorMessage}";
63-
var type = NetworkExceptionType.Rpc;
88+
var type = NetworkExceptionType.Rpc;
6489
if (exception.StatusCode ==
6590
// there was an exception, not related to connectivity.
6691
StatusCode.Cancelled)

0 commit comments

Comments
 (0)