Skip to content

Commit 6345786

Browse files
RSDK-9369: Initialize FTDC after the web service starts. (#4593)
Co-authored-by: Benjamin Rewis <[email protected]>
1 parent 7218458 commit 6345786

File tree

5 files changed

+44
-16
lines changed

5 files changed

+44
-16
lines changed

ftdc/ftdc.go

+6-2
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,13 @@ type Statser interface {
7070
// The Stats method must return a struct with public field members that are either:
7171
// - Numbers (e.g: int, float64, byte, etc...)
7272
// - A "recursive" structure that has the same properties as this return value (public field
73-
// members with numbers, or more structures). (NOT YET SUPPORTED)
73+
// members with numbers, or more structures).
7474
//
75-
// The return value must not be a map. This is to enforce a "schema" constraints.
75+
// The return value must always have the same schema. That is, the returned value has the same
76+
// set of keys/structure members. A simple structure where all the member fields are numbers
77+
// satisfies this requirement.
78+
//
79+
// The return value may not (yet) be a `map`. Even if the returned map always has the same keys.
7680
Stats() any
7781
}
7882

robot/impl/local_robot.go

+24-2
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ type localRobot struct {
8888
// map keyed by Module.Name. This is necessary to get the package manager to use a new folder
8989
// when a local tarball is updated.
9090
localModuleVersions map[string]semver.Version
91+
startFtdcOnce sync.Once
9192
ftdc *ftdc.FTDC
9293

9394
// whether the robot is actively reconfiguring
@@ -244,7 +245,13 @@ func (r *localRobot) Logger() logging.Logger {
244245

245246
// StartWeb starts the web server, will return an error if server is already up.
246247
func (r *localRobot) StartWeb(ctx context.Context, o weboptions.Options) (err error) {
247-
return r.webSvc.Start(ctx, o)
248+
ret := r.webSvc.Start(ctx, o)
249+
r.startFtdcOnce.Do(func() {
250+
if r.ftdc != nil {
251+
r.ftdc.Start()
252+
}
253+
})
254+
return ret
248255
}
249256

250257
// StopWeb stops the web server, will be a noop if server is not up.
@@ -421,8 +428,23 @@ func newWithResources(
421428
partID = cfg.Cloud.ID
422429
}
423430
// CloudID is also known as the robot part id.
431+
//
432+
// RSDK-9369: We create a new FTDC worker, but do not yet start it. This is because the
433+
// `webSvc` gets registered with FTDC before we construct the underlying
434+
// `webSvc.rpcServer`. Which happens when calling `localRobot.StartWeb`. We've postponed
435+
// starting FTDC to when that method is called (the first time).
436+
//
437+
// As per the FTDC.Statser interface documentation, the return value of `webSvc.Stats` must
438+
// always have the same schema. Otherwise we risk the ftdc "schema" getting out of sync with
439+
// the data being written. Having `webSvc.Stats` conform to the API requirements is
440+
// challenging when we want to include stats from the `rpcServer`.
441+
//
442+
// RSDK-9369 can be reverted, having the FTDC worker getting started here, when we either:
443+
// - Relax the requirement that successive calls to `Stats` have the same schema or
444+
// - Guarantee that the `rpcServer` is initialized (enough) when the web service is
445+
// constructed to get a valid copy of its stats object (for the schema's sake). Even if
446+
// the web service has not been "started".
424447
ftdcWorker = ftdc.New(ftdc.DefaultDirectory(config.ViamDotDir, partID), logger.Sublogger("ftdc"))
425-
ftdcWorker.Start()
426448
}
427449

428450
closeCtx, cancel := context.WithCancel(ctx)

robot/web/web.go

+14
Original file line numberDiff line numberDiff line change
@@ -925,6 +925,20 @@ func (svc *webService) foreignServiceHandler(srv interface{}, stream googlegrpc.
925925
}
926926
}
927927

928+
type stats struct {
929+
RPCServer any
930+
}
931+
932+
// Stats returns ftdc data on behalf of the rpcServer and other web services.
933+
func (svc *webService) Stats() any {
934+
// RSDK-9369: It's not ideal to block in `Stats`. But we don't today expect this to be
935+
// problematic, and alternatives are more complex/expensive.
936+
svc.mu.Lock()
937+
defer svc.mu.Unlock()
938+
939+
return stats{svc.rpcServer.Stats()}
940+
}
941+
928942
// RestartStatusResponse is the JSON response of the `restart_status` HTTP
929943
// endpoint.
930944
type RestartStatusResponse struct {

robot/web/web_c.go

-6
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,6 @@ type webService struct {
5757
modWorkers sync.WaitGroup
5858
}
5959

60-
func (svc *webService) Stats() any {
61-
return struct {
62-
RPCServer any
63-
}{svc.rpcServer.Stats()}
64-
}
65-
6660
// Reconfigure pulls resources and updates the stream server audio and video streams with the new resources.
6761
func (svc *webService) Reconfigure(ctx context.Context, deps resource.Dependencies, _ resource.Config) error {
6862
svc.mu.Lock()

robot/web/web_notc.go

-6
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,5 @@ func (svc *webService) initStreamServer(ctx context.Context) error {
6666
return nil
6767
}
6868

69-
func (svc *webService) Stats() any {
70-
return struct {
71-
RPCServer any
72-
}{svc.rpcServer.Stats()}
73-
}
74-
7569
// stub for missing gostream
7670
type options struct{}

0 commit comments

Comments
 (0)