Skip to content

Commit d6e2c2e

Browse files
authored
Defer remote components to avoid OkHttp class-loading side-effects (#8131)
* Additional logging to help triage custom log-manager / jmx-builder test failures. * Move SharedCommunicationObjects creation to InstallDatadogTracerCallback constructor * Add ability to pause remote components until SharedCommunicationObjects is ready * Delay starting trace-writer and data-stream-monitoring until remote I/O is allowed * Delay starting logs-intake backend until remote I/O is allowed * Install tracer and logs-intake as soon as necessary, but defer any remote components when use of OkHttp should be delayed * Remove exemption where we didn't defer if the custom logging manager or JMX builder was on the system classpath (because the main thread would find it there if OkHttp triggered initialization of JUL.). We now make OkHttp calls from our own background threads, which are isolated from the system classloader, not the main thread - so this exemption no longer makes sense. * Adjust minimumBranchCoverage to account for new double-checked lock in SharedCommunicationObjects
1 parent 5b1d1e4 commit d6e2c2e

File tree

10 files changed

+160
-138
lines changed

10 files changed

+160
-138
lines changed

communication/build.gradle

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ dependencies {
2525
}
2626

2727
ext {
28-
minimumBranchCoverage = 0.6
28+
minimumBranchCoverage = 0.5
2929
minimumInstructionCoverage = 0.8
3030
excludedClassesCoverage = [
3131
'datadog.communication.ddagent.ExternalAgentLauncher',

communication/src/main/java/datadog/communication/ddagent/SharedCommunicationObjects.java

+48-4
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
import datadog.remoteconfig.DefaultConfigurationPoller;
1212
import datadog.trace.api.Config;
1313
import datadog.trace.util.AgentTaskScheduler;
14+
import java.util.ArrayList;
15+
import java.util.List;
1416
import java.util.concurrent.TimeUnit;
1517
import java.util.function.Supplier;
1618
import okhttp3.HttpUrl;
@@ -21,12 +23,23 @@
2123
public class SharedCommunicationObjects {
2224
private static final Logger log = LoggerFactory.getLogger(SharedCommunicationObjects.class);
2325

26+
private final List<Runnable> pausedComponents = new ArrayList<>();
27+
private volatile boolean paused;
28+
2429
public OkHttpClient okHttpClient;
2530
public HttpUrl agentUrl;
2631
public Monitoring monitoring;
2732
private DDAgentFeaturesDiscovery featuresDiscovery;
2833
private ConfigurationPoller configurationPoller;
2934

35+
public SharedCommunicationObjects() {
36+
this(false);
37+
}
38+
39+
public SharedCommunicationObjects(boolean paused) {
40+
this.paused = paused;
41+
}
42+
3043
public void createRemaining(Config config) {
3144
if (monitoring == null) {
3245
monitoring = Monitoring.DISABLED;
@@ -46,6 +59,32 @@ public void createRemaining(Config config) {
4659
}
4760
}
4861

62+
public void whenReady(Runnable callback) {
63+
if (paused) {
64+
synchronized (pausedComponents) {
65+
if (paused) {
66+
pausedComponents.add(callback);
67+
return;
68+
}
69+
}
70+
}
71+
callback.run(); // not paused, run immediately
72+
}
73+
74+
public void resume() {
75+
paused = false;
76+
synchronized (pausedComponents) {
77+
for (Runnable callback : pausedComponents) {
78+
try {
79+
callback.run();
80+
} catch (Throwable e) {
81+
log.warn("Problem resuming remote component {}", callback, e);
82+
}
83+
}
84+
pausedComponents.clear();
85+
}
86+
}
87+
4988
private static HttpUrl parseAgentUrl(Config config) {
5089
String agentUrl = config.getAgentUrl();
5190
if (agentUrl.startsWith("unix:")) {
@@ -100,11 +139,16 @@ public DDAgentFeaturesDiscovery featuresDiscovery(Config config) {
100139
agentUrl,
101140
config.isTraceAgentV05Enabled(),
102141
config.isTracerMetricsEnabled());
103-
if (AGENT_THREAD_GROUP.equals(Thread.currentThread().getThreadGroup())) {
104-
featuresDiscovery.discover(); // safe to run on same thread
142+
143+
if (paused) {
144+
// defer remote discovery until remote I/O is allowed
105145
} else {
106-
// avoid performing blocking I/O operation on application thread
107-
AgentTaskScheduler.INSTANCE.execute(featuresDiscovery::discover);
146+
if (AGENT_THREAD_GROUP.equals(Thread.currentThread().getThreadGroup())) {
147+
featuresDiscovery.discover(); // safe to run on same thread
148+
} else {
149+
// avoid performing blocking I/O operation on application thread
150+
AgentTaskScheduler.INSTANCE.execute(featuresDiscovery::discover);
151+
}
108152
}
109153
}
110154
return featuresDiscovery;

dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/Agent.java

+41-35
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import static datadog.trace.util.AgentThreadFactory.AgentThread.PROFILER_STARTUP;
1111
import static datadog.trace.util.AgentThreadFactory.AgentThread.TRACE_STARTUP;
1212
import static datadog.trace.util.AgentThreadFactory.newAgentThread;
13-
import static datadog.trace.util.Strings.getResourceName;
1413
import static datadog.trace.util.Strings.propertyNameToSystemPropertyName;
1514
import static datadog.trace.util.Strings.toEnvVar;
1615

@@ -348,7 +347,7 @@ public void run() {
348347
* logging facility. Likewise on IBM JDKs OkHttp may indirectly load 'IBMSASL' which in turn loads LogManager.
349348
*/
350349
InstallDatadogTracerCallback installDatadogTracerCallback =
351-
new InstallDatadogTracerCallback(initTelemetry, inst);
350+
new InstallDatadogTracerCallback(initTelemetry, inst, delayOkHttp);
352351
if (delayOkHttp) {
353352
log.debug("Custom logger detected. Delaying Datadog Tracer initialization.");
354353
registerLogManagerCallback(installDatadogTracerCallback);
@@ -497,28 +496,21 @@ public void execute() {
497496
}
498497

499498
protected static class InstallDatadogTracerCallback extends ClassLoadCallBack {
500-
private final InitializationTelemetry initTelemetry;
501499
private final Instrumentation instrumentation;
500+
private final Object sco;
501+
private final Class<?> scoClass;
502+
private final boolean delayOkHttp;
502503

503504
public InstallDatadogTracerCallback(
504-
InitializationTelemetry initTelemetry, Instrumentation instrumentation) {
505-
this.initTelemetry = initTelemetry;
505+
InitializationTelemetry initTelemetry,
506+
Instrumentation instrumentation,
507+
boolean delayOkHttp) {
508+
this.delayOkHttp = delayOkHttp;
506509
this.instrumentation = instrumentation;
507-
}
508-
509-
@Override
510-
public AgentThread agentThread() {
511-
return TRACE_STARTUP;
512-
}
513-
514-
@Override
515-
public void execute() {
516-
Object sco;
517-
Class<?> scoClass;
518510
try {
519511
scoClass =
520512
AGENT_CLASSLOADER.loadClass("datadog.communication.ddagent.SharedCommunicationObjects");
521-
sco = scoClass.getConstructor().newInstance();
513+
sco = scoClass.getConstructor(boolean.class).newInstance(delayOkHttp);
522514
} catch (ClassNotFoundException
523515
| NoSuchMethodException
524516
| InstantiationException
@@ -528,10 +520,23 @@ public void execute() {
528520
}
529521

530522
installDatadogTracer(initTelemetry, scoClass, sco);
523+
maybeInstallLogsIntake(scoClass, sco);
524+
}
525+
526+
@Override
527+
public AgentThread agentThread() {
528+
return TRACE_STARTUP;
529+
}
530+
531+
@Override
532+
public void execute() {
533+
if (delayOkHttp) {
534+
resumeRemoteComponents();
535+
}
536+
531537
maybeStartAppSec(scoClass, sco);
532538
maybeStartIast(instrumentation, scoClass, sco);
533539
maybeStartCiVisibility(instrumentation, scoClass, sco);
534-
maybeStartLogsIntake(scoClass, sco);
535540
// start debugger before remote config to subscribe to it before starting to poll
536541
maybeStartDebugger(instrumentation, scoClass, sco);
537542
maybeStartRemoteConfig(scoClass, sco);
@@ -540,6 +545,18 @@ public void execute() {
540545
startTelemetry(instrumentation, scoClass, sco);
541546
}
542547
}
548+
549+
private void resumeRemoteComponents() {
550+
try {
551+
// remote components were paused for custom log-manager/jmx-builder
552+
// add small delay before resuming remote I/O to help stabilization
553+
Thread.sleep(1_000);
554+
scoClass.getMethod("resume").invoke(sco);
555+
} catch (InterruptedException ignore) {
556+
} catch (Throwable e) {
557+
log.error("Error resuming remote components", e);
558+
}
559+
}
543560
}
544561

545562
protected static class StartProfilingAgentCallback extends ClassLoadCallBack {
@@ -866,17 +883,18 @@ private static void maybeStartCiVisibility(Instrumentation inst, Class<?> scoCla
866883
}
867884
}
868885

869-
private static void maybeStartLogsIntake(Class<?> scoClass, Object sco) {
886+
private static void maybeInstallLogsIntake(Class<?> scoClass, Object sco) {
870887
if (agentlessLogSubmissionEnabled) {
871888
StaticEventLogger.begin("Logs Intake");
872889

873890
try {
874891
final Class<?> logsIntakeSystemClass =
875892
AGENT_CLASSLOADER.loadClass("datadog.trace.logging.intake.LogsIntakeSystem");
876-
final Method logsIntakeInstallerMethod = logsIntakeSystemClass.getMethod("start", scoClass);
893+
final Method logsIntakeInstallerMethod =
894+
logsIntakeSystemClass.getMethod("install", scoClass);
877895
logsIntakeInstallerMethod.invoke(null, sco);
878896
} catch (final Throwable e) {
879-
log.warn("Not starting Logs Intake subsystem", e);
897+
log.warn("Not installing Logs Intake subsystem", e);
880898
}
881899

882900
StaticEventLogger.end("Logs Intake");
@@ -1267,14 +1285,8 @@ private static boolean isAppUsingCustomLogManager(final EnumSet<Library> librari
12671285

12681286
final String logManagerProp = System.getProperty("java.util.logging.manager");
12691287
if (logManagerProp != null) {
1270-
final boolean onSysClasspath =
1271-
ClassLoader.getSystemResource(getResourceName(logManagerProp)) != null;
12721288
log.debug("Prop - logging.manager: {}", logManagerProp);
1273-
log.debug("logging.manager on system classpath: {}", onSysClasspath);
1274-
// Some applications set java.util.logging.manager but never actually initialize the logger.
1275-
// Check to see if the configured manager is on the system classpath.
1276-
// If so, it should be safe to initialize jmxfetch which will setup the log manager.
1277-
return !onSysClasspath;
1289+
return true;
12781290
}
12791291

12801292
return false;
@@ -1305,14 +1317,8 @@ private static boolean isAppUsingCustomJMXBuilder(final EnumSet<Library> librari
13051317

13061318
final String jmxBuilderProp = System.getProperty("javax.management.builder.initial");
13071319
if (jmxBuilderProp != null) {
1308-
final boolean onSysClasspath =
1309-
ClassLoader.getSystemResource(getResourceName(jmxBuilderProp)) != null;
13101320
log.debug("Prop - javax.management.builder.initial: {}", jmxBuilderProp);
1311-
log.debug("javax.management.builder.initial on system classpath: {}", onSysClasspath);
1312-
// Some applications set javax.management.builder.initial but never actually initialize JMX.
1313-
// Check to see if the configured JMX builder is on the system classpath.
1314-
// If so, it should be safe to initialize jmxfetch which will setup JMX.
1315-
return !onSysClasspath;
1321+
return true;
13161322
}
13171323

13181324
return false;

dd-java-agent/agent-logs-intake/src/main/java/datadog/trace/logging/intake/LogsIntakeSystem.java

+3-6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
package datadog.trace.logging.intake;
22

3-
import datadog.communication.BackendApi;
43
import datadog.communication.BackendApiFactory;
54
import datadog.communication.ddagent.SharedCommunicationObjects;
65
import datadog.trace.api.Config;
@@ -12,18 +11,16 @@ public class LogsIntakeSystem {
1211

1312
private static final Logger LOGGER = LoggerFactory.getLogger(LogsIntakeSystem.class);
1413

15-
public static void start(SharedCommunicationObjects sco) {
14+
public static void install(SharedCommunicationObjects sco) {
1615
Config config = Config.get();
1716
if (!config.isAgentlessLogSubmissionEnabled()) {
1817
LOGGER.debug("Agentless logs intake is disabled");
1918
return;
2019
}
2120

2221
BackendApiFactory apiFactory = new BackendApiFactory(config, sco);
23-
BackendApi backendApi = apiFactory.createBackendApi(BackendApiFactory.Intake.LOGS);
24-
LogsDispatcher dispatcher = new LogsDispatcher(backendApi);
25-
LogsWriterImpl writer = new LogsWriterImpl(config, dispatcher);
26-
writer.start();
22+
LogsWriterImpl writer = new LogsWriterImpl(config, apiFactory);
23+
sco.whenReady(writer::start);
2724

2825
LogsIntake.registerWriter(writer);
2926
}

dd-java-agent/agent-logs-intake/src/main/java/datadog/trace/logging/intake/LogsWriterImpl.java

+9-3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
import static datadog.trace.util.AgentThreadFactory.AGENT_THREAD_GROUP;
44

5+
import datadog.communication.BackendApi;
6+
import datadog.communication.BackendApiFactory;
7+
import datadog.communication.BackendApiFactory.Intake;
58
import datadog.trace.api.Config;
69
import datadog.trace.api.logging.intake.LogsWriter;
710
import datadog.trace.util.AgentThreadFactory;
@@ -23,12 +26,12 @@ public class LogsWriterImpl implements LogsWriter {
2326
private static final int ENQUEUE_LOG_TIMEOUT_MILLIS = 1_000;
2427

2528
private final Map<String, Object> commonTags;
26-
private final LogsDispatcher logsDispatcher;
29+
private final BackendApiFactory apiFactory;
2730
private final BlockingQueue<Map<String, Object>> messageQueue;
2831
private final Thread messagePollingThread;
2932

30-
public LogsWriterImpl(Config config, LogsDispatcher logsDispatcher) {
31-
this.logsDispatcher = logsDispatcher;
33+
public LogsWriterImpl(Config config, BackendApiFactory apiFactory) {
34+
this.apiFactory = apiFactory;
3235

3336
commonTags = new HashMap<>();
3437
commonTags.put("ddsource", "java");
@@ -84,6 +87,9 @@ public void log(Map<String, Object> message) {
8487
}
8588

8689
private void logPollingLoop() {
90+
BackendApi backendApi = apiFactory.createBackendApi(Intake.LOGS);
91+
LogsDispatcher logsDispatcher = new LogsDispatcher(backendApi);
92+
8793
while (!Thread.currentThread().isInterrupted()) {
8894
try {
8995
List<Map<String, Object>> batch = new ArrayList<>();

dd-java-agent/src/test/groovy/datadog/trace/agent/CustomLogManagerTest.groovy

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class CustomLogManagerTest extends Specification {
2727
, true) == 0
2828
}
2929

30-
def "agent services starts up in premain if configured log manager on system classpath"() {
30+
def "agent services startup is delayed even if configured log manager on system classpath"() {
3131
expect:
3232
IntegrationTestUtils.runOnSeparateJvm(LogManagerSetter.getName()
3333
, [

dd-java-agent/src/test/groovy/datadog/trace/agent/CustomMBeanServerBuilderTest.groovy

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ class CustomMBeanServerBuilderTest extends Specification {
2828
, true) == 0
2929
}
3030

31-
def "JMXFetch starts up in premain if configured MBeanServerBuilder on system classpath"() {
31+
def "JMXFetch startup is delayed even if configured MBeanServerBuilder on system classpath"() {
3232
expect:
3333
IntegrationTestUtils.runOnSeparateJvm(MBeanServerBuilderSetter.getName()
3434
, [

0 commit comments

Comments
 (0)