Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Coverage analysis agent #928

Draft
wants to merge 27 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
871aaf6
A draft coverage analyzer
DonggeLiu Mar 25, 2025
2982c64
We are not interested in mem leaks
DonggeLiu Mar 25, 2025
0f08775
Prompt builder for coverage analyzer
DonggeLiu Mar 25, 2025
efc14e0
Adjust coverage result and analysis result
DonggeLiu Mar 25, 2025
0a93b3f
Call the corresponding agent for specific tasks
DonggeLiu Mar 25, 2025
da5711f
Save run log into runresult
DonggeLiu Mar 25, 2025
7b9627c
Add coverage analyzer for experiments
DonggeLiu Mar 25, 2025
990fe6d
temp fix for enhancer and one_prompt_prototyper
DonggeLiu Mar 25, 2025
e4711d0
bug fix
DonggeLiu Mar 25, 2025
61b0b24
Unify agent selection and execution statements
DonggeLiu Mar 25, 2025
683f658
Priming template for coverage analyzer
DonggeLiu Mar 25, 2025
d9cdb38
Use saved run log, since the text file will not exist in cloud build agents
DonggeLiu Mar 25, 2025
2937278
Add tool guide
DonggeLiu Mar 25, 2025
93480fb
a todo
DonggeLiu Mar 25, 2025
80b95fe
fix type error
DonggeLiu Mar 25, 2025
6fa3cbf
Refine enhancer prompt and truncate overlong run log
DonggeLiu Mar 26, 2025
ca25e6f
bug fix
DonggeLiu Mar 26, 2025
90e0366
Do not save full fuzzing log to avoid OOM
DonggeLiu Mar 26, 2025
5d8c0ac
Improve prompt
DonggeLiu Mar 26, 2025
7439cc9
Upload the result dir + tag agent cloud build by function name
DonggeLiu Mar 27, 2025
c5a2ef0
Bug fix: use unique ID, only upload the required result files
DonggeLiu Mar 27, 2025
f372a5f
organize cached image and oss_fuzz project path
DonggeLiu Mar 28, 2025
fe330f6
More info in report via chathistory
DonggeLiu Mar 29, 2025
5c2c80e
temp disable caching
DonggeLiu Mar 29, 2025
6185465
temp disable caching
DonggeLiu Mar 29, 2025
217079e
Revert 'temp disable caching'
DonggeLiu Apr 2, 2025
ef0858c
Disable cache for execution stage only
DonggeLiu Apr 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 171 additions & 1 deletion agent/coverage_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,178 @@
"""An LLM agent to analyze and provide insight of a fuzz target's low coverage.
Use it as a usual module locally, or as script in cloud builds.
"""
from typing import Optional

import logger
from agent.base_agent import BaseAgent
from experiment.workdir import WorkDirs
from llm_toolkit import prompt_builder
from llm_toolkit.prompt_builder import CoverageAnalyzerTemplateBuilder
from llm_toolkit.prompts import Prompt
from results import AnalysisResult, CoverageResult, Result, RunResult
from tool.container_tool import ProjectContainerTool


class CoverageAnalyzer(BaseAgent):
  """The Agent to refine a compilable fuzz target for higher coverage.

  Given the latest RunResult, this agent lets the LLM inspect the project
  inside a container, then collects the LLM's conclusion on whether coverage
  needs improvement, plus its insights and suggestions, into a CoverageResult.
  """
  # NOTE: the original had a stray `pass` before the class docstring (a diff
  # artifact), which made the string a dead expression instead of __doc__.

  def _initial_prompt(self, results: list[Result]) -> Prompt:
    """Constructs the initial prompt of the agent from the last RunResult.

    Returns an empty Prompt when the last result is not a RunResult, because
    coverage analysis requires run-time information to reason about.
    """
    last_result = results[-1]
    benchmark = last_result.benchmark

    if not isinstance(last_result, RunResult):
      logger.error('The last result in %s is not RunResult: %s',
                   self.name,
                   results,
                   trial=self.trial)
      return Prompt()

    builder = CoverageAnalyzerTemplateBuilder(self.llm, benchmark, last_result)
    prompt = builder.build(example_pair=[],
                           tool_guides=self.inspect_tool.tutorial(),
                           project_dir=self.inspect_tool.project_dir)
    # TODO: A different file name/dir.
    prompt.save(self.args.work_dirs.prompt)

    return prompt

  def _container_handle_conclusion(self, cur_round: int, response: str,
                                   coverage_result: CoverageResult,
                                   prompt: Prompt) -> Optional[Prompt]:
    """Parses the LLM's <conclusion>/<insights>/<suggestions> tags into
    |coverage_result|.

    Returns the unchanged |prompt| when no conclusion was given (so the
    conversation continues), or None once the conclusion has been recorded.
    """
    conclusion = self._parse_tag(response, 'conclusion')
    if not conclusion:
      return prompt
    logger.info('----- ROUND %02d Received conclusion -----',
                cur_round,
                trial=self.trial)

    # The LLM is instructed to answer with a clean Boolean string.
    coverage_result.improve_required = conclusion.strip().lower() == 'true'
    coverage_result.insight = self._parse_tag(response, 'insights')
    coverage_result.suggestions = self._parse_tag(response, 'suggestions')

    return None

  def _container_tool_reaction(
      self, cur_round: int, response: str, run_result: RunResult,
      coverage_result: CoverageResult) -> Optional[Prompt]:
    """Validates the LLM conclusion or executes its bash command.

    Returns the next prompt to send, or None when the analysis concluded.
    """
    del run_result
    prompt = prompt_builder.DefaultTemplateBuilder(self.llm, None).build([])

    prompt = self._container_handle_bash_commands(response, self.inspect_tool,
                                                  prompt)
    # Only report conclusion when no more bash investigation is required.
    if not prompt.gettext():
      # Then build fuzz target.
      prompt = self._container_handle_conclusion(cur_round, response,
                                                 coverage_result, prompt)
      if prompt is None:
        # Succeeded.
        return None

    # Finally check invalid responses.
    if not response or not prompt.get():
      prompt = self._container_handle_invalid_tool_usage(
          self.inspect_tool, cur_round, response, prompt)
      prompt.append("""
Provide your verified conclusion with analysis insights and suggestions in the following format:
* A clean Boolean value (True or False) representing your analysis conclusion on whether code coverage needs improvement.
* Analysis insights of the low coverage, as detailed as possible with source code evidence.
* Suggestions to improve the code coverage, this can be text description, code snippet, or the full refined fuzz target.

For example:
<conclusion>
True
</conclusion>
<insights>
The low coverage comes from the fact that the current fuzz target exercises only one very narrow code path—in this case, a single call to {FUNCTION_SIGNATURE} with naive argument derived directly from the input data. This approach misses many branches within the {PROJECT} because:

* Single Argument Limitation: By always providing a unprocessed and naive argument, the fuzz target never tests the handling of complex values, which likely involves additional logic (e.g., iterating over the array, handling edge cases like empty or very long tokens, and validating numeric conversions for lengths).

* Lack of Input Variation: Since the fuzzer input is used verbatim as the only command argument, many conditional paths (e.g., those triggered by specific token contents or argument counts) remain untested.

* Untested Functions: Only the function-under-test ({FUNCTION_SIGNATURE}) is being invoked. {PROJECT} has several functions (e.g., functions from {PROJECT_DIR}) that are necessary or conventional to invoke before the function as preparations, but their logic isn’t reached by the current target.

To increase code coverage, I need the following improvements:

* Fine-grained input preprocessing.
Instead of using naive values like NULL or constant strings, or passing the entire input as a single argument, split it into multiple tokens of suitable sizes and content. This will allow the fuzz target to test scenarios where:

The function requires tailored input (value, format, data structures, etc.).

Edge cases occur (e.g., empty tokens, very short or very long tokens).

Fuzz Additional Functions:
To further increase coverage in the {PROJECT} library, I will need to add other functions like:

Function X and Y from {PROJECT} to prepare the program state before invoking {FUNCTION_SIGNATURE}.

Function Z if available, or other parameter preparation functions to better initialize function parameters based on the data generated by fuzzer.
</insights>
<suggestions>
Create Proper parameters
Instead of using a dummy context (or no context at all), allocate and initialize each parameter with the expected type and content. Typically, this structure embeds a regular `type_a` plus additional fields. I can either try to call `function_a` or manually allocate the structure and initialize its members. This includes initializing the internal `type_b` (via `function_b`) which `{FUNCTION_SIGNATURE}` uses to parse incoming data.

Simulate Data Reception
Feed the fuzz input into the {FUNCTION_SIGNATURE} by calling something like:
```
# Code snippet.
```
This makes sure that when {FUNCTION_SIGNATURE} is called, it has some data to process. I can then observe how the parser behaves with various inputs (valid replies, malformed data, etc.).

Call `function_c`
With the context properly set up, invoking `function_c` will prepare the program states for `{FUNCTION_SIGNATURE}` to traverse more code paths (error handling, reply parsing, etc.). This is where more of {PROJECT}’s logic will be exercised.

Optionally Vary Context Fields
I will also consider fuzzing some of the fields within parameters to trigger different branches.

Here is the revised fuzz target:
```
# New fuzz target
```
</suggestions>
""")

    return prompt

  def execute(self, result_history: list[Result]) -> AnalysisResult:
    """Executes the agent to analyze the root cause to the low coverage."""
    WorkDirs(self.args.work_dirs.base, keep=True)
    last_result = result_history[-1]
    assert isinstance(last_result, RunResult)

    logger.info('Executing %s', self.name, trial=last_result.trial)
    benchmark = last_result.benchmark
    # TODO(dongge): Use the generated fuzz target and build script here.
    self.inspect_tool = ProjectContainerTool(benchmark, name='inspect')
    # Compile once so coverage/introspection artifacts exist; discard /out.
    self.inspect_tool.compile(extra_commands=' && rm -rf /out/* > /dev/null')
    cur_round = 1
    coverage_result = CoverageResult()
    prompt = self._initial_prompt(result_history)

    try:
      client = self.llm.get_chat_client(model=self.llm.get_model())
      while prompt and cur_round < self.max_round:
        response = self.chat_llm(cur_round,
                                 client=client,
                                 prompt=prompt,
                                 trial=last_result.trial)
        prompt = self._container_tool_reaction(cur_round, response, last_result,
                                               coverage_result)
        cur_round += 1
    finally:
      # Cleanup: stop and remove the container even if the chat loop raised.
      logger.debug('Stopping and removing the inspect container %s',
                   self.inspect_tool.container_id,
                   trial=last_result.trial)
      self.inspect_tool.terminate()

    analysis_result = AnalysisResult(
        author=self,
        run_result=last_result,
        coverage_result=coverage_result,
        chat_history={self.name: coverage_result.to_dict()})
    return analysis_result
30 changes: 25 additions & 5 deletions agent/enhancer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@
"""
import logger
from agent.prototyper import Prototyper
from llm_toolkit.prompt_builder import EnhancerTemplateBuilder, JvmFixingBuilder
from llm_toolkit.prompts import Prompt
from llm_toolkit.prompt_builder import (CoverageEnhancerTemplateBuilder,
EnhancerTemplateBuilder,
JvmFixingBuilder)
from llm_toolkit.prompts import Prompt, TextPrompt
from results import AnalysisResult, BuildResult, Result


Expand Down Expand Up @@ -52,9 +54,27 @@ def _initial_prompt(self, results: list[Result]) -> Prompt:
last_result.run_result.fuzz_target_source, [])
prompt = builder.build([], None, None)
else:
error_desc, errors = last_result.semantic_result.get_error_info()
builder = EnhancerTemplateBuilder(self.llm, benchmark, last_build_result,
error_desc, errors)
# TODO(dongge): Refine this logic.
if last_result.semantic_result:
error_desc, errors = last_result.semantic_result.get_error_info()
builder = EnhancerTemplateBuilder(self.llm, benchmark,
last_build_result, error_desc, errors)
elif last_result.coverage_result:
builder = CoverageEnhancerTemplateBuilder(
self.llm,
benchmark,
last_build_result,
coverage_result=last_result.coverage_result)
else:
logger.error(
'Last result does not contain either semantic result or '
'coverage result',
trial=self.trial)
# TODO(dongge): Give some default initial prompt.
prompt = TextPrompt(
'Last result does not contain either semantic result or '
'coverage result')
return prompt
prompt = builder.build(example_pair=[],
tool_guides=self.inspect_tool.tutorial(),
project_dir=self.inspect_tool.project_dir)
Expand Down
25 changes: 18 additions & 7 deletions agent/one_prompt_enhancer.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,25 @@ def _initial_prompt(self, results: list[Result]) -> Prompt:
last_result.run_result.fuzz_target_source, [])
prompt = builder.build([], None, None)
else:
error_desc, errors = last_result.semantic_result.get_error_info()
# TODO(dongge): Refine this logic.
builder = DefaultTemplateBuilder(self.llm)
prompt = builder.build_fixer_prompt(benchmark,
last_result.fuzz_target_source,
error_desc,
errors,
context='',
instruction='')
if last_result.semantic_result:
error_desc, errors = last_result.semantic_result.get_error_info()
prompt = builder.build_fixer_prompt(benchmark,
last_result.fuzz_target_source,
error_desc,
errors,
context='',
instruction='')
else:
prompt = builder.build_fixer_prompt(
benchmark=benchmark,
raw_code=last_result.fuzz_target_source,
error_desc='',
errors=[],
coverage_result=last_result.coverage_result,
context='',
instruction='')
# TODO: A different file name/dir.
prompt.save(self.args.work_dirs.prompt)

Expand Down
7 changes: 5 additions & 2 deletions agent/one_prompt_prototyper.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,11 @@ def _advice_fuzz_target(self, build_result: BuildResult,
instruction = code_fixer.collect_instructions(
build_result.benchmark, errors, build_result.fuzz_target_source)
prompt = builder.build_fixer_prompt(build_result.benchmark,
build_result.fuzz_target_source, '',
errors, context, instruction)
build_result.fuzz_target_source,
'',
errors,
context=context,
instruction=instruction)

return prompt

Expand Down
25 changes: 5 additions & 20 deletions agent/semantic_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
"""An LLM agent to generate a simple fuzz target prototype that can build.
Use it as a usual module locally, or as script in cloud builds.
"""
import os
import re
from collections import defaultdict, namedtuple
from typing import Optional
Expand Down Expand Up @@ -61,11 +60,8 @@ def execute(self, result_history: list[Result]) -> AnalysisResult:
last_result = result_history[-1]
assert isinstance(last_result, RunResult)

with open(
os.path.join(last_result.work_dirs.run_logs, f'{self.trial:02}.log'),
'rb') as fuzzer_log:
_, _, _, _, semantic_result = self._parse_libfuzzer_logs(
fuzzer_log, last_result.benchmark.project)
_, _, _, _, semantic_result = self._parse_libfuzzer_logs(
last_result.run_log, last_result.benchmark.project)

analysis_result = AnalysisResult(
author=self,
Expand All @@ -75,24 +71,13 @@ def execute(self, result_history: list[Result]) -> AnalysisResult:
return analysis_result

def _parse_libfuzzer_logs(self,
log_handle,
fuzzlog,
project_name: str,
check_cov_increase: bool = True) -> ParseResult:
"""Parses libFuzzer logs."""
lines = None
try:
fuzzlog = log_handle.read(-1)
# Some crashes can mess up the libfuzzer output and raise decode error.
fuzzlog = fuzzlog.decode('utf-8', errors='ignore')
lines = fuzzlog.split('\n')
except MemoryError as e:
# Some logs from abnormal fuzz targets are too large to be parsed.
logger.error('%s is too large to parse: %s',
log_handle.name,
e,
trial=self.trial)
return ParseResult(0, 0, False, '',
SemanticCheckResult(SemanticCheckResult.LOG_MESS_UP))
# Some crashes can mess up the libfuzzer output and raise decode error.
lines = fuzzlog.split('\n')

cov_pcs, total_pcs, crashes = 0, 0, False

Expand Down
20 changes: 13 additions & 7 deletions common/cloud_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class CloudBuilder:

def __init__(self, args: argparse.Namespace) -> None:
self.tags = ['ofg', 'agent', args.cloud_experiment_name]
self.exp_args = args
self.credentials, self.project_id = default()
assert self.project_id, 'Cloud experiment requires a Google cloud project.'
assert hasattr(
Expand Down Expand Up @@ -86,20 +87,25 @@ def _upload_to_gcs(self, local_file_path: str) -> str:
logging.info('Uploaded %s to %s', local_file_path, bucket_file_url)
return bucket_file_url

def _prepare_and_upload_archive(self) -> str:
def _prepare_and_upload_archive(self, result_history: list[Result]) -> str:
"""Archives and uploads local OFG repo to cloud build."""
files_in_dir = set(
dir_files = set(
os.path.relpath(os.path.join(root, file))
for root, _, files in os.walk(OFG_ROOT_DIR)
for file in files)
files_in_git = set(
git_files = set(
subprocess.check_output(['git', 'ls-files'],
cwd=OFG_ROOT_DIR,
text=True).splitlines())
file_to_upload = list(files_in_dir & files_in_git)
result_files = set(
os.path.relpath(os.path.join(root, file))
for root, _, files in os.walk(result_history[-1].work_dirs.base)
for file in files)
file_to_upload = list((dir_files & git_files) | result_files)

with tempfile.TemporaryDirectory() as tmpdirname:
archive_name = f'ofg-repo-{uuid.uuid4().hex}.tar.gz'
archive_name = (f'{self.exp_args.cloud_experiment_name}-ofg-repo-'
f'{uuid.uuid4().hex}.tar.gz')
archive_path = os.path.join(tmpdirname, archive_name)
tar_command = ['tar', '-czf', archive_path] + file_to_upload
subprocess.run(tar_command, cwd=OFG_ROOT_DIR, check=True)
Expand Down Expand Up @@ -295,7 +301,7 @@ def run(self, agent: BaseAgent, result_history: list[Result],
self.tags += [
str(agent),
str(result_history[-1].benchmark.project),
# TODO(dongge): A tag for function name, compatible with tag format.
str(result_history[-1].benchmark.function_name),
str(result_history[-1].trial)
]
# Step1: Generate dill files.
Expand All @@ -306,7 +312,7 @@ def run(self, agent: BaseAgent, result_history: list[Result],
# TODO(dongge): Encrypt dill files?

# Step 2: Upload OFG repo and dill files to GCS.
ofg_url = self._prepare_and_upload_archive()
ofg_url = self._prepare_and_upload_archive(result_history)
agent_url = self._upload_to_gcs(agent_dill)
results_url = self._upload_to_gcs(results_dill)

Expand Down
Loading