diff --git a/.gitignore b/.gitignore index 5918d39..deeb679 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,10 @@ result worktrees/ .direnv/ + +# Test artifacts from e2e agent tests +solo-*.txt +e2e-*.txt +skill-*.txt +pair-*.txt +todo-run-cmd-*.md diff --git a/flake.nix b/flake.nix index 78046d2..0a62180 100644 --- a/flake.nix +++ b/flake.nix @@ -93,6 +93,7 @@ pkgs.just pkgs.git pkgs.jq + pkgs.python3 ]; RUST_SRC_PATH = "${rustToolchain}/lib/rustlib/src/rust/library"; diff --git a/justfile b/justfile index 97f707e..625da9d 100644 --- a/justfile +++ b/justfile @@ -12,9 +12,11 @@ pre-merge: nix-check test-agents: ./tests/e2e-agents.sh -# Test full 2-agent implement protocol end-to-end -test-implement: - ./tests/e2e-implement.sh +# Run the full e2e agent test matrix (solo + pairs, concurrent) +test-matrix *ARGS: + cargo install --path . --quiet + rally command install --target all --force + python3 tests/e2e_matrix.py {{ARGS}} # Pick a todo and print copy-pasteable prompts for implement session agents implement: diff --git a/src/cli.rs b/src/cli.rs index f558e7f..4e85c39 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -305,7 +305,6 @@ pub enum CommandTargetArg { Droid, Pi, ClaudeCode, - Factory, All, } diff --git a/src/command_surface.rs b/src/command_surface.rs index 65a07b4..48f7eff 100644 --- a/src/command_surface.rs +++ b/src/command_surface.rs @@ -234,7 +234,6 @@ enum HarnessTarget { Droid, Pi, ClaudeCode, - Factory, } impl HarnessTarget { @@ -244,7 +243,6 @@ impl HarnessTarget { Self::Droid, Self::Pi, Self::ClaudeCode, - Self::Factory, ] } @@ -254,7 +252,6 @@ impl HarnessTarget { Self::Droid => "droid", Self::Pi => "pi", Self::ClaudeCode => "claude-code", - Self::Factory => "factory", } } @@ -264,35 +261,34 @@ impl HarnessTarget { Self::Droid => "RALLY_DROID_HOME", Self::Pi => "RALLY_PI_HOME", Self::ClaudeCode => "RALLY_CLAUDE_CODE_HOME", - Self::Factory => "RALLY_FACTORY_HOME", } } fn default_root(self, home: &std::path::Path) -> PathBuf { match self { Self::Codex => home.join(".codex"), - Self::Droid => home.join(".droid"), + Self::Droid => home.join(".factory"), Self::Pi => home.join(".pi"), Self::ClaudeCode => home.join(".claude"), - Self::Factory => home.join(".factory"), } } - fn wrapper_path(self, root: &std::path::Path) -> PathBuf { + fn skill_path(self, root: &std::path::Path) -> PathBuf { match self { - Self::Codex => root.join("prompts").join("rally.md"), - Self::Pi => root.join("agent").join("prompts").join("rally.md"), - _ => root.join("commands").join("rally.md"), + Self::Pi => root.join("agent").join("skills").join("rally").join("SKILL.md"), + _ => root.join("skills").join("rally").join("SKILL.md"), } } - fn metadata(self) -> &'static str { + /// Legacy wrapper paths that should be cleaned up on install. + fn legacy_paths(self, root: &std::path::Path) -> Vec { match self { - Self::Codex => "codex-slash-command", - Self::Droid => "droid-slash-command", - Self::Pi => "pi-slash-command", - Self::ClaudeCode => "claude-code-slash-command", - Self::Factory => "factory-slash-command", + Self::Codex => vec![root.join("prompts").join("rally.md")], + Self::Pi => vec![ + root.join("commands").join("rally.md"), + root.join("agent").join("prompts").join("rally.md"), + ], + _ => vec![root.join("commands").join("rally.md")], } } } @@ -381,7 +377,6 @@ enum DoctorStatus { WrapperMissing, InstalledManaged, InstalledManagedModified, - InstalledManagedWrongTarget, InstalledUnmanaged, } @@ -393,7 +388,6 @@ impl fmt::Display for DoctorStatus { Self::WrapperMissing => "wrapper-missing", Self::InstalledManaged => "installed-managed", Self::InstalledManagedModified => "installed-managed-modified", - Self::InstalledManagedWrongTarget => "installed-managed-wrong-target", Self::InstalledUnmanaged => "installed-unmanaged", }; f.write_str(label) @@ -442,7 +436,6 @@ struct UninstallOutcome { #[derive(Clone, Debug, Eq, PartialEq)] enum ManagedOwnership { ManagedForTarget, - ManagedForOtherTarget(String), NotManaged, } @@ -526,12 +519,20 @@ impl TargetAdapter for DotDirAdapter { ); } - Ok(self.target.wrapper_path(&root)) + Ok(self.target.skill_path(&root)) } fn render_wrapper(&self, ctx: &WrapperRenderContext) -> String { - let protocol = format!( - r#"IMPORTANT: You MUST run bash commands to participate in this session. Do NOT try to do the work without running rally commands first. + format!( + r#"--- +name: rally +description: "Run rally multi-agent coding sessions. Use when the user invokes /rally or asks to run a rally workflow." +rally-managed: true +--- + +# Rally + +IMPORTANT: You MUST run bash commands to participate in this session. Do NOT try to do the work without running rally commands first. FIRST, run this bash command immediately with the user's arguments: @@ -548,29 +549,10 @@ This will print your instruction. Then follow this loop: 5. REPEAT from step 1. Do NOT stop until the output says "session complete; stop working". If the command prints "Waiting for ..." — keep waiting. Another agent needs to act first. -If the command exits with code 2 or prints "session complete" — you are done. Stop."#, +If the command exits with code 2 or prints "session complete" — you are done. Stop. +"#, rally = ctx.rally_bin - ); - - match self.target { - HarnessTarget::Codex => format!( - "{protocol}\n\n[//]: # (rally-managed: true)\n[//]: # (rally-target: {target})\n", - protocol = protocol, - target = self.target - ), - HarnessTarget::Pi => format!( - "---\ndescription: \"Run rally workflow\"\n---\n\n\n\n\n{protocol}\n", - target = self.target, - meta = self.target.metadata(), - protocol = protocol - ), - _ => format!( - "\n\n\n\n{protocol}\n", - target = self.target, - meta = self.target.metadata(), - protocol = protocol - ), - } + ) } } @@ -585,7 +567,6 @@ fn selected_targets(target: CommandTargetArg) -> Vec { CommandTargetArg::Droid => vec![HarnessTarget::Droid], CommandTargetArg::Pi => vec![HarnessTarget::Pi], CommandTargetArg::ClaudeCode => vec![HarnessTarget::ClaudeCode], - CommandTargetArg::Factory => vec![HarnessTarget::Factory], } } @@ -598,7 +579,7 @@ fn install_with_environment( let allow_missing = matches!(args.target, CommandTargetArg::All); for target in selected_targets(args.target) { let root = env.root_for(target)?; - let wrapper_path = target.wrapper_path(&root); + let skill_path = target.skill_path(&root); let action = if !root.exists() { if allow_missing { InstallAction::SkipEnvironmentMissing @@ -632,6 +613,11 @@ fn install_with_environment( classify_install_action(&artifact.path, &artifact.content, target, args.force)?; if !args.dry_run { apply_install_action(&artifact, action)?; + for legacy in target.legacy_paths(&root) { + if legacy.exists() && legacy != artifact.path { + let _ = fs::remove_file(&legacy); + } + } } outcomes.push(InstallOutcome { target: artifact.target, @@ -646,7 +632,7 @@ fn install_with_environment( } outcomes.push(InstallOutcome { target, - path: wrapper_path, + path: skill_path, action, }); } @@ -672,17 +658,6 @@ fn classify_install_action( match managed_ownership(¤t, target) { ManagedOwnership::ManagedForTarget => Ok(InstallAction::UpdateManaged), - ManagedOwnership::ManagedForOtherTarget(other_target) => { - if force { - Ok(InstallAction::OverwriteForced) - } else { - bail!( - "refusing to overwrite Rally-managed file for target '{}' at {}. Re-run with --force to replace it", - other_target, - path.display() - ) - } - } ManagedOwnership::NotManaged => { if force { Ok(InstallAction::OverwriteForced) @@ -712,66 +687,38 @@ fn apply_install_action(artifact: &WrapperArtifact, action: InstallAction) -> Re } } -fn managed_ownership(content: &[u8], target: HarnessTarget) -> ManagedOwnership { - let Some(parsed_target) = parse_managed_target(content) else { - return ManagedOwnership::NotManaged; - }; - if parsed_target == target.slug() { +fn managed_ownership(content: &[u8], _target: HarnessTarget) -> ManagedOwnership { + if is_rally_managed(content) { ManagedOwnership::ManagedForTarget } else { - ManagedOwnership::ManagedForOtherTarget(parsed_target) + ManagedOwnership::NotManaged } } -fn parse_managed_target(content: &[u8]) -> Option { - let text = std::str::from_utf8(content).ok()?; - - // Skip leading YAML frontmatter (Pi prompt templates use ---\n...\n---) - let body = if text.starts_with("---\n") || text.starts_with("---\r\n") { - text.find("\n---") - .and_then(|i| text.get(i + 4..)) - .unwrap_or(text) - } else { - text +fn is_rally_managed(content: &[u8]) -> bool { + let Some(text) = std::str::from_utf8(content).ok() else { + return false; }; - // HTML comment format (Claude, Pi, Droid, Factory): - // - // - let mut lines = body.lines().map(|l| l.trim()).filter(|l| !l.is_empty()); - let first = lines.next()?; - if first == RALLY_MANAGED_MARKER { - let target_line = lines.next()?; - let prefix = ""; - if target_line.starts_with(prefix) && target_line.ends_with(suffix) { - let target = &target_line[prefix.len()..target_line.len() - suffix.len()]; - if !target.trim().is_empty() { - return Some(target.trim().to_string()); - } + // SKILL.md format: check YAML frontmatter for rally-managed: true + if text.starts_with("---\n") || text.starts_with("---\r\n") { + if let Some(fm_end) = text.find("\n---") { + let frontmatter = &text[..fm_end]; + return frontmatter.lines().any(|l| l.trim() == "rally-managed: true"); } - return None; } - // Markdown comment format (Codex): - // [//]: # (rally-managed: true) - // [//]: # (rally-target: ) - let managed_md = "[//]: # (rally-managed: true)"; - let target_prefix = "[//]: # (rally-target: "; - for line in text.lines() { - let trimmed = line.trim(); - if trimmed == managed_md { - continue; - } - if trimmed.starts_with(target_prefix) && trimmed.ends_with(')') { - let inner = &trimmed[target_prefix.len()..trimmed.len() - 1]; - if !inner.trim().is_empty() { - return Some(inner.trim().to_string()); - } - } + // Legacy HTML comment format + if text.lines().any(|l| l.trim() == RALLY_MANAGED_MARKER) { + return true; + } + + // Legacy markdown comment format (Codex) + if text.lines().any(|l| l.trim() == "[//]: # (rally-managed: true)") { + return true; } - None + false } fn doctor_with_environment( @@ -782,16 +729,16 @@ fn doctor_with_environment( let mut outcomes = Vec::new(); for target in selected_targets(args.target) { let root = env.root_for(target)?; - let wrapper = target.wrapper_path(&root); + let skill = target.skill_path(&root); let status = if !root.exists() { DoctorStatus::EnvironmentMissing } else if !root.is_dir() { DoctorStatus::InvalidEnvironment - } else if !wrapper.exists() { + } else if !skill.exists() { DoctorStatus::WrapperMissing } else { - let bytes = fs::read(&wrapper) - .with_context(|| format!("reading wrapper {}", wrapper.display()))?; + let bytes = fs::read(&skill) + .with_context(|| format!("reading skill {}", skill.display()))?; match managed_ownership(&bytes, target) { ManagedOwnership::ManagedForTarget => { let expected = adapter_for(target).render_wrapper(render_ctx); @@ -801,16 +748,13 @@ fn doctor_with_environment( DoctorStatus::InstalledManagedModified } } - ManagedOwnership::ManagedForOtherTarget(_) => { - DoctorStatus::InstalledManagedWrongTarget - } ManagedOwnership::NotManaged => DoctorStatus::InstalledUnmanaged, } }; outcomes.push(DoctorOutcome { target, root, - wrapper, + wrapper: skill, status, }); } @@ -824,34 +768,39 @@ fn uninstall_with_environment( let mut outcomes = Vec::new(); for target in selected_targets(args.target) { let root = env.root_for(target)?; - let wrapper = target.wrapper_path(&root); + let skill = target.skill_path(&root); let action = if !root.exists() { UninstallAction::SkippedEnvironmentMissing } else if !root.is_dir() { UninstallAction::SkippedInvalidEnvironment - } else if !wrapper.exists() { + } else if !skill.exists() { UninstallAction::SkippedWrapperMissing } else { - let bytes = fs::read(&wrapper) - .with_context(|| format!("reading wrapper {}", wrapper.display()))?; + let bytes = fs::read(&skill) + .with_context(|| format!("reading skill {}", skill.display()))?; match managed_ownership(&bytes, target) { ManagedOwnership::ManagedForTarget => { if args.dry_run { UninstallAction::WouldRemoveManaged } else { - fs::remove_file(&wrapper) - .with_context(|| format!("removing wrapper {}", wrapper.display()))?; + fs::remove_file(&skill) + .with_context(|| format!("removing skill {}", skill.display()))?; + for legacy in target.legacy_paths(&root) { + if legacy.exists() { + let _ = fs::remove_file(&legacy); + } + } UninstallAction::RemovedManaged } } - ManagedOwnership::ManagedForOtherTarget(_) | ManagedOwnership::NotManaged => { + ManagedOwnership::NotManaged => { UninstallAction::SkippedUnmanaged } } }; outcomes.push(UninstallOutcome { target, - wrapper, + wrapper: skill, action, }); } @@ -1100,7 +1049,7 @@ mod tests { .collect::>(); assert_eq!( names, - vec!["codex", "droid", "pi", "claude-code", "factory"] + vec!["codex", "droid", "pi", "claude-code"] ); } @@ -1119,20 +1068,21 @@ mod tests { #[test] fn discover_install_path_uses_default_dot_directory_layout() -> Result<()> { let home = unique_dir("rally-command-adapter-layout"); - let root = home.join(".droid"); + let root = home.join(".factory"); fs::create_dir_all(&root).with_context(|| format!("creating {}", root.display()))?; let adapter = DotDirAdapter::new(HarnessTarget::Droid); let path = adapter.discover_install_path(&env_with_home(&home))?; - assert_eq!(path, root.join("commands").join("rally.md")); + assert_eq!(path, root.join("skills").join("rally").join("SKILL.md")); Ok(()) } #[test] fn wrapper_rendering_delegates_to_command_run() { - let adapter = DotDirAdapter::new(HarnessTarget::Factory); + let adapter = DotDirAdapter::new(HarnessTarget::Droid); let rendered = adapter.render_wrapper(&WrapperRenderContext::default()); assert!(rendered.contains("rally command run $ARGUMENTS")); - assert!(rendered.contains("rally-target: factory")); + assert!(rendered.contains("rally-managed: true")); + assert!(rendered.starts_with("---\nname: rally\n")); } fn prepare_target_root(home: &Path, target: HarnessTarget) -> Result { @@ -1204,8 +1154,8 @@ mod tests { assert_eq!(first.len(), 1); assert_eq!(first[0].action, InstallAction::Create); - let wrapper = root.join("prompts").join("rally.md"); - let content = fs::read_to_string(&wrapper)?; + let skill = root.join("skills").join("rally").join("SKILL.md"); + let content = fs::read_to_string(&skill)?; assert!(content.contains("rally command run $ARGUMENTS")); assert!(content.contains("rally-managed: true")); @@ -1223,9 +1173,9 @@ mod tests { fn install_refuses_to_overwrite_unknown_file_without_force() -> Result<()> { let home = unique_dir("rally-command-install-no-force"); let root = prepare_target_root(&home, HarnessTarget::Pi)?; - let wrapper = root.join("agent").join("prompts").join("rally.md"); - fs::create_dir_all(wrapper.parent().expect("wrapper parent"))?; - fs::write(&wrapper, "custom wrapper")?; + let skill = root.join("agent").join("skills").join("rally").join("SKILL.md"); + fs::create_dir_all(skill.parent().expect("skill parent"))?; + fs::write(&skill, "custom wrapper")?; let env = env_with_home(&home); let render_ctx = WrapperRenderContext::default(); @@ -1238,7 +1188,7 @@ mod tests { let msg = err.to_string(); assert!(msg.contains("refusing to overwrite non-Rally file")); assert!(msg.contains("--force")); - assert_eq!(fs::read_to_string(&wrapper)?, "custom wrapper"); + assert_eq!(fs::read_to_string(&skill)?, "custom wrapper"); Ok(()) } @@ -1246,9 +1196,9 @@ mod tests { fn install_overwrites_unknown_file_with_force() -> Result<()> { let home = unique_dir("rally-command-install-force"); let root = prepare_target_root(&home, HarnessTarget::ClaudeCode)?; - let wrapper = root.join("commands").join("rally.md"); - fs::create_dir_all(wrapper.parent().expect("wrapper parent"))?; - fs::write(&wrapper, "custom wrapper")?; + let skill = root.join("skills").join("rally").join("SKILL.md"); + fs::create_dir_all(skill.parent().expect("skill parent"))?; + fs::write(&skill, "custom wrapper")?; let env = env_with_home(&home); let render_ctx = WrapperRenderContext::default(); @@ -1258,8 +1208,8 @@ mod tests { &render_ctx, )?; assert_eq!(outcome[0].action, InstallAction::OverwriteForced); - let content = fs::read_to_string(&wrapper)?; - assert!(content.contains(RALLY_MANAGED_MARKER)); + let content = fs::read_to_string(&skill)?; + assert!(content.contains("rally-managed: true")); Ok(()) } @@ -1285,7 +1235,7 @@ mod tests { .expect("droid outcome"); assert_eq!(codex.action, InstallAction::Create); assert_eq!(droid.action, InstallAction::SkipEnvironmentMissing); - assert!(codex_root.join("prompts").join("rally.md").exists()); + assert!(codex_root.join("skills").join("rally").join("SKILL.md").exists()); Ok(()) } @@ -1293,11 +1243,11 @@ mod tests { fn marker_substring_does_not_make_file_managed() -> Result<()> { let home = unique_dir("rally-command-marker-substring"); let root = prepare_target_root(&home, HarnessTarget::Pi)?; - let wrapper = root.join("agent").join("prompts").join("rally.md"); - fs::create_dir_all(wrapper.parent().expect("wrapper parent"))?; + let skill = root.join("agent").join("skills").join("rally").join("SKILL.md"); + fs::create_dir_all(skill.parent().expect("skill parent"))?; fs::write( - &wrapper, - "user file\ncontains somewhere\n", + &skill, + "user file\ncontains rally-managed: true somewhere\n", )?; let env = env_with_home(&home); let render_ctx = WrapperRenderContext::default(); @@ -1317,24 +1267,24 @@ mod tests { let uninstall = uninstall_with_environment(&uninstall_args(CommandTargetArg::Pi, false), &env)?; assert_eq!(uninstall[0].action, UninstallAction::SkippedUnmanaged); - assert!(wrapper.exists()); + assert!(skill.exists()); Ok(()) } #[test] fn doctor_reports_managed_install_status() -> Result<()> { let home = unique_dir("rally-command-doctor-managed"); - let _root = prepare_target_root(&home, HarnessTarget::Factory)?; + let _root = prepare_target_root(&home, HarnessTarget::Droid)?; let env = env_with_home(&home); let render_ctx = WrapperRenderContext::default(); install_with_environment( - &install_args(CommandTargetArg::Factory, false, false), + &install_args(CommandTargetArg::Droid, false, false), &env, &render_ctx, )?; let result = doctor_with_environment( - &doctor_args(CommandTargetArg::Factory, false), + &doctor_args(CommandTargetArg::Droid, false), &env, &render_ctx, )?; @@ -1344,14 +1294,14 @@ mod tests { } #[test] - fn doctor_reports_wrong_target_when_header_target_mismatches_path() -> Result<()> { - let home = unique_dir("rally-command-doctor-target-mismatch"); + fn doctor_reports_unmanaged_when_file_is_not_rally() -> Result<()> { + let home = unique_dir("rally-command-doctor-unmanaged"); let root = prepare_target_root(&home, HarnessTarget::Codex)?; - let wrapper = root.join("prompts").join("rally.md"); - fs::create_dir_all(wrapper.parent().expect("wrapper parent"))?; + let skill = root.join("skills").join("rally").join("SKILL.md"); + fs::create_dir_all(skill.parent().expect("skill parent"))?; fs::write( - &wrapper, - "\n\n\n# /rally\n", + &skill, + "---\nname: rally\ndescription: custom rally skill\n---\n# My Rally\n", )?; let env = env_with_home(&home); let render_ctx = WrapperRenderContext::default(); @@ -1360,7 +1310,7 @@ mod tests { &env, &render_ctx, )?; - assert_eq!(result[0].status, DoctorStatus::InstalledManagedWrongTarget); + assert_eq!(result[0].status, DoctorStatus::InstalledUnmanaged); Ok(()) } @@ -1377,8 +1327,8 @@ mod tests { &env, &render_ctx, )?; - let unmanaged = unmanaged_root.join("commands").join("rally.md"); - fs::create_dir_all(unmanaged.parent().expect("wrapper parent"))?; + let unmanaged = unmanaged_root.join("skills").join("rally").join("SKILL.md"); + fs::create_dir_all(unmanaged.parent().expect("skill parent"))?; fs::write(&unmanaged, "custom content")?; let outcome = @@ -1393,7 +1343,7 @@ mod tests { .expect("droid result"); assert_eq!(codex.action, UninstallAction::RemovedManaged); assert_eq!(droid.action, UninstallAction::SkippedUnmanaged); - assert!(!managed_root.join("prompts").join("rally.md").exists()); + assert!(!managed_root.join("skills").join("rally").join("SKILL.md").exists()); assert_eq!(fs::read_to_string(&unmanaged)?, "custom content"); Ok(()) } @@ -1409,11 +1359,11 @@ mod tests { &env, &render_ctx, )?; - let wrapper = root.join("agent").join("prompts").join("rally.md"); + let skill = root.join("agent").join("skills").join("rally").join("SKILL.md"); let outcome = uninstall_with_environment(&uninstall_args(CommandTargetArg::Pi, true), &env)?; assert_eq!(outcome[0].action, UninstallAction::WouldRemoveManaged); - assert!(wrapper.exists()); + assert!(skill.exists()); Ok(()) } diff --git a/tests/command_install_run.rs b/tests/command_install_run.rs index aad3c9b..42c5d84 100644 --- a/tests/command_install_run.rs +++ b/tests/command_install_run.rs @@ -96,16 +96,14 @@ fn command_install_run_entrypoint_integration() -> Result<()> { let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); let codex_root = temp_home.join(".codex"); - let droid_root = temp_home.join(".droid"); + let droid_root = temp_home.join(".factory"); let pi_root = temp_home.join(".pi"); let claude_root = temp_home.join(".claude"); - let factory_root = temp_home.join(".factory"); for root in [ &codex_root, &droid_root, &pi_root, &claude_root, - &factory_root, ] { fs::create_dir_all(root)?; } @@ -115,34 +113,33 @@ fn command_install_run_entrypoint_integration() -> Result<()> { ®istry, )?; - for (target, root) in [ + for (_target, root) in [ ("codex", &codex_root), ("droid", &droid_root), ("pi", &pi_root), ("claude-code", &claude_root), - ("factory", &factory_root), ] { - let wrapper = match target { - "codex" => root.join("prompts").join("rally.md"), - "pi" => root.join("agent").join("prompts").join("rally.md"), - _ => root.join("commands").join("rally.md"), + let skill = match _target { + "pi" => root.join("agent").join("skills").join("rally").join("SKILL.md"), + _ => root.join("skills").join("rally").join("SKILL.md"), }; - let content = fs::read_to_string(&wrapper)?; + let content = fs::read_to_string(&skill)?; assert!(content.contains("rally command run $ARGUMENTS")); - assert!(content.contains(&format!("rally-target: {target}"))); + assert!(content.contains("rally-managed: true")); } - let codex_wrapper = codex_root.join("prompts").join("rally.md"); - let before = fs::read_to_string(&codex_wrapper)?; + let codex_skill = codex_root.join("skills").join("rally").join("SKILL.md"); + let before = fs::read_to_string(&codex_skill)?; run_cli( &["rally", "command", "install", "--target", "codex"], ®istry, )?; - let after = fs::read_to_string(&codex_wrapper)?; + let after = fs::read_to_string(&codex_skill)?; assert_eq!(before, after); - let droid_wrapper = droid_root.join("commands").join("rally.md"); - fs::write(&droid_wrapper, "custom wrapper content")?; + let droid_skill = droid_root.join("skills").join("rally").join("SKILL.md"); + fs::create_dir_all(droid_skill.parent().expect("skill parent"))?; + fs::write(&droid_skill, "custom wrapper content")?; let err = run_cli( &["rally", "command", "install", "--target", "droid"], ®istry, @@ -169,15 +166,15 @@ fn command_install_run_entrypoint_integration() -> Result<()> { ], ®istry, )?; - assert!(codex_wrapper.exists()); + assert!(codex_skill.exists()); run_cli( &["rally", "command", "uninstall", "--target", "codex"], ®istry, )?; - assert!(!codex_wrapper.exists()); + assert!(!codex_skill.exists()); assert_eq!( - fs::read_to_string(&droid_wrapper)?, + fs::read_to_string(&droid_skill)?, "custom wrapper content" ); diff --git a/tests/e2e_matrix.py b/tests/e2e_matrix.py new file mode 100644 index 0000000..72d70a9 --- /dev/null +++ b/tests/e2e_matrix.py @@ -0,0 +1,465 @@ +#!/usr/bin/env python3 +""" +Concurrent e2e test matrix for rally implement protocol. + +Runs all requested (implementer, reviewer) agent pairs concurrently. +Each test creates a unique 1-step todo, launches both agents, polls +the session state, and reports results. + +Usage: + python3 tests/e2e_matrix.py # all available pairs + python3 tests/e2e_matrix.py --solo # solo tests only (--reviewers 0) + python3 tests/e2e_matrix.py --pairs # pair tests only + python3 tests/e2e_matrix.py --impl claude --revw pi # specific pair + python3 tests/e2e_matrix.py --timeout 180 # custom timeout +""" + +import argparse +import json +import os +import shutil +import subprocess +import sys +import tempfile +import textwrap +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + +WORKSPACE = Path(__file__).resolve().parent.parent +HOME = Path.home() +SESSIONS_DIR = HOME / ".rally" / "sessions" + +AGENTS = { + "claude": { + "launch_stdin": lambda prompt, logfile: [ + "claude", "--dangerously-skip-permissions", "--no-session-persistence" + ], + "prompt_style": "stdin", + "skill_prefix": "/rally", + }, + "codex": { + "launch_exec": lambda prompt, logfile: [ + "codex", "exec", prompt, + "--dangerously-bypass-approvals-and-sandbox", "--ephemeral" + ], + "prompt_style": "exec", + "skill_prefix": "$rally", + }, + "pi": { + "launch_stdin": lambda prompt, logfile: [ + "pi", "--no-session", "--tools", "bash" + ], + "prompt_style": "stdin", + "skill_prefix": "Use the rally skill to:", + }, + "droid": { + "launch_exec": lambda prompt, logfile: [ + "droid", "exec", prompt, "--skip-permissions-unsafe" + ], + "prompt_style": "exec", + "skill_prefix": "/rally", + }, +} + + +@dataclass +class TestResult: + name: str + impl_agent: str + revw_agent: Optional[str] + session: str + phase: str + agents_joined: list + file_content: Optional[str] + duration: float + passed: bool + failure_reason: str = "" + logdir: str = "" + errors: list = field(default_factory=list) + + +def available_agents(): + return [a for a in AGENTS if shutil.which(a)] + + +def read_state(session_name): + state_file = SESSIONS_DIR / session_name / "state.json" + if not state_file.exists(): + return None + try: + return json.loads(state_file.read_text()) + except (json.JSONDecodeError, OSError): + return None + + +def launch_agent(agent_name, prompt, logfile): + cfg = AGENTS[agent_name] + if cfg["prompt_style"] == "stdin": + cmd = cfg["launch_stdin"](prompt, logfile) + proc = subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=open(logfile, "w"), + stderr=subprocess.STDOUT, + cwd=str(WORKSPACE), + ) + proc.stdin.write(prompt.encode()) + proc.stdin.close() + else: + cmd = cfg["launch_exec"](prompt, logfile) + proc = subprocess.Popen( + cmd, + stdout=open(logfile, "w"), + stderr=subprocess.STDOUT, + cwd=str(WORKSPACE), + ) + return proc + + +def build_prompt(agent_name, role, todo_filename, extra_flags=""): + cfg = AGENTS[agent_name] + prefix = cfg["skill_prefix"] + args = f"implement {todo_filename} {role}" + if extra_flags: + args += f" {extra_flags}" + return f"{prefix} {args}" + + +def run_solo_test(agent_name, timeout=120): + ts = int(time.time() * 1000) + todo_name = f"solo-{agent_name}-{ts}" + todo_file = WORKSPACE / f"{todo_name}.md" + session_name = f"implement-{todo_name}" + outfile = WORKSPACE / f"solo-{agent_name}-{ts}.txt" + + logdir = tempfile.mkdtemp(prefix=f"rally-solo-{agent_name}-") + + todo_file.write_text(textwrap.dedent(f"""\ + ## Spec + Solo test for {agent_name}. + ## Plan + 1. Create {outfile.name} with content "hello {agent_name}" + Acceptance criteria: + - {outfile.name} exists with "hello {agent_name}" + """)) + + start = time.time() + prompt = build_prompt(agent_name, "implementer", todo_file.name, "--reviewers 0") + logfile = os.path.join(logdir, "agent.log") + + proc = launch_agent(agent_name, prompt, logfile) + + # Poll + final_phase = "no-session" + elapsed = 0 + while elapsed < timeout: + time.sleep(5) + elapsed = time.time() - start + state = read_state(session_name) + if state: + final_phase = state.get("phase", "unknown") + # For solo, any phase past registration means rally was used + if final_phase in ("review", "final_review", "done"): + break + + proc.terminate() + try: + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + proc.kill() + + duration = time.time() - start + file_content = outfile.read_text().strip() if outfile.exists() else None + state = read_state(session_name) + agents_joined = list(state.get("agents", {}).keys()) if state else [] + + passed = ( + state is not None # session was created (rally was invoked) + and final_phase in ("implement", "review", "final_review", "done") + and file_content is not None + ) + + errors = [] + if state is None: + errors.append("session never created -- agent didn't run rally") + if file_content is None: + errors.append(f"output file {outfile.name} not created") + elif file_content != f"hello {agent_name}": + errors.append(f"file content mismatch: got '{file_content}', expected 'hello {agent_name}'") + + result = TestResult( + name=f"solo-{agent_name}", + impl_agent=agent_name, + revw_agent=None, + session=session_name, + phase=final_phase, + agents_joined=agents_joined, + file_content=file_content, + duration=duration, + passed=passed, + failure_reason="; ".join(errors) if errors else "", + logdir=logdir, + errors=errors, + ) + + # Cleanup + todo_file.unlink(missing_ok=True) + outfile.unlink(missing_ok=True) + session_dir = SESSIONS_DIR / session_name + if session_dir.exists(): + shutil.rmtree(session_dir, ignore_errors=True) + + return result + + +def run_pair_test(impl_agent, revw_agent, timeout=300): + ts = int(time.time() * 1000) + todo_name = f"pair-{impl_agent}-{revw_agent}-{ts}" + todo_file = WORKSPACE / f"{todo_name}.md" + session_name = f"implement-{todo_name}" + outfile = WORKSPACE / f"pair-{impl_agent}-{revw_agent}-{ts}.txt" + + logdir = tempfile.mkdtemp(prefix=f"rally-pair-{impl_agent}-{revw_agent}-") + + todo_file.write_text(textwrap.dedent(f"""\ + ## Spec + Pair test: {impl_agent} implements, {revw_agent} reviews. + ## Plan + 1. Create {outfile.name} with content "step 1" + Acceptance criteria: + - {outfile.name} exists with "step 1" + 2. Append " and step 2" to {outfile.name} + Acceptance criteria: + - {outfile.name} contains "step 1 and step 2" + """)) + + start = time.time() + impl_prompt = build_prompt(impl_agent, "implementer", todo_file.name) + revw_prompt = build_prompt(revw_agent, "reviewer", todo_file.name) + + impl_log = os.path.join(logdir, "impl.log") + revw_log = os.path.join(logdir, "revw.log") + + impl_proc = launch_agent(impl_agent, impl_prompt, impl_log) + time.sleep(8) + revw_proc = launch_agent(revw_agent, revw_prompt, revw_log) + + # Poll with phase trace + phase_trace = [] + final_phase = "no-session" + elapsed = 0 + while elapsed < timeout: + time.sleep(10) + elapsed = time.time() - start + state = read_state(session_name) + if state: + final_phase = state.get("phase", "unknown") + agents = list(state.get("agents", {}).keys()) + phase_trace.append((int(elapsed), final_phase, agents)) + if final_phase == "done": + break + + for proc in (impl_proc, revw_proc): + proc.terminate() + try: + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + proc.kill() + + duration = time.time() - start + file_content = outfile.read_text().strip() if outfile.exists() else None + state = read_state(session_name) + agents_joined = list(state.get("agents", {}).keys()) if state else [] + + # Check for handoff: phase must go implement -> review -> implement (step 2) + saw_handoff = False + last_phase = None + implement_count = 0 + for _, ph, _ in phase_trace: + if ph == "implement": + implement_count += 1 + if last_phase == "review" and ph == "implement" and implement_count >= 2: + saw_handoff = True + last_phase = ph + + errors = [] + if state is None: + errors.append("session never created") + elif "implementer" not in agents_joined: + errors.append("implementer never joined") + elif "reviewer-1" not in agents_joined: + errors.append("reviewer never joined") + if final_phase != "done": + errors.append(f"phase={final_phase}, expected done") + if file_content is None: + errors.append(f"output file {outfile.name} not created") + elif "step 1 and step 2" not in file_content: + errors.append(f"file content wrong: '{file_content}'") + if not saw_handoff and final_phase == "done": + errors.append("no implement->review->implement handoff detected") + + passed = final_phase == "done" and not errors + + result = TestResult( + name=f"pair-{impl_agent}+{revw_agent}", + impl_agent=impl_agent, + revw_agent=revw_agent, + session=session_name, + phase=final_phase, + agents_joined=agents_joined, + file_content=file_content, + duration=duration, + passed=passed, + failure_reason="; ".join(errors) if errors else "", + logdir=logdir, + errors=errors, + ) + + # Cleanup + todo_file.unlink(missing_ok=True) + outfile.unlink(missing_ok=True) + session_dir = SESSIONS_DIR / session_name + if session_dir.exists(): + shutil.rmtree(session_dir, ignore_errors=True) + + return result + + +def print_results(results): + print() + print("=" * 72) + print(f"{'TEST':<30} {'RESULT':<8} {'PHASE':<14} {'TIME':>6} DETAILS") + print("-" * 72) + for r in results: + status = "\033[32mPASS\033[0m" if r.passed else "\033[31mFAIL\033[0m" + print(f"{r.name:<30} {status:<17} {r.phase:<14} {r.duration:5.0f}s {r.failure_reason[:40]}") + print("-" * 72) + + passed = sum(1 for r in results if r.passed) + total = len(results) + color = "\033[32m" if passed == total else "\033[31m" + print(f"{color}{passed}/{total} passed\033[0m") + + failed = [r for r in results if not r.passed] + if failed: + print() + print("FAILURE DETAILS:") + for r in failed: + print(f"\n {r.name}:") + print(f" Session: {r.session}") + print(f" Phase: {r.phase}") + print(f" Agents: {r.agents_joined}") + print(f" File: {r.file_content!r}") + print(f" Logs: {r.logdir}") + for e in r.errors: + print(f" ERROR: {e}") + log_files = list(Path(r.logdir).glob("*.log")) + for lf in log_files: + content = lf.read_text() + if content.strip(): + tail = "\n".join(content.strip().splitlines()[-10:]) + print(f" --- {lf.name} (last 10 lines) ---") + for line in tail.splitlines(): + print(f" {line}") + + +def main(): + parser = argparse.ArgumentParser(description="Rally e2e test matrix") + parser.add_argument("--solo", action="store_true", help="solo tests only") + parser.add_argument("--pairs", action="store_true", help="pair tests only") + parser.add_argument("--impl", help="specific implementer agent") + parser.add_argument("--revw", help="specific reviewer agent") + parser.add_argument("--timeout", type=int, default=420, help="per-test timeout (default 420)") + parser.add_argument("--solo-timeout", type=int, default=120, help="solo test timeout (default 120)") + parser.add_argument("--jobs", type=int, default=4, help="max concurrent tests (default 4)") + args = parser.parse_args() + + agents = available_agents() + print(f"Available agents: {', '.join(agents)}") + + if len(agents) == 0: + print("No agents found!") + sys.exit(1) + + # Build test list + futures_map = {} + results = [] + + # Specific pair + if args.impl and args.revw: + if args.impl not in agents or args.revw not in agents: + print(f"Agent not available. Have: {agents}") + sys.exit(1) + with ThreadPoolExecutor(max_workers=1) as pool: + f = pool.submit(run_pair_test, args.impl, args.revw, args.timeout) + results.append(f.result()) + print_results(results) + sys.exit(0 if all(r.passed for r in results) else 1) + + def collect(futures_map): + for future in as_completed(futures_map): + test_name = futures_map[future] + try: + result = future.result() + results.append(result) + status = "PASS" if result.passed else "FAIL" + print(f" [{status}] {test_name} ({result.duration:.0f}s) {result.phase}") + except Exception as e: + print(f" [ERROR] {test_name}: {e}") + + # Phase 1: all solo tests concurrently (each agent used once) + if not args.pairs: + print("\n--- Solo tests ---") + with ThreadPoolExecutor(max_workers=len(agents)) as pool: + futures_map = {} + for agent in agents: + f = pool.submit(run_solo_test, agent, args.solo_timeout) + futures_map[f] = f"solo-{agent}" + collect(futures_map) + + # Phase 2: pair tests in waves (no agent used in two concurrent tests) + if not args.solo: + # Schedule pairs so no agent appears twice per wave + pair_candidates = [ + ("claude", "pi"), + ("pi", "claude"), + ("claude", "codex"), + ] + pairs = [(a, b) for a, b in pair_candidates if a in agents and b in agents] + + # Greedy wave scheduling: each wave has no agent overlap + waves = [] + remaining = list(pairs) + while remaining: + wave = [] + used = set() + still_remaining = [] + for impl_a, revw_a in remaining: + if impl_a not in used and revw_a not in used: + wave.append((impl_a, revw_a)) + used.update([impl_a, revw_a]) + else: + still_remaining.append((impl_a, revw_a)) + waves.append(wave) + remaining = still_remaining + + for i, wave in enumerate(waves): + print(f"\n--- Pair wave {i+1}: {', '.join(f'{a}+{b}' for a,b in wave)} ---") + with ThreadPoolExecutor(max_workers=len(wave)) as pool: + futures_map = {} + for impl_a, revw_a in wave: + f = pool.submit(run_pair_test, impl_a, revw_a, args.timeout) + futures_map[f] = f"pair-{impl_a}+{revw_a}" + collect(futures_map) + + # Sort by name for stable output + results.sort(key=lambda r: r.name) + print_results(results) + sys.exit(0 if all(r.passed for r in results) else 1) + + +if __name__ == "__main__": + main()