algo/scripts/track-test-effectiveness.py

#!/usr/bin/env python3
"""
Track test effectiveness by analyzing CI failures and correlating with issues/PRs
This helps identify which tests actually catch bugs vs just failing randomly
"""
import json
import subprocess
from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path
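
# All GitHub data is fetched through the `gh` CLI below, so the script assumes
# `gh` is installed and authenticated (e.g. via `gh auth login`) before running.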


def get_github_api_data(endpoint):
    """Fetch data from GitHub API"""
    cmd = ['gh', 'api', endpoint]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"Error fetching {endpoint}: {result.stderr}")
        return None
    return json.loads(result.stdout)
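
# Illustrative usage: get_github_api_data('/repos/<owner>/<repo>/actions/runs')
# returns the parsed JSON payload from `gh api`, or None if the call fails.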


def analyze_workflow_runs(repo_owner, repo_name, days_back=30):
    """Analyze workflow runs to find test failures"""
    since = (datetime.now() - timedelta(days=days_back)).isoformat()

    # Get failed workflow runs in the window
    runs = get_github_api_data(
        f'/repos/{repo_owner}/{repo_name}/actions/runs?created=>{since}&status=failure'
    )
    if not runs:
        return {}

    test_failures = defaultdict(list)

    for run in runs.get('workflow_runs', []):
        # Get jobs for this run
        jobs = get_github_api_data(
            f'/repos/{repo_owner}/{repo_name}/actions/runs/{run["id"]}/jobs'
        )
        if not jobs:
            continue

        for job in jobs.get('jobs', []):
            if job['conclusion'] == 'failure':
                # Try to work out which test failed
                logs_url = job.get('logs_url')
                if logs_url:
                    # Map the job name to a test category
                    # (extract_failed_test is a simplified mapping; logs are not parsed)
                    test_name = extract_failed_test(job['name'], run['id'])
                    if test_name:
                        test_failures[test_name].append({
                            'run_id': run['id'],
                            'run_number': run['run_number'],
                            'date': run['created_at'],
                            'branch': run['head_branch'],
                            'commit': run['head_sha'][:7],
                            'pr': extract_pr_number(run)
                        })

    return test_failures
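
# Illustrative shape of the mapping built above (hypothetical values; real data
# depends on the live CI history):
#   {'docker_tests': [{'run_id': 123456, 'run_number': 42,
#                      'date': '2025-08-01T12:00:00Z', 'branch': 'main',
#                      'commit': 'abc1234', 'pr': 1234}, ...]}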


def extract_failed_test(job_name, run_id):
    """Extract test name from job - this is simplified"""
    # Map job names to test categories
    job_to_tests = {
        'Basic sanity tests': 'test_basic_sanity',
        'Ansible syntax check': 'ansible_syntax',
        'Docker build test': 'docker_tests',
        'Configuration generation test': 'config_generation',
        'Ansible dry-run validation': 'ansible_dry_run'
    }
    return job_to_tests.get(job_name, job_name)


def extract_pr_number(run):
    """Extract PR number from workflow run"""
    for pr in run.get('pull_requests', []):
        return pr['number']
    return None


def correlate_with_issues(repo_owner, repo_name, test_failures):
    """Correlate test failures with issues/PRs that fixed them"""
    correlations = defaultdict(lambda: {'caught_bugs': 0, 'false_positives': 0})

    for test_name, failures in test_failures.items():
        for failure in failures:
            if failure['pr']:
                # Check if PR was merged (indicating it fixed a real issue)
                pr = get_github_api_data(
                    f'/repos/{repo_owner}/{repo_name}/pulls/{failure["pr"]}'
                )
                if pr and pr.get('merged'):
                    # Check PR title/body for bug indicators
                    # (the API returns null for an empty body, so guard against None)
                    title = (pr.get('title') or '').lower()
                    body = (pr.get('body') or '').lower()

                    bug_keywords = ['fix', 'bug', 'error', 'issue', 'broken', 'fail']
                    is_bug_fix = any(keyword in title or keyword in body
                                     for keyword in bug_keywords)

                    if is_bug_fix:
                        correlations[test_name]['caught_bugs'] += 1
                    else:
                        correlations[test_name]['false_positives'] += 1

    return correlations
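
# Heuristic used above: a failure tied to a merged PR whose title or body
# mentions a bug-fix keyword counts as a caught bug; a merged PR without such
# keywords counts as a false positive. Failures with no associated PR, or whose
# PR was never merged, contribute to neither count.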


def generate_effectiveness_report(test_failures, correlations):
    """Generate test effectiveness report"""
    report = []
    report.append("# Test Effectiveness Report")
    report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

    # Summary
    report.append("## Summary")
    total_failures = sum(len(f) for f in test_failures.values())
    report.append(f"- Total test failures: {total_failures}")
    report.append(f"- Unique tests that failed: {len(test_failures)}")
    report.append("")

    # Effectiveness scores
    report.append("## Test Effectiveness Scores")
    report.append("| Test | Failures | Caught Bugs | False Positives | Effectiveness |")
    report.append("|------|----------|-------------|-----------------|---------------|")

    scores = []
    for test_name, failures in test_failures.items():
        failure_count = len(failures)
        caught = correlations[test_name]['caught_bugs']
        false_pos = correlations[test_name]['false_positives']

        # Calculate effectiveness (bugs caught / total failures)
        if failure_count > 0:
            effectiveness = caught / failure_count
        else:
            effectiveness = 0

        scores.append((test_name, failure_count, caught, false_pos, effectiveness))

    # Sort by effectiveness
    scores.sort(key=lambda x: x[4], reverse=True)

    for test_name, failures, caught, false_pos, effectiveness in scores:
        report.append(f"| {test_name} | {failures} | {caught} | {false_pos} | {effectiveness:.1%} |")

    # Recommendations
    report.append("\n## Recommendations")
    for test_name, failures, caught, false_pos, effectiveness in scores:
        if effectiveness < 0.2 and failures > 5:
            report.append(f"- ⚠️ Consider improving or removing `{test_name}` (only {effectiveness:.0%} effective)")
        elif effectiveness > 0.8:
            report.append(f"- ✅ `{test_name}` is highly effective ({effectiveness:.0%})")

    return '\n'.join(report)
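
# Worked example (hypothetical numbers): a test with 9 recorded failures, 8 of
# which correlate with merged bug-fix PRs, scores 8/9 (about 89%) and is flagged
# as highly effective; a test with 10 failures and only 1 caught bug scores 10%
# and is flagged for improvement or removal.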


def save_metrics(test_failures, correlations):
    """Save metrics to JSON for historical tracking"""
    metrics_file = Path('.metrics/test-effectiveness.json')
    metrics_file.parent.mkdir(exist_ok=True)

    # Load existing metrics
    if metrics_file.exists():
        with open(metrics_file) as f:
            historical = json.load(f)
    else:
        historical = []

    # Add current metrics
    current = {
        'date': datetime.now().isoformat(),
        'test_failures': {
            test: len(failures) for test, failures in test_failures.items()
        },
        'effectiveness': {
            test: {
                'caught_bugs': data['caught_bugs'],
                'false_positives': data['false_positives'],
                'score': data['caught_bugs'] / (data['caught_bugs'] + data['false_positives'])
                if (data['caught_bugs'] + data['false_positives']) > 0 else 0
            }
            for test, data in correlations.items()
        }
    }
    historical.append(current)

    # Keep last 12 months of data
    cutoff = datetime.now() - timedelta(days=365)
    historical = [
        h for h in historical
        if datetime.fromisoformat(h['date']) > cutoff
    ]

    with open(metrics_file, 'w') as f:
        json.dump(historical, f, indent=2)
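
# Each entry appended to .metrics/test-effectiveness.json looks roughly like
# (hypothetical values):
#   {"date": "2025-08-01T12:00:00", "test_failures": {"docker_tests": 3},
#    "effectiveness": {"docker_tests": {"caught_bugs": 2, "false_positives": 1,
#                                       "score": 0.6666666666666666}}}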


if __name__ == '__main__':
    # Configure these for your repo
    REPO_OWNER = 'trailofbits'
    REPO_NAME = 'algo'

    print("Analyzing test effectiveness...")

    # Analyze last 30 days of CI runs
    test_failures = analyze_workflow_runs(REPO_OWNER, REPO_NAME, days_back=30)

    # Correlate with issues/PRs
    correlations = correlate_with_issues(REPO_OWNER, REPO_NAME, test_failures)

    # Generate report
    report = generate_effectiveness_report(test_failures, correlations)
    print("\n" + report)

    # Save report
    report_file = Path('.metrics/test-effectiveness-report.md')
    report_file.parent.mkdir(exist_ok=True)
    with open(report_file, 'w') as f:
        f.write(report)

    print(f"\nReport saved to: {report_file}")

    # Save metrics for tracking
    save_metrics(test_failures, correlations)
    print("Metrics saved to: .metrics/test-effectiveness.json")