skills/benchmark.sh at main · mixedbread-ai/skills

executable file
417 lines (334 loc) · 15.4 KB
#!/usr/bin/env bash
set -euo pipefail
# ─── Configuration ────────────────────────────────────────────────────────────
# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(
    cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
# The plugin under test lives next to the script; every run gets its own
# timestamped results directory below the script directory.
PLUGIN_DIR="${SCRIPT_DIR}"
RESULTS_DIR="${SCRIPT_DIR}/benchmark-results/$(date '+%Y%m%d-%H%M%S')"

# Run defaults.
MODEL="sonnet"
TIMEOUT_SECONDS=300
CUSTOM_PROMPT=""

# ANSI escape sequences, stored as literal backslash strings (single-quoted).
NC='\033[0m'
BOLD='\033[1m'
DIM='\033[2m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
# ─── Task Prompt ──────────────────────────────────────────────────────────────
# Default task text. The quoted heredoc delimiter keeps the text literal
# (no expansion). `read -d ''` returns non-zero at end of input, hence the
# `|| true` under `set -e`.
IFS= read -r -d '' TASK_PROMPT <<'TASK_PROMPT_EOF' || true
Write a complete Python script called "search_pipeline.py" that implements a production-ready document search pipeline using the Mixedbread API. The script must:
1. Create a store named "benchmark-docs" with these settings:
   - save_content enabled
   - contextualization enabled for the metadata fields "title" and "category"
2. Upload files to the store from a local directory passed as a CLI argument:
   - Support PDF, Markdown, and text files
   - Use parallel uploads (concurrency of 10)
   - Attach metadata: "title" derived from filename, "category" derived from parent directory name
3. Implement a search function that:
   - Accepts a natural language query
   - Uses reranking with model "mixedbread-ai/mxbai-rerank-large-v2"
   - Applies a configurable score threshold (default 0.3)
   - Returns metadata with results
   - Supports filtering by category using the metadata filter API
4. Implement a question-answering function that:
   - Uses the stores QA endpoint with citations enabled
   - Falls back to agentic search with max_rounds=3 if the initial answer has no sources
   - Prints the answer with inline source references
5. Include a CLI interface using argparse with subcommands: "ingest", "search", and "ask".
6. Include proper error handling, logging, and type hints throughout.
Use the mixedbread Python SDK (from mixedbread import Mixedbread). Assume MXBAI_API_KEY is set as an environment variable. Write the complete script to "search_pipeline.py" in the current directory.
TASK_PROMPT_EOF
# A heredoc always ends with a newline; drop it so the value has no trailing newline.
TASK_PROMPT="${TASK_PROMPT%$'\n'}"
# ─── Judge Prompt ─────────────────────────────────────────────────────────────
# Template for the judge prompt. PLACEHOLDER_A and PLACEHOLDER_B are replaced
# with the two generated scripts before the prompt is sent. The JSON skeleton
# below must keep its braces so the requested output is a valid JSON object,
# and the string must be closed on the PLACEHOLDER_B line.
JUDGE_PROMPT_TEMPLATE='You are an expert code reviewer evaluating two Python scripts that implement a document search pipeline using the Mixedbread API and Python SDK.
Both scripts were generated by an AI assistant given the same task prompt. Your job is to evaluate them blindly.
## Evaluation Criteria
Score each script from 1-10 on each criterion:
1. **API Correctness** (weight: 3x): Does the script use the correct Mixedbread SDK method signatures, parameter names, and patterns? Check: client initialization, store creation params, file upload syntax, search params (filters, reranking), QA params (citations, agentic search), reranking config.
2. **Completeness** (weight: 2x): Does the script implement all 5 requested features (store creation with contextualization, file upload with metadata, search with reranking and filters, QA with citations and agentic fallback, CLI with argparse)?
3. **Code Quality** (weight: 1x): Type hints, error handling, logging, code organization, and Python best practices.
4. **Production Readiness** (weight: 1x): Proper error handling, graceful degradation, clear output formatting, configuration flexibility.
## Output Format
Respond with EXACTLY this JSON structure and nothing else:
{
  "response_a": {
    "api_correctness": <1-10>,
    "completeness": <1-10>,
    "code_quality": <1-10>,
    "production_readiness": <1-10>,
    "weighted_score": <calculated: api*3 + completeness*2 + quality*1 + production*1>,
    "api_issues": ["<list specific API mistakes>"],
    "strengths": ["<list notable strengths>"]
  },
  "response_b": {
    "api_correctness": <1-10>,
    "completeness": <1-10>,
    "code_quality": <1-10>,
    "production_readiness": <1-10>,
    "weighted_score": <calculated: api*3 + completeness*2 + quality*1 + production*1>,
    "api_issues": ["<list specific API mistakes>"],
    "strengths": ["<list notable strengths>"]
  },
  "winner": "<A or B or tie>",
  "reasoning": "<2-3 sentence explanation>"
}
## Response A
PLACEHOLDER_A
## Response B
PLACEHOLDER_B'
# ─── Argument Parsing ─────────────────────────────────────────────────────────
# Print the command-line help to stdout and terminate the script.
# Arguments: $1 - optional exit status (defaults to 0)
usage() {
    cat <<EOF
Usage: $(basename "$0") [OPTIONS]

Compare Claude Code output with and without Mixedbread skills.

Options:
  --model <model>       Model to use for all runs (default: sonnet)
  --timeout <seconds>   Timeout per agent run in seconds (default: 300)
  --prompt <text>       Override the task prompt
  --budget <usd>        Max budget per agent run in USD (default: 1)
  --help                Show this help message

Examples:
  $(basename "$0") --model sonnet --timeout 300
EOF
    exit "${1:-0}"
}
MAX_BUDGET="1"

# Parse command-line options into MODEL, TIMEOUT_SECONDS, CUSTOM_PROMPT and
# MAX_BUDGET.
# Arguments: the script's command-line arguments
# Exits with status 2 when an option that needs a value is given without one.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --model | --timeout | --prompt | --budget)
                # Without this check, "$2" would trip `set -u` with an
                # unhelpful "unbound variable" message.
                if [[ $# -lt 2 ]]; then
                    echo "Error: option $1 requires a value" >&2
                    exit 2
                fi
                case "$1" in
                    --model)   MODEL="$2" ;;
                    --timeout) TIMEOUT_SECONDS="$2" ;;
                    --prompt)  CUSTOM_PROMPT="$2" ;;
                    --budget)  MAX_BUDGET="$2" ;;
                esac
                shift 2
                ;;
            --help)
                usage
                ;;
            *)
                # Report the bad option on stderr, then show the help text
                # (passing 2 as the requested exit status).
                echo "Unknown option: $1" >&2
                usage 2
                ;;
        esac
    done
}

parse_args "$@"
# ─── Prerequisites ────────────────────────────────────────────────────────────
# Verify that everything the benchmark needs is available.
# Globals:  PLUGIN_DIR, RED, NC (read)
# Outputs:  one error line per missing prerequisite, on stderr
# Exits with status 1 if anything is missing; all problems are reported first
# so the user can fix them in one go.
check_prerequisites() {
    local missing=0
    if ! command -v claude &>/dev/null; then
        echo -e "${RED}Error: 'claude' CLI not found. Install from https://claude.ai/code${NC}" >&2
        missing=1
    fi
    if ! command -v git &>/dev/null; then
        echo -e "${RED}Error: 'git' not found${NC}" >&2
        missing=1
    fi
    if [[ ! -d "${PLUGIN_DIR}/.claude-plugin" ]]; then
        echo -e "${RED}Error: Plugin directory not found at ${PLUGIN_DIR}/.claude-plugin${NC}" >&2
        missing=1
    fi
    if [[ $missing -eq 1 ]]; then
        exit 1
    fi
}
# ─── Portable Timeout ─────────────────────────────────────────────────────────
# Run a command, terminating it if it runs longer than the given duration.
# Arguments: $1 - timeout (passed to gtimeout/timeout, or to sleep in the fallback)
#            $2.. - command and its arguments
# Returns:   the command's exit status; 124 when the time limit was hit
#            (the status GNU timeout reports, and the value callers test for).
run_with_timeout() {
    local timeout_sec="$1"
    # Drop the timeout so "$@" holds only the command to run.
    shift
    local exit_code=0
    if command -v gtimeout &>/dev/null; then
        gtimeout "${timeout_sec}" "$@" || exit_code=$?
        return "$exit_code"
    elif command -v timeout &>/dev/null; then
        timeout "${timeout_sec}" "$@" || exit_code=$?
        return "$exit_code"
    else
        # macOS fallback: background the command + watchdog
        "$@" &
        local cmd_pid=$!
        (
            sleep "$timeout_sec"
            kill "$cmd_pid" 2>/dev/null
        ) &
        local watchdog_pid=$!
        # Capture the status with `||` so `set -e` cannot abort before the
        # watchdog has been cleaned up.
        wait "$cmd_pid" 2>/dev/null || exit_code=$?
        kill "$watchdog_pid" 2>/dev/null || true
        local watchdog_status=0
        wait "$watchdog_pid" 2>/dev/null || watchdog_status=$?
        # The watchdog only exits 0 when its sleep ran out and its kill of the
        # command succeeded, i.e. the command was stopped by the time limit.
        if [[ $exit_code -ne 0 && $watchdog_status -eq 0 ]]; then
            exit_code=124
        fi
        return "$exit_code"
    fi
}
# ─── Cleanup ──────────────────────────────────────────────────────────────────
# State shared with the EXIT handler; empty means "nothing to clean up".
WORKDIR_WITH=""
WORKDIR_WITHOUT=""
PID_WITH=""
PID_WITHOUT=""

# EXIT handler: signal any recorded agent PIDs and remove the temporary
# working directories.
# Globals: PID_WITH, PID_WITHOUT, WORKDIR_WITH, WORKDIR_WITHOUT, DIM, NC (read)
cleanup() {
    echo -e "\n${DIM}Cleaning up temp directories...${NC}"
    local pid dir
    for pid in "$PID_WITH" "$PID_WITHOUT"; do
        if [[ -n "$pid" ]]; then
            # Best effort: the process may already have exited.
            kill "$pid" 2>/dev/null || true
        fi
    done
    for dir in "$WORKDIR_WITH" "$WORKDIR_WITHOUT"; do
        if [[ -n "$dir" && -d "$dir" ]]; then
            rm -rf -- "$dir"
        fi
    done
}
trap cleanup EXIT
# ─── Setup Working Directories ────────────────────────────────────────────────
setup_workdirs() {
    WORKDIR_WITH="$(mktemp -d)"
    WORKDIR_WITHOUT="$(mktemp -d)"
    for dir in "$WORKDIR_WITH" "$WORKDIR_WITHOUT"; do
        git -C "$dir" init --quiet
        git -C "$dir" commit --allow-empty -m "init" --quiet
    echo -e "  ${DIM}With skills:    ${WORKDIR_WITH}${NC}"
    echo -e "  ${DIM}Without skills: ${WORKDIR_WITHOUT}${NC}"
# ─── Run Both Agents ──────────────────────────────────────────────────────────
# Launch the two `claude` runs in parallel, each in its own working directory,
# wait for both, and report how each one ended.
# Globals: CUSTOM_PROMPT, TASK_PROMPT, MODEL, TIMEOUT_SECONDS, MAX_BUDGET,
#          PLUGIN_DIR, RESULTS_DIR, WORKDIR_WITH, WORKDIR_WITHOUT (read);
#          PID_WITH, PID_WITHOUT (written; cleared once both runs finished)
# Outputs: progress lines on stdout; each run's stdout/stderr is written to
#          files under RESULTS_DIR
run_agents() {
    local prompt="${CUSTOM_PROMPT:-$TASK_PROMPT}"
    # Flags that are identical for both runs.
    local -a common_args=(
        -p "$prompt"
        --model "$MODEL"
        --output-format text
        --dangerously-skip-permissions
        --no-session-persistence
        --max-budget-usd "$MAX_BUDGET"
    )
    echo -e "\n${BOLD}Running agents in parallel...${NC}"
    echo -e "  Model:   ${MODEL}"
    echo -e "  Timeout: ${TIMEOUT_SECONDS}s"
    echo -e "  Budget:  \$${MAX_BUDGET} per run"
    # ── With Skills ──
    echo -e "\n${BLUE}[WITH SKILLS]${NC} Starting..."
    # The subshell keeps the `cd` local to this run.
    (
        cd "$WORKDIR_WITH" || exit 1
        run_with_timeout "$TIMEOUT_SECONDS" \
            claude "${common_args[@]}" \
            --plugin-dir "$PLUGIN_DIR" \
            > "${RESULTS_DIR}/with_skills_output.txt" 2>"${RESULTS_DIR}/with_skills_stderr.log"
    ) &
    PID_WITH=$!
    # ── Without Skills ──
    echo -e "${BLUE}[WITHOUT SKILLS]${NC} Starting..."
    (
        cd "$WORKDIR_WITHOUT" || exit 1
        run_with_timeout "$TIMEOUT_SECONDS" \
            claude "${common_args[@]}" \
            --disable-slash-commands \
            > "${RESULTS_DIR}/without_skills_output.txt" 2>"${RESULTS_DIR}/without_skills_stderr.log"
    ) &
    PID_WITHOUT=$!
    # ── Wait for both ──
    # `|| var=$?` records a failing status without tripping `set -e`.
    local with_exit=0 without_exit=0
    wait "$PID_WITH"    || with_exit=$?
    wait "$PID_WITHOUT" || without_exit=$?
    PID_WITH=""
    PID_WITHOUT=""
    if [[ $with_exit -eq 0 ]]; then
        echo -e "${GREEN}[WITH SKILLS]    Completed${NC}"
    else
        echo -e "${RED}[WITH SKILLS]    Exited with code ${with_exit}$( [[ $with_exit -eq 124 ]] && echo " (timeout)" )${NC}"
    fi
    if [[ $without_exit -eq 0 ]]; then
        echo -e "${GREEN}[WITHOUT SKILLS] Completed${NC}"
    else
        echo -e "${RED}[WITHOUT SKILLS] Exited with code ${without_exit}$( [[ $without_exit -eq 124 ]] && echo " (timeout)" )${NC}"
    fi
}
# ─── Extract Scripts ──────────────────────────────────────────────────────────
# Copy the script produced by one agent run into the results directory, or
# write a two-line placeholder when the agent did not create the file.
# Globals:   RESULTS_DIR, GREEN, YELLOW, NC (read)
# Arguments: $1 - label used in the output file name (e.g. "with_skills")
#            $2 - working directory of the agent run
# Outputs:   one status line on stdout
extract_script() {
    local label="$1" workdir="$2"
    local source_file="${workdir}/search_pipeline.py"
    local target_file="${RESULTS_DIR}/${label}_script.py"
    if [[ -f "$source_file" ]]; then
        cp -- "$source_file" "$target_file"
        local lines
        lines=$(wc -l < "$target_file" | tr -d ' ')
        echo -e "  ${GREEN}[${label}]${NC} search_pipeline.py (${lines} lines)"
    else
        echo -e "  ${YELLOW}[${label}]${NC} search_pipeline.py not found in workdir"
        {
            echo "# Script was not written to a file by the agent."
            echo "# Check ${label}_output.txt for the full agent response."
        } > "$target_file"
    fi
}
# Collect the generated script from both agent working directories.
# Globals: WORKDIR_WITH, WORKDIR_WITHOUT, BOLD, NC (read)
extract_scripts() {
    echo -e "\n${BOLD}Extracting generated scripts...${NC}"
    extract_script "with_skills"    "$WORKDIR_WITH"
    extract_script "without_skills" "$WORKDIR_WITHOUT"
}
# ─── Blind Judge ──────────────────────────────────────────────────────────────
# Ask the model to compare the two generated scripts without knowing which is
# which. The A/B assignment is randomised and recorded in judge_mapping.txt.
# Globals: RESULTS_DIR, JUDGE_PROMPT_TEMPLATE, MODEL, TIMEOUT_SECONDS,
#          MAX_BUDGET, BOLD, GREEN, RED, NC (read)
# Outputs: judge_mapping.txt, judge_verdict.txt and judge_stderr.log under
#          RESULTS_DIR; progress lines on stdout
# Returns: 0 on success, otherwise the exit status of the judge run
run_judge() {
    echo -e "\n${BOLD}Running blind judge...${NC}"
    local script_with script_without
    script_with="$(cat "${RESULTS_DIR}/with_skills_script.py" 2>/dev/null || echo '# (no script generated)')"
    script_without="$(cat "${RESULTS_DIR}/without_skills_script.py" 2>/dev/null || echo '# (no script generated)')"
    # Randomize A/B assignment
    local coin=$(( RANDOM % 2 ))
    local response_a response_b label_a label_b
    if [[ $coin -eq 0 ]]; then
        label_a="with_skills"
        label_b="without_skills"
        response_a="$script_with"
        response_b="$script_without"
    else
        label_a="without_skills"
        label_b="with_skills"
        response_a="$script_without"
        response_b="$script_with"
    fi
    echo "A=${label_a}" >  "${RESULTS_DIR}/judge_mapping.txt"
    echo "B=${label_b}" >> "${RESULTS_DIR}/judge_mapping.txt"
    # Build the judge prompt with scripts injected.
    # The template is cut at the two placeholders and re-joined around the
    # scripts. `${var/PAT/$repl}` is avoided because newer Bash versions
    # expand `&` inside the replacement to the matched text, which would
    # corrupt any script containing `&`. Cutting the template first also
    # keeps placeholder-like text inside a script from being substituted.
    local before_a after_a between after_b judge_prompt
    before_a="${JUDGE_PROMPT_TEMPLATE%%PLACEHOLDER_A*}"
    after_a="${JUDGE_PROMPT_TEMPLATE#*PLACEHOLDER_A}"
    between="${after_a%%PLACEHOLDER_B*}"
    after_b="${after_a#*PLACEHOLDER_B}"
    judge_prompt="${before_a}${response_a}${between}${response_b}${after_b}"
    local judge_exit=0
    run_with_timeout "$TIMEOUT_SECONDS" \
        claude -p "$judge_prompt" \
        --model "$MODEL" \
        --output-format text \
        --dangerously-skip-permissions \
        --no-session-persistence \
        --max-budget-usd "$MAX_BUDGET" \
        --tools "" \
        > "${RESULTS_DIR}/judge_verdict.txt" 2>"${RESULTS_DIR}/judge_stderr.log" \
        || judge_exit=$?
    if [[ $judge_exit -ne 0 ]]; then
        # Say why the run stops instead of dying silently under `set -e`.
        echo -e "${RED}Judge run failed with code ${judge_exit} (see ${RESULTS_DIR}/judge_stderr.log)${NC}" >&2
        return "$judge_exit"
    fi
    echo -e "${GREEN}Judge evaluation complete${NC}"
}
# ─── Report ───────────────────────────────────────────────────────────────────
# Print the final summary: run settings, the A/B mapping, the line count of
# each collected script, and the judge's verdict.
# Globals: RESULTS_DIR, MODEL, TIMEOUT_SECONDS, BOLD, NC (read)
# Outputs: the report on stdout
print_report() {
    echo -e "\n${BOLD}════════════════════════════════════════${NC}"
    echo -e "${BOLD}         BENCHMARK RESULTS              ${NC}"
    echo -e "${BOLD}════════════════════════════════════════${NC}"
    echo ""
    echo -e "  Model:     ${MODEL}"
    echo -e "  Timeout:   ${TIMEOUT_SECONDS}s"
    echo -e "  Results:   ${RESULTS_DIR}"
    echo ""
    # Un-blind
    echo -e "${BOLD}Mapping${NC}"
    cat "${RESULTS_DIR}/judge_mapping.txt"
    echo ""
    # Script sizes
    echo -e "${BOLD}Script sizes${NC}"
    local label file lines
    for label in with without; do
        file="${RESULTS_DIR}/${label}_skills_script.py"
        # A script file that was never collected is simply left out.
        if [[ -f "$file" ]]; then
            lines=$(wc -l < "$file" | tr -d ' ')
            printf "  %-18s %s lines\n" "[${label} skills]" "$lines"
        fi
    done
    echo ""
    # Judge verdict
    echo -e "${BOLD}Judge verdict${NC}"
    echo ""
    cat "${RESULTS_DIR}/judge_verdict.txt"
    echo ""
    echo -e "${BOLD}════════════════════════════════════════${NC}"
    echo -e "Full results: ${RESULTS_DIR}"
}
# ─── Main ─────────────────────────────────────────────────────────────────────
# Entry point: check prerequisites, create the results directory, run both
# agents, collect their scripts, run the judge and print the report.
# Globals: RESULTS_DIR, CUSTOM_PROMPT, TASK_PROMPT, BOLD, DIM, NC (read)
main() {
    echo -e "${BOLD}Mixedbread Skills Benchmark${NC}"
    echo -e "${DIM}Comparing Claude Code with and without skills${NC}"
    echo ""
    check_prerequisites
    mkdir -p "$RESULTS_DIR"
    # Save the prompt for reproducibility
    echo "${CUSTOM_PROMPT:-$TASK_PROMPT}" > "${RESULTS_DIR}/prompt.txt"
    setup_workdirs
    run_agents
    extract_scripts
    run_judge
    print_report
}

main "$@"
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

benchmark.sh

Latest commit

History

benchmark.sh

File metadata and controls