{"task_set_hash":"b31c942bd4e8afcfc48e238184a7d6c6db5286b138ceeeeaec206027007f7ff4","model":{"slug":"anthropic/claude-opus-4-6","display_name":"Claude Opus 4.6","api_model_id":"claude-opus-4-6","family_slug":"claude","added_at":"2025-10-14","settings_suffix":"","max_input_tokens":1000000,"max_output_tokens":128000,"capabilities":["thinking","image","pdf","structured","batch"]},"aggregates":{"avg_score":70.639535,"tasks_attempted":430,"tasks_passed":280,"tasks_attempted_distinct":110,"tasks_passed_attempt_1":78,"tasks_passed_attempt_2_only":18,"pass_at_n":0.872727,"avg_cost_usd":0.112103,"latency_p50_ms":21173,"latency_p95_ms":166577,"pass_rate_ci":{"lower":0.7976481880956428,"upper":0.9226508941379168},"pass_hat_at_n":0.8272727272727273,"cost_per_pass_usd":0.128452,"run_count":3,"verified_runs":0},"settings":{"temperature":null,"thinking_budget":null,"tokens_avg_per_run":269223,"consistency_pct":91.82},"history":[{"run_id":"c611fa66-e0e0-4274-9f5e-a7f1f9704262","ts":"2026-05-16T06:34:06.181Z","score":72.309028,"cost_usd":3.871605,"tier":"claimed","status":"completed","completed_at":"2026-05-16T18:35:16.956Z","tasks_attempted":110,"tasks_passed":96,"duration_ms":7019855},{"run_id":"7a0dcd0a-b8a1-410d-be90-d1c4995c1660","ts":"2026-05-16T01:31:36.854Z","score":68.836806,"cost_usd":4.29478,"tier":"claimed","status":"completed","completed_at":"2026-05-16T18:33:23.817Z","tasks_attempted":110,"tasks_passed":91,"duration_ms":5650796},{"run_id":"f14a129e-6669-4fbd-a962-68896bcada4c","ts":"2026-05-15T21:18:27.268Z","score":70.774648,"cost_usd":4.164965,"tier":"claimed","status":"completed","completed_at":"2026-05-16T14:36:56.927Z","tasks_attempted":110,"tasks_passed":93,"duration_ms":5695145}],"failure_modes":[{"code":"AL0104","count":319,"pct":0.362089,"example_message":"Syntax error, '=' expected"},{"code":"AL0000","count":112,"pct":0.127128,"example_message":"App generation failed"},{"code":"AL0111","count":97,"pct":0.110102,"example_message":"Semicolon expected. Add a semicolon (;) to terminate the statement."},{"code":"AL0224","count":60,"pct":0.068104,"example_message":"Expression expected. Provide a valid expression (variable, constant, calculation, or method call)."},{"code":"AL0107","count":54,"pct":0.061294,"example_message":"Syntax error, identifier expected. Provide a valid name (letters, digits, and underscores only)."},{"code":"AL0198","count":54,"pct":0.061294,"example_message":"Expected one of the application object keywords (table, tableextension, page, pageextension, pagecustomization, profile, profileextension, codeunit, report, reportextension, xmlport, query, controladdin, dotnet, enum, enumextension, interface, permissionset, permissionsetextension, entitlement)"},{"code":"AL0105","count":39,"pct":0.044268,"example_message":"Syntax error, identifier expected; 'key' is a keyword"},{"code":"AL0132","count":16,"pct":0.018161,"example_message":"'FieldRef' does not contain a definition for 'CreateInStream'"},{"code":"AL0185","count":16,"pct":0.018161,"example_message":"Page '0' is missing"},{"code":"AL0110","count":14,"pct":0.015891,"example_message":"Orphaned ELSE statement. This is most likely because of an unnecessary semicolon placed just before the ELSE keyword"}],"recent_runs":[{"run_id":"c611fa66-e0e0-4274-9f5e-a7f1f9704262","ts":"2026-05-16T06:34:06.181Z","score":72.309028,"cost_usd":3.871605,"tier":"claimed","status":"completed","completed_at":"2026-05-16T18:35:16.956Z","tasks_attempted":110,"tasks_passed":96,"duration_ms":7019855},{"run_id":"7a0dcd0a-b8a1-410d-be90-d1c4995c1660","ts":"2026-05-16T01:31:36.854Z","score":68.836806,"cost_usd":4.29478,"tier":"claimed","status":"completed","completed_at":"2026-05-16T18:33:23.817Z","tasks_attempted":110,"tasks_passed":91,"duration_ms":5650796},{"run_id":"f14a129e-6669-4fbd-a962-68896bcada4c","ts":"2026-05-15T21:18:27.268Z","score":70.774648,"cost_usd":4.164965,"tier":"claimed","status":"completed","completed_at":"2026-05-16T14:36:56.927Z","tasks_attempted":110,"tasks_passed":93,"duration_ms":5695145}]}