{"task_set_hash":"b31c942bd4e8afcfc48e238184a7d6c6db5286b138ceeeeaec206027007f7ff4","model":{"slug":"anthropic/claude-opus-4-8","display_name":"Claude Opus 4.8","api_model_id":"claude-opus-4-8","family_slug":"claude","added_at":"2026-05-28","settings_suffix":"","max_input_tokens":1000000,"max_output_tokens":128000,"capabilities":["thinking","image","pdf","structured","batch"]},"aggregates":{"avg_score":67.78291,"tasks_attempted":866,"tasks_passed":542,"tasks_attempted_distinct":110,"tasks_passed_attempt_1":82,"tasks_passed_attempt_2_only":15,"pass_at_n":0.881818,"avg_cost_usd":0.234525,"latency_p50_ms":18219,"latency_p95_ms":150101,"pass_rate_ci":{"lower":0.8082479662756031,"upper":0.9296193862563361},"pass_hat_at_n":0.7454545454545455,"cost_per_pass_usd":0.265956,"run_count":6,"verified_runs":0},"settings":{"temperature":null,"thinking_budget":null,"tokens_avg_per_run":325127,"consistency_pct":77.27},"history":[{"run_id":"84ba3ab3-c78f-49cb-b772-b32b5a048d75","ts":"2026-05-29T21:14:00.965Z","score":67.708333,"cost_usd":3.954625,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:05:00.139Z","tasks_attempted":110,"tasks_passed":90,"duration_ms":4114783},{"run_id":"df0b62ba-cc34-48d6-9662-20217d799363","ts":"2026-05-29T17:17:03.229Z","score":70.017483,"cost_usd":3.849325,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:03:08.028Z","tasks_attempted":110,"tasks_passed":92,"duration_ms":4206094},{"run_id":"7fb765a9-3404-4c1b-a330-70825dcaf708","ts":"2026-05-29T13:23:06.981Z","score":67.91958,"cost_usd":3.917025,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:01:12.184Z","tasks_attempted":110,"tasks_passed":89,"duration_ms":3987349},{"run_id":"17a3f05f-9aa0-4f22-9bc5-619a6ab72bd3","ts":"2026-05-29T04:23:51.270Z","score":66.976351,"cost_usd":4.837435,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:07:17.459Z","tasks_attempted":110,"tasks_passed":91,"duration_ms":4797300},{"run_id":"ef4e640a-646e-4ec1-8044-34f9e25a3fb4","ts":"2026-05-29T01:01:20.142Z","score":65.689655,"cost_usd":4.64219,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:05:22.041Z","tasks_attempted":110,"tasks_passed":89,"duration_ms":5128446},{"run_id":"a921e213-735a-4fb3-83c4-b2989507d391","ts":"2026-05-28T21:35:35.222Z","score":68.444056,"cost_usd":4.597165,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:03:22.404Z","tasks_attempted":110,"tasks_passed":91,"duration_ms":5366408}],"failure_modes":[{"code":"AL0104","count":293,"pct":0.230527,"example_message":"Syntax error, ':' expected"},{"code":"AL0000","count":251,"pct":0.197482,"example_message":"App generation failed"},{"code":"AL0107","count":148,"pct":0.116444,"example_message":"Syntax error, identifier expected. Provide a valid name (letters, digits, and underscores only)."},{"code":"AL0105","count":113,"pct":0.088906,"example_message":"Syntax error, identifier expected; 'key' is a keyword"},{"code":"AL0118","count":75,"pct":0.059009,"example_message":"The name 'CreateSequentialGuid' does not exist in the current context."},{"code":"AL0169","count":62,"pct":0.04878,"example_message":"The option value 'Masked' is not valid. Check the enum definition for valid values."},{"code":"AL0198","count":54,"pct":0.042486,"example_message":"Expected one of the application object keywords (table, tableextension, page, pageextension, pagecustomization, profile, profileextension, codeunit, report, reportextension, xmlport, query, controladdin, dotnet, enum, enumextension, interface, permissionset, permissionsetextension, entitlement)"},{"code":"AL0132","count":53,"pct":0.041699,"example_message":"'RecordRef' does not contain a definition for 'CalcFields'"},{"code":"AL0126","count":21,"pct":0.016522,"example_message":"No overload for method 'CalculateAverageOrderValue' takes 1 arguments. Candidates: 'CalculateAverageOrderValue()' defined in Codeunit 'CG-AL-M007 Mock Calculator' by the extension CG-AL-M007 Prereq by CentralGauge (1.0.0.0)"},{"code":"AL0185","count":18,"pct":0.014162,"example_message":"Codeunit 'Session Information' is missing"}],"recent_runs":[{"run_id":"84ba3ab3-c78f-49cb-b772-b32b5a048d75","ts":"2026-05-29T21:14:00.965Z","score":67.708333,"cost_usd":3.954625,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:05:00.139Z","tasks_attempted":110,"tasks_passed":90,"duration_ms":4114783},{"run_id":"df0b62ba-cc34-48d6-9662-20217d799363","ts":"2026-05-29T17:17:03.229Z","score":70.017483,"cost_usd":3.849325,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:03:08.028Z","tasks_attempted":110,"tasks_passed":92,"duration_ms":4206094},{"run_id":"7fb765a9-3404-4c1b-a330-70825dcaf708","ts":"2026-05-29T13:23:06.981Z","score":67.91958,"cost_usd":3.917025,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:01:12.184Z","tasks_attempted":110,"tasks_passed":89,"duration_ms":3987349},{"run_id":"17a3f05f-9aa0-4f22-9bc5-619a6ab72bd3","ts":"2026-05-29T04:23:51.270Z","score":66.976351,"cost_usd":4.837435,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:07:17.459Z","tasks_attempted":110,"tasks_passed":91,"duration_ms":4797300},{"run_id":"ef4e640a-646e-4ec1-8044-34f9e25a3fb4","ts":"2026-05-29T01:01:20.142Z","score":65.689655,"cost_usd":4.64219,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:05:22.041Z","tasks_attempted":110,"tasks_passed":89,"duration_ms":5128446},{"run_id":"a921e213-735a-4fb3-83c4-b2989507d391","ts":"2026-05-28T21:35:35.222Z","score":68.444056,"cost_usd":4.597165,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:03:22.404Z","tasks_attempted":110,"tasks_passed":91,"duration_ms":5366408}]}