{"task_set_hash":"b31c942bd4e8afcfc48e238184a7d6c6db5286b138ceeeeaec206027007f7ff4","model":{"slug":"anthropic/claude-sonnet-4-6","display_name":"Claude Sonnet 4 6","api_model_id":"claude-sonnet-4-6","family_slug":"claude","added_at":"2026-05-04T16:43:38.341Z","settings_suffix":"","max_input_tokens":1000000,"max_output_tokens":128000,"capabilities":["thinking","image","pdf","structured","batch"]},"aggregates":{"avg_score":64.069206,"tasks_attempted":932,"tasks_passed":534,"tasks_attempted_distinct":110,"tasks_passed_attempt_1":69,"tasks_passed_attempt_2_only":27,"pass_at_n":0.872727,"avg_cost_usd":0.170093,"latency_p50_ms":20119,"latency_p95_ms":165461,"pass_rate_ci":{"lower":0.7976481880956428,"upper":0.9226508941379168},"pass_hat_at_n":0.7272727272727273,"cost_per_pass_usd":0.194899,"run_count":6,"verified_runs":0},"settings":{"temperature":null,"thinking_budget":null,"tokens_avg_per_run":345540,"consistency_pct":80},"history":[{"run_id":"6a052f35-56aa-48f8-bcde-a5227e391df2","ts":"2026-05-29T04:23:51.270Z","score":65.178571,"cost_usd":2.78289,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:07:03.894Z","tasks_attempted":110,"tasks_passed":91,"duration_ms":5524006},{"run_id":"3ec0deb9-c3de-44bb-8f49-9b4bed44943c","ts":"2026-05-29T01:01:20.142Z","score":62.824675,"cost_usd":3.216384,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:05:07.959Z","tasks_attempted":110,"tasks_passed":88,"duration_ms":5971357},{"run_id":"a9cd1465-68f6-448b-a841-01755d3b9802","ts":"2026-05-28T21:35:35.222Z","score":65.127389,"cost_usd":3.110493,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:03:07.674Z","tasks_attempted":110,"tasks_passed":91,"duration_ms":5904854},{"run_id":"faf6dac6-80c2-46e3-a3aa-bd6c6b4d59da","ts":"2026-05-16T06:34:06.181Z","score":62.738854,"cost_usd":3.158745,"tier":"claimed","status":"completed","completed_at":"2026-05-16T18:35:05.822Z","tasks_attempted":110,"tasks_passed":86,"duration_ms":7526818},{"run_id":"203a1b90-d0ce-4245-b316-a52365774a11","ts":"2026-05-16T01:31:36.853Z","score":64.262821,"cost_usd":3.065628,"tier":"claimed","status":"completed","completed_at":"2026-05-16T18:33:13.507Z","tasks_attempted":110,"tasks_passed":89,"duration_ms":6337137},{"run_id":"9e7838df-bf03-48e4-b68e-4124d9649e65","ts":"2026-05-15T21:18:27.268Z","score":64.285714,"cost_usd":3.376128,"tier":"claimed","status":"completed","completed_at":"2026-05-16T14:36:15.420Z","tasks_attempted":110,"tasks_passed":89,"duration_ms":5898423}],"failure_modes":[{"code":"AL0104","count":852,"pct":0.367558,"example_message":"Syntax error, ')' expected"},{"code":"AL0000","count":297,"pct":0.128128,"example_message":"App generation failed"},{"code":"AL0111","count":182,"pct":0.078516,"example_message":"Semicolon expected. Add a semicolon (;) to terminate the statement."},{"code":"AL0198","count":131,"pct":0.056514,"example_message":"Expected one of the application object keywords (table, tableextension, page, pageextension, pagecustomization, profile, profileextension, codeunit, report, reportextension, xmlport, query, controladdin, dotnet, enum, enumextension, interface, permissionset, permissionsetextension, entitlement)"},{"code":"AL0107","count":130,"pct":0.056083,"example_message":"Syntax error, identifier expected. Provide a valid name (letters, digits, and underscores only)."},{"code":"AL0105","count":86,"pct":0.037101,"example_message":"Syntax error, identifier expected; 'key' is a keyword"},{"code":"AL0224","count":79,"pct":0.034081,"example_message":"Expression expected. Provide a valid expression (variable, constant, calculation, or method call)."},{"code":"AL0620","count":63,"pct":0.027179,"example_message":"Preprocessor directives must appear as the first non-whitespace character on a line."},{"code":"AL0127","count":52,"pct":0.022433,"example_message":"Member '(property) Visible: Expression' cannot be used like a method. Remove the parentheses or call a valid method."},{"code":"AL0132","count":50,"pct":0.02157,"example_message":"'System' does not contain a definition for 'CreateSequentialGuid'"}],"recent_runs":[{"run_id":"6a052f35-56aa-48f8-bcde-a5227e391df2","ts":"2026-05-29T04:23:51.270Z","score":65.178571,"cost_usd":2.78289,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:07:03.894Z","tasks_attempted":110,"tasks_passed":91,"duration_ms":5524006},{"run_id":"3ec0deb9-c3de-44bb-8f49-9b4bed44943c","ts":"2026-05-29T01:01:20.142Z","score":62.824675,"cost_usd":3.216384,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:05:07.959Z","tasks_attempted":110,"tasks_passed":88,"duration_ms":5971357},{"run_id":"a9cd1465-68f6-448b-a841-01755d3b9802","ts":"2026-05-28T21:35:35.222Z","score":65.127389,"cost_usd":3.110493,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:03:07.674Z","tasks_attempted":110,"tasks_passed":91,"duration_ms":5904854},{"run_id":"faf6dac6-80c2-46e3-a3aa-bd6c6b4d59da","ts":"2026-05-16T06:34:06.181Z","score":62.738854,"cost_usd":3.158745,"tier":"claimed","status":"completed","completed_at":"2026-05-16T18:35:05.822Z","tasks_attempted":110,"tasks_passed":86,"duration_ms":7526818},{"run_id":"203a1b90-d0ce-4245-b316-a52365774a11","ts":"2026-05-16T01:31:36.853Z","score":64.262821,"cost_usd":3.065628,"tier":"claimed","status":"completed","completed_at":"2026-05-16T18:33:13.507Z","tasks_attempted":110,"tasks_passed":89,"duration_ms":6337137},{"run_id":"9e7838df-bf03-48e4-b68e-4124d9649e65","ts":"2026-05-15T21:18:27.268Z","score":64.285714,"cost_usd":3.376128,"tier":"claimed","status":"completed","completed_at":"2026-05-16T14:36:15.420Z","tasks_attempted":110,"tasks_passed":89,"duration_ms":5898423}]}