{"task_set_hash":"b31c942bd4e8afcfc48e238184a7d6c6db5286b138ceeeeaec206027007f7ff4","model":{"slug":"anthropic/claude-opus-4-7","display_name":"Claude Opus 4.7","api_model_id":"claude-opus-4-7","family_slug":"claude","added_at":"2026-01-15","settings_suffix":"","max_input_tokens":1000000,"max_output_tokens":128000,"capabilities":["thinking","image","pdf","structured","batch"]},"aggregates":{"avg_score":70.049361,"tasks_attempted":861,"tasks_passed":545,"tasks_attempted_distinct":110,"tasks_passed_attempt_1":80,"tasks_passed_attempt_2_only":17,"pass_at_n":0.881818,"avg_cost_usd":0.2165,"latency_p50_ms":19873,"latency_p95_ms":157509,"pass_rate_ci":{"lower":0.8082479662756031,"upper":0.9296193862563361},"pass_hat_at_n":0.7636363636363637,"cost_per_pass_usd":0.245515,"run_count":6,"verified_runs":0},"settings":{"temperature":null,"thinking_budget":null,"tokens_avg_per_run":307980,"consistency_pct":81.82},"history":[{"run_id":"68e1635c-6be8-43aa-93e9-52e908bdf3b9","ts":"2026-05-29T21:14:00.965Z","score":69.806338,"cost_usd":3.652305,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:05:24.651Z","tasks_attempted":110,"tasks_passed":91,"duration_ms":4188235},{"run_id":"eae41fb1-f1a9-488d-85f9-d31f8bd7c6ea","ts":"2026-05-29T17:17:03.231Z","score":71.066434,"cost_usd":3.82835,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:03:33.176Z","tasks_attempted":110,"tasks_passed":91,"duration_ms":4277689},{"run_id":"f10ed110-b8a6-43a4-9913-eb0082cc7e31","ts":"2026-05-29T13:23:06.980Z","score":70.547945,"cost_usd":3.91462,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:01:39.834Z","tasks_attempted":110,"tasks_passed":93,"duration_ms":4753107},{"run_id":"e5dcc595-a287-43b8-8191-f144e24b2a7b","ts":"2026-05-16T06:34:06.181Z","score":68.921233,"cost_usd":4.224575,"tier":"claimed","status":"completed","completed_at":"2026-05-16T18:35:29.783Z","tasks_attempted":110,"tasks_passed":90,"duration_ms":7607215},{"run_id":"7d67b00d-60e0-480c-b96c-2ae513ca1a01","ts":"2026-05-16T01:31:36.853Z","score":68.618881,"cost_usd":4.16766,"tier":"claimed","status":"completed","completed_at":"2026-05-16T18:33:36.660Z","tasks_attempted":110,"tasks_passed":90,"duration_ms":7923932},{"run_id":"47caf9d8-1397-4ada-8d2e-7001f7583e0c","ts":"2026-05-15T21:18:27.268Z","score":71.365248,"cost_usd":4.027465,"tier":"claimed","status":"completed","completed_at":"2026-05-16T18:32:41.372Z","tasks_attempted":110,"tasks_passed":90,"duration_ms":5568456}],"failure_modes":[{"code":"AL0104","count":258,"pct":0.224739,"example_message":"Syntax error, '=' expected"},{"code":"AL0000","count":222,"pct":0.19338,"example_message":"App generation failed"},{"code":"AL0107","count":99,"pct":0.086237,"example_message":"Syntax error, identifier expected. Provide a valid name (letters, digits, and underscores only)."},{"code":"AL0111","count":85,"pct":0.074042,"example_message":"Semicolon expected. Add a semicolon (;) to terminate the statement."},{"code":"AL0105","count":78,"pct":0.067944,"example_message":"Syntax error, identifier expected; 'key' is a keyword"},{"code":"AL0132","count":62,"pct":0.054007,"example_message":"'FieldRef' does not contain a definition for 'CreateInStream'"},{"code":"AL0169","count":48,"pct":0.041812,"example_message":"The option value 'Masked' is not valid. Check the enum definition for valid values."},{"code":"AL0126","count":29,"pct":0.025261,"example_message":"No overload for method 'Clear' takes 1 arguments. Candidates: 'Clear()' defined in Codeunit 'CG H054 Cache' by the extension CentralGauge_CG-AL-H054_2 by CentralGauge (1.0.0.0)"},{"code":"AL0198","count":24,"pct":0.020906,"example_message":"Expected one of the application object keywords (table, tableextension, page, pageextension, pagecustomization, profile, profileextension, codeunit, report, reportextension, xmlport, query, controladdin, dotnet, enum, enumextension, interface, permissionset, permissionsetextension, entitlement)"},{"code":"AL0224","count":22,"pct":0.019164,"example_message":"Expression expected. Provide a valid expression (variable, constant, calculation, or method call)."}],"recent_runs":[{"run_id":"68e1635c-6be8-43aa-93e9-52e908bdf3b9","ts":"2026-05-29T21:14:00.965Z","score":69.806338,"cost_usd":3.652305,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:05:24.651Z","tasks_attempted":110,"tasks_passed":91,"duration_ms":4188235},{"run_id":"eae41fb1-f1a9-488d-85f9-d31f8bd7c6ea","ts":"2026-05-29T17:17:03.231Z","score":71.066434,"cost_usd":3.82835,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:03:33.176Z","tasks_attempted":110,"tasks_passed":91,"duration_ms":4277689},{"run_id":"f10ed110-b8a6-43a4-9913-eb0082cc7e31","ts":"2026-05-29T13:23:06.980Z","score":70.547945,"cost_usd":3.91462,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:01:39.834Z","tasks_attempted":110,"tasks_passed":93,"duration_ms":4753107},{"run_id":"e5dcc595-a287-43b8-8191-f144e24b2a7b","ts":"2026-05-16T06:34:06.181Z","score":68.921233,"cost_usd":4.224575,"tier":"claimed","status":"completed","completed_at":"2026-05-16T18:35:29.783Z","tasks_attempted":110,"tasks_passed":90,"duration_ms":7607215},{"run_id":"7d67b00d-60e0-480c-b96c-2ae513ca1a01","ts":"2026-05-16T01:31:36.853Z","score":68.618881,"cost_usd":4.16766,"tier":"claimed","status":"completed","completed_at":"2026-05-16T18:33:36.660Z","tasks_attempted":110,"tasks_passed":90,"duration_ms":7923932},{"run_id":"47caf9d8-1397-4ada-8d2e-7001f7583e0c","ts":"2026-05-15T21:18:27.268Z","score":71.365248,"cost_usd":4.027465,"tier":"claimed","status":"completed","completed_at":"2026-05-16T18:32:41.372Z","tasks_attempted":110,"tasks_passed":90,"duration_ms":5568456}]}