{"task_set_hash":"b31c942bd4e8afcfc48e238184a7d6c6db5286b138ceeeeaec206027007f7ff4","model":{"slug":"gemini/gemini-3.1-pro-preview","display_name":"Gemini 3.1 Pro Preview","api_model_id":"gemini-3.1-pro-preview","family_slug":"gemini","added_at":"2026-05-29T13:23:06.980Z","settings_suffix":" (66K)","max_input_tokens":1048576,"max_output_tokens":65536,"capabilities":null},"aggregates":{"avg_score":73.93617,"tasks_attempted":423,"tasks_passed":289,"tasks_attempted_distinct":110,"tasks_passed_attempt_1":83,"tasks_passed_attempt_2_only":18,"pass_at_n":0.918182,"avg_cost_usd":0.029417,"latency_p50_ms":58961,"latency_p95_ms":570899,"pass_rate_ci":{"lower":0.8517805850797744,"upper":0.9563598486456828},"pass_hat_at_n":0.8363636363636363,"cost_per_pass_usd":0.032039,"run_count":3,"verified_runs":0},"settings":{"temperature":null,"thinking_budget":null,"tokens_avg_per_run":182458,"consistency_pct":86.36},"history":[{"run_id":"cd36fbe5-aafa-444b-9293-23420b1517cb","ts":"2026-05-29T21:14:00.966Z","score":73.392857,"cost_usd":1.108214,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:05:13.239Z","tasks_attempted":110,"tasks_passed":94,"duration_ms":18669798},{"run_id":"a6a3e726-acac-446e-bd42-96933e5ee957","ts":"2026-05-29T17:17:03.230Z","score":73.339161,"cost_usd":1.111852,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:03:21.963Z","tasks_attempted":110,"tasks_passed":98,"duration_ms":23123995},{"run_id":"97865fb5-e261-41ea-93a8-9e3be118c978","ts":"2026-05-29T13:23:06.980Z","score":75.089286,"cost_usd":1.015852,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:01:28.124Z","tasks_attempted":110,"tasks_passed":97,"duration_ms":22999769}],"failure_modes":[{"code":"AL0132","count":116,"pct":0.206774,"example_message":"'Record Customer' does not contain a definition for 'Preferred Contact Method'"},{"code":"AL0104","count":102,"pct":0.181818,"example_message":"Syntax error, ')' expected"},{"code":"AL0000","count":96,"pct":0.171123,"example_message":"App generation failed"},{"code":"AL0111","count":51,"pct":0.090909,"example_message":"Semicolon expected. Add a semicolon (;) to terminate the statement."},{"code":"AL0360","count":36,"pct":0.064171,"example_message":"Text literal was not properly terminated. Use the character ' to terminate the literal."},{"code":"AL0198","count":32,"pct":0.057041,"example_message":"Expected one of the application object keywords (table, tableextension, page, pageextension, pagecustomization, profile, profileextension, codeunit, report, reportextension, xmlport, query, controladdin, dotnet, enum, enumextension, interface, permissionset, permissionsetextension, entitlement)"},{"code":"AL0169","count":16,"pct":0.02852,"example_message":"The option value 'Integration' is not valid. Check the enum definition for valid values."},{"code":"AL0133","count":15,"pct":0.026738,"example_message":"Argument 2: cannot convert from 'Text' to 'SecretText'"},{"code":"AL0122","count":13,"pct":0.023173,"example_message":"Cannot implicitly convert type 'None' to 'Text'. Use an explicit conversion or change the type."},{"code":"AL0126","count":12,"pct":0.02139,"example_message":"No overload for method 'Clear' takes 1 arguments. Candidates: 'Clear()' defined in Codeunit 'CG H054 Cache' by the extension CentralGauge_CG-AL-H054_1 by CentralGauge (1.0.0.0)"}],"recent_runs":[{"run_id":"cd36fbe5-aafa-444b-9293-23420b1517cb","ts":"2026-05-29T21:14:00.966Z","score":73.392857,"cost_usd":1.108214,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:05:13.239Z","tasks_attempted":110,"tasks_passed":94,"duration_ms":18669798},{"run_id":"a6a3e726-acac-446e-bd42-96933e5ee957","ts":"2026-05-29T17:17:03.230Z","score":73.339161,"cost_usd":1.111852,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:03:21.963Z","tasks_attempted":110,"tasks_passed":98,"duration_ms":23123995},{"run_id":"97865fb5-e261-41ea-93a8-9e3be118c978","ts":"2026-05-29T13:23:06.980Z","score":75.089286,"cost_usd":1.015852,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:01:28.124Z","tasks_attempted":110,"tasks_passed":97,"duration_ms":22999769}]}