{"task_set_hash":"b31c942bd4e8afcfc48e238184a7d6c6db5286b138ceeeeaec206027007f7ff4","model":{"slug":"gemini/gemini-3.5-flash","display_name":"Gemini 3.5 Flash","api_model_id":"gemini-3.5-flash","family_slug":"gemini","added_at":"2026-05-28T21:35:35.222Z","settings_suffix":"","max_input_tokens":1048576,"max_output_tokens":65536,"capabilities":null},"aggregates":{"avg_score":57.521299,"tasks_attempted":939,"tasks_passed":497,"tasks_attempted_distinct":110,"tasks_passed_attempt_1":76,"tasks_passed_attempt_2_only":21,"pass_at_n":0.881818,"avg_cost_usd":0.056145,"latency_p50_ms":31228,"latency_p95_ms":362994,"pass_rate_ci":{"lower":0.8082479662756031,"upper":0.9296193862563361},"pass_hat_at_n":0.5636363636363636,"cost_per_pass_usd":0.06367,"run_count":6,"verified_runs":0},"settings":{"temperature":null,"thinking_budget":null,"tokens_avg_per_run":275954,"consistency_pct":58.18},"history":[{"run_id":"c3bb2ccf-7036-4cca-a135-aa211a08c9f4","ts":"2026-05-29T21:14:00.965Z","score":60.111465,"cost_usd":0.894107,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:05:37.832Z","tasks_attempted":110,"tasks_passed":85,"duration_ms":17499830},{"run_id":"1cd7680d-0bea-4d96-af28-fca3f0e75b7d","ts":"2026-05-29T17:17:03.230Z","score":66.469595,"cost_usd":0.92299,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:04:47.096Z","tasks_attempted":110,"tasks_passed":89,"duration_ms":18801643},{"run_id":"473921f2-69d2-42d2-ae94-87d90f38af53","ts":"2026-05-29T13:23:06.981Z","score":67,"cost_usd":0.983451,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:02:54.572Z","tasks_attempted":110,"tasks_passed":93,"duration_ms":24910084},{"run_id":"2c01b44c-9f4a-4308-ba37-85444d3d3796","ts":"2026-05-29T04:23:51.269Z","score":53.560127,"cost_usd":1.092399,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:07:35.013Z","tasks_attempted":110,"tasks_passed":79,"duration_ms":5844656},{"run_id":"90a07673-dba9-4713-971c-01581e2fdaac","ts":"2026-05-29T01:01:20.142Z","score":52.562112,"cost_usd":1.119941,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:06:39.354Z","tasks_attempted":110,"tasks_passed":79,"duration_ms":5813122},{"run_id":"c574b74f-cca1-4101-8d57-6c03fe99b514","ts":"2026-05-28T21:35:35.222Z","score":47.045455,"cost_usd":1.163119,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:04:41.760Z","tasks_attempted":110,"tasks_passed":72,"duration_ms":6246775}],"failure_modes":[{"code":"AL0104","count":1203,"pct":0.33104,"example_message":"Syntax error, ')' expected"},{"code":"AL0183","count":463,"pct":0.127408,"example_message":"Unexpected character '`'. Remove the invalid character or check if a special character needs escaping."},{"code":"AL0000","count":358,"pct":0.098514,"example_message":"App generation failed"},{"code":"AL0111","count":327,"pct":0.089983,"example_message":"Semicolon expected. Add a semicolon (;) to terminate the statement."},{"code":"AL0132","count":317,"pct":0.087232,"example_message":"'Record Customer' does not contain a definition for 'Preferred Contact Method'"},{"code":"AL0198","count":202,"pct":0.055586,"example_message":"Expected one of the application object keywords (table, tableextension, page, pageextension, pagecustomization, profile, profileextension, codeunit, report, reportextension, xmlport, query, controladdin, dotnet, enum, enumextension, interface, permissionset, permissionsetextension, entitlement)"},{"code":"AL0107","count":191,"pct":0.052559,"example_message":"Syntax error, identifier expected. Provide a valid name (letters, digits, and underscores only)."},{"code":"AL0360","count":112,"pct":0.03082,"example_message":"Text literal was not properly terminated. Use the character ' to terminate the literal."},{"code":"AL0105","count":66,"pct":0.018162,"example_message":"Syntax error, identifier expected; 'key' is a keyword"},{"code":"AL0114","count":60,"pct":0.016511,"example_message":"Syntax error, integer literal expected. Provide a numeric value (e.g., 0, 1, 42)."}],"recent_runs":[{"run_id":"c3bb2ccf-7036-4cca-a135-aa211a08c9f4","ts":"2026-05-29T21:14:00.965Z","score":60.111465,"cost_usd":0.894107,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:05:37.832Z","tasks_attempted":110,"tasks_passed":85,"duration_ms":17499830},{"run_id":"1cd7680d-0bea-4d96-af28-fca3f0e75b7d","ts":"2026-05-29T17:17:03.230Z","score":66.469595,"cost_usd":0.92299,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:04:47.096Z","tasks_attempted":110,"tasks_passed":89,"duration_ms":18801643},{"run_id":"473921f2-69d2-42d2-ae94-87d90f38af53","ts":"2026-05-29T13:23:06.981Z","score":67,"cost_usd":0.983451,"tier":"claimed","status":"completed","completed_at":"2026-05-30T08:02:54.572Z","tasks_attempted":110,"tasks_passed":93,"duration_ms":24910084},{"run_id":"2c01b44c-9f4a-4308-ba37-85444d3d3796","ts":"2026-05-29T04:23:51.269Z","score":53.560127,"cost_usd":1.092399,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:07:35.013Z","tasks_attempted":110,"tasks_passed":79,"duration_ms":5844656},{"run_id":"90a07673-dba9-4713-971c-01581e2fdaac","ts":"2026-05-29T01:01:20.142Z","score":52.562112,"cost_usd":1.119941,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:06:39.354Z","tasks_attempted":110,"tasks_passed":79,"duration_ms":5813122},{"run_id":"c574b74f-cca1-4101-8d57-6c03fe99b514","ts":"2026-05-28T21:35:35.222Z","score":47.045455,"cost_usd":1.163119,"tier":"claimed","status":"completed","completed_at":"2026-05-29T08:04:41.760Z","tasks_attempted":110,"tasks_passed":72,"duration_ms":6246775}]}