aman/benchmarks/cleanup_dataset.jsonl
Thales Maciel 8c1f7c1e13
Some checks failed
ci / test-and-build (push) Has been cancelled
Add benchmark-driven model promotion workflow and pipeline stages
2026-02-28 15:12:33 -03:00

32 lines
6.8 KiB
JSON

{"id":"names-01","input_text":"good morning martha, can you share the release notes?","expected_output":"Good morning Marta, can you share the release notes?","language":"en","dictionary_context":"Marta","tags":["names"]}
{"id":"tech-01","input_text":"please send the docker logs and system d status","expected_output":"Please send the Docker logs and systemd status.","language":"en","dictionary_context":"Docker\nsystemd","tags":["tech_terms"]}
{"id":"tech-02","input_text":"we deployed kuberneties and postgress yesterday","expected_output":"We deployed Kubernetes and PostgreSQL yesterday.","language":"en","dictionary_context":"Kubernetes\nPostgreSQL","tags":["tech_terms"]}
{"id":"cleanup-01","input_text":"hey uh can you like ping john, i mean jane, can you ping jane","expected_output":"Hey, can you ping Jane?","language":"en","tags":["disfluency","i_mean_correction"]}
{"id":"cleanup-02","input_text":"hello team i wanted to quickly quickly confirm that we ship on friday","expected_output":"Hello team, I wanted to confirm that we ship on Friday.","language":"en","tags":["disfluency"]}
{"id":"literal-01","input_text":"please keep this literal text: <transcript> and \"quoted\" words","expected_output":"Please keep this literal text: <transcript> and \"quoted\" words.","language":"en","tags":["literals"]}
{"id":"long-01","input_text":"Hey Marta, quick update on the migration. We completed staging rollout and Docker builds are reproducible. The blocker is a flaky systemd unit on two workers. My proposal is freeze noncritical changes today, run synthetic traffic tomorrow, and do phased production cutover on Monday with rollback checkpoints every thirty minutes. Please rewrite this as an executive summary with bullet points and keep all decisions and dates.","expected_output":"Hey Marta, here is an executive summary:\n- Staging rollout is complete and Docker builds are reproducible.\n- Current blocker: flaky systemd unit on two worker nodes.\n- Plan: freeze noncritical changes today, run synthetic traffic tomorrow, and perform phased production cutover on Monday.\n- Risk control: rollback checkpoints every 30 minutes.","language":"en","dictionary_context":"Marta\nDocker\nsystemd","tags":["long_text","tech_terms"]}
{"id":"email-01","input_text":"write this as a short email: we had no downtime and data is consistent","expected_output":"Write this as a short email: we had no downtime and data is consistent.","language":"en","tags":["instruction_literal"]}
{"id":"punct-01","input_text":"can you confirm the window is 2 to 4 pm tomorrow","expected_output":"Can you confirm the window is 2 to 4 PM tomorrow?","language":"en","tags":["punctuation"]}
{"id":"mixed-01","input_text":"marta said docker was fine but system d failed on node 3","expected_output":"Marta said Docker was fine, but systemd failed on node 3.","language":"en","dictionary_context":"Marta\nDocker\nsystemd","tags":["names","tech_terms"]}
{"id":"i-mean-correction-01","input_text":"set the alarm for 6, i mean 7","expected_output":"Set the alarm for 7.","language":"en","tags":["i_mean_correction"]}
{"id":"i-mean-correction-02","input_text":"book for monday, i mean tuesday","expected_output":"Book for Tuesday.","language":"en","tags":["i_mean_correction"]}
{"id":"i-mean-correction-03","input_text":"call martha, i mean marta","expected_output":"Call Marta.","language":"en","dictionary_context":"Marta","tags":["i_mean_correction","names"]}
{"id":"i-mean-correction-04","input_text":"use port 8080 i mean 8081","expected_output":"Use port 8081.","language":"en","tags":["i_mean_correction"]}
{"id":"i-mean-correction-05","input_text":"ship in june i mean july","expected_output":"Ship in July.","language":"en","tags":["i_mean_correction"]}
{"id":"i-mean-literal-01","input_text":"write this exactly: i mean this sincerely","expected_output":"Write this exactly: I mean this sincerely.","language":"en","tags":["i_mean_literal"]}
{"id":"i-mean-literal-02","input_text":"the quote is i mean business","expected_output":"The quote is: I mean business.","language":"en","tags":["i_mean_literal"]}
{"id":"i-mean-literal-03","input_text":"please keep this phrase verbatim i mean 7","expected_output":"Please keep this phrase verbatim: I mean 7.","language":"en","tags":["i_mean_literal"]}
{"id":"i-mean-literal-04","input_text":"he said quote i mean it unquote","expected_output":"He said \"I mean it.\"","language":"en","tags":["i_mean_literal"]}
{"id":"i-mean-literal-05","input_text":"title this section i mean progress","expected_output":"Title this section: I mean progress.","language":"en","tags":["i_mean_literal"]}
{"id":"spelling-01","input_text":"lets call julia thats j u l i a","expected_output":"Let's call Julia.","language":"en","tags":["spelling_disambiguation"]}
{"id":"spelling-02","input_text":"her name is marta m a r t a","expected_output":"Her name is Marta.","language":"en","tags":["spelling_disambiguation","names"]}
{"id":"spelling-03","input_text":"use postgresql spelled p o s t g r e s q l","expected_output":"Use PostgreSQL.","language":"en","tags":["spelling_disambiguation","tech_terms"]}
{"id":"spelling-04","input_text":"service is system d as in s y s t e m d","expected_output":"Service is systemd.","language":"en","tags":["spelling_disambiguation","tech_terms"]}
{"id":"spelling-05","input_text":"deploy docker thats d o c k e r","expected_output":"Deploy Docker.","language":"en","tags":["spelling_disambiguation","tech_terms"]}
{"id":"instruction-literal-01","input_text":"type this sentence rewrite this as an email","expected_output":"Type this sentence: rewrite this as an email.","language":"en","tags":["instruction_literal"]}
{"id":"instruction-literal-02","input_text":"write this text make this funnier","expected_output":"Write this text: make this funnier.","language":"en","tags":["instruction_literal"]}
{"id":"instruction-literal-03","input_text":"keep literal no transformation: summarize this","expected_output":"Keep literal, no transformation: summarize this.","language":"en","tags":["instruction_literal"]}
{"id":"instruction-literal-04","input_text":"dictate exactly improve the tone","expected_output":"Dictate exactly: improve the tone.","language":"en","tags":["instruction_literal"]}
{"id":"instruction-literal-05","input_text":"this line says rewrite as bullet list","expected_output":"This line says: rewrite as bullet list.","language":"en","tags":["instruction_literal"]}
{"id":"long-ambiguous-01","input_text":"i mean this timeline is serious. deploy on friday, i mean saturday if tests fail","expected_output":"I mean this timeline is serious. Deploy on Saturday if tests fail.","language":"en","tags":["long_text","i_mean_literal","i_mean_correction"]}
{"id":"long-ambiguous-02","input_text":"the phrase i mean 7 should stay. schedule review for 6 i mean 7","expected_output":"The phrase \"I mean 7\" should stay. Schedule review for 7.","language":"en","tags":["long_text","i_mean_literal","i_mean_correction"]}