"""
Download and transform LMSYS-Chat-1M into plain text for LLM completion models,
in the format:

<|im_start|>role
message<|endoftext|>
<|im_stop|>

with 6 newlines between conversations.
"""
|
| |
|
| | from datasets import load_dataset
|
| | import sys
|
| |
|
def main(output_path="lmsys_chat_1m.txt", split="train"):
    """Convert the LMSYS-Chat-1M dataset into a plain-text completion corpus.

    Each message of each conversation is rendered as:

        <|im_start|>role
        content<|endoftext|>
        <|im_stop|>

    Consecutive conversations are separated by exactly 6 newline characters.
    (Previously the separator was also appended after the final conversation,
    leaving six stray trailing newlines at the end of the file.)

    Args:
        output_path: Destination text file; overwritten if it exists.
        split: Dataset split to convert, e.g. "train".
    """
    # Streams from the Hugging Face hub; requires the `datasets` package
    # and (on first run) network access plus dataset-access approval.
    ds = load_dataset("lmsys/lmsys-chat-1m", split=split)

    with open(output_path, "w", encoding="utf-8") as out:
        for i, sample in enumerate(ds):
            # Write the separator *between* conversations only, so the
            # file does not end with trailing blank lines.
            if i:
                out.write("\n" * 6)

            for msg in sample["conversation"]:
                role = msg["role"]
                content = msg["content"].strip()
                out.write(f"<|im_start|>{role}\n{content}<|endoftext|>\n<|im_stop|>\n")

            # Progress goes to stderr so stdout stays clean for piping.
            if (i + 1) % 10000 == 0:
                print(f"Processed {i + 1} conversations", file=sys.stderr)

    print(f"✔ Saved plain-text to: {output_path}")
|
| |
|
if __name__ == "__main__":
    import argparse

    # CLI wrapper around main(); defaults mirror main()'s own defaults.
    parser = argparse.ArgumentParser(
        description="Convert LMSYS-Chat-1M to LLM-friendly text format"
    )
    parser.add_argument("--output", "-o", default="lmsys_chat_1m.txt", help="Output file path")
    parser.add_argument("--split", "-s", default="train", help="Dataset split (e.g. 'train')")
    cli_args = parser.parse_args()
    main(output_path=cli_args.output, split=cli_args.split)
|
| |
|