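"""Prepare the binary good/bad Dockerfile classification dataset.

Reads labeled Dockerfiles from a JSONL file, splits them into stratified
train/validation/test sets, tokenizes them with the CodeBERT tokenizer,
and saves the tokenized DatasetDict to disk.
"""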
import json
from collections import Counter
from pathlib import Path

import numpy as np
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

INPUT_PATH = Path("data/labeled/labeled_dockerfiles.jsonl")
OUTPUT_DIR = Path("data/processed/dataset_binary")
TOKENIZER_NAME = "microsoft/codebert-base"
MAX_LENGTH = 512   # token limit for CodeBERT inputs
SEED = 42
MIN_LINES = 5      # skip Dockerfiles shorter than this many lines

def load_data():
    print("📂 Loading data...")
    records = []
    with INPUT_PATH.open(encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines in the JSONL file
            obj = json.loads(line)
            if obj.get("label") not in ("good", "bad"):
                continue
            content = obj.get("content", [])
            # "content" may be stored as a list of lines or as a single string.
            if isinstance(content, list):
                content_text = "\n".join(content)
            else:
                content_text = content
            # Drop near-empty files and files shorter than MIN_LINES lines.
            if len(content_text.strip()) < 10 or len(content_text.splitlines()) < MIN_LINES:
                continue
            records.append({
                "text": content_text,
                "label": 0 if obj["label"] == "good" else 1,  # good -> 0, bad -> 1
            })

    label_counts = Counter(r["label"] for r in records)
    lengths = [len(r["text"].splitlines()) for r in records]
    print(f"✅ Loaded {len(records)} records")
    print(f"📊 Class distribution: {dict(label_counts)}")
    print(f"📏 Average file length: {np.mean(lengths):.2f} lines")
    return records

def split_data(records):
    print("🔀 Splitting data into train/val/test...")
    # First hold out 10% for test, then take 1/9 (~0.1111) of the remainder
    # for validation: 0.9 * 0.1111 ≈ 0.1, giving an 80/10/10 split overall.
    # Both splits are stratified to preserve the class ratio in every subset.
    train_val, test = train_test_split(
        records, test_size=0.1, random_state=SEED,
        stratify=[r["label"] for r in records],
    )
    train, val = train_test_split(
        train_val, test_size=0.1111, random_state=SEED,
        stratify=[r["label"] for r in train_val],
    )
    return train, val, test

def tokenize_dataset(train, val, test):
    print("🔤 Tokenizing data...")
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

    ds = DatasetDict({
        "train": Dataset.from_list(train),
        "validation": Dataset.from_list(val),
        "test": Dataset.from_list(test),
    })

    def tokenize(batch):
        # Pad/truncate every file to exactly MAX_LENGTH tokens so the model
        # receives fixed-size inputs; anything past MAX_LENGTH is cut off.
        return tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=MAX_LENGTH,
        )

    ds_tokenized = ds.map(tokenize, batched=True)
    # Keep only the model inputs ("input_ids", "attention_mask") and "label".
    ds_tokenized = ds_tokenized.remove_columns(["text"])
    return ds_tokenized

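# Note on the padding strategy above: padding="max_length" pads every example
# to the full 512 tokens up front, which keeps the saved dataset simple but
# wastes space on short files. A plausible alternative (not used here) is to
# tokenize without padding and pad per batch at training time with
# transformers.DataCollatorWithPadding.
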
def save_dataset(ds_tokenized):
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    print(f"💾 Saving dataset to {OUTPUT_DIR} ...")
    ds_tokenized.save_to_disk(str(OUTPUT_DIR))
    print("✅ Done.")

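# The saved dataset can be reloaded later for training with, e.g.:
#
#   from datasets import load_from_disk
#   ds = load_from_disk("data/processed/dataset_binary")
#   ds["train"][0]  # {"label": 0, "input_ids": [...], "attention_mask": [...]}
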
def main():
    records = load_data()
    train, val, test = split_data(records)
    ds_tokenized = tokenize_dataset(train, val, test)
    save_dataset(ds_tokenized)

if __name__ == "__main__":
    main()