Instructions to use Canstralian/CyberAttackDetection with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Canstralian/CyberAttackDetection with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="Canstralian/CyberAttackDetection")
```

```python
# Load model directly
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("Canstralian/CyberAttackDetection", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments | |
| from datasets import load_dataset, Dataset, DatasetDict | |
| from config import Config | |
| import torch | |
| from sklearn.model_selection import train_test_split | |
| import pandas as pd | |
class CyberAttackDetectionModel:
    """Fine-tune and query a causal language model on security-related text.

    Tokenizer/model identifiers, the target device and all hyper-parameters
    are read from the project-level ``Config`` object.
    """

    # Columns the Trainer consumes. Everything else is dropped before the
    # individual corpora are concatenated, because concatenation requires
    # every part to share the same schema.
    _MODEL_COLUMNS = ("input_ids", "attention_mask", "labels")

    def __init__(self):
        """Load the tokenizer and model named in ``Config`` and move the model to ``Config.DEVICE``."""
        self.tokenizer = AutoTokenizer.from_pretrained(Config.TOKENIZER_NAME)
        # Some causal-LM tokenizers ship without a pad token, which makes
        # padding='max_length' raise. Fall back to the EOS token in that case.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(Config.MODEL_NAME)
        self.model.to(Config.DEVICE)

    @staticmethod
    def _clean_text(text: str) -> str:
        """Return *text* lower-cased with newlines replaced by single spaces."""
        return text.lower().replace("\n", " ")

    def preprocess_data(self, dataset):
        """Clean and tokenize a raw text dataset.

        Args:
            dataset: A ``datasets`` dataset exposing a ``'text'`` column.
                NOTE(review): not every corpus loaded by
                ``load_and_process_datasets`` is guaranteed to have that
                column -- confirm per dataset.

        Returns:
            The tokenized dataset, formatted as PyTorch tensors and exposing
            ``input_ids``, ``attention_mask`` and ``labels``.
        """
        # Basic text normalization.
        dataset = dataset.map(lambda row: {'text': self._clean_text(row['text'])})

        def tokenize_function(examples):
            encoded = self.tokenizer(
                examples['text'],
                truncation=True,
                padding='max_length',
                max_length=Config.MAX_LENGTH,
            )
            # Causal-LM objective: labels mirror input_ids (the model shifts
            # them internally). Padded positions are set to -100 so the loss
            # ignores them. Without this column set_format() below fails,
            # since the tokenizer itself never produces 'labels'.
            encoded["labels"] = [
                [token if keep == 1 else -100 for token, keep in zip(ids, mask)]
                for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
            ]
            return encoded

        tokenized_dataset = dataset.map(tokenize_function, batched=True)
        tokenized_dataset.set_format(type='torch', columns=list(self._MODEL_COLUMNS))
        return tokenized_dataset

    def fine_tune(self, datasets):
        """Fine-tune the model on preprocessed data.

        Args:
            datasets: Mapping with ``'train'`` and ``'validation'`` entries,
                as returned by ``load_and_process_datasets``.
        """
        training_args = TrainingArguments(
            output_dir=Config.OUTPUT_DIR,
            # NOTE: newer transformers releases rename this to `eval_strategy`.
            evaluation_strategy="epoch",
            # load_best_model_at_end requires the save strategy to match the
            # evaluation strategy; the default ("steps") is rejected.
            save_strategy="epoch",
            learning_rate=Config.LEARNING_RATE,
            per_device_train_batch_size=Config.BATCH_SIZE,
            per_device_eval_batch_size=Config.BATCH_SIZE,
            weight_decay=Config.WEIGHT_DECAY,
            save_total_limit=3,
            num_train_epochs=Config.NUM_EPOCHS,
            logging_dir=Config.LOGGING_DIR,
            load_best_model_at_end=True,
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=datasets['train'],
            eval_dataset=datasets['validation'],
        )
        trainer.train()

    def predict(self, prompt: str) -> str:
        """Generate a continuation for *prompt* and return it as decoded text.

        The prompt is truncated to ``Config.MAX_LENGTH`` tokens, and the same
        value caps the total (prompt + generated) length.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=Config.MAX_LENGTH)
        inputs = {key: value.to(Config.DEVICE) for key, value in inputs.items()}
        outputs = self.model.generate(**inputs, max_length=Config.MAX_LENGTH)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def load_and_process_datasets(self, validation_fraction=0.1, seed=42):
        """Load, preprocess and merge the OSINT and WhiteRabbitNeo corpora.

        Args:
            validation_fraction: Share of the merged data held out for
                validation. The source datasets are only read through their
                ``'train'`` split, so the validation set is carved out here.
            seed: Seed for the train/validation shuffle, for reproducibility.

        Returns:
            A ``DatasetDict`` with ``'train'`` and ``'validation'`` splits.
        """
        # Local import keeps this method self-contained; `datasets` is
        # already a dependency of this file.
        from datasets import concatenate_datasets

        osint_datasets = [
            'gonferspanish/OSINT',
            'Inforensics/missing-persons-clue-analysis-osint',
            'jester6136/osint',
            'originalbox/osint',
        ]
        wrn_datasets = [
            'WhiteRabbitNeo/WRN-Chapter-2',
            'WhiteRabbitNeo/WRN-Chapter-1',
            'WhiteRabbitNeo/Code-Functions-Level-Cyber',
        ]

        keep = set(self._MODEL_COLUMNS)
        processed_parts = []
        for dataset_name in osint_datasets + wrn_datasets:
            raw = load_dataset(dataset_name)
            processed = self.preprocess_data(raw['train'])  # Assuming the 'train' split exists
            # Drop source-specific columns so all parts share one schema.
            extra_columns = [name for name in processed.column_names if name not in keep]
            if extra_columns:
                processed = processed.remove_columns(extra_columns)
            processed_parts.append(processed)

        combined = concatenate_datasets(processed_parts)
        # Re-apply the tensor format explicitly in case it was reset on merge.
        combined.set_format(type='torch', columns=list(self._MODEL_COLUMNS))

        split = combined.train_test_split(test_size=validation_fraction, seed=seed)
        return DatasetDict({
            "train": split["train"],
            "validation": split["test"],
        })
if __name__ == "__main__":
    # Build the detector (loads tokenizer and model).
    detector = CyberAttackDetectionModel()

    # Fetch and preprocess the corpora, then fine-tune on them.
    detector.fine_tune(detector.load_and_process_datasets())

    # Smoke-test the fine-tuned model with a sample finding.
    sample_prompt = "A network scan reveals an open port 22 with an outdated SSH service."
    generated = detector.predict(sample_prompt)
    print(generated)