Text Generation
Transformers
PyTorch
English
custom-architecture
rope
rmsnorm
swiglu
flash-attention
16k-context
Eval Results (legacy)
Instructions to use Austin207/Map-NEO with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Austin207/Map-NEO with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Austin207/Map-NEO")

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("Austin207/Map-NEO", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Austin207/Map-NEO with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "Austin207/Map-NEO"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Austin207/Map-NEO",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker
docker model run hf.co/Austin207/Map-NEO
- SGLang
How to use Austin207/Map-NEO with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "Austin207/Map-NEO" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Austin207/Map-NEO",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "Austin207/Map-NEO" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Austin207/Map-NEO",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
- Docker Model Runner
How to use Austin207/Map-NEO with Docker Model Runner:
docker model run hf.co/Austin207/Map-NEO
| # MAP-NEO Conversational Data Preprocessing Pipeline - FIXED VERSION | |
| # Downloads conversational datasets, filters and formats for instruction fine-tuning | |
| import json | |
| import os | |
| import itertools | |
| from pathlib import Path | |
| from datasets import load_dataset | |
| from transformers import AutoTokenizer | |
| import langdetect | |
| from tqdm import tqdm | |
| import argparse | |
| import random | |
| from collections import defaultdict | |
class ConversationDataPreprocessor:
    """Download, standardize and format conversational data for fine-tuning.

    The pipeline has three stages, each writing JSONL below ``output_dir``:

    * ``conversation_raw``       - dataset rows exactly as downloaded
    * ``conversation_processed`` - ``{"messages": [{"role", "content"}, ...]}``
    * ``conversation_final``     - train/test files in the training format
    """

    # Formats understood by ``save_training_format``.
    TRAINING_FORMATS = ("instruction", "chat")

    def __init__(self, output_dir="data", max_length=1024):
        """Remember the configuration and create the stage directories.

        Args:
            output_dir: Root directory for all pipeline artefacts.
            max_length: Stored on the instance; no method of this class
                reads it.
        """
        self.output_dir = Path(output_dir)
        self.max_length = max_length
        self.setup_directories()

    def setup_directories(self):
        """Create necessary directories"""
        for stage in ("conversation_raw", "conversation_processed", "conversation_final"):
            (self.output_dir / stage).mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _write_jsonl(path, records):
        """Write ``records`` to ``path`` one JSON object per line; return the count."""
        written = 0
        with open(path, "w", encoding="utf-8") as f:
            for record in records:
                f.write(json.dumps(record, ensure_ascii=False) + "\n")
                written += 1
        return written

    def download_conversational_data(self, dataset_name="OpenAssistant/oasst1", num_conversations=20000):
        """Download conversational dataset from HuggingFace.

        Streams the first ``num_conversations`` rows of the ``train`` split
        into ``conversation_raw/<dataset>_raw.jsonl``. Any failure (load,
        iteration or serialization) falls back to
        ``download_alternative_dataset``.

        Returns:
            Path of the raw JSONL file that was written.
        """
        print(f"Downloading {num_conversations} conversations from {dataset_name}...")
        raw_path = self.output_dir / "conversation_raw" / f"{dataset_name.replace('/', '_')}_raw.jsonl"
        try:
            ds = load_dataset(dataset_name, split="train", streaming=True)
            rows = tqdm(itertools.islice(ds, num_conversations), total=num_conversations)
            downloaded = self._write_jsonl(raw_path, rows)
        except Exception as e:
            # Deliberate best-effort boundary: whatever went wrong with the
            # primary dataset, try the known alternatives before giving up.
            print(f"Error downloading {dataset_name}: {e}")
            print("Trying alternative dataset...")
            return self.download_alternative_dataset(num_conversations)
        print(f"Raw conversational data saved to: {raw_path}")
        print(f"Downloaded {downloaded} conversation records")
        return raw_path

    def download_alternative_dataset(self, num_conversations=20000):
        """Try alternative conversational datasets if primary fails.

        Each candidate is loaded in full and, when larger than
        ``num_conversations``, shuffled with a fixed seed and truncated.

        Returns:
            Path of the raw JSONL file for the first candidate that worked.

        Raises:
            RuntimeError: If every candidate failed.
        """
        alternative_datasets = [
            "databricks/databricks-dolly-15k",
            "tatsu-lab/alpaca",
            "vicgalle/alpaca-gpt4",
        ]
        for dataset_name in alternative_datasets:
            try:
                print(f"Trying {dataset_name}...")
                raw_path = self.output_dir / "conversation_raw" / f"{dataset_name.replace('/', '_')}_raw.jsonl"
                ds = load_dataset(dataset_name, split="train")
                # Sample if dataset is too large
                if len(ds) > num_conversations:
                    ds = ds.shuffle(seed=42).select(range(num_conversations))
                self._write_jsonl(raw_path, tqdm(ds))
                print(f"Successfully downloaded {len(ds)} records from {dataset_name}")
                return raw_path
            except Exception as e:
                print(f"Failed to download {dataset_name}: {e}")
        raise RuntimeError("All conversational datasets failed to download")

    def process_conversations(self, input_path, dataset_name="auto"):
        """Process raw conversational data into standard format.

        The dataset type is detected from the raw file's *name* (not its
        directories, so an output directory that happens to contain "oasst"
        cannot misroute a Dolly/Alpaca file).

        Args:
            input_path: Raw JSONL file produced by one of the download methods.
            dataset_name: Accepted for interface compatibility; not used,
                because after a fallback download the requested dataset and
                the file actually written can differ.

        Returns:
            Path of the standardized JSONL file.
        """
        print("Processing conversations into standard format...")
        input_path = Path(input_path)
        file_name = input_path.name
        if "OpenAssistant" in file_name or "oasst" in file_name:
            return self.process_openassistant_messages(input_path)
        return self.process_other_datasets(input_path)

    def process_openassistant_messages(self, input_path):
        """Process OpenAssistant individual messages into conversation chains.

        Keeps messages that are English, not deleted, have a truthy
        ``review_result`` and non-blank ``text``; groups them by
        ``message_tree_id``; then, for every root (message without
        ``parent_id``), extracts conversation paths and keeps those accepted
        by ``is_valid_conversation``.

        Returns:
            Path of the standardized JSONL file.
        """
        print("🚀 Processing OpenAssistant messages into conversations...")
        print("Loading messages...")
        messages = []
        with open(input_path, 'r', encoding='utf-8') as f:
            for line in tqdm(f, desc="Reading messages"):
                try:
                    msg = json.loads(line)
                    keep = (msg.get('lang') == 'en' and
                            not msg.get('deleted', False) and
                            msg.get('review_result', False) and
                            msg.get('text', '').strip())
                except (ValueError, AttributeError, TypeError):
                    # Malformed JSON, a non-object row or a non-string text:
                    # skip the row, but let KeyboardInterrupt etc. propagate.
                    continue
                if keep:
                    messages.append(msg)
        print(f"Loaded {len(messages)} valid English messages")

        # Group messages by conversation tree
        trees = defaultdict(list)
        for msg in messages:
            tree_id = msg.get('message_tree_id')
            if tree_id:
                trees[tree_id].append(msg)
        print(f"Found {len(trees)} conversation trees")

        conversations = []
        for tree_id, tree_messages in tqdm(trees.items(), desc="Building conversations"):
            # Messages without an id cannot be linked to; leave them out of
            # the lookup instead of crashing the whole run.
            msg_dict = {msg['message_id']: msg for msg in tree_messages if 'message_id' in msg}
            roots = [msg for msg in tree_messages if not msg.get('parent_id')]
            for root in roots:
                try:
                    for path in self.build_conversation_paths(root, msg_dict):
                        conversation = [
                            {
                                "role": "user" if msg['role'] == "prompter" else "assistant",
                                "content": msg['text'].strip(),
                            }
                            for msg in path
                        ]
                        if self.is_valid_conversation(conversation):
                            conversations.append({
                                "messages": conversation,
                                "tree_id": tree_id,
                                "source": "oasst1",
                            })
                except (KeyError, AttributeError, TypeError):
                    # Skip problematic trees
                    continue
        print(f"Extracted {len(conversations)} valid conversations")

        output_path = self.output_dir / "conversation_processed" / "conversations_standardized.jsonl"
        self._write_jsonl(output_path, conversations)
        print(f"Processed data saved to: {output_path}")
        return output_path

    def build_conversation_paths(self, root_msg, msg_dict, max_length=8):
        """Build the conversation paths starting from a root message.

        At every step only the best-ranked child (lowest ``rank``; missing
        or non-numeric ranks count as 999, ties keep ``msg_dict`` order) is
        followed, until a leaf is reached or the path holds ``max_length``
        messages. Every prefix of that chain with at least two messages is
        returned, longest first.

        Args:
            root_msg: Message dict to start from (needs ``message_id``).
            msg_dict: ``message_id -> message`` mapping for the tree.
            max_length: Maximum number of messages in a path.

        Returns:
            List of paths, each a new list of message dicts.
        """
        # Index children once instead of rescanning every message per node.
        children_of = defaultdict(list)
        for candidate in msg_dict.values():
            parent_id = candidate.get('parent_id')
            if parent_id is not None:
                children_of[parent_id].append(candidate)

        def rank_of(message):
            rank = message.get('rank')
            # Lower rank = better; unranked or unusable ranks sort last.
            return rank if isinstance(rank, (int, float)) else 999

        chain = [root_msg]
        while len(chain) < max_length:
            children = children_of.get(chain[-1]['message_id'])
            if not children:
                break
            chain.append(min(children, key=rank_of))

        return [chain[:size] for size in range(len(chain), 1, -1)]

    def is_valid_conversation(self, conversation):
        """Validate conversation quality.

        A conversation passes when it has at least two messages, roles
        strictly alternate, every message is 5-1500 characters long and the
        total length is 20-3000 characters.
        """
        if len(conversation) < 2:
            return False
        # Check for alternating roles (user/assistant pattern)
        if any(prev['role'] == cur['role'] for prev, cur in zip(conversation, conversation[1:])):
            return False
        lengths = [len(msg['content']) for msg in conversation]
        if any(length < 5 or length > 1500 for length in lengths):
            return False
        return 20 <= sum(lengths) <= 3000

    def process_other_datasets(self, input_path):
        """Process non-OpenAssistant datasets (Dolly, Alpaca, etc.).

        Rows that cannot be parsed, are in an unknown format or fail
        ``validate_simple_conversation`` are skipped.

        Returns:
            Path of the standardized JSONL file.
        """
        output_path = self.output_dir / "conversation_processed" / "conversations_standardized.jsonl"
        conversations = []
        total_count = 0
        with open(input_path, "r", encoding="utf-8") as infile:
            for line in tqdm(infile, desc="Processing conversations"):
                total_count += 1
                try:
                    conversation = self.extract_conversation_other_formats(json.loads(line))
                    if conversation and self.validate_simple_conversation(conversation):
                        conversations.append(conversation)
                except (ValueError, KeyError, AttributeError, TypeError):
                    # Bad JSON or fields of an unexpected type: skip the row.
                    continue
        valid_count = self._write_jsonl(output_path, conversations)
        print(f"Processed {valid_count}/{total_count} valid conversations")
        print(f"Processed data saved to: {output_path}")
        return output_path

    def extract_conversation_other_formats(self, raw_data):
        """Extract conversation from various dataset formats.

        Recognizes Dolly rows (``instruction`` + ``response``, optional
        ``context``) and Alpaca rows (``instruction`` + ``output``, optional
        ``input``). Dolly takes precedence when both answer keys exist.

        Returns:
            A standardized conversation dict, or ``None`` for other shapes.
        """
        if 'instruction' not in raw_data:
            return None
        if 'response' in raw_data:
            answer_key, extra_key, extra_label, source = 'response', 'context', 'Context', 'dolly'
        elif 'output' in raw_data:
            answer_key, extra_key, extra_label, source = 'output', 'input', 'Input', 'alpaca'
        else:
            return None

        prompt = raw_data['instruction'].strip()
        if raw_data.get(extra_key):
            prompt += f"\n{extra_label}: {raw_data[extra_key].strip()}"

        record = {
            "messages": [
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": raw_data[answer_key].strip()},
            ]
        }
        if source == 'dolly':
            record["category"] = raw_data.get('category', 'general')
        record["source"] = source
        return record

    def validate_simple_conversation(self, conversation):
        """Validate conversation quality for simple formats.

        Requires at least one message, every message to have at least 5
        non-blank characters, and a total length of 10-2000 characters.
        """
        messages = conversation.get('messages', [])
        if not messages:
            return False
        if any(len(msg.get('content', '').strip()) < 5 for msg in messages):
            return False
        total_length = sum(len(msg['content']) for msg in messages)
        return 10 <= total_length <= 2000

    def format_for_training(self, input_path, train_format="instruction", seed=None):
        """Format conversations for fine-tuning.

        Loads the standardized JSONL (blank lines are ignored), shuffles,
        splits 90/10 and writes ``conversation_train.jsonl`` and
        ``conversation_test.jsonl`` under ``conversation_final``.

        Args:
            input_path: Standardized JSONL file.
            train_format: ``"instruction"`` or ``"chat"``.
            seed: Optional seed for a reproducible split. ``None`` keeps the
                previous behaviour of shuffling with the global ``random``
                state.

        Returns:
            ``(train_path, test_path)``.

        Raises:
            ValueError: If ``train_format`` is not a known format.
        """
        print(f"Formatting conversations for {train_format} training...")
        input_path = Path(input_path)
        output_path = self.output_dir / "conversation_final" / "conversation_train.jsonl"
        test_path = self.output_dir / "conversation_final" / "conversation_test.jsonl"

        with open(input_path, "r", encoding="utf-8") as f:
            conversations = [json.loads(line) for line in f if line.strip()]

        # NOTE(review): OpenAssistant prefixes of one tree can land on both
        # sides of the split; group by "tree_id" if strict separation matters.
        if seed is None:
            random.shuffle(conversations)
        else:
            random.Random(seed).shuffle(conversations)
        split_point = int(len(conversations) * 0.9)
        train_conversations = conversations[:split_point]
        test_conversations = conversations[split_point:]

        self.save_training_format(train_conversations, output_path, train_format)
        self.save_training_format(test_conversations, test_path, train_format)

        print(f"Training conversations: {len(train_conversations)}")
        print(f"Test conversations: {len(test_conversations)}")
        print(f"Training data saved to: {output_path}")
        print(f"Test data saved to: {test_path}")

        # Show samples
        if train_conversations:
            print("\n📝 Sample conversations:")
            for i, conv in enumerate(train_conversations[:3]):
                print(f"\nConversation {i+1}:")
                for j, msg in enumerate(conv['messages']):
                    content = msg['content'][:80] + "..." if len(msg['content']) > 80 else msg['content']
                    print(f" {j+1}. {msg['role'].title()}: {content}")
        return output_path, test_path

    def save_training_format(self, conversations, output_path, format_type):
        """Save conversations in training format.

        Conversations with fewer than two messages are skipped.

        * ``"instruction"``: the last message is the target and the earlier
          ones form the input. Conversations whose last message is not an
          assistant turn are skipped, so a user turn never becomes a target.
        * ``"chat"``: the full message list prefixed with a system prompt.

        Raises:
            ValueError: If ``format_type`` is not a known format.
        """
        if format_type not in self.TRAINING_FORMATS:
            raise ValueError(
                f"Unknown training format {format_type!r}; expected one of {self.TRAINING_FORMATS}"
            )
        with open(output_path, "w", encoding="utf-8") as f:
            for conv in conversations:
                messages = conv['messages']
                if len(messages) < 2:
                    continue
                if format_type == "instruction":
                    if messages[-1]['role'] != "assistant":
                        continue
                    training_example = {
                        "instruction": "Continue this conversation naturally and helpfully.",
                        "input": "\n".join(
                            f"{msg['role'].title()}: {msg['content']}" for msg in messages[:-1]
                        ),
                        "output": messages[-1]['content'],
                    }
                else:
                    training_example = {
                        "messages": [
                            {"role": "system", "content": "You are MAP-NEO, a helpful AI assistant."}
                        ] + messages
                    }
                f.write(json.dumps(training_example, ensure_ascii=False) + "\n")
def main():
    """Command-line entry point: download, standardize, then format the data.

    Parses ``--dataset``, ``--num_conversations``, ``--format`` and
    ``--output_dir`` and runs the three pipeline stages in order, printing
    the resulting train/test file locations at the end.
    """
    cli = argparse.ArgumentParser(description="Preprocess conversational data for fine-tuning")
    cli.add_argument("--dataset", type=str, default="OpenAssistant/oasst1",
                     help="Dataset to download")
    cli.add_argument("--num_conversations", type=int, default=20000,
                     help="Number of conversations to download")
    cli.add_argument("--format", type=str, default="instruction",
                     choices=["instruction", "chat"],
                     help="Training format")
    cli.add_argument("--output_dir", type=str, default="data",
                     help="Output directory")
    opts = cli.parse_args()

    pipeline = ConversationDataPreprocessor(opts.output_dir)
    print("Starting conversational data preprocessing pipeline...")

    # Stage 1: fetch the raw rows.
    raw_file = pipeline.download_conversational_data(opts.dataset, opts.num_conversations)
    # Stage 2: convert to the standard message format (type is auto-detected).
    standardized_file = pipeline.process_conversations(raw_file, opts.dataset)
    # Stage 3: emit the train/test files in the requested format.
    train_path, test_path = pipeline.format_for_training(standardized_file, opts.format)

    banner = "=" * 60
    print("\n" + banner)
    print("🎉 Conversational data preprocessing complete!")
    print(f"Training data: {train_path}")
    print(f"Test data: {test_path}")
    print("\n🚀 Ready for conversational fine-tuning!")
    print("Next step: python finetune_conversational.py")
    print(banner)


if __name__ == "__main__":
    main()