# Below is a compact ~200-line “mini ChatGPT” style model you can run locally.
# It uses a Transformer decoder (GPT-style) built with PyTorch.
#
# ⚠️ This is still very small compared to real models like GPT-4 or LLaMA, but
# it can learn conversation patterns and reply to prompts.
#
# 1️⃣ Install Requirements
#
#     pip install torch
#
# 2️⃣ Create Training Data
#
# Create chat_data.txt
#
# Example conversation dataset:
#
#     User: hello
#     Bot: hello! how can i help you?
#     User: what is ai
#     Bot: ai means artificial intelligence.
#     User: who created python
#     Bot: python was created by guido van rossum.
#     User: what is machine learning
#     Bot: machine learning allows computers to learn from data.
#
# The model will learn conversation patterns.
#
# 3️⃣ Mini ChatGPT (~200 lines)
#
# Create mini_chatgpt.py

import torch
import torch.nn as nn
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"

# hyperparameters
batch_size = 32
block_size = 128
max_iters = 3000
eval_interval = 300
learning_rate = 3e-4
n_embd = 128
n_head = 4
n_layer = 4
dropout = 0.2


def build_vocab(text):
    """Build character-level lookup tables from the training text.

    Returns:
        A ``(stoi, itos)`` pair: char -> id and id -> char dictionaries.
    """
    chars = sorted(set(text))
    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = dict(enumerate(chars))
    return stoi, itos


def encode(s, stoi):
    """Convert a string to a list of token ids.

    Characters that never appeared in the training text have no id; they are
    skipped so that arbitrary user input cannot raise ``KeyError``.
    """
    return [stoi[c] for c in s if c in stoi]


def decode(ids, itos):
    """Convert a sequence of token ids back to a string."""
    return "".join(itos[i] for i in ids)


def get_batch(data):
    """Sample a random batch of (input, target) windows from a 1-D token tensor.

    Targets are the inputs shifted one position to the right.

    Raises:
        ValueError: if ``data`` is not longer than ``block_size``.
    """
    if len(data) <= block_size:
        raise ValueError(
            f"need more than {block_size} training tokens, got {len(data)}"
        )
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    return x.to(device), y.to(device)


class Head(nn.Module):
    """One head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask: position t may only attend to positions <= t.
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        _, T, _ = x.shape
        k = self.key(x)
        q = self.query(x)
        # Scale by the head dimension (the size of q and k), not by the
        # embedding width, so the logits keep unit variance.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        return wei @ self.value(x)


class MultiHeadAttention(nn.Module):
    """Several attention heads run in parallel, concatenated and projected."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(out))


class FeedForward(nn.Module):
    """Position-wise two-layer MLP with a 4x hidden expansion."""

    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    """Transformer block: pre-norm self-attention then pre-norm feed-forward."""

    def __init__(self):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward()
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


class ChatGPT(nn.Module):
    """Character-level GPT-style language model.

    Args:
        vocab_size: number of distinct tokens (characters) in the vocabulary.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block() for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Return ``(logits, loss)`` for a (B, T) batch of token ids.

        Without ``targets`` the logits are (B, T, vocab) and the loss is None.
        With ``targets`` the logits are flattened to (B*T, vocab) and the
        loss is the mean cross-entropy.
        """
        _, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        # Build positions on the input's device so the model works wherever
        # it has been moved, independent of the global ``device``.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = self.blocks(tok_emb + pos_emb)
        logits = self.lm_head(self.ln_f(x))
        if targets is None:
            return logits, None
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Append ``max_new_tokens`` sampled tokens to the (B, T) context ``idx``.

        Raises:
            ValueError: if the context is empty (there is nothing to condition on).
        """
        if idx.shape[1] == 0:
            raise ValueError("generate() needs at least one context token")
        for _ in range(max_new_tokens):
            # The position table only covers block_size positions.
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            probs = F.softmax(logits[:, -1, :], dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


def generate_reply(model, context_text, stoi, itos, max_new_tokens):
    """Sample a continuation of ``context_text`` and return only the new text."""
    ids = encode(context_text, stoi)
    if not ids:
        return ""
    model_device = next(model.parameters()).device
    context = torch.tensor([ids], dtype=torch.long, device=model_device)
    output = model.generate(context, max_new_tokens)
    # Slice by token count: characters dropped by encode() make the decoded
    # context shorter than context_text, so character offsets are unreliable.
    return decode(output[0, len(ids):].tolist(), itos)


def main():
    """Train on chat_data.txt, save the weights, then chat interactively."""
    # load dataset
    with open("chat_data.txt", "r", encoding="utf-8") as f:
        text = f.read()
    stoi, itos = build_vocab(text)
    data = torch.tensor(encode(text, stoi), dtype=torch.long)
    # Train on the first 90%; the remaining 10% is held out.
    train_data = data[:int(0.9 * len(data))]

    model = ChatGPT(len(stoi)).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    for step in range(max_iters):
        xb, yb = get_batch(train_data)
        _, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        if step % eval_interval == 0:
            print(step, loss.item())

    torch.save(model.state_dict(), "chatgpt_model.pth")
    print("training finished")

    model.eval()
    while True:
        try:
            prompt = input("You: ")
        except (EOFError, KeyboardInterrupt):
            break
        # Frame the prompt like the training data so the model continues
        # with a bot turn instead of an arbitrary continuation.
        reply = generate_reply(model, "User: " + prompt + "\nBot: ", stoi, itos, 100)
        # The model keeps writing past its answer; keep only the bot's turn.
        print("Bot:", reply.split("User:", 1)[0].strip())


if __name__ == "__main__":
    main()

# 4️⃣ Run Training
#
#     python mini_chatgpt.py
#
# Training log example:
#
#     0 3.2
#     300 2.5
#     600 2.1
#
# 5️⃣ Chat With Your Model
#
# After training finishes:
#
#     You: hello
#     Bot: hello! how can i help you?
#     You: what is ai
#     Bot: ai means artificial intelligence
#
# 6️⃣ Project Structure
#
#     chatgpt-mini/
#     │
#     ├── mini_chatgpt.py
#     ├── chat_data.txt
#     └── chatgpt_model.pth
#
# 7️⃣ What This Mini ChatGPT Teaches
#
# Architecture:
#
#     Text
#      ↓
#     Tokenization
#      ↓
#     Embeddings
#      ↓
#     Transformer Blocks
#      ↓
#     Next Token Prediction
#      ↓
#     Chat Response
#
# Key ideas used by real LLMs.
#
# =====================================
#
# Below is a simple ~300-line “mini ChatGPT” that can remember conversation
# context while chatting. It uses a GPT-style Transformer built with PyTorch.
#
# ⚠️ This is still tiny compared to real systems like GPT-4 or ChatGPT, but it
# demonstrates the core idea of conversational memory.
#
# 1️⃣ Install Dependencies
#
#     pip install torch
#
# 2️⃣ Create Training Dataset
#
# Create chat_data.txt
#
# Example:
#
#     User: hello
#     Bot: hello! how can i help you?
#     User: what is ai
#     Bot: artificial intelligence means machines that can learn.
#     User: who created python
#     Bot: python was created by guido van rossum.
#     User: what is machine learning
#     Bot: machine learning is a way for computers to learn from data.
#
# The model learns conversation patterns.
#
# 3️⃣ Mini ChatGPT With Memory
#
# Create chatgpt_memory.py

import torch
import torch.nn as nn
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"

# hyperparameters
batch_size = 32
block_size = 128
max_iters = 3000
learning_rate = 3e-4
n_embd = 128
n_head = 4
n_layer = 4
dropout = 0.2


def build_vocab(text):
    """Build character-level lookup tables from the training text.

    Returns:
        A ``(stoi, itos)`` pair: char -> id and id -> char dictionaries.
    """
    chars = sorted(set(text))
    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = dict(enumerate(chars))
    return stoi, itos


def encode(s, stoi):
    """Convert a string to token ids, skipping characters not in the vocabulary.

    Skipping (instead of indexing blindly) keeps arbitrary user input from
    raising ``KeyError``.
    """
    return [stoi[c] for c in s if c in stoi]


def decode(ids, itos):
    """Convert a sequence of token ids back to a string."""
    return "".join(itos[i] for i in ids)


def get_batch(data):
    """Sample a random batch of (input, target) windows from a 1-D token tensor.

    Targets are the inputs shifted one position to the right.

    Raises:
        ValueError: if ``data`` is not longer than ``block_size``.
    """
    if len(data) <= block_size:
        raise ValueError(
            f"need more than {block_size} training tokens, got {len(data)}"
        )
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    return x.to(device), y.to(device)


class Head(nn.Module):
    """One head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask: position t may only attend to positions <= t.
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        _, T, _ = x.shape
        k = self.key(x)
        q = self.query(x)
        # Scale by the head dimension (the size of q and k), not by the
        # embedding width, so the logits keep unit variance.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        return wei @ self.value(x)


class MultiHeadAttention(nn.Module):
    """Several attention heads run in parallel, concatenated and projected."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(out))


class FeedForward(nn.Module):
    """Position-wise two-layer MLP with a 4x hidden expansion."""

    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    """Transformer block: pre-norm self-attention then pre-norm feed-forward."""

    def __init__(self):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward()
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


class ChatGPT(nn.Module):
    """Character-level GPT-style language model.

    Args:
        vocab_size: number of distinct tokens (characters) in the vocabulary.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block() for _ in range(n_layer)])
        self.ln = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Return ``(logits, loss)`` for a (B, T) batch of token ids.

        Without ``targets`` the logits are (B, T, vocab) and the loss is None.
        With ``targets`` the logits are flattened to (B*T, vocab) and the
        loss is the mean cross-entropy.
        """
        _, T = idx.shape
        tok_emb = self.token_embedding(idx)
        # Build positions on the input's device so the model works wherever
        # it has been moved, independent of the global ``device``.
        pos_emb = self.position_embedding(torch.arange(T, device=idx.device))
        x = self.blocks(tok_emb + pos_emb)
        logits = self.head(self.ln(x))
        if targets is None:
            return logits, None
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Append ``max_new_tokens`` sampled tokens to the (B, T) context ``idx``.

        Raises:
            ValueError: if the context is empty (there is nothing to condition on).
        """
        if idx.shape[1] == 0:
            raise ValueError("generate() needs at least one context token")
        for _ in range(max_new_tokens):
            # The position table only covers block_size positions, so older
            # conversation history beyond that window is dropped here.
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            probs = F.softmax(logits[:, -1, :], dim=-1)
            idx_next = torch.multinomial(probs, 1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


def generate_reply(model, context_text, stoi, itos, max_new_tokens):
    """Sample a continuation of ``context_text`` and return only the new text."""
    ids = encode(context_text, stoi)
    if not ids:
        return ""
    model_device = next(model.parameters()).device
    context = torch.tensor([ids], dtype=torch.long, device=model_device)
    output = model.generate(context, max_new_tokens)
    # Slice by token count: characters dropped by encode() make the decoded
    # context shorter than context_text, so character offsets are unreliable.
    return decode(output[0, len(ids):].tolist(), itos)


def main():
    """Train on chat_data.txt, save the weights, then chat with memory."""
    # load dataset
    with open("chat_data.txt", "r", encoding="utf-8") as f:
        text = f.read()
    stoi, itos = build_vocab(text)
    data = torch.tensor(encode(text, stoi), dtype=torch.long)
    # Train on the first 90%; the remaining 10% is held out.
    train_data = data[:int(0.9 * len(data))]

    model = ChatGPT(len(stoi)).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    print("Training...")
    for step in range(max_iters):
        xb, yb = get_batch(train_data)
        _, loss = model(xb, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if step % 500 == 0:
            print(step, loss.item())

    torch.save(model.state_dict(), "chat_model.pth")
    print("Training finished")

    conversation_history = ""
    model.eval()
    while True:
        try:
            user = input("You: ")
        except (EOFError, KeyboardInterrupt):
            break
        conversation_history += "User: " + user + "\nBot: "
        response = generate_reply(model, conversation_history, stoi, itos, 100)
        # Keep only the bot's own turn; anything after a generated "User:"
        # is a hallucinated user message and must not enter the history.
        response = response.split("User:", 1)[0].strip()
        print("Bot:", response)
        # End the turn with a newline so the next "User: " starts a new line,
        # matching the layout of the training data.
        conversation_history += response + "\n"


if __name__ == "__main__":
    main()

# 4️⃣ Run the Model
#
#     python chatgpt_memory.py
#
# Example:
#
#     You: hello
#     Bot: hello! how can i help you?
#     You: what is ai
#     Bot: artificial intelligence means machines that learn
#
# The conversation history is stored in memory, so the model sees previous
# messages.
#
# 5️⃣ Project Structure
#
#     mini-chatgpt/
#     │
#     ├── chatgpt_memory.py
#     ├── chat_data.txt
#     └── chat_model.pth
#
# 6️⃣ How Conversation Memory Works
#
#     User message
#      ↓
#     Append to conversation_history
#      ↓
#     Feed entire conversation to model
#      ↓
#     Generate next tokens
#      ↓
#     Bot response
#
# This is the basic concept used by real chat systems.
#
# =====================================================
# =====================================================
#
# Here’s a step-by-step guide to train a ChatGPT-style model on your own
# documents using a mini GPT approach. I’ll break it into digestible steps, so
# you can follow even on a personal computer.
#
# ⚠️ This is a small-scale model suitable for learning and experimenting. Real
# ChatGPT training uses billions of parameters and huge datasets.
#
# 1️⃣ Prepare Your Documents
#
# Collect the text files you want the model to learn from. Examples:
#
#     documents/
#     │
#     ├── doc1.txt
#     ├── doc2.txt
#     └── doc3.txt
#
# Combine all files into one training text:

import os


def combine_documents(folder="documents", output_path="training_text.txt"):
    """Concatenate every ``.txt`` file in ``folder`` into ``output_path``.

    Files are read in sorted name order so the result is reproducible
    (``os.listdir`` order is arbitrary); each file is followed by a newline.

    Returns:
        The number of ``.txt`` files that were combined.
    """
    names = sorted(name for name in os.listdir(folder) if name.endswith(".txt"))
    parts = []
    for name in names:
        with open(os.path.join(folder, name), "r", encoding="utf-8") as f:
            parts.append(f.read() + "\n")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    return len(names)


if __name__ == "__main__":
    combine_documents()
    print("Combined text saved to training_text.txt")

# This will create training_text.txt, which the model will train on.
# 2️⃣ Tokenization
#
# The model reads text as tokens (characters or subwords). For simplicity,
# we’ll use character-level tokenization first.


def build_vocab(text):
    """Build character-level lookup tables from the training text.

    Returns:
        A ``(stoi, itos)`` pair: char -> id and id -> char dictionaries.
    """
    chars = sorted(set(text))
    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = dict(enumerate(chars))
    return stoi, itos


def encode(s, stoi):
    """Convert a string to token ids, skipping characters not in the vocabulary.

    Skipping (instead of indexing blindly) keeps arbitrary user input from
    raising ``KeyError``.
    """
    return [stoi[c] for c in s if c in stoi]


def decode(ids, itos):
    """Convert a sequence of token ids back to a string."""
    return "".join(itos[i] for i in ids)


# 3️⃣ Mini GPT Architecture
#
# We’ll reuse a GPT-style Transformer similar to previous examples:
#
#   - Embeddings: token + position
#   - Transformer blocks: self-attention + feedforward
#   - Next-token prediction: output layer

import torch
import torch.nn as nn
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"

# hyperparameters
batch_size = 32
block_size = 128
n_embd = 128
n_head = 4
n_layer = 4
dropout = 0.2
learning_rate = 3e-4
max_iters = 2000


def get_batch(data):
    """Sample a random batch of (input, target) windows from a 1-D token tensor.

    Targets are the inputs shifted one position to the right.

    Raises:
        ValueError: if ``data`` is not longer than ``block_size``.
    """
    if len(data) <= block_size:
        raise ValueError(
            f"need more than {block_size} training tokens, got {len(data)}"
        )
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    return x.to(device), y.to(device)


# Mini GPT model
class GPT(nn.Module):
    """Character-level GPT built from ``nn.TransformerEncoderLayer`` blocks.

    Args:
        vocab_size: number of distinct tokens (characters) in the vocabulary.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        # A ModuleList (not nn.Sequential) so the causal mask can be passed
        # to every layer; parameter names ("blocks.<i>. ...") are unchanged.
        # batch_first=True because inputs are (batch, time, channels); the
        # default layout would make attention run across the batch instead.
        self.blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=n_embd,
                nhead=n_head,
                dim_feedforward=4 * n_embd,
                dropout=dropout,
                batch_first=True,
            )
            for _ in range(n_layer)
        ])
        self.ln = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Return ``(logits, loss)`` for a (B, T) batch of token ids.

        ``logits`` is (B, T, vocab); ``loss`` is the mean cross-entropy
        against ``targets``, or None when no targets are given.
        """
        _, T = idx.shape
        x = self.token_emb(idx) + self.pos_emb(torch.arange(T, device=idx.device))
        # Additive causal mask: -inf above the diagonal stops a position from
        # attending to later ones. Without it the model can read the very
        # tokens it is trained to predict and learns nothing useful for
        # generation.
        causal_mask = torch.triu(
            torch.full((T, T), float("-inf"), device=idx.device), diagonal=1
        )
        for block in self.blocks:
            x = block(x, src_mask=causal_mask)
        logits = self.head(self.ln(x))
        if targets is None:
            return logits, None
        B, T, C = logits.shape
        return logits, F.cross_entropy(logits.view(B * T, C), targets.view(B * T))

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Append ``max_new_tokens`` sampled tokens to the (B, T) context ``idx``.

        Raises:
            ValueError: if the context is empty (there is nothing to condition on).
        """
        if idx.shape[1] == 0:
            raise ValueError("generate() needs at least one context token")
        for _ in range(max_new_tokens):
            # The position table only covers block_size positions.
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            probs = F.softmax(logits[:, -1, :], dim=-1)
            next_idx = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_idx), dim=1)
        return idx


# 4️⃣ Training Loop


def train(model, train_data):
    """Run ``max_iters`` AdamW steps on ``train_data`` and save the weights."""
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    for step in range(max_iters):
        xb, yb = get_batch(train_data)
        _, loss = model(xb, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if step % 200 == 0:
            print("Step", step, "Loss", loss.item())
    # Save model
    torch.save(model.state_dict(), "chat_docs_model.pth")
    print("Training finished and model saved.")


# 5️⃣ Generating Text (Chat With Your Documents)


def generate_reply(model, context_text, stoi, itos, max_new_tokens):
    """Sample a continuation of ``context_text`` and return only the new text."""
    ids = encode(context_text, stoi)
    if not ids:
        return ""
    model_device = next(model.parameters()).device
    context = torch.tensor([ids], dtype=torch.long, device=model_device)
    output = model.generate(context, max_new_tokens)
    # only new tokens -- sliced by token count, because characters dropped
    # by encode() make character offsets into the decoded text unreliable.
    return decode(output[0, len(ids):].tolist(), itos)


def chat(model, stoi, itos):
    """Interactive loop that feeds the whole conversation back as context."""
    model.eval()
    conversation = ""
    while True:
        try:
            prompt = input("You: ")
        except (EOFError, KeyboardInterrupt):
            break
        conversation += prompt + "\n"
        response = generate_reply(model, conversation, stoi, itos, 200)
        print("Bot:", response)
        conversation += response + "\n"


def main():
    """Tokenize training_text.txt, train the model, then start chatting."""
    with open("training_text.txt", "r", encoding="utf-8") as f:
        text = f.read()
    stoi, itos = build_vocab(text)
    data = torch.tensor(encode(text, stoi), dtype=torch.long)
    # Train on the first 90%; the remaining 10% is held out.
    train_data = data[:int(0.9 * len(data))]

    model = GPT(len(stoi)).to(device)
    train(model, train_data)
    chat(model, stoi, itos)


if __name__ == "__main__":
    main()

# The model now uses the previous conversation as context, so it remembers
# what you typed. It can answer based on your own documents.
#
# 6️⃣ Tips for Better Results
#
#   - Clean your documents: remove extra whitespace, headers, and irrelevant
#     data.
#   - Use more data: the more text you feed, the better it learns.
#   - Increase block size: lets the model remember longer conversations.
#   - Increase layers/embedding size: improves accuracy, but requires more
#     memory.
#   - GPU is recommended: training on CPU is slow.
#
# 7️⃣ Project Structure Example
#
#     my_chat_docs/
#     │
#     ├── documents/
#     │   ├── doc1.txt
#     │   ├── doc2.txt
#     │   └── doc3.txt
#     ├── training_text.txt
#     ├── train_chat_docs.py
#     └── chat_docs_model.pth
#
# train_chat_docs.py contains the code above for training and generating.
#
# ✅ With this setup, you now have a mini ChatGPT trained on your own documents
# that can chat and remember context.
# ================================================================
# ================================================================
#
# Perfect! Here’s a ready-to-run Python script that does everything in one
# file:
#
#   - Reads all .txt files from a folder (documents/)
#   - Combines them into training text
#   - Trains a mini GPT model
#   - Allows you to chat with memory of previous conversation
#
# 📄 Full Script: train_and_chat.py

import os

import torch
import torch.nn as nn
import torch.nn.functional as F

# -----------------------------
# CONFIG
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
documents_folder = "documents"
batch_size = 32
block_size = 128
n_embd = 128
n_head = 4
n_layer = 4
dropout = 0.2
learning_rate = 3e-4
max_iters = 2000
save_path = "chat_docs_model.pth"


# -----------------------------
# 1️⃣ Combine all documents
# -----------------------------
def combine_documents(folder, output_path="training_text.txt"):
    """Concatenate every ``.txt`` file in ``folder`` and write it to ``output_path``.

    Files are read in sorted name order so the result is reproducible
    (``os.listdir`` order is arbitrary); each file is followed by a newline.

    Returns:
        ``(text, count)``: the combined text and the number of ``.txt`` files
        that went into it.
    """
    names = sorted(name for name in os.listdir(folder) if name.endswith(".txt"))
    parts = []
    for name in names:
        with open(os.path.join(folder, name), "r", encoding="utf-8") as f:
            parts.append(f.read() + "\n")
    text = "".join(parts)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(text)
    return text, len(names)


# -----------------------------
# 2️⃣ Tokenization
# -----------------------------
def build_vocab(text):
    """Build character-level lookup tables from the training text.

    Returns:
        A ``(stoi, itos)`` pair: char -> id and id -> char dictionaries.
    """
    chars = sorted(set(text))
    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = dict(enumerate(chars))
    return stoi, itos


def encode(s, stoi):
    """Convert a string to token ids, skipping characters not in the vocabulary.

    Skipping (instead of indexing blindly) keeps arbitrary user input from
    raising ``KeyError``.
    """
    return [stoi[c] for c in s if c in stoi]


def decode(ids, itos):
    """Convert a sequence of token ids back to a string."""
    return "".join(itos[i] for i in ids)


def get_batch(data):
    """Sample a random batch of (input, target) windows from a 1-D token tensor.

    Targets are the inputs shifted one position to the right.

    Raises:
        ValueError: if ``data`` is not longer than ``block_size``.
    """
    if len(data) <= block_size:
        raise ValueError(
            f"need more than {block_size} training tokens, got {len(data)}"
        )
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    return x.to(device), y.to(device)


# -----------------------------
# 3️⃣ Mini GPT Model
# -----------------------------
class GPT(nn.Module):
    """Character-level GPT built from ``nn.TransformerEncoderLayer`` blocks.

    Args:
        vocab_size: number of distinct tokens (characters) in the vocabulary.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        # A ModuleList (not nn.Sequential) so the causal mask can be passed
        # to every layer; parameter names ("blocks.<i>. ...") are unchanged.
        # batch_first=True because inputs are (batch, time, channels); the
        # default layout would make attention run across the batch instead.
        self.blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=n_embd,
                nhead=n_head,
                dim_feedforward=4 * n_embd,
                dropout=dropout,
                batch_first=True,
            )
            for _ in range(n_layer)
        ])
        self.ln = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Return ``(logits, loss)`` for a (B, T) batch of token ids.

        ``logits`` is (B, T, vocab); ``loss`` is the mean cross-entropy
        against ``targets``, or None when no targets are given.
        """
        _, T = idx.shape
        x = self.token_emb(idx) + self.pos_emb(torch.arange(T, device=idx.device))
        # Additive causal mask: -inf above the diagonal stops a position from
        # attending to later ones. Without it the model can read the very
        # tokens it is trained to predict and learns nothing useful for
        # generation.
        causal_mask = torch.triu(
            torch.full((T, T), float("-inf"), device=idx.device), diagonal=1
        )
        for block in self.blocks:
            x = block(x, src_mask=causal_mask)
        logits = self.head(self.ln(x))
        if targets is None:
            return logits, None
        B, T, C = logits.shape
        return logits, F.cross_entropy(logits.view(B * T, C), targets.view(B * T))

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Append ``max_new_tokens`` sampled tokens to the (B, T) context ``idx``.

        Raises:
            ValueError: if the context is empty (there is nothing to condition on).
        """
        if idx.shape[1] == 0:
            raise ValueError("generate() needs at least one context token")
        for _ in range(max_new_tokens):
            # The position table only covers block_size positions.
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            probs = F.softmax(logits[:, -1, :], dim=-1)
            next_idx = torch.multinomial(probs, 1)
            idx = torch.cat((idx, next_idx), dim=1)
        return idx


# -----------------------------
# 4️⃣ Training Loop
# -----------------------------
def train(model, train_data):
    """Run ``max_iters`` AdamW steps on ``train_data`` and save the weights."""
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    print("Starting training...")
    for step in range(max_iters):
        xb, yb = get_batch(train_data)
        _, loss = model(xb, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if step % 200 == 0:
            print(f"Step {step}, Loss {loss.item():.4f}")
    torch.save(model.state_dict(), save_path)
    print(f"Training finished! Model saved to {save_path}")


# -----------------------------
# 5️⃣ Chat with memory
# -----------------------------
def generate_reply(model, context_text, stoi, itos, max_new_tokens):
    """Sample a continuation of ``context_text`` and return only the new text."""
    ids = encode(context_text, stoi)
    if not ids:
        return ""
    model_device = next(model.parameters()).device
    context = torch.tensor([ids], dtype=torch.long, device=model_device)
    output = model.generate(context, max_new_tokens)
    # only new tokens -- sliced by token count, because characters dropped
    # by encode() make character offsets into the decoded text unreliable.
    return decode(output[0, len(ids):].tolist(), itos)


def chat(model, stoi, itos):
    """Interactive loop that keeps the conversation history as context."""
    model.eval()
    conversation_history = ""
    print("\nYou can now chat with your model! Type 'exit' to quit.")
    while True:
        try:
            prompt = input("You: ")
        except (EOFError, KeyboardInterrupt):
            break
        if prompt.lower() == "exit":
            break
        conversation_history += "User: " + prompt + "\nBot: "
        response = generate_reply(model, conversation_history, stoi, itos, 200)
        # Keep only the bot's own turn; a generated "User:" line is a
        # hallucinated user message and must not enter the history.
        response = response.split("\nUser:", 1)[0].strip()
        print("Bot:", response)
        conversation_history += response + "\n"


def main():
    """Combine the documents, train the model, then start the chat loop."""
    all_text, n_files = combine_documents(documents_folder)
    # Report the number of .txt files actually combined, not every directory
    # entry in the folder.
    print(f"Combined {n_files} files into training_text.txt")

    stoi, itos = build_vocab(all_text)
    data = torch.tensor(encode(all_text, stoi), dtype=torch.long)
    # Train on the first 90%; the remaining 10% is held out.
    train_data = data[:int(0.9 * len(data))]

    model = GPT(len(stoi)).to(device)
    train(model, train_data)
    chat(model, stoi, itos)


if __name__ == "__main__":
    main()

# ✅ How to Use
#
#   1. Create a folder called documents in the same directory.
#   2. Put all your .txt documents there.
#   3. Run:
#
#          python train_and_chat.py
#
#   4. Wait for training to finish (~a few minutes on CPU for small data;
#      faster on GPU).
#   5. Chat with your model:
#
#          You: hello
#          Bot: hello! how can I help you?
#          You: what is ai
#          Bot: artificial intelligence is ...
#
#   6. Type exit to quit.
#
# 🔹 Features
#
#   - Reads all your documents → learns your text style.
#   - Remembers conversation history → behaves more like ChatGPT.
#   - Single Python file → easy to run.
#   - Can be retrained or continued → just keep appending documents.