Below is a compact ~200-line “mini ChatGPT” style model you can run locally.
It uses a Transformer decoder (GPT-style) built with PyTorch.

⚠️ This is still very small compared to real models like
GPT-4 or LLaMA, but it can learn conversation patterns and reply to prompts.

1️⃣ Install Requirements
pip install torch
2️⃣ Create Training Data

Create chat_data.txt

Example conversation dataset:

User: hello
Bot: hello! how can i help you?

User: what is ai
Bot: ai means artificial intelligence.

User: who created python
Bot: python was created by guido van rossum.

User: what is machine learning
Bot: machine learning allows computers to learn from data.

The model will learn conversation patterns.

3️⃣ Mini ChatGPT (~200 lines)

Create mini_chatgpt.py

import torch
import torch.nn as nn
import torch.nn.functional as F

# Prefer the GPU when PyTorch can see one; fall back to the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# hyperparameters
batch_size = 32        # sequences drawn per training batch
block_size = 128       # tokens per sequence; also the attention context length
max_iters = 3000       # number of optimisation steps
eval_interval = 300    # the loss is printed every this many steps
learning_rate = 3e-4   # AdamW learning rate
n_embd = 128           # embedding width
n_head = 4             # attention heads per block
n_layer = 4            # number of Transformer blocks
dropout = 0.2          # dropout probability

# load dataset: the whole corpus is read into a single string
with open("chat_data.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Character-level vocabulary: each distinct character is one token.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids.
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Return the list of token ids for the characters of ``s``."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the token ids in ``l``."""
    return "".join(itos[i] for i in l)


# The full corpus as a 1-D tensor of token ids.
data = torch.tensor(encode(text), dtype=torch.long)

# The first 90% of the tokens train the model; the rest is held out.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Sample a random batch of (input, target) windows from one split.

    Args:
        split: ``"train"`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A tuple ``(x, y)`` of ``(batch_size, block_size)`` tensors moved to
        ``device``. ``y`` holds the same windows as ``x`` shifted one token
        to the right, i.e. the next-token targets.

    Raises:
        ValueError: if the chosen split has fewer than ``block_size + 1``
            tokens, so not even one window plus its target fits.
    """
    # Use a distinct local name so the module-level `data` is not shadowed.
    source = train_data if split == "train" else val_data

    # Each window needs block_size inputs plus one extra token for the target.
    n_starts = len(source) - block_size
    if n_starts <= 0:
        raise ValueError(
            f"split {split!r} has only {len(source)} tokens; "
            f"at least block_size + 1 = {block_size + 1} are required"
        )

    ix = torch.randint(n_starts, (batch_size,))
    x = torch.stack([source[i:i + block_size] for i in ix])
    y = torch.stack([source[i + 1:i + block_size + 1] for i in ix])
    return x.to(device), y.to(device)

class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: output width of the key, query and value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix used as the causal mask. Registered as a
        # buffer so it moves with the module but is not a trainable parameter.
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, n_embd); return (B, T, head_size).

        ``T`` must not exceed ``block_size`` because the mask is sliced from
        the ``block_size`` x ``block_size`` buffer.
        """
        _, T, _ = x.shape
        k = self.key(x)
        q = self.query(x)

        # Scaled dot-product attention divides by sqrt of the key width
        # (head_size), not by sqrt of the input embedding width.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
        # Positions above the diagonal are in the future: give them zero weight.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        v = self.value(x)
        return wei @ v

class MultiHeadAttention(nn.Module):
    """Several attention heads run on the same input, then mixed linearly."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate every head's output on the last axis, project, drop out."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

class FeedForward(nn.Module):
    """Per-position MLP: widen to 4 * n_embd, ReLU, narrow back, dropout."""

    def __init__(self):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``."""
        out = self.net(x)
        return out

class Block(nn.Module):
    """Transformer block: attention then feed-forward, each behind a
    LayerNorm and wrapped in a residual connection."""

    def __init__(self):
        super().__init__()
        # The embedding width is split evenly across the heads.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward()
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

class ChatGPT(nn.Module):
    """Character-level GPT-style language model.

    Token and position embeddings are summed, passed through ``n_layer``
    Transformer blocks and a final LayerNorm, then projected to one logit
    per vocabulary entry.
    """

    def __init__(self):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        self.blocks = nn.Sequential(*[Block() for _ in range(n_layer)])

        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: ``(B, T)`` tensor of token ids with ``T <= block_size``.
            targets: optional ``(B, T)`` tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without ``targets`` the logits have shape
            ``(B, T, vocab_size)`` and ``loss`` is ``None``. With ``targets``
            the logits are flattened to ``(B * T, vocab_size)`` and ``loss``
            is their cross-entropy against the flattened targets.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)
        # Build the positions on the input's own device, so the model does
        # not depend on the module-level `device` matching where it lives.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)

        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)

        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph; saves memory
    def generate(self, idx, max_new_tokens):
        """Append ``max_new_tokens`` sampled tokens to ``idx``.

        Args:
            idx: ``(B, T)`` tensor of token ids used as the prompt.
            max_new_tokens: number of tokens to sample.

        Returns:
            ``(B, T + max_new_tokens)`` tensor: the prompt followed by the
            sampled continuation.
        """
        for _ in range(max_new_tokens):
            # The position table has block_size rows, so crop the context.
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            # Only the distribution after the last position is needed.
            probs = F.softmax(logits[:, -1, :], dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

model = ChatGPT().to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop. The counter is `step` rather than `iter` so the builtin
# iter() is not shadowed for the rest of the script.
for step in range(max_iters):

    xb, yb = get_batch("train")

    logits, loss = model(xb, yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if step % eval_interval == 0:
        print(step, loss.item())

torch.save(model.state_dict(), "chatgpt_model.pth")

print("training finished")

# Switch dropout off for inference.
model.eval()

while True:

    try:
        prompt = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the chat without a traceback.
        print()
        break

    # Frame the message like a training example ("User: ...\nBot: ") so the
    # model continues with a bot turn instead of extending the user's text.
    framed = "User: " + prompt + "\nBot: "
    # encode() raises KeyError for characters that never occurred in the
    # corpus; drop them rather than crash on arbitrary user input.
    framed = "".join(ch for ch in framed if ch in stoi)
    if not framed:
        print("Bot:")
        continue

    context = torch.tensor([encode(framed)], dtype=torch.long).to(device)

    with torch.no_grad():
        output = model.generate(context, 100)

    # Decode only the newly sampled tokens so the prompt is not echoed back.
    reply = decode(output[0, context.shape[1]:].tolist())

    # Keep the bot's own line; text after the newline is an invented next turn.
    print("Bot:", reply.split("\n", 1)[0])
4️⃣ Run Training
python mini_chatgpt.py

Training log example:

0 3.2
300 2.5
600 2.1
5️⃣ Chat With Your Model

After training finishes:

You: hello
Bot: hello! how can i help you?

You: what is ai
Bot: ai means artificial intelligence
6️⃣ Project Structure
chatgpt-mini/
│
├── mini_chatgpt.py
├── chat_data.txt
└── chatgpt_model.pth
7️⃣ What This Mini ChatGPT Teaches

Architecture:

Text
 ↓
Tokenization
 ↓
Embeddings
 ↓
Transformer Blocks
 ↓
Next Token Prediction
 ↓
Chat Response

Key ideas used by real LLMs.


















=====================================


Below is a simple ~300-line “mini ChatGPT” that can remember conversation context while chatting.
It uses a GPT-style Transformer built with PyTorch.

⚠️ This is still tiny compared to real systems like GPT-4 or ChatGPT, but it demonstrates the core idea of conversational memory.

1️⃣ Install Dependencies
pip install torch
2️⃣ Create Training Dataset

Create chat_data.txt

Example:

User: hello
Bot: hello! how can i help you?

User: what is ai
Bot: artificial intelligence means machines that can learn.

User: who created python
Bot: python was created by guido van rossum.

User: what is machine learning
Bot: machine learning is a way for computers to learn from data.

The model learns conversation patterns.

3️⃣ Mini ChatGPT With Memory

Create chatgpt_memory.py

import torch
import torch.nn as nn
import torch.nn.functional as F

# Default to the CPU and switch to the GPU only when PyTorch reports one.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# hyperparameters
batch_size = 32        # sequences per training batch
block_size = 128       # tokens per sequence / attention context length
max_iters = 3000       # optimisation steps
learning_rate = 3e-4   # AdamW learning rate
n_embd = 128           # embedding width
n_head = 4             # attention heads per block
n_layer = 4            # Transformer blocks
dropout = 0.2          # dropout probability

# load dataset
with open("chat_data.txt", "r", encoding="utf-8") as f:
    text = f.read()

# One token per distinct character, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# character -> id and id -> character
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))

def encode(s):
    """Translate each character of ``s`` into its integer id via ``stoi``."""
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids

def decode(l):
    """Join the characters that ``itos`` maps the ids in ``l`` to."""
    return "".join(itos[token] for token in l)

# Encode the whole corpus once as a 1-D tensor of token ids.
data = torch.tensor(encode(text), dtype=torch.long)

# The first 90% of the tokens are for training, the remainder is held out.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Draw a random batch of training windows and their next-token targets.

    Args:
        split: ``"train"`` reads from ``train_data``; anything else reads
            from ``val_data``.

    Returns:
        ``(x, y)``, both ``(batch_size, block_size)`` tensors on ``device``;
        ``y`` is ``x`` shifted one token to the right.

    Raises:
        ValueError: if the selected split holds fewer than ``block_size + 1``
            tokens.
    """
    source = train_data if split == "train" else val_data

    # A window of block_size inputs also needs one following token as target.
    n_starts = len(source) - block_size
    if n_starts <= 0:
        raise ValueError(
            f"split {split!r} has only {len(source)} tokens; "
            f"at least block_size + 1 = {block_size + 1} are required"
        )

    starts = torch.randint(n_starts, (batch_size,))

    x = torch.stack([source[i:i + block_size] for i in starts])
    y = torch.stack([source[i + 1:i + block_size + 1] for i in starts])

    return x.to(device), y.to(device)

class Head(nn.Module):
    """A single causal self-attention head.

    Args:
        head_size: width of the key, query and value projections.
    """

    def __init__(self, head_size):
        super().__init__()

        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # Causal mask kept as a buffer: it follows the module across devices
        # but is not updated by the optimizer.
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape (B, T, n_embd) to (B, T, head_size).

        ``T`` may not exceed ``block_size``, the side of the stored mask.
        """
        _, T, _ = x.shape

        k = self.key(x)
        q = self.query(x)

        # Scale by 1/sqrt(head_size) -- the width of the keys being dotted --
        # rather than by the width of the input embedding.
        head_size = k.shape[-1]
        wei = q @ k.transpose(-2, -1) * head_size ** -0.5

        # Hide future positions before normalising.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        v = self.value(x)

        return wei @ v

class MultiHeadAttention(nn.Module):
    """Parallel attention heads whose outputs are concatenated and projected."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList(Head(head_size) for _ in range(num_heads))
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x``, join on the last axis, project, drop out."""
        head_outputs = tuple(head(x) for head in self.heads)
        return self.dropout(self.proj(torch.cat(head_outputs, dim=-1)))

class FeedForward(nn.Module):
    """Two-layer MLP applied at every position, followed by dropout."""

    def __init__(self):
        super().__init__()
        inner = 4 * n_embd  # hidden width is four times the embedding width
        expand = nn.Linear(n_embd, inner)
        contract = nn.Linear(inner, n_embd)
        self.net = nn.Sequential(expand, nn.ReLU(), contract, nn.Dropout(dropout))

    def forward(self, x):
        """Return the MLP applied to ``x``."""
        return self.net(x)

class Block(nn.Module):
    """One Transformer layer: normalised attention and normalised MLP, each
    added back onto its input."""

    def __init__(self):
        super().__init__()
        per_head = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head)
        self.ffwd = FeedForward()
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply the attention residual, then the feed-forward residual."""
        after_attention = x + self.sa(self.ln1(x))
        return after_attention + self.ffwd(self.ln2(after_attention))

class ChatGPT(nn.Module):
    """Character-level GPT-style decoder.

    Sums token and position embeddings, applies ``n_layer`` Transformer
    blocks and a LayerNorm, and projects to ``vocab_size`` logits.
    """

    def __init__(self):
        super().__init__()

        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)

        self.blocks = nn.Sequential(*[Block() for _ in range(n_layer)])

        self.ln = nn.LayerNorm(n_embd)

        self.head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Return ``(logits, loss)`` for a batch of token ids.

        Args:
            idx: ``(B, T)`` tensor of token ids, ``T <= block_size``.
            targets: optional ``(B, T)`` tensor of next-token ids.

        Returns:
            When ``targets`` is ``None``: logits of shape
            ``(B, T, vocab_size)`` and ``None``. Otherwise: logits flattened
            to ``(B * T, vocab_size)`` and the scalar cross-entropy loss.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding(idx)

        # Positions live on the same device as the input, independent of the
        # module-level `device` setting.
        pos_emb = self.position_embedding(torch.arange(T, device=idx.device))

        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln(x)

        logits = self.head(x)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))

        return logits, loss

    @torch.no_grad()  # no gradients are needed while sampling
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens, one at a time, after ``idx``.

        Args:
            idx: ``(B, T)`` prompt of token ids.
            max_new_tokens: how many tokens to append.

        Returns:
            ``(B, T + max_new_tokens)`` tensor of prompt plus continuation.
        """
        for _ in range(max_new_tokens):
            # Only the last block_size tokens fit the position table.
            idx_cond = idx[:, -block_size:]

            logits, _ = self(idx_cond)

            # Sample from the distribution predicted after the final token.
            probs = F.softmax(logits[:, -1, :], dim=-1)
            idx_next = torch.multinomial(probs, 1)

            idx = torch.cat((idx, idx_next), dim=1)

        return idx

model = ChatGPT().to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

print("Training...")

# `step` instead of `iter`, so the builtin iter() stays usable afterwards.
for step in range(max_iters):

    xb, yb = get_batch("train")

    logits, loss = model(xb, yb)

    optimizer.zero_grad()

    loss.backward()

    optimizer.step()

    if step % 500 == 0:
        print(step, loss.item())

torch.save(model.state_dict(), "chat_model.pth")

print("Training finished")

# Running transcript of the chat, in the same "User:/Bot:" layout as the
# training data.
conversation_history = ""

# Switch dropout off for inference.
model.eval()

while True:

    try:
        user = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the chat without a traceback.
        print()
        break

    conversation_history += "User: " + user + "\nBot: "

    # encode() raises KeyError for characters absent from the corpus, so
    # drop them. generate() only attends to the last block_size tokens, so
    # encoding just that tail gives the same predictions with less work.
    window = "".join(ch for ch in conversation_history if ch in stoi)[-block_size:]
    if not window:
        print("Bot:")
        continue

    context = torch.tensor([encode(window)], dtype=torch.long).to(device)

    with torch.no_grad():
        output = model.generate(context, 100)

    # Decode only the newly sampled tokens.
    generated = decode(output[0, context.shape[1]:].tolist())

    # Keep the bot's own line. Whatever follows the first newline is a
    # made-up next turn and must not be stored as if it had been said.
    response = generated.split("\n", 1)[0]

    print("Bot:", response)

    # End the bot turn with a newline so the next "User: " starts a new line.
    conversation_history += response + "\n"
4️⃣ Run the Model
python chatgpt_memory.py

Example:

You: hello
Bot: hello! how can i help you?

You: what is ai
Bot: artificial intelligence means machines that learn

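The conversation history is stored in memory, so the model sees previous messages — but only the most recent block_size (128) characters, because generate() crops its input to that window.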

5️⃣ Project Structure
mini-chatgpt/
│
├── chatgpt_memory.py
├── chat_data.txt
└── chat_model.pth
6️⃣ How Conversation Memory Works
User message
     ↓
Append to conversation_history
     ↓
Feed entire conversation to model
     ↓
Generate next tokens
     ↓
Bot response

This is the basic concept used by real chat systems.






















=====================================================
=====================================================


Here’s a step-by-step guide to train a ChatGPT-style model on your own documents using a mini GPT approach. I’ll break it into digestible steps, so you can follow even on a personal computer.

⚠️ This is a small-scale model suitable for learning and experimenting. Real ChatGPT training uses billions of parameters and huge datasets.

1️⃣ Prepare Your Documents

Collect the text files you want the model to learn from. Examples:

documents/
│
├── doc1.txt
├── doc2.txt
└── doc3.txt

Combine all files into one training text:

import os

folder = "documents"

# Sort the listing: os.listdir() returns entries in arbitrary order, and a
# stable order keeps the combined corpus identical from run to run.
parts = []
for filename in sorted(os.listdir(folder)):
    if filename.endswith(".txt"):
        with open(os.path.join(folder, filename), "r", encoding="utf-8") as f:
            # Newline after each file so adjacent documents do not run together.
            parts.append(f.read() + "\n")

# Join once instead of growing a string inside the loop.
all_text = "".join(parts)

with open("training_text.txt", "w", encoding="utf-8") as f:
    f.write(all_text)

print("Combined text saved to training_text.txt")

This will create training_text.txt, which the model will train on.

2️⃣ Tokenization

The model reads text as tokens (characters or subwords).

For simplicity, we’ll use character-level tokenization first.

# Read back the combined corpus.
with open("training_text.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Character-level vocabulary in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# character -> id and id -> character
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))


def encode(s):
    """Return the token ids of the characters in ``s``."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the text spelled by the token ids in ``l``."""
    return "".join(itos[i] for i in l)


# The corpus as a plain list of token ids.
data = encode(text)
3️⃣ Mini GPT Architecture

We’ll reuse a GPT-style Transformer similar to previous examples:

Embeddings: token + position

Transformer blocks: self-attention + feedforward

Next-token prediction: output layer

import torch
import torch.nn as nn
import torch.nn.functional as F

# Use the GPU if PyTorch finds one, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# hyperparameters
batch_size = 32        # sequences per training batch
block_size = 128       # tokens per sequence / context length
n_embd = 128           # embedding width
n_head = 4             # attention heads per layer
n_layer = 4            # Transformer layers
dropout = 0.2          # dropout probability
learning_rate = 3e-4   # AdamW learning rate
max_iters = 2000       # optimisation steps

# Token ids as a 1-D long tensor, split 90% / 10% into train and validation.
data = torch.tensor(data, dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Sample ``batch_size`` random windows and their next-token targets.

    Args:
        split: ``"train"`` uses ``train_data``; any other value uses
            ``val_data``.

    Returns:
        ``(x, y)`` tensors of shape ``(batch_size, block_size)`` on
        ``device``, where ``y`` is ``x`` shifted right by one token.

    Raises:
        ValueError: if the split has fewer than ``block_size + 1`` tokens.
    """
    source = train_data if split == "train" else val_data
    # Every window needs block_size inputs plus one more token for the target.
    n_starts = len(source) - block_size
    if n_starts <= 0:
        raise ValueError(
            f"split {split!r} has only {len(source)} tokens; "
            f"at least block_size + 1 = {block_size + 1} are required"
        )
    ix = torch.randint(n_starts, (batch_size,))
    x = torch.stack([source[i:i + block_size] for i in ix])
    y = torch.stack([source[i + 1:i + block_size + 1] for i in ix])
    return x.to(device), y.to(device)

# Mini GPT model
class GPT(nn.Module):
    """Character-level GPT-style model built from PyTorch encoder layers.

    Each ``nn.TransformerEncoderLayer`` is run with a causal mask, which
    makes the stack behave as a decoder-only (autoregressive) Transformer.
    """

    def __init__(self):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        # batch_first=True because inputs are (batch, time, channels). With
        # the default (False) the layers treat dim 0 as time and attend
        # across the items of the batch instead of across positions.
        # A ModuleList (not Sequential) lets forward() hand the causal mask
        # to every layer; parameter names ("blocks.<i>...") stay the same.
        self.blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=n_embd,
                nhead=n_head,
                dim_feedforward=4 * n_embd,
                dropout=dropout,
                batch_first=True,
            )
            for _ in range(n_layer)
        ])
        self.ln = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Return ``(logits, loss)`` for a batch of token ids.

        Args:
            idx: ``(B, T)`` tensor of token ids with ``T <= block_size``.
            targets: optional ``(B, T)`` tensor of next-token ids.

        Returns:
            ``(logits, None)`` with logits of shape ``(B, T, vocab_size)``
            when ``targets`` is ``None``; otherwise the same logits and the
            scalar cross-entropy loss over all ``B * T`` positions.
        """
        B, T = idx.shape
        # Create positions and mask on the input's device, not the global one.
        positions = torch.arange(T, device=idx.device)
        x = self.token_emb(idx) + self.pos_emb(positions)

        # True above the diagonal means "may not attend": position t only
        # sees positions <= t. Without this the model reads the very tokens
        # it is trained to predict.
        causal_mask = torch.triu(
            torch.ones(T, T, dtype=torch.bool, device=idx.device), diagonal=1
        )
        for block in self.blocks:
            x = block(x, src_mask=causal_mask)

        x = self.ln(x)
        logits = self.head(x)
        if targets is None:
            return logits, None
        B, T, C = logits.shape
        return logits, F.cross_entropy(logits.view(B * T, C), targets.view(B * T))

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Append ``max_new_tokens`` sampled tokens to the prompt ``idx``.

        Args:
            idx: ``(B, T)`` tensor of prompt token ids.
            max_new_tokens: number of tokens to sample.

        Returns:
            ``(B, T + max_new_tokens)`` tensor of prompt plus continuation.
        """
        for _ in range(max_new_tokens):
            # The position table has block_size rows, so crop the context.
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            probs = F.softmax(logits[:, -1, :], dim=-1)
            next_idx = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_idx), dim=1)
        return idx

# Build the network, move it to the selected device, and let AdamW
# optimise all of its parameters.
model = GPT()
model = model.to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)
4️⃣ Training Loop
# The counter is called `step` rather than `iter` so the builtin iter()
# is not shadowed for the rest of the script.
for step in range(max_iters):
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Progress report: the loss of the current training batch.
    if step % 200 == 0:
        print("Step", step, "Loss", loss.item())

# Save model
torch.save(model.state_dict(), "chat_docs_model.pth")
print("Training finished and model saved.")
5️⃣ Generating Text (Chat With Your Documents)
# Switch dropout off for inference.
model.eval()
conversation = ""

while True:
    try:
        prompt = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the chat without a traceback.
        print()
        break
    conversation += prompt + "\n"

    # encode() raises KeyError for characters that never occurred in the
    # documents, so drop them. generate() only attends to the last
    # block_size tokens, so encoding that tail gives the same predictions.
    window = "".join(ch for ch in conversation if ch in stoi)[-block_size:]
    if not window:
        print("Bot:")
        continue

    context = torch.tensor([encode(window)], dtype=torch.long).to(device)
    with torch.no_grad():
        output = model.generate(context, 200)
    # only new tokens
    response = decode(output[0, context.shape[1]:].tolist())
    print("Bot:", response)
    conversation += response + "\n"

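The model now uses the previous conversation as context, so it remembers what you typed — up to the most recent block_size (128) characters, which is all that generate() feeds to the model.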

It can answer based on your own documents.

6️⃣ Tips for Better Results

Clean your documents: remove extra whitespace, headers, and irrelevant data.

Use more data: the more text you feed, the better it learns.

Increase block size: lets the model remember longer conversations.

Increase layers/embedding size: improves accuracy, but requires more memory.

GPU is recommended: training on CPU is slow.

7️⃣ Project Structure Example
my_chat_docs/
│
├── documents/
│   ├── doc1.txt
│   ├── doc2.txt
│   └── doc3.txt
├── training_text.txt
├── train_chat_docs.py
└── chat_docs_model.pth

train_chat_docs.py contains the code above for training and generating.

✅ With this setup, you now have a mini ChatGPT trained on your own documents that can chat and remember context.
















================================================================
================================================================



Perfect! Here’s a ready-to-run Python script that does everything in one file:

Reads all .txt files from a folder (documents/)

Combines them into training text

Trains a mini GPT model

Allows you to chat with memory of previous conversation

📄 Full Script: train_and_chat.py
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

# -----------------------------
# CONFIG
# -----------------------------
# Default to the CPU; switch to the GPU only when PyTorch reports one.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

documents_folder = "documents"       # folder scanned for .txt files
batch_size = 32                      # sequences per training batch
block_size = 128                     # tokens per sequence / context length
n_embd = 128                         # embedding width
n_head = 4                           # attention heads per layer
n_layer = 4                          # Transformer layers
dropout = 0.2                        # dropout probability
learning_rate = 3e-4                 # AdamW learning rate
max_iters = 2000                     # optimisation steps
save_path = "chat_docs_model.pth"    # where the trained weights are written

# -----------------------------
# 1️⃣ Combine all documents
# -----------------------------
# Select the .txt files once. Sorting matters because os.listdir() returns
# entries in arbitrary order; a stable order keeps the corpus reproducible.
txt_files = sorted(
    name for name in os.listdir(documents_folder) if name.endswith(".txt")
)
if not txt_files:
    # Fail here with a clear message instead of later, inside batch sampling.
    raise FileNotFoundError(f"No .txt files found in {documents_folder!r}")

parts = []
for filename in txt_files:
    with open(os.path.join(documents_folder, filename), "r", encoding="utf-8") as f:
        # Newline after each file so adjacent documents do not run together.
        parts.append(f.read() + "\n")

# Join once instead of growing a string inside the loop.
all_text = "".join(parts)

with open("training_text.txt", "w", encoding="utf-8") as f:
    f.write(all_text)

# Report the files actually combined, not every entry in the folder.
print(f"Combined {len(txt_files)} files into training_text.txt")

# -----------------------------
# 2️⃣ Tokenization
# -----------------------------
text = all_text

# Character-level vocabulary in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# character -> id and id -> character
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))


def encode(s):
    """Return the token ids of the characters in ``s``."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the text spelled by the token ids in ``l``."""
    return "".join(itos[i] for i in l)


# The corpus as a 1-D long tensor, split 90% / 10% into train and validation.
data = torch.tensor(encode(text), dtype=torch.long)

n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Sample a random batch of windows together with their targets.

    Args:
        split: ``"train"`` draws from ``train_data``; any other value draws
            from ``val_data``.

    Returns:
        ``(x, y)``: two ``(batch_size, block_size)`` tensors on ``device``;
        ``y`` is ``x`` shifted one token to the right.

    Raises:
        ValueError: if the split is shorter than ``block_size + 1`` tokens,
            which happens when the documents contain too little text.
    """
    source = train_data if split == "train" else val_data
    # A window of block_size inputs needs one more token for its last target.
    n_starts = len(source) - block_size
    if n_starts <= 0:
        raise ValueError(
            f"split {split!r} has only {len(source)} tokens; "
            f"at least block_size + 1 = {block_size + 1} are required"
        )
    ix = torch.randint(n_starts, (batch_size,))
    x = torch.stack([source[i:i + block_size] for i in ix])
    y = torch.stack([source[i + 1:i + block_size + 1] for i in ix])
    return x.to(device), y.to(device)

# -----------------------------
# 3️⃣ Mini GPT Model
# -----------------------------
class GPT(nn.Module):
    """Character-level GPT-style model built from PyTorch encoder layers.

    The ``nn.TransformerEncoderLayer`` stack is run with a causal mask, so
    each position only attends to itself and earlier positions.
    """

    def __init__(self):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        # batch_first=True: inputs are (batch, time, channels). The default
        # (False) would make the layers treat dim 0 as time and mix
        # information between unrelated batch items.
        # ModuleList rather than Sequential so the mask can be passed to each
        # layer; the parameter names ("blocks.<i>...") are unchanged.
        self.blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=n_embd,
                nhead=n_head,
                dim_feedforward=4 * n_embd,
                dropout=dropout,
                batch_first=True,
            )
            for _ in range(n_layer)
        ])
        self.ln = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: ``(B, T)`` tensor of token ids with ``T <= block_size``.
            targets: optional ``(B, T)`` tensor of next-token ids.

        Returns:
            ``(logits, None)`` with logits of shape ``(B, T, vocab_size)``
            when ``targets`` is ``None``; otherwise the same logits and the
            scalar cross-entropy loss over all ``B * T`` positions.
        """
        B, T = idx.shape
        # Positions and mask are created on the input's own device.
        positions = torch.arange(T, device=idx.device)
        x = self.token_emb(idx) + self.pos_emb(positions)

        # Boolean mask, True = blocked: everything above the diagonal is a
        # future token and must stay hidden, otherwise training leaks the
        # answer and generation does not match training.
        causal_mask = torch.triu(
            torch.ones(T, T, dtype=torch.bool, device=idx.device), diagonal=1
        )
        for block in self.blocks:
            x = block(x, src_mask=causal_mask)

        x = self.ln(x)
        logits = self.head(x)
        if targets is None:
            return logits, None
        B, T, C = logits.shape
        return logits, F.cross_entropy(logits.view(B * T, C), targets.view(B * T))

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens after the prompt ``idx``.

        Args:
            idx: ``(B, T)`` tensor of prompt token ids.
            max_new_tokens: number of tokens to append.

        Returns:
            ``(B, T + max_new_tokens)`` tensor of prompt plus continuation.
        """
        for _ in range(max_new_tokens):
            # Crop to the last block_size tokens (size of the position table).
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            probs = F.softmax(logits[:, -1, :], dim=-1)
            next_idx = torch.multinomial(probs, 1)
            idx = torch.cat((idx, next_idx), dim=1)
        return idx

model = GPT().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# -----------------------------
# 4️⃣ Training Loop
# -----------------------------
print("Starting training...")
# The counter is called `step` rather than `iter` so the builtin iter()
# is not shadowed for the rest of the script.
for step in range(max_iters):
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Progress report: the loss of the current training batch.
    if step % 200 == 0:
        print(f"Step {step}, Loss {loss.item():.4f}")

torch.save(model.state_dict(), save_path)
print(f"Training finished! Model saved to {save_path}")

# -----------------------------
# 5️⃣ Chat with memory
# -----------------------------
# Switch dropout off for inference.
model.eval()
conversation_history = ""

print("\nYou can now chat with your model! Type 'exit' to quit.")
while True:
    try:
        prompt = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the chat without a traceback.
        print()
        break
    if prompt.lower() == "exit":
        break
    conversation_history += "User: " + prompt + "\nBot: "

    # The vocabulary comes from the documents, so neither the user's text nor
    # the "User: "/"Bot: " framing is guaranteed to be encodable; encode()
    # would raise KeyError. Drop unseen characters. generate() only attends
    # to the last block_size tokens, so encoding that tail is equivalent.
    window = "".join(ch for ch in conversation_history if ch in stoi)[-block_size:]
    if not window:
        print("Bot:")
        continue

    context = torch.tensor([encode(window)], dtype=torch.long).to(device)
    with torch.no_grad():
        output = model.generate(context, 200)
    # only new tokens
    response = decode(output[0, context.shape[1]:].tolist())
    print("Bot:", response.strip())
    conversation_history += response + "\n"
✅ How to Use

Create a folder called documents in the same directory.

Put all your .txt documents there.

Run:

python train_and_chat.py

Wait for training to finish (~a few minutes on CPU for small data; faster on GPU).

Chat with your model:

You: hello
Bot: hello! how can I help you?
You: what is ai
Bot: artificial intelligence is ...

Type exit to quit.

🔹 Features

Reads all your documents → learns your text style.

Remembers conversation history → behaves more like ChatGPT.

Single Python file → easy to run.

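Can be retrained → add documents and run the script again. Note that each run rebuilds the vocabulary and trains from scratch; the script never loads the saved checkpoint, so training is not continued.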