Fundamentals
Transformer Architecture
# Basic Transformer components
class Transformer(nn.Module):
    """Minimal encoder-decoder Transformer skeleton.

    Stacks ``num_layers`` identical encoder and decoder layers.  No
    ``forward`` is defined here — the class only illustrates module
    construction.

    Args:
        d_model: embedding / model dimension.
        nhead: number of attention heads per layer.
        num_layers: depth of both the encoder and the decoder stacks.
    """

    def __init__(self, d_model, nhead, num_layers):
        super().__init__()
        # Encoder stack: self-attention + feed-forward per layer.
        self.encoder = TransformerEncoder(
            TransformerEncoderLayer(d_model, nhead),
            num_layers,
        )
        # Decoder stack: adds cross-attention over the encoder output.
        self.decoder = TransformerDecoder(
            TransformerDecoderLayer(d_model, nhead),
            num_layers,
        )
# Self-Attention Mechanism
def scaled_dot_product_attention(Q, K, V, mask=None):
    """Compute ``softmax(Q @ K^T / sqrt(d_k)) @ V``.

    Args:
        Q, K, V: tensors shaped ``(..., seq, d_k)`` with matching
            leading dims.
        mask: optional tensor broadcastable to the score matrix;
            positions where ``mask == 0`` are excluded from attention.

    Returns:
        Attention-weighted values, shaped like ``(..., seq_q, d_v)``.
    """
    d_k = Q.size(-1)
    # Scale by sqrt(d_k) to keep dot-product magnitudes stable.
    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Large negative fill => ~0 weight after softmax.
        scores = scores.masked_fill(mask == 0, -1e9)
    attention = F.softmax(scores, dim=-1)
    return torch.matmul(attention, V)
class Transformer(nn.Module):
    """Minimal encoder-decoder Transformer skeleton (duplicate of the
    snippet above, restored to valid Python).

    Args:
        d_model: embedding / model dimension.
        nhead: number of attention heads per layer.
        num_layers: depth of both the encoder and the decoder stacks.
    """

    def __init__(self, d_model, nhead, num_layers):
        super().__init__()
        # Encoder stack: self-attention + feed-forward per layer.
        self.encoder = TransformerEncoder(
            TransformerEncoderLayer(d_model, nhead),
            num_layers,
        )
        # Decoder stack: adds cross-attention over the encoder output.
        self.decoder = TransformerDecoder(
            TransformerDecoderLayer(d_model, nhead),
            num_layers,
        )
# Self-Attention Mechanism
def scaled_dot_product_attention(Q, K, V, mask=None):
    """Compute ``softmax(Q @ K^T / sqrt(d_k)) @ V``.

    Args:
        Q, K, V: tensors shaped ``(..., seq, d_k)`` with matching
            leading dims.
        mask: optional tensor broadcastable to the score matrix;
            positions where ``mask == 0`` are excluded from attention.

    Returns:
        Attention-weighted values, shaped like ``(..., seq_q, d_v)``.
    """
    d_k = Q.size(-1)
    # Scale by sqrt(d_k) to keep dot-product magnitudes stable.
    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Large negative fill => ~0 weight after softmax.
        scores = scores.masked_fill(mask == 0, -1e9)
    attention = F.softmax(scores, dim=-1)
    return torch.matmul(attention, V)
Note: Transformers use self-attention mechanisms to process sequences in parallel, enabling more efficient training than RNNs.
Attention Mechanisms
# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    """Multi-head attention: project Q/K/V, attend per head, recombine.

    Args:
        d_model: model dimension; must be divisible by ``num_heads``.
        num_heads: number of parallel attention heads.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # per-head dimension
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)  # output projection

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        # Project, split the model dim into (num_heads, d_k), and move
        # heads before the sequence dim: (B, H, S, d_k).
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        # Uses the module-level scaled_dot_product_attention helper.
        attention = scaled_dot_product_attention(Q, K, V, mask)
        # Merge heads back to (B, S, d_model); contiguous() is required
        # after transpose before view().
        attention = attention.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(attention)
class MultiHeadAttention(nn.Module):
    """Multi-head attention (duplicate snippet, restored to valid
    Python): project Q/K/V, attend per head, recombine.

    Args:
        d_model: model dimension; must be divisible by ``num_heads``.
        num_heads: number of parallel attention heads.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # per-head dimension
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)  # output projection

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        # Split the model dim into (num_heads, d_k): (B, H, S, d_k).
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        # Uses the module-level scaled_dot_product_attention helper.
        attention = scaled_dot_product_attention(Q, K, V, mask)
        # Merge heads back to (B, S, d_model).
        attention = attention.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(attention)
Note: Multi-head attention allows the model to jointly attend to information from different representation subspaces.
LLM Architectures
GPT Architecture
# GPT-style decoder-only transformer
class GPT(nn.Module):
    """Decoder-only (GPT-style) language model.

    Token + learned absolute position embeddings feed a stack of
    decoder layers; a final linear layer maps back to vocabulary
    logits.

    Args:
        vocab_size: vocabulary size (output logit dimension).
        d_model: embedding / model dimension.
        nhead: attention heads per decoder layer.
        num_layers: number of decoder layers.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        # Learned positions; 1000 is the maximum sequence length.
        self.position_embedding = nn.Embedding(1000, d_model)
        self.decoder_layers = nn.ModuleList([
            TransformerDecoderLayer(d_model, nhead)
            for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(d_model, vocab_size)  # logits over vocab

    def forward(self, src, src_mask=None):
        batch_size, seq_len = src.shape
        # Position ids 0..seq_len-1, broadcast across the batch.
        positions = torch.arange(0, seq_len).expand(batch_size, seq_len).to(src.device)
        x = self.token_embedding(src) + self.position_embedding(positions)
        for layer in self.decoder_layers:
            # NOTE(review): assumes a decoder layer with signature
            # layer(x, mask); torch.nn.TransformerDecoderLayer expects
            # (tgt, memory, ...) instead — confirm which layer is meant.
            x = layer(x, src_mask)
        return self.fc_out(x)
# Causal mask for autoregressive generation
def generate_square_subsequent_mask(sz):
    """Return an additive causal mask of shape ``(sz, sz)``.

    Entry ``(i, j)`` is ``0.0`` where ``j <= i`` (visible) and ``-1e9``
    where ``j > i`` (future token), so adding the mask to attention
    scores blocks lookahead during autoregressive generation.
    """
    # Lower-triangular boolean matrix of allowed positions.
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    # Convert to additive float mask: blocked -> -1e9, allowed -> 0.0.
    mask = mask.float().masked_fill(mask == 0, float(-1e9)).masked_fill(mask == 1, float(0.0))
    return mask
class GPT(nn.Module):
    """Decoder-only (GPT-style) language model (duplicate snippet,
    restored to valid Python).

    Args:
        vocab_size: vocabulary size (output logit dimension).
        d_model: embedding / model dimension.
        nhead: attention heads per decoder layer.
        num_layers: number of decoder layers.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        # Learned positions; 1000 is the maximum sequence length.
        self.position_embedding = nn.Embedding(1000, d_model)
        self.decoder_layers = nn.ModuleList([
            TransformerDecoderLayer(d_model, nhead)
            for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(d_model, vocab_size)  # logits over vocab

    def forward(self, src, src_mask=None):
        batch_size, seq_len = src.shape
        # Position ids 0..seq_len-1, broadcast across the batch.
        positions = torch.arange(0, seq_len).expand(batch_size, seq_len).to(src.device)
        x = self.token_embedding(src) + self.position_embedding(positions)
        for layer in self.decoder_layers:
            # NOTE(review): assumes a decoder layer with signature
            # layer(x, mask) — not torch.nn.TransformerDecoderLayer's.
            x = layer(x, src_mask)
        return self.fc_out(x)
# Causal mask for autoregressive generation
def generate_square_subsequent_mask(sz):
    """Return an additive causal mask of shape ``(sz, sz)``.

    Entry ``(i, j)`` is ``0.0`` where ``j <= i`` (visible) and ``-1e9``
    where ``j > i`` (future token).
    """
    # Lower-triangular boolean matrix of allowed positions.
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    # Convert to additive float mask: blocked -> -1e9, allowed -> 0.0.
    mask = mask.float().masked_fill(mask == 0, float(-1e9)).masked_fill(mask == 1, float(0.0))
    return mask
Note: GPT uses a decoder-only transformer architecture with causal masking to ensure autoregressive generation.
BERT Architecture
# BERT-style encoder-only transformer
class BERT(nn.Module):
    """Encoder-only (BERT-style) model with a classification head.

    Sums token, position, and segment embeddings, runs them through a
    stack of encoder layers, and classifies from the first ([CLS])
    token's representation.

    Args:
        vocab_size: vocabulary size.
        d_model: embedding / model dimension.
        nhead: attention heads per encoder layer.
        num_layers: number of encoder layers.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        # Learned positions; 512 is the maximum sequence length.
        self.position_embedding = nn.Embedding(512, d_model)
        # Segment (token-type) embedding for sentence-pair inputs.
        self.segment_embedding = nn.Embedding(2, d_model)
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, nhead)
            for _ in range(num_layers)
        ])
        # Binary classification head over the [CLS] representation.
        self.classifier = nn.Linear(d_model, 2)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        seq_length = input_ids.size(1)
        # Position ids 0..seq_length-1, broadcast across the batch.
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            # Default: treat everything as the first segment.
            token_type_ids = torch.zeros_like(input_ids)
        embeddings = self.token_embedding(input_ids) + self.position_embedding(position_ids) + self.segment_embedding(token_type_ids)
        for layer in self.encoder_layers:
            embeddings = layer(embeddings, attention_mask)
        # Use the first ([CLS]) token for classification.
        return self.classifier(embeddings[:, 0, :])
class BERT(nn.Module):
    """Encoder-only (BERT-style) model with a classification head
    (duplicate snippet, restored to valid Python).

    Args:
        vocab_size: vocabulary size.
        d_model: embedding / model dimension.
        nhead: attention heads per encoder layer.
        num_layers: number of encoder layers.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        # Learned positions; 512 is the maximum sequence length.
        self.position_embedding = nn.Embedding(512, d_model)
        # Segment (token-type) embedding for sentence-pair inputs.
        self.segment_embedding = nn.Embedding(2, d_model)
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, nhead)
            for _ in range(num_layers)
        ])
        # Binary classification head over the [CLS] representation.
        self.classifier = nn.Linear(d_model, 2)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            # Default: treat everything as the first segment.
            token_type_ids = torch.zeros_like(input_ids)
        embeddings = self.token_embedding(input_ids) + self.position_embedding(position_ids) + self.segment_embedding(token_type_ids)
        for layer in self.encoder_layers:
            embeddings = layer(embeddings, attention_mask)
        # Use the first ([CLS]) token for classification.
        return self.classifier(embeddings[:, 0, :])
Note: BERT uses an encoder-only architecture and is pre-trained using masked language modeling and next sentence prediction.
Prompt Engineering
Basic Prompt Techniques
Zero-Shot Prompting:
"Classify the text: 'The movie was fantastic with great acting.' Sentiment:"
"Classify the text: 'The movie was fantastic with great acting.' Sentiment:"
Few-Shot Prompting:
"Text: 'This product is amazing!' Sentiment: Positive
Text: 'The service was terrible.' Sentiment: Negative
Text: 'It was okay, nothing special.' Sentiment: Neutral
Text: 'The acting was superb.' Sentiment:"
"Text: 'This product is amazing!' Sentiment: Positive
Text: 'The service was terrible.' Sentiment: Negative
Text: 'It was okay, nothing special.' Sentiment: Neutral
Text: 'The acting was superb.' Sentiment:"
Chain-of-Thought:
"Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
A: Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. 5 + 6 = 11. The answer is 11.
Q: The cafeteria had 23 apples. They used 20 to make lunch. They bought 6 more. How many apples do they have now?"
"Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
A: Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. 5 + 6 = 11. The answer is 11.
Q: The cafeteria had 23 apples. They used 20 to make lunch. They bought 6 more. How many apples do they have now?"
Note: Effective prompts provide clear instructions, examples, and context to guide the model toward the desired output.
Advanced Prompt Techniques
Role-Playing:
"You are an expert software architect with 20 years of experience. Design a microservices architecture for an e-commerce platform that needs to handle 1 million daily users."
"You are an expert software architect with 20 years of experience. Design a microservices architecture for an e-commerce platform that needs to handle 1 million daily users."
Template-Based:
"Create a marketing email for a new productivity app called 'FocusTime'.
Subject: [ catchy subject line ]
Body: [ engaging content about the app's features ]
Call-to-action: [ compelling CTA button text ]"
"Create a marketing email for a new productivity app called 'FocusTime'.
Subject: [ catchy subject line ]
Body: [ engaging content about the app's features ]
Call-to-action: [ compelling CTA button text ]"
Conditional Generation:
"If the user is a beginner, explain machine learning in simple terms with everyday examples. If the user is an expert, provide a technical overview with mathematical formulations."
"If the user is a beginner, explain machine learning in simple terms with everyday examples. If the user is an expert, provide a technical overview with mathematical formulations."
Iterative Refinement:
"First, outline the main points about climate change. Then, expand each point with supporting evidence. Finally, create a compelling conclusion that calls for action."
"First, outline the main points about climate change. Then, expand each point with supporting evidence. Finally, create a compelling conclusion that calls for action."
Note: Advanced techniques often involve structuring prompts to elicit specific behaviors, formats, or levels of detail from the model.
Fine-Tuning Techniques
Full Fine-Tuning
# Full fine-tuning with Hugging Face Transformers.
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

# Load pre-trained model and tokenizer.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# GPT-2 ships without a pad token; reuse EOS so padded batches work.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Training hyperparameters.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,        # linear LR warmup steps
    weight_decay=0.01,
    logging_dir="./logs",
)

# NOTE(review): train_dataset / eval_dataset must be defined elsewhere
# (tokenized datasets) before this script runs.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Start training.
trainer.train()
# Full fine-tuning with Hugging Face Transformers (duplicate snippet,
# restored to valid Python).
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

# Load pre-trained model and tokenizer.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# GPT-2 ships without a pad token; reuse EOS so padded batches work.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Training hyperparameters.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,        # linear LR warmup steps
    weight_decay=0.01,
    logging_dir="./logs",
)

# NOTE(review): train_dataset / eval_dataset must be defined elsewhere
# before this script runs.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Start training.
trainer.train()
Parameter-Efficient Fine-Tuning
# LoRA (Low-Rank Adaptation) with PEFT.
from peft import LoraConfig, get_peft_model, TaskType

# LoRA config: rank-8 adapters on the attention q/v projections.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,     # training, not inference
    r=8,                      # adapter rank
    lora_alpha=32,            # scaling factor (effective scale alpha/r)
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
)

# Wrap the base model; only the adapter weights remain trainable.
# NOTE(review): `model` must already be loaded elsewhere.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Prompt tuning: learn 20 virtual tokens, initialized from text.
from peft import PromptTuningConfig, PromptTuningInit, get_peft_model

peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=20,
    prompt_tuning_init_text="Classify the sentiment of this text:",
    tokenizer_name_or_path="gpt2",
)
model = get_peft_model(model, peft_config)
# LoRA (Low-Rank Adaptation) with PEFT (duplicate snippet, restored to
# valid Python).
from peft import LoraConfig, get_peft_model, TaskType

# LoRA config: rank-8 adapters on the attention q/v projections.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,     # training, not inference
    r=8,                      # adapter rank
    lora_alpha=32,            # scaling factor (effective scale alpha/r)
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
)

# Wrap the base model; only the adapter weights remain trainable.
# NOTE(review): `model` must already be loaded elsewhere.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Prompt tuning: learn 20 virtual tokens, initialized from text.
from peft import PromptTuningConfig, PromptTuningInit, get_peft_model

peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=20,
    prompt_tuning_init_text="Classify the sentiment of this text:",
    tokenizer_name_or_path="gpt2",
)
model = get_peft_model(model, peft_config)
Note: Parameter-efficient fine-tuning methods like LoRA and P-tuning allow adapting large models with minimal computational resources by training only small adapter modules.
Evaluation & Deployment
Model Evaluation
# Common evaluation metrics for LLMs.
from datasets import load_metric

# NOTE(review): `load_metric` is deprecated in recent `datasets`
# releases; the `evaluate` library (`evaluate.load(...)`) is its
# successor — confirm which version this project pins.
bleu_metric = load_metric("bleu")
rouge_metric = load_metric("rouge")
perplexity_metric = load_metric("perplexity")

# BLEU: n-gram overlap against one or more references per prediction.
predictions = ["I like to eat pizza"]
references = [["I enjoy eating pizza"]]
bleu_score = bleu_metric.compute(predictions=predictions, references=references)

# ROUGE: recall-oriented overlap, common for summarization.
rouge_score = rouge_metric.compute(predictions=predictions, references=references)

# Rubric for human evaluation; each dimension scored 1-5.
human_eval_template = {
    "fluency": "How fluent and natural is the text? (1-5)",
    "relevance": "How relevant is the response to the prompt? (1-5)",
    "coherence": "How coherent and logically structured is the text? (1-5)",
    "accuracy": "How factually accurate is the content? (1-5)",
}

# Safety check: classify generated text for toxicity.
from transformers import pipeline

toxicity_classifier = pipeline("text-classification", model="unitary/toxic-bert")
toxicity_score = toxicity_classifier("Your generated text here")
# Common evaluation metrics for LLMs (duplicate snippet, restored to
# valid Python).
from datasets import load_metric

# NOTE(review): `load_metric` is deprecated in recent `datasets`
# releases; `evaluate.load(...)` is its successor.
bleu_metric = load_metric("bleu")
rouge_metric = load_metric("rouge")
perplexity_metric = load_metric("perplexity")

# BLEU: n-gram overlap against one or more references per prediction.
predictions = ["I like to eat pizza"]
references = [["I enjoy eating pizza"]]
bleu_score = bleu_metric.compute(predictions=predictions, references=references)

# ROUGE: recall-oriented overlap, common for summarization.
rouge_score = rouge_metric.compute(predictions=predictions, references=references)

# Rubric for human evaluation; each dimension scored 1-5.
human_eval_template = {
    "fluency": "How fluent and natural is the text? (1-5)",
    "relevance": "How relevant is the response to the prompt? (1-5)",
    "coherence": "How coherent and logically structured is the text? (1-5)",
    "accuracy": "How factually accurate is the content? (1-5)",
}

# Safety check: classify generated text for toxicity.
from transformers import pipeline

toxicity_classifier = pipeline("text-classification", model="unitary/toxic-bert")
toxicity_score = toxicity_classifier("Your generated text here")
Model Deployment
# FastAPI deployment example.
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline

app = FastAPI()
# Load the model once at startup so requests don't reload it.
generator = pipeline("text-generation", model="gpt2")


class GenerationRequest(BaseModel):
    """Request body for POST /generate."""
    prompt: str
    max_length: int = 100      # total token budget, prompt included
    temperature: float = 0.7   # sampling temperature


@app.post("/generate")
async def generate_text(request: GenerationRequest):
    """Generate a continuation of `request.prompt` and return it."""
    result = generator(
        request.prompt,
        max_length=request.max_length,
        temperature=request.temperature,
        do_sample=True,  # sampling required for temperature to matter
    )
    return {"generated_text": result[0]["generated_text"]}
# Dockerfile for deployment
FROM python:3.9-slim
WORKDIR /app
# Copy and install requirements first so this layer is cached
# across application-code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
# Serve the FastAPI app (main.py's `app`) with uvicorn on port 80.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"]
# FastAPI deployment example (duplicate snippet, restored to valid
# Python).
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline

app = FastAPI()
# Load the model once at startup so requests don't reload it.
generator = pipeline("text-generation", model="gpt2")


class GenerationRequest(BaseModel):
    """Request body for POST /generate."""
    prompt: str
    max_length: int = 100      # total token budget, prompt included
    temperature: float = 0.7   # sampling temperature


@app.post("/generate")
async def generate_text(request: GenerationRequest):
    """Generate a continuation of `request.prompt` and return it."""
    result = generator(
        request.prompt,
        max_length=request.max_length,
        temperature=request.temperature,
        do_sample=True,  # sampling required for temperature to matter
    )
    return {"generated_text": result[0]["generated_text"]}
# Dockerfile for deployment
FROM python:3.9-slim
WORKDIR /app
# Copy and install requirements first so this layer is cached
# across application-code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
# Serve the FastAPI app (main.py's `app`) with uvicorn on port 80.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"]
Note: When deploying LLMs, consider using model optimization techniques like quantization, distillation, or specialized inference servers like TensorRT-LLM or vLLM for better performance.