Fundamentals

Transformer Architecture

# Basic Transformer components
class Transformer(nn.Module):
  """Minimal encoder-decoder Transformer wrapper.

  Args:
    d_model: embedding dimension shared by encoder and decoder.
    nhead: number of attention heads (must divide d_model).
    num_layers: number of stacked encoder and decoder layers.
  """
  def __init__(self, d_model, nhead, num_layers):
    super().__init__()
    self.encoder = TransformerEncoder(
      TransformerEncoderLayer(d_model, nhead),
      num_layers
    )
    self.decoder = TransformerDecoder(
      TransformerDecoderLayer(d_model, nhead),
      num_layers
    )

  def forward(self, src, tgt, src_mask=None, tgt_mask=None):
    """Encode src, then decode tgt against the encoder memory.

    Shapes follow torch's default (seq, batch, d_model) convention since
    the layers are built with the default batch_first=False.
    Returns a tensor shaped like tgt: (tgt_seq, batch, d_model).
    """
    memory = self.encoder(src, mask=src_mask)
    return self.decoder(tgt, memory, tgt_mask=tgt_mask)

# Self-Attention Mechanism
def scaled_dot_product_attention(Q, K, V, mask=None):
  """Compute softmax(Q K^T / sqrt(d_k)) V.

  Positions where mask == 0 have their logits filled with -1e9 before
  the softmax, so they receive effectively zero attention weight.
  """
  dim_k = Q.size(-1)
  logits = (Q @ K.transpose(-2, -1)) / (dim_k ** 0.5)
  if mask is not None:
    logits = logits.masked_fill(mask == 0, -1e9)
  weights = logits.softmax(dim=-1)
  return weights @ V
Note: Transformers use self-attention mechanisms to process sequences in parallel, enabling more efficient training than RNNs.

Attention Mechanisms

# Multi-Head Attention
class MultiHeadAttention(nn.Module):
  """Project Q/K/V into num_heads subspaces, attend in each, and merge.

  d_model must be divisible by num_heads; each head works in a
  d_model // num_heads dimensional subspace.
  """

  def __init__(self, d_model, num_heads):
    super().__init__()
    self.d_model = d_model
    self.num_heads = num_heads
    self.d_k = d_model // num_heads  # per-head dimension
    self.W_q = nn.Linear(d_model, d_model)
    self.W_k = nn.Linear(d_model, d_model)
    self.W_v = nn.Linear(d_model, d_model)
    self.W_o = nn.Linear(d_model, d_model)

  def _split_heads(self, proj, x):
    # (batch, seq, d_model) -> (batch, heads, seq, d_k)
    b = x.size(0)
    return proj(x).view(b, -1, self.num_heads, self.d_k).transpose(1, 2)

  def forward(self, Q, K, V, mask=None):
    """Run multi-head scaled dot-product attention plus output projection."""
    batch_size = Q.size(0)
    heads = scaled_dot_product_attention(
      self._split_heads(self.W_q, Q),
      self._split_heads(self.W_k, K),
      self._split_heads(self.W_v, V),
      mask,
    )
    # (batch, heads, seq, d_k) -> (batch, seq, d_model)
    merged = heads.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
    return self.W_o(merged)
Note: Multi-head attention allows the model to jointly attend to information from different representation subspaces.

LLM Architectures

GPT Architecture

# GPT-style decoder-only transformer
class GPT(nn.Module):
  """Decoder-only (GPT-style) transformer language model.

  A decoder-only block is self-attention + feed-forward under a causal
  mask -- exactly nn.TransformerEncoderLayer. torch's
  TransformerDecoderLayer is NOT used here because it requires a
  cross-attention `memory` tensor that a decoder-only model has nowhere
  to get (and calling layer(x, src_mask) would bind the mask to that
  positional `memory` argument).

  Args:
    vocab_size: size of the token vocabulary.
    d_model: embedding / hidden dimension.
    nhead: number of attention heads.
    num_layers: number of stacked transformer blocks.
    max_len: maximum supported sequence length (positional table size).
  """
  def __init__(self, vocab_size, d_model, nhead, num_layers, max_len=1000):
    super().__init__()
    self.token_embedding = nn.Embedding(vocab_size, d_model)
    self.position_embedding = nn.Embedding(max_len, d_model)
    # batch_first=True so the layers accept (batch, seq, d_model),
    # matching the embedding output built in forward().
    self.decoder_layers = nn.ModuleList([
      nn.TransformerEncoderLayer(d_model, nhead, batch_first=True)
      for _ in range(num_layers)
    ])
    self.fc_out = nn.Linear(d_model, vocab_size)

  def forward(self, src, src_mask=None):
    """Return next-token logits of shape (batch, seq, vocab_size).

    src: (batch, seq) token ids. src_mask: optional (seq, seq) additive
    attention mask, e.g. from generate_square_subsequent_mask.
    """
    batch_size, seq_len = src.shape
    positions = torch.arange(seq_len, device=src.device).expand(batch_size, seq_len)
    x = self.token_embedding(src) + self.position_embedding(positions)
    for layer in self.decoder_layers:
      # second positional arg of TransformerEncoderLayer is src_mask
      x = layer(x, src_mask)
    return self.fc_out(x)

# Causal mask for autoregressive generation
def generate_square_subsequent_mask(sz):
  """Return an additive (sz, sz) causal mask.

  Entries on or below the diagonal are 0.0; entries strictly above it
  are -1e9, so position i cannot attend to positions j > i.
  """
  blocked = torch.full((sz, sz), float(-1e9))
  return torch.triu(blocked, diagonal=1)
Note: GPT uses a decoder-only transformer architecture with causal masking to ensure autoregressive generation.

BERT Architecture

# BERT-style encoder-only transformer
class BERT(nn.Module):
  """Encoder-only (BERT-style) transformer with a [CLS] classification head.

  Args:
    vocab_size: size of the token vocabulary.
    d_model: embedding / hidden dimension.
    nhead: number of attention heads.
    num_layers: number of stacked encoder layers.
  """
  def __init__(self, vocab_size, d_model, nhead, num_layers):
    super().__init__()
    self.token_embedding = nn.Embedding(vocab_size, d_model)
    self.position_embedding = nn.Embedding(512, d_model)  # max sequence length
    self.segment_embedding = nn.Embedding(2, d_model)  # sentence A / sentence B
    # batch_first=True so the layers accept (batch, seq, d_model),
    # matching the embedding sum built in forward().
    self.encoder_layers = nn.ModuleList([
      nn.TransformerEncoderLayer(d_model, nhead, batch_first=True)
      for _ in range(num_layers)
    ])
    self.classifier = nn.Linear(d_model, 2)  # binary classification head

  def forward(self, input_ids, token_type_ids=None, attention_mask=None):
    """Return (batch, 2) classification logits from the [CLS] position.

    input_ids: (batch, seq) token ids.
    token_type_ids: optional (batch, seq) segment ids (0 or 1).
    attention_mask: optional (batch, seq), Hugging Face convention:
      1 = attend, 0 = padding.
    """
    seq_length = input_ids.size(1)
    position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
    position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
    if token_type_ids is None:
      token_type_ids = torch.zeros_like(input_ids)
    embeddings = (
      self.token_embedding(input_ids)
      + self.position_embedding(position_ids)
      + self.segment_embedding(token_type_ids)
    )
    # nn.TransformerEncoderLayer wants a boolean key-padding mask with
    # True = ignore, so invert the HF-style 1/0 attention_mask instead of
    # passing it as the (seq, seq) additive src_mask, which it is not.
    padding_mask = (attention_mask == 0) if attention_mask is not None else None
    for layer in self.encoder_layers:
      embeddings = layer(embeddings, src_key_padding_mask=padding_mask)
    return self.classifier(embeddings[:, 0, :])  # first token acts as [CLS]
Note: BERT uses an encoder-only architecture and is pre-trained using masked language modeling and next sentence prediction.

Prompt Engineering

Basic Prompt Techniques

Zero-Shot Prompting:
"Classify the text: 'The movie was fantastic with great acting.' Sentiment:"
Few-Shot Prompting:
"Text: 'This product is amazing!' Sentiment: Positive
Text: 'The service was terrible.' Sentiment: Negative
Text: 'It was okay, nothing special.' Sentiment: Neutral
Text: 'The acting was superb.' Sentiment:"
Chain-of-Thought:
"Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
A: Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. 5 + 6 = 11. The answer is 11.

Q: The cafeteria had 23 apples. They used 20 to make lunch. They bought 6 more. How many apples do they have now?"
Note: Effective prompts provide clear instructions, examples, and context to guide the model toward the desired output.

Advanced Prompt Techniques

Role-Playing:
"You are an expert software architect with 20 years of experience. Design a microservices architecture for an e-commerce platform that needs to handle 1 million daily users."
Template-Based:
"Create a marketing email for a new productivity app called 'FocusTime'.
Subject: [ catchy subject line ]
Body: [ engaging content about the app's features ]
Call-to-action: [ compelling CTA button text ]"
Conditional Generation:
"If the user is a beginner, explain machine learning in simple terms with everyday examples. If the user is an expert, provide a technical overview with mathematical formulations."
Iterative Refinement:
"First, outline the main points about climate change. Then, expand each point with supporting evidence. Finally, create a compelling conclusion that calls for action."
Note: Advanced techniques often involve structuring prompts to elicit specific behaviors, formats, or levels of detail from the model.

Fine-Tuning Techniques

Full Fine-Tuning

# Full fine-tuning with Hugging Face Transformers: every model weight is
# updated, which gives the best quality but needs the most GPU memory.
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

# Load pre-trained model and tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add padding token if it doesn't exist (GPT-2 ships without one; reusing
# EOS as padding is the standard workaround for batched training)
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token

# Prepare training arguments
training_args = TrainingArguments(
  output_dir="./results",          # where checkpoints are written
  num_train_epochs=3,
  per_device_train_batch_size=4,
  per_device_eval_batch_size=4,
  warmup_steps=500,                # linear LR warmup before decay
  weight_decay=0.01,
  logging_dir="./logs",
)

# Create Trainer instance
# NOTE(review): train_dataset / eval_dataset must be defined (already
# tokenized) elsewhere before this runs -- they are not created here.
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_dataset,
  eval_dataset=eval_dataset,
)

# Start training
trainer.train()

Parameter-Efficient Fine-Tuning

# LoRA (Low-Rank Adaptation) with PEFT: freezes the base model and trains
# small low-rank update matrices injected into selected linear layers.
from peft import LoraConfig, get_peft_model, TaskType

# Define LoRA configuration
lora_config = LoraConfig(
  task_type=TaskType.CAUSAL_LM,
  inference_mode=False,   # training mode: adapters are trainable
  r=8,                    # rank of the low-rank update matrices
  lora_alpha=32,          # scaling factor (effective scale = alpha / r)
  lora_dropout=0.1,
  # NOTE(review): "q_proj"/"v_proj" are LLaMA-style module names; GPT-2
  # uses a fused "c_attn" projection -- confirm against the base model.
  target_modules=["q_proj", "v_proj"],
)

# Apply LoRA to model (wraps it; `model` must already be loaded elsewhere)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # prints trainable vs total parameter counts

# Prompt tuning: instead of touching model weights, learn
# `num_virtual_tokens` continuous embeddings prepended to every input.
from peft import PromptTuningConfig, PromptTuningInit, get_peft_model

peft_config = PromptTuningConfig(
  task_type=TaskType.CAUSAL_LM,  # NOTE(review): TaskType imported earlier in the file
  prompt_tuning_init=PromptTuningInit.TEXT,  # initialize from real-token embeddings
  num_virtual_tokens=20,
  prompt_tuning_init_text="Classify the sentiment of this text:",
  tokenizer_name_or_path="gpt2",  # tokenizer used to embed the init text
)

# `model` must already be loaded elsewhere in the file
model = get_peft_model(model, peft_config)
Note: Parameter-efficient fine-tuning methods like LoRA and P-tuning allow adapting large models with minimal computational resources by training only small adapter modules.

Evaluation & Deployment

Model Evaluation

# Common evaluation metrics for LLMs
# NOTE(review): datasets.load_metric is deprecated and was removed in
# datasets 3.0 -- newer code should use the `evaluate` library
# (evaluate.load("bleu")) instead.
from datasets import load_metric

# Load metrics
bleu_metric = load_metric("bleu")
rouge_metric = load_metric("rouge")
perplexity_metric = load_metric("perplexity")

# Calculate BLEU score (n-gram overlap against one or more references)
predictions = ["I like to eat pizza"]
references = [["I enjoy eating pizza"]]
bleu_score = bleu_metric.compute(predictions=predictions, references=references)

# Calculate ROUGE score (recall-oriented overlap, common for summarization)
rouge_score = rouge_metric.compute(predictions=predictions, references=references)

# Human evaluation template: rubric of 1-5 ratings for manual review
human_eval_template = {
  "fluency": "How fluent and natural is the text? (1-5)",
  "relevance": "How relevant is the response to the prompt? (1-5)",
  "coherence": "How coherent and logically structured is the text? (1-5)",
  "accuracy": "How factually accurate is the content? (1-5)",
}

# Toxicity detection with a pre-trained classifier pipeline
from transformers import pipeline
toxicity_classifier = pipeline("text-classification", model="unitary/toxic-bert")
toxicity_score = toxicity_classifier("Your generated text here")

Model Deployment

# FastAPI deployment example: wraps a text-generation pipeline in a JSON API.
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline

app = FastAPI()
# Loaded once at import time so all requests reuse the same model instance.
generator = pipeline("text-generation", model="gpt2")

class GenerationRequest(BaseModel):
  # Request schema, validated by pydantic.
  prompt: str
  max_length: int = 100     # token budget including the prompt
  temperature: float = 0.7  # higher = more random sampling

@app.post("/generate")
async def generate_text(request: GenerationRequest):
  """Generate a continuation of request.prompt and return it as JSON."""
  result = generator(
    request.prompt,
    max_length=request.max_length,
    temperature=request.temperature,
    do_sample=True,  # sample instead of greedy decoding so temperature matters
  )
  # the pipeline returns a list of candidate dicts; take the first
  return {"generated_text": result[0]["generated_text"]}

# Dockerfile for deployment
FROM python:3.9-slim
WORKDIR /app
# Copy requirements first so the dependency layer is cached across code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
# Serve the FastAPI app defined in main.py with uvicorn on port 80.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"]
Note: When deploying LLMs, consider using model optimization techniques like quantization, distillation, or specialized inference servers like TensorRT-LLM or vLLM for better performance.