Fundamentals

Transformer Architecture

# Basic Transformer components
class Transformer(nn.Module):
  """Minimal encoder-decoder Transformer wrapper.

  Args:
    d_model: embedding dimension shared by encoder and decoder.
    nhead: number of attention heads (must divide d_model).
    num_layers: number of stacked encoder and decoder layers.
  """
  def __init__(self, d_model, nhead, num_layers):
    super().__init__()
    self.encoder = TransformerEncoder(
      TransformerEncoderLayer(d_model, nhead),
      num_layers
    )
    self.decoder = TransformerDecoder(
      TransformerDecoderLayer(d_model, nhead),
      num_layers
    )

  def forward(self, src, tgt, src_mask=None, tgt_mask=None):
    """Encode src, then decode tgt against the encoder memory.

    Shapes follow torch's default (seq, batch, d_model) convention since
    the layers are built with the default batch_first=False.
    Returns a tensor shaped like tgt: (tgt_seq, batch, d_model).
    """
    memory = self.encoder(src, mask=src_mask)
    return self.decoder(tgt, memory, tgt_mask=tgt_mask)

# Self-Attention Mechanism
def scaled_dot_product_attention(Q, K, V, mask=None):
  """Compute softmax(Q K^T / sqrt(d_k)) V.

  Positions where mask == 0 have their logits filled with -1e9 before
  the softmax, so they receive effectively zero attention weight.
  """
  dim_k = Q.size(-1)
  logits = (Q @ K.transpose(-2, -1)) / (dim_k ** 0.5)
  if mask is not None:
    logits = logits.masked_fill(mask == 0, -1e9)
  weights = logits.softmax(dim=-1)
  return weights @ V
Note: Transformers use self-attention mechanisms to process sequences in parallel, enabling more efficient training than RNNs.

Attention Mechanisms

# Multi-Head Attention
class MultiHeadAttention(nn.Module):
  """Project Q/K/V into num_heads subspaces, attend in each, and merge.

  d_model must be divisible by num_heads; each head works in a
  d_model // num_heads dimensional subspace.
  """

  def __init__(self, d_model, num_heads):
    super().__init__()
    self.d_model = d_model
    self.num_heads = num_heads
    self.d_k = d_model // num_heads  # per-head dimension
    self.W_q = nn.Linear(d_model, d_model)
    self.W_k = nn.Linear(d_model, d_model)
    self.W_v = nn.Linear(d_model, d_model)
    self.W_o = nn.Linear(d_model, d_model)

  def _split_heads(self, proj, x):
    # (batch, seq, d_model) -> (batch, heads, seq, d_k)
    b = x.size(0)
    return proj(x).view(b, -1, self.num_heads, self.d_k).transpose(1, 2)

  def forward(self, Q, K, V, mask=None):
    """Run multi-head scaled dot-product attention plus output projection."""
    batch_size = Q.size(0)
    heads = scaled_dot_product_attention(
      self._split_heads(self.W_q, Q),
      self._split_heads(self.W_k, K),
      self._split_heads(self.W_v, V),
      mask,
    )
    # (batch, heads, seq, d_k) -> (batch, seq, d_model)
    merged = heads.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
    return self.W_o(merged)
Note: Multi-head attention allows the model to jointly attend to information from different representation subspaces.

LLM Architectures

GPT Architecture

# GPT-style decoder-only transformer
class GPT(nn.Module):
  """Decoder-only (GPT-style) transformer language model.

  A decoder-only block is self-attention + feed-forward under a causal
  mask -- exactly nn.TransformerEncoderLayer. torch's
  TransformerDecoderLayer is NOT used here because it requires a
  cross-attention `memory` tensor that a decoder-only model has nowhere
  to get (and calling layer(x, src_mask) would bind the mask to that
  positional `memory` argument).

  Args:
    vocab_size: size of the token vocabulary.
    d_model: embedding / hidden dimension.
    nhead: number of attention heads.
    num_layers: number of stacked transformer blocks.
    max_len: maximum supported sequence length (positional table size).
  """
  def __init__(self, vocab_size, d_model, nhead, num_layers, max_len=1000):
    super().__init__()
    self.token_embedding = nn.Embedding(vocab_size, d_model)
    self.position_embedding = nn.Embedding(max_len, d_model)
    # batch_first=True so the layers accept (batch, seq, d_model),
    # matching the embedding output built in forward().
    self.decoder_layers = nn.ModuleList([
      nn.TransformerEncoderLayer(d_model, nhead, batch_first=True)
      for _ in range(num_layers)
    ])
    self.fc_out = nn.Linear(d_model, vocab_size)

  def forward(self, src, src_mask=None):
    """Return next-token logits of shape (batch, seq, vocab_size).

    src: (batch, seq) token ids. src_mask: optional (seq, seq) additive
    attention mask, e.g. from generate_square_subsequent_mask.
    """
    batch_size, seq_len = src.shape
    positions = torch.arange(seq_len, device=src.device).expand(batch_size, seq_len)
    x = self.token_embedding(src) + self.position_embedding(positions)
    for layer in self.decoder_layers:
      # second positional arg of TransformerEncoderLayer is src_mask
      x = layer(x, src_mask)
    return self.fc_out(x)

# Causal mask for autoregressive generation
def generate_square_subsequent_mask(sz):
  """Return an additive (sz, sz) causal mask.

  Entries on or below the diagonal are 0.0; entries strictly above it
  are -1e9, so position i cannot attend to positions j > i.
  """
  blocked = torch.full((sz, sz), float(-1e9))
  return torch.triu(blocked, diagonal=1)
Note: GPT uses a decoder-only transformer architecture with causal masking to ensure autoregressive generation.

BERT Architecture

# BERT-style encoder-only transformer
class BERT(nn.Module):
  """Encoder-only (BERT-style) transformer with a [CLS] classification head.

  Args:
    vocab_size: size of the token vocabulary.
    d_model: embedding / hidden dimension.
    nhead: number of attention heads.
    num_layers: number of stacked encoder layers.
  """
  def __init__(self, vocab_size, d_model, nhead, num_layers):
    super().__init__()
    self.token_embedding = nn.Embedding(vocab_size, d_model)
    self.position_embedding = nn.Embedding(512, d_model)  # max sequence length
    self.segment_embedding = nn.Embedding(2, d_model)  # sentence A / sentence B
    # batch_first=True so the layers accept (batch, seq, d_model),
    # matching the embedding sum built in forward().
    self.encoder_layers = nn.ModuleList([
      nn.TransformerEncoderLayer(d_model, nhead, batch_first=True)
      for _ in range(num_layers)
    ])
    self.classifier = nn.Linear(d_model, 2)  # binary classification head

  def forward(self, input_ids, token_type_ids=None, attention_mask=None):
    """Return (batch, 2) classification logits from the [CLS] position.

    input_ids: (batch, seq) token ids.
    token_type_ids: optional (batch, seq) segment ids (0 or 1).
    attention_mask: optional (batch, seq), Hugging Face convention:
      1 = attend, 0 = padding.
    """
    seq_length = input_ids.size(1)
    position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
    position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
    if token_type_ids is None:
      token_type_ids = torch.zeros_like(input_ids)
    embeddings = (
      self.token_embedding(input_ids)
      + self.position_embedding(position_ids)
      + self.segment_embedding(token_type_ids)
    )
    # nn.TransformerEncoderLayer wants a boolean key-padding mask with
    # True = ignore, so invert the HF-style 1/0 attention_mask instead of
    # passing it as the (seq, seq) additive src_mask, which it is not.
    padding_mask = (attention_mask == 0) if attention_mask is not None else None
    for layer in self.encoder_layers:
      embeddings = layer(embeddings, src_key_padding_mask=padding_mask)
    return self.classifier(embeddings[:, 0, :])  # first token acts as [CLS]
Note: BERT uses an encoder-only architecture and is pre-trained using masked language modeling and next sentence prediction.

Prompt Engineering

Basic Prompt Techniques

Zero-Shot Prompting:
"Classify the text: 'The movie was fantastic with great acting.' Sentiment:"
Few-Shot Prompting:
"Text: 'This product is amazing!' Sentiment: Positive
Text: 'The service was terrible.' Sentiment: Negative
Text: 'It was okay, nothing special.' Sentiment: Neutral
Text: 'The acting was superb.' Sentiment:"
Chain-of-Thought:
"Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
A: Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. 5 + 6 = 11. The answer is 11.

Q: The cafeteria had 23 apples. They used 20 to make lunch. They bought 6 more. How many apples do they have now?"
Note: Effective prompts provide clear instructions, examples, and context to guide the model toward the desired output.

Advanced Prompt Techniques

Role-Playing:
"You are an expert software architect with 20 years of experience. Design a microservices architecture for an e-commerce platform that needs to handle 1 million daily users."
Template-Based:
"Create a marketing email for a new productivity app called 'FocusTime'.
Subject: [ catchy subject line ]
Body: [ engaging content about the app's features ]
Call-to-action: [ compelling CTA button text ]"
Conditional Generation:
"If the user is a beginner, explain machine learning in simple terms with everyday examples. If the user is an expert, provide a technical overview with mathematical formulations."
Iterative Refinement:
"First, outline the main points about climate change. Then, expand each point with supporting evidence. Finally, create a compelling conclusion that calls for action."
Note: Advanced techniques often involve structuring prompts to elicit specific behaviors, formats, or levels of detail from the model.

Fine-Tuning Techniques

Full Fine-Tuning

# Full fine-tuning with Hugging Face Transformers: every model weight is
# updated, which gives the best quality but needs the most GPU memory.
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

# Load pre-trained model and tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add padding token if it doesn't exist (GPT-2 ships without one; reusing
# EOS as padding is the standard workaround for batched training)
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token

# Prepare training arguments
training_args = TrainingArguments(
  output_dir="./results",          # where checkpoints are written
  num_train_epochs=3,
  per_device_train_batch_size=4,
  per_device_eval_batch_size=4,
  warmup_steps=500,                # linear LR warmup before decay
  weight_decay=0.01,
  logging_dir="./logs",
)

# Create Trainer instance
# NOTE(review): train_dataset / eval_dataset must be defined (already
# tokenized) elsewhere before this runs -- they are not created here.
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_dataset,
  eval_dataset=eval_dataset,
)

# Start training
trainer.train()

Parameter-Efficient Fine-Tuning

# LoRA (Low-Rank Adaptation) with PEFT: freezes the base model and trains
# small low-rank update matrices injected into selected linear layers.
from peft import LoraConfig, get_peft_model, TaskType

# Define LoRA configuration
lora_config = LoraConfig(
  task_type=TaskType.CAUSAL_LM,
  inference_mode=False,   # training mode: adapters are trainable
  r=8,                    # rank of the low-rank update matrices
  lora_alpha=32,          # scaling factor (effective scale = alpha / r)
  lora_dropout=0.1,
  # NOTE(review): "q_proj"/"v_proj" are LLaMA-style module names; GPT-2
  # uses a fused "c_attn" projection -- confirm against the base model.
  target_modules=["q_proj", "v_proj"],
)

# Apply LoRA to model (wraps it; `model` must already be loaded elsewhere)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # prints trainable vs total parameter counts

# Prompt tuning: instead of touching model weights, learn
# `num_virtual_tokens` continuous embeddings prepended to every input.
from peft import PromptTuningConfig, PromptTuningInit, get_peft_model

peft_config = PromptTuningConfig(
  task_type=TaskType.CAUSAL_LM,  # NOTE(review): TaskType imported earlier in the file
  prompt_tuning_init=PromptTuningInit.TEXT,  # initialize from real-token embeddings
  num_virtual_tokens=20,
  prompt_tuning_init_text="Classify the sentiment of this text:",
  tokenizer_name_or_path="gpt2",  # tokenizer used to embed the init text
)

# `model` must already be loaded elsewhere in the file
model = get_peft_model(model, peft_config)
Note: Parameter-efficient fine-tuning methods like LoRA and P-tuning allow adapting large models with minimal computational resources by training only small adapter modules.

Evaluation & Deployment

Model Evaluation

# Common evaluation metrics for LLMs
# NOTE(review): datasets.load_metric is deprecated and was removed in
# datasets 3.0 -- newer code should use the `evaluate` library
# (evaluate.load("bleu")) instead.
from datasets import load_metric

# Load metrics
bleu_metric = load_metric("bleu")
rouge_metric = load_metric("rouge")
perplexity_metric = load_metric("perplexity")

# Calculate BLEU score (n-gram overlap against one or more references)
predictions = ["I like to eat pizza"]
references = [["I enjoy eating pizza"]]
bleu_score = bleu_metric.compute(predictions=predictions, references=references)

# Calculate ROUGE score (recall-oriented overlap, common for summarization)
rouge_score = rouge_metric.compute(predictions=predictions, references=references)

# Human evaluation template: rubric of 1-5 ratings for manual review
human_eval_template = {
  "fluency": "How fluent and natural is the text? (1-5)",
  "relevance": "How relevant is the response to the prompt? (1-5)",
  "coherence": "How coherent and logically structured is the text? (1-5)",
  "accuracy": "How factually accurate is the content? (1-5)",
}

# Toxicity detection with a pre-trained classifier pipeline
from transformers import pipeline
toxicity_classifier = pipeline("text-classification", model="unitary/toxic-bert")
toxicity_score = toxicity_classifier("Your generated text here")

Model Deployment

# FastAPI deployment example: wraps a text-generation pipeline in a JSON API.
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline

app = FastAPI()
# Loaded once at import time so all requests reuse the same model instance.
generator = pipeline("text-generation", model="gpt2")

class GenerationRequest(BaseModel):
  # Request schema, validated by pydantic.
  prompt: str
  max_length: int = 100     # token budget including the prompt
  temperature: float = 0.7  # higher = more random sampling

@app.post("/generate")
async def generate_text(request: GenerationRequest):
  """Generate a continuation of request.prompt and return it as JSON."""
  result = generator(
    request.prompt,
    max_length=request.max_length,
    temperature=request.temperature,
    do_sample=True,  # sample instead of greedy decoding so temperature matters
  )
  # the pipeline returns a list of candidate dicts; take the first
  return {"generated_text": result[0]["generated_text"]}

# Dockerfile for deployment
FROM python:3.9-slim
WORKDIR /app
# Copy requirements first so the dependency layer is cached across code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
# Serve the FastAPI app defined in main.py with uvicorn on port 80.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80"]
Note: When deploying LLMs, consider using model optimization techniques like quantization, distillation, or specialized inference servers like TensorRT-LLM or vLLM for better performance.