Python Basics

Data Structures

# Lists: ordered, mutable, may mix element types
my_list = [1, 2, 3, 'apple', 'banana']
my_list.append('orange') # Add element to the end (in place)
my_list[0] # Access first element (0-based); bare expression, value is discarded

# Dictionaries: key -> value mappings (keys must be hashable)
my_dict = {'name': 'John', 'age': 30, 'city': 'New York'}
my_dict['age'] # Access value by key (raises KeyError if the key is missing)
my_dict['occupation'] = 'Engineer' # Add new key-value pair

# Tuples (immutable): fixed contents, usable as dict keys / set elements
my_tuple = (1, 2, 3, 'apple')

# Sets (unique elements): duplicates are dropped on construction
my_set = {1, 2, 3, 3, 4} # Results in {1, 2, 3, 4}

# NumPy Arrays: homogeneous dtype, support vectorized arithmetic
import numpy as np
arr = np.array([1, 2, 3, 4, 5])
arr.shape # Get dimensions as a tuple, here (5,)

Control Flow

# If-elif-else statements: branches are tested top to bottom, first match wins
x = 10
if x > 10:
    print("Greater than 10")
elif x == 10:
    print("Equal to 10")
else:
    print("Less than 10")

# For loops: range(5) yields 0, 1, 2, 3, 4 (stop value excluded)
for i in range(5):
    print(i)

# List comprehension: build a list in a single expression (squares of 0..9)
squares = [x**2 for x in range(10)]

# While loop: repeats until the condition becomes false
count = 0
while count < 5:
    print(count)
    count += 1

# Functions: def binds a name to a reusable, parameterized body
def add_numbers(a, b):
    """Return the sum of *a* and *b*."""
    total = a + b
    return total

result = add_numbers(5, 3)

Pandas Data Manipulation

DataFrames Basics

import pandas as pd

# Creating DataFrame from a dict of column-name -> column-values
df = pd.DataFrame({
    'Name': ['John', 'Anna', 'Peter', 'Linda'],
    'Age': [28, 24, 35, 32],
    'City': ['New York', 'Paris', 'Berlin', 'London']
})

# Reading data (each call REPLACES the df built above; the files must exist)
df = pd.read_csv('data.csv') # From CSV
df = pd.read_excel('data.xlsx') # From Excel (needs an engine such as openpyxl)

# Basic operations (bare expressions: useful interactively, discarded in a script)
df.head() # First 5 rows
df.info() # DataFrame info (dtypes, non-null counts); prints and returns None
df.describe() # Statistical summary of numeric columns
df.shape # (rows, columns)
df.columns # Column names

Data Selection

# Selecting columns
df['Name'] # Single column -> Series
df[['Name', 'Age']] # Multiple columns -> DataFrame (note the list inside [])

# Selecting rows
df.iloc[0] # First row by integer position
df.loc[0] # First row by index label (same row here because labels are 0..n-1)
df[df['Age'] > 30] # Filter rows with a boolean mask

# Boolean indexing: combine conditions with & / |, each wrapped in parentheses
df[(df['Age'] > 30) & (df['City'] == 'London')]

# Setting values
df.loc[0, 'Age'] = 29 # Set specific value (row label 0, column 'Age')

# Adding new column (vectorized comparison: one boolean per row)
df['Senior'] = df['Age'] > 30

# Dropping columns (axis=1 means columns; inplace=True mutates df directly)
df.drop('City', axis=1, inplace=True)

Data Cleaning

# Handling missing values (these return new DataFrames unless inplace=True)
df.isnull().sum() # Count missing values per column
df.dropna() # Drop rows containing any missing value
df.fillna(0) # Fill missing values with 0
# numeric_only=True: plain df.mean() raises TypeError on non-numeric columns
# (e.g. 'Name', 'City') in pandas >= 2.0
df.fillna(df.mean(numeric_only=True)) # Fill with per-column means

# Removing duplicates
df.drop_duplicates()

# Data type conversion
df['Age'] = df['Age'].astype('int')

# String operations (vectorized via the .str accessor)
df['Name'] = df['Name'].str.upper()
df['Name'] = df['Name'].str.replace(' ', '_')

# Renaming columns
df.rename(columns={'Name': 'Full_Name'}, inplace=True)

# Resetting index (drop=True discards the old index instead of adding a column)
df.reset_index(drop=True, inplace=True)

Grouping & Aggregation

# Group by operations (lazy: nothing is computed until an aggregation is applied)
grouped = df.groupby('City')
grouped['Age'].mean() # Mean age by city

# Multiple aggregations per column in one pass
# NOTE(review): assumes a 'Salary' column exists -- not created earlier in this sheet
df.groupby('City').agg({
    'Age': ['mean', 'min', 'max', 'count'],
    'Salary': 'sum'
})

# Pivot tables: rows = City, columns = Senior flag, cells = mean Age
pd.pivot_table(df, values='Age', index='City',
    columns='Senior', aggfunc='mean')

# Sorting values (returns a new sorted DataFrame; df itself is unchanged)
df.sort_values('Age', ascending=False)

# Value counts: frequency of each distinct city, most common first
df['City'].value_counts()

# Applying functions element-wise to a Series
df['Age'].apply(lambda x: x * 12) # Convert to months

Data Visualization

Matplotlib

import matplotlib.pyplot as plt

# NOTE(review): x, y, data, categories, values are not defined in this sheet --
# they are placeholders the reader supplies.

# Line plot
plt.plot(x, y)
plt.title('Line Plot')
plt.xlabel('X Axis')
plt.ylabel('Y Axis')
plt.show()

# Scatter plot
plt.scatter(x, y)
plt.title('Scatter Plot')
plt.show()

# Histogram: bin the data into 30 intervals and count occurrences
plt.hist(data, bins=30)
plt.title('Histogram')
plt.show()

# Bar chart
plt.bar(categories, values)
plt.title('Bar Chart')
plt.show()

# Subplots: a 2x2 grid of axes in one 10x8-inch figure,
# each axis addressed by its [row, col] position
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
axes[0, 0].plot(x, y)
axes[0, 1].scatter(x, y)
axes[1, 0].hist(data)
axes[1, 1].bar(categories, values)
plt.tight_layout() # Avoid overlapping labels between subplots
plt.show()

Seaborn

import seaborn as sns

# Set a global plot style for all subsequent figures
sns.set_style('whitegrid')

# Distribution plot: histogram with a kernel density estimate overlaid
sns.histplot(data=df, x='Age', kde=True)
plt.show()

# Box plot: median / quartiles / outliers of Age per city
sns.boxplot(data=df, x='City', y='Age')
plt.show()

# Violin plot: box plot plus a mirrored density curve
sns.violinplot(data=df, x='City', y='Age')
plt.show()

# Count plot: bar chart of category frequencies
sns.countplot(data=df, x='City')
plt.show()

# Heatmap of the correlation matrix.
# numeric_only=True: plain df.corr() raises on non-numeric columns
# (e.g. 'Name', 'City') in pandas >= 2.0
corr = df.corr(numeric_only=True)
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()

# Pair plot: scatter-plot matrix of numeric columns, colored by 'City'
sns.pairplot(df, hue='City')
plt.show()

Machine Learning

Scikit-learn Basics

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Prepare data: X = all feature columns, y = the column to predict
# NOTE(review): assumes df has a 'target' column
X = df.drop('target', axis=1)
y = df['target']

# Split data: 80% train / 20% test; random_state fixes the shuffle for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features to zero mean / unit variance.
# fit_transform on train only; transform on test reuses the train statistics
# (fitting the scaler on test data would leak information).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model (ordinary least squares)
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict on the held-out test set
y_pred = model.predict(X_test_scaled)

# Evaluate: mean of squared residuals (lower is better)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Classification

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Initialize models: name -> untrained estimator; all share the fit/predict API
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC()
}

# Train and evaluate each model on the same train/test split
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f'{name} Results:')
    print(classification_report(y_test, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    # Fixed: '\n' is a real newline; the original '\\n' printed a literal "\n"
    print('\n' + '='*50 + '\n')

# Cross-validation: 5-fold score on the full data set.
# NOTE: `model` still refers to the LAST entry of the loop above (the SVC).
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5)
print(f'Cross-validation scores: {scores}')
print(f'Average score: {scores.mean()}')

Model Evaluation

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, roc_curve
)

# For classification models (default averaging assumes binary labels)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ROC AUC needs probability scores, not hard 0/1 predictions --
# take the positive-class column of predict_proba.
# (This also defines y_pred_prob, which the ROC curve below uses.)
y_pred_prob = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_prob)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'ROC AUC: {roc_auc:.2f}')

# ROC Curve: true-positive rate vs false-positive rate over all thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

# For regression models
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score
)

mae = mean_absolute_error(y_test, y_pred)  # average absolute residual
mse = mean_squared_error(y_test, y_pred)   # average squared residual
rmse = np.sqrt(mse)                        # back in the target's units
r2 = r2_score(y_test, y_pred)              # 1.0 = perfect, 0.0 = predicts the mean

Hyperparameter Tuning

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Define parameter grid: GridSearchCV tries every combination
# (3 * 3 * 3 = 27 candidates, each fitted cv=5 times)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Initialize model (the grid keys above are RandomForestClassifier arguments)
model = RandomForestClassifier()

# Grid search: exhaustive search over param_grid with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy'
)

grid_search.fit(X_train, y_train)

# Best parameters and score (mean CV accuracy of the best candidate)
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')

# Randomized search (faster for large parameter spaces): samples n_iter=10
# random combinations instead of trying all of them
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42
)

random_search.fit(X_train, y_train)

Statistics

Descriptive Statistics

import numpy as np
import scipy.stats as stats

# Measures of central tendency
# NOTE(review): `data`, `x`, `y` are placeholders supplied by the reader
mean = np.mean(data)
median = np.median(data)
mode = stats.mode(data) # Returns a ModeResult (most common value and its count)

# Measures of dispersion
variance = np.var(data) # Population variance (ddof=0); pass ddof=1 for sample variance
std_dev = np.std(data) # Population standard deviation (ddof=0)
range_val = np.ptp(data) # Peak to peak (max - min)
iqr = stats.iqr(data) # Interquartile range (Q3 - Q1)

# Percentiles and quantiles
q1 = np.percentile(data, 25) # First quartile
q3 = np.percentile(data, 75) # Third quartile

# Shape of distribution
skewness = stats.skew(data) # Asymmetry: > 0 means a longer right tail
kurtosis = stats.kurtosis(data) # Tail weight relative to normal (Fisher: normal = 0)

# Correlation (Pearson coefficient, in [-1, 1]; off-diagonal of the 2x2 matrix)
correlation = np.corrcoef(x, y)[0, 1]

# Covariance (off-diagonal entry of the 2x2 covariance matrix)
covariance = np.cov(x, y)[0, 1]

Inferential Statistics

# Hypothesis testing
# NOTE(review): group1/group2/group3, contingency_table, x, std_err are
# placeholders supplied by the reader.

# T-test (compare means of two independent samples; small p-value -> means differ)
t_stat, p_value = stats.ttest_ind(group1, group2)

# Chi-square test (categorical data: independence of rows and columns)
from scipy.stats import chi2_contingency
chi2, p, dof, expected = chi2_contingency(contingency_table)

# ANOVA (compare means across multiple groups)
f_stat, p_value = stats.f_oneway(group1, group2, group3)

# Confidence intervals (95%, normal approximation around the mean)
ci = stats.norm.interval(0.95, loc=mean, scale=std_err)

# Normal distribution: z-score = distance from the mean in standard deviations
z_score = (x - mean) / std_dev

# Probability density function
pdf = stats.norm.pdf(x, mean, std_dev)

# Cumulative distribution function: P(X <= x)
cdf = stats.norm.cdf(x, mean, std_dev)

# Sampling: 100 elements drawn WITHOUT replacement
sample = np.random.choice(data, size=100, replace=False)

# Bootstrap sampling: 1000 means of same-size resamples drawn WITH replacement
bootstrap_means = [np.mean(np.random.choice(data, size=len(data)))
    for _ in range(1000)]

Quick Reference

Common Libraries

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Machine learning
import sklearn
import tensorflow as tf
import torch
import xgboost as xgb

# Statistics
import scipy.stats as stats
import statsmodels.api as sm

# Natural language processing
import nltk
import spacy
import gensim

# Web scraping
import requests
import bs4  # installed as "beautifulsoup4" but imported as "bs4"
import scrapy

# Database access
import sqlite3
import pymysql
import psycopg2
import sqlalchemy

Useful Resources

  • Online Courses: Coursera, edX, Udacity, DataCamp
  • Books: "Python for Data Analysis", "Introduction to Statistical Learning"
  • YouTube Channels: StatQuest, Corey Schafer, Krish Naik
  • Communities: Stack Overflow, Kaggle, Reddit (r/datascience)
  • Practice Platforms: Kaggle, HackerRank, LeetCode
  • Blogs: Towards Data Science, KDnuggets, Analytics Vidhya
  • Documentation: Official docs for pandas, NumPy, scikit-learn