Python Basics
Data Structures
# Lists
my_list = [1, 2, 3, 'apple', 'banana']
my_list.append('orange') # Add element
my_list[0] # Access first element
# Dictionaries
my_dict = {'name': 'John', 'age': 30, 'city': 'New York'}
my_dict['age'] # Access value
my_dict['occupation'] = 'Engineer' # Add new key-value pair
# Tuples (immutable)
my_tuple = (1, 2, 3, 'apple')
# Sets (unique elements)
my_set = {1, 2, 3, 3, 4} # Results in {1, 2, 3, 4}
# NumPy Arrays
import numpy as np
arr = np.array([1, 2, 3, 4, 5])
arr.shape # Get dimensions
Control Flow
# If-elif-else statements
x = 10
if x > 10:
    print("Greater than 10")
elif x == 10:
    print("Equal to 10")
else:
    print("Less than 10")
# For loops
for i in range(5):
    print(i)
# List comprehension
squares = [x**2 for x in range(10)]
# While loop
count = 0
while count < 5:
    print(count)
    count += 1
# Functions
def add_numbers(a, b):
    return a + b
result = add_numbers(5, 3)
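A short example combining the constructs above (a function, a for loop, and an if test); the function name and values are illustrative, not part of any library:
def count_evens(numbers):
    """Count the even values in a list."""
    count = 0
    for n in numbers:
        if n % 2 == 0:
            count += 1
    return count

count_evens([1, 2, 3, 4, 5]) # Returns 2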
Pandas Data Manipulation
DataFrames Basics
import pandas as pd
# Creating DataFrame
df = pd.DataFrame({
    'Name': ['John', 'Anna', 'Peter', 'Linda'],
    'Age': [28, 24, 35, 32],
    'City': ['New York', 'Paris', 'Berlin', 'London']
})
# Reading data
df = pd.read_csv('data.csv') # From CSV
df = pd.read_excel('data.xlsx') # From Excel
# Basic operations
df.head() # First 5 rows
df.info() # DataFrame info
df.describe() # Statistical summary
df.shape # (rows, columns)
df.columns # Column names
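The matching write operations are useful alongside the readers above; the filenames here are placeholders:
df.to_csv('output.csv', index=False) # Write to CSV without the index column
df.to_excel('output.xlsx', index=False) # Write to Excel (requires the openpyxl package)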
Data Selection
# Selecting columns
df['Name'] # Single column
df[['Name', 'Age']] # Multiple columns
# Selecting rows
df.iloc[0] # First row by index
df.loc[0] # First row by label
df[df['Age'] > 30] # Filter rows
# Boolean indexing
df[(df['Age'] > 30) & (df['City'] == 'London')]
# Setting values
df.loc[0, 'Age'] = 29 # Set specific value
# Adding new column
df['Senior'] = df['Age'] > 30
# Dropping columns
df.drop('City', axis=1) # Returns a copy; add inplace=True to modify df itself
Data Cleaning
# Handling missing values
df.isnull().sum() # Count missing values
df.dropna() # Drop rows with missing values
df.fillna(0) # Fill missing values with 0
df.fillna(df.mean(numeric_only=True)) # Fill with column means (numeric columns only)
# Removing duplicates
df.drop_duplicates()
# Data type conversion
df['Age'] = df['Age'].astype('int')
# String operations
df['Name'] = df['Name'].str.upper()
df['Name'] = df['Name'].str.replace(' ', '_')
# Renaming columns
df.rename(columns={'Name': 'Full_Name'}, inplace=True)
# Resetting index
df.reset_index(drop=True, inplace=True)
Grouping & Aggregation
# Group by operations
grouped = df.groupby('City')
grouped['Age'].mean() # Mean age by city
# Multiple aggregations
df.groupby('City').agg({
    'Age': ['mean', 'min', 'max', 'count'],
    'Salary': 'sum'  # Assumes df also has a 'Salary' column
})
# Pivot tables
pd.pivot_table(df, values='Age', index='City',
               columns='Senior', aggfunc='mean')
# Sorting values
df.sort_values('Age', ascending=False)
# Value counts
df['City'].value_counts()
# Applying functions
df['Age'].apply(lambda x: x * 12) # Convert to months
Data Visualization
Matplotlib
import matplotlib.pyplot as plt
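import numpy as np
# Example data assumed by the plotting snippets below (any comparable arrays work)
x = np.linspace(0, 10, 50)
y = np.sin(x)
data = np.random.randn(1000)
categories = ['A', 'B', 'C']
values = [5, 3, 7]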
# Line plot
plt.plot(x, y)
plt.title('Line Plot')
plt.xlabel('X Axis')
plt.ylabel('Y Axis')
plt.show()
# Scatter plot
plt.scatter(x, y)
plt.title('Scatter Plot')
plt.show()
# Histogram
plt.hist(data, bins=30)
plt.title('Histogram')
plt.show()
# Bar chart
plt.bar(categories, values)
plt.title('Bar Chart')
plt.show()
# Subplots
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
axes[0, 0].plot(x, y)
axes[0, 1].scatter(x, y)
axes[1, 0].hist(data)
axes[1, 1].bar(categories, values)
plt.tight_layout()
plt.show()
Seaborn
import seaborn as sns
# Set style
sns.set_style('whitegrid')
# Distribution plot
sns.histplot(data=df, x='Age', kde=True)
plt.show()
# Box plot
sns.boxplot(data=df, x='City', y='Age')
plt.show()
# Violin plot
sns.violinplot(data=df, x='City', y='Age')
plt.show()
# Count plot
sns.countplot(data=df, x='City')
plt.show()
# Heatmap
corr = df.corr(numeric_only=True) # Correlation matrix over numeric columns
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()
# Pair plot
sns.pairplot(df, hue='City')
plt.show()
Machine Learning
Scikit-learn Basics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Prepare data
X = df.drop('target', axis=1) # Features (assumes df has a 'target' column)
y = df['target']
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train model
model = LinearRegression()
model.fit(X_train_scaled, y_train)
# Predict
y_pred = model.predict(X_test_scaled)
# Evaluate
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
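The scaling and fitting steps above can also be bundled in a scikit-learn Pipeline so the scaler is only ever fit on the training data; a minimal sketch using the same split and estimators (not a required part of the workflow):
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),  # Same scaler as above
    ('model', LinearRegression())  # Same estimator as above
])
pipe.fit(X_train, y_train)     # Scaler statistics come from the training set only
y_pred = pipe.predict(X_test)  # Test data is scaled and predicted in one call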
Classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC()
}
# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'{name} Results:')
    print(classification_report(y_test, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print('\n' + '=' * 50 + '\n')
# Cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5)
print(f'Cross-validation scores: {scores}')
print(f'Average score: {scores.mean()}')
Model Evaluation
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, roc_curve
)
# For classification models (binary labels assumed)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
y_pred_prob = model.predict_proba(X_test)[:, 1] # Positive-class probabilities (model must support predict_proba)
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'ROC AUC: {roc_auc:.2f}')
# ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
# For regression models
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score
)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}
# Initialize model
model = RandomForestClassifier()
# Grid search
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy'
)
grid_search.fit(X_train, y_train)
# Best parameters and score
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')
# Randomized search (faster for large parameter spaces)
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42
)
random_search.fit(X_train, y_train)
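Either search object keeps the refit best model, which can then be evaluated on held-out data; a brief sketch:
best_model = grid_search.best_estimator_  # Refit on the full training set by default
print(f'Test accuracy of tuned model: {best_model.score(X_test, y_test):.2f}')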
Statistics
Descriptive Statistics
import numpy as np
import scipy.stats as stats
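# Example data assumed by the snippets below (any 1-D numeric arrays work)
rng = np.random.default_rng(42)
data = rng.normal(loc=50, scale=10, size=1000)
x = rng.normal(size=100)
y = 2 * x + rng.normal(scale=0.5, size=100)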
# Measures of central tendency
mean = np.mean(data)
median = np.median(data)
mode = stats.mode(data)
# Measures of dispersion
variance = np.var(data)
std_dev = np.std(data)
range_val = np.ptp(data) # Peak to peak (max - min)
iqr = stats.iqr(data) # Interquartile range
# Percentiles and quantiles
q1 = np.percentile(data, 25) # First quartile
q3 = np.percentile(data, 75) # Third quartile
# Shape of distribution
skewness = stats.skew(data)
kurtosis = stats.kurtosis(data)
# Correlation
correlation = np.corrcoef(x, y)[0, 1]
# Covariance
covariance = np.cov(x, y)[0, 1]
Inferential Statistics
# Hypothesis testing
# T-test (compare means)
t_stat, p_value = stats.ttest_ind(group1, group2)
# Chi-square test (categorical data)
from scipy.stats import chi2_contingency
chi2, p, dof, expected = chi2_contingency(contingency_table)
# ANOVA (compare means across multiple groups)
f_stat, p_value = stats.f_oneway(group1, group2, group3)
# Confidence intervals
std_err = stats.sem(data) # Standard error of the mean
ci = stats.norm.interval(0.95, loc=mean, scale=std_err)
# Normal distribution
z_score = (x - mean) / std_dev
# Probability density function
pdf = stats.norm.pdf(x, mean, std_dev)
# Cumulative distribution function
cdf = stats.norm.cdf(x, mean, std_dev)
# Sampling
sample = np.random.choice(data, size=100, replace=False)
# Bootstrap sampling
bootstrap_means = [np.mean(np.random.choice(data, size=len(data)))
                   for _ in range(1000)]
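# A 95% percentile interval from the bootstrap distribution (illustrative follow-up)
ci_lower, ci_upper = np.percentile(bootstrap_means, [2.5, 97.5])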
Quick Reference
Common Libraries
# Data manipulation
import pandas as pd
import numpy as np
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Machine learning
import sklearn
import tensorflow as tf
import torch
import xgboost as xgb
# Statistics
import scipy.stats as stats
import statsmodels.api as sm
# Natural language processing
import nltk
import spacy
import gensim
# Web scraping
import requests
from bs4 import BeautifulSoup # Installed as the beautifulsoup4 package
import scrapy
# Database access
import sqlite3
import pymysql
import psycopg2
import sqlalchemy
Useful Resources
- Online Courses: Coursera, edX, Udacity, DataCamp
- Books: "Python for Data Analysis", "An Introduction to Statistical Learning"
- YouTube Channels: StatQuest, Corey Schafer, Krish Naik
- Communities: Stack Overflow, Kaggle, Reddit (r/datascience)
- Practice Platforms: Kaggle, HackerRank, LeetCode
- Blogs: Towards Data Science, KDnuggets, Analytics Vidhya
- Documentation: Official docs for pandas, NumPy, scikit-learn