Data Science and Machine Learning Fundamentals with Python
This module introduces Python's data science ecosystem and basic machine learning concepts.
Table of Contents
- NumPy Fundamentals
- Pandas for Data Manipulation
- Data Visualization with Matplotlib
- Introduction to Machine Learning
- Scikit-learn Basics
- Best Practices
NumPy Fundamentals
NumPy is the foundation of Python's scientific computing stack.
Why NumPy?
# Standard Python list operations are slow
python_list = list(range(1000000))
# Looping through is O(n) for each operation
# NumPy arrays are fast (vectorized operations)
import numpy as np
numpy_array = np.arange(1000000)
# Operations happen in C, much faster!
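To see the difference concretely, here is a rough timing sketch using the standard-library timeit module (exact numbers depend on your machine, but the vectorized NumPy sum is typically much faster):
import timeit
import numpy as np
python_list = list(range(1_000_000))
numpy_array = np.arange(1_000_000)
# Sum a million numbers with a Python loop vs. a vectorized NumPy call
loop_time = timeit.timeit(lambda: sum(python_list), number=10)
numpy_time = timeit.timeit(lambda: numpy_array.sum(), number=10)
print(f"Python sum: {loop_time:.4f} s")
print(f"NumPy sum:  {numpy_time:.4f} s")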
Creating Arrays
import numpy as np
# From Python lists
arr1 = np.array([1, 2, 3, 4, 5])
arr2 = np.array([[1, 2, 3], [4, 5, 6]]) # 2D array
# Using built-in functions
zeros = np.zeros((3, 4)) # 3x4 array of zeros
ones = np.ones((2, 3)) # 2x3 array of ones
identity = np.eye(4) # 4x4 identity matrix
range_arr = np.arange(0, 10, 2) # [0, 2, 4, 6, 8]
linspace = np.linspace(0, 1, 5) # [0, 0.25, 0.5, 0.75, 1]
random = np.random.rand(3, 3) # Random 3x3 array
# Data types
int_arr = np.array([1, 2, 3], dtype=np.int32)
float_arr = np.array([1, 2, 3], dtype=np.float64)
Array Operations
import numpy as np
a = np.array([1, 2, 3, 4, 5])
b = np.array([10, 20, 30, 40, 50])
# Vectorized arithmetic (element-wise)
print(a + b) # [11, 22, 33, 44, 55]
print(a * b) # [10, 40, 90, 160, 250]
print(a ** 2) # [1, 4, 9, 16, 25]
print(np.sqrt(a)) # [1.0, 1.41, 1.73, 2.0, 2.24]
# Aggregations
print(a.sum()) # 15
print(a.mean()) # 3.0
print(a.std()) # 1.41...
print(a.min()) # 1
print(a.max()) # 5
# Boolean operations
print(a > 2) # [False, False, True, True, True]
print(a[a > 2]) # [3, 4, 5] - boolean indexing
Array Indexing and Slicing
import numpy as np
arr = np.array([[1, 2, 3, 4],
                [5, 6, 7, 8],
                [9, 10, 11, 12]])
# Basic indexing
print(arr[0, 0]) # 1 (row 0, col 0)
print(arr[1, 2]) # 7 (row 1, col 2)
# Slicing
print(arr[0, :]) # [1, 2, 3, 4] - first row
print(arr[:, 0]) # [1, 5, 9] - first column
print(arr[0:2, 1:3]) # [[2, 3], [6, 7]] - subarray
# Fancy indexing
print(arr[[0, 2], :]) # Rows 0 and 2
Reshaping and Broadcasting
import numpy as np
# Reshaping
arr = np.arange(12)
reshaped = arr.reshape(3, 4) # 3 rows, 4 columns
flattened = reshaped.flatten() # Back to 1D
# Broadcasting - automatic shape matching
a = np.array([[1], [2], [3]]) # Shape (3, 1)
b = np.array([10, 20, 30]) # Shape (3,)
print(a + b)
# [[11, 21, 31],
# [12, 22, 32],
# [13, 23, 33]]
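A common practical use of broadcasting is column-wise normalization; in this small sketch the column means (shape (4,)) are subtracted from every row of a (3, 4) array without writing a loop:
import numpy as np
data = np.arange(12).reshape(3, 4).astype(float)  # shape (3, 4)
col_means = data.mean(axis=0)                     # shape (4,)
centered = data - col_means                       # broadcast across rows
print(centered.mean(axis=0))                      # [0. 0. 0. 0.]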
Pandas for Data Manipulation
Pandas provides powerful data structures for data analysis.
Series and DataFrames
import pandas as pd
# Series - 1D labeled array
s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
print(s['a']) # 1
print(s[s > 2]) # c, d, e
# DataFrame - 2D labeled data structure
data = {
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'city': ['NYC', 'LA', 'Chicago']
}
df = pd.DataFrame(data)
print(df)
# name age city
# 0 Alice 25 NYC
# 1 Bob 30 LA
# 2 Charlie 35 Chicago
Reading and Writing Data
import pandas as pd
# Reading data
df_csv = pd.read_csv('data.csv')
df_excel = pd.read_excel('data.xlsx')
df_json = pd.read_json('data.json')
df_sql = pd.read_sql('SELECT * FROM table', connection)
# Writing data
df.to_csv('output.csv', index=False)
df.to_excel('output.xlsx', index=False)
df.to_json('output.json')
# Common options
df = pd.read_csv('data.csv',
                 sep=',',               # Delimiter
                 header=0,              # Row to use as header
                 index_col='id',        # Column to use as index
                 usecols=['a', 'b'],    # Columns to read
                 dtype={'a': int},      # Data types
                 parse_dates=['date'])  # Parse date columns
Data Selection
import pandas as pd
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]
}, index=['x', 'y', 'z'])
# Column selection
print(df['A']) # Series
print(df[['A', 'B']]) # DataFrame
# Row selection
print(df.loc['x']) # By label
print(df.iloc[0]) # By position
print(df.loc['x':'y']) # Slice by label
print(df.iloc[0:2]) # Slice by position
# Cell selection
print(df.loc['x', 'A']) # 1
print(df.iloc[0, 0]) # 1
# Boolean selection
print(df[df['A'] > 1]) # Rows where A > 1
print(df.query('A > 1')) # Same with query syntax
Data Manipulation
import pandas as pd
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'Alice'],
    'category': ['A', 'B', 'A', 'B'],
    'value': [100, 200, 150, 175]
})
# Adding columns
df['doubled'] = df['value'] * 2
df['rank'] = df['value'].rank()
# Filtering
filtered = df[df['value'] > 150]
# Sorting
sorted_df = df.sort_values('value', ascending=False)
# Grouping
grouped = df.groupby('category')['value'].mean()
# category
# A 125.0
# B 187.5
# Multiple aggregations
agg = df.groupby('category').agg({
    'value': ['sum', 'mean', 'count'],
    'name': 'nunique'
})
# Pivot tables
pivot = df.pivot_table(
    values='value',
    index='name',
    columns='category',
    aggfunc='sum'
)
Handling Missing Data
import pandas as pd
import numpy as np
df = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12]
})
# Detecting missing data
print(df.isna()) # Boolean mask
print(df.isna().sum()) # Count per column
# Dropping missing data
df.dropna() # Drop rows with any NaN
df.dropna(how='all') # Drop rows where all NaN
df.dropna(subset=['A']) # Drop if A is NaN
# Filling missing data
df.fillna(0) # Fill with value
df.ffill() # Forward fill (fillna(method='ffill') is deprecated)
df.bfill() # Backward fill
df['A'].fillna(df['A'].mean()) # Fill with mean
Merging and Joining
import pandas as pd
# Two DataFrames
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})
# Merge (like SQL JOIN)
inner = pd.merge(df1, df2, on='key', how='inner') # Only matching
left = pd.merge(df1, df2, on='key', how='left') # All from left
right = pd.merge(df1, df2, on='key', how='right') # All from right
outer = pd.merge(df1, df2, on='key', how='outer') # All from both
# Concatenation
combined = pd.concat([df1, df2], ignore_index=True) # Stack vertically
combined = pd.concat([df1, df2], axis=1) # Stack horizontally
Data Visualization with Matplotlib
Matplotlib is Python's fundamental plotting library.
Basic Plotting
import matplotlib.pyplot as plt
import numpy as np
# Line plot
x = np.linspace(0, 10, 100)
y = np.sin(x)
plt.figure(figsize=(10, 6))
plt.plot(x, y, label='sin(x)', color='blue', linewidth=2)
plt.xlabel('x')
plt.ylabel('y')
plt.title('Sine Wave')
plt.legend()
plt.grid(True)
plt.savefig('sine_wave.png', dpi=150)
plt.show()
Multiple Plot Types
import matplotlib.pyplot as plt
import numpy as np
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Line plot
x = np.linspace(0, 10, 100)
axes[0, 0].plot(x, np.sin(x))
axes[0, 0].set_title('Line Plot')
# Scatter plot
x = np.random.rand(50)
y = np.random.rand(50)
colors = np.random.rand(50)
axes[0, 1].scatter(x, y, c=colors, cmap='viridis')
axes[0, 1].set_title('Scatter Plot')
# Bar plot
categories = ['A', 'B', 'C', 'D']
values = [23, 45, 56, 78]
axes[1, 0].bar(categories, values, color='steelblue')
axes[1, 0].set_title('Bar Plot')
# Histogram
data = np.random.randn(1000)
axes[1, 1].hist(data, bins=30, color='green', alpha=0.7)
axes[1, 1].set_title('Histogram')
plt.tight_layout()
plt.show()
Pandas Plotting Integration
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({
    'month': ['Jan', 'Feb', 'Mar', 'Apr', 'May'],
    'sales': [100, 120, 150, 130, 170],
    'expenses': [80, 90, 100, 110, 120]
})
# Pandas has built-in plotting
df.plot(x='month', y=['sales', 'expenses'], kind='bar')
plt.title('Monthly Sales vs Expenses')
plt.show()
# Other plot types
df['sales'].plot(kind='line')
df['sales'].plot(kind='hist')
df.plot(kind='scatter', x='sales', y='expenses')
Introduction to Machine Learning
Machine learning enables computers to learn patterns from data.
ML Categories
- Supervised Learning: Learn from labeled data
  - Classification: Predict categories (spam/not spam)
  - Regression: Predict continuous values (house prices)
- Unsupervised Learning: Find patterns in unlabeled data (see the clustering sketch after this list)
  - Clustering: Group similar items
  - Dimensionality Reduction: Simplify data
- Reinforcement Learning: Learn through rewards/penalties
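The later sections focus on supervised models, so here is a minimal unsupervised sketch: scikit-learn's KMeans groups a handful of made-up 2D points into two clusters without using any labels.
from sklearn.cluster import KMeans
import numpy as np
# Unlabeled 2D points that loosely form two groups (illustrative data)
X = np.array([[1.0, 1.1], [0.9, 1.0], [1.2, 0.8],
              [8.0, 8.2], [7.9, 8.1], [8.3, 7.7]])
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)   # cluster assignment for each point
print(labels)                    # e.g. [0 0 0 1 1 1] (cluster ids are arbitrary)
print(kmeans.cluster_centers_)   # learned group centers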
The ML Workflow
1. Collect Data
2. Explore & Visualize
3. Prepare Data (clean, transform)
4. Split Data (train/test)
5. Select Model
6. Train Model
7. Evaluate Model
8. Tune Hyperparameters
9. Deploy Model
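As a compressed illustration of steps 3-8, the sketch below runs the loop once on the built-in iris dataset; each step is expanded in the sections that follow.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
X, y = load_iris(return_X_y=True)                      # steps 1-2: data already collected/explored
X_train, X_test, y_train, y_test = train_test_split(   # step 4: split
    X, y, test_size=0.2, random_state=42)
pipe = Pipeline([('scaler', StandardScaler()),                 # step 3: prepare (scaling)
                 ('clf', LogisticRegression(max_iter=200))])   # step 5: select model
grid = GridSearchCV(pipe, {'clf__C': [0.1, 1, 10]}, cv=5)      # step 8: tune hyperparameters
grid.fit(X_train, y_train)                                     # step 6: train
print(grid.best_params_, grid.score(X_test, y_test))           # step 7: evaluate on held-out data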
Scikit-learn Basics
Scikit-learn is Python's premier machine learning library.
Train-Test Split
from sklearn.model_selection import train_test_split
# X = features, y = target
X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
y = [0, 0, 1, 1, 1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,    # 20% for testing
    random_state=42   # Reproducibility
)
Classification Example
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
# Load dataset
iris = load_iris()
X, y = iris.data, iris.target
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Create and train model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Evaluate
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred))
Regression Example
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
# load_boston was removed from scikit-learn 1.2, so create synthetic sample data instead
X = np.random.rand(100, 3) # 100 samples, 3 features
y = 3*X[:, 0] + 2*X[:, 1] + X[:, 2] + np.random.randn(100)*0.1
# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Train
model = LinearRegression()
model.fit(X_train, y_train)
# Predict
y_pred = model.predict(X_test)
# Evaluate
print(f"MSE: {mean_squared_error(y_test, y_pred):.4f}")
print(f"R² Score: {r2_score(y_test, y_pred):.4f}")
print(f"Coefficients: {model.coef_}")
Feature Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
# Numerical scaling
X = np.array([[1, 100], [2, 200], [3, 300]])
# Standardization (mean=0, std=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Normalization (0-1 range)
normalizer = MinMaxScaler()
X_normalized = normalizer.fit_transform(X)
# Categorical encoding
categories = ['cat', 'dog', 'cat', 'bird']
# Label encoding (for ordinal)
le = LabelEncoder()
encoded = le.fit_transform(categories) # [1, 2, 1, 0]
# One-hot encoding (for nominal)
ohe = OneHotEncoder(sparse_output=False)  # use sparse=False on scikit-learn < 1.2
one_hot = ohe.fit_transform(np.array(categories).reshape(-1, 1))
Pipelines
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Create pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])
# Use like a single model (X_train, y_train, X_test, X, y assumed from an earlier split)
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
# Cross-validation
scores = cross_val_score(pipeline, X, y, cv=5)
print(f"CV Scores: {scores}")
print(f"Mean: {scores.mean():.3f} (+/- {scores.std()*2:.3f})")
Model Selection
from sklearn.model_selection import GridSearchCV
# Define parameter grid
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__max_iter': [100, 200]
}
# Grid search
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='accuracy'
)
grid_search.fit(X_train, y_train)
print(f"Best params: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.3f}")
best_model = grid_search.best_estimator_
Best Practices
Data Science Workflow Tips
# 1. Always explore your data first
df.head()
df.info()
df.describe()
df.isnull().sum()
# 2. Handle missing values appropriately
# - Don't just drop without understanding why
# - Consider imputation strategies
# 3. Visualize distributions
df['column'].hist()
df.boxplot()
# 4. Check for correlations
df.corr()
# 5. Split data BEFORE preprocessing
# Prevents data leakage
X_train, X_test, y_train, y_test = train_test_split(X, y)
scaler = StandardScaler()
scaler.fit(X_train) # Fit only on training data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
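For the imputation strategies mentioned in tip #2, scikit-learn's SimpleImputer is one option; this sketch (with tiny made-up arrays) fills missing values with the column mean, fitting on the training split only to avoid leakage:
from sklearn.impute import SimpleImputer
import numpy as np
X_tr = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, 6.0]])  # illustrative training data
X_te = np.array([[np.nan, 5.0]])                          # illustrative test data
imputer = SimpleImputer(strategy='mean')   # 'median' or 'most_frequent' also work
X_tr_imp = imputer.fit_transform(X_tr)     # learns column means from training data only
X_te_imp = imputer.transform(X_te)         # reuses those means on the test data
print(X_te_imp)                            # [[4. 5.]] - first column filled with the training mean (1+7)/2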
Common Pitfalls to Avoid
# BAD: Fitting scaler on all data (data leakage)
scaler.fit(X) # Includes test data!
# GOOD: Fit only on training data
scaler.fit(X_train)
# BAD: Not checking for class imbalance
# Model may always predict majority class
# GOOD: Check distribution
print(pd.Series(y).value_counts())  # wrap y in a Series if it is a NumPy array
# BAD: Using accuracy for imbalanced data
# 95% accuracy with 95% majority class is useless
# GOOD: Use precision, recall, F1, ROC-AUC
from sklearn.metrics import f1_score, roc_auc_score
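A short sketch of those metrics on a made-up imbalanced example (8 negatives, 2 positives):
from sklearn.metrics import f1_score, roc_auc_score
y_true = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
y_pred = [0, 0, 0, 0, 0, 0, 0, 1, 1, 0]                        # hard class predictions
y_score = [0.1, 0.2, 0.1, 0.3, 0.2, 0.1, 0.3, 0.6, 0.9, 0.4]   # predicted P(class=1)
print(f1_score(y_true, y_pred))        # balances precision and recall for the minority class
print(roc_auc_score(y_true, y_score))  # needs scores/probabilities, not hard labels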
Key Libraries Summary
| Library | Purpose |
|---|---|
| NumPy | Numerical computing, arrays |
| Pandas | Data manipulation, analysis |
| Matplotlib | Basic plotting |
| Seaborn | Statistical visualization |
| Scikit-learn | Machine learning |
| Jupyter | Interactive notebooks |
Next Steps
After mastering these fundamentals:
- Deep Learning: TensorFlow, PyTorch
- Advanced Visualization: Plotly, Bokeh
- Big Data: Dask, PySpark
- Natural Language Processing: NLTK, spaCy
- Computer Vision: OpenCV, PIL