pandas basics — Python

pandas basics

data science ml

Run notebook
pandas basics

Converted from 02_pandas_basics.ipynb for web reading.

Code cell 1

import numpy as np
import pandas as pd
# Report which pandas release this notebook is running against.
pandas_version = pd.__version__
print(f"Pandas version: {pandas_version}")

Series

Code cell 3

s = pd.Series([10, 20, 30, 40, 50], index=['a', 'b', 'c', 'd', 'e'])
print(f"Series:\n{s}")
print(f"\ns['c'] = {s['c']}")
print(f"\ns > 25:\n{s[s > 25]}")

DataFrame Creation

Code cell 5

# From dictionary: each key becomes a column, each list supplies that
# column's values (all lists have the same length, one entry per row).
data = dict(
    name=['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    age=[25, 30, 35, 28, 32],
    city=['NYC', 'LA', 'Chicago', 'NYC', 'LA'],
    salary=[50000, 60000, 75000, 55000, 65000],
)
df = pd.DataFrame(data)
# Bare expression as the last line of the cell so the notebook renders it.
df

Code cell 6

# Quick structural overview of df: dimensions, column labels, per-column dtypes.
frame_shape = df.shape
column_names = df.columns.tolist()
column_dtypes = df.dtypes

print(f"Shape: {frame_shape}")
print(f"Columns: {column_names}")
print(f"\nDtypes:\n{column_dtypes}")

Data Selection

Code cell 8

# A single column label returns a Series.
name_column = df['name']
print(f"df['name']:\n{name_column}")

# A list of column labels returns a DataFrame restricted to those columns.
name_and_salary = df[['name', 'salary']]
print(f"\ndf[['name', 'salary']]:\n{name_and_salary}")

Code cell 9

# .loc selects by index label; here the label 0 is looked up.
row_label_0 = df.loc[0]
print(f"df.loc[0] (first row):\n{row_label_0}")

# .iloc selects by integer position; the slice end is exclusive.
leading_rows = df.iloc[0:2]
print(f"\ndf.iloc[0:2] (first 2 rows):\n{leading_rows}")

# A (row label, column label) pair picks out a single cell.
salary_at_label_1 = df.loc[1, 'salary']
print(f"\ndf.loc[1, 'salary'] = {salary_at_label_1}")

Filtering

Code cell 11

# Boolean conditions: the comparison yields one True/False per row, and
# indexing df with that mask keeps only the True rows.
is_high_salary = df['salary'] > 55000
high_salary = df[is_high_salary]
print(f"Salary > 55000:\n{high_salary}")

Code cell 12

# Build each row mask separately, then combine them element-wise with &.
in_nyc = df['city'] == 'NYC'
under_30 = df['age'] < 30
nyc_young = df[in_nyc & under_30]
print(f"NYC and age < 30:\n{nyc_young}")

Adding and Modifying Columns

Code cell 14

# Derived columns, each computed with whole-column arithmetic on df.
df['bonus'] = df['salary'] * 0.1
df['total_comp'] = df['salary'] + df['bonus']

# Label each row by age. np.where evaluates the comparison on the entire
# column at once instead of calling a Python lambda once per row via .apply;
# the resulting labels are the same ('Young' when age < 30, else 'Senior').
df['age_group'] = np.where(df['age'] < 30, 'Young', 'Senior')

# Bare expression as the last line of the cell so the notebook renders it.
df

Grouping and Aggregation

Code cell 16

# Group by single column: split the salary column by city, then compute
# several summary statistics per group in one call.
salary_groups = df.groupby('city')['salary']
by_city = salary_groups.agg(['mean', 'sum', 'count'])
print(f"Salary by city:\n{by_city}")

Code cell 17

# Multiple aggregations: map each column to the statistic(s) wanted for it.
agg_spec = {'salary': ['mean', 'max'], 'age': 'mean'}
agg_result = df.groupby('city').agg(agg_spec)
# Bare expression as the last line of the cell so the notebook renders it.
agg_result

Missing Data

Code cell 19

# Small frame with gaps: np.nan marks the missing entries in columns A and B,
# while column C is complete.
columns_with_gaps = dict(
    A=[1, 2, np.nan, 4],
    B=[5, np.nan, np.nan, 8],
    C=[9, 10, 11, 12],
)
df_missing = pd.DataFrame(columns_with_gaps)
print(f"With missing data:\n{df_missing}")

# isna() flags each missing cell; summing the flags counts them per column.
missing_per_column = df_missing.isna().sum()
print(f"\nMissing count:\n{missing_per_column}")

Code cell 20

# Fill missing values: compute the per-column means first, then use them as
# the replacement for the missing cells in the matching column.
column_means = df_missing.mean()
filled = df_missing.fillna(column_means)
print(f"Filled with mean:\n{filled}")

Merge Example

Code cell 22

# Two small frames that share a 'key' column: 'A' and 'B' appear in both,
# 'C' only in df1 and 'D' only in df2.
left_columns = {'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]}
right_columns = {'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]}
df1 = pd.DataFrame(left_columns)
df2 = pd.DataFrame(right_columns)

print(f"df1:\n{df1}")
print(f"\ndf2:\n{df2}")

Code cell 23

# Outer join on 'key': keys from both frames are kept, and a value column is
# left missing where the other frame has no matching key.
merged = df1.merge(df2, how='outer', on='key')
print(f"Outer merge:\n{merged}")