Pandas Basics

Create a Pandas Series and DataFrame from different types such as list, numpy array and dictionary

# From list
import pandas as pd
my_list = [10,20,30,40,50,60]
series_from_list = pd.Series(my_list)
print(series_from_list)
## 0    10
## 1    20
## 2    30
## 3    40
## 4    50
## 5    60
## dtype: int64
# From Numpy array
import numpy as np
array_data = np.array([5, 15, 25, 35])
print(array_data)
## [ 5 15 25 35]
series_from_array = pd.Series(array_data)
print(series_from_array)
## 0     5
## 1    15
## 2    25
## 3    35
## dtype: int64
# From Dictionary
dic_data = {'a':1,'b':2 , 'c':3 }
series_from_dict= pd.Series(dic_data)
print(series_from_dict)
## a    1
## b    2
## c    3
## dtype: int64
# from list of list
list_of_lists =  [[1,'Alice',22],[2,'Alice',22],[3,'Alice',22]]
df_from_list = pd.DataFrame(list_of_lists,columns=['Id','Name','Age'])
print(df_from_list)
##    Id   Name  Age
## 0   1  Alice   22
## 1   2  Alice   22
## 2   3  Alice   22
# from numpy array
# Create a DataFrame from a NumPy array
array_2d = np.array([[1, 'Alice', 22], [2, 'Bob', 25], [3, 'Charlie', 30]])
df_from_array = pd.DataFrame(array_2d, columns=['ID', 'Name', 'Age'])

print("\nDataFrame from NumPy array:")
## 
## DataFrame from NumPy array:
print(df_from_array)
##   ID     Name Age
## 0  1    Alice  22
## 1  2      Bob  25
## 2  3  Charlie  30
# Create a DataFrame from a dictionary
dict_data = {'ID': [1, 2, 3], 'Name': ['Alice', 'Bob', 'Charlie'], 'Age': [22, 25, 30]}
df_from_dict = pd.DataFrame(dict_data)

print("\nDataFrame from dictionary:")
## 
## DataFrame from dictionary:
print(df_from_dict)
##    ID     Name  Age
## 0   1    Alice   22
## 1   2      Bob   25
## 2   3  Charlie   30

Practice accessing rows and columns

Create a sample DataFrame

data = {
    'ID': [1, 2, 3],
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [22, 25, 30]
}
df = pd.DataFrame(data)
print(df)
##    ID     Name  Age
## 0   1    Alice   22
## 1   2      Bob   25
## 2   3  Charlie   30
# accessing by column name
print(df['Age'])
## 0    22
## 1    25
## 2    30
## Name: Age, dtype: int64
# accessing multiple columns
print(df[['Name','Age']])
##       Name  Age
## 0    Alice   22
## 1      Bob   25
## 2  Charlie   30
# accessing first row at index 0
first_row = df.iloc[0]
print(first_row)
## ID          1
## Name    Alice
## Age        22
## Name: 0, dtype: object
# accessing multiple rows i.e first 2 rows
first_two_rows= df.iloc[0:2]
print(first_two_rows)
##    ID   Name  Age
## 0   1  Alice   22
## 1   2    Bob   25
#accessing by row labels using loc
row_label_1 = df.loc[1]
print(row_label_1)
## ID        2
## Name    Bob
## Age      25
## Name: 1, dtype: object
# accessing multiple rows using row labels
row_labels_1_2 = df.loc[1:2]
print(row_labels_1_2)
##    ID     Name  Age
## 1   2      Bob   25
## 2   3  Charlie   30
# accessing a specif value (age of the second person)
value = df.at[1,'Age']
print(value)
## 25
# Indexing and slicing
import pandas as pd

# Create a sample DataFrame
data = {
    'ID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [22, 25, 30, 35, 40],
    'Score': [88, 92, 85, 90, 95]
}
df = pd.DataFrame(data)

print("Sample DataFrame:")
## Sample DataFrame:
print(df)
##    ID     Name  Age  Score
## 0   1    Alice   22     88
## 1   2      Bob   25     92
## 2   3  Charlie   30     85
## 3   4    David   35     90
## 4   5      Eve   40     95
# Accessing single column
print(df['Name'])
## 0      Alice
## 1        Bob
## 2    Charlie
## 3      David
## 4        Eve
## Name: Name, dtype: object
# Accessing multiple columns
print(df[['Name','Age']])
##       Name  Age
## 0    Alice   22
## 1      Bob   25
## 2  Charlie   30
## 3    David   35
## 4      Eve   40
# Accessing first row
first_row = df.iloc[0]
print(first_row)
## ID           1
## Name     Alice
## Age         22
## Score       88
## Name: 0, dtype: object
# Multiple rows by index range
multiple_rows = df.iloc[1:3]
print(multiple_rows)
##    ID     Name  Age  Score
## 1   2      Bob   25     92
## 2   3  Charlie   30     85
# accessing rows with index labels 2 and 3
multiple_rows = df.loc[2:3]
print(multiple_rows)
##    ID     Name  Age  Score
## 2   3  Charlie   30     85
## 3   4    David   35     90
#accessing value in 1 row and column 'Score'
value_at = df.at[1, 'Score']
print("\nValue in row 1, column 'Score':")
## 
## Value in row 1, column 'Score':
print(value_at)
## 92
# Accessing value in row 1, column position 3 (Score)
value_iat = df.iat[1, 3]
print("\nValue in row 1, column position 3 (Score):")
## 
## Value in row 1, column position 3 (Score):
print(value_iat)
## 92
# slicing rows with conditions
#accessing rows where age is > 30
age_filtered = df[df['Age'] > 30]
print(age_filtered)
##    ID   Name  Age  Score
## 3   4  David   35     90
## 4   5    Eve   40     95
#slicing columns by position
#slicing first 2 columns
col_slice = df.iloc[:,:2]
print(col_slice)
##    ID     Name
## 0   1    Alice
## 1   2      Bob
## 2   3  Charlie
## 3   4    David
## 4   5      Eve
# Selecting specific rows and columns (rows 1 to 3 and columns 'Name' and 'Score')
specific_slice = df.loc[1:3, ['Name', 'Score']]
print("\nRows 1 to 3 and columns 'Name' and 'Score':")
## 
## Rows 1 to 3 and columns 'Name' and 'Score':
print(specific_slice)
##       Name  Score
## 1      Bob     92
## 2  Charlie     85
## 3    David     90

Save the dataframe

import pandas as pd

# Create a sample DataFrame
data = {
    'ID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [22, 25, 30, 35, 40],
    'Score': [88, 92, 85, 90, 95]
}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv('sample_data.csv', index=False, header=True, sep=',', encoding='utf-8')

# load the dataframe from CSV
df = pd.read_csv('sample_data.csv')  
print(df.to_string())  
##    ID     Name  Age  Score
## 0   1    Alice   22     88
## 1   2      Bob   25     92
## 2   3  Charlie   30     85
## 3   4    David   35     90
## 4   5      Eve   40     95
# read the dataframe from csv
import pandas as pd

# Load the DataFrame from a CSV file
df = pd.read_csv('sample_data.csv', header=0, index_col='ID', usecols=['ID', 'Name', 'Score'], encoding='utf-8', sep=',')

# Display the DataFrame
print("Loaded DataFrame:")
## Loaded DataFrame:
print(df)
##        Name  Score
## ID                
## 1     Alice     88
## 2       Bob     92
## 3   Charlie     85
## 4     David     90
## 5       Eve     95

Use head(), tail(), info(), and describe() to inspect the first few rows, structure, and summary statistics of the data.

import pandas as pd

# Create a sample DataFrame
data = {
    'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Hannah', 'Ivy', 'John'],
    'Age': [22, 25, 30, 35, 40, 45, 50, 55, 60, 65],
    'Score': [88, 92, 85, 90, 95, 89, 77, 82, 91, 93]
}
df = pd.DataFrame(data)
print(df.head())
##    ID     Name  Age  Score
## 0   1    Alice   22     88
## 1   2      Bob   25     92
## 2   3  Charlie   30     85
## 3   4    David   35     90
## 4   5      Eve   40     95
print(df.tail())
##    ID    Name  Age  Score
## 5   6   Frank   45     89
## 6   7   Grace   50     77
## 7   8  Hannah   55     82
## 8   9     Ivy   60     91
## 9  10    John   65     93
print(df.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 10 entries, 0 to 9
## Data columns (total 4 columns):
##  #   Column  Non-Null Count  Dtype 
## ---  ------  --------------  ----- 
##  0   ID      10 non-null     int64 
##  1   Name    10 non-null     object
##  2   Age     10 non-null     int64 
##  3   Score   10 non-null     int64 
## dtypes: int64(3), object(1)
## memory usage: 452.0+ bytes
## None
print(df.describe())
##              ID        Age      Score
## count  10.00000  10.000000  10.000000
## mean    5.50000  42.700000  88.200000
## std     3.02765  14.817782   5.473167
## min     1.00000  22.000000  77.000000
## 25%     3.25000  31.250000  85.750000
## 50%     5.50000  42.500000  89.500000
## 75%     7.75000  53.750000  91.750000
## max    10.00000  65.000000  95.000000

Check data types, columns, and index:

import pandas as pd
data = {
    'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Hannah', 'Ivy', 'John'],
    'Age': [22, 25, 30, 35, 40, 45, 50, 55, 60, 65],
    'Score': [88, 92, 85, 90, 95, 89, 77, 82, 91, 93]

}
df = pd.DataFrame(data)
print(df.dtypes)
## ID        int64
## Name     object
## Age       int64
## Score     int64
## dtype: object
print(df.columns)
## Index(['ID', 'Name', 'Age', 'Score'], dtype='object')
print(df.index)
## RangeIndex(start=0, stop=10, step=1)

selecting specific rows and columns using loc and iloc

df.loc[2, 'ID']  # Select specific row and column by label
## np.int64(3)
df.loc[2:5, ['ID', 'Name']]  # Select range of rows and specific columns
##    ID     Name
## 2   3  Charlie
## 3   4    David
## 4   5      Eve
## 5   6    Frank

Using iloc[]: Select by position (index).

df.iloc[2:4]  # Select specific row and column by index
##    ID     Name  Age  Score
## 2   3  Charlie   30     85
## 3   4    David   35     90
df.iloc[2:5, 1:3]  # Select range of rows and columns by position
##       Name  Age
## 2  Charlie   30
## 3    David   35
## 4      Eve   40

Filter data using conditions (e.g., df[df[‘column’] > value]):

df[df['Age'] > 0]  # Filter rows where 'column' is greater than 'value'
##    ID     Name  Age  Score
## 0   1    Alice   22     88
## 1   2      Bob   25     92
## 2   3  Charlie   30     85
## 3   4    David   35     90
## 4   5      Eve   40     95
## 5   6    Frank   45     89
## 6   7    Grace   50     77
## 7   8   Hannah   55     82
## 8   9      Ivy   60     91
## 9  10     John   65     93

Practice Boolean indexing:

df[(df['ID'] > 0) & (df['Age'] != 0)]  # Multiple conditions
##    ID     Name  Age  Score
## 0   1    Alice   22     88
## 1   2      Bob   25     92
## 2   3  Charlie   30     85
## 3   4    David   35     90
## 4   5      Eve   40     95
## 5   6    Frank   45     89
## 6   7    Grace   50     77
## 7   8   Hannah   55     82
## 8   9      Ivy   60     91
## 9  10     John   65     93

Detect missing values using isnull():

df.isnull()  # Returns DataFrame of booleans where True indicates missing data
##       ID   Name    Age  Score
## 0  False  False  False  False
## 1  False  False  False  False
## 2  False  False  False  False
## 3  False  False  False  False
## 4  False  False  False  False
## 5  False  False  False  False
## 6  False  False  False  False
## 7  False  False  False  False
## 8  False  False  False  False
## 9  False  False  False  False
df.isnull().sum()  # Count of missing values per column
## ID       0
## Name     0
## Age      0
## Score    0
## dtype: int64

Fill missing data with fillna():

df.fillna("value")  # Fill all NaN values with a specified value
##    ID     Name  Age  Score
## 0   1    Alice   22     88
## 1   2      Bob   25     92
## 2   3  Charlie   30     85
## 3   4    David   35     90
## 4   5      Eve   40     95
## 5   6    Frank   45     89
## 6   7    Grace   50     77
## 7   8   Hannah   55     82
## 8   9      Ivy   60     91
## 9  10     John   65     93

Drop rows/columns with missing values using dropna():

df.dropna()  # Drops rows with missing values
##    ID     Name  Age  Score
## 0   1    Alice   22     88
## 1   2      Bob   25     92
## 2   3  Charlie   30     85
## 3   4    David   35     90
## 4   5      Eve   40     95
## 5   6    Frank   45     89
## 6   7    Grace   50     77
## 7   8   Hannah   55     82
## 8   9      Ivy   60     91
## 9  10     John   65     93
df.dropna(axis=1)  # Drops columns with missing values
##    ID     Name  Age  Score
## 0   1    Alice   22     88
## 1   2      Bob   25     92
## 2   3  Charlie   30     85
## 3   4    David   35     90
## 4   5      Eve   40     95
## 5   6    Frank   45     89
## 6   7    Grace   50     77
## 7   8   Hannah   55     82
## 8   9      Ivy   60     91
## 9  10     John   65     93

Remove duplicates using drop_duplicates():

df.drop_duplicates()  # Remove duplicate rows
##    ID     Name  Age  Score
## 0   1    Alice   22     88
## 1   2      Bob   25     92
## 2   3  Charlie   30     85
## 3   4    David   35     90
## 4   5      Eve   40     95
## 5   6    Frank   45     89
## 6   7    Grace   50     77
## 7   8   Hannah   55     82
## 8   9      Ivy   60     91
## 9  10     John   65     93

Rename columns using rename():

df.rename(columns={'ID': 'Sno'})  # Rename a column
##    Sno     Name  Age  Score
## 0    1    Alice   22     88
## 1    2      Bob   25     92
## 2    3  Charlie   30     85
## 3    4    David   35     90
## 4    5      Eve   40     95
## 5    6    Frank   45     89
## 6    7    Grace   50     77
## 7    8   Hannah   55     82
## 8    9      Ivy   60     91
## 9   10     John   65     93

Add new columns to a DataFrame:

df['new_column'] = 10  # Add a new column with specified values

Modify values in existing columns:

df['Age'] = df['Age'].apply(lambda x: x * 2)  # Modify column values
df
##    ID     Name  Age  Score  new_column
## 0   1    Alice   44     88          10
## 1   2      Bob   50     92          10
## 2   3  Charlie   60     85          10
## 3   4    David   70     90          10
## 4   5      Eve   80     95          10
## 5   6    Frank   90     89          10
## 6   7    Grace  100     77          10
## 7   8   Hannah  110     82          10
## 8   9      Ivy  120     91          10
## 9  10     John  130     93          10

Sort a DataFrame by column values using sort_values():

df.sort_values(by='Age')  # Sort by column
##    ID     Name  Age  Score  new_column
## 0   1    Alice   44     88          10
## 1   2      Bob   50     92          10
## 2   3  Charlie   60     85          10
## 3   4    David   70     90          10
## 4   5      Eve   80     95          10
## 5   6    Frank   90     89          10
## 6   7    Grace  100     77          10
## 7   8   Hannah  110     82          10
## 8   9      Ivy  120     91          10
## 9  10     John  130     93          10
df.sort_values(by=['Name', 'Age'], ascending=[True, False])  # Sort by multiple columns
##    ID     Name  Age  Score  new_column
## 0   1    Alice   44     88          10
## 1   2      Bob   50     92          10
## 2   3  Charlie   60     85          10
## 3   4    David   70     90          10
## 4   5      Eve   80     95          10
## 5   6    Frank   90     89          10
## 6   7    Grace  100     77          10
## 7   8   Hannah  110     82          10
## 8   9      Ivy  120     91          10
## 9  10     John  130     93          10

Sort by index using sort_index():

df.sort_index()  # Sort by the DataFrame index
##    ID     Name  Age  Score  new_column
## 0   1    Alice   44     88          10
## 1   2      Bob   50     92          10
## 2   3  Charlie   60     85          10
## 3   4    David   70     90          10
## 4   5      Eve   80     95          10
## 5   6    Frank   90     89          10
## 6   7    Grace  100     77          10
## 7   8   Hannah  110     82          10
## 8   9      Ivy  120     91          10
## 9  10     John  130     93          10

Group the data by a specific column using groupby():

df.groupby('Age')  # Group by column
## <pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000253B870F440>

Perform aggregation operations like mean(), sum(), and count() on the grouped data:

df.groupby('Name').agg({'Score': 'mean', 'Age': 'sum'})  # Aggregate with multiple functions
##          Score  Age
## Name               
## Alice     88.0   44
## Bob       92.0   50
## Charlie   85.0   60
## David     90.0   70
## Eve       95.0   80
## Frank     89.0   90
## Grace     77.0  100
## Hannah    82.0  110
## Ivy       91.0  120
## John      93.0  130

Practice using Student spending datastet

This dataset contains fictional data representing the spending habits of 1000 students across various demographic groups and academic backgrounds. The dataset includes information such as age, gender, year in school, major, monthly income, financial aid received, and expenses in different spending categories. Spending categories include tuition, housing, food, transportation, books & supplies, entertainment, personal care, technology, health & wellness, and miscellaneous expenses. Additionally, the dataset includes the preferred payment method for each student.

  • Calculate the total spending across all categories, grouped by gender
  • Calculate the average spending in each category, grouped by year in school
  • Calculate the total spending across all categories, grouped by major
  • Calculate the average monthly income and financial aid received, grouped by gender
  • Group by payment method and calculate total spending across all categories
  • Group by whether financial aid is received and calculate the average spending
  • Create age groups and calculate total spending across all categories for each group
  • Calculate total and average spending across all categories for the entire dataset
  • Group by Major and Year in School and calculate average spending across all categories
  • Calculate the percentage of total spending for each category

!()[https://docs.google.com/spreadsheets/d/1KVQaJ-1jW9hRjtvBQa5wGso5-8wN2zyu8DtKwPCgm38/edit?usp=sharing]