Students Performance Data Analysis

Load Packages

library(tidyverse)
library(janitor)
library(psych)
library(rsample)

Import Data

# Read the raw CSV and clean the column names (e.g. `math score` becomes
# `math_score`) in a single step.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# before running on another computer.
students <- clean_names(
  read_csv("C:/Users/Bharat/Desktop/Proj/StudentsPerformance.csv")
)

# Number of rows and columns in the imported table.
c(nrow(students), ncol(students))

## [1] 1000    8

# Column names after cleaning.
colnames(students)

## [1] "gender"                      "race_ethnicity"             
## [3] "parental_level_of_education" "lunch"                      
## [5] "test_preparation_course"     "math_score"                 
## [7] "reading_score"               "writing_score"

# Preview the first ten rows of the imported data.
students %>% head(n = 10)

Structure and Variables

# Compact structure dump: one line per column with its type and first values.
str(students)

## spc_tbl_ [1,000 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ gender                     : chr [1:1000] "female" "female" "female" "male" ...
##  $ race_ethnicity             : chr [1:1000] "group B" "group C" "group B" "group A" ...
##  $ parental_level_of_education: chr [1:1000] "bachelor's degree" "some college" "master's degree" "associate's degree" ...
##  $ lunch                      : chr [1:1000] "standard" "standard" "standard" "free/reduced" ...
##  $ test_preparation_course    : chr [1:1000] "none" "completed" "none" "none" ...
##  $ math_score                 : num [1:1000] 72 69 90 47 76 71 88 40 64 38 ...
##  $ reading_score              : num [1:1000] 72 90 95 57 78 83 95 43 64 60 ...
##  $ writing_score              : num [1:1000] 74 88 93 44 75 78 92 39 67 50 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   gender = col_character(),
##   ..   `race/ethnicity` = col_character(),
##   ..   `parental level of education` = col_character(),
##   ..   lunch = col_character(),
##   ..   `test preparation course` = col_character(),
##   ..   `math score` = col_double(),
##   ..   `reading score` = col_double(),
##   ..   `writing score` = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

# Per-column summary: length/class for character columns, five-number
# summary plus mean for the numeric score columns.
summary(students)

##     gender          race_ethnicity     parental_level_of_education
##  Length:1000        Length:1000        Length:1000                
##  Class :character   Class :character   Class :character           
##  Mode  :character   Mode  :character   Mode  :character           
##                                                                   
##                                                                   
##                                                                   
##     lunch           test_preparation_course   math_score     reading_score   
##  Length:1000        Length:1000             Min.   :  0.00   Min.   : 17.00  
##  Class :character   Class :character        1st Qu.: 57.00   1st Qu.: 59.00  
##  Mode  :character   Mode  :character        Median : 66.00   Median : 70.00  
##                                             Mean   : 66.09   Mean   : 69.17  
##                                             3rd Qu.: 77.00   3rd Qu.: 79.00  
##                                             Max.   :100.00   Max.   :100.00  
##  writing_score   
##  Min.   : 10.00  
##  1st Qu.: 57.75  
##  Median : 69.00  
##  Mean   : 68.05  
##  3rd Qu.: 79.00  
##  Max.   :100.00

Filter Rows (Example: students with math score > 80)

# Keep only the students whose math score is strictly above 80.
high_math <- filter(students, math_score > 80)

# Preview the first ten matching rows.
high_math %>% head(n = 10)

Remove Missing Values

# Drop every row that has an NA in any column.
students_no_na <- drop_na(students)
# Sanity check: number of rows that are still incomplete (expected: 0).
nrow(students_no_na) - sum(complete.cases(students_no_na))

## [1] 0

Remove Duplicates

# Collapse rows that are identical across all columns into a single row.
students_dedup <- distinct(students_no_na)
# Number of duplicate rows that were removed.
nrow(students_no_na) - nrow(students_dedup)

## [1] 0

Reorder Rows (Descending by reading score)

# Sort so the highest reading scores come first.
students_ordered <- arrange(students_dedup, desc(reading_score))
# Preview the ten top-ranked rows.
students_ordered %>% head(n = 10)

Rename Columns (for easier use)

# Give every column a shorter, capitalised name (new_name = old_name).
students_renamed <- rename(
  students_dedup,
  Gender = gender,
  Ethnicity = race_ethnicity,
  Parental_Education = parental_level_of_education,
  Lunch = lunch,
  Test_Prep = test_preparation_course,
  Math = math_score,
  Reading = reading_score,
  Writing = writing_score
)

# Confirm the new column names.
colnames(students_renamed)

## [1] "Gender"             "Ethnicity"          "Parental_Education"
## [4] "Lunch"              "Test_Prep"          "Math"              
## [7] "Reading"            "Writing"

Add New Variables

# Derive two columns per student:
#   Average_Score - arithmetic mean of the three subject scores
#   Passed_Math   - "Yes" when Math is at least 50, otherwise "No"
students_augmented <- mutate(
  students_renamed,
  Average_Score = (Math + Reading + Writing) / 3,
  Passed_Math = if_else(Math >= 50, "Yes", "No")
)

# Preview the first ten rows including the new columns.
students_augmented %>% head(n = 10)

Training and Testing Split

# initial_split() assigns rows at random, so fix the RNG seed first.
# Without a seed, every run (and every knit of this report) would put
# different students in the training and test sets.
set.seed(123)

# Random 70/30 partition of the augmented data.
split_obj <- initial_split(students_augmented, prop = 0.7)
train_df <- training(split_obj)  # the 70% side of the split
test_df  <- testing(split_obj)   # the remaining 30%

# Confirm the partition sizes.
nrow(train_df)
nrow(test_df)

## [1] 700

## [1] 300

Summary Statistics

# One row per gender: group size and the mean of each subject score.
summarise(
  group_by(students_augmented, Gender),
  count = n(),
  mean_math = mean(Math),
  mean_reading = mean(Reading),
  mean_writing = mean(Writing)
)

Mean, Median, Mode, Range (Math Scores)

# Pull the math scores out as a plain vector for the base-R summaries below.
x <- students_augmented[["Math"]]

# Arithmetic mean of the math scores.
mean(x)

## [1] 66.089

# Median of the math scores.
median(x)

## [1] 66

#' Most frequent value of a vector (statistical mode)
#'
#' Missing values are ignored. When several values share the highest
#' frequency, the one that appears first in `v` is returned. If nothing is
#' left after removing `NA`s, the result is `NA`.
#'
#' @param v An atomic vector.
#' @return A length-one vector of the same type as `v`.
mode_stat <- function(v) {
  observed <- v[!is.na(v)]
  candidates <- unique(observed)
  # match() maps each observation to the index of its distinct value, so
  # tabulate() yields one frequency per candidate, in first-seen order.
  frequencies <- tabulate(match(observed, candidates))
  candidates[which.max(frequencies)]
}
# Most frequent math score.
mode_stat(x)

## [1] 65

# Minimum and maximum math score.
range(x)

## [1]   0 100

Scatter Plot (Math vs Reading)

# Reading score against math score, one point per student, coloured by gender.
ggplot(
  data = students_augmented,
  mapping = aes(x = Math, y = Reading, color = Gender)
) +
  theme_minimal() +
  # Slight transparency so overlapping points remain visible.
  geom_point(alpha = 0.8) +
  labs(title = "Scatter: Math vs Reading Scores")

Bar Plot (Average Score by Lunch Type)

# Mean of Average_Score within each lunch type, drawn as one bar per type.
students_augmented %>%
  group_by(Lunch) %>%
  summarise(mean_avg = mean(Average_Score)) %>%
  ggplot(mapping = aes(x = Lunch, y = mean_avg, fill = Lunch)) +
  theme_minimal() +
  geom_col() +
  labs(title = "Bar: Average Score by Lunch Type")

Pearson Correlation (Math vs Writing)

# Pearson correlation coefficient between the math and writing scores.
with(students_augmented, cor(Math, Writing, method = "pearson"))

## [1] 0.802642

References

Dataset source: Kaggle — Students Performance Dataset
GitHub repository: https://github.com/bharatchopra-tech/Studentdata-analysisGroup1

Students Performance Data Analysis

Group 1

2025-11-29

Load Packages

Import Data

Structure and Variables

Filter Rows (Example: students with math score > 80)

Remove Missing Values

Remove Duplicates

Reorder Rows (Descending by reading score)

Rename Columns (for easier use)

Add New Variables

Training and Testing Split

Summary Statistics

Mean, Median, Mode, Range (Math Scores)

Scatter Plot (Math vs Reading)

Bar Plot (Average Score by Lunch Type)

Pearson Correlation (Math vs Writing)

References