Load Packages

library(tidyverse)
library(janitor)
library(psych)
library(rsample)

Import Data

# Load the raw CSV and convert its column headers to snake_case.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file location exists.
students <- clean_names(
  read_csv("C:/Users/Bharat/Desktop/Proj/StudentsPerformance.csv")
)

# Report the table's dimensions (rows, then columns).
dim(students)
## [1] 1000    8
# List the column names as they stand after clean_names().
names(students)
## [1] "gender"                      "race_ethnicity"             
## [3] "parental_level_of_education" "lunch"                      
## [5] "test_preparation_course"     "math_score"                 
## [7] "reading_score"               "writing_score"
# Preview the first 10 rows.
head(students, 10)

Structure and Variables

# Show each column's type and first few values, plus the parsing spec that
# read_csv() attached to the table.
str(students)
## spc_tbl_ [1,000 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ gender                     : chr [1:1000] "female" "female" "female" "male" ...
##  $ race_ethnicity             : chr [1:1000] "group B" "group C" "group B" "group A" ...
##  $ parental_level_of_education: chr [1:1000] "bachelor's degree" "some college" "master's degree" "associate's degree" ...
##  $ lunch                      : chr [1:1000] "standard" "standard" "standard" "free/reduced" ...
##  $ test_preparation_course    : chr [1:1000] "none" "completed" "none" "none" ...
##  $ math_score                 : num [1:1000] 72 69 90 47 76 71 88 40 64 38 ...
##  $ reading_score              : num [1:1000] 72 90 95 57 78 83 95 43 64 60 ...
##  $ writing_score              : num [1:1000] 74 88 93 44 75 78 92 39 67 50 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   gender = col_character(),
##   ..   `race/ethnicity` = col_character(),
##   ..   `parental level of education` = col_character(),
##   ..   lunch = col_character(),
##   ..   `test preparation course` = col_character(),
##   ..   `math score` = col_double(),
##   ..   `reading score` = col_double(),
##   ..   `writing score` = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Per-column summary: length/class/mode for the character columns and the
# min, quartiles, mean and max for the three numeric score columns.
summary(students)
##     gender          race_ethnicity     parental_level_of_education
##  Length:1000        Length:1000        Length:1000                
##  Class :character   Class :character   Class :character           
##  Mode  :character   Mode  :character   Mode  :character           
##                                                                   
##                                                                   
##                                                                   
##     lunch           test_preparation_course   math_score     reading_score   
##  Length:1000        Length:1000             Min.   :  0.00   Min.   : 17.00  
##  Class :character   Class :character        1st Qu.: 57.00   1st Qu.: 59.00  
##  Mode  :character   Mode  :character        Median : 66.00   Median : 70.00  
##                                             Mean   : 66.09   Mean   : 69.17  
##                                             3rd Qu.: 77.00   3rd Qu.: 79.00  
##                                             Max.   :100.00   Max.   :100.00  
##  writing_score   
##  Min.   : 10.00  
##  1st Qu.: 57.75  
##  Median : 69.00  
##  Mean   : 68.05  
##  3rd Qu.: 79.00  
##  Max.   :100.00

Filter Rows (Example: students with math score > 80)

# Keep only the students whose math score is strictly above 80.
high_math <- filter(students, math_score > 80)

# Show the first 10 rows of that subset.
head(high_math, n = 10)

Remove Missing Values

# Discard every row that has a missing value in any column.
students_no_na <- drop_na(students)
# Count the rows that are still incomplete after the drop.
sum(!complete.cases(students_no_na))
## [1] 0

Remove Duplicates

# Collapse fully identical rows so each distinct record appears once.
students_dedup <- distinct(students_no_na)
# Number of duplicate rows that were removed.
nrow(students_no_na) - nrow(students_dedup)
## [1] 0

Reorder Rows (Descending by reading score)

# Sort the de-duplicated rows from highest to lowest reading score.
students_ordered <- arrange(students_dedup, desc(reading_score))
# Show the 10 top-ranked rows.
head(students_ordered, n = 10)

Rename Columns (for easier use)

# Give every column a shorter, capitalised name (new name = old name).
# Later steps refer to the columns by these new names.
students_renamed <- rename(
  students_dedup,
  Gender             = gender,
  Ethnicity          = race_ethnicity,
  Parental_Education = parental_level_of_education,
  Lunch              = lunch,
  Test_Prep          = test_preparation_course,
  Math               = math_score,
  Reading            = reading_score,
  Writing            = writing_score
)

# Confirm the new column names.
names(students_renamed)
## [1] "Gender"             "Ethnicity"          "Parental_Education"
## [4] "Lunch"              "Test_Prep"          "Math"              
## [7] "Reading"            "Writing"

Add New Variables

# Derive two extra columns:
#   Average_Score - unweighted mean of the three subject scores.
#   Passed_Math   - "Yes" when Math is at least 50, otherwise "No".
students_augmented <- mutate(
  students_renamed,
  Average_Score = (Math + Reading + Writing) / 3,
  Passed_Math = ifelse(Math < 50, "No", "Yes")
)

# Show the first 10 rows including the new columns.
head(students_augmented, n = 10)

Training and Testing Split

# initial_split() assigns rows to the two partitions at random, so fix the
# RNG seed first; without it the train/test membership changes on every run
# and any downstream result cannot be reproduced.
set.seed(123)

# Hold out 30% of the rows for testing and keep 70% for training.
split_obj <- initial_split(students_augmented, prop = 0.7)
train_df <- training(split_obj)
test_df <- testing(split_obj)

# Report the size of each partition (training first, then testing).
nrow(train_df)
nrow(test_df)
## [1] 700
## [1] 300

Summary Statistics

# One row per Gender: number of students and the mean of each subject score.
summarise(
  group_by(students_augmented, Gender),
  count = n(),
  mean_math = mean(Math),
  mean_reading = mean(Reading),
  mean_writing = mean(Writing)
)

Mean, Median, Mode, Range (Math Scores)

# Pull the math scores out as a plain vector for the statistics below.
x <- students_augmented$Math

# Arithmetic mean of the math scores.
mean(x)
## [1] 66.089
# Median of the math scores.
median(x)
## [1] 66
# Return the most frequent value of `v`, ignoring NAs.
#
# v: an atomic vector (numeric, character, ...).
#
# Returns a single value of the same type as `v`. When several values share
# the highest count, the one that appears first in `v` is returned. If no
# non-NA values remain, the result is NA.
mode_stat <- function(v) {
  observed <- v[!is.na(v)]
  candidates <- unique(observed)
  # match() maps each observation to its position in `candidates`;
  # tabulate() then counts how often each position occurs.
  counts <- tabulate(match(observed, candidates))
  candidates[which.max(counts)]
}
# Most frequent math score.
mode_stat(x)
## [1] 65
# Smallest and largest math score.
range(x)
## [1]   0 100

Scatter Plot (Math vs Reading)

# Scatter plot of reading score against math score, one point per student,
# coloured by gender; points are slightly transparent to ease overplotting.
ggplot(
  data = students_augmented,
  mapping = aes(x = Math, y = Reading, colour = Gender)
) +
  geom_point(alpha = 0.8) +
  theme_minimal() +
  labs(title = "Scatter: Math vs Reading Scores")

Bar Plot (Average Score by Lunch Type)

# Bar chart of the mean Average_Score for each lunch type: summarise to one
# row per Lunch value first, then draw one filled bar per row.
students_augmented %>%
  group_by(Lunch) %>%
  summarise(mean_avg = mean(Average_Score)) %>%
  ggplot(mapping = aes(x = Lunch, y = mean_avg, fill = Lunch)) +
  geom_col() +
  theme_minimal() +
  labs(title = "Bar: Average Score by Lunch Type")

Pearson Correlation (Math vs Writing)

# Pearson correlation coefficient between the math and writing scores.
cor(
  x = students_augmented[["Math"]],
  y = students_augmented[["Writing"]],
  method = "pearson"
)
## [1] 0.802642

References

Dataset source: Kaggle — Students Performance Dataset
GitHub repository: https://github.com/bharatchopra-tech/Studentdata-analysisGroup1