library(tidyverse)
library(janitor)
library(psych)
library(rsample)
# Load data ----
# Read the raw CSV and normalise its column names with janitor::clean_names().
# NOTE(review): the path is absolute and machine-specific; the script only
# runs as-is where this exact file location exists.
students <- clean_names(
  read_csv("C:/Users/Bharat/Desktop/Proj/StudentsPerformance.csv")
)
# Inspect the loaded table ----
# Number of rows and columns.
dim(students)
## [1] 1000 8
# Column names as they stand after clean_names().
names(students)
## [1] "gender" "race_ethnicity"
## [3] "parental_level_of_education" "lunch"
## [5] "test_preparation_course" "math_score"
## [7] "reading_score" "writing_score"
# First ten rows, followed by the per-column type overview from str().
head(students, 10)
str(students)
## spc_tbl_ [1,000 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ gender : chr [1:1000] "female" "female" "female" "male" ...
## $ race_ethnicity : chr [1:1000] "group B" "group C" "group B" "group A" ...
## $ parental_level_of_education: chr [1:1000] "bachelor's degree" "some college" "master's degree" "associate's degree" ...
## $ lunch : chr [1:1000] "standard" "standard" "standard" "free/reduced" ...
## $ test_preparation_course : chr [1:1000] "none" "completed" "none" "none" ...
## $ math_score : num [1:1000] 72 69 90 47 76 71 88 40 64 38 ...
## $ reading_score : num [1:1000] 72 90 95 57 78 83 95 43 64 60 ...
## $ writing_score : num [1:1000] 74 88 93 44 75 78 92 39 67 50 ...
## - attr(*, "spec")=
## .. cols(
## .. gender = col_character(),
## .. `race/ethnicity` = col_character(),
## .. `parental level of education` = col_character(),
## .. lunch = col_character(),
## .. `test preparation course` = col_character(),
## .. `math score` = col_double(),
## .. `reading score` = col_double(),
## .. `writing score` = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column summary: length/class for the character columns and
# min/quartiles/mean/max for the three numeric score columns.
summary(students)
## gender race_ethnicity parental_level_of_education
## Length:1000 Length:1000 Length:1000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## lunch test_preparation_course math_score reading_score
## Length:1000 Length:1000 Min. : 0.00 Min. : 17.00
## Class :character Class :character 1st Qu.: 57.00 1st Qu.: 59.00
## Mode :character Mode :character Median : 66.00 Median : 70.00
## Mean : 66.09 Mean : 69.17
## 3rd Qu.: 77.00 3rd Qu.: 79.00
## Max. :100.00 Max. :100.00
## writing_score
## Min. : 10.00
## 1st Qu.: 57.75
## Median : 69.00
## Mean : 68.05
## 3rd Qu.: 79.00
## Max. :100.00
# High math scorers ----
# Keep only students whose math score is strictly above 80, then preview them.
high_math <- filter(students, math_score > 80)
head(high_math, n = 10)
# Missing values and duplicates ----
# Drop every row with an NA in any column, then count incomplete rows that
# remain in the result.
students_no_na <- drop_na(students)
sum(!complete.cases(students_no_na))
## [1] 0

# Collapse exact duplicate rows and report how many were removed.
students_dedup <- distinct(students_no_na)
nrow(students_no_na) - nrow(students_dedup)
## [1] 0
# Sort by reading score ----
# Highest reading scores first; preview the top ten rows.
students_ordered <- arrange(students_dedup, desc(reading_score))
head(students_ordered, n = 10)
# Rename columns ----
# Give every column a short display name (new_name = old_name).
students_renamed <- rename(
  students_dedup,
  Gender = gender,
  Ethnicity = race_ethnicity,
  Parental_Education = parental_level_of_education,
  Lunch = lunch,
  Test_Prep = test_preparation_course,
  Math = math_score,
  Reading = reading_score,
  Writing = writing_score
)
names(students_renamed)
## [1] "Gender" "Ethnicity" "Parental_Education"
## [4] "Lunch" "Test_Prep" "Math"
## [7] "Reading" "Writing"
# Derived columns ----
# Average_Score: unweighted mean of the three subject scores.
# Passed_Math:   "Yes" when the math score is at least 50, otherwise "No".
students_augmented <- mutate(
  students_renamed,
  Average_Score = (Math + Reading + Writing) / 3,
  Passed_Math = if_else(Math >= 50, "Yes", "No")
)
head(students_augmented, n = 10)
# Train / test split ----
# Partition the augmented data into 70% training and 30% test rows.
# initial_split() assigns rows at random, so the RNG seed is fixed first;
# without it the contents of train_df / test_df change on every run.
# The seed value itself is arbitrary.
set.seed(123)
split_obj <- initial_split(students_augmented, prop = 0.7)
train_df <- training(split_obj)
test_df <- testing(split_obj)

# Partition sizes (training first, then test).
nrow(train_df)
nrow(test_df)
## [1] 700
## [1] 300
# Scores by gender ----
# One row per gender: headcount plus the mean of each subject score.
students_augmented %>%
  group_by(Gender) %>%
  summarise(
    count        = n(),
    mean_math    = mean(Math),
    mean_reading = mean(Reading),
    mean_writing = mean(Writing)
  )
# Central tendency of Math ----
# Pull the Math column out as a plain vector for the base-R statistics below.
x <- students_augmented[["Math"]]
mean(x)
## [1] 66.089
median(x)
## [1] 66
# Most frequent value of a vector.
#
# Missing values are discarded before counting. When several values share
# the highest count, the one that occurs first in `v` is returned. For an
# atomic vector with no non-missing values the result is NA.
#
# v: vector whose mode is wanted.
# Returns a length-one vector holding the most frequent non-missing value.
mode_stat <- function(v) {
  observed <- v[!is.na(v)]
  candidates <- unique(observed)
  # Map each observation to its candidate index, then count per index.
  counts <- tabulate(match(observed, candidates))
  candidates[which.max(counts)]
}
# Most frequent Math score (the first value to reach the top count wins a tie).
mode_stat(x)
## [1] 65
# Smallest and largest Math score.
range(x)
## [1] 0 100
# Scatter plot ----
# Math on the x-axis against Reading on the y-axis, one point per student,
# coloured by gender; slight transparency softens overlapping points.
ggplot(students_augmented, aes(Math, Reading, colour = Gender)) +
  geom_point(alpha = 0.8) +
  labs(title = "Scatter: Math vs Reading Scores") +
  theme_minimal()
# Bar chart ----
# Mean Average_Score per lunch type, drawn as one filled bar per type.
students_augmented %>%
  group_by(Lunch) %>%
  summarise(mean_avg = mean(Average_Score)) %>%
  ggplot(aes(Lunch, mean_avg, fill = Lunch)) +
  geom_col() +
  labs(title = "Bar: Average Score by Lunch Type") +
  theme_minimal()
# Correlation ----
# Pearson correlation between the Math and Writing scores.
cor(
  students_augmented[["Math"]],
  students_augmented[["Writing"]],
  method = "pearson"
)
## [1] 0.802642
# Dataset source: Kaggle — Students Performance Dataset
# GitHub repository: https://github.com/bharatchopra-tech/Studentdata-analysisGroup1