library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.3
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.3.3
library(psych)
## Warning: package 'psych' was built under R version 4.3.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Read the raw student grading dataset from a local CSV file.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the script on another machine.
data_kaggle <- read.csv(
  file = "C:/Users/M S I/Downloads/Students_Grading_Dataset (1).csv"
)
# Preview the first rows to confirm the columns were parsed as expected.
head(x = data_kaggle)
## Student_ID First_Name Last_Name Email Gender Age
## 1 S1000 Omar Williams student0@university.com Female 22
## 2 S1001 Maria Brown student1@university.com Male 18
## 3 S1002 Ahmed Jones student2@university.com Male 24
## 4 S1003 Omar Williams student3@university.com Female 24
## 5 S1004 John Smith student4@university.com Female 23
## 6 S1005 Liam Brown student5@university.com Male 21
## Department Attendance.... Midterm_Score Final_Score Assignments_Avg
## 1 Engineering 52.29 55.03 57.82 84.22
## 2 Engineering 97.27 97.23 45.80 NA
## 3 Business 57.19 67.05 93.68 67.70
## 4 Mathematics 95.15 47.79 80.63 66.06
## 5 CS 54.18 46.59 78.89 96.85
## 6 Engineering NA 78.85 43.53 71.40
## Quizzes_Avg Participation_Score Projects_Score Total_Score Grade
## 1 74.06 3.99 85.90 56.09 F
## 2 94.24 8.32 55.65 50.64 A
## 3 85.70 5.05 73.79 70.30 D
## 4 93.51 6.54 92.12 61.63 A
## 5 83.70 5.97 68.42 66.13 F
## 6 52.20 6.38 67.29 62.08 B
## Study_Hours_per_Week Extracurricular_Activities Internet_Access_at_Home
## 1 6.2 No Yes
## 2 19.0 No Yes
## 3 20.7 No Yes
## 4 24.8 Yes Yes
## 5 15.4 Yes Yes
## 6 8.5 Yes Yes
## Parent_Education_Level Family_Income_Level Stress_Level..1.10.
## 1 High School Medium 5
## 2 None Medium 4
## 3 Master's Low 6
## 4 High School High 3
## 5 High School High 2
## 6 PhD High 1
## Sleep_Hours_per_Night
## 1 4.7
## 2 9.0
## 3 6.2
## 4 6.7
## 5 7.1
## 6 5.0
# Rename columns: shorten long names and replace the names that read.csv
# produced with trailing dots. The lookup is written as new_name = "old_name".
data_kaggle <- rename(
  data_kaggle,
  all_of(c(
    Attendance       = "Attendance....",
    Study_Hours      = "Study_Hours_per_Week",
    Stress_Level     = "Stress_Level..1.10.",
    Sleep_Hours      = "Sleep_Hours_per_Night",
    Internet         = "Internet_Access_at_Home",
    Parent_Education = "Parent_Education_Level"
  ))
)
# List the resulting column names to verify the rename.
names(data_kaggle)
## [1] "Student_ID" "First_Name"
## [3] "Last_Name" "Email"
## [5] "Gender" "Age"
## [7] "Department" "Attendance"
## [9] "Midterm_Score" "Final_Score"
## [11] "Assignments_Avg" "Quizzes_Avg"
## [13] "Participation_Score" "Projects_Score"
## [15] "Total_Score" "Grade"
## [17] "Study_Hours" "Extracurricular_Activities"
## [19] "Internet" "Parent_Education"
## [21] "Family_Income_Level" "Stress_Level"
## [23] "Sleep_Hours"
# Save the data frame with the renamed columns to the working directory.
# NOTE(review): at this point missing values have not been imputed yet, so
# the "clean" file still contains NA — confirm this is intended.
write.csv(x = data_kaggle, file = "data_kaggle_clean.csv", row.names = FALSE)
# Number of rows and columns.
c(nrow(data_kaggle), ncol(data_kaggle))
## [1] 5000 23
# Per-column summary: quartiles and NA counts for numeric columns,
# length/class/mode for character columns.
summary(object = data_kaggle)
## Student_ID First_Name Last_Name Email
## Length:5000 Length:5000 Length:5000 Length:5000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Gender Age Department Attendance
## Length:5000 Min. :18.00 Length:5000 Min. : 50.01
## Class :character 1st Qu.:19.00 Class :character 1st Qu.: 63.27
## Mode :character Median :21.00 Mode :character Median : 75.72
## Mean :21.05 Mean : 75.43
## 3rd Qu.:23.00 3rd Qu.: 87.47
## Max. :24.00 Max. :100.00
## NA's :516
## Midterm_Score Final_Score Assignments_Avg Quizzes_Avg
## Min. :40.00 Min. :40.00 Min. :50.00 Min. :50.03
## 1st Qu.:55.46 1st Qu.:54.67 1st Qu.:62.09 1st Qu.:62.49
## Median :70.51 Median :69.73 Median :74.81 Median :74.69
## Mean :70.33 Mean :69.64 Mean :74.80 Mean :74.91
## 3rd Qu.:84.97 3rd Qu.:84.50 3rd Qu.:86.97 3rd Qu.:87.63
## Max. :99.98 Max. :99.98 Max. :99.98 Max. :99.96
## NA's :517
## Participation_Score Projects_Score Total_Score Grade
## Min. : 0.000 Min. : 50.01 Min. :50.02 Length:5000
## 1st Qu.: 2.440 1st Qu.: 62.32 1st Qu.:62.84 Class :character
## Median : 4.955 Median : 74.98 Median :75.39 Mode :character
## Mean : 4.980 Mean : 74.92 Mean :75.12
## 3rd Qu.: 7.500 3rd Qu.: 87.37 3rd Qu.:87.65
## Max. :10.000 Max. :100.00 Max. :99.99
##
## Study_Hours Extracurricular_Activities Internet
## Min. : 5.00 Length:5000 Length:5000
## 1st Qu.:11.40 Class :character Class :character
## Median :17.50 Mode :character Mode :character
## Mean :17.66
## 3rd Qu.:24.10
## Max. :30.00
##
## Parent_Education Family_Income_Level Stress_Level Sleep_Hours
## Length:5000 Length:5000 Min. : 1.000 Min. :4.000
## Class :character Class :character 1st Qu.: 3.000 1st Qu.:5.200
## Mode :character Mode :character Median : 5.000 Median :6.500
## Mean : 5.481 Mean :6.488
## 3rd Qu.: 8.000 3rd Qu.:7.700
## Max. :10.000 Max. :9.000
##
# Report the number of missing values per column before any imputation.
cat("Jumlah missing value sebelum penggantian:\n")
## Jumlah missing value sebelum penggantian:
print(vapply(data_kaggle, function(column) sum(is.na(column)), integer(1)))
## Student_ID First_Name
## 0 0
## Last_Name Email
## 0 0
## Gender Age
## 0 0
## Department Attendance
## 0 516
## Midterm_Score Final_Score
## 0 0
## Assignments_Avg Quizzes_Avg
## 517 0
## Participation_Score Projects_Score
## 0 0
## Total_Score Grade
## 0 0
## Study_Hours Extracurricular_Activities
## 0 0
## Internet Parent_Education
## 0 0
## Family_Income_Level Stress_Level
## 0 0
## Sleep_Hours
## 0
# Share of missing values per column, expressed in percent.
100 * (colSums(is.na(data_kaggle)) / nrow(data_kaggle))
## Student_ID First_Name
## 0.00 0.00
## Last_Name Email
## 0.00 0.00
## Gender Age
## 0.00 0.00
## Department Attendance
## 0.00 10.32
## Midterm_Score Final_Score
## 0.00 0.00
## Assignments_Avg Quizzes_Avg
## 10.34 0.00
## Participation_Score Projects_Score
## 0.00 0.00
## Total_Score Grade
## 0.00 0.00
## Study_Hours Extracurricular_Activities
## 0.00 0.00
## Internet Parent_Education
## 0.00 0.00
## Family_Income_Level Stress_Level
## 0.00 0.00
## Sleep_Hours
## 0.00
# Descriptive statistics (n, mean, sd, median, skew, kurtosis, ...) for every
# column; columns marked with * in the output are non-numeric.
psych::describe(data_kaggle)
## vars n mean sd median trimmed mad
## Student_ID* 1 5000 2500.50 1443.52 2500.50 2500.50 1853.25
## First_Name* 2 5000 4.46 2.30 4.00 4.45 2.97
## Last_Name* 3 5000 3.49 1.69 3.00 3.48 1.48
## Email* 4 5000 2500.50 1443.52 2500.50 2500.50 1853.25
## Gender* 5 5000 1.51 0.50 2.00 1.51 0.00
## Age 6 5000 21.05 1.99 21.00 21.06 2.97
## Department* 7 5000 2.29 0.90 2.00 2.24 1.48
## Attendance 8 4484 75.43 14.37 75.72 75.50 17.87
## Midterm_Score 9 5000 70.33 17.21 70.51 70.38 21.90
## Final_Score 10 5000 69.64 17.24 69.74 69.58 22.16
## Assignments_Avg 11 4483 74.80 14.41 74.81 74.76 18.47
## Quizzes_Avg 12 5000 74.91 14.50 74.69 74.89 18.67
## Participation_Score 13 5000 4.98 2.89 4.96 4.97 3.76
## Projects_Score 14 5000 74.92 14.42 74.98 74.93 18.58
## Total_Score 15 5000 75.12 14.40 75.40 75.16 18.41
## Grade* 16 5000 2.72 1.47 3.00 2.65 1.48
## Study_Hours 17 5000 17.66 7.28 17.50 17.67 9.34
## Extracurricular_Activities* 18 5000 1.30 0.46 1.00 1.25 0.00
## Internet* 19 5000 1.90 0.30 2.00 2.00 0.00
## Parent_Education* 20 5000 3.42 1.75 3.00 3.40 2.97
## Family_Income_Level* 21 5000 2.19 0.75 2.00 2.23 1.48
## Stress_Level 22 5000 5.48 2.86 5.00 5.48 2.97
## Sleep_Hours 23 5000 6.49 1.45 6.50 6.49 1.93
## min max range skew kurtosis se
## Student_ID* 1.00 5000.00 4999.00 0.00 -1.20 20.41
## First_Name* 1.00 8.00 7.00 0.02 -1.25 0.03
## Last_Name* 1.00 6.00 5.00 0.01 -1.25 0.02
## Email* 1.00 5000.00 4999.00 0.00 -1.20 20.41
## Gender* 1.00 2.00 1.00 -0.04 -2.00 0.01
## Age 18.00 24.00 6.00 -0.04 -1.24 0.03
## Department* 1.00 4.00 3.00 0.22 -0.73 0.01
## Attendance 50.01 100.00 49.99 -0.04 -1.17 0.21
## Midterm_Score 40.00 99.98 59.98 -0.02 -1.18 0.24
## Final_Score 40.00 99.98 59.98 0.02 -1.20 0.24
## Assignments_Avg 50.00 99.98 49.98 0.02 -1.20 0.22
## Quizzes_Avg 50.03 99.96 49.93 0.02 -1.20 0.21
## Participation_Score 0.00 10.00 10.00 0.01 -1.21 0.04
## Projects_Score 50.01 100.00 49.99 0.00 -1.21 0.20
## Total_Score 50.02 99.99 49.97 -0.02 -1.19 0.20
## Grade* 1.00 5.00 4.00 0.24 -1.36 0.02
## Study_Hours 5.00 30.00 25.00 0.00 -1.22 0.10
## Extracurricular_Activities* 1.00 2.00 1.00 0.87 -1.25 0.01
## Internet* 1.00 2.00 1.00 -2.61 4.82 0.00
## Parent_Education* 1.00 6.00 5.00 0.05 -1.32 0.02
## Family_Income_Level* 1.00 3.00 2.00 -0.32 -1.19 0.01
## Stress_Level 1.00 10.00 9.00 0.01 -1.22 0.04
## Sleep_Hours 4.00 9.00 5.00 0.00 -1.20 0.02
# Count rows that are exact duplicates of an earlier row.
nrow(data_kaggle[duplicated(data_kaggle), , drop = FALSE])
## [1] 0
# Fill NA in the numeric columns with the mean of the observed values of the
# same column.
data_kaggle <- data_kaggle %>%
  mutate(
    across(
      c(Assignments_Avg, Attendance),
      function(column) coalesce(column, mean(column, na.rm = TRUE))
    )
  )
# Return the most frequent value (statistical mode) of a vector.
#
# Arguments:
#   v      A vector (any atomic type).
#   na.rm  If TRUE, missing values are removed before counting. The default
#          FALSE keeps the previous behaviour, where NA is counted like any
#          other value and can therefore be returned as the mode.
#
# Returns a length-one vector of the same type as `v`. Ties are resolved in
# favour of the value that appears first in `v`. If nothing is left to count
# (empty input, or only NA with na.rm = TRUE) the result is NA.
get_mode <- function(v, na.rm = FALSE) {
  if (na.rm) {
    v <- v[!is.na(v)]
  }
  uniqv <- unique(v)
  # match() maps every element to the index of its unique value; tabulate()
  # counts those indices, and which.max() picks the first maximum.
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
# Fill NA in the categorical column with its mode.
# The column is called Parent_Education after the rename step earlier in this
# script; the old name Parent_Education_Level no longer exists, so referring
# to it left the real column unprocessed.
# The mode is computed on the non-missing values only, so NA itself can never
# be chosen as the replacement value.
data_kaggle$Parent_Education[is.na(data_kaggle$Parent_Education)] <-
  get_mode(data_kaggle$Parent_Education[!is.na(data_kaggle$Parent_Education)])
# 1. Columns that enter the correlation matrix. Attendance and Study_Hours are
#    included so that the data matches the plot title ("Attendance, Scores,
#    and Study Habits"); before, only the score columns were correlated.
correlation_columns <- c(
  "Attendance", "Study_Hours", "Assignments_Avg", "Total_Score",
  "Midterm_Score", "Final_Score", "Quizzes_Avg", "Projects_Score"
)
# 2. Correlation matrix; use = "complete.obs" ignores rows that contain NA.
corr_matrix <- cor(data_kaggle[, correlation_columns], use = "complete.obs")
# 3. Reshape the matrix to long format (Var1, Var2, value) for ggplot2.
melted_corr <- melt(corr_matrix)
# 4. Draw the heatmap with the coefficient printed in every tile.
ggplot(data = melted_corr, aes(x = Var1, y = Var2, fill = value)) +
  # linewidth replaces size for tile borders (size is deprecated for lines
  # since ggplot2 3.4.0 and triggered a warning).
  geom_tile(color = "black", linewidth = 0.5) +
  geom_text(aes(label = sprintf("%.2f", value)), color = "black", size = 4) +
  scale_fill_gradient2(
    low = "#F0E5FF",
    mid = "#B19CD9",
    high = "#4B0082",
    midpoint = 0,
    limits = c(-1, 1)
  ) +
  ggtitle("Correlation Heatmap: Attendance, Scores, and Study Habits") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12, color = "black"),
    axis.text.y = element_text(size = 12, color = "black"),
    plot.title = element_text(size = 14, face = "bold", color = "purple")
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Frequency table of Gender as a two-column data frame (Gender, Count).
df_gender <- setNames(
  as.data.frame(table(data_kaggle$Gender)),
  c("Gender", "Count")
)
# Share of each gender in percent.
df_gender$Percentage <- with(df_gender, Count / sum(Count) * 100)
color_map <- c("#9E93E8", "#D8BFD8")
# Donut chart: a single stacked bar at x = 2 in polar coordinates; the x
# limits starting at 0.5 leave the hole in the middle.
ggplot(df_gender, aes(x = 2, y = Count, fill = Gender)) +
  geom_col(width = 1, alpha = 0.8, color = "#FDF7F4") +
  geom_text(
    aes(label = paste0(round(Percentage, 2), "%")),
    position = position_stack(vjust = 0.5),
    color = "black",
    size = 5
  ) +
  coord_polar(theta = "y", start = pi / 2) +
  xlim(0.5, 2.5) +
  scale_fill_manual(values = color_map) +
  labs(title = "Gender") +
  theme_void() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold", color = "purple"),
    legend.position = "left"
  )
# Scatter plot of Total_Score against Sleep_Hours, coloured by Grade. Because
# colour is mapped in the global aes, one quadratic fit is drawn per grade.
ggplot(
  data = data_kaggle,
  mapping = aes(x = Sleep_Hours, y = Total_Score, color = Grade)
) +
  geom_point() +
  geom_smooth(formula = y ~ x + I(x^2), method = "lm", se = FALSE) +
  ggtitle("Polynomial Regression: Sleep Hours vs Total_Score") +
  xlab("Sleep Hours per Night") +
  ylab("Total Score") +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "right",
    plot.title = element_text(size = 14, face = "bold", color = "purple")
  )
Beberapa siswa mungkin belajar dalam waktu lama tetapi memperoleh nilai rendah karena teknik belajar yang tidak efektif atau stres. Mari kita analisis efisiensi belajar lintas jenis kelamin.
# Convert the grouping columns to factors; Grade gets an explicit level order
# from A to F so that it is stacked and coloured in that order.
data_kaggle <- data_kaggle %>%
  mutate(
    across(c(Gender, Department), as.factor),
    Grade = factor(Grade, levels = c("A", "B", "C", "D", "F"))
  )
# Stacked bar chart of grade counts per department, one panel per gender.
ggplot(data_kaggle, aes(x = Department, fill = Grade)) +
  geom_bar(position = "stack") +
  facet_wrap(~ Gender) +
  scale_fill_brewer(palette = "RdPu") +
  ggtitle("Grade Distribution by Gender and Department") +
  xlab("Department") +
  ylab("Number of Students") +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "top",
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold", color = "purple")
  )
# 1. Histogram on the density scale (bin count from Sturges' rule) with a
#    kernel density curve on top.
h_ts <- ggplot(data_kaggle, aes(x = Total_Score)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = nclass.Sturges(data_kaggle$Total_Score),
    color = "black",
    fill = "pink",
    alpha = 0.7
  ) +
  geom_density(linewidth = 1.2, color = "red") +
  ggtitle("Total Score") +
  xlab("Total Score") +
  ylab("Density") +
  theme_minimal()
# 2. Boxplot
bp_ts <- ggplot(data = data_kaggle, mapping = aes(y = Total_Score)) +
  geom_boxplot(color = "black", fill = "green") +
  ggtitle("Total Score") +
  ylab("Total Score") +
  theme_minimal()
# 3. Normal Q-Q plot (visual check of normality)
qq_ts <- ggplot(data = data_kaggle, mapping = aes(sample = Total_Score)) +
  stat_qq() +
  stat_qq_line(linewidth = 1, color = "red") +
  ggtitle("Normal Q-Q Plot") +
  xlab("Theoretical Quantiles") +
  ylab("Sample Quantiles") +
  theme_minimal()
# Layout: histogram and boxplot side by side, Q-Q plot underneath.
(h_ts | bp_ts) / qq_ts
# One-way ANOVA: does mean Total_Score differ between parent education levels?
anova_result <- aov(formula = Total_Score ~ Parent_Education, data = data_kaggle)
anova_summary <- summary(anova_result)
# The first row of the ANOVA table belongs to Parent_Education; take its p-value.
p_value <- anova_summary[[1]]$`Pr(>F)`[1]
cat(sprintf("ANOVA p-value: %.4f\n", p_value))
## ANOVA p-value: 0.9189
# Report the decision at the 5% significance level.
cat(
  if (p_value < 0.05) {
    "Parent education level significantly affects student performance!\n"
  } else {
    "No significant effect of parent education level on student performance.\n"
  }
)
## No significant effect of parent education level on student performance.
# 1. Derive Sleep_Deficiency: fewer than 7 hours of sleep is labelled
#    "Tidak Cukup" (not enough), otherwise "Cukup" (enough).
data_kaggle$Sleep_Deficiency <- factor(
  ifelse(data_kaggle$Sleep_Hours < 7, "Tidak Cukup", "Cukup"),
  levels = c("Cukup", "Tidak Cukup")
)
# 2. Boxplot of Total_Score per sleep category (printed later in the script).
p1 <- ggplot(
  data = data_kaggle,
  mapping = aes(x = Sleep_Deficiency, y = Total_Score, fill = Sleep_Deficiency)
) +
  geom_boxplot() +
  scale_fill_manual(values = c("Cukup" = "green", "Tidak Cukup" = "red")) +
  ggtitle("Student Performance by Sleep Deficiency") +
  xlab("Sleep Category") +
  ylab("Total Score") +
  theme_minimal()
# 3. Two-sample t-test comparing Total_Score between the two sleep categories.
t_test_result <- t.test(formula = Total_Score ~ Sleep_Deficiency, data = data_kaggle)
cat(sprintf("T-test p-value: %.4f\n", t_test_result$p.value))
## T-test p-value: 0.7710
# Report the decision at the 5% significance level.
cat(
  if (t_test_result$p.value < 0.05) {
    "Terdapat perbedaan signifikan pada Total Score antara kelompok Cukup dan Tidak Cukup.\n"
  } else {
    "Tidak terdapat perbedaan signifikan pada Total Score antara kedua kelompok.\n"
  }
)
## Tidak terdapat perbedaan signifikan pada Total Score antara kedua kelompok.
# 4. (Optional) Only when a Stress_Level column exists: correlation between
#    Stress_Level and Total_Score plus a scatter plot with a linear fit.
if ("Stress_Level" %in% names(data_kaggle)) {
  # Correlation coefficient, ignoring incomplete pairs
  cor_result <- cor(
    x = data_kaggle$Stress_Level,
    y = data_kaggle$Total_Score,
    use = "complete.obs"
  )
  cat(sprintf("Correlation between Stress Level and Total Score: %.4f\n", cor_result))
  # Scatter plot with the fitted regression line
  p2 <- ggplot(data = data_kaggle, mapping = aes(x = Stress_Level, y = Total_Score)) +
    geom_point(alpha = 0.7, color = "blue") +
    geom_smooth(method = "lm", color = "red", se = FALSE) +
    ggtitle("Relationship between Stress Level and Total Score") +
    xlab("Stress Level") +
    ylab("Total Score") +
    theme_minimal()
  print(p2)
}
## Correlation between Stress Level and Total Score: 0.0042
## `geom_smooth()` using formula = 'y ~ x'
# Show the sleep-category boxplot built earlier.
print(p1)
# Violin plot of Total_Score for each stress level; Stress_Level is treated as
# a discrete factor and trim = FALSE extends the violins past the data range.
ggplot(
  data = data_kaggle,
  mapping = aes(
    x = factor(Stress_Level),
    y = Total_Score,
    fill = factor(Stress_Level)
  )
) +
  geom_violin(trim = FALSE) +
  ggtitle("Total Score by Stress Level") +
  xlab("Stress Level") +
  ylab("Total Score") +
  theme_minimal()
library(dplyr)
# Mean Total_Score for every Stress_Level
df_mean <- summarise(
  group_by(data_kaggle, Stress_Level),
  mean_score = mean(Total_Score, na.rm = TRUE)
)
# Line chart of the group means with a marker at each stress level.
ggplot(data = df_mean, mapping = aes(x = Stress_Level, y = mean_score)) +
  geom_line(color = "blue") +
  geom_point(size = 2, color = "red") +
  ggtitle("Average Total Score by Stress Level") +
  xlab("Stress Level") +
  ylab("Mean of Total Score") +
  theme_minimal()
# One-sample Kolmogorov-Smirnov test of Total_Score against a normal
# distribution whose mean and standard deviation are taken from the sample.
# NOTE(review): the ks.test documentation states that the reference
# distribution's parameters should be pre-specified rather than estimated from
# the data, and the warning below reports ties in the data, so treat the
# p-value as approximate — consider confirming with another normality test.
ks_result <- ks.test(
data_kaggle$Total_Score,
"pnorm",
mean = mean(data_kaggle$Total_Score, na.rm = TRUE),
sd = sd(data_kaggle$Total_Score, na.rm = TRUE)
)
## Warning in ks.test.default(data_kaggle$Total_Score, "pnorm", mean =
## mean(data_kaggle$Total_Score, : ties should not be present for the
## Kolmogorov-Smirnov test
# Print the KS test result
print(ks_result)
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: data_kaggle$Total_Score
## D = 0.058641, p-value = 2.331e-15
## alternative hypothesis: two-sided
# Base-graphics normal Q-Q plot of Total_Score with a red reference line.
qqnorm(y = data_kaggle$Total_Score, main = "QQ Plot: Total_Score")
qqline(y = data_kaggle$Total_Score, col = "red")
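Berdasarkan hasil visualisasi menggunakan QQ Plot pada variabel
Total_Score, dapat disimpulkan bahwa distribusi data tidak mengikuti
distribusi normal. Hal ini ditunjukkan oleh adanya penyimpangan
titik-titik data dari garis referensi merah pada bagian ujung plot, baik
di sisi kiri maupun kanan, dan diperkuat oleh hasil uji
Kolmogorov-Smirnov (D = 0.0586, p-value < 0.001). Namun, penyimpangan
tersebut bukan disebabkan oleh outlier atau ekor tebal (heavy tails):
nilai kurtosis Total_Score sebesar -1.19 serta rentang nilai yang
terbatas pada 50–100 justru mengindikasikan distribusi yang cenderung
seragam (uniform) dengan ekor tipis (light tails).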