```{r setup, include=FALSE}
# Global knitr default: show the source code of every chunk in the output.
knitr::opts_chunk$set(echo = TRUE)
```
# IMPORT LIBRARY
```{r}
# Packages attached for the analysis. Each call sits on its own line:
# several library() calls on a single line is a parse error in R.
library(ggplot2)
library(dplyr)
library(corrplot)
library(cluster)
library(reshape2)
```
```{r}
# Load the raw dataset into `data_kaggle`.
# NOTE(review): this is an absolute, machine-specific path; it only resolves
# on a computer that has the file at exactly this location.
data_kaggle <- read.csv("C:/Users/M S I/Downloads/Students_Grading_Dataset (1).csv")

# Print the loaded data frame.
data_kaggle
```
```{r}
# Shorten the column names that read.csv() produced for the raw headers.
# rename() takes new_name = old_name pairs.
data_kaggle <- data_kaggle %>%
  rename(
    Attendance = Attendance...., # Rename "Attendance...." to "Attendance"
    Study_Hours = Study_Hours_per_Week,
    Stress_Level = Stress_Level..1.10.,
    Sleep_Hours = Sleep_Hours_per_Night,
    Internet = Internet_Access_at_Home,
    Parent_Education = Parent_Education_Level
  )

# Show the resulting column names.
colnames(data_kaggle)

# Save the renamed data set (without row names) to the working directory.
write.csv(data_kaggle, "data_kaggle_clean.csv", row.names = FALSE)
```
# MEMBACA TOTAL BARIS DAN KOLOM

```{r}
# Number of rows and columns in the data set.
dim(data_kaggle)
```
```{r}
# Per-column summary statistics.
summary(data_kaggle)
```

# Menampilkan jumlah missing value pada tiap kolom sebelum penggantian
```{r}
cat("Jumlah missing value sebelum penggantian:\n")
# Count the NA values in every column. vapply() guarantees exactly one
# integer per column, whereas sapply()'s return type depends on its input.
print(vapply(data_kaggle, function(x) sum(is.na(x)), integer(1)))
```
```{r}
# Percentage of missing values in each column.
(colSums(is.na(data_kaggle)) / nrow(data_kaggle)) * 100
```
```{r}
# Descriptive statistics for every column via psych::describe().
library(psych)
describe(data_kaggle)
```

```{r}
# Number of rows that exactly duplicate an earlier row.
sum(duplicated(data_kaggle))
```
```{r}
# Fill NA in the numeric columns with the column mean; the mean is computed
# from the non-missing values only (na.rm = TRUE).
data_kaggle$Assignments_Avg <- replace(
  data_kaggle$Assignments_Avg,
  is.na(data_kaggle$Assignments_Avg),
  mean(data_kaggle$Assignments_Avg, na.rm = TRUE)
)
data_kaggle$Attendance <- replace(
  data_kaggle$Attendance,
  is.na(data_kaggle$Attendance),
  mean(data_kaggle$Attendance, na.rm = TRUE)
)
# Return the mode (most frequent value) of a vector.
#
# Arguments:
#   v      Vector whose most frequent value is wanted.
#   na.rm  If TRUE (the default), missing values are dropped before counting,
#          so NA is never reported as the mode. This matters when the result
#          is used to impute missing values. Pass FALSE to count NA like any
#          other value.
#
# Returns a length-one vector of the same type as `v`. Ties go to the value
# that appears first in `v`. If nothing is left to count, NA is returned.
get_mode <- function(v, na.rm = TRUE) {
  if (na.rm) {
    v <- v[!is.na(v)]
  }
  if (length(v) == 0L) {
    # Indexing with NA yields a missing value of v's own type.
    return(v[NA_integer_])
  }
  uniqv <- unique(v)
  # match() maps each element to its position in uniqv; tabulate() counts the
  # positions; which.max() picks the first position with the highest count.
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
```{r}
# Fill NA in the categorical column with its mode.
# The data frame is `data_kaggle` (no object named `data` is created in this
# document) and the column is called `Parent_Education` after the rename().
# The mode is taken over the non-missing values only, so NA itself can never
# be chosen as the replacement.
education_missing <- is.na(data_kaggle$Parent_Education)
data_kaggle$Parent_Education[education_missing] <-
  get_mode(data_kaggle$Parent_Education[!education_missing])
```
```{r}
# 1. Columns used for the correlation matrix
correlation_columns <- c(
  "Assignments_Avg", "Total_Score", "Midterm_Score",
  "Final_Score", "Quizzes_Avg", "Projects_Score"
)

# 2. Correlation matrix; use = "complete.obs" drops rows that contain NA
corr_matrix <- cor(data_kaggle[, correlation_columns], use = "complete.obs")

# 3. Reshape the matrix to long format (Var1, Var2, value) for ggplot2
melted_corr <- melt(corr_matrix)

# 4. Draw the heatmap
# NOTE(review): the title mentions attendance and study habits, but only the
# six score columns listed above are correlated here.
ggplot(data = melted_corr, aes(x = Var1, y = Var2, fill = value)) +
  # One tile per correlation, separated by black borders. `linewidth` is the
  # line-width aesthetic (`size` is deprecated for lines since ggplot2 3.4.0)
  # and is what the other plots in this document already use.
  geom_tile(color = "black", linewidth = 0.5) +
  # Print the correlation in each tile, formatted to two decimals
  geom_text(aes(label = sprintf("%.2f", value)), color = "black", size = 4) +
  # Diverging palette over the range -1 to 1
  scale_fill_gradient2(
    low = "#F0E5FF", # colour for the lowest values
    mid = "#B19CD9", # colour at the midpoint
    high = "#4B0082", # colour for the highest values
    midpoint = 0, # centre of the scale
    limits = c(-1, 1)
  ) +
  # Title and minimal base theme
  ggtitle("Correlation Heatmap: Attendance, Scores, and Study Habits") +
  theme_minimal() +
  # Axis label styling, chosen to resemble the settings of the Python version
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12, color = "black"),
    axis.text.y = element_text(size = 12, color = "black"),
    plot.title = element_text(size = 14, face = "bold", color = "purple")
  )
```{r}
# Frequency table of Gender as a two-column data frame (Gender, Count).
df_gender <- as.data.frame(table(data_kaggle$Gender))
colnames(df_gender) <- c("Gender", "Count")
```
```{r}
# Share of each gender as a percentage of all rows
df_gender$Percentage <- df_gender$Count / sum(df_gender$Count) * 100

# Fill colours, assigned to the Gender levels in order
color_map <- c("#9E93E8", "#D8BFD8")

ggplot(df_gender, aes(x = 2, y = Count, fill = Gender)) +
  # stat = "identity" plots the precomputed counts instead of counting rows
  geom_bar(stat = "identity", color = "#FDF7F4", alpha = 0.8, width = 1) +
  # Switch to polar coordinates, starting at pi/2 (90 degrees)
  coord_polar("y", start = pi / 2) +
  # Widening the x range leaves empty space in the centre (the donut hole)
  xlim(0.5, 2.5) +
  # Remove background and grid
  theme_void() +
  # Legend position and title styling
  theme(
    legend.position = "left",
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold", color = "purple")
  ) +
  # Percentage label in the middle of each sector
  geom_text(
    aes(label = paste0(round(Percentage, 2), "%")),
    position = position_stack(vjust = 0.5),
    size = 5,
    color = "black"
  ) +
  # Custom fill colours
  scale_fill_manual(values = color_map) +
  # Title
  ggtitle("Gender")
```
```{r}
# Scatter plot of Total_Score against Sleep_Hours with a quadratic fit.
# `color = Grade` is set in the top-level aes(), so both layers inherit it:
# points are coloured by grade and geom_smooth() fits one curve per grade.
ggplot(data_kaggle, aes(x = Sleep_Hours, y = Total_Score, color = Grade)) +
  geom_point() +
  # Second-degree polynomial fitted with lm(); se = FALSE hides the band
  geom_smooth(method = "lm", formula = y ~ x + I(x^2), se = FALSE) +
  labs(
    title = "Polynomial Regression: Sleep Hours vs Total_Score",
    x = "Sleep Hours per Night",
    y = "Total Score"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(size = 14, face = "bold", color = "purple"),
    legend.position = "right"
  )
```
Beberapa siswa mungkin belajar dalam waktu lama tetapi memperoleh nilai rendah karena teknik belajar yang tidak efektif atau stres. Mari kita analisis efisiensi belajar lintas jenis kelamin.
```{r}
# Treat the categorical columns as factors.
data_kaggle$Gender <- as.factor(data_kaggle$Gender)
data_kaggle$Department <- as.factor(data_kaggle$Department)
data_kaggle$Grade <- as.factor(data_kaggle$Grade)
```

```{r}
# Fix the order of the grade levels from A to F. Any value that is not one
# of these five levels becomes NA.
data_kaggle$Grade <- factor(data_kaggle$Grade, levels = c("A", "B", "C", "D", "F"))
```
```{r}
# Stacked bar chart of grade counts per department, one panel per gender.
ggplot(data_kaggle, aes(x = Department, fill = Grade)) +
  geom_bar(position = "stack") +
  facet_wrap(~ Gender) +
  labs(
    title = "Grade Distribution by Gender and Department",
    x = "Department",
    y = "Number of Students"
  ) +
  scale_fill_brewer(palette = "RdPu") +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold", color = "purple"),
    legend.position = "top"
  )
```
```{r}
# Distribution of Total_Score: histogram, boxplot and normal Q-Q plot,
# arranged with patchwork.
# The plots read from `data_kaggle`. No object named `data` is created in
# this document, so the bare name `data` would resolve to the base R
# function data() and ggplot() would fail.
library(patchwork)

# 1. Histogram on the density scale with a density curve on top; the number
#    of bins comes from Sturges' rule
h_ts <- ggplot(data = data_kaggle, aes(x = Total_Score)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = nclass.Sturges(data_kaggle$Total_Score),
    fill = "pink", color = "black", alpha = 0.7
  ) +
  geom_density(color = "red", linewidth = 1.2) +
  labs(title = "Total Score", x = "Total Score", y = "Density") +
  theme_minimal()

# 2. Boxplot
bp_ts <- ggplot(data_kaggle, aes(y = Total_Score)) +
  geom_boxplot(fill = "green", color = "black") +
  labs(title = "Total Score", y = "Total Score") +
  theme_minimal()

# 3. Q-Q plot (to check normality)
qq_ts <- ggplot(data_kaggle, aes(sample = Total_Score)) +
  stat_qq() +
  stat_qq_line(color = "red", linewidth = 1) +
  labs(
    title = "Normal Q-Q Plot",
    x = "Theoretical Quantiles",
    y = "Sample Quantiles"
  ) +
  theme_minimal()

# Layout: histogram and boxplot side by side, Q-Q plot underneath
(h_ts | bp_ts) / qq_ts
```{r}
# 1. One-way ANOVA of Total_Score across the Parent_Education groups
anova_result <- aov(Total_Score ~ Parent_Education, data = data_kaggle)
anova_summary <- summary(anova_result)

# p-value of the Parent_Education term (first row of the ANOVA table)
p_value <- anova_summary[[1]][["Pr(>F)"]][1]

# Each message ends with a newline; cat() adds none, so without it the
# p-value and the verdict would be printed as one run-together line.
cat(sprintf("ANOVA p-value: %.4f\n", p_value))
if (p_value < 0.05) {
  cat("Parent education level significantly affects student performance!\n")
} else {
  cat("No significant effect of parent education level on student performance.\n")
}
```
# Address Stress and Sleep Deficiency:
```{r}
# 1. Derive Sleep_Deficiency from Sleep_Hours: fewer than 7 hours is labelled
#    "Tidak Cukup", otherwise "Cukup". The result is stored as a factor with
#    "Cukup" as the first level.
data_kaggle$Sleep_Deficiency <- factor(
  ifelse(data_kaggle$Sleep_Hours < 7, "Tidak Cukup", "Cukup"),
  levels = c("Cukup", "Tidak Cukup")
)
```{r}
# 2. Boxplot of Total_Score by Sleep_Deficiency
library(ggplot2)
p1 <- ggplot(data_kaggle, aes(x = Sleep_Deficiency, y = Total_Score, fill = Sleep_Deficiency)) +
  geom_boxplot() +
  labs(
    title = "Student Performance by Sleep Deficiency",
    x = "Sleep Category",
    y = "Total Score"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("Cukup" = "green", "Tidak Cukup" = "red"))

# Two-sample t-test of Total_Score between the two sleep categories.
# Messages end with a newline because cat() does not add one.
t_test_result <- t.test(Total_Score ~ Sleep_Deficiency, data = data_kaggle)
cat(sprintf("T-test p-value: %.4f\n", t_test_result$p.value))
if (t_test_result$p.value < 0.05) {
  cat("Terdapat perbedaan signifikan pada Total Score antara kelompok Cukup dan Tidak Cukup.\n")
} else {
  cat("Tidak terdapat perbedaan signifikan pada Total Score antara kedua kelompok.\n")
}

if ("Stress_Level" %in% colnames(data_kaggle)) {
  # Correlation between stress level and total score. The columns are read
  # from `data_kaggle`; no object named `data` is created in this document.
  cor_result <- cor(data_kaggle$Stress_Level, data_kaggle$Total_Score, use = "complete.obs")
  cat(sprintf("Correlation between Stress Level and Total Score: %.4f\n", cor_result))

  # Scatter plot with a linear regression line
  p2 <- ggplot(data_kaggle, aes(x = Stress_Level, y = Total_Score)) +
    geom_point(color = "blue", alpha = 0.7) +
    geom_smooth(method = "lm", se = FALSE, color = "red") +
    labs(
      title = "Relationship between Stress Level and Total Score",
      x = "Stress Level",
      y = "Total Score"
    ) +
    theme_minimal()

  print(p2)
}

print(p1)
```
```{r}
# Violin plot of Total_Score for each stress level. Stress_Level is wrapped
# in factor() so it is treated as discrete: one violin and one fill colour
# per level.
ggplot(
  data = data_kaggle,
  mapping = aes(
    x = factor(Stress_Level),
    y = Total_Score,
    fill = factor(Stress_Level)
  )
) +
  # trim = FALSE draws the density tails beyond the range of the data
  geom_violin(trim = FALSE) +
  ggtitle("Total Score by Stress Level") +
  xlab("Stress Level") +
  ylab("Total Score") +
  theme_minimal()
```{r}
library(dplyr)

# Mean Total_Score for each stress level (NA scores are ignored)
df_mean <- data_kaggle %>%
  group_by(Stress_Level) %>%
  summarize(mean_score = mean(Total_Score, na.rm = TRUE))

# Line chart of the mean score with a point at every stress level
ggplot(df_mean, aes(x = Stress_Level, y = mean_score)) +
  geom_line(color = "blue") +
  geom_point(color = "red", size = 2) +
  labs(
    title = "Average Total Score by Stress Level",
    x = "Stress Level",
    y = "Mean of Total Score"
  ) +
  theme_minimal()
```
```{r}
# One-sample Kolmogorov-Smirnov test of Total_Score against a normal
# distribution whose mean and standard deviation are the sample's own.
# NOTE(review): the ks.test() documentation states that the distribution
# parameters must be pre-specified rather than estimated from the data, so
# read this p-value with caution.
ks_result <- ks.test(
  x = data_kaggle$Total_Score,
  y = "pnorm",
  mean = mean(data_kaggle$Total_Score, na.rm = TRUE),
  sd = sd(data_kaggle$Total_Score, na.rm = TRUE)
)

# Show the KS test result
print(ks_result)
```{r}
# Visual normality check: base-graphics Q-Q plot with a red reference line
qqnorm(data_kaggle$Total_Score, main = "QQ Plot: Total_Score")
qqline(data_kaggle$Total_Score, col = "red")
```