---
title: "AED"
author: "Shafi Faris Arif Rabbani"
date: "2025-03-15"
output: html_document
---
```{r setup, include=FALSE}
# Set the knitr default so every chunk echoes its source code.
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
# Attach the packages used by the chunks below.
library(ggplot2)
library(dplyr)
library(corrplot)
library(cluster)
library(reshape2)
```
```{r}
# Import the raw CSV into a data frame.
# NOTE(review): this is an absolute, machine-specific path; the document only
# knits where the file exists at exactly this location.
data_kaggle <- read.csv(
  "C:/Users/M S I/Downloads/Students_Grading_Dataset (1).csv"
)

# Auto-print the imported data frame.
data_kaggle
```
```{r}
# Give the awkward imported column names shorter, cleaner names.
data_kaggle <- data_kaggle %>%
  rename(
    Attendance = Attendance...., # change "Attendance...." to "Attendance"
    Study_Hours = Study_Hours_per_Week,
    Stress_Level = Stress_Level..1.10.,
    Sleep_Hours = Sleep_Hours_per_Night,
    Internet = Internet_Access_at_Home,
    Parent_Education = Parent_Education_Level
  )

# Show the resulting column names.
colnames(data_kaggle)

# Write the renamed data set to the working directory.
write.csv(data_kaggle, "data_kaggle_clean.csv", row.names = FALSE)
```
```{r}
# Number of rows and columns.
dim(data_kaggle)
```

```{r}
# Per-column summary statistics.
summary(data_kaggle)
```

```{r}
# Count of missing values in every column.
# vapply() pins the result type to one integer per column.
cat("Jumlah missing value sebelum penggantian:\n")
print(vapply(data_kaggle, function(x) sum(is.na(x)), integer(1)))
```

```{r}
# Percentage of missing values in every column.
colSums(is.na(data_kaggle)) / nrow(data_kaggle) * 100
```
```{r}
library(psych)
# Descriptive statistics for every column.
describe(data_kaggle)
```

```{r}
# Number of rows that duplicate an earlier row.
sum(duplicated(data_kaggle))
```

```{r}
# Replace missing values in two numeric columns with the column mean.
# NOTE(review): `Assignments_Avg` is restored by analogy with the intact
# `Projects_Score` / `Total_Score` names -- confirm with colnames(data_kaggle).
data_kaggle$Assignments_Avg[is.na(data_kaggle$Assignments_Avg)] <-
  mean(data_kaggle$Assignments_Avg, na.rm = TRUE)
data_kaggle$Attendance[is.na(data_kaggle$Attendance)] <-
  mean(data_kaggle$Attendance, na.rm = TRUE)
# Return the most frequent non-missing value of `v`.
#
# v: an atomic vector or factor.
# Returns a length-one value of the same type as `v`; when several values tie
# for the highest count, the one that appears first in `v` is returned. If `v`
# has no non-missing values the result is NA.
get_mode <- function(v) {
  # Drop NAs first so a column with many gaps cannot report NA as its mode.
  observed <- v[!is.na(v)]
  uniqv <- unique(observed)
  uniqv[which.max(tabulate(match(observed, uniqv)))]
}
```
```{r}
# Fill missing parent-education entries with the most frequent level.
# The column was renamed from Parent_Education_Level to Parent_Education in
# the rename step, and the data frame is `data_kaggle` (`data` is a base R
# function and cannot be indexed with `$`).
data_kaggle$Parent_Education[is.na(data_kaggle$Parent_Education)] <-
  get_mode(data_kaggle$Parent_Education)
```
```{r}
# Columns included in the correlation matrix.
# NOTE(review): underscore spellings other than `Projects_Score` and
# `Total_Score` are restored by analogy -- confirm with colnames(data_kaggle).
correlation_columns <- c(
  "Assignments_Avg", "Total_Score", "Midterm_Score",
  "Final_Score", "Quizzes_Avg", "Projects_Score"
)

# Pairwise correlations, using only rows complete in all selected columns.
corr_matrix <- cor(data_kaggle[, correlation_columns], use = "complete.obs")

# Long format (Var1, Var2, value) so each cell becomes one tile.
melted_corr <- melt(corr_matrix)

ggplot(data = melted_corr, aes(x = Var1, y = Var2, fill = value)) +
  # One tile per correlation value, separated by black borders.
  geom_tile(color = "black", linewidth = 0.5) +
  # Print the correlation in each tile with two decimals.
  geom_text(aes(label = sprintf("%.2f", value)), color = "black", size = 4) +
  # Diverging palette spanning -1 to 1.
  scale_fill_gradient2(
    low = "#F0E5FF",  # colour for the lowest values
    mid = "#B19CD9",  # colour at the midpoint
    high = "#4B0082", # colour for the highest values
    midpoint = 0,
    limits = c(-1, 1)
  ) +
  # Title and minimal base theme.
  ggtitle("Correlation Heatmap: Attendance, Scores, and Study Habits") +
  theme_minimal() +
  # Axis label and title styling.
  theme(
    axis.text.x = element_text(
      angle = 45, hjust = 1, size = 12, color = "black"
    ),
    axis.text.y = element_text(size = 12, color = "black"),
    plot.title = element_text(size = 14, face = "bold", color = "purple")
  )
```
```{r}
# Frequency table of Gender as a two-column data frame.
df_gender <- as.data.frame(table(data_kaggle$Gender))
colnames(df_gender) <- c("Gender", "Count")
```
```{r}
# Share of each gender, in percent.
df_gender$Percentage <- df_gender$Count / sum(df_gender$Count) * 100
color_map <- c("#9E93E8", "#D8BFD8")

ggplot(df_gender, aes(x = 2, y = Count, fill = Gender)) +
  # stat = "identity" draws the precomputed counts.
  geom_bar(stat = "identity", color = "#FDF7F4", alpha = 0.8, width = 1) +
  # Polar coordinates with a start angle of pi/2 (90 degrees).
  coord_polar("y", start = pi / 2) +
  # Widen the x range so a "donut hole" forms in the centre.
  xlim(0.5, 2.5) +
  # Remove background and grid.
  theme_void() +
  # Legend placement and title styling.
  theme(
    legend.position = "left",
    plot.title = element_text(
      hjust = 0.5, size = 14, face = "bold", color = "purple"
    )
  ) +
  # Percentage label centred in each sector.
  geom_text(
    aes(label = paste0(round(Percentage, 2), "%")),
    position = position_stack(vjust = 0.5),
    size = 5,
    color = "black"
  ) +
  # Custom fill colours.
  scale_fill_manual(values = color_map) +
  ggtitle("Gender")
```

```{r}
# Scatter of sleep hours against total score, coloured by grade, with a
# quadratic least-squares fit drawn per colour group.
ggplot(data_kaggle, aes(x = Sleep_Hours, y = Total_Score, color = Grade)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2), se = FALSE) +
  labs(
    title = "Polynomial Regression: Sleep Hours vs TotalScore",
    x = "Sleep Hours per Night",
    y = "Total Score"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(size = 14, face = "bold", color = "purple"),
    legend.position = "right"
  )
```
Beberapa siswa mungkin belajar dalam waktu lama tetapi memperoleh nilai rendah karena teknik belajar yang tidak efektif atau stres. Mari kita analisis efisiensi belajar lintas jenis kelamin.
```{r}
# Convert the categorical columns to factors.
data_kaggle$Gender <- as.factor(data_kaggle$Gender)
data_kaggle$Department <- as.factor(data_kaggle$Department)
data_kaggle$Grade <- as.factor(data_kaggle$Grade)
```
```{r}
# Set the level order of Grade explicitly to A, B, C, D, F.
data_kaggle$Grade <- factor(
  data_kaggle$Grade,
  levels = c("A", "B", "C", "D", "F")
)
```
```{r}
# Stacked bar chart of grade counts per department, one panel per gender.
ggplot(data_kaggle, aes(x = Department, fill = Grade)) +
  geom_bar(position = "stack") +
  facet_wrap(~ Gender) +
  labs(
    title = "Grade Distribution by Gender and Department",
    x = "Department",
    y = "Number of Students"
  ) +
  scale_fill_brewer(palette = "RdPu") +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(
      hjust = 0.5, size = 14, face = "bold", color = "purple"
    ),
    legend.position = "top"
  )
```
```{r}
library(patchwork)

# Histogram on the density scale with a kernel density overlay; the bin count
# comes from Sturges' rule.
# The plots read `data_kaggle`; `data` is never assigned in this document.
hts <- ggplot(data = data_kaggle, aes(x = Total_Score)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = nclass.Sturges(data_kaggle$Total_Score),
    fill = "pink",
    color = "black",
    alpha = 0.7
  ) +
  geom_density(color = "red", linewidth = 1.2) +
  labs(title = "Total Score", x = "Total Score", y = "Density") +
  theme_minimal()

# Box plot of the same variable.
bpts <- ggplot(data_kaggle, aes(y = Total_Score)) +
  geom_boxplot(fill = "green", color = "black") +
  labs(title = "Total Score", y = "Total Score") +
  theme_minimal()

# Normal Q-Q plot with reference line.
qq_ts <- ggplot(data_kaggle, aes(sample = Total_Score)) +
  stat_qq() +
  stat_qq_line(color = "red", linewidth = 1) +
  labs(
    title = "Normal Q-Q Plot",
    x = "Theoretical Quantiles",
    y = "Sample Quantiles"
  ) +
  theme_minimal()

# patchwork layout: histogram and box plot side by side, Q-Q plot underneath.
(hts | bpts) / qq_ts
```
```{r}
# One-way ANOVA of total score across parent-education levels.
anova_result <- aov(Total_Score ~ Parent_Education, data = data_kaggle)
anova_summary <- summary(anova_result)

# p-value of the Parent_Education term (first row of the ANOVA table).
p_value <- anova_summary[[1]][["Pr(>F)"]][1]

cat(sprintf("ANOVA p-value: %.4f\n", p_value))
if (p_value < 0.05) {
  cat("Parent education level significantly affects student performance!\n")
} else {
  cat("No significant effect of parent education level on student performance.\n")
}
```
# Address Stress and Sleep Deficiency:

```{r}
# Label each student by whether they sleep fewer than 7 hours per night
# ("Tidak Cukup") or not ("Cukup"), with "Cukup" as the first factor level.
data_kaggle$Sleep_Deficiency <- factor(
  ifelse(data_kaggle$Sleep_Hours < 7, "Tidak Cukup", "Cukup"),
  levels = c("Cukup", "Tidak Cukup")
)
```
```{r}
library(ggplot2)

# Box plot of total score for the two sleep categories.
p1 <- ggplot(
  data_kaggle,
  aes(x = Sleep_Deficiency, y = Total_Score, fill = Sleep_Deficiency)
) +
  geom_boxplot() +
  labs(
    title = "Student Performance by Sleep Deficiency",
    x = "Sleep Category",
    y = "Total Score"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("Cukup" = "green", "Tidak Cukup" = "red"))

# Two-sample t-test of total score between the sleep categories.
ttest_result <- t.test(Total_Score ~ Sleep_Deficiency, data = data_kaggle)
cat(sprintf("T-test p-value: %.4f\n", ttest_result$p.value))
if (ttest_result$p.value < 0.05) {
  cat("Terdapat perbedaan signifikan pada Total Score antara kelompok Cukup dan Tidak Cukup.\n")
} else {
  cat("Tidak terdapat perbedaan signifikan pada Total Score antara kedua kelompok.\n")
}

# Only run the stress analysis when the column is present.
if ("Stress_Level" %in% colnames(data_kaggle)) {
  # Compute the correlation on `data_kaggle`; `data` is never assigned in
  # this document.
  cor_result <- cor(
    data_kaggle$Stress_Level,
    data_kaggle$Total_Score,
    use = "complete.obs"
  )
  cat(sprintf(
    "Correlation between Stress Level and Total Score: %.4f\n",
    cor_result
  ))

  # Scatter plot with a linear regression line.
  p2 <- ggplot(data_kaggle, aes(x = Stress_Level, y = Total_Score)) +
    geom_point(color = "blue", alpha = 0.7) +
    geom_smooth(method = "lm", se = FALSE, color = "red") +
    labs(
      title = "Relationship between Stress Level and Total Score",
      x = "Stress Level",
      y = "Total Score"
    ) +
    theme_minimal()

  print(p2)
}

print(p1)
```
```{r}
# Violin plot of total score for each stress level (level treated as a
# category); trim = FALSE keeps the density tails beyond the data range.
ggplot(
  data_kaggle,
  aes(
    x = factor(Stress_Level),
    y = Total_Score,
    fill = factor(Stress_Level)
  )
) +
  geom_violin(trim = FALSE) +
  labs(
    title = "Total Score by Stress Level",
    x = "Stress Level",
    y = "Total Score"
  ) +
  theme_minimal()
```
```{r}
library(dplyr)

# Mean total score at each stress level (missing scores ignored).
df_mean <- data_kaggle %>%
  group_by(Stress_Level) %>%
  summarize(mean_score = mean(Total_Score, na.rm = TRUE))

# Line chart of the means with a point at each level.
ggplot(df_mean, aes(x = Stress_Level, y = mean_score)) +
  geom_line(color = "blue") +
  geom_point(color = "red", size = 2) +
  labs(
    title = "Average Total Score by Stress Level",
    x = "Stress Level",
    y = "Mean of Total Score"
  ) +
  theme_minimal()
```
```{r}
# Kolmogorov-Smirnov test of Total_Score against a normal distribution whose
# mean and standard deviation are estimated from the same column.
ks_result <- ks.test(
  data_kaggle$Total_Score,
  "pnorm",
  mean = mean(data_kaggle$Total_Score, na.rm = TRUE),
  sd = sd(data_kaggle$Total_Score, na.rm = TRUE)
)

print(ks_result)
```
```{r}
# Visualise with a Q-Q plot and a red reference line.
qqnorm(data_kaggle$Total_Score, main = "QQ Plot: Total_Score")
qqline(data_kaggle$Total_Score, col = "red")
```