IMPORT DATA

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.3.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.3.3

## corrplot 0.95 loaded

library(reshape2)

## Warning: package 'reshape2' was built under R version 4.3.3

library(patchwork)

## Warning: package 'patchwork' was built under R version 4.3.3

library(psych)

## Warning: package 'psych' was built under R version 4.3.3

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

Membaca Data

# Load the raw student grading data set from a local CSV file.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file exists -- consider a project-relative path.
# The dotted column names seen below (e.g. "Attendance....") presumably come
# from read.csv()'s default check.names = TRUE rewriting headers that are not
# syntactically valid R names -- confirm against the CSV header.
data_kaggle <- read.csv("C:/Users/M S I/Downloads/Students_Grading_Dataset (1).csv")
# Preview the first rows of the data set.
head(data_kaggle)

##   Student_ID First_Name Last_Name                   Email Gender Age
## 1      S1000       Omar  Williams student0@university.com Female  22
## 2      S1001      Maria     Brown student1@university.com   Male  18
## 3      S1002      Ahmed     Jones student2@university.com   Male  24
## 4      S1003       Omar  Williams student3@university.com Female  24
## 5      S1004       John     Smith student4@university.com Female  23
## 6      S1005       Liam     Brown student5@university.com   Male  21
##    Department Attendance.... Midterm_Score Final_Score Assignments_Avg
## 1 Engineering          52.29         55.03       57.82           84.22
## 2 Engineering          97.27         97.23       45.80              NA
## 3    Business          57.19         67.05       93.68           67.70
## 4 Mathematics          95.15         47.79       80.63           66.06
## 5          CS          54.18         46.59       78.89           96.85
## 6 Engineering             NA         78.85       43.53           71.40
##   Quizzes_Avg Participation_Score Projects_Score Total_Score Grade
## 1       74.06                3.99          85.90       56.09     F
## 2       94.24                8.32          55.65       50.64     A
## 3       85.70                5.05          73.79       70.30     D
## 4       93.51                6.54          92.12       61.63     A
## 5       83.70                5.97          68.42       66.13     F
## 6       52.20                6.38          67.29       62.08     B
##   Study_Hours_per_Week Extracurricular_Activities Internet_Access_at_Home
## 1                  6.2                         No                     Yes
## 2                 19.0                         No                     Yes
## 3                 20.7                         No                     Yes
## 4                 24.8                        Yes                     Yes
## 5                 15.4                        Yes                     Yes
## 6                  8.5                        Yes                     Yes
##   Parent_Education_Level Family_Income_Level Stress_Level..1.10.
## 1            High School              Medium                   5
## 2                   None              Medium                   4
## 3               Master's                 Low                   6
## 4            High School                High                   3
## 5            High School                High                   2
## 6                    PhD                High                   1
##   Sleep_Hours_per_Night
## 1                   4.7
## 2                   9.0
## 3                   6.2
## 4                   6.7
## 5                   7.1
## 6                   5.0

# Give the long or dot-mangled columns shorter names.
# The lookup is written as new_name = "old_name".
column_lookup <- c(
  Attendance       = "Attendance....",
  Study_Hours      = "Study_Hours_per_Week",
  Stress_Level     = "Stress_Level..1.10.",
  Sleep_Hours      = "Sleep_Hours_per_Night",
  Internet         = "Internet_Access_at_Home",
  Parent_Education = "Parent_Education_Level"
)
data_kaggle <- rename(data_kaggle, all_of(column_lookup))

# Show the resulting column names.
colnames(data_kaggle)

##  [1] "Student_ID"                 "First_Name"                
##  [3] "Last_Name"                  "Email"                     
##  [5] "Gender"                     "Age"                       
##  [7] "Department"                 "Attendance"                
##  [9] "Midterm_Score"              "Final_Score"               
## [11] "Assignments_Avg"            "Quizzes_Avg"               
## [13] "Participation_Score"        "Projects_Score"            
## [15] "Total_Score"                "Grade"                     
## [17] "Study_Hours"                "Extracurricular_Activities"
## [19] "Internet"                   "Parent_Education"          
## [21] "Family_Income_Level"        "Stress_Level"              
## [23] "Sleep_Hours"

# Save the renamed data set to the working directory (without row names).
# NOTE(review): despite the "clean" file name, only the column names have been
# changed at this point; the NA imputation performed later in this script is
# not reflected in the written file.
write.csv(data_kaggle, "data_kaggle_clean.csv", row.names = FALSE)

MEMBACA TOTAL BARIS DAN KOLOM

# Dimensions of the data set: number of rows, then number of columns.
dim(data_kaggle)

## [1] 5000   23

# Per-column overview: five-number summary, mean and NA count for numeric
# columns; length/class/mode for character columns.
summary(data_kaggle)

##   Student_ID         First_Name         Last_Name            Email          
##  Length:5000        Length:5000        Length:5000        Length:5000       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     Gender               Age         Department          Attendance    
##  Length:5000        Min.   :18.00   Length:5000        Min.   : 50.01  
##  Class :character   1st Qu.:19.00   Class :character   1st Qu.: 63.27  
##  Mode  :character   Median :21.00   Mode  :character   Median : 75.72  
##                     Mean   :21.05                      Mean   : 75.43  
##                     3rd Qu.:23.00                      3rd Qu.: 87.47  
##                     Max.   :24.00                      Max.   :100.00  
##                                                        NA's   :516     
##  Midterm_Score    Final_Score    Assignments_Avg  Quizzes_Avg   
##  Min.   :40.00   Min.   :40.00   Min.   :50.00   Min.   :50.03  
##  1st Qu.:55.46   1st Qu.:54.67   1st Qu.:62.09   1st Qu.:62.49  
##  Median :70.51   Median :69.73   Median :74.81   Median :74.69  
##  Mean   :70.33   Mean   :69.64   Mean   :74.80   Mean   :74.91  
##  3rd Qu.:84.97   3rd Qu.:84.50   3rd Qu.:86.97   3rd Qu.:87.63  
##  Max.   :99.98   Max.   :99.98   Max.   :99.98   Max.   :99.96  
##                                  NA's   :517                    
##  Participation_Score Projects_Score    Total_Score       Grade          
##  Min.   : 0.000      Min.   : 50.01   Min.   :50.02   Length:5000       
##  1st Qu.: 2.440      1st Qu.: 62.32   1st Qu.:62.84   Class :character  
##  Median : 4.955      Median : 74.98   Median :75.39   Mode  :character  
##  Mean   : 4.980      Mean   : 74.92   Mean   :75.12                     
##  3rd Qu.: 7.500      3rd Qu.: 87.37   3rd Qu.:87.65                     
##  Max.   :10.000      Max.   :100.00   Max.   :99.99                     
##                                                                         
##   Study_Hours    Extracurricular_Activities   Internet        
##  Min.   : 5.00   Length:5000                Length:5000       
##  1st Qu.:11.40   Class :character           Class :character  
##  Median :17.50   Mode  :character           Mode  :character  
##  Mean   :17.66                                                
##  3rd Qu.:24.10                                                
##  Max.   :30.00                                                
##                                                               
##  Parent_Education   Family_Income_Level  Stress_Level     Sleep_Hours   
##  Length:5000        Length:5000         Min.   : 1.000   Min.   :4.000  
##  Class :character   Class :character    1st Qu.: 3.000   1st Qu.:5.200  
##  Mode  :character   Mode  :character    Median : 5.000   Median :6.500  
##                                         Mean   : 5.481   Mean   :6.488  
##                                         3rd Qu.: 8.000   3rd Qu.:7.700  
##                                         Max.   :10.000   Max.   :9.000  
##

Menampilkan jumlah missing value pada tiap kolom sebelum penggantian

cat("Jumlah missing value sebelum penggantian:\n")

## Jumlah missing value sebelum penggantian:

# Count the NAs in every column. colSums() over the logical NA mask always
# returns a named numeric vector, whereas the return type of sapply() depends
# on its input; this also matches the percentage computation that follows.
print(colSums(is.na(data_kaggle)))

##                 Student_ID                 First_Name 
##                          0                          0 
##                  Last_Name                      Email 
##                          0                          0 
##                     Gender                        Age 
##                          0                          0 
##                 Department                 Attendance 
##                          0                        516 
##              Midterm_Score                Final_Score 
##                          0                          0 
##            Assignments_Avg                Quizzes_Avg 
##                        517                          0 
##        Participation_Score             Projects_Score 
##                          0                          0 
##                Total_Score                      Grade 
##                          0                          0 
##                Study_Hours Extracurricular_Activities 
##                          0                          0 
##                   Internet           Parent_Education 
##                          0                          0 
##        Family_Income_Level               Stress_Level 
##                          0                          0 
##                Sleep_Hours 
##                          0

# Share of missing values per column, in percent of all rows.
(colSums(is.na(data_kaggle)) / nrow(data_kaggle)) * 100

##                 Student_ID                 First_Name 
##                       0.00                       0.00 
##                  Last_Name                      Email 
##                       0.00                       0.00 
##                     Gender                        Age 
##                       0.00                       0.00 
##                 Department                 Attendance 
##                       0.00                      10.32 
##              Midterm_Score                Final_Score 
##                       0.00                       0.00 
##            Assignments_Avg                Quizzes_Avg 
##                      10.34                       0.00 
##        Participation_Score             Projects_Score 
##                       0.00                       0.00 
##                Total_Score                      Grade 
##                       0.00                       0.00 
##                Study_Hours Extracurricular_Activities 
##                       0.00                       0.00 
##                   Internet           Parent_Education 
##                       0.00                       0.00 
##        Family_Income_Level               Stress_Level 
##                       0.00                       0.00 
##                Sleep_Hours 
##                       0.00

# Descriptive statistics (n, mean, sd, median, skew, kurtosis, ...) via
# psych::describe(). Rows marked with "*" in the output are non-numeric columns
# that were converted to numeric codes, so their statistics are not meaningful
# on the original scale.
describe(data_kaggle)

##                             vars    n    mean      sd  median trimmed     mad
## Student_ID*                    1 5000 2500.50 1443.52 2500.50 2500.50 1853.25
## First_Name*                    2 5000    4.46    2.30    4.00    4.45    2.97
## Last_Name*                     3 5000    3.49    1.69    3.00    3.48    1.48
## Email*                         4 5000 2500.50 1443.52 2500.50 2500.50 1853.25
## Gender*                        5 5000    1.51    0.50    2.00    1.51    0.00
## Age                            6 5000   21.05    1.99   21.00   21.06    2.97
## Department*                    7 5000    2.29    0.90    2.00    2.24    1.48
## Attendance                     8 4484   75.43   14.37   75.72   75.50   17.87
## Midterm_Score                  9 5000   70.33   17.21   70.51   70.38   21.90
## Final_Score                   10 5000   69.64   17.24   69.74   69.58   22.16
## Assignments_Avg               11 4483   74.80   14.41   74.81   74.76   18.47
## Quizzes_Avg                   12 5000   74.91   14.50   74.69   74.89   18.67
## Participation_Score           13 5000    4.98    2.89    4.96    4.97    3.76
## Projects_Score                14 5000   74.92   14.42   74.98   74.93   18.58
## Total_Score                   15 5000   75.12   14.40   75.40   75.16   18.41
## Grade*                        16 5000    2.72    1.47    3.00    2.65    1.48
## Study_Hours                   17 5000   17.66    7.28   17.50   17.67    9.34
## Extracurricular_Activities*   18 5000    1.30    0.46    1.00    1.25    0.00
## Internet*                     19 5000    1.90    0.30    2.00    2.00    0.00
## Parent_Education*             20 5000    3.42    1.75    3.00    3.40    2.97
## Family_Income_Level*          21 5000    2.19    0.75    2.00    2.23    1.48
## Stress_Level                  22 5000    5.48    2.86    5.00    5.48    2.97
## Sleep_Hours                   23 5000    6.49    1.45    6.50    6.49    1.93
##                               min     max   range  skew kurtosis    se
## Student_ID*                  1.00 5000.00 4999.00  0.00    -1.20 20.41
## First_Name*                  1.00    8.00    7.00  0.02    -1.25  0.03
## Last_Name*                   1.00    6.00    5.00  0.01    -1.25  0.02
## Email*                       1.00 5000.00 4999.00  0.00    -1.20 20.41
## Gender*                      1.00    2.00    1.00 -0.04    -2.00  0.01
## Age                         18.00   24.00    6.00 -0.04    -1.24  0.03
## Department*                  1.00    4.00    3.00  0.22    -0.73  0.01
## Attendance                  50.01  100.00   49.99 -0.04    -1.17  0.21
## Midterm_Score               40.00   99.98   59.98 -0.02    -1.18  0.24
## Final_Score                 40.00   99.98   59.98  0.02    -1.20  0.24
## Assignments_Avg             50.00   99.98   49.98  0.02    -1.20  0.22
## Quizzes_Avg                 50.03   99.96   49.93  0.02    -1.20  0.21
## Participation_Score          0.00   10.00   10.00  0.01    -1.21  0.04
## Projects_Score              50.01  100.00   49.99  0.00    -1.21  0.20
## Total_Score                 50.02   99.99   49.97 -0.02    -1.19  0.20
## Grade*                       1.00    5.00    4.00  0.24    -1.36  0.02
## Study_Hours                  5.00   30.00   25.00  0.00    -1.22  0.10
## Extracurricular_Activities*  1.00    2.00    1.00  0.87    -1.25  0.01
## Internet*                    1.00    2.00    1.00 -2.61     4.82  0.00
## Parent_Education*            1.00    6.00    5.00  0.05    -1.32  0.02
## Family_Income_Level*         1.00    3.00    2.00 -0.32    -1.19  0.01
## Stress_Level                 1.00   10.00    9.00  0.01    -1.22  0.04
## Sleep_Hours                  4.00    9.00    5.00  0.00    -1.20  0.02

# Number of rows that are exact duplicates of an earlier row.
sum(duplicated(data_kaggle))

## [1] 0

# Mean-impute the two numeric columns that contain NAs: every missing entry is
# replaced by the mean of the observed values of the same column.
data_kaggle <- data_kaggle %>%
  mutate(
    across(
      c(Assignments_Avg, Attendance),
      function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))
    )
  )

# Return the most frequent non-missing value (the mode) of a vector.
#
# v: an atomic vector or factor; NA entries are ignored.
#
# Returns a length-one vector of the same type as `v`. When several values
# share the highest frequency, the one that appears first in `v` wins. When
# `v` has no non-missing values, a typed NA is returned (NULL for NULL input).
get_mode <- function(v) {
  # Drop NAs first: otherwise NA itself competes as a candidate value and a
  # mostly-missing column would report NA as its mode.
  observed <- v[!is.na(v)]
  if (length(observed) == 0L) {
    return(v[NA_integer_])
  }
  uniqv <- unique(observed)
  # match() maps each value to its index in uniqv; tabulate() counts per index.
  uniqv[which.max(tabulate(match(observed, uniqv)))]
}

# Fill NAs in the categorical parent-education column with its mode.
# The column was renamed from Parent_Education_Level to Parent_Education
# earlier in this script; addressing it by the old name made this step a
# silent no-op. The mode is computed from the observed values only.
missing_parent_edu <- is.na(data_kaggle$Parent_Education)
data_kaggle$Parent_Education[missing_parent_edu] <-
  get_mode(data_kaggle$Parent_Education[!missing_parent_edu])

# 1. Columns that enter the correlation matrix (all of them are score variables)
correlation_columns <- c(
  "Assignments_Avg", "Total_Score", "Midterm_Score",
  "Final_Score", "Quizzes_Avg", "Projects_Score"
)

# 2. Correlation matrix; use = "complete.obs" drops any row that contains an NA
corr_matrix <- cor(data_kaggle[, correlation_columns], use = "complete.obs")

# 3. Reshape the matrix to long format (Var1, Var2, value) so ggplot2 can map it
melted_corr <- melt(corr_matrix)

# 4. Heatmap with the coefficient printed in every tile.
#    The tile border uses `linewidth` (not `size`, deprecated for lines since
#    ggplot2 3.4.0), and the title names only the variables actually plotted.
ggplot(data = melted_corr, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "black", linewidth = 0.5) +
  geom_text(aes(label = sprintf("%.2f", value)), color = "black", size = 4) +
  scale_fill_gradient2(
    low = "#F0E5FF",
    mid = "#B19CD9",
    high = "#4B0082",
    midpoint = 0,
    limits = c(-1, 1)
  ) +
  ggtitle("Correlation Heatmap: Assessment Scores") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12, color = "black"),
    axis.text.y = element_text(size = 12, color = "black"),
    plot.title = element_text(size = 14, face = "bold", color = "purple")
  )

# Frequency table of Gender as a two-column data frame (Gender, Count).
df_gender <- setNames(
  as.data.frame(table(data_kaggle$Gender)),
  c("Gender", "Count")
)

# Share of each gender in percent.
df_gender$Percentage <- 100 * prop.table(df_gender$Count)
color_map <- c("#9E93E8", "#D8BFD8")

# Donut chart: one stacked bar at x = 2 bent into polar coordinates; the x
# limits leave the empty space that forms the hole in the middle.
ggplot(df_gender, aes(x = 2, y = Count, fill = Gender)) +
  geom_bar(stat = "identity", color = "#FDF7F4", alpha = 0.8, width = 1) +
  geom_text(
    aes(label = paste0(round(Percentage, 2), "%")),
    position = position_stack(vjust = 0.5),
    size = 5,
    color = "black"
  ) +
  coord_polar("y", start = pi/2) +
  xlim(0.5, 2.5) +
  scale_fill_manual(values = color_map) +
  labs(title = "Gender") +
  theme_void() +
  theme(
    legend.position = "left",
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold", color = "purple")
  )

# Scatter plot of Total_Score against Sleep_Hours, coloured by Grade, with one
# quadratic least-squares curve (y ~ x + x^2) fitted per grade.
ggplot(
  data = data_kaggle,
  mapping = aes(x = Sleep_Hours, y = Total_Score, color = Grade)
) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2), se = FALSE) +
  ggtitle("Polynomial Regression: Sleep Hours vs Total_Score") +
  xlab("Sleep Hours per Night") +
  ylab("Total Score") +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "right",
    plot.title = element_text(size = 14, face = "bold", color = "purple")
  )

Mengidentifikasi departemen tempat siswa lebih sering memperoleh nilai lebih tinggi.

Beberapa siswa mungkin belajar dalam waktu lama tetapi memperoleh nilai rendah karena teknik belajar yang tidak efektif atau stres. Sebagai langkah awal, mari kita lihat distribusi nilai (Grade) di tiap departemen, dipisahkan menurut jenis kelamin.

# Convert the grouping variables to factors; Grade gets an explicit A-to-F
# level order so the stacked bars and the legend follow the grading scale.
data_kaggle <- data_kaggle %>%
  mutate(
    Gender = as.factor(Gender),
    Department = as.factor(Department),
    Grade = factor(Grade, levels = c("A", "B", "C", "D", "F"))
  )

# Stacked bar chart of grade counts per department, one panel per gender.
ggplot(data_kaggle, aes(x = Department, fill = Grade)) +
  geom_bar(position = "stack") +
  facet_wrap(~ Gender) +
  scale_fill_brewer(palette = "RdPu") +
  ggtitle("Grade Distribution by Gender and Department") +
  xlab("Department") +
  ylab("Number of Students") +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "top",
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold", color = "purple")
  )

# 1. Histogram on the density scale (bin count from Sturges' rule) with a
#    kernel density curve on top
sturges_bins <- nclass.Sturges(data_kaggle$Total_Score)
h_ts <- ggplot(data_kaggle, aes(x = Total_Score)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = sturges_bins,
    fill = "pink",
    color = "black",
    alpha = 0.7
  ) +
  geom_density(color = "red", linewidth = 1.2) +
  ggtitle("Total Score") +
  xlab("Total Score") +
  ylab("Density") +
  theme_minimal()

# 2. Box plot
bp_ts <- ggplot(data_kaggle, aes(y = Total_Score)) +
  geom_boxplot(fill = "green", color = "black") +
  ggtitle("Total Score") +
  ylab("Total Score") +
  theme_minimal()

# 3. Normal Q-Q plot (to check normality)
qq_ts <- ggplot(data_kaggle, aes(sample = Total_Score)) +
  stat_qq() +
  stat_qq_line(color = "red", linewidth = 1) +
  ggtitle("Normal Q-Q Plot") +
  xlab("Theoretical Quantiles") +
  ylab("Sample Quantiles") +
  theme_minimal()

# Patchwork layout: histogram and box plot side by side, Q-Q plot underneath.
(h_ts | bp_ts) / qq_ts

Melakukan uji ANOVA satu arah

# One-way ANOVA: does the mean Total_Score differ between parent-education
# groups?
anova_result <- aov(Total_Score ~ Parent_Education, data = data_kaggle)
anova_summary <- summary(anova_result)

# The first row of the ANOVA table belongs to the Parent_Education term.
anova_table <- anova_summary[[1]]
p_value <- anova_table[["Pr(>F)"]][1]

cat(sprintf("ANOVA p-value: %.4f\n", p_value))

## ANOVA p-value: 0.9189

# Report the decision at the 5% significance level.
anova_verdict <- if (p_value < 0.05) {
  "Parent education level significantly affects student performance!\n"
} else {
  "No significant effect of parent education level on student performance.\n"
}
cat(anova_verdict)

## No significant effect of parent education level on student performance.

Address Stress and Sleep Deficiency

# 1. Derive Sleep_Deficiency: fewer than 7 hours of sleep per night is
#    "Tidak Cukup", anything else is "Cukup" ("Cukup" is the first level)
sleep_labels <- c("Cukup", "Tidak Cukup")
data_kaggle$Sleep_Deficiency <- factor(
  ifelse(data_kaggle$Sleep_Hours < 7, sleep_labels[2], sleep_labels[1]),
  levels = sleep_labels
)

# 2. Box plot of Total_Score for each Sleep_Deficiency group
sleep_palette <- c("Cukup" = "green", "Tidak Cukup" = "red")
p1 <- ggplot(
  data_kaggle,
  aes(x = Sleep_Deficiency, y = Total_Score, fill = Sleep_Deficiency)
) +
  geom_boxplot() +
  scale_fill_manual(values = sleep_palette) +
  ggtitle("Student Performance by Sleep Deficiency") +
  xlab("Sleep Category") +
  ylab("Total Score") +
  theme_minimal()

# 3. Two-sample t-test comparing Total_Score between the "Cukup" and
#    "Tidak Cukup" sleep groups
t_test_result <- t.test(Total_Score ~ Sleep_Deficiency, data = data_kaggle)
cat(sprintf("T-test p-value: %.4f\n", t_test_result$p.value))

## T-test p-value: 0.7710

# Report the decision at the 5% significance level.
t_test_verdict <- if (t_test_result$p.value < 0.05) {
  "Terdapat perbedaan signifikan pada Total Score antara kelompok Cukup dan Tidak Cukup.\n"
} else {
  "Tidak terdapat perbedaan signifikan pada Total Score antara kedua kelompok.\n"
}
cat(t_test_verdict)

## Tidak terdapat perbedaan signifikan pada Total Score antara kedua kelompok.

# 4. (Optional) Only when a Stress_Level column exists: correlation between
#    Stress_Level and Total_Score, plus a scatter plot with a fitted line
has_stress_level <- "Stress_Level" %in% names(data_kaggle)
if (has_stress_level) {
  # Correlation over the rows where both variables are observed
  cor_result <- with(
    data_kaggle,
    cor(Stress_Level, Total_Score, use = "complete.obs")
  )
  cat(sprintf("Correlation between Stress Level and Total Score: %.4f\n", cor_result))

  # Scatter plot with a linear regression line
  p2 <- ggplot(data_kaggle, aes(x = Stress_Level, y = Total_Score)) +
    geom_point(color = "blue", alpha = 0.7) +
    geom_smooth(method = "lm", se = FALSE, color = "red") +
    ggtitle("Relationship between Stress Level and Total Score") +
    xlab("Stress Level") +
    ylab("Total Score") +
    theme_minimal()

  print(p2)
}

## Correlation between Stress Level and Total Score: 0.0042

## `geom_smooth()` using formula = 'y ~ x'

# Display the sleep-deficiency box plot (p1) built earlier in this section
print(p1)

Visualisasi Distribusi Skor Total pada Berbagai Tingkat Stres dengan Violin Plot

# Violin plot of Total_Score for each stress level. Stress_Level is treated as
# a categorical variable, and trim = FALSE keeps the violin tails untrimmed.
ggplot(
  data = data_kaggle,
  mapping = aes(x = factor(Stress_Level), y = Total_Score, fill = factor(Stress_Level))
) +
  geom_violin(trim = FALSE) +
  ggtitle("Total Score by Stress Level") +
  xlab("Stress Level") +
  ylab("Total Score") +
  theme_minimal()

KS-TEST

library(dplyr)

# Mean Total_Score for every Stress_Level (one row per level)
df_mean <- summarize(
  group_by(data_kaggle, Stress_Level),
  mean_score = mean(Total_Score, na.rm = TRUE)
)

# Line chart of the group means, with a point marking each stress level
ggplot(df_mean, aes(x = Stress_Level, y = mean_score)) +
  geom_line(color = "blue") +
  geom_point(color = "red", size = 2) +
  ggtitle("Average Total Score by Stress Level") +
  xlab("Stress Level") +
  ylab("Mean of Total Score") +
  theme_minimal()

# One-sample Kolmogorov-Smirnov test of Total_Score against a normal
# distribution whose mean and sd are taken from the sample itself.
# NOTE(review): the ks.test() documentation states that the parameters of the
# reference distribution must be pre-specified and not estimated from the
# data. Because they are estimated here, the reported p-value is only
# approximate; a Lilliefors-type correction would be needed for a valid test.
ks_result <- ks.test(
  data_kaggle$Total_Score,
  "pnorm",
  mean = mean(data_kaggle$Total_Score, na.rm = TRUE),
  sd   = sd(data_kaggle$Total_Score, na.rm = TRUE)
)

## Warning in ks.test.default(data_kaggle$Total_Score, "pnorm", mean =
## mean(data_kaggle$Total_Score, : ties should not be present for the
## Kolmogorov-Smirnov test

# Print the KS test result
print(ks_result)

## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  data_kaggle$Total_Score
## D = 0.058641, p-value = 2.331e-15
## alternative hypothesis: two-sided

Visualisasi dengan QQ Plot

# Base-graphics normal Q-Q plot of Total_Score; qqline() adds the red
# reference line to the plot drawn by qqnorm().
qqnorm(data_kaggle$Total_Score, main = "QQ Plot: Total_Score")
qqline(data_kaggle$Total_Score, col = "red")

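Berdasarkan hasil visualisasi menggunakan QQ Plot pada variabel Total_Score, dapat disimpulkan bahwa distribusi data tidak mengikuti distribusi normal; hal ini sejalan dengan hasil uji KS di atas (p-value jauh di bawah 0,05). Hal ini ditunjukkan oleh adanya penyimpangan titik-titik data dari garis referensi merah pada bagian ujung plot, baik di sisi kiri maupun kanan. Namun, penyimpangan tersebut tidak mengindikasikan adanya outlier atau ekor tebal (heavy tails): nilai Total_Score terbatas pada rentang 50,02–99,99 dengan skewness sekitar -0,02 dan kurtosis sekitar -1,19, sehingga distribusinya simetris, cenderung merata (mendekati uniform), dan berekor lebih tipis (light tails) daripada distribusi normal.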

TUGAS INDIVIDU AED

G1401231008_Shafi Faris Arif Rabbani

2025-03-15