library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(ggplot2)
# Load the student data set.
# The data folder is kept in a variable and joined to the file name with
# file.path() instead of calling setwd(), so the working directory of the
# R session is left untouched.
data_dir <- "C:/UNNES/Semester 1/Pengantar Sains Data/Latihan R"
student <- read.csv(
  file.path(data_dir, "student-mat.csv"),
  sep = ";",
  header = TRUE
)
# Convert the G3 column to double.
student$G3 <- as.numeric(student$G3)
# Print the type and first values of every column.
str(student)
## 'data.frame':    395 obs. of  33 variables:
##  $ school    : chr  "GP" "GP" "GP" "GP" ...
##  $ sex       : chr  "F" "F" "F" "F" ...
##  $ age       : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ address   : chr  "U" "U" "U" "U" ...
##  $ famsize   : chr  "GT3" "GT3" "LE3" "GT3" ...
##  $ Pstatus   : chr  "A" "T" "T" "T" ...
##  $ Medu      : int  4 1 1 4 3 4 2 4 3 3 ...
##  $ Fedu      : int  4 1 1 2 3 3 2 4 2 4 ...
##  $ Mjob      : chr  "at_home" "at_home" "at_home" "health" ...
##  $ Fjob      : chr  "teacher" "other" "other" "services" ...
##  $ reason    : chr  "course" "course" "other" "home" ...
##  $ guardian  : chr  "mother" "father" "mother" "mother" ...
##  $ traveltime: int  2 1 1 1 1 1 1 2 1 1 ...
##  $ studytime : int  2 2 2 3 2 2 2 2 2 2 ...
##  $ failures  : int  0 0 3 0 0 0 0 0 0 0 ...
##  $ schoolsup : chr  "yes" "no" "yes" "no" ...
##  $ famsup    : chr  "no" "yes" "no" "yes" ...
##  $ paid      : chr  "no" "no" "yes" "yes" ...
##  $ activities: chr  "no" "no" "no" "yes" ...
##  $ nursery   : chr  "yes" "no" "yes" "yes" ...
##  $ higher    : chr  "yes" "yes" "yes" "yes" ...
##  $ internet  : chr  "no" "yes" "yes" "yes" ...
##  $ romantic  : chr  "no" "no" "no" "yes" ...
##  $ famrel    : int  4 5 4 3 4 5 4 4 4 5 ...
##  $ freetime  : int  3 3 3 2 3 4 4 1 2 5 ...
##  $ goout     : int  4 3 2 2 2 2 4 4 2 1 ...
##  $ Dalc      : int  1 1 2 1 1 1 1 1 1 1 ...
##  $ Walc      : int  1 1 3 1 2 2 1 1 1 1 ...
##  $ health    : int  3 3 3 5 5 5 3 1 1 5 ...
##  $ absences  : int  6 4 10 2 4 10 0 6 0 0 ...
##  $ G1        : int  5 5 7 15 6 15 12 6 16 14 ...
##  $ G2        : int  6 5 8 14 10 15 12 5 18 15 ...
##  $ G3        : num  6 6 10 15 10 15 11 6 19 15 ...
# Preview the first rows of the data set (six rows with head()'s default).
head(student)
##   school sex age address famsize Pstatus Medu Fedu     Mjob     Fjob     reason
## 1     GP   F  18       U     GT3       A    4    4  at_home  teacher     course
## 2     GP   F  17       U     GT3       T    1    1  at_home    other     course
## 3     GP   F  15       U     LE3       T    1    1  at_home    other      other
## 4     GP   F  15       U     GT3       T    4    2   health services       home
## 5     GP   F  16       U     GT3       T    3    3    other    other       home
## 6     GP   M  16       U     LE3       T    4    3 services    other reputation
##   guardian traveltime studytime failures schoolsup famsup paid activities
## 1   mother          2         2        0       yes     no   no         no
## 2   father          1         2        0        no    yes   no         no
## 3   mother          1         2        3       yes     no  yes         no
## 4   mother          1         3        0        no    yes  yes        yes
## 5   father          1         2        0        no    yes  yes         no
## 6   mother          1         2        0        no    yes  yes        yes
##   nursery higher internet romantic famrel freetime goout Dalc Walc health
## 1     yes    yes       no       no      4        3     4    1    1      3
## 2      no    yes      yes       no      5        3     3    1    1      3
## 3     yes    yes      yes       no      4        3     2    2    3      3
## 4     yes    yes      yes      yes      3        2     2    1    1      5
## 5     yes    yes       no       no      4        3     2    1    2      5
## 6     yes    yes      yes       no      5        4     2    1    2      5
##   absences G1 G2 G3
## 1        6  5  6  6
## 2        4  5  5  6
## 3       10  7  8 10
## 4        2 15 14 15
## 5        4  6 10 10
## 6       10 15 15 15
# Per-column summary: min/quartiles/mean/max for numeric columns and
# length/class/mode for character columns.
summary(student)
##     school              sex                 age         address         
##  Length:395         Length:395         Min.   :15.0   Length:395        
##  Class :character   Class :character   1st Qu.:16.0   Class :character  
##  Mode  :character   Mode  :character   Median :17.0   Mode  :character  
##                                        Mean   :16.7                     
##                                        3rd Qu.:18.0                     
##                                        Max.   :22.0                     
##    famsize            Pstatus               Medu            Fedu      
##  Length:395         Length:395         Min.   :0.000   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:2.000   1st Qu.:2.000  
##  Mode  :character   Mode  :character   Median :3.000   Median :2.000  
##                                        Mean   :2.749   Mean   :2.522  
##                                        3rd Qu.:4.000   3rd Qu.:3.000  
##                                        Max.   :4.000   Max.   :4.000  
##      Mjob               Fjob              reason            guardian        
##  Length:395         Length:395         Length:395         Length:395        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    traveltime      studytime        failures       schoolsup        
##  Min.   :1.000   Min.   :1.000   Min.   :0.0000   Length:395        
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000   Class :character  
##  Median :1.000   Median :2.000   Median :0.0000   Mode  :character  
##  Mean   :1.448   Mean   :2.035   Mean   :0.3342                     
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:0.0000                     
##  Max.   :4.000   Max.   :4.000   Max.   :3.0000                     
##     famsup              paid            activities          nursery         
##  Length:395         Length:395         Length:395         Length:395        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     higher            internet           romantic             famrel     
##  Length:395         Length:395         Length:395         Min.   :1.000  
##  Class :character   Class :character   Class :character   1st Qu.:4.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :4.000  
##                                                           Mean   :3.944  
##                                                           3rd Qu.:5.000  
##                                                           Max.   :5.000  
##     freetime         goout            Dalc            Walc      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :3.000   Median :3.000   Median :1.000   Median :2.000  
##  Mean   :3.235   Mean   :3.109   Mean   :1.481   Mean   :2.291  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:2.000   3rd Qu.:3.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##      health         absences            G1              G2       
##  Min.   :1.000   Min.   : 0.000   Min.   : 3.00   Min.   : 0.00  
##  1st Qu.:3.000   1st Qu.: 0.000   1st Qu.: 8.00   1st Qu.: 9.00  
##  Median :4.000   Median : 4.000   Median :11.00   Median :11.00  
##  Mean   :3.554   Mean   : 5.709   Mean   :10.91   Mean   :10.71  
##  3rd Qu.:5.000   3rd Qu.: 8.000   3rd Qu.:13.00   3rd Qu.:13.00  
##  Max.   :5.000   Max.   :75.000   Max.   :19.00   Max.   :19.00  
##        G3       
##  Min.   : 0.00  
##  1st Qu.: 8.00  
##  Median :11.00  
##  Mean   :10.42  
##  3rd Qu.:14.00  
##  Max.   :20.00
# Count the missing values (NA) in each column.
colSums(is.na(student))
##     school        sex        age    address    famsize    Pstatus       Medu 
##          0          0          0          0          0          0          0 
##       Fedu       Mjob       Fjob     reason   guardian traveltime  studytime 
##          0          0          0          0          0          0          0 
##   failures  schoolsup     famsup       paid activities    nursery     higher 
##          0          0          0          0          0          0          0 
##   internet   romantic     famrel   freetime      goout       Dalc       Walc 
##          0          0          0          0          0          0          0 
##     health   absences         G1         G2         G3 
##          0          0          0          0          0
# Count the rows that duplicate an earlier row.
sum(duplicated(student))
## [1] 0
  1. Analisis Karakteristik Siswa. Untuk menganalisis komposisi siswa, kita menggunakan bar plot frekuensi.
# Bar chart: number of students per sex
ggplot(data = student, mapping = aes(x = sex)) +
  geom_bar(mapping = aes(fill = sex)) +
  labs(
    title = "Komposisi Siswa Berdasarkan Jenis Kelamin",
    x = "Jenis Kelamin (F=Wanita, M=Pria)",
    y = "Jumlah Siswa"
  ) +
  theme_minimal()

# Bar chart: number of students per home address type
ggplot(data = student, mapping = aes(x = address)) +
  geom_bar(mapping = aes(fill = address)) +
  labs(
    title = "Komposisi Siswa Berdasarkan Tempat Tinggal",
    x = "Tempat Tinggal (U=Urban, R=Rural)",
    y = "Jumlah Siswa"
  ) +
  theme_minimal()

# Bar chart: number of students per study-time level.
# studytime is stored as an integer, so it is turned into a factor to get
# one discrete bar and one fill colour per level.
ggplot(data = student, mapping = aes(x = factor(studytime))) +
  geom_bar(mapping = aes(fill = factor(studytime))) +
  labs(
    title = "Komposisi Siswa Berdasarkan Waktu Belajar",
    x = "Waktu Belajar (1: <2j, 2: 2-5j, 3: 5-10j, 4: >10j)",
    y = "Jumlah Siswa"
  ) +
  scale_fill_discrete(name = "Waktu Belajar") +
  theme_minimal()

Persebaran Nilai Akhir (G3) dan Outlier

# Histogram of the final grade (G3)
hist(
  x = student[["G3"]],
  col = "lightblue",
  border = "black",
  main = "Distribusi Nilai Akhir (G3)",
  xlab = "Nilai Akhir"
)

# Box plot of absences, used to look for outliers
boxplot(
  x = student[["absences"]],
  col = "lightcoral",
  main = "Pengecekan Outlier pada Absences",
  ylab = "Jumlah Ketidakhadiran"
)

# Min, quartiles, median, mean and max of the final grade
summary(student[["G3"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    8.00   11.00   10.42   14.00   20.00
  2. Analisis Nilai Akademik Siswa. Tujuan utama adalah menguji pengaruh G1 dan G2 terhadap G3. Korelasi dan visualisasi scatter plot digunakan untuk eksplorasi, diikuti dengan model regresi untuk mengukur pengaruh.

Korelasi dan Hubungan G1, G2 dengan G3

# Correlation between G1 and G3, printed rounded to three decimals
cor_G1_G3 <- with(student, cor(G1, G3))
print(paste("Korelasi G1 dan G3:", round(cor_G1_G3, digits = 3)))
## [1] "Korelasi G1 dan G3: 0.801"
# Correlation between G2 and G3, printed rounded to three decimals
cor_G2_G3 <- with(student, cor(G2, G3))
print(paste("Korelasi G2 dan G3:", round(cor_G2_G3, digits = 3)))
## [1] "Korelasi G2 dan G3: 0.905"
# Scatter plot of G3 against G1
plot(
  x = student[["G1"]],
  y = student[["G3"]],
  pch = 19,
  col = "darkorange",
  main = "Hubungan G1 dengan G3",
  xlab = "Nilai G1",
  ylab = "Nilai G3"
)

# Scatter plot of G3 against G2
plot(
  x = student[["G2"]],
  y = student[["G3"]],
  pch = 19,
  col = "blue",
  main = "Hubungan G2 dengan G3",
  xlab = "Nilai G2",
  ylab = "Nilai G3"
)

Pengaruh G1 dan G2 terhadap G3

# Fit a linear regression of G3 on G1 and G2 to measure their influence,
# then print the coefficient table and fit statistics.
# The call is left exactly as written because summary() echoes it in its
# "Call:" line.
model_akademik <- lm(G3 ~ G1 + G2, data = student)
summary(model_akademik)
## 
## Call:
## lm(formula = G3 ~ G1 + G2, data = student)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5713 -0.3888  0.2885  0.9725  3.7089 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.83001    0.33531  -5.458 8.57e-08 ***
## G1           0.15327    0.05618   2.728  0.00665 ** 
## G2           0.98687    0.04957  19.909  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.937 on 392 degrees of freedom
## Multiple R-squared:  0.8222, Adjusted R-squared:  0.8213 
## F-statistic: 906.1 on 2 and 392 DF,  p-value: < 2.2e-16

Korelasi dan Hubungan G1, G2 dengan G3

# Correlation between G1 and G3, printed rounded to three decimals.
# NOTE(review): the same cor(G1, G3) computation and print already appear
# earlier in this script; this recomputes and overwrites cor_G1_G3 with the
# same value. Consider dropping the repetition.
cor_G1_G3 <- cor(x = student[["G1"]], y = student[["G3"]])
print(paste("Korelasi G1 dan G3:", round(cor_G1_G3, digits = 3)))
## [1] "Korelasi G1 dan G3: 0.801"
# Correlation between G2 and G3, printed rounded to three decimals.
# NOTE(review): the same cor(G2, G3) computation and print already appear
# earlier in this script; this recomputes and overwrites cor_G2_G3 with the
# same value. Consider dropping the repetition.
cor_G2_G3 <- cor(x = student[["G2"]], y = student[["G3"]])
print(paste("Korelasi G2 dan G3:", round(cor_G2_G3, digits = 3)))
## [1] "Korelasi G2 dan G3: 0.905"
# Scatter plot of G3 against G1.
# NOTE(review): both scatter plots below are drawn with identical arguments
# earlier in this script; consider dropping the repetition.
with(
  student,
  plot(
    G1, G3,
    main = "Hubungan G1 dengan G3",
    xlab = "Nilai G1",
    ylab = "Nilai G3",
    col = "darkorange",
    pch = 19
  )
)

# Scatter plot of G3 against G2
with(
  student,
  plot(
    G2, G3,
    main = "Hubungan G2 dengan G3",
    xlab = "Nilai G2",
    ylab = "Nilai G3",
    col = "blue",
    pch = 19
  )
)

Pengaruh G1 dan G2 terhadap G3

# Fit a linear regression of G3 on G1 and G2 and print its summary.
# NOTE(review): the identical lm() call and summary already appear earlier
# in this script; this refits and overwrites model_akademik with the same
# model. Consider dropping the repetition.
model_akademik <- lm(G3 ~ G1 + G2, data = student)
summary(model_akademik)
## 
## Call:
## lm(formula = G3 ~ G1 + G2, data = student)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5713 -0.3888  0.2885  0.9725  3.7089 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.83001    0.33531  -5.458 8.57e-08 ***
## G1           0.15327    0.05618   2.728  0.00665 ** 
## G2           0.98687    0.04957  19.909  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.937 on 392 degrees of freedom
## Multiple R-squared:  0.8222, Adjusted R-squared:  0.8213 
## F-statistic: 906.1 on 2 and 392 DF,  p-value: < 2.2e-16
  3. Analisis Pengaruh Faktor Non-Akademik: Pengaruh Absences dan Jenis Kelamin
# Correlation between absences and G3, printed rounded to three decimals
cor_absences_G3 <- with(student, cor(absences, G3))
print(paste("Korelasi Absences dan G3:", round(cor_absences_G3, digits = 3)))
## [1] "Korelasi Absences dan G3: 0.034"
# Scatter plot of the final grade (G3) against the number of absences
plot(
  x = student[["absences"]],
  y = student[["G3"]],
  pch = 19,
  col = "darkgreen",
  main = "Hubungan Absences dengan Nilai Akhir (G3)",
  xlab = "Jumlah Ketidakhadiran",
  ylab = "Nilai G3"
)

# Box plots of G3, one box per value of sex
boxplot(
  G3 ~ sex,
  data = student,
  col = c("pink", "lightblue"),
  main = "G3 Berdasarkan Jenis Kelamin",
  xlab = "Jenis Kelamin",
  ylab = "Nilai Akhir"
)

Pengaruh Waktu Belajar (Studytime)

# Mean G3 for each study-time level
avg_study <- student %>%
  group_by(studytime) %>%
  summarise(mean_G3 = mean(G3), .groups = "drop")

# One bar per study-time level, bar height = mean G3
barplot(
  height = avg_study[["mean_G3"]],
  names.arg = avg_study[["studytime"]],
  col = "skyblue",
  main = "Rata-rata G3 per Waktu Belajar",
  xlab = "Waktu Belajar (1-4)",
  ylab = "Rata-rata G3"
)

# Follow-up: mean G3 for each combination of study-time level and sex
avg_study_sex <- student %>%
  group_by(studytime, sex) %>%
  summarise(mean_G3 = mean(G3), .groups = "drop")

# Side-by-side bars per study-time level, one bar per sex.
# The means are precomputed, so the bar heights are taken as-is from
# mean_G3 rather than counted.
ggplot(
  data = avg_study_sex,
  mapping = aes(x = factor(studytime), y = mean_G3, fill = sex)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Rata-rata G3 Berdasarkan Waktu Belajar dan Jenis Kelamin",
    x = "Waktu Belajar (1-4)",
    y = "Rata-rata Nilai G3"
  ) +
  scale_fill_manual(values = c("F" = "pink", "M" = "lightblue")) +
  theme_minimal()

  4. Analisis Korelasi Seluruh Variabel Numerik. Tujuannya adalah mengidentifikasi variabel numerik mana yang paling kuat hubungannya dengan G3.
# The numeric columns are listed by name so the selection is explicit
num_cols <- c(
  "age", "Medu", "Fedu", "traveltime", "studytime",
  "failures", "famrel", "freetime", "goout", "Dalc",
  "Walc", "health", "absences", "G1", "G2", "G3"
)
# Keep only those columns and coerce every one of them to double
num_data_clean <- student[, num_cols, drop = FALSE]
num_data_clean[] <- lapply(num_data_clean, as.numeric)
# Pairwise correlation matrix of the selected columns
cor_matrix <- cor(num_data_clean)
# Draw the upper triangle as coloured cells with the coefficients printed
# on top; the diagonal is left out
corrplot(
  corr = cor_matrix,
  method = "color",
  type = "upper",
  diag = FALSE,
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45
)