library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(ggplot2)
# Load the student performance data set (semicolon-separated CSV with a
# header row). The file is addressed through its full path instead of
# calling setwd(), so running the script no longer changes the session's
# working directory as a side effect.
data_dir <- "C:/UNNES/Semester 1/Pengantar Sains Data/Latihan R"
student <- read.csv(
  file.path(data_dir, "student-mat.csv"),
  sep = ";",
  header = TRUE
)
# Store the final grade G3 as a double.
student$G3 <- as.numeric(student$G3)
# Print the structure (columns, types, leading values) of the data frame.
str(student)
## 'data.frame': 395 obs. of 33 variables:
## $ school : chr "GP" "GP" "GP" "GP" ...
## $ sex : chr "F" "F" "F" "F" ...
## $ age : int 18 17 15 15 16 16 16 17 15 15 ...
## $ address : chr "U" "U" "U" "U" ...
## $ famsize : chr "GT3" "GT3" "LE3" "GT3" ...
## $ Pstatus : chr "A" "T" "T" "T" ...
## $ Medu : int 4 1 1 4 3 4 2 4 3 3 ...
## $ Fedu : int 4 1 1 2 3 3 2 4 2 4 ...
## $ Mjob : chr "at_home" "at_home" "at_home" "health" ...
## $ Fjob : chr "teacher" "other" "other" "services" ...
## $ reason : chr "course" "course" "other" "home" ...
## $ guardian : chr "mother" "father" "mother" "mother" ...
## $ traveltime: int 2 1 1 1 1 1 1 2 1 1 ...
## $ studytime : int 2 2 2 3 2 2 2 2 2 2 ...
## $ failures : int 0 0 3 0 0 0 0 0 0 0 ...
## $ schoolsup : chr "yes" "no" "yes" "no" ...
## $ famsup : chr "no" "yes" "no" "yes" ...
## $ paid : chr "no" "no" "yes" "yes" ...
## $ activities: chr "no" "no" "no" "yes" ...
## $ nursery : chr "yes" "no" "yes" "yes" ...
## $ higher : chr "yes" "yes" "yes" "yes" ...
## $ internet : chr "no" "yes" "yes" "yes" ...
## $ romantic : chr "no" "no" "no" "yes" ...
## $ famrel : int 4 5 4 3 4 5 4 4 4 5 ...
## $ freetime : int 3 3 3 2 3 4 4 1 2 5 ...
## $ goout : int 4 3 2 2 2 2 4 4 2 1 ...
## $ Dalc : int 1 1 2 1 1 1 1 1 1 1 ...
## $ Walc : int 1 1 3 1 2 2 1 1 1 1 ...
## $ health : int 3 3 3 5 5 5 3 1 1 5 ...
## $ absences : int 6 4 10 2 4 10 0 6 0 0 ...
## $ G1 : int 5 5 7 15 6 15 12 6 16 14 ...
## $ G2 : int 6 5 8 14 10 15 12 5 18 15 ...
## $ G3 : num 6 6 10 15 10 15 11 6 19 15 ...
# Preview the first rows of the data frame.
head(student)
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason
## 1 GP F 18 U GT3 A 4 4 at_home teacher course
## 2 GP F 17 U GT3 T 1 1 at_home other course
## 3 GP F 15 U LE3 T 1 1 at_home other other
## 4 GP F 15 U GT3 T 4 2 health services home
## 5 GP F 16 U GT3 T 3 3 other other home
## 6 GP M 16 U LE3 T 4 3 services other reputation
## guardian traveltime studytime failures schoolsup famsup paid activities
## 1 mother 2 2 0 yes no no no
## 2 father 1 2 0 no yes no no
## 3 mother 1 2 3 yes no yes no
## 4 mother 1 3 0 no yes yes yes
## 5 father 1 2 0 no yes yes no
## 6 mother 1 2 0 no yes yes yes
## nursery higher internet romantic famrel freetime goout Dalc Walc health
## 1 yes yes no no 4 3 4 1 1 3
## 2 no yes yes no 5 3 3 1 1 3
## 3 yes yes yes no 4 3 2 2 3 3
## 4 yes yes yes yes 3 2 2 1 1 5
## 5 yes yes no no 4 3 2 1 2 5
## 6 yes yes yes no 5 4 2 1 2 5
## absences G1 G2 G3
## 1 6 5 6 6
## 2 4 5 5 6
## 3 10 7 8 10
## 4 2 15 14 15
## 5 4 6 10 10
## 6 10 15 15 15
# Print a per-column summary of the data frame.
summary(student)
## school sex age address
## Length:395 Length:395 Min. :15.0 Length:395
## Class :character Class :character 1st Qu.:16.0 Class :character
## Mode :character Mode :character Median :17.0 Mode :character
## Mean :16.7
## 3rd Qu.:18.0
## Max. :22.0
## famsize Pstatus Medu Fedu
## Length:395 Length:395 Min. :0.000 Min. :0.000
## Class :character Class :character 1st Qu.:2.000 1st Qu.:2.000
## Mode :character Mode :character Median :3.000 Median :2.000
## Mean :2.749 Mean :2.522
## 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :4.000 Max. :4.000
## Mjob Fjob reason guardian
## Length:395 Length:395 Length:395 Length:395
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## traveltime studytime failures schoolsup
## Min. :1.000 Min. :1.000 Min. :0.0000 Length:395
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000 Class :character
## Median :1.000 Median :2.000 Median :0.0000 Mode :character
## Mean :1.448 Mean :2.035 Mean :0.3342
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:0.0000
## Max. :4.000 Max. :4.000 Max. :3.0000
## famsup paid activities nursery
## Length:395 Length:395 Length:395 Length:395
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## higher internet romantic famrel
## Length:395 Length:395 Length:395 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:4.000
## Mode :character Mode :character Mode :character Median :4.000
## Mean :3.944
## 3rd Qu.:5.000
## Max. :5.000
## freetime goout Dalc Walc
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000
## Median :3.000 Median :3.000 Median :1.000 Median :2.000
## Mean :3.235 Mean :3.109 Mean :1.481 Mean :2.291
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## health absences G1 G2
## Min. :1.000 Min. : 0.000 Min. : 3.00 Min. : 0.00
## 1st Qu.:3.000 1st Qu.: 0.000 1st Qu.: 8.00 1st Qu.: 9.00
## Median :4.000 Median : 4.000 Median :11.00 Median :11.00
## Mean :3.554 Mean : 5.709 Mean :10.91 Mean :10.71
## 3rd Qu.:5.000 3rd Qu.: 8.000 3rd Qu.:13.00 3rd Qu.:13.00
## Max. :5.000 Max. :75.000 Max. :19.00 Max. :19.00
## G3
## Min. : 0.00
## 1st Qu.: 8.00
## Median :11.00
## Mean :10.42
## 3rd Qu.:14.00
## Max. :20.00
# Count the missing values in every column.
student %>%
  is.na() %>%
  colSums()
## school sex age address famsize Pstatus Medu
## 0 0 0 0 0 0 0
## Fedu Mjob Fjob reason guardian traveltime studytime
## 0 0 0 0 0 0 0
## failures schoolsup famsup paid activities nursery higher
## 0 0 0 0 0 0 0
## internet romantic famrel freetime goout Dalc Walc
## 0 0 0 0 0 0 0
## health absences G1 G2 G3
## 0 0 0 0 0
# Count the rows that duplicate an earlier row.
student %>%
  duplicated() %>%
  sum()
## [1] 0
# Composition of students by sex
ggplot(data = student, mapping = aes(x = sex, fill = sex)) +
  geom_bar() +
  labs(
    title = "Komposisi Siswa Berdasarkan Jenis Kelamin",
    x = "Jenis Kelamin (F=Wanita, M=Pria)",
    y = "Jumlah Siswa"
  ) +
  theme_minimal()
# Composition of students by home address type
ggplot(data = student, mapping = aes(x = address, fill = address)) +
  geom_bar() +
  labs(
    title = "Komposisi Siswa Berdasarkan Tempat Tinggal",
    x = "Tempat Tinggal (U=Urban, R=Rural)",
    y = "Jumlah Siswa"
  ) +
  theme_minimal()
# Composition of students by study time; studytime is converted to a factor
# so that each level gets its own bar and fill colour.
ggplot(
  data = student,
  mapping = aes(x = factor(studytime), fill = factor(studytime))
) +
  geom_bar() +
  labs(
    title = "Komposisi Siswa Berdasarkan Waktu Belajar",
    x = "Waktu Belajar (1: <2j, 2: 2-5j, 3: 5-10j, 4: >10j)",
    y = "Jumlah Siswa"
  ) +
  scale_fill_discrete(name = "Waktu Belajar") +
  theme_minimal()
# Persebaran Nilai Akhir (G3) dan Outlier ----
# Distribution of the final grade (G3)
hist(
  x = student[["G3"]],
  col = "lightblue",
  border = "black",
  main = "Distribusi Nilai Akhir (G3)",
  xlab = "Nilai Akhir"
)
# Outlier check on absences
boxplot(
  x = student[["absences"]],
  col = "lightcoral",
  main = "Pengecekan Outlier pada Absences",
  ylab = "Jumlah Ketidakhadiran"
)
# Summary statistics of the final grade
summary(student[["G3"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 8.00 11.00 10.42 14.00 20.00
# Korelasi dan Hubungan G1, G2 dengan G3 ----
# Correlation between G1 and G3, printed rounded to three decimals
cor_G1_G3 <- with(student, cor(G1, G3))
print(paste("Korelasi G1 dan G3:", round(cor_G1_G3, digits = 3)))
## [1] "Korelasi G1 dan G3: 0.801"
# Correlation between G2 and G3, printed rounded to three decimals
cor_G2_G3 <- with(student, cor(G2, G3))
print(paste("Korelasi G2 dan G3:", round(cor_G2_G3, digits = 3)))
## [1] "Korelasi G2 dan G3: 0.905"
# Scatter plot of G1 against G3
plot(
  x = student[["G1"]],
  y = student[["G3"]],
  pch = 19,
  col = "darkorange",
  main = "Hubungan G1 dengan G3",
  xlab = "Nilai G1",
  ylab = "Nilai G3"
)
# Scatter plot of G2 against G3
plot(
  x = student[["G2"]],
  y = student[["G3"]],
  pch = 19,
  col = "blue",
  main = "Hubungan G2 dengan G3",
  xlab = "Nilai G2",
  ylab = "Nilai G3"
)
# Pengaruh G1 dan G2 terhadap G3 ----
# Fit a linear regression of G3 on G1 and G2 and print the model summary.
model_akademik <- lm(
  formula = G3 ~ G1 + G2,
  data = student
)
summary(model_akademik)
##
## Call:
## lm(formula = G3 ~ G1 + G2, data = student)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5713 -0.3888 0.2885 0.9725 3.7089
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.83001 0.33531 -5.458 8.57e-08 ***
## G1 0.15327 0.05618 2.728 0.00665 **
## G2 0.98687 0.04957 19.909 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.937 on 392 degrees of freedom
## Multiple R-squared: 0.8222, Adjusted R-squared: 0.8213
## F-statistic: 906.1 on 2 and 392 DF, p-value: < 2.2e-16
# NOTE: a verbatim second copy of the G1/G2-vs-G3 correlation, scatter-plot
# and regression section used to sit here. cor_G1_G3, cor_G2_G3 and
# model_akademik are already computed by the first copy earlier in the
# script, so the redundant recomputation and duplicate plots were removed.
# Relationship between absences and the final grade (G3)
cor_absences_G3 <- with(student, cor(absences, G3))
print(paste("Korelasi Absences dan G3:", round(cor_absences_G3, digits = 3)))
## [1] "Korelasi Absences dan G3: 0.034"
plot(
  x = student[["absences"]],
  y = student[["G3"]],
  pch = 19,
  col = "darkgreen",
  main = "Hubungan Absences dengan Nilai Akhir (G3)",
  xlab = "Jumlah Ketidakhadiran",
  ylab = "Nilai G3"
)
# Final grade (G3) compared across sex
boxplot(
  formula = G3 ~ sex,
  data = student,
  col = c("pink", "lightblue"),
  main = "G3 Berdasarkan Jenis Kelamin",
  xlab = "Jenis Kelamin",
  ylab = "Nilai Akhir"
)
# Pengaruh Waktu Belajar (Studytime) ----
# Mean G3 for each study-time level, returned ungrouped
avg_study <- summarise(
  group_by(student, studytime),
  mean_G3 = mean(G3),
  .groups = "drop"
)
# One bar per study-time level, labelled with the level
barplot(
  height = avg_study[["mean_G3"]],
  names.arg = avg_study[["studytime"]],
  col = "skyblue",
  main = "Rata-rata G3 per Waktu Belajar",
  xlab = "Waktu Belajar (1-4)",
  ylab = "Rata-rata G3"
)
# Follow-up analysis: mean G3 for each combination of study time and sex
avg_study_sex <- summarise(
  group_by(student, studytime, sex),
  mean_G3 = mean(G3),
  .groups = "drop"
)
# Side-by-side bars per study-time level, one bar per sex
ggplot(
  data = avg_study_sex,
  mapping = aes(x = factor(studytime), y = mean_G3, fill = sex)
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(
    title = "Rata-rata G3 Berdasarkan Waktu Belajar dan Jenis Kelamin",
    x = "Waktu Belajar (1-4)",
    y = "Rata-rata Nilai G3"
  ) +
  scale_fill_manual(values = c("F" = "pink", "M" = "lightblue")) +
  theme_minimal()
# Name the numeric variables explicitly, to be safe
num_cols <- c(
  "age", "Medu", "Fedu", "traveltime", "studytime", "failures",
  "famrel", "freetime", "goout", "Dalc", "Walc", "health",
  "absences", "G1", "G2", "G3"
)
# Keep only those columns and coerce each of them to numeric
num_data_clean <- mutate(
  select(student, all_of(num_cols)),
  across(everything(), as.numeric)
)
# Compute the correlation matrix
cor_matrix <- cor(num_data_clean)
# Visualise the correlation matrix: upper triangle only, diagonal hidden,
# coefficients printed in black
corrplot(
  corr = cor_matrix,
  method = "color",
  type = "upper",
  diag = FALSE,
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45
)