Load Library
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(readr)
-MENG-INPUT DATA DAN GOALS Goals:
Input Data
# Load the math and Portuguese student tables; both files use ";" as the
# field separator.
nilaimat <- read.csv(file = "student-mat.csv", sep = ";")
nilaipor <- read.csv(file = "student-por.csv", sep = ";")
# Preview the first six rows of the math table.
head(nilaimat, n = 6L)
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason
## 1 GP F 18 U GT3 A 4 4 at_home teacher course
## 2 GP F 17 U GT3 T 1 1 at_home other course
## 3 GP F 15 U LE3 T 1 1 at_home other other
## 4 GP F 15 U GT3 T 4 2 health services home
## 5 GP F 16 U GT3 T 3 3 other other home
## 6 GP M 16 U LE3 T 4 3 services other reputation
## guardian traveltime studytime failures schoolsup famsup paid activities
## 1 mother 2 2 0 yes no no no
## 2 father 1 2 0 no yes no no
## 3 mother 1 2 3 yes no yes no
## 4 mother 1 3 0 no yes yes yes
## 5 father 1 2 0 no yes yes no
## 6 mother 1 2 0 no yes yes yes
## nursery higher internet romantic famrel freetime goout Dalc Walc health
## 1 yes yes no no 4 3 4 1 1 3
## 2 no yes yes no 5 3 3 1 1 3
## 3 yes yes yes no 4 3 2 2 3 3
## 4 yes yes yes yes 3 2 2 1 1 5
## 5 yes yes no no 4 3 2 1 2 5
## 6 yes yes yes no 5 4 2 1 2 5
## absences G1 G2 G3
## 1 6 5 6 6
## 2 4 5 5 6
## 3 10 7 8 10
## 4 2 15 14 15
## 5 4 6 10 10
## 6 10 15 15 15
# Preview the first six rows of the Portuguese table.
head(nilaipor, n = 6L)
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason
## 1 GP F 18 U GT3 A 4 4 at_home teacher course
## 2 GP F 17 U GT3 T 1 1 at_home other course
## 3 GP F 15 U LE3 T 1 1 at_home other other
## 4 GP F 15 U GT3 T 4 2 health services home
## 5 GP F 16 U GT3 T 3 3 other other home
## 6 GP M 16 U LE3 T 4 3 services other reputation
## guardian traveltime studytime failures schoolsup famsup paid activities
## 1 mother 2 2 0 yes no no no
## 2 father 1 2 0 no yes no no
## 3 mother 1 2 0 yes no no no
## 4 mother 1 3 0 no yes no yes
## 5 father 1 2 0 no yes no no
## 6 mother 1 2 0 no yes no yes
## nursery higher internet romantic famrel freetime goout Dalc Walc health
## 1 yes yes no no 4 3 4 1 1 3
## 2 no yes yes no 5 3 3 1 1 3
## 3 yes yes yes no 4 3 2 2 3 3
## 4 yes yes yes yes 3 2 2 1 1 5
## 5 yes yes no no 4 3 2 1 2 5
## 6 yes yes yes no 5 4 2 1 2 5
## absences G1 G2 G3
## 1 4 0 11 11
## 2 2 9 11 11
## 3 6 12 13 12
## 4 0 14 14 14
## 5 0 11 13 13
## 6 6 12 12 13
DATA CLEANING
Cek jumlah baris dan kolom
# Row and column counts of the math table.
c(nrow(nilaimat), ncol(nilaimat))
## [1] 395 33
# Row and column counts of the Portuguese table.
c(nrow(nilaipor), ncol(nilaipor))
## [1] 649 33
Cek missing value
# Count missing values in each column of the math table.
nilaimat %>%
  is.na() %>%
  colSums()
## school sex age address famsize Pstatus Medu
## 0 0 0 0 0 0 0
## Fedu Mjob Fjob reason guardian traveltime studytime
## 0 0 0 0 0 0 0
## failures schoolsup famsup paid activities nursery higher
## 0 0 0 0 0 0 0
## internet romantic famrel freetime goout Dalc Walc
## 0 0 0 0 0 0 0
## health absences G1 G2 G3
## 0 0 0 0 0
# Count missing values in each column of the Portuguese table.
nilaipor %>%
  is.na() %>%
  colSums()
## school sex age address famsize Pstatus Medu
## 0 0 0 0 0 0 0
## Fedu Mjob Fjob reason guardian traveltime studytime
## 0 0 0 0 0 0 0
## failures schoolsup famsup paid activities nursery higher
## 0 0 0 0 0 0 0
## internet romantic famrel freetime goout Dalc Walc
## 0 0 0 0 0 0 0
## health absences G1 G2 G3
## 0 0 0 0 0
Cek Duplikat
# Number of rows in the math table that repeat an earlier row.
nilaimat %>%
  duplicated() %>%
  sum()
## [1] 0
# Number of rows in the Portuguese table that repeat an earlier row.
nilaipor %>%
  duplicated() %>%
  sum()
## [1] 0
Menghapus missing value dan duplikat, lalu mengecek ukuran data setelah pembersihan
# Build the cleaned tables: drop rows containing NA, then drop duplicate rows.
mat_clean <- nilaimat %>%
  na.omit() %>%
  distinct()
por_clean <- nilaipor %>%
  na.omit() %>%
  distinct()
# Size of the cleaned math table.
c(nrow(mat_clean), ncol(mat_clean))
## [1] 395 33
# Size of the cleaned Portuguese table.
c(nrow(por_clean), ncol(por_clean))
## [1] 649 33
EDA
Top 10 siswa dengan nilai tertinggi pada tiap pelajaran
# Ten rows of the cleaned Portuguese table with the highest G3, sorted from
# highest to lowest; ties keep the order arrange() produces.
top10por <- por_clean %>%
  arrange(desc(G3)) %>%
  slice_head(n = 10)
top10por
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob
## 1 GP F 17 R LE3 T 3 1 services other
## 2 MS M 18 U GT3 T 4 4 teacher teacher
## 3 GP M 15 U LE3 T 4 2 teacher other
## 4 GP F 16 U GT3 T 4 2 health services
## 5 GP M 16 U GT3 T 1 0 other other
## 6 GP M 17 R GT3 T 1 2 at_home at_home
## 7 GP F 18 R LE3 T 1 1 at_home other
## 8 GP F 18 U GT3 T 2 2 at_home at_home
## 9 GP F 17 U GT3 T 4 3 health services
## 10 GP F 17 U GT3 T 3 2 health health
## reason guardian traveltime studytime failures schoolsup famsup paid
## 1 reputation mother 2 4 0 no yes no
## 2 home father 1 2 0 no no no
## 3 course mother 1 1 0 no no no
## 4 home father 1 2 0 no no no
## 5 reputation mother 2 2 0 no yes no
## 6 home mother 1 2 0 no yes no
## 7 reputation mother 2 4 0 no yes no
## 8 other mother 1 3 0 no yes no
## 9 reputation mother 1 3 0 no yes no
## 10 reputation father 1 4 0 no yes no
## activities nursery higher internet romantic famrel freetime goout Dalc Walc
## 1 no yes yes no no 3 1 2 1 1
## 2 yes no yes yes no 3 2 4 1 4
## 3 no yes yes yes no 3 5 2 1 1
## 4 no yes yes yes yes 4 2 3 1 1
## 5 yes yes yes yes yes 4 3 2 1 1
## 6 yes no yes no yes 3 5 2 2 2
## 7 yes yes yes no no 5 2 2 1 1
## 8 no yes yes yes no 4 3 3 1 2
## 9 no yes yes yes no 4 2 2 1 2
## 10 yes no yes yes no 5 2 2 1 2
## health absences G1 G2 G3
## 1 3 0 18 19 19
## 2 2 4 17 18 19
## 3 3 10 18 17 18
## 4 3 0 17 17 18
## 5 3 0 16 17 18
## 6 1 2 16 17 18
## 7 3 2 17 17 18
## 8 2 0 18 18 18
## 9 3 0 17 18 18
## 10 5 0 18 18 18
# Ten rows of the cleaned math table with the highest G3, sorted from
# highest to lowest; ties keep the order arrange() produces.
top10mat <- mat_clean %>%
  arrange(desc(G3)) %>%
  slice_head(n = 10)
top10mat
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob
## 1 GP M 16 U GT3 T 4 3 health services
## 2 GP M 15 U LE3 A 3 2 services other
## 3 GP M 15 U LE3 A 4 4 teacher teacher
## 4 GP M 15 U LE3 T 4 2 teacher other
## 5 GP F 18 U GT3 T 2 2 at_home at_home
## 6 MS F 18 R LE3 T 4 4 other other
## 7 GP M 15 U LE3 T 4 3 teacher services
## 8 GP M 15 U GT3 T 4 4 services teacher
## 9 GP F 15 U GT3 T 4 3 services other
## 10 GP M 15 U GT3 A 3 4 services other
## reason guardian traveltime studytime failures schoolsup famsup paid
## 1 reputation mother 1 4 0 no no no
## 2 home mother 1 2 0 no yes yes
## 3 course mother 1 1 0 no no no
## 4 course mother 1 1 0 no no no
## 5 other mother 1 3 0 no yes yes
## 6 reputation mother 2 3 0 no no no
## 7 home mother 1 3 0 no yes no
## 8 course father 1 2 0 no yes no
## 9 reputation mother 1 1 0 no no yes
## 10 course mother 1 2 0 no yes yes
## activities nursery higher internet romantic famrel freetime goout Dalc Walc
## 1 yes yes yes yes no 4 2 2 1 1
## 2 no yes yes yes no 4 2 2 1 1
## 3 yes yes yes yes no 5 5 3 1 1
## 4 no yes yes yes no 3 5 2 1 1
## 5 no yes yes yes no 4 3 3 1 2
## 6 no yes yes yes no 5 4 4 1 1
## 7 yes yes yes yes no 5 4 3 1 1
## 8 yes yes yes yes no 4 3 3 1 1
## 9 yes yes yes yes no 4 5 5 1 3
## 10 yes yes yes yes no 5 4 4 1 1
## health absences G1 G2 G3
## 1 2 4 19 19 20
## 2 1 0 16 18 19
## 3 4 6 18 19 19
## 4 3 10 18 19 19
## 5 2 5 18 18 19
## 6 1 0 19 18 19
## 7 4 2 15 16 18
## 8 5 2 19 18 18
## 9 1 4 16 17 18
## 10 1 0 16 18 18
# Bar chart of the top math G3 values, one bar per rank position.
# seq_len() replaces 1:nrow(): on an empty table 1:0 would give c(1, 0) and
# an aesthetic of the wrong length, whereas seq_len(0) gives no ranks at all.
ggplot(top10mat, aes(x = factor(seq_len(nrow(top10mat))), y = G3)) +
  geom_col(fill = "red") +
  labs(
    title = "10 Nilai Tertinggi Mat (G3)",
    x = "Ranking",
    y = "Nilai"
  ) +
  theme_minimal()
# Bar chart of the top Portuguese G3 values, one bar per rank position.
# seq_len() replaces 1:nrow(): on an empty table 1:0 would give c(1, 0) and
# an aesthetic of the wrong length, whereas seq_len(0) gives no ranks at all.
ggplot(top10por, aes(x = factor(seq_len(nrow(top10por))), y = G3)) +
  geom_col(fill = "blue") +
  labs(
    title = "10 Nilai Tertinggi Bahasa Portugis (G3)",
    x = "Ranking",
    y = "Nilai"
  ) +
  theme_minimal()
Distribusi nilai akhir G3
# Histogram of G3 in the cleaned Portuguese table, one bin per grade unit.
por_clean %>%
  ggplot(mapping = aes(x = G3)) +
  geom_histogram(binwidth = 1) +
  labs(
    x = "Nilai",
    y = "Frekuensi",
    title = "Distribusi Nilai Akhir (G3) - Portugis"
  )
# Histogram of G3 in the cleaned math table, one bin per grade unit.
mat_clean %>%
  ggplot(mapping = aes(x = G3)) +
  geom_histogram(binwidth = 1) +
  labs(
    x = "Nilai",
    y = "Frekuensi",
    title = "Distribusi Nilai Akhir (G3) - Matematika"
  )
Hubungan jam belajar dengan nilai akhir
# Scatter plot of G3 against studytime (Portuguese table) with a fitted
# linear-model trend line.
por_clean %>%
  ggplot(mapping = aes(x = studytime, y = G3)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Jam Belajar dengan Nilai Akhir (Portugis)")
## `geom_smooth()` using formula = 'y ~ x'
# Scatter plot of G3 against studytime (math table) with a fitted
# linear-model trend line.
mat_clean %>%
  ggplot(mapping = aes(x = studytime, y = G3)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Jam Belajar dengan Nilai Akhir (Matematika)")
## `geom_smooth()` using formula = 'y ~ x'
Perbandingan nilai berdasarkan gender
# Box plot of G3 for each value of the sex column (Portuguese table).
por_clean %>%
  ggplot(mapping = aes(x = sex, y = G3)) +
  geom_boxplot() +
  labs(title = "Nilai Akhir berdasarkan Gender (Portugis)")
# Box plot of G3 for each value of the sex column (math table).
mat_clean %>%
  ggplot(mapping = aes(x = sex, y = G3)) +
  geom_boxplot() +
  labs(title = "Nilai Akhir berdasarkan Gender (Matematika)")