## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'ggcorrplot' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
library(ggplot2)
# Location of the input CSV, stored once so the existence check and the read
# below always refer to the same file.
csv_path <- "D:/Ayub Nur Haqiqi/Semester 4/P Sains Data/Expanded_data_with_more_features.csv/Expanded_data.csv"

# Confirm that the data file is present.
file.exists(csv_path)
## [1] TRUE

# Read the data.
data <- read.csv(file = csv_path)

# Display the first rows of the data.
head(x = data)
## Gender EthnicGroup ParentEduc LunchType TestPrep
## 1 female bachelor's degree standard none
## 2 female group C some college standard
## 3 female group B master's degree standard none
## 4 male group A associate's degree free/reduced none
## 5 male group C some college standard none
## 6 female group B associate's degree standard none
## ParentMaritalStatus PracticeSport IsFirstChild NrSiblings TransportMeans
## 1 married regularly yes 3 school_bus
## 2 married sometimes yes 0
## 3 single sometimes yes 4 school_bus
## 4 married never no 1
## 5 married sometimes yes 0 school_bus
## 6 married regularly yes 1 school_bus
## WklyStudyHours MathScore ReadingScore WritingScore
## 1 < 5 71 71 74
## 2 10-May 69 90 88
## 3 < 5 87 93 91
## 4 10-May 45 56 42
## 5 10-May 76 78 75
## 6 10-May 73 84 79
# Columns that hold categorical values and should be stored as factors.
categorical_cols <- c(
  "Gender", "EthnicGroup", "ParentEduc", "LunchType", "TestPrep",
  "ParentMaritalStatus", "PracticeSport", "IsFirstChild", "TransportMeans"
)

# Convert every categorical column to a factor and make sure the sibling
# count, which should be a whole number, is stored as an integer.
data <- data %>%
  mutate(
    across(all_of(categorical_cols), as.factor),
    NrSiblings = as.integer(NrSiblings)
  )

# Inspect the structure of the data.
str(data)
## 'data.frame': 30641 obs. of 14 variables:
## $ Gender : Factor w/ 2 levels "female","male": 1 1 1 2 2 1 1 2 2 1 ...
## $ EthnicGroup : Factor w/ 6 levels "","group A","group B",..: 1 4 3 2 4 3 3 3 5 3 ...
## $ ParentEduc : Factor w/ 7 levels "","associate's degree",..: 3 6 5 2 6 2 6 6 4 4 ...
## $ LunchType : Factor w/ 2 levels "free/reduced",..: 2 2 2 1 2 2 2 1 1 1 ...
## $ TestPrep : Factor w/ 3 levels "","completed",..: 3 1 3 3 3 3 2 3 2 3 ...
## $ ParentMaritalStatus: Factor w/ 5 levels "","divorced",..: 3 3 4 3 3 3 5 3 4 3 ...
## $ PracticeSport : Factor w/ 4 levels "","never","regularly",..: 3 4 4 2 4 3 2 4 4 3 ...
## $ IsFirstChild : Factor w/ 3 levels "","no","yes": 3 3 3 2 3 3 2 3 2 3 ...
## $ NrSiblings : int 3 0 4 1 0 1 1 1 3 NA ...
## $ TransportMeans : Factor w/ 3 levels "","private","school_bus": 3 1 3 1 3 3 2 2 2 2 ...
## $ WklyStudyHours : chr "< 5" "10-May" "< 5" "10-May" ...
## $ MathScore : int 71 69 87 45 76 73 85 41 65 37 ...
## $ ReadingScore : int 71 90 93 56 78 84 93 43 64 59 ...
## $ WritingScore : int 74 88 91 42 75 79 89 39 68 50 ...
# Score distributions: one histogram per subject, drawn side by side.
# par() returns the settings it replaces; they are kept so the layout can be
# restored once the base-graphics panels are finished.
old_par <- par(mfrow = c(1, 3))
hist(data$MathScore, main = "Distribusi Math Score", col = "skyblue", xlab = "Math Score")
hist(data$ReadingScore, main = "Distribusi Reading Score", col = "lightgreen", xlab = "Reading Score")
hist(data$WritingScore, main = "Distribusi Writing Score", col = "lightcoral", xlab = "Writing Score")

# Boxplots for spotting outliers, again as three panels in one row.
par(mfrow = c(1, 3))
boxplot(data$MathScore, main = "Outlier Math Score", col = "skyblue")
boxplot(data$ReadingScore, main = "Outlier Reading Score", col = "lightgreen")
boxplot(data$WritingScore, main = "Outlier Writing Score", col = "lightcoral")

# Put the graphics layout back so later base plots are not forced into the
# 1 x 3 grid.
par(old_par)

# An outlier is a value that lies far from the majority of the data in a
# distribution. Outliers can be detected with a boxplot, a Z-score, or the
# interquartile range (IQR).
# Flag MathScore outliers with the 1.5 * IQR rule.
math_scores <- data$MathScore

# First and third quartiles, computed in a single call.
math_quartiles <- quantile(math_scores, probs = c(0.25, 0.75))
Q1 <- math_quartiles[1]
Q3 <- math_quartiles[2]

# NOTE(review): this variable shares its name with the stats::IQR() function;
# the name is kept because later code may refer to it.
IQR <- Q3 - Q1

# Values below the lower fence or above the upper fence count as outliers.
lower_fence <- Q1 - 1.5 * IQR
upper_fence <- Q3 + 1.5 * IQR
is_outlier <- math_scores < lower_fence | math_scores > upper_fence
outliers_math <- math_scores[is_outlier]
outliers_math
## [1] 18 0 21 18 8 21 22 17 20 16 21 21 22 19 22 22 22 17 20 15 15 18 18 11 19
## [26] 12 12 13 18 22 17 19 21 21 9 18 21 22 10 7 22 21 17 22 10 22 14 14 19 19
## [51] 11 9 10 13 18 20 21 16 13 22 13 22 10 16 21 21 16 18 20 19 9 19 11 9 21
## [76] 19
# Normalise the labels in WklyStudyHours.
# Lookup table: raw label found in the file -> cleaned label.
study_hour_labels <- c("< 5" = "0-5", "10-May" = "5-10")

study_hours <- as.character(data$WklyStudyHours)
needs_relabel <- study_hours %in% names(study_hour_labels)
# Only entries that match a raw label are rewritten; everything else is kept.
study_hours[needs_relabel] <- unname(study_hour_labels[study_hours[needs_relabel]])

data$WklyStudyHours <- as.factor(study_hours)
# Visualisation
# Number of students in each weekly-study-hours category, most common first.
study_mode <- data %>%
  group_by(WklyStudyHours) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# Bar chart of those counts. ggplot2 places a discrete x axis in factor-level
# order, not row order, so arrange() alone does not sort the bars; reorder()
# sorts the categories by descending count explicitly.
ggplot(study_mode, aes(x = reorder(WklyStudyHours, -count), y = count)) +
  geom_col(fill = "steelblue") +
  labs(title = "Jam Belajar Paling Banyak Dilakukan", x = "Jam Belajar", y = "Jumlah Student") +
  theme_minimal()

# Boxplot of MathScore for each EthnicGroup, drawn as a separate plot.
ggplot(data, aes(x = EthnicGroup, y = MathScore, fill = EthnicGroup)) +
  geom_boxplot() +
  labs(title = "Distribusi Math Score per Ethnic", x = "Ethnic Group", y = "Math Score") +
  theme_minimal()