Import Data

# Memanggil package yang akan digunakan
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
library(ggplot2)

# Location of the expanded student-scores CSV. Defined once so the existence
# check and the read below always refer to the same file.
data_path <- "D:/Ayub Nur Haqiqi/Semester 4/P Sains Data/Expanded_data_with_more_features.csv/Expanded_data.csv"

# Make sure the data file exists; stop with a clear message instead of only
# printing TRUE/FALSE and letting read.csv() fail afterwards.
if (!file.exists(data_path)) {
  stop("Data file not found: ", data_path, call. = FALSE)
}

# Read the data
data <- read.csv(data_path)

# Display the first rows of the data
head(data)
##   Gender EthnicGroup         ParentEduc    LunchType TestPrep
## 1 female              bachelor's degree     standard     none
## 2 female     group C       some college     standard         
## 3 female     group B    master's degree     standard     none
## 4   male     group A associate's degree free/reduced     none
## 5   male     group C       some college     standard     none
## 6 female     group B associate's degree     standard     none
##   ParentMaritalStatus PracticeSport IsFirstChild NrSiblings TransportMeans
## 1             married     regularly          yes          3     school_bus
## 2             married     sometimes          yes          0               
## 3              single     sometimes          yes          4     school_bus
## 4             married         never           no          1               
## 5             married     sometimes          yes          0     school_bus
## 6             married     regularly          yes          1     school_bus
##   WklyStudyHours MathScore ReadingScore WritingScore
## 1            < 5        71           71           74
## 2         10-May        69           90           88
## 3            < 5        87           93           91
## 4         10-May        45           56           42
## 5         10-May        76           78           75
## 6         10-May        73           84           79
# Categorical columns that must be stored as factors
factor_cols <- c(
  "Gender", "EthnicGroup", "ParentEduc", "LunchType", "TestPrep",
  "ParentMaritalStatus", "PracticeSport", "IsFirstChild", "TransportMeans"
)

# Convert all of them in one pass; column order and names are unchanged.
# NOTE(review): the str() output lists "" as a level for several of these
# columns, so blank cells are kept as their own category rather than NA.
data <- data %>%
  mutate(across(all_of(factor_cols), as.factor))

# The number of siblings is a count, so store it as an integer
data[["NrSiblings"]] <- as.integer(data[["NrSiblings"]])

# Inspect the resulting structure of the data
str(data)
## 'data.frame':    30641 obs. of  14 variables:
##  $ Gender             : Factor w/ 2 levels "female","male": 1 1 1 2 2 1 1 2 2 1 ...
##  $ EthnicGroup        : Factor w/ 6 levels "","group A","group B",..: 1 4 3 2 4 3 3 3 5 3 ...
##  $ ParentEduc         : Factor w/ 7 levels "","associate's degree",..: 3 6 5 2 6 2 6 6 4 4 ...
##  $ LunchType          : Factor w/ 2 levels "free/reduced",..: 2 2 2 1 2 2 2 1 1 1 ...
##  $ TestPrep           : Factor w/ 3 levels "","completed",..: 3 1 3 3 3 3 2 3 2 3 ...
##  $ ParentMaritalStatus: Factor w/ 5 levels "","divorced",..: 3 3 4 3 3 3 5 3 4 3 ...
##  $ PracticeSport      : Factor w/ 4 levels "","never","regularly",..: 3 4 4 2 4 3 2 4 4 3 ...
##  $ IsFirstChild       : Factor w/ 3 levels "","no","yes": 3 3 3 2 3 3 2 3 2 3 ...
##  $ NrSiblings         : int  3 0 4 1 0 1 1 1 3 NA ...
##  $ TransportMeans     : Factor w/ 3 levels "","private","school_bus": 3 1 3 1 3 3 2 2 2 2 ...
##  $ WklyStudyHours     : chr  "< 5" "10-May" "< 5" "10-May" ...
##  $ MathScore          : int  71 69 87 45 76 73 85 41 65 37 ...
##  $ ReadingScore       : int  71 90 93 56 78 84 93 43 64 59 ...
##  $ WritingScore       : int  74 88 91 42 75 79 89 39 68 50 ...

Instruksi Mengerjakan Tugas

Tugas PSD M6

Jawaban

1. Periksa Distribusi Math, Reading, dan Writing Score & Cek Outlier

# Score columns to plot, with their display labels and fill colours
score_cols <- c("MathScore", "ReadingScore", "WritingScore")
score_labels <- c("Math Score", "Reading Score", "Writing Score")
score_fills <- c("skyblue", "lightgreen", "lightcoral")

# Switch to a 1 x 3 layout and remember the previous graphics settings so
# they can be restored once both rows of plots have been drawn.
old_par <- par(mfrow = c(1, 3))

# Histograms: distribution of each score
for (i in seq_along(score_cols)) {
  hist(
    data[[score_cols[i]]],
    main = paste("Distribusi", score_labels[i]),
    col = score_fills[i],
    xlab = score_labels[i]
  )
}

# Boxplots: outlier check for each score (start a fresh 1 x 3 page)
par(mfrow = c(1, 3))
for (i in seq_along(score_cols)) {
  boxplot(
    data[[score_cols[i]]],
    main = paste("Outlier", score_labels[i]),
    col = score_fills[i]
  )
}

# Restore the layout that was active before this section
par(old_par)

2. Apa itu Outlier? Bagaimana Cara Memeriksa dan Menanggulanginya?

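Outlier adalah nilai yang jauh berbeda dari mayoritas data dalam distribusi. Cara mendeteksi outlier dapat dilakukan dengan boxplot, Z-score, atau Interquartile Range (IQR). Setelah terdeteksi, outlier dapat ditanggulangi dengan menghapusnya (jika merupakan kesalahan pencatatan), membatasi nilainya pada batas bawah/atas IQR (capping/winsorizing), melakukan transformasi data, atau menggunakan ukuran statistik yang robust seperti median.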

# Detect MathScore outliers with the 1.5 * IQR rule.
# Missing scores are skipped when computing the quartiles (quantile() would
# otherwise stop with an error) and are never reported as outliers.
Q1 <- quantile(data$MathScore, 0.25, na.rm = TRUE)
Q3 <- quantile(data$MathScore, 0.75, na.rm = TRUE)

# Stored as math_iqr (not IQR) so the stats::IQR() function is not masked
math_iqr <- unname(Q3 - Q1)
lower_fence <- unname(Q1) - 1.5 * math_iqr
upper_fence <- unname(Q3) + 1.5 * math_iqr

# which() drops NA comparisons, so only real out-of-range scores are kept
is_outlier <- data$MathScore < lower_fence | data$MathScore > upper_fence
outliers_math <- data$MathScore[which(is_outlier)]
outliers_math
##  [1] 18  0 21 18  8 21 22 17 20 16 21 21 22 19 22 22 22 17 20 15 15 18 18 11 19
## [26] 12 12 13 18 22 17 19 21 21  9 18 21 22 10  7 22 21 17 22 10 22 14 14 19 19
## [51] 11  9 10 13 18 20 21 16 13 22 13 22 10 16 21 21 16 18 20 19  9 19 11  9 21
## [76] 19

3. Jam Belajar per Minggu yang Paling Banyak Dilakukan oleh Student

# Normalise the labels in WklyStudyHours.
# Lookup table: raw label found in the CSV -> label used in this analysis.
# Any value not listed here is left exactly as it is.
hour_relabel <- c("< 5" = "0-5", "10-May" = "5-10")

study_hours <- as.character(data$WklyStudyHours)
needs_relabel <- study_hours %in% names(hour_relabel)
study_hours[needs_relabel] <- unname(hour_relabel[study_hours[needs_relabel]])

# Store the cleaned labels back as a factor
data$WklyStudyHours <- as.factor(study_hours)

# Count students per weekly-study-hours category, most frequent first
study_mode <- data %>%
  group_by(WklyStudyHours) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# Bar chart of the counts. ggplot orders a factor axis by its levels, not by
# row order, so the sort above does not reach the plot on its own; reorder()
# puts the bars in descending count order so the most common category is first.
ggplot(study_mode, aes(x = reorder(WklyStudyHours, -count), y = count)) +
  geom_col(fill = "steelblue") +
  labs(title = "Jam Belajar Paling Banyak Dilakukan", x = "Jam Belajar", y = "Jumlah Student") +
  theme_minimal()

4. Perbedaan Nilai Math, Reading, Writing per Ethnic

# Boxplots of each score per ethnic group. The section covers Math, Reading
# and Writing, so one plot is drawn for every score column instead of Math only.
score_labels <- c(
  MathScore = "Math Score",
  ReadingScore = "Reading Score",
  WritingScore = "Writing Score"
)

for (score_col in names(score_labels)) {
  score_label <- score_labels[[score_col]]
  ethnic_plot <- ggplot(
    data,
    aes(x = EthnicGroup, y = .data[[score_col]], fill = EthnicGroup)
  ) +
    geom_boxplot() +
    labs(
      title = paste("Distribusi", score_label, "per Ethnic"),
      x = "Ethnic Group",
      y = score_label
    ) +
    theme_minimal()
  # ggplot objects are not auto-printed inside a loop
  print(ethnic_plot)
}

5. Korelasi Antar Variabel (Correlation Heatmap)

# Pearson correlation between the three exam scores, using only rows where
# all three scores are present
exam_cols <- c("MathScore", "ReadingScore", "WritingScore")
exam_scores <- data[, exam_cols]
cor_matrix <- cor(exam_scores, use = "complete.obs")

# Heatmap with the coefficients printed in each cell
# (colours: low, mid, high)
heat_colors <- c("#6D9EC1", "white", "#E46726")
ggcorrplot(cor_matrix, lab = TRUE, colors = heat_colors)

6. Perbedaan Nilai Student Berdasarkan PracticeSport

# Boxplot of MathScore for each PracticeSport category, built layer by layer
sport_plot <- ggplot(
  data,
  aes(x = PracticeSport, y = MathScore, fill = PracticeSport)
)
sport_plot <- sport_plot + geom_boxplot()
sport_plot <- sport_plot + labs(
  title = "Perbedaan Math Score Berdasarkan PracticeSport",
  x = "PracticeSport",
  y = "Math Score"
)
sport_plot <- sport_plot + theme_minimal()

# Draw the finished plot
sport_plot