SOAL NOMOR 1. Pendahuluan

Membaca Data

# Load the dataset from a CSV file given by a relative path
data <- read_csv(file = "Expanded_data_with_more_features.csv")
## New names:
## Rows: 30641 Columns: 15
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (10): Gender, EthnicGroup, ParentEduc, LunchType, TestPrep, ParentMarita... dbl
## (5): ...1, NrSiblings, MathScore, ReadingScore, WritingScore
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Preview the first six rows of the dataset
head(data, n = 6)
## # A tibble: 6 × 15
##    ...1 Gender EthnicGroup ParentEduc     LunchType TestPrep ParentMaritalStatus
##   <dbl> <chr>  <chr>       <chr>          <chr>     <chr>    <chr>              
## 1     0 female <NA>        bachelor's de… standard  none     married            
## 2     1 female group C     some college   standard  <NA>     married            
## 3     2 female group B     master's degr… standard  none     single             
## 4     3 male   group A     associate's d… free/red… none     married            
## 5     4 male   group C     some college   standard  none     married            
## 6     5 female group B     associate's d… standard  none     married            
## # ℹ 8 more variables: PracticeSport <chr>, IsFirstChild <chr>,
## #   NrSiblings <dbl>, TransportMeans <chr>, WklyStudyHours <chr>,
## #   MathScore <dbl>, ReadingScore <dbl>, WritingScore <dbl>

1. Distribusi Gender

# Bar chart: row count for each Gender value
ggplot(data = data, mapping = aes(x = Gender)) +
  geom_bar(fill = "skyblue") +
  ggtitle("Distribusi Gender") +
  xlab("Gender") +
  ylab("Jumlah")

2. Distribusi Kelompok Etnis

# Bar chart: row count for each EthnicGroup value
ggplot(data = data, mapping = aes(x = EthnicGroup)) +
  geom_bar(fill = "orange") +
  ggtitle("Distribusi Kelompok Etnis") +
  xlab("Kelompok Etnis") +
  ylab("Jumlah")

3. Latar Belakang Pendidikan Orang Tua

# Bar chart: row count for each ParentEduc value.
# The x-axis labels are rotated 45 degrees and right-aligned.
ggplot(data = data, mapping = aes(x = ParentEduc)) +
  geom_bar(fill = "purple") +
  ggtitle("Latar Belakang Pendidikan Orang Tua") +
  xlab("Pendidikan") +
  ylab("Jumlah") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

4. Jenis Makan Siang

# Bar chart: row count for each LunchType value
ggplot(data = data, mapping = aes(x = LunchType)) +
  geom_bar(fill = "green") +
  ggtitle("Jenis Makan Siang") +
  xlab("Tipe") +
  ylab("Jumlah")

5. Kursus Persiapan Ujian

# Bar chart: row count for each TestPrep value
ggplot(data = data, mapping = aes(x = TestPrep)) +
  geom_bar(fill = "coral") +
  ggtitle("Persiapan Ujian") +
  xlab("Status") +
  ylab("Jumlah")

6. Status Perkawinan Orang Tua

# Bar chart: row count for each ParentMaritalStatus value
ggplot(data = data, mapping = aes(x = ParentMaritalStatus)) +
  geom_bar(fill = "steelblue") +
  ggtitle("Status Perkawinan Orang Tua") +
  xlab("Status") +
  ylab("Jumlah")

7. Kebiasaan Berolahraga

# Bar chart: row count for each PracticeSport value
ggplot(data = data, mapping = aes(x = PracticeSport)) +
  geom_bar(fill = "brown") +
  ggtitle("Frekuensi Berolahraga") +
  xlab("Frekuensi") +
  ylab("Jumlah")

8. Anak Pertama atau Bukan

# Bar chart: row count for each IsFirstChild value
ggplot(data = data, mapping = aes(x = IsFirstChild)) +
  geom_bar(fill = "darkcyan") +
  ggtitle("Apakah Anak Pertama") +
  xlab("Ya/Tidak") +
  ylab("Jumlah")

9. Jumlah Saudara Kandung

# Bar chart: row count for each NrSiblings value.
# NrSiblings contains missing values; na.rm = TRUE drops them explicitly so
# the plot no longer emits a "Removed ... rows" warning.
ggplot(data, aes(x = NrSiblings)) +
  geom_bar(fill = "darkorange", na.rm = TRUE) +
  labs(
    title = "Jumlah Saudara Kandung",
    x = "Jumlah Saudara",
    y = "Jumlah Siswa"
  )

10. Transportasi ke Sekolah

# Bar chart: row count for each TransportMeans value
ggplot(data = data, mapping = aes(x = TransportMeans)) +
  geom_bar(fill = "violet") +
  ggtitle("Sarana Transportasi ke Sekolah") +
  xlab("Sarana") +
  ylab("Jumlah")

11. Jam Belajar Mandiri

# Bar chart: row count for each WklyStudyHours category
ggplot(data = data, mapping = aes(x = WklyStudyHours)) +
  geom_bar(fill = "darkgreen") +
  ggtitle("Jam Belajar Mandiri per Minggu") +
  xlab("Kategori Jam") +
  ylab("Jumlah")

12. Nilai Matematika

# Histogram of MathScore using bins that are 5 points wide
ggplot(data = data, mapping = aes(x = MathScore)) +
  geom_histogram(binwidth = 5, fill = "dodgerblue") +
  ggtitle("Distribusi Nilai Matematika") +
  xlab("Nilai") +
  ylab("Frekuensi")

13. Nilai Membaca

# Histogram of ReadingScore using bins that are 5 points wide
ggplot(data = data, mapping = aes(x = ReadingScore)) +
  geom_histogram(binwidth = 5, fill = "salmon") +
  ggtitle("Distribusi Nilai Membaca") +
  xlab("Nilai") +
  ylab("Frekuensi")

14. Nilai Menulis

# Histogram of WritingScore using bins that are 5 points wide
ggplot(data = data, mapping = aes(x = WritingScore)) +
  geom_histogram(binwidth = 5, fill = "seagreen") +
  ggtitle("Distribusi Nilai Menulis") +
  xlab("Nilai") +
  ylab("Frekuensi")


SOAL NOMOR 2. Import Data

# Draw one boxplot for each of the three score columns
boxplot(
  x = data[["MathScore"]],
  main = "Boxplot Math Score",
  col = "skyblue"
)

boxplot(
  x = data[["ReadingScore"]],
  main = "Boxplot Reading Score",
  col = "salmon"
)

boxplot(
  x = data[["WritingScore"]],
  main = "Boxplot Writing Score",
  col = "lightgreen"
)

# Return the positions of values in `x` that fall outside the 1.5 * IQR
# fences (below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR).
#
# x: numeric vector; missing values are ignored when computing the quartiles
#    and are never reported as outliers.
# Returns an integer vector of indices into `x`.
detect_outlier <- function(x) {
  # First and third quartiles computed in a single call
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
  spread <- quartiles[2] - quartiles[1]
  fence_low <- quartiles[1] - 1.5 * spread
  fence_high <- quartiles[2] + 1.5 * spread
  is_outlier <- x < fence_low | x > fence_high
  which(is_outlier)
}

# Row indices flagged as outliers for each score column
out_math <- detect_outlier(data[["MathScore"]])
out_reading <- detect_outlier(data[["ReadingScore"]])
out_writing <- detect_outlier(data[["WritingScore"]])

# Show Gender and MathScore for the rows flagged as math-score outliers
data[out_math, ][c("Gender", "MathScore")]
## # A tibble: 76 × 2
##    Gender MathScore
##    <chr>      <dbl>
##  1 female        18
##  2 female         0
##  3 female        21
##  4 female        18
##  5 female         8
##  6 male          21
##  7 female        22
##  8 female        17
##  9 female        20
## 10 male          16
## # ℹ 66 more rows
# Show Gender and ReadingScore for the rows flagged as reading-score outliers
data[out_reading, ][c("Gender", "ReadingScore")]
## # A tibble: 90 × 2
##    Gender ReadingScore
##    <chr>         <dbl>
##  1 female           17
##  2 male             25
##  3 male             23
##  4 male             24
##  5 female           24
##  6 male             26
##  7 male             23
##  8 male             21
##  9 male             18
## 10 male             25
## # ℹ 80 more rows
# Show Gender and WritingScore for the rows flagged as writing-score outliers
data[out_writing, ][c("Gender", "WritingScore")]
## # A tibble: 109 × 2
##    Gender WritingScore
##    <chr>         <dbl>
##  1 female           10
##  2 male             21
##  3 male             19
##  4 female           26
##  5 male             15
##  6 female           23
##  7 male             17
##  8 male             25
##  9 male             26
## 10 male             24
## # ℹ 99 more rows

SOAL NOMOR 3. Pertanyaan

1. Distribusi dan Outlier

# Histograms of the three score columns, side by side in a 1 x 3 layout.
# par() returns the previous settings; they are restored afterwards so the
# layout change does not leak into later plots.
old_par <- par(mfrow = c(1, 3))
hist(
  data$MathScore,
  main = "Distribusi Nilai Math", col = "skyblue", xlab = "Math Score"
)
hist(
  data$ReadingScore,
  main = "Distribusi Nilai Reading", col = "salmon", xlab = "Reading Score"
)
hist(
  data$WritingScore,
  main = "Distribusi Nilai Writing", col = "seagreen", xlab = "Writing Score"
)
par(old_par)

# Boxplots of the three score columns, side by side in a 1 x 3 layout.
# par() returns the previous settings; they are restored afterwards so the
# layout change does not leak into later plots.
old_par <- par(mfrow = c(1, 3))
boxplot(data$MathScore, main = "Boxplot Math", col = "skyblue")
boxplot(data$ReadingScore, main = "Boxplot Reading", col = "salmon")
boxplot(data$WritingScore, main = "Boxplot Writing", col = "seagreen")
par(old_par)

2. Penjelasan Outlier

# Print a short explanation of outliers. Each message ends with "\n" because
# cat() adds no line break of its own, so consecutive messages would otherwise
# run together on one line.
cat("Outlier adalah nilai yang jauh berbeda dari mayoritas data lainnya.\n")
cat("Cara memeriksa: gunakan boxplot atau rumus IQR (Q3 - Q1).\n")
cat(
  "Menanggulangi outlier: hapus, transformasi data, atau gunakan metode robust.\n"
)

3. Jam belajar paling umum

# Bar chart: row count for each WklyStudyHours category; the tallest bar is
# the most common category.
ggplot(data, aes(x = WklyStudyHours)) +
  geom_bar(fill = "darkorange") +
  labs(
    title = "Distribusi Jam Belajar Mandiri",
    x = "Kategori Jam Belajar",
    y = "Jumlah Siswa"
  )

4. Perbedaan nilai per kelompok etnis

# Reshape to long format: one row per (original row, subject) pair, with the
# subject name in `Subject` and its value in `Score`.
data_long <- data %>%
  pivot_longer(
    cols = c(MathScore, ReadingScore, WritingScore),
    names_to = "Subject",
    values_to = "Score"
  )

# Boxplots of Score per EthnicGroup, one box per Subject within each group.
# The x-axis labels are rotated 45 degrees and right-aligned.
ggplot(data_long, aes(x = EthnicGroup, y = Score, fill = Subject)) +
  geom_boxplot() +
  labs(
    title = "Perbandingan Nilai per Etnis",
    x = "Kelompok Etnis",
    y = "Nilai"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

5. Korelasi Heatmap

# Correlation heatmap of the three score columns.
# Rows with a missing value in any of the three columns are dropped before
# computing the correlations.
nilai <- data[, c("MathScore", "ReadingScore", "WritingScore")]
corr_matrix <- cor(na.omit(nilai))
corrplot(
  corr_matrix,
  method = "color",
  addCoef.col = "black",
  tl.cex = 0.8
)

6. Perbedaan nilai Math berdasarkan PracticeSport

# Boxplot of MathScore for each PracticeSport value, coloured by group
ggboxplot(
  data,
  x = "PracticeSport",
  y = "MathScore",
  color = "PracticeSport",
  palette = "jco",
  ylab = "Math Score",
  xlab = "Olahraga"
)

# Test whether mean MathScore differs between PracticeSport groups.
# t.test() with a formula stops with an error unless the grouping variable has
# exactly two levels. oneway.test() (Welch, unequal variances by default)
# handles two or more groups and gives the same p-value as the Welch t-test
# when there are only two.
# NOTE(review): PracticeSport appears to be a frequency variable with more
# than two categories - confirm with table(data$PracticeSport).
oneway.test(MathScore ~ PracticeSport, data = data)