1. Data Preparation

1.1 Import Library

library(dplyr) 
library(plotly) 
library(glue) 
library(scales) 
library(ggpubr) 
library(readr) 
library(tidyr) 
library(forcats)
library(patchwork)

1.2 Reading the file

# Load the student performance data set (path is relative to the working
# directory) and preview its first rows.
student <- read_csv(file = "StudentsPerformance.csv")
student %>% head()
#> # A tibble: 6 × 8
#>   gender `race/ethnicity` parental level…¹ lunch test …² math …³ readi…⁴ writi…⁵
#>   <chr>  <chr>            <chr>            <chr> <chr>     <dbl>   <dbl>   <dbl>
#> 1 female group B          bachelor's degr… stan… none         72      72      74
#> 2 female group C          some college     stan… comple…      69      90      88
#> 3 female group B          master's degree  stan… none         90      95      93
#> 4 male   group A          associate's deg… free… none         47      57      44
#> 5 male   group C          some college     stan… none         76      78      75
#> 6 female group B          associate's deg… stan… none         71      83      78
#> # … with abbreviated variable names ¹​`parental level of education`,
#> #   ²​`test preparation course`, ³​`math score`, ⁴​`reading score`,
#> #   ⁵​`writing score`

1.3 Check Type Data

# Show each column's type together with a few sample values.
student %>% str()
#> spc_tbl_ [1,000 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
#>  $ gender                     : chr [1:1000] "female" "female" "female" "male" ...
#>  $ race/ethnicity             : chr [1:1000] "group B" "group C" "group B" "group A" ...
#>  $ parental level of education: chr [1:1000] "bachelor's degree" "some college" "master's degree" "associate's degree" ...
#>  $ lunch                      : chr [1:1000] "standard" "standard" "standard" "free/reduced" ...
#>  $ test preparation course    : chr [1:1000] "none" "completed" "none" "none" ...
#>  $ math score                 : num [1:1000] 72 69 90 47 76 71 88 40 64 38 ...
#>  $ reading score              : num [1:1000] 72 90 95 57 78 83 95 43 64 60 ...
#>  $ writing score              : num [1:1000] 74 88 93 44 75 78 92 39 67 50 ...
#>  - attr(*, "spec")=
#>   .. cols(
#>   ..   gender = col_character(),
#>   ..   `race/ethnicity` = col_character(),
#>   ..   `parental level of education` = col_character(),
#>   ..   lunch = col_character(),
#>   ..   `test preparation course` = col_character(),
#>   ..   `math score` = col_double(),
#>   ..   `reading score` = col_double(),
#>   ..   `writing score` = col_double()
#>   .. )
#>  - attr(*, "problems")=<externalptr>
# Convert the 'race/ethnicity' and 'parental level of education' columns
# to factors.
student <- mutate(
  student,
  across(c(`race/ethnicity`, `parental level of education`), factor)
)

# Inspect the structure again to confirm the conversion.
student %>% str()
#> tibble [1,000 × 8] (S3: tbl_df/tbl/data.frame)
#>  $ gender                     : chr [1:1000] "female" "female" "female" "male" ...
#>  $ race/ethnicity             : Factor w/ 5 levels "group A","group B",..: 2 3 2 1 3 2 2 2 4 2 ...
#>  $ parental level of education: Factor w/ 6 levels "associate's degree",..: 2 5 4 1 5 1 5 5 3 3 ...
#>  $ lunch                      : chr [1:1000] "standard" "standard" "standard" "free/reduced" ...
#>  $ test preparation course    : chr [1:1000] "none" "completed" "none" "none" ...
#>  $ math score                 : num [1:1000] 72 69 90 47 76 71 88 40 64 38 ...
#>  $ reading score              : num [1:1000] 72 90 95 57 78 83 95 43 64 60 ...
#>  $ writing score              : num [1:1000] 74 88 93 44 75 78 92 39 67 50 ...

1.4 Missing Value

# Count the missing values across the whole data set.
student %>%
  is.na() %>%
  sum()
#> [1] 0

2. EDA

2.1 Gender

# Give the columns whose names contain "/" or spaces dot-separated names.
student <- rename(
  student,
  race.ethnicity = `race/ethnicity`,
  parental.level.of.education = `parental level of education`,
  test.preparation.course = `test preparation course`
)

# Build one frequency table per categorical column. Each table has the
# columns `var` (the level) and `freq` (its count), sorted by descending
# frequency.
count_levels <- function(values) {
  counts <- as.data.frame(table(values))
  names(counts) <- c("var", "freq")
  counts[order(-counts$freq), ]
}

categorical_cols <- c(
  "race.ethnicity",
  "parental.level.of.education",
  "lunch",
  "test.preparation.course",
  "gender"
)

# lapply() keeps the column names, so the result is a list named by column.
nunique_df <- lapply(student[categorical_cols], count_levels)

# Pie chart of the gender frequency table.
plot_gender <- plot_ly(
  data = nunique_df$gender,
  labels = ~var,
  values = ~freq,
  type = "pie",
  marker = list(colors = c("#0072B2", "#E69F00"))
)
plot_gender <- layout(plot_gender, title = list(text = "Gender"))
plot_gender

Insight Grafik di atas menunjukkan perbandingan gender antara laki-laki dan perempuan, di mana laki-laki lebih banyak daripada perempuan.

2.2 Race/Ethnicity

# Pie chart of the race/ethnicity frequency table (one colour per group).
race_colours <- c("#0072B2", "#E69F00", "#009E73", "#F0E442", "#D55E00")
plot_race_ethnicity <- plot_ly(
  data = nunique_df$race.ethnicity,
  labels = ~var,
  values = ~freq,
  type = "pie",
  marker = list(colors = race_colours)
)
plot_race_ethnicity <- layout(
  plot_race_ethnicity,
  title = list(text = "Race/Ethnicity")
)

plot_race_ethnicity

Insight Grafik di atas menunjukkan perbandingan jumlah siswa berdasarkan ras/etnis, di mana ras C dan D menjadi mayoritas dan ras A menjadi minoritas.

2.3 Parental level of education

# Pie chart of the parental-education frequency table (one colour per level).
education_colours <- c(
  "#0072B2", "#E69F00", "#009E73", "#F0E442", "#D55E00", "#CC79A7"
)
plot_parental_education <- plot_ly(
  data = nunique_df$parental.level.of.education,
  labels = ~var,
  values = ~freq,
  type = "pie",
  marker = list(colors = education_colours)
)
plot_parental_education <- layout(
  plot_parental_education,
  title = list(text = "Parental Level of Education")
)
plot_parental_education

Insight Grafik di atas menunjukkan perbandingan tingkat pendidikan orang tua, di mana mayoritas tingkat pendidikan orang tua adalah some college dan associate's degree.

2.4 Lunch

# Pie chart of the lunch-type frequency table.
plot_lunch <- plot_ly(
  data = nunique_df$lunch,
  labels = ~var,
  values = ~freq,
  type = "pie",
  marker = list(colors = c("#0072B2", "#E69F00"))
)
plot_lunch <- layout(plot_lunch, title = list(text = "Lunch"))
plot_lunch

Insight Grafik di atas menunjukkan perbandingan makan siang antara standard dan free/reduced, dimana sebagian besar siswa mendapatkan makan siang standard.

2.5 Test preparation course

# Pie chart of the test-preparation-course frequency table.
plot_test_prep <- plot_ly(
  data = nunique_df$test.preparation.course,
  labels = ~var,
  values = ~freq,
  type = "pie",
  marker = list(colors = c("#0072B2", "#E69F00"))
)
plot_test_prep <- layout(
  plot_test_prep,
  title = list(text = "Test Preparation Course")
)
plot_test_prep

Insight Grafik di atas menunjukkan perbandingan persiapan tes, di mana sebagian besar siswa tidak mengikuti kursus persiapan tes.

2.6 Numeric Columns

# Build a box plot (with the mean marker enabled) for one score column.
#   data      : data frame that holds the score column
#   score_col : name of the score column, given as a string
#   label     : text used for both the trace name and the plot title
# No `template` argument is passed: plot_ly() forwards extra arguments to the
# trace as attributes, and box traces have no `template` attribute.
make_score_box <- function(data, score_col, label) {
  box_colour <- "rgb(7,40,89)"
  plot_ly(
    data,
    # Build the one-sided formula (e.g. ~`math score`) from the column name.
    y = as.formula(paste0("~`", score_col, "`")),
    type = "box",
    boxmean = TRUE,
    marker = list(color = box_colour),
    line = list(color = box_colour),
    name = label
  ) %>%
    layout(title = label, font = list(size = 17, family = "Franklin Gothic"))
}

math_plot <- make_score_box(student, "math score", "Math score")
read_plot <- make_score_box(student, "reading score", "Reading Score")
writing_plot <- make_score_box(student, "writing score", "Writing Score")

# Show the three box plots side by side under one overall title.
subplot(math_plot, read_plot, writing_plot,
        nrows = 1, shareX = TRUE, shareY = TRUE) %>%
  layout(title = "Box Plot Score")

Insight Grafik di atas menunjukkan persebaran data skor yang bervariasi di setiap tes

3 Analysis: Relationship between Attributes

3.1 Gender Analysis

# Reshape the three score columns into long form (one row per student and
# score type), then take the median score for every gender / score-type pair.
df <- student %>%
  select(gender, `math score`, `reading score`, `writing score`) %>%
  pivot_longer(
    cols = c(`math score`, `reading score`, `writing score`),
    names_to = "score_type",
    values_to = "score"
  ) %>%
  group_by(gender, score_type) %>%
  summarize(score = median(score), .groups = "drop")

# Grouped bar chart comparing the median math, reading, and writing scores
# of each gender.
fig <- plot_ly(
  data = df,
  x = ~score_type,
  y = ~score,
  color = ~gender,
  type = "bar"
)
fig <- layout(
  fig,
  title = "Comparison of Math, Reading, and Writing Scores by Gender"
)
fig

Insight Grafik di atas menunjukkan perbandingan nilai antara laki-laki dan perempuan pada mata pelajaran matematika, membaca, dan menulis. Terlihat bahwa laki-laki memiliki nilai yang lebih tinggi hanya pada mata pelajaran matematika, sementara perempuan lebih unggul pada mata pelajaran membaca dan menulis.

3.2 Score and lunch

# Histograms of each score, split by lunch type.
# position = "identity" overlays the two semi-transparent groups; the default
# ("stack") piles one group on top of the other, which makes the upper
# group's bar heights unreadable as counts.
plot1 <- ggplot(student, aes(x = `math score`, fill = lunch)) +
  geom_histogram(alpha = 0.5, bins = 30, position = "identity") +
  labs(title = "Math Score", x = "Math Score", y = "Count") +
  scale_fill_discrete(name = "Lunch")

plot2 <- ggplot(student, aes(x = `reading score`, fill = lunch)) +
  geom_histogram(alpha = 0.5, bins = 30, position = "identity") +
  labs(title = "Reading Score", x = "Reading Score", y = "Count") +
  scale_fill_discrete(name = "Lunch")

plot3 <- ggplot(student, aes(x = `writing score`, fill = lunch)) +
  geom_histogram(alpha = 0.5, bins = 30, position = "identity") +
  labs(title = "Writing Score", x = "Writing Score", y = "Count") +
  scale_fill_discrete(name = "Lunch")

# Convert the ggplot objects to plotly objects.
math_lunch <- ggplotly(plot1)
read_lunch <- ggplotly(plot2)
writing_lunch <- ggplotly(plot3)

# Show the three histograms side by side. The title names the variable that
# is actually compared: lunch type, not whether a student had lunch.
subplot(math_lunch, read_lunch, writing_lunch,
        nrows = 1, shareX = TRUE, shareY = TRUE) %>%
  layout(title = "Score distributions of students by lunch type")

Insight Grafik di atas menunjukkan distribusi nilai matematika, membaca, dan menulis berdasarkan jenis makan siang (standar dan gratis/diskon). Ketiga grafik menunjukkan bahwa siswa dengan makan siang gratis/diskon memiliki skor yang lebih tinggi dibandingkan dengan siswa dengan makan siang standar.

3.3 Parental level of education and scores

# Density plots of each score, filled by parental level of education.
# guide = "none" hides the ggplot legend; it replaces the deprecated
# `guide = FALSE` spelling.
plot1 <- ggplot(student, aes(x = `math score`, fill = `parental.level.of.education`)) +
  geom_density(alpha = 0.5) +
  labs(title = "Math Score", x = "Score", y = "Density") +
  scale_fill_discrete(name = "Parental\nEducation", guide = "none")

plot2 <- ggplot(student, aes(x = `reading score`, fill = `parental.level.of.education`)) +
  geom_density(alpha = 0.5) +
  labs(title = "Reading Score", x = "Score", y = "Density") +
  scale_fill_discrete(name = "Parental\nEducation", guide = "none")

plot3 <- ggplot(student, aes(x = `writing score`, fill = `parental.level.of.education`)) +
  geom_density(alpha = 0.5) +
  labs(title = "Writing Score", x = "Score", y = "Density") +
  scale_fill_discrete(name = "Parental\nEducation", guide = "none")


# Convert plot1 to plotly and set its title and axis labels.
math_parenting <- ggplotly(plot1) %>% layout(title = "Math Score",
                                       xaxis = list(title = "Math Score"),
                                       yaxis = list(title = "Density"),
                                       showlegend = TRUE)

# Convert plot2 to plotly and set its title and axis labels.
reading_parenting <- ggplotly(plot2) %>% layout(title = "Reading Score",
                                       xaxis = list(title = "Reading Score"),
                                       yaxis = list(title = "Density"),
                                       showlegend = TRUE)

# Convert plot3 to plotly and set its title and axis labels.
writing_pareting <- ggplotly(plot3) %>% layout(title = "Writing Score",
                                       xaxis = list(title = "Writing Score"),
                                       yaxis = list(title = "Density"),
                                       showlegend = TRUE)

# Show the three density plots side by side under one overall title.
subplot(math_parenting, reading_parenting, writing_pareting,
        nrows = 1, shareX = TRUE, shareY = TRUE) %>%
  layout(title = "Score distributions of students by parental level of education")

Insight Grafik di atas menunjukkan distribusi nilai matematika, membaca, dan menulis berdasarkan klasifikasi tingkat pendidikan orang tua, di mana tingkat pendidikan orang tua tidak berpengaruh signifikan terhadap nilai siswa.

3.4 Race/ethnicity and score

# Density curves of each score, coloured by race/ethnicity.
plotmath <- ggplot(student, aes(x = `math score`, color = `race.ethnicity`)) +
  geom_density(alpha = 0.5) +
  labs(title = "Math Score", x = "Score", y = "Density") +
  scale_color_discrete(name = "Race/Ethnicity")

# `plotread` holds the reading plot and `plotwriting` the writing plot, so
# each variable name matches the score it shows.
plotread <- ggplot(student, aes(x = `reading score`, color = `race.ethnicity`)) +
  geom_density(alpha = 0.5) +
  labs(title = "Reading Score", x = "Score", y = "Density") +
  scale_color_discrete(name = "Race/Ethnicity")

plotwriting <- ggplot(student, aes(x = `writing score`, color = `race.ethnicity`)) +
  geom_density(alpha = 0.5) +
  labs(title = "Writing Score", x = "Score", y = "Density") +
  scale_color_discrete(name = "Race/Ethnicity")


# Convert to plotly objects (order: math, reading, writing).
p1 <- ggplotly(plotmath)
p2 <- ggplotly(plotread)
p3 <- ggplotly(plotwriting)

# Combine the plots. The y-axis shows density, not counts.
subplot(p1, p2, p3, nrows = 1, shareX = TRUE, shareY = TRUE) %>%
  layout(title = "Score distributions of students based on Race/ethnicity",
         xaxis = list(title = "Score"),
         yaxis = list(title = "Density"))

Insight Grafik di atas menunjukkan distribusi nilai matematika, membaca, dan menulis berdasarkan klasifikasi ras/etnis, di mana ras/etnis tidak berpengaruh signifikan terhadap nilai siswa.

3.5 Test preparation course and score

library(plotly)

# Histograms of each score, split by test-preparation-course status.
# position = "identity" overlays the two semi-transparent groups; the default
# ("stack") piles one group on top of the other, which makes the upper
# group's bar heights unreadable as counts.
plot1 <- ggplot(student, aes(x = `math score`, fill = `test.preparation.course`)) +
  geom_histogram(alpha = 0.5, bins = 30, position = "identity") +
  labs(title = "Math Score", x = "Math Score", y = "Count") +
  scale_fill_manual(values = c("#1f77b4", "#ff7f0e"), name = "Test\nPreparation\nCourse")

plot2 <- ggplot(student, aes(x = `reading score`, fill = `test.preparation.course`)) +
  geom_histogram(alpha = 0.5, bins = 30, position = "identity") +
  labs(title = "Reading Score", x = "Reading Score", y = "Count") +
  scale_fill_manual(values = c("#1f77b4", "#ff7f0e"), name = "Test\nPreparation\nCourse")

plot3 <- ggplot(student, aes(x = `writing score`, fill = `test.preparation.course`)) +
  geom_histogram(alpha = 0.5, bins = 30, position = "identity") +
  labs(title = "Writing Score", x = "Writing Score", y = "Count") +
  scale_fill_manual(values = c("#1f77b4", "#ff7f0e"), name = "Test\nPreparation\nCourse")

# Convert ggplot objects to plotly objects.
math_pre <- ggplotly(plot1)
read_pre <- ggplotly(plot2)
writing_pre <- ggplotly(plot3)

# Combine the plots side by side under one overall title.
subplot(math_pre, read_pre, writing_pre, nrows = 1, shareX = TRUE, shareY = TRUE) %>%
  layout(title = "Score distributions of Students Based on Test Preparation Course and Score")

Insight Grafik di atas menunjukkan distribusi nilai matematika, membaca, dan menulis berdasarkan klasifikasi persiapan siswa sebelum ujian, di mana siswa yang melakukan persiapan sebelum ujian memiliki skor lebih tinggi di setiap tes dibandingkan dengan siswa yang tidak melakukan persiapan.

4. Summary

Kesimpulan yang bisa diambil dari analisis di atas adalah:
1. Laki-laki cenderung lebih baik dalam matematika dibandingkan perempuan, sedangkan perempuan cenderung lebih baik dalam membaca dan menulis dibandingkan laki-laki.

2. Ras/etnis dan tingkat pendidikan orang tua tidak berpengaruh signifikan terhadap skor ujian.

3. Dalam hal makan siang (standar, gratis/diskon), opsi makan siang gratis/diskon memiliki skor lebih tinggi daripada opsi makan siang standar.

4. Siswa yang melakukan persiapan sebelum tes memiliki nilai yang lebih tinggi daripada siswa yang tidak melakukan persiapan.