# Membuka library yang diperlukan
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(moments)
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(stats)
library(datasets)

Preprocess

# Load the starwars dataset into the workspace and display it.
data(list = "starwars")
starwars
## # A tibble: 87 × 14
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Luke Sk…    172    77 blond      fair       blue            19   male  mascu…
##  2 C-3PO       167    75 <NA>       gold       yellow         112   none  mascu…
##  3 R2-D2        96    32 <NA>       white, bl… red             33   none  mascu…
##  4 Darth V…    202   136 none       white      yellow          41.9 male  mascu…
##  5 Leia Or…    150    49 brown      light      brown           19   fema… femin…
##  6 Owen La…    178   120 brown, gr… light      blue            52   male  mascu…
##  7 Beru Wh…    165    75 brown      light      blue            47   fema… femin…
##  8 R5-D4        97    32 <NA>       white, red red             NA   none  mascu…
##  9 Biggs D…    183    84 black      light      brown           24   male  mascu…
## 10 Obi-Wan…    182    77 auburn, w… fair       blue-gray       57   male  mascu…
## # ℹ 77 more rows
## # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
# Keep only the observations that have no missing value in any column.
# NOTE(review): na.omit() is applied to all 14 columns, so rows are dropped
# for NAs in columns unrelated to height/mass (87 -> 29 rows per the printed
# output). Selecting height and mass first would keep at least as many
# rows -- confirm this ordering is intended.
data_clean <- na.omit(starwars)
data_clean
## # A tibble: 29 × 14
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Luke Sk…    172    77 blond      fair       blue            19   male  mascu…
##  2 Darth V…    202   136 none       white      yellow          41.9 male  mascu…
##  3 Leia Or…    150    49 brown      light      brown           19   fema… femin…
##  4 Owen La…    178   120 brown, gr… light      blue            52   male  mascu…
##  5 Beru Wh…    165    75 brown      light      blue            47   fema… femin…
##  6 Biggs D…    183    84 black      light      brown           24   male  mascu…
##  7 Obi-Wan…    182    77 auburn, w… fair       blue-gray       57   male  mascu…
##  8 Anakin …    188    84 blond      fair       blue            41.9 male  mascu…
##  9 Chewbac…    228   112 brown      unknown    blue           200   male  mascu…
## 10 Han Solo    180    80 brown      fair       brown           29   male  mascu…
## # ℹ 19 more rows
## # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
# Restrict the cleaned data to the two variables analysed below:
# "height" and "mass".
data_clean <- select(data_clean, height, mass)
data_clean
## # A tibble: 29 × 2
##    height  mass
##     <int> <dbl>
##  1    172    77
##  2    202   136
##  3    150    49
##  4    178   120
##  5    165    75
##  6    183    84
##  7    182    77
##  8    188    84
##  9    228   112
## 10    180    80
## # ℹ 19 more rows
# memanggil library skimr dan psych
library(skimr)
## Warning: package 'skimr' was built under R version 4.4.3
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
## 
## Attaching package: 'psych'
## The following object is masked from 'package:car':
## 
##     logit
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

1. EKSPLORASI DATA

# Descriptive statistics for "height": mean, five-number summary,
# variance and standard deviation, collected in a one-row tibble.
summary_height <- summarise(
  data_clean,
  Mean_Height     = mean(height),
  Min_Height      = min(height),
  Q1_Height       = quantile(height, probs = 0.25),
  Median_Height   = median(height),
  Q3_Height       = quantile(height, probs = 0.75),
  Max_Height      = max(height),
  Variance_Height = var(height),
  SD_Height       = sd(height)
)
summary_height
## # A tibble: 1 × 8
##   Mean_Height Min_Height Q1_Height Median_Height Q3_Height Max_Height
##         <dbl>      <int>     <dbl>         <int>     <dbl>      <int>
## 1        179.         88       172           180       188        228
## # ℹ 2 more variables: Variance_Height <dbl>, SD_Height <dbl>
# Descriptive statistics for "mass": mean, five-number summary,
# variance and standard deviation, collected in a one-row tibble.
summary_mass <- summarise(
  data_clean,
  Mean_Mass     = mean(mass),
  Min_Mass      = min(mass),
  Q1_Mass       = quantile(mass, probs = 0.25),
  Median_Mass   = median(mass),
  Q3_Mass       = quantile(mass, probs = 0.75),
  Max_Mass      = max(mass),
  Variance_Mass = var(mass),
  SD_Mass       = sd(mass)
)
summary_mass
## # A tibble: 1 × 8
##   Mean_Mass Min_Mass Q1_Mass Median_Mass Q3_Mass Max_Mass Variance_Mass SD_Mass
##       <dbl>    <dbl>   <dbl>       <dbl>   <dbl>    <dbl>         <dbl>   <dbl>
## 1      77.8       20      75          79      83      136          533.    23.1
# Histogram of the height distribution (bin width of 5 on the height axis).
ggplot(data_clean, aes(x = height)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black", alpha = 0.7) +
  labs(
    title = "Distribusi Height Karakter Star Wars",
    x = "Height (cm)",
    y = "Frekuensi"
  ) +
  theme_minimal()

# Histogram of the mass distribution (bin width of 5 on the mass axis).
ggplot(data_clean, aes(x = mass)) +
  geom_histogram(binwidth = 5, fill = "red", color = "black", alpha = 0.7) +
  labs(
    title = "Distribusi Mass Karakter Star Wars",
    x = "Mass (kg)",
    y = "Frekuensi"
  ) +
  theme_minimal()

# Shape of the two distributions: skewness and kurtosis, computed with the
# moments package. moments::kurtosis() reports Pearson's (non-excess)
# kurtosis, for which a normal distribution scores 3.
height_values <- data_clean[["height"]]
mass_values <- data_clean[["mass"]]

# Skewness and kurtosis of height
skewness_height <- moments::skewness(height_values, na.rm = TRUE)
kurtosis_height <- moments::kurtosis(height_values, na.rm = TRUE)

# Skewness and kurtosis of mass
skewness_mass <- moments::skewness(mass_values, na.rm = TRUE)
kurtosis_mass <- moments::kurtosis(mass_values, na.rm = TRUE)

# Report the results
cat("Skewness & Kurtosis Height:\n")
## Skewness & Kurtosis Height:
cat("Skewness:", skewness_height, "\n")
## Skewness: -2.014021
cat("Kurtosis:", kurtosis_height, "\n\n")
## Kurtosis: 10.9963
cat("Skewness & Kurtosis Mass:\n")
## Skewness & Kurtosis Mass:
cat("Skewness:", skewness_mass, "\n")
## Skewness: 0.1794252
cat("Kurtosis:", kurtosis_mass, "\n")
## Kurtosis: 4.124474
# Boxplot of the height distribution.
ggplot(data_clean, aes(y = height)) +
  geom_boxplot(fill = "blue", color = "black", alpha = 0.7) +
  labs(
    title = "Boxplot Height Karakter Star Wars",
    y = "Height (cm)"
  ) +
  theme_minimal()

# Boxplot of the mass distribution.
ggplot(data_clean, aes(y = mass)) +
  geom_boxplot(fill = "red", color = "black", alpha = 0.7) +
  labs(
    title = "Boxplot Mass Karakter Star Wars",
    y = "Mass (kg)"
  ) +
  theme_minimal()

# Scatterplot of height against mass with a fitted straight line.
ggplot(data_clean, aes(x = mass, y = height)) +
  # Semi-transparent blue points, one per observation
  geom_point(color = "blue", alpha = 0.7) +
  # Linear regression line (method = "lm") drawn without a confidence band
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  ggtitle("Scatterplot Height vs Mass") +
  xlab("Mass (kg)") +
  ylab("Height (cm)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

2. Mencari pencilan dengan teknik visual & uji formal

# memanggil library outliers
library(outliers)
## 
## Attaching package: 'outliers'
## The following object is masked from 'package:psych':
## 
##     outlier
# Visual outlier detection: points drawn beyond the boxplot whiskers are
# candidate outliers.
ggplot(data_clean, aes(y = height)) +
  geom_boxplot(fill = "blue", alpha = 0.7) +
  labs(title = "Boxplot Height") +
  theme_minimal()

ggplot(data_clean, aes(y = mass)) +
  geom_boxplot(fill = "red", alpha = 0.7) +
  labs(title = "Boxplot Mass") +
  theme_minimal()

# Formal outlier check with the 1.5 * IQR rule, applied separately to
# height and mass.

# Quartiles and interquartile range of height
height_quartiles <- quantile(data_clean$height, probs = c(0.25, 0.75))
Q1_height <- height_quartiles[1]
Q3_height <- height_quartiles[2]
IQR_height <- Q3_height - Q1_height

# Quartiles and interquartile range of mass
mass_quartiles <- quantile(data_clean$mass, probs = c(0.25, 0.75))
Q1_mass <- mass_quartiles[1]
Q3_mass <- mass_quartiles[2]
IQR_mass <- Q3_mass - Q1_mass

# Outlier fences: 1.5 * IQR below Q1 and above Q3
lower_bound_height <- Q1_height - 1.5 * IQR_height
upper_bound_height <- Q3_height + 1.5 * IQR_height

lower_bound_mass <- Q1_mass - 1.5 * IQR_mass
upper_bound_mass <- Q3_mass + 1.5 * IQR_mass

# An observation is an outlier when it does not lie within the fences
height_inside <- data_clean$height >= lower_bound_height &
  data_clean$height <= upper_bound_height
mass_inside <- data_clean$mass >= lower_bound_mass &
  data_clean$mass <= upper_bound_mass

outliers_height <- data_clean$height[!height_inside]
outliers_mass <- data_clean$mass[!mass_inside]

# Print the detected outliers
print("Outlier Height:")
## [1] "Outlier Height:"
print(outliers_height)
## [1] 228  88
print("Outlier Mass:")
## [1] "Outlier Mass:"
print(outliers_mass)
##  [1] 136.0  49.0 120.0 112.0 113.0  20.0  45.0  55.0  56.2  50.0

Kesimpulan : outlier pada Height adalah 228 dan 88, outlier pada Mass adalah 20, 45, 49, 50, 55, 56.2, 112, 113, 120, 136

3. Winsorized / Trimmed Mean untuk rata-rata yang kekar?

# memanggil library DescTools
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.4.3
## 
## Attaching package: 'DescTools'
## The following objects are masked from 'package:psych':
## 
##     AUC, ICC, SD
## The following object is masked from 'package:car':
## 
##     Recode
# Trimmed mean with a trimming fraction of 15%: mean() drops that fraction
# of observations from each end of the sorted data before averaging.
trim_fraction <- 0.15
trimmed_mean_height <- mean(data_clean$height, trim = trim_fraction)
trimmed_mean_mass <- mean(data_clean$mass, trim = trim_fraction)

print(paste("Trimmed Mean Height:", trimmed_mean_height))
## [1] "Trimmed Mean Height: 180.380952380952"
print(paste("Trimmed Mean Mass:", trimmed_mean_mass))
## [1] "Trimmed Mean Mass: 76.6857142857143"
# Winsorized mean with a 15% threshold on each tail.

# Clamp `x` to its lower and upper sample quantiles.
#   x     : numeric vector to winsorize.
#   probs : length-2 vector with the lower and upper quantile probabilities.
# Returns a vector of the same length as `x`; values below the lower limit
# are raised to it and values above the upper limit are lowered to it.
# The limits stay local to the function, so the IQR fences stored in
# lower_bound_* / upper_bound_* by the outlier section are not overwritten.
# NOTE(review): the limits are interpolated sample quantiles (quantile()
# default), not order statistics, so the result can differ slightly from a
# classical "replace the k most extreme values" winsorized mean -- confirm
# this is the intended definition.
winsorize_at_quantiles <- function(x, probs = c(0.15, 0.85)) {
  limits <- quantile(x, probs = probs, na.rm = TRUE, names = FALSE)
  pmax(pmin(x, limits[2]), limits[1])
}

# Winsorize both variables at the 15% / 85% quantiles
winsorized_height <- winsorize_at_quantiles(data_clean$height)
winsorized_mass <- winsorize_at_quantiles(data_clean$mass)

# Winsorized means
winsorized_mean_height <- mean(winsorized_height, na.rm = TRUE)
winsorized_mean_mass <- mean(winsorized_mass, na.rm = TRUE)

# Print the results
print(winsorized_mean_height)
## [1] 180.5862
print(winsorized_mean_mass)
## [1] 74.74483

Dari kedua metode yang telah dilakukan, menurut saya kedua metode tersebut memiliki kelebihan & kekurangan masing-masing.

Winsorized Mean : Kelebihannya adalah tetap menggunakan semua data & dapat mengurangi efek outlier tanpa kehilangan informasi. Kekurangannya adalah tetap dipengaruhi oleh outlier & kurang efektif jika outlier sangat ekstrem.

Trimmed Mean : Kelebihannya adalah lebih efektif menghilangkan pengaruh outlier & memberi hasil yang lebih stabil. Kekurangannya adalah mengurangi informasi dengan menghapus data.

Kesimpulan : Menurut saya, untuk mendapatkan rata-rata yang kekar lebih baik menggunakan metode Trimmed Mean karena sudah tidak dipengaruhi oleh outlier lagi (sudah dipotong).

4. Hasil penambahan amatan baru (height 210 dan mass 100)

# Add a new observation to data_clean and check it against the
# 1.5 * IQR outlier rule.

# Values of the new observation, defined once so that the appended row, the
# comparisons and the printed messages cannot drift apart.
new_height <- 210
new_mass <- 100

# Append the new observation to data_clean
data_clean <- rbind(
  data_clean,
  data.frame(height = new_height, mass = new_mass)
)

# Quartiles and IQR, recomputed on the data that now includes the new row
# (so the new observation itself contributes to the fences)
Q1_height <- quantile(data_clean$height, 0.25, na.rm = TRUE)
Q3_height <- quantile(data_clean$height, 0.75, na.rm = TRUE)
IQR_height <- Q3_height - Q1_height

Q1_mass <- quantile(data_clean$mass, 0.25, na.rm = TRUE)
Q3_mass <- quantile(data_clean$mass, 0.75, na.rm = TRUE)
IQR_mass <- Q3_mass - Q1_mass

# Outlier fences: 1.5 * IQR below Q1 and above Q3
lower_bound_height <- Q1_height - 1.5 * IQR_height
upper_bound_height <- Q3_height + 1.5 * IQR_height

lower_bound_mass <- Q1_mass - 1.5 * IQR_mass
upper_bound_mass <- Q3_mass + 1.5 * IQR_mass

# Is the new observation outside the fences for each variable?
is_outlier_height <- (new_height < lower_bound_height) |
  (new_height > upper_bound_height)
is_outlier_mass <- (new_mass < lower_bound_mass) |
  (new_mass > upper_bound_mass)

# Print the results
print(paste("Apakah height =", new_height, "outlier?", is_outlier_height))
## [1] "Apakah height = 210 outlier? FALSE"
print(paste("Apakah mass =", new_mass, "outlier?", is_outlier_mass))
## [1] "Apakah mass = 100 outlier? TRUE"

Kesimpulan : Height=210 tidak termasuk ke dalam outlier dan Mass=100 termasuk ke dalam outlier.