# Membuka library yang diperlukan
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(moments)
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(stats)
library(datasets)
Preprocess
# Load the `starwars` dataset into the workspace and display it.
data(starwars, package = "dplyr")
starwars
## # A tibble: 87 × 14
## name height mass hair_color skin_color eye_color birth_year sex gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 Luke Sk… 172 77 blond fair blue 19 male mascu…
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu…
## 3 R2-D2 96 32 <NA> white, bl… red 33 none mascu…
## 4 Darth V… 202 136 none white yellow 41.9 male mascu…
## 5 Leia Or… 150 49 brown light brown 19 fema… femin…
## 6 Owen La… 178 120 brown, gr… light blue 52 male mascu…
## 7 Beru Wh… 165 75 brown light blue 47 fema… femin…
## 8 R5-D4 97 32 <NA> white, red red NA none mascu…
## 9 Biggs D… 183 84 black light brown 24 male mascu…
## 10 Obi-Wan… 182 77 auburn, w… fair blue-gray 57 male mascu…
## # ℹ 77 more rows
## # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
## # vehicles <list>, starships <list>
# Keep only the observations that have no missing values.
# NOTE(review): na.omit() is applied to the full 14-column table, so a row is
# dropped when *any* variable is missing, not only height or mass; the printed
# result goes from 87 rows down to 29.
data_clean <- na.omit(starwars)
data_clean
## # A tibble: 29 × 14
## name height mass hair_color skin_color eye_color birth_year sex gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 Luke Sk… 172 77 blond fair blue 19 male mascu…
## 2 Darth V… 202 136 none white yellow 41.9 male mascu…
## 3 Leia Or… 150 49 brown light brown 19 fema… femin…
## 4 Owen La… 178 120 brown, gr… light blue 52 male mascu…
## 5 Beru Wh… 165 75 brown light blue 47 fema… femin…
## 6 Biggs D… 183 84 black light brown 24 male mascu…
## 7 Obi-Wan… 182 77 auburn, w… fair blue-gray 57 male mascu…
## 8 Anakin … 188 84 blond fair blue 41.9 male mascu…
## 9 Chewbac… 228 112 brown unknown blue 200 male mascu…
## 10 Han Solo 180 80 brown fair brown 29 male mascu…
## # ℹ 19 more rows
## # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
## # vehicles <list>, starships <list>
# Restrict the cleaned data to the two variables under study: height and mass.
data_clean <- dplyr::select(data_clean, height, mass)
data_clean
## # A tibble: 29 × 2
## height mass
## <int> <dbl>
## 1 172 77
## 2 202 136
## 3 150 49
## 4 178 120
## 5 165 75
## 6 183 84
## 7 182 77
## 8 188 84
## 9 228 112
## 10 180 80
## # ℹ 19 more rows
# memanggil library skimr dan psych
library(skimr)
## Warning: package 'skimr' was built under R version 4.4.3
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
1. EKSPLORASI DATA
# Descriptive statistics for `height`: mean, five-number summary, variance
# and standard deviation, returned as a one-row table.
summary_height <- summarise(
  data_clean,
  Mean_Height = mean(height),
  Min_Height = min(height),
  Q1_Height = quantile(height, probs = 0.25),
  Median_Height = median(height),
  Q3_Height = quantile(height, probs = 0.75),
  Max_Height = max(height),
  Variance_Height = var(height),
  SD_Height = sd(height)
)
summary_height
## # A tibble: 1 × 8
## Mean_Height Min_Height Q1_Height Median_Height Q3_Height Max_Height
## <dbl> <int> <dbl> <int> <dbl> <int>
## 1 179. 88 172 180 188 228
## # ℹ 2 more variables: Variance_Height <dbl>, SD_Height <dbl>
# Descriptive statistics for `mass`: mean, five-number summary, variance
# and standard deviation, returned as a one-row table.
summary_mass <- summarise(
  data_clean,
  Mean_Mass = mean(mass),
  Min_Mass = min(mass),
  Q1_Mass = quantile(mass, probs = 0.25),
  Median_Mass = median(mass),
  Q3_Mass = quantile(mass, probs = 0.75),
  Max_Mass = max(mass),
  Variance_Mass = var(mass),
  SD_Mass = sd(mass)
)
summary_mass
## # A tibble: 1 × 8
## Mean_Mass Min_Mass Q1_Mass Median_Mass Q3_Mass Max_Mass Variance_Mass SD_Mass
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 77.8 20 75 79 83 136 533. 23.1
# Histogram of the height distribution (bin width 5).
ggplot(data = data_clean, mapping = aes(x = height)) +
  geom_histogram(binwidth = 5, fill = "blue", colour = "black", alpha = 0.7) +
  labs(
    title = "Distribusi Height Karakter Star Wars",
    x = "Height (cm)",
    y = "Frekuensi"
  ) +
  theme_minimal()

# Histogram of the mass distribution (bin width 5).
ggplot(data = data_clean, mapping = aes(x = mass)) +
  geom_histogram(binwidth = 5, fill = "red", colour = "black", alpha = 0.7) +
  labs(
    title = "Distribusi Mass Karakter Star Wars",
    x = "Mass (kg)",
    y = "Frekuensi"
  ) +
  theme_minimal()

# Shape statistics for height and mass, taken from the `moments` package.
# The calls are namespace-qualified so that a later-attached package cannot
# mask them. moments::kurtosis() reports Pearson's kurtosis (a normal
# distribution scores 3), not excess kurtosis.
skewness_height <- moments::skewness(data_clean[["height"]], na.rm = TRUE)
skewness_mass <- moments::skewness(data_clean[["mass"]], na.rm = TRUE)
kurtosis_height <- moments::kurtosis(data_clean[["height"]], na.rm = TRUE)
kurtosis_mass <- moments::kurtosis(data_clean[["mass"]], na.rm = TRUE)
# Print the skewness and kurtosis computed above, one labelled line per
# statistic; the lines starting with `##` are the echoed console output.
# Height first ...
cat("Skewness & Kurtosis Height:\n")
## Skewness & Kurtosis Height:
cat("Skewness:", skewness_height, "\n")
## Skewness: -2.014021
# The extra "\n" leaves a blank line between the height and mass sections.
cat("Kurtosis:", kurtosis_height, "\n\n")
## Kurtosis: 10.9963
# ... then mass.
cat("Skewness & Kurtosis Mass:\n")
## Skewness & Kurtosis Mass:
cat("Skewness:", skewness_mass, "\n")
## Skewness: 0.1794252
cat("Kurtosis:", kurtosis_mass, "\n")
## Kurtosis: 4.124474
# Boxplot of the height distribution.
ggplot(data = data_clean, mapping = aes(y = height)) +
  geom_boxplot(fill = "blue", colour = "black", alpha = 0.7) +
  labs(
    title = "Boxplot Height Karakter Star Wars",
    y = "Height (cm)"
  ) +
  theme_minimal()

# Boxplot of the mass distribution.
ggplot(data = data_clean, mapping = aes(y = mass)) +
  geom_boxplot(fill = "red", colour = "black", alpha = 0.7) +
  labs(
    title = "Boxplot Mass Karakter Star Wars",
    y = "Mass (kg)"
  ) +
  theme_minimal()

# Scatterplot of height (y) against mass (x) with a fitted straight line.
ggplot(data_clean, aes(mass, height)) +
  # Semi-transparent blue points
  geom_point(colour = "blue", alpha = 0.7) +
  # Linear-regression line, drawn without its confidence band
  geom_smooth(method = "lm", se = FALSE, colour = "red") +
  ggtitle("Scatterplot Height vs Mass") +
  xlab("Mass (kg)") +
  ylab("Height (cm)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

3. Winsorized / Trimmed Mean untuk rata-rata yang kekar?
# memanggil library DescTools
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.4.3
##
## Attaching package: 'DescTools'
## The following objects are masked from 'package:psych':
##
## AUC, ICC, SD
## The following object is masked from 'package:car':
##
## Recode
# 15% trimmed means: `trim = 0.15` makes mean() discard that fraction of the
# observations from each end of the sorted data before averaging.
trimmed_mean_height <- mean(data_clean[["height"]], trim = 0.15)
trimmed_mean_mass <- mean(data_clean[["mass"]], trim = 0.15)
print(paste("Trimmed Mean Height:", trimmed_mean_height))
## [1] "Trimmed Mean Height: 180.380952380952"
print(paste("Trimmed Mean Mass:", trimmed_mean_mass))
## [1] "Trimmed Mean Mass: 76.6857142857143"
# Winsorized means at the 15% level, computed by hand.
# Clipping limits: the 15th and 85th percentiles of each variable.
lower_bound_height <- quantile(
  data_clean[["height"]], probs = 0.15, na.rm = TRUE
)
upper_bound_height <- quantile(
  data_clean[["height"]], probs = 0.85, na.rm = TRUE
)
lower_bound_mass <- quantile(data_clean[["mass"]], probs = 0.15, na.rm = TRUE)
upper_bound_mass <- quantile(data_clean[["mass"]], probs = 0.85, na.rm = TRUE)
# Clamp every value into [lower, upper]: values outside the limits are
# replaced by the nearest limit instead of being removed.
winsorized_height <- pmin(
  pmax(data_clean[["height"]], lower_bound_height),
  upper_bound_height
)
winsorized_mass <- pmin(
  pmax(data_clean[["mass"]], lower_bound_mass),
  upper_bound_mass
)
# Mean of the clamped values = winsorized mean.
winsorized_mean_height <- mean(winsorized_height, na.rm = TRUE)
winsorized_mean_mass <- mean(winsorized_mass, na.rm = TRUE)
# Show the results.
print(winsorized_mean_height)
## [1] 180.5862
print(winsorized_mean_mass)
## [1] 74.74483
Winsorized Mean : Kelebihannya adalah tetap menggunakan semua amatan
(nilai ekstrem diganti dengan nilai batas, bukan dibuang) sehingga efek
outlier berkurang tanpa mengurangi ukuran data. Kekurangannya adalah
hasilnya masih dipengaruhi oleh outlier melalui nilai pengganti tersebut
dan kurang efektif jika outlier sangat ekstrem.
Trimmed Mean : Kelebihannya adalah lebih efektif menghilangkan
pengaruh outlier dan memberi hasil yang lebih stabil. Kekurangannya
adalah sebagian informasi hilang karena amatan di kedua ujung sebaran dibuang.
4. Hasil penambahan amatan baru (height 210 dan mass 100)
# Add a new observation (height 210, mass 100) to data_clean and check whether
# it is an outlier under the 1.5 * IQR rule.
# The observation is defined once, so the appended row, the values being
# tested and the printed messages cannot drift apart.
new_obs <- data.frame(height = 210, mass = 100)
data_clean <- rbind(data_clean, new_obs)

# Quartiles and interquartile range, computed on the data that already
# includes the new observation.
Q1_height <- quantile(data_clean$height, 0.25, na.rm = TRUE)
Q3_height <- quantile(data_clean$height, 0.75, na.rm = TRUE)
IQR_height <- Q3_height - Q1_height
Q1_mass <- quantile(data_clean$mass, 0.25, na.rm = TRUE)
Q3_mass <- quantile(data_clean$mass, 0.75, na.rm = TRUE)
IQR_mass <- Q3_mass - Q1_mass

# Outlier fences: Q1 - 1.5 * IQR and Q3 + 1.5 * IQR.
# NOTE: these assignments overwrite the lower_/upper_bound_* values that the
# winsorizing step stored under the same names.
lower_bound_height <- Q1_height - 1.5 * IQR_height
upper_bound_height <- Q3_height + 1.5 * IQR_height
lower_bound_mass <- Q1_mass - 1.5 * IQR_mass
upper_bound_mass <- Q3_mass + 1.5 * IQR_mass

# A value is flagged when it falls outside the fences. `||` is used because
# each comparison is a single value; the result is a plain TRUE/FALSE.
is_outlier_height <- (new_obs$height < lower_bound_height) ||
  (new_obs$height > upper_bound_height)
is_outlier_mass <- (new_obs$mass < lower_bound_mass) ||
  (new_obs$mass > upper_bound_mass)

# Report the verdicts.
print(paste("Apakah height =", new_obs$height, "outlier?", is_outlier_height))
## [1] "Apakah height = 210 outlier? FALSE"
print(paste("Apakah mass =", new_obs$mass, "outlier?", is_outlier_mass))
## [1] "Apakah mass = 100 outlier? TRUE"
Kesimpulan : Berdasarkan kriteria 1,5 × IQR, Height = 210 tidak termasuk
outlier, sedangkan Mass = 100 termasuk outlier.