Tugas UAS Manajemen Data Relasional-STA1261
2024-12-03
Berikut ini hasil praktik penggunaan fungsi summarise(), arrange(), filter(), mutate(), dan select() yang saya terapkan pada tahapan wrangling di dataset diamonds. Terdapat juga penjelasan berupa komentar pada tiap potongan kode.
# Memanggil packages library
library(tidyverse)## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)# Memuat dataset diamonds
data(diamonds)# Mengetahui tipe data dari dataset
class(diamonds)## [1] "tbl_df" "tbl" "data.frame"
# Menampilkan beberapa baris awal data
head(diamonds)## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# Mengganti nama kolom depth menjadi depth_p
diamonds <- diamonds %>%
rename( depth_p = depth)# Menampilkan tabel data secara terpisah di jendela baru R Studio
view(diamonds)# Menampilkan ringkasan data di konsol
glimpse(diamonds)## Rows: 53,940
## Columns: 10
## $ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth_p <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
Penggunaan Fungsi Summarise()
# Rata-rata harga berlian
diamonds$price %>% mean()## [1] 3932.8
# Melihat rigkasan data
summary(diamonds)## carat cut color clarity depth_p
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065 Min. :43.00
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258 1st Qu.:61.00
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194 Median :61.80
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171 Mean :61.75
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066 3rd Qu.:62.50
## Max. :5.0100 I: 5422 VVS1 : 3655 Max. :79.00
## J: 2808 (Other): 2531
## table price x y
## Min. :43.00 Min. : 326 Min. : 0.000 Min. : 0.000
## 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710 1st Qu.: 4.720
## Median :57.00 Median : 2401 Median : 5.700 Median : 5.710
## Mean :57.46 Mean : 3933 Mean : 5.731 Mean : 5.735
## 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540 3rd Qu.: 6.540
## Max. :95.00 Max. :18823 Max. :10.740 Max. :58.900
##
## z
## Min. : 0.000
## 1st Qu.: 2.910
## Median : 3.530
## Mean : 3.539
## 3rd Qu.: 4.040
## Max. :31.800
##
Data Filtering
# SELECT kolom x y dan z untuk buat dataset baru yaitu dimension dan ubah nama kolomnya
dimension <- diamonds %>%
select(x, y, z) %>%
rename(
length = x,
wide = y,
depth = z
)
print(dimension)## # A tibble: 53,940 × 3
## length wide depth
## <dbl> <dbl> <dbl>
## 1 3.95 3.98 2.43
## 2 3.89 3.84 2.31
## 3 4.05 4.07 2.31
## 4 4.2 4.23 2.63
## 5 4.34 4.35 2.75
## 6 3.94 3.96 2.48
## 7 3.95 3.98 2.47
## 8 4.07 4.11 2.53
## 9 3.87 3.78 2.49
## 10 4 4.05 2.39
## # ℹ 53,930 more rows
# Mengambil berlian dengan carat dalam rentang 0.50 sampai 1.99
mid_carat_diamonds <- diamonds %>% filter(carat >= 0.50 & carat <= 1.99)
print(mid_carat_diamonds)## # A tibble: 34,112 × 10
## carat cut color clarity depth_p table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.7 Ideal E SI1 62.5 57 2757 5.7 5.72 3.57
## 2 0.86 Fair E SI2 55.1 69 2757 6.45 6.33 3.52
## 3 0.7 Ideal G VS2 61.6 56 2757 5.7 5.67 3.5
## 4 0.71 Very Good E VS2 62.4 57 2759 5.68 5.73 3.56
## 5 0.78 Very Good G SI2 63.8 56 2759 5.81 5.85 3.72
## 6 0.7 Good E VS2 57.5 58 2759 5.85 5.9 3.38
## 7 0.7 Good F VS1 59.4 62 2759 5.71 5.76 3.4
## 8 0.96 Fair F SI2 66.3 62 2759 6.27 5.95 4.07
## 9 0.73 Very Good E SI1 61.6 59 2760 5.77 5.78 3.56
## 10 0.8 Premium H SI1 61.5 58 2760 5.97 5.93 3.66
## # ℹ 34,102 more rows
Data Arranging
# Mengurutkan data mid_carat_diamonds dari carat terkecil
mid_carat_diamonds %>% arrange(carat)## # A tibble: 34,112 × 10
## carat cut color clarity depth_p table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.5 Ideal E VVS2 62.2 54 2889 5.08 5.12 3.17
## 2 0.5 Ideal E VVS2 62.2 54 2889 5.09 5.11 3.17
## 3 0.5 Good D VVS2 62.4 64 3017 5.03 5.06 3.14
## 4 0.5 Good D IF 63.2 59 3378 4.99 5.04 3.17
## 5 0.5 Very Good D IF 62.9 59 3378 4.99 5.09 3.17
## 6 0.5 Fair F I1 69.8 55 584 4.89 4.8 3.38
## 7 0.5 Fair F I1 71 57 613 4.87 4.79 3.43
## 8 0.5 Fair F I1 68.4 54 613 4.94 4.82 3.35
## 9 0.5 Fair F I1 67.1 57 627 4.92 4.87 3.28
## 10 0.5 Fair F I1 68.3 58 627 4.91 4.78 3.32
## # ℹ 34,102 more rows
view(mid_carat_diamonds)Data Reshaping
# Menyiapkan subset data diamonds
subdiamonds <- diamonds %>%
select(carat, cut, color, price) %>%
slice(1:100)
print(subdiamonds)## # A tibble: 100 × 4
## carat cut color price
## <dbl> <ord> <ord> <int>
## 1 0.23 Ideal E 326
## 2 0.21 Premium E 326
## 3 0.23 Good E 327
## 4 0.29 Premium I 334
## 5 0.31 Good J 335
## 6 0.24 Very Good J 336
## 7 0.24 Very Good I 336
## 8 0.26 Very Good H 337
## 9 0.22 Fair E 337
## 10 0.23 Very Good H 338
## # ℹ 90 more rows
# Reshaping subset_diamonds format wide to long
subdiamonds_long <- subdiamonds %>%
pivot_longer(cols = c(carat, price),
names_to ="attribute",
values_to ="value")
print(subdiamonds_long)## # A tibble: 200 × 4
## cut color attribute value
## <ord> <ord> <chr> <dbl>
## 1 Ideal E carat 0.23
## 2 Ideal E price 326
## 3 Premium E carat 0.21
## 4 Premium E price 326
## 5 Good E carat 0.23
## 6 Good E price 327
## 7 Premium I carat 0.29
## 8 Premium I price 334
## 9 Good J carat 0.31
## 10 Good J price 335
## # ℹ 190 more rows
Data Merging
# Membuat dataset diamond_colors
diamond_colors <- data.frame(
color = c("D", "E", "F", "G", "H", "I", "J"),
description = c("Pure", "Pristine", "Clear", "Light Hint", "Faint", "Tinted", "Yellowish")
)# Merging diamond_colors dengan mid_carat_diamonds
mid_carat_diamonds <- mid_carat_diamonds %>%
left_join(diamond_colors, by="color")
print(mid_carat_diamonds)## # A tibble: 34,112 × 11
## carat cut color clarity depth_p table price x y z description
## <dbl> <ord> <chr> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <chr>
## 1 0.7 Ideal E SI1 62.5 57 2757 5.7 5.72 3.57 Pristine
## 2 0.86 Fair E SI2 55.1 69 2757 6.45 6.33 3.52 Pristine
## 3 0.7 Ideal G VS2 61.6 56 2757 5.7 5.67 3.5 Light Hint
## 4 0.71 Very G… E VS2 62.4 57 2759 5.68 5.73 3.56 Pristine
## 5 0.78 Very G… G SI2 63.8 56 2759 5.81 5.85 3.72 Light Hint
## 6 0.7 Good E VS2 57.5 58 2759 5.85 5.9 3.38 Pristine
## 7 0.7 Good F VS1 59.4 62 2759 5.71 5.76 3.4 Clear
## 8 0.96 Fair F SI2 66.3 62 2759 6.27 5.95 4.07 Clear
## 9 0.73 Very G… E SI1 61.6 59 2760 5.77 5.78 3.56 Pristine
## 10 0.8 Premium H SI1 61.5 58 2760 5.97 5.93 3.66 Faint
## # ℹ 34,102 more rows
Data Aggregating
# Menghitung rata-rata harga tiap berlian berdasarkan hasil cuttingnya
diamonds %>% group_by(cut) %>% summarize (mean=mean(price))## # A tibble: 5 × 2
## cut mean
## <ord> <dbl>
## 1 Fair 4359.
## 2 Good 3929.
## 3 Very Good 3982.
## 4 Premium 4584.
## 5 Ideal 3458.
Feature Engineering
# Membuat variabel baru yaitu dimension yang merupakan hasil perkalian kolom x y dan z
diamonds <- diamonds %>%
mutate(dimension = x*y*z)
print(diamonds)## # A tibble: 53,940 × 11
## carat cut color clarity depth_p table price x y z dimension
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 38.2
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 34.5
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 38.1
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 46.7
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 51.9
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 38.7
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 38.8
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 42.3
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 36.4
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 38.7
## # ℹ 53,930 more rows
# Menambahkan kolom is_expensive untuk melihat kategori berlian mid_carat yang mahal dan tidak
mid_carat_diamonds <- mid_carat_diamonds %>%
mutate(is_expensive = if_else(price > 5000, "YES", "NO"))
head(mid_carat_diamonds)## # A tibble: 6 × 12
## carat cut color clarity depth_p table price x y z description
## <dbl> <ord> <chr> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <chr>
## 1 0.7 Ideal E SI1 62.5 57 2757 5.7 5.72 3.57 Pristine
## 2 0.86 Fair E SI2 55.1 69 2757 6.45 6.33 3.52 Pristine
## 3 0.7 Ideal G VS2 61.6 56 2757 5.7 5.67 3.5 Light Hint
## 4 0.71 Very Go… E VS2 62.4 57 2759 5.68 5.73 3.56 Pristine
## 5 0.78 Very Go… G SI2 63.8 56 2759 5.81 5.85 3.72 Light Hint
## 6 0.7 Good E VS2 57.5 58 2759 5.85 5.9 3.38 Pristine
## # ℹ 1 more variable: is_expensive <chr>
Penggunaan 2 Fungsi Bersamaan (tambahan)
# Membuat dataset high_quality_diamonds dengan menambahkan kolom harga per karat dan memfilter berdasarkan cutting yang premium dan ideal pada data diamonds
high_quality_diamonds <- diamonds %>%
mutate(price_per_carat = price / carat) %>%
filter(cut %in% c("Premium", "Ideal") & price_per_carat > 4000)
head(high_quality_diamonds)## # A tibble: 6 × 12
## carat cut color clarity depth_p table price x y z dimension
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 0.59 Ideal E VVS2 62 55 2761 5.38 5.43 3.35 97.9
## 2 0.63 Premium E VVS1 60.9 60 2765 5.52 5.55 3.37 103.
## 3 0.64 Ideal G VVS1 61.9 56 2766 5.53 5.56 3.43 105.
## 4 0.58 Ideal G VVS1 61.5 55 2772 5.39 5.44 3.33 97.6
## 5 0.58 Ideal F VVS1 61.7 56 2772 5.33 5.37 3.3 94.5
## 6 0.6 Ideal E VS1 61.7 55 2774 5.41 5.44 3.35 98.6
## # ℹ 1 more variable: price_per_carat <dbl>