Tugas UAS Manajemen Data Relasional-STA1261

2024-12-03

Fatima Azzahra / G1401231036

Berikut ini hasil praktik penggunaan fungsi summarise(), arrange(), filter(), mutate(), dan select() yang saya terapkan pada tahapan wrangling di dataset diamonds. Terdapat juga penjelasan berupa komentar pada tiap potongan kode.

# Memanggil packages library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)

library(ggplot2)
# Memuat dataset diamonds
data(diamonds)
# Mengetahui tipe data dari dataset
class(diamonds)
## [1] "tbl_df"     "tbl"        "data.frame"
# Menampilkan beberapa baris awal data
head(diamonds)
## # A tibble: 6 × 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
# Mengganti nama kolom depth menjadi depth_p
diamonds <- diamonds %>%
  rename( depth_p = depth)
# Menampilkan tabel data secara terpisah di jendela baru R Studio
view(diamonds)
# Menampilkan ringkasan data di konsol
glimpse(diamonds)
## Rows: 53,940
## Columns: 10
## $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth_p <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…

Penggunaan Fungsi Summarise()

# Rata-rata harga berlian
diamonds$price %>% mean()
## [1] 3932.8
# Melihat rigkasan data
summary(diamonds)
##      carat               cut        color        clarity         depth_p     
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065   Min.   :43.00  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258   1st Qu.:61.00  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194   Median :61.80  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171   Mean   :61.75  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066   3rd Qu.:62.50  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655   Max.   :79.00  
##                                     J: 2808   (Other): 2531                  
##      table           price             x                y         
##  Min.   :43.00   Min.   :  326   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710   1st Qu.: 4.720  
##  Median :57.00   Median : 2401   Median : 5.700   Median : 5.710  
##  Mean   :57.46   Mean   : 3933   Mean   : 5.731   Mean   : 5.735  
##  3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540   3rd Qu.: 6.540  
##  Max.   :95.00   Max.   :18823   Max.   :10.740   Max.   :58.900  
##                                                                   
##        z         
##  Min.   : 0.000  
##  1st Qu.: 2.910  
##  Median : 3.530  
##  Mean   : 3.539  
##  3rd Qu.: 4.040  
##  Max.   :31.800  
## 

Data Filtering

# SELECT kolom x y dan z untuk buat dataset baru yaitu dimension dan ubah nama kolomnya
dimension <- diamonds %>% 
  select(x, y, z) %>%
  rename(
    length = x,
    wide = y,
    depth = z
  )

print(dimension)
## # A tibble: 53,940 × 3
##    length  wide depth
##     <dbl> <dbl> <dbl>
##  1   3.95  3.98  2.43
##  2   3.89  3.84  2.31
##  3   4.05  4.07  2.31
##  4   4.2   4.23  2.63
##  5   4.34  4.35  2.75
##  6   3.94  3.96  2.48
##  7   3.95  3.98  2.47
##  8   4.07  4.11  2.53
##  9   3.87  3.78  2.49
## 10   4     4.05  2.39
## # ℹ 53,930 more rows
# Mengambil berlian dengan carat dalam rentang 0.50 sampai 1.99
mid_carat_diamonds <- diamonds %>% filter(carat >= 0.50 & carat <= 1.99)
print(mid_carat_diamonds)
## # A tibble: 34,112 × 10
##    carat cut       color clarity depth_p table price     x     y     z
##    <dbl> <ord>     <ord> <ord>     <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.7  Ideal     E     SI1        62.5    57  2757  5.7   5.72  3.57
##  2  0.86 Fair      E     SI2        55.1    69  2757  6.45  6.33  3.52
##  3  0.7  Ideal     G     VS2        61.6    56  2757  5.7   5.67  3.5 
##  4  0.71 Very Good E     VS2        62.4    57  2759  5.68  5.73  3.56
##  5  0.78 Very Good G     SI2        63.8    56  2759  5.81  5.85  3.72
##  6  0.7  Good      E     VS2        57.5    58  2759  5.85  5.9   3.38
##  7  0.7  Good      F     VS1        59.4    62  2759  5.71  5.76  3.4 
##  8  0.96 Fair      F     SI2        66.3    62  2759  6.27  5.95  4.07
##  9  0.73 Very Good E     SI1        61.6    59  2760  5.77  5.78  3.56
## 10  0.8  Premium   H     SI1        61.5    58  2760  5.97  5.93  3.66
## # ℹ 34,102 more rows

Data Arranging

# Mengurutkan data mid_carat_diamonds dari carat terkecil
mid_carat_diamonds %>% arrange(carat)
## # A tibble: 34,112 × 10
##    carat cut       color clarity depth_p table price     x     y     z
##    <dbl> <ord>     <ord> <ord>     <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1   0.5 Ideal     E     VVS2       62.2    54  2889  5.08  5.12  3.17
##  2   0.5 Ideal     E     VVS2       62.2    54  2889  5.09  5.11  3.17
##  3   0.5 Good      D     VVS2       62.4    64  3017  5.03  5.06  3.14
##  4   0.5 Good      D     IF         63.2    59  3378  4.99  5.04  3.17
##  5   0.5 Very Good D     IF         62.9    59  3378  4.99  5.09  3.17
##  6   0.5 Fair      F     I1         69.8    55   584  4.89  4.8   3.38
##  7   0.5 Fair      F     I1         71      57   613  4.87  4.79  3.43
##  8   0.5 Fair      F     I1         68.4    54   613  4.94  4.82  3.35
##  9   0.5 Fair      F     I1         67.1    57   627  4.92  4.87  3.28
## 10   0.5 Fair      F     I1         68.3    58   627  4.91  4.78  3.32
## # ℹ 34,102 more rows
view(mid_carat_diamonds)

Data Reshaping

# Menyiapkan subset data diamonds
subdiamonds <- diamonds %>%
  select(carat, cut, color, price) %>%
  slice(1:100)
print(subdiamonds)
## # A tibble: 100 × 4
##    carat cut       color price
##    <dbl> <ord>     <ord> <int>
##  1  0.23 Ideal     E       326
##  2  0.21 Premium   E       326
##  3  0.23 Good      E       327
##  4  0.29 Premium   I       334
##  5  0.31 Good      J       335
##  6  0.24 Very Good J       336
##  7  0.24 Very Good I       336
##  8  0.26 Very Good H       337
##  9  0.22 Fair      E       337
## 10  0.23 Very Good H       338
## # ℹ 90 more rows
# Reshaping subset_diamonds format wide to long
subdiamonds_long <- subdiamonds %>%
  pivot_longer(cols = c(carat, price),
               names_to ="attribute",
               values_to ="value")
print(subdiamonds_long)
## # A tibble: 200 × 4
##    cut     color attribute  value
##    <ord>   <ord> <chr>      <dbl>
##  1 Ideal   E     carat       0.23
##  2 Ideal   E     price     326   
##  3 Premium E     carat       0.21
##  4 Premium E     price     326   
##  5 Good    E     carat       0.23
##  6 Good    E     price     327   
##  7 Premium I     carat       0.29
##  8 Premium I     price     334   
##  9 Good    J     carat       0.31
## 10 Good    J     price     335   
## # ℹ 190 more rows

Data Merging

# Membuat dataset diamond_colors
diamond_colors <- data.frame(
  color = c("D", "E", "F", "G", "H", "I", "J"),
  description = c("Pure", "Pristine", "Clear", "Light Hint", "Faint", "Tinted", "Yellowish")
)
# Merging diamond_colors dengan mid_carat_diamonds
mid_carat_diamonds <- mid_carat_diamonds %>%
  left_join(diamond_colors, by="color")
print(mid_carat_diamonds)
## # A tibble: 34,112 × 11
##    carat cut     color clarity depth_p table price     x     y     z description
##    <dbl> <ord>   <chr> <ord>     <dbl> <dbl> <int> <dbl> <dbl> <dbl> <chr>      
##  1  0.7  Ideal   E     SI1        62.5    57  2757  5.7   5.72  3.57 Pristine   
##  2  0.86 Fair    E     SI2        55.1    69  2757  6.45  6.33  3.52 Pristine   
##  3  0.7  Ideal   G     VS2        61.6    56  2757  5.7   5.67  3.5  Light Hint 
##  4  0.71 Very G… E     VS2        62.4    57  2759  5.68  5.73  3.56 Pristine   
##  5  0.78 Very G… G     SI2        63.8    56  2759  5.81  5.85  3.72 Light Hint 
##  6  0.7  Good    E     VS2        57.5    58  2759  5.85  5.9   3.38 Pristine   
##  7  0.7  Good    F     VS1        59.4    62  2759  5.71  5.76  3.4  Clear      
##  8  0.96 Fair    F     SI2        66.3    62  2759  6.27  5.95  4.07 Clear      
##  9  0.73 Very G… E     SI1        61.6    59  2760  5.77  5.78  3.56 Pristine   
## 10  0.8  Premium H     SI1        61.5    58  2760  5.97  5.93  3.66 Faint      
## # ℹ 34,102 more rows

Data Aggregating

# Menghitung rata-rata harga tiap berlian berdasarkan hasil cuttingnya
diamonds %>% group_by(cut) %>% summarize (mean=mean(price))
## # A tibble: 5 × 2
##   cut        mean
##   <ord>     <dbl>
## 1 Fair      4359.
## 2 Good      3929.
## 3 Very Good 3982.
## 4 Premium   4584.
## 5 Ideal     3458.

Feature Engineering

# Membuat variabel baru yaitu dimension yang merupakan hasil perkalian kolom x y dan z
diamonds <- diamonds %>%
  mutate(dimension = x*y*z)
print(diamonds)
## # A tibble: 53,940 × 11
##    carat cut       color clarity depth_p table price     x     y     z dimension
##    <dbl> <ord>     <ord> <ord>     <dbl> <dbl> <int> <dbl> <dbl> <dbl>     <dbl>
##  1  0.23 Ideal     E     SI2        61.5    55   326  3.95  3.98  2.43      38.2
##  2  0.21 Premium   E     SI1        59.8    61   326  3.89  3.84  2.31      34.5
##  3  0.23 Good      E     VS1        56.9    65   327  4.05  4.07  2.31      38.1
##  4  0.29 Premium   I     VS2        62.4    58   334  4.2   4.23  2.63      46.7
##  5  0.31 Good      J     SI2        63.3    58   335  4.34  4.35  2.75      51.9
##  6  0.24 Very Good J     VVS2       62.8    57   336  3.94  3.96  2.48      38.7
##  7  0.24 Very Good I     VVS1       62.3    57   336  3.95  3.98  2.47      38.8
##  8  0.26 Very Good H     SI1        61.9    55   337  4.07  4.11  2.53      42.3
##  9  0.22 Fair      E     VS2        65.1    61   337  3.87  3.78  2.49      36.4
## 10  0.23 Very Good H     VS1        59.4    61   338  4     4.05  2.39      38.7
## # ℹ 53,930 more rows
# Menambahkan kolom is_expensive untuk melihat kategori berlian mid_carat yang mahal dan tidak
mid_carat_diamonds <- mid_carat_diamonds %>%
  mutate(is_expensive = if_else(price > 5000, "YES", "NO"))
head(mid_carat_diamonds)
## # A tibble: 6 × 12
##   carat cut      color clarity depth_p table price     x     y     z description
##   <dbl> <ord>    <chr> <ord>     <dbl> <dbl> <int> <dbl> <dbl> <dbl> <chr>      
## 1  0.7  Ideal    E     SI1        62.5    57  2757  5.7   5.72  3.57 Pristine   
## 2  0.86 Fair     E     SI2        55.1    69  2757  6.45  6.33  3.52 Pristine   
## 3  0.7  Ideal    G     VS2        61.6    56  2757  5.7   5.67  3.5  Light Hint 
## 4  0.71 Very Go… E     VS2        62.4    57  2759  5.68  5.73  3.56 Pristine   
## 5  0.78 Very Go… G     SI2        63.8    56  2759  5.81  5.85  3.72 Light Hint 
## 6  0.7  Good     E     VS2        57.5    58  2759  5.85  5.9   3.38 Pristine   
## # ℹ 1 more variable: is_expensive <chr>

Penggunaan 2 Fungsi Bersamaan (tambahan)

# Membuat dataset high_quality_diamonds dengan menambahkan kolom harga per karat dan memfilter berdasarkan cutting yang premium dan ideal pada data diamonds
high_quality_diamonds <- diamonds %>%
  mutate(price_per_carat = price / carat) %>%  
  filter(cut %in% c("Premium", "Ideal") & price_per_carat > 4000) 
head(high_quality_diamonds)
## # A tibble: 6 × 12
##   carat cut     color clarity depth_p table price     x     y     z dimension
##   <dbl> <ord>   <ord> <ord>     <dbl> <dbl> <int> <dbl> <dbl> <dbl>     <dbl>
## 1  0.59 Ideal   E     VVS2       62      55  2761  5.38  5.43  3.35      97.9
## 2  0.63 Premium E     VVS1       60.9    60  2765  5.52  5.55  3.37     103. 
## 3  0.64 Ideal   G     VVS1       61.9    56  2766  5.53  5.56  3.43     105. 
## 4  0.58 Ideal   G     VVS1       61.5    55  2772  5.39  5.44  3.33      97.6
## 5  0.58 Ideal   F     VVS1       61.7    56  2772  5.33  5.37  3.3       94.5
## 6  0.6  Ideal   E     VS1        61.7    55  2774  5.41  5.44  3.35      98.6
## # ℹ 1 more variable: price_per_carat <dbl>