# Load the raw diamond price data from a local CSV file.
# NOTE(review): the absolute path is specific to one machine - confirm it
# before running this script anywhere else.
diamond <- read.csv(
  file = "C:/Users/Lenovo/Documents/conda/data/Diamonds Prices2022.csv"
)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
# Drop the index column `X`, then preview the remaining columns.
diamond <- select(diamond, -X)
glimpse(diamond)
## Rows: 53,943
## Columns: 10
## $ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut <chr> "Ideal", "Premium", "Good", "Premium", "Good", "Very Good", "V…
## $ color <chr> "E", "E", "E", "I", "J", "J", "I", "H", "E", "H", "J", "J", "F…
## $ clarity <chr> "SI2", "SI1", "VS1", "VS2", "SI2", "VVS2", "VVS1", "SI1", "VS2…
## $ depth <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
# Base-R view of the column types and the leading values of each column.
str(object = diamond)
## 'data.frame': 53943 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : chr "Ideal" "Premium" "Good" "Premium" ...
## $ color : chr "E" "E" "E" "I" ...
## $ clarity: chr "SI2" "SI1" "VS1" "VS2" ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Per-column summary of the data frame.
summary(object = diamond)
## carat cut color clarity
## Min. :0.2000 Length:53943 Length:53943 Length:53943
## 1st Qu.:0.4000 Class :character Class :character Class :character
## Median :0.7000 Mode :character Mode :character Mode :character
## Mean :0.7979
## 3rd Qu.:1.0400
## Max. :5.0100
## depth table price x
## Min. :43.00 Min. :43.00 Min. : 326 Min. : 0.000
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2401 Median : 5.700
## Mean :61.75 Mean :57.46 Mean : 3933 Mean : 5.731
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540
## Max. :79.00 Max. :95.00 Max. :18823 Max. :10.740
## y z
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.710 Median : 3.530
## Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :58.900 Max. :31.800
# Convert clarity and cut from character to factor with an explicit level
# order, so plots and grouped summaries list the grades in this order
# instead of alphabetically.
diamond <- diamond %>%
  mutate(
    clarity = factor(
      clarity,
      levels = c("IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1")
    ),
    cut = factor(
      cut,
      levels = c("Ideal", "Premium", "Very Good", "Good", "Fair")
    )
  )
# Check the diamond dimension columns (x, y, z): reshape them to long form
# and draw one boxplot per dimension.
diamond_size <- diamond %>%
  pivot_longer(
    cols = c(x, y, z),
    names_to = "variabel",
    values_to = "nilai"
  )
ggplot(data = diamond_size, mapping = aes(x = variabel, y = nilai)) +
  geom_boxplot(fill = "skyblue")
# Clean the dimension columns in two steps:
#   1. invalid rows - any of x, y, z equal to zero;
#   2. outliers     - y or z of 20 or more.
diamond <- diamond %>%
  filter(x != 0, y != 0, z != 0) %>%
  filter(y < 20, z < 20)
# Re-draw the dimension boxplots on the cleaned data.
diamond_size <- diamond %>%
  pivot_longer(
    cols = c(x, y, z),
    names_to = "variabel",
    values_to = "nilai"
  )
ggplot(data = diamond_size, mapping = aes(x = variabel, y = nilai)) +
  geom_boxplot(fill = "skyblue")
# Add a volume column (the product of the three dimension columns) to use
# as a measure of diamond size.
diamond <- mutate(diamond, x.y.z = x * y * z)
# Mean volume; used later as the big/small split point.
mean_size <- mean(diamond[["x.y.z"]])
# Does color affect price?
# Split the data at the mean volume so color is compared within each size
# group. Both comparisons are strict, so a row whose volume equals the mean
# exactly would fall into neither subset.
big_diamond <- diamond %>% filter(x.y.z > mean_size)
small_diamond <- diamond %>% filter(x.y.z < mean_size)

# Price by color grade within each size group. `color` is mapped to the
# point colour rather than `fill`: the default point shape has no fill, so
# a fill mapping leaves every point the same colour.
ggplot(big_diamond, aes(x = color, y = price, color = color)) +
  geom_jitter(alpha = 0.05)
ggplot(small_diamond, aes(x = color, y = price, color = color)) +
  geom_jitter(alpha = 0.05) +
  geom_hline(yintercept = 1500)

# Count of larger-than-average diamonds in each color grade.
ggplot(big_diamond, aes(x = color, fill = color)) +
  geom_bar()
# Does volume affect price?
# Scatter of price against volume, with a red vertical line at the mean
# volume.
ggplot(
  data = diamond,
  mapping = aes(x = x.y.z, y = price, color = x.y.z)
) +
  geom_point(alpha = 0.05) +
  geom_vline(xintercept = mean_size, linewidth = 1, color = "red") +
  coord_fixed(ratio = 0.03)

# Distribution of volume.
ggplot(data = diamond, mapping = aes(x = x.y.z)) +
  geom_histogram(bins = 50, fill = "skyblue", color = "black") +
  theme_minimal()
# The ten most expensive diamonds.
head(arrange(diamond, desc(price)), n = 10)
## carat cut color clarity depth table price x y z x.y.z
## 1 2.29 Premium I VS2 60.8 60 18823 8.50 8.47 5.16 371.4942
## 2 2.00 Very Good G SI1 63.5 56 18818 7.90 7.97 5.04 317.3335
## 3 1.51 Ideal G IF 61.7 55 18806 7.37 7.41 4.56 249.0294
## 4 2.07 Ideal G SI2 62.5 55 18804 8.20 8.13 5.11 340.6633
## 5 2.00 Very Good H SI1 62.8 57 18803 7.95 8.00 5.01 318.6360
## 6 2.29 Premium I SI1 61.8 59 18797 8.52 8.45 5.24 377.2486
## 7 2.04 Premium H SI1 58.1 60 18795 8.37 8.28 4.84 335.4294
## 8 2.00 Premium I VS1 60.8 59 18795 8.13 8.02 4.91 320.1448
## 9 1.71 Premium F VS2 62.3 59 18791 7.57 7.53 4.70 267.9099
## 10 2.15 Ideal G SI2 62.6 54 18791 8.29 8.35 5.21 360.6440
# Price distribution for each cut grade.
ggplot(data = diamond, mapping = aes(x = cut, y = price)) +
  geom_boxplot()
# Compare carat against price.
# The point colour is a constant, so it is set on the layer rather than
# inside aes(). Inside aes() the string "red" is treated as a data value,
# which gives a default palette colour and a meaningless legend.
ggplot(diamond, aes(x = carat, y = price)) +
  geom_point(alpha = 0.05, color = "red")

# Same scatter, with reference lines at carat values that occur often.
ggplot(diamond, aes(x = carat, y = price)) +
  geom_point(alpha = 0.05, color = "red") +
  geom_vline(
    xintercept = c(1, 1.2, 1.5, 1.7, 2),
    linewidth = 1,
    color = "blue",
    alpha = 0.5
  )
# Mean price per cut grade among the larger-than-average diamonds.
# NOTE(review): the column is named `mean_ppc` but holds the plain mean of
# `price` - confirm that this is the intended quantity.
summarise(group_by(big_diamond, cut), mean_ppc = mean(price))
## # A tibble: 5 × 2
## cut mean_ppc
## <fct> <dbl>
## 1 Ideal 7931.
## 2 Premium 7683.
## 3 Very Good 7248.
## 4 Good 6454.
## 5 Fair 5769.
# Mean price per cut grade among the smaller-than-average diamonds.
# NOTE(review): the column is named `mean_ppc` but holds the plain mean of
# `price` - confirm that this is the intended quantity.
summarise(group_by(small_diamond, cut), mean_ppc = mean(price))
## # A tibble: 5 × 2
## cut mean_ppc
## <fct> <dbl>
## 1 Ideal 1402.
## 2 Premium 1326.
## 3 Very Good 1383.
## 4 Good 1351.
## 5 Fair 1662.
# Price spread by grade within each size group, one jitter plot per view.

# Smaller-than-average diamonds: price by cut, then price by color.
ggplot(
  data = small_diamond,
  mapping = aes(x = cut, y = price, color = cut)
) +
  geom_jitter(alpha = 0.1)
ggplot(
  data = small_diamond,
  mapping = aes(x = color, y = price, color = color)
) +
  geom_jitter(alpha = 0.05)

# Larger-than-average diamonds: price by cut, then price by clarity.
ggplot(
  data = big_diamond,
  mapping = aes(x = cut, y = price, color = cut)
) +
  geom_jitter(alpha = 0.1)
ggplot(
  data = big_diamond,
  mapping = aes(x = clarity, y = price, color = clarity)
) +
  geom_jitter(alpha = 0.1)
# Distribution of the `depth` column.
ggplot(data = diamond, mapping = aes(x = depth)) +
  geom_histogram(bins = 50, fill = "skyblue", color = "black")

# Distribution of the `table` column.
ggplot(data = diamond, mapping = aes(x = table)) +
  geom_histogram(bins = 50, fill = "skyblue", color = "black")
```