# Load the raw diamond price data from a local CSV file.
# NOTE(review): absolute, machine-specific path; consider a project-relative
# path so the script runs on other machines.
diamond <- read.csv(
  file = "C:/Users/Lenovo/Documents/conda/data/Diamonds Prices2022.csv"
)

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
# Drop the index column X, then preview the remaining columns.
diamond <- select(diamond, -X)
glimpse(diamond)
## Rows: 53,943
## Columns: 10
## $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut     <chr> "Ideal", "Premium", "Good", "Premium", "Good", "Very Good", "V…
## $ color   <chr> "E", "E", "E", "I", "J", "J", "I", "H", "E", "H", "J", "J", "F…
## $ clarity <chr> "SI2", "SI1", "VS1", "VS2", "SI2", "VVS2", "VVS1", "SI1", "VS2…
## $ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
# Print the compact structure of the data: column types and first values.
str(diamond)
## 'data.frame':    53943 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : chr  "Ideal" "Premium" "Good" "Premium" ...
##  $ color  : chr  "E" "E" "E" "I" ...
##  $ clarity: chr  "SI2" "SI1" "VS1" "VS2" ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Per-column summary statistics, used to spot implausible values before
# cleaning.
summary(diamond)
##      carat            cut               color             clarity         
##  Min.   :0.2000   Length:53943       Length:53943       Length:53943      
##  1st Qu.:0.4000   Class :character   Class :character   Class :character  
##  Median :0.7000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.7979                                                           
##  3rd Qu.:1.0400                                                           
##  Max.   :5.0100                                                           
##      depth           table           price             x         
##  Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2401   Median : 5.700  
##  Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540  
##  Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740  
##        y                z         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 4.720   1st Qu.: 2.910  
##  Median : 5.710   Median : 3.530  
##  Mean   : 5.735   Mean   : 3.539  
##  3rd Qu.: 6.540   3rd Qu.: 4.040  
##  Max.   :58.900   Max.   :31.800
# Convert clarity and cut from character to factors with an explicit level
# order, so plots and grouped summaries follow that order instead of
# alphabetical order.
diamond <- diamond %>%
  mutate(
    clarity = factor(
      clarity,
      levels = c("IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1")
    ),
    cut = factor(
      cut,
      levels = c("Ideal", "Premium", "Very Good", "Good", "Fair")
    )
  )
# Check the diamond dimension columns: reshape x, y and z to long form and
# draw one boxplot per dimension.
diamond_size <- diamond %>%
  pivot_longer(cols = c(x, y, z), names_to = "variabel", values_to = "nilai")
diamond_size %>%
  ggplot(aes(x = variabel, y = nilai)) +
  geom_boxplot(fill = "skyblue")

# Remove invalid rows (any dimension equal to zero) and outliers
# (y or z of 20 or more).
diamond <- filter(diamond, x != 0 & y != 0 & z != 0 & y < 20 & z < 20)

# Re-draw the per-dimension boxplot on the cleaned data.
diamond_size <- diamond %>%
  pivot_longer(cols = c(x, y, z), names_to = "variabel", values_to = "nilai")
diamond_size %>%
  ggplot(aes(x = variabel, y = nilai)) +
  geom_boxplot(fill = "skyblue")

# Add a volume column (x * y * z), used as the measure of diamond size.
diamond$x.y.z <- with(diamond, x * y * z)

# Mean volume over all remaining diamonds.
mean_size <- diamond %>%
  pull(x.y.z) %>%
  mean()
# Does color affect price?

# Split the data at the mean volume into large and small diamonds.
# The small group uses <= so a diamond exactly at the mean is not dropped
# from both subsets.
big_diamond <- diamond %>% filter(x.y.z > mean_size)
small_diamond <- diamond %>% filter(x.y.z <= mean_size)

# Price by color grade for the large diamonds. Points are coloured through
# the `color` aesthetic: `fill` has no visible effect on the default point
# shape drawn by geom_jitter().
ggplot(big_diamond, aes(x = color, y = price, color = color)) +
  geom_jitter(alpha = 0.05)

# Price by color grade for the small diamonds, with a horizontal reference
# line at a price of 1500.
ggplot(small_diamond, aes(x = color, y = price, color = color)) +
  geom_jitter(alpha = 0.05) +
  geom_hline(yintercept = 1500)

# Number of large diamonds in each color grade.
ggplot(big_diamond, aes(x = color, fill = color)) +
  geom_bar()

# Does volume affect price?

# Price against volume; the red vertical line marks the mean volume.
diamond %>%
  ggplot(aes(x = x.y.z, y = price, color = x.y.z)) +
  geom_point(alpha = 0.05) +
  geom_vline(xintercept = mean_size, linewidth = 1, color = "red") +
  coord_fixed(ratio = 0.03)

# Distribution of volume across all diamonds.
diamond %>%
  ggplot(aes(x = x.y.z)) +
  geom_histogram(bins = 50, fill = "skyblue", color = "black") +
  theme_minimal()

# The ten most expensive diamonds.
head(arrange(diamond, desc(price)), n = 10)
##    carat       cut color clarity depth table price    x    y    z    x.y.z
## 1   2.29   Premium     I     VS2  60.8    60 18823 8.50 8.47 5.16 371.4942
## 2   2.00 Very Good     G     SI1  63.5    56 18818 7.90 7.97 5.04 317.3335
## 3   1.51     Ideal     G      IF  61.7    55 18806 7.37 7.41 4.56 249.0294
## 4   2.07     Ideal     G     SI2  62.5    55 18804 8.20 8.13 5.11 340.6633
## 5   2.00 Very Good     H     SI1  62.8    57 18803 7.95 8.00 5.01 318.6360
## 6   2.29   Premium     I     SI1  61.8    59 18797 8.52 8.45 5.24 377.2486
## 7   2.04   Premium     H     SI1  58.1    60 18795 8.37 8.28 4.84 335.4294
## 8   2.00   Premium     I     VS1  60.8    59 18795 8.13 8.02 4.91 320.1448
## 9   1.71   Premium     F     VS2  62.3    59 18791 7.57 7.53 4.70 267.9099
## 10  2.15     Ideal     G     SI2  62.6    54 18791 8.29 8.35 5.21 360.6440
# Price distribution for each cut grade.
diamond %>%
  ggplot(aes(x = cut, y = price)) +
  geom_boxplot()

# Compare carat to price.
# The point colour is a fixed setting, so it is passed outside aes(). Inside
# aes() the string "red" is treated as data: it adds a one-entry legend and
# the points are drawn in the default palette colour instead of red.
ggplot(diamond, aes(x = carat, y = price)) +
  geom_point(color = "red", alpha = 0.05)

# Frequently occurring carat values, marked with vertical reference lines.
ggplot(diamond, aes(x = carat, y = price)) +
  geom_point(color = "red", alpha = 0.05) +
  geom_vline(
    xintercept = c(1, 1.2, 1.5, 1.7, 2), linewidth = 1,
    color = "blue", alpha = 0.5
  )

# Mean price of the large diamonds for each cut grade.
# NOTE(review): the column is named mean_ppc but holds the plain mean of
# price; confirm whether price per carat was intended.
summarise(group_by(big_diamond, cut), mean_ppc = mean(price))
## # A tibble: 5 × 2
##   cut       mean_ppc
##   <fct>        <dbl>
## 1 Ideal        7931.
## 2 Premium      7683.
## 3 Very Good    7248.
## 4 Good         6454.
## 5 Fair         5769.
# Mean price of the small diamonds for each cut grade.
# NOTE(review): the column is named mean_ppc but holds the plain mean of
# price; confirm whether price per carat was intended.
summarise(group_by(small_diamond, cut), mean_ppc = mean(price))
## # A tibble: 5 × 2
##   cut       mean_ppc
##   <fct>        <dbl>
## 1 Ideal        1402.
## 2 Premium      1326.
## 3 Very Good    1383.
## 4 Good         1351.
## 5 Fair         1662.
# Small diamonds: price by cut grade.
small_diamond %>%
  ggplot(aes(x = cut, y = price, color = cut)) +
  geom_jitter(alpha = 0.1)

# Small diamonds: price by color grade.
small_diamond %>%
  ggplot(aes(x = color, y = price, color = color)) +
  geom_jitter(alpha = 0.05)

# Large diamonds: price by cut grade.
big_diamond %>%
  ggplot(aes(x = cut, y = price, color = cut)) +
  geom_jitter(alpha = 0.1)

# Large diamonds: price by clarity grade.
big_diamond %>%
  ggplot(aes(x = clarity, y = price, color = clarity)) +
  geom_jitter(alpha = 0.1)

# Distribution of the depth column.
diamond %>%
  ggplot(aes(x = depth)) +
  geom_histogram(bins = 50, fill = "skyblue", color = "black")

# Distribution of the table column.
diamond %>%
  ggplot(aes(x = table)) +
  geom_histogram(bins = 50, fill = "skyblue", color = "black")

```