Visualizations of the Diamonds Data Frame

Exploring the diamonds data frame from the ggplot2 package using the R programming language.

library(tidyverse)
library(ggplot2)
library(patchwork)

# Transposed preview of the data: one line per column with its type and
# leading values, preceded by the row and column counts.
diamonds %>% glimpse()
## Rows: 53,940
## Columns: 10
## $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…

The diamonds data frame contains 53,940 rows and 10 columns: carat, cut, color, clarity, depth, table, price, x, y and z.


1. Diamonds Price Distribution

Exploring the distribution of diamond prices.

# Three views of the price distribution, all mapping price to the x axis.

# p1: histogram with 100 bins; carries the title for the combined figure.
p1 <- diamonds %>%
  ggplot(mapping = aes(x = price)) +
  geom_histogram(bins = 100) +
  ggtitle("Diamonds Dataframe : Price Histogram") +
  theme_minimal()

# p2: kernel density estimate of price.
p2 <- diamonds %>%
  ggplot(mapping = aes(x = price)) +
  geom_density() +
  theme_minimal()

# p3: frequency polygon (default binning); carries the source caption.
p3 <- diamonds %>%
  ggplot(mapping = aes(x = price)) +
  geom_freqpoly() +
  labs(caption = "Source : Diamonds from ggplot2 package") +
  theme_minimal()

# patchwork layout: histogram on the top row, density and frequency polygon
# side by side on the bottom row.
p1 / (p2 + p3)

# Single box plot of price; price is mapped to x, so the box lies horizontally.
diamonds %>%
  ggplot(mapping = aes(x = price)) +
  geom_boxplot() +
  labs(
    title = "Diamonds Dataframe : Price Boxplot",
    caption = "Source : Diamonds from ggplot2 package"
  ) +
  theme_minimal()

# Tukey five-number summary of price: min, lower hinge, median, upper hinge, max.
fivenum(diamonds[["price"]])
## [1]   326.0   950.0  2401.0  5324.5 18823.0
# Spread and centre of price.
sd(diamonds[["price"]])
## [1] 3989.44
mean(diamonds[["price"]])
## [1] 3932.8
# First and third quartiles, and the interquartile range between them.
Q1 <- quantile(diamonds[["price"]], probs = 0.25)
Q3 <- quantile(diamonds[["price"]], probs = 0.75)
IQR_price <- Q3 - Q1

# 1.5 * IQR fences: values beyond these limits are treated as outliers.
# outliers1 is the upper fence, outliers2 the lower fence.
outliers1 <- Q3 + 1.5 * IQR_price
outliers2 <- Q1 - 1.5 * IQR_price

outliers1
##      75% 
## 11885.62
outliers2
##       25% 
## -5611.375

Comment :

The price distribution is positively skewed (right-skewed).

Min = $326.0,

Max = $18823.0,

Median = $2401.0,

SD = $3989.44,

Mean = $3932.8

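Outliers = prices above the upper fence of $11885.62 are flagged as outliers. The lower fence (-$5611.375) is below the minimum price of $326, so there are no outliers at the low end.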


2. Diamonds Cutting Type

  # Stacked bar chart: number of diamonds per cut, each bar split by color.
  # fill is mapped to color (not cut) so the bars show the per-color breakdown
  # that the title describes, mirroring the stacked charts used for color and
  # clarity further down.
  ggplot(diamonds, aes(x = cut, fill = color)) +
  geom_bar() +
  theme_minimal() +
  labs(title = "Quantity of each Cutting Type in each Color",
       caption = "Source : Diamonds from ggplot2 package")

# Number of diamonds in each cut level.
summary(diamonds[["cut"]])
##      Fair      Good Very Good   Premium     Ideal 
##      1610      4906     12082     13791     21551
# Price distribution per cut: one box per level, outline coloured by cut.
diamonds %>%
  ggplot(mapping = aes(x = cut, y = price, col = cut)) +
  geom_boxplot() +
  labs(
    title = "Diamonds Dataframe : Cutting Type Boxplot",
    caption = "Source : Diamonds from ggplot2 package"
  ) +
  theme_minimal()

# Median price for every cut level (one row per cut).
df <- summarise(
  group_by(diamonds, cut),
  median_price = median(price)
)

# Column chart of the medians, with each value printed just above its bar.
df %>%
  ggplot(mapping = aes(x = cut, y = median_price, fill = cut)) +
  geom_col() +
  geom_text(
    aes(label = median_price),
    colour = "black",
    size = 3,
    vjust = -0.2
  ) +
  labs(
    title = "Median Price of each Cutting Type",
    caption = "Source : Diamonds from ggplot2 package"
  ) +
  theme_minimal()

Comment :

Because the price distribution is positively skewed and there are many outliers above $11885.62, we focus on the median price instead of the mean price.


3. Diamonds Color Type

  # Stacked bar chart: number of diamonds per color, each bar split by cut.
  diamonds %>%
    ggplot(mapping = aes(x = color, fill = cut)) +
    geom_bar() +
    labs(
      title = "Quantity of each Color Type in each Cutting",
      caption = "Source : Diamonds from ggplot2 package"
    ) +
    theme_minimal()

# Number of diamonds in each color level.
summary(diamonds[["color"]])
##     D     E     F     G     H     I     J 
##  6775  9797  9542 11292  8304  5422  2808
# Price distribution per color: one box per level, outline coloured by color.
diamonds %>%
  ggplot(mapping = aes(x = color, y = price, col = color)) +
  geom_boxplot() +
  labs(
    title = "Diamonds Dataframe : Color Type Boxplot",
    caption = "Source : Diamonds from ggplot2 package"
  ) +
  theme_minimal()

# Median price for every color level (one row per color).
df <- summarise(
  group_by(diamonds, color),
  median_price = median(price)
)

# Column chart of the medians, with each value printed just above its bar.
df %>%
  ggplot(mapping = aes(x = color, y = median_price, fill = color)) +
  geom_col() +
  geom_text(
    aes(label = median_price),
    colour = "black",
    size = 3,
    vjust = -0.2
  ) +
  labs(
    title = "Median Price of each Color Type",
    caption = "Source : Diamonds from ggplot2 package"
  ) +
  theme_minimal()

Comment :

Because the price distribution is positively skewed and there are many outliers above $11885.62, we focus on the median price instead of the mean price.


4. Diamonds Clarity Type

  # Stacked bar chart: number of diamonds per clarity, each bar split by cut.
  diamonds %>%
    ggplot(mapping = aes(x = clarity, fill = cut)) +
    geom_bar() +
    labs(
      title = "Quantity of each Clarity Type in each Cutting",
      caption = "Source : Diamonds from ggplot2 package"
    ) +
    theme_minimal()

# Number of diamonds in each clarity level.
summary(diamonds[["clarity"]])
##    I1   SI2   SI1   VS2   VS1  VVS2  VVS1    IF 
##   741  9194 13065 12258  8171  5066  3655  1790
# Price distribution per clarity: one box per level, outline coloured by clarity.
diamonds %>%
  ggplot(mapping = aes(x = clarity, y = price, col = clarity)) +
  geom_boxplot() +
  labs(
    title = "Diamonds Dataframe : Clarity Type Boxplot",
    caption = "Source : Diamonds from ggplot2 package"
  ) +
  theme_minimal()

# Median price for every clarity level (one row per clarity).
df <- summarise(
  group_by(diamonds, clarity),
  median_price = median(price)
)

# Column chart of the medians, with each value printed just above its bar.
df %>%
  ggplot(mapping = aes(x = clarity, y = median_price, fill = clarity)) +
  geom_col() +
  geom_text(
    aes(label = median_price),
    colour = "black",
    size = 3,
    vjust = -0.2
  ) +
  labs(
    title = "Median Price of each Clarity Type",
    caption = "Source : Diamonds from ggplot2 package"
  ) +
  theme_minimal()

Comment :

Because the price distribution is positively skewed and there are many outliers above $11885.62, we focus on the median price instead of the mean price.


5. Relationship between Carat and Price

# Scatter plot of price against carat on a random sample of 2,000 diamonds,
# with a fitted linear trend line and marginal rugs on both axes.
# The seed is fixed so the sample, and therefore the figure, is the same on
# every run.
set.seed(42)
ggplot(diamonds %>% slice_sample(n = 2000),
       aes(x = carat, y = price)) +
  geom_point(size = 2, col = "lightblue", alpha = 0.5) +
  # Explicit formula (the lm default) avoids the "using formula" message.
  geom_smooth(method = "lm", formula = y ~ x) +
  geom_rug() +
  theme_minimal() +
  labs(title = "Relationship between Carat and Price",
       caption = "Source : Diamonds from ggplot2 package")

# Price against carat on a random sample of 2,000 diamonds, points coloured
# by cut, one panel per clarity level arranged in two columns.
# The seed is fixed so the sample, and therefore the figure, is the same on
# every run.
set.seed(42)
ggplot(diamonds %>% slice_sample(n = 2000),
       aes(x = carat, y = price, col = cut)) +
  geom_point(size = 3, alpha = 0.5) +
  theme_minimal() +
  facet_wrap(~clarity, ncol = 2) +
  labs(title = "Relationship between Carat and Price in each Cutting Type, separated by Clarity",
       caption = "Source : Diamonds from ggplot2 package")

# Pearson correlation test between carat and price on the full data set
# (no sampling). The call is kept in this exact form because the "data:" line
# of the printed result is built from the argument expressions.
cor.test(diamonds$carat, diamonds$price)
## 
##  Pearson's product-moment correlation
## 
## data:  diamonds$carat and diamonds$price
## t = 551.41, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9203098 0.9228530
## sample estimates:
##       cor 
## 0.9215913

Comment :

The correlation between carat and price is strongly positive: r = 0.9216.

The higher the carat, the higher the price.

p-value < 0.05 : Statistically significant.

95 percent confidence interval for the correlation: 0.9203098 to 0.9228530