# Load the diamonds data set straight from ggplot2. Naming the package lets
# data() find it even though ggplot2 has not been attached yet; a bare
# data(diamonds) at this point only raises "data set 'diamonds' not found".
data("diamonds", package = "ggplot2")

price vs. x

library(ggplot2)

# Scatterplot with price on the horizontal axis and the x column on the
# vertical axis.
ggplot(data = diamonds, mapping = aes(x = price, y = x)) +
  geom_point()


Findings - price vs. x

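Notes: Some diamonds have x = 0; these look like missing or mis-recorded measurements and should be excluded or handled explicitly before modelling. The spread of x values widens as price increases. The relationship is clearly non-linear: price rises roughly exponentially with x.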


Correlations

# Pearson correlation test between price and the x column of diamonds.
with(diamonds, cor.test(price, x))
## 
##  Pearson's product-moment correlation
## 
## data:  price and x
## t = 440.16, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8825835 0.8862594
## sample estimates:
##       cor 
## 0.8844352
# Pearson correlation test between price and the y column of diamonds.
with(diamonds, cor.test(price, y))
## 
##  Pearson's product-moment correlation
## 
## data:  price and y
## t = 401.14, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8632867 0.8675241
## sample estimates:
##       cor 
## 0.8654209
# Pearson correlation test between price and the z column of diamonds.
with(diamonds, cor.test(price, z))
## 
##  Pearson's product-moment correlation
## 
## data:  price and z
## t = 393.6, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8590541 0.8634131
## sample estimates:
##       cor 
## 0.8612494

price vs. depth

# Scatterplot with price on the horizontal axis and depth on the vertical axis.
ggplot(data = diamonds, mapping = aes(x = price, y = depth)) +
  geom_point()


Adjustments - price vs. depth

# Observed price range; the plot below spans exactly this interval on the
# x-axis.
range(diamonds$price)
## [1]   326 18823
# Depth tick marks every 2 units, snapped to even values that enclose the data.
# The 2-unit spacing belongs on the depth axis: applied to price it would
# request roughly 9,250 breaks and make the axis unreadable.
depth_breaks <- seq(2 * floor(min(diamonds$depth, na.rm = TRUE) / 2),
                    2 * ceiling(max(diamonds$depth, na.rm = TRUE) / 2),
                    by = 2)

# Heavy transparency (alpha = 1/100) shows where the points pile up.
# The price limits come from the data rather than being hard-coded.
ggplot(data = diamonds, mapping = aes(x = price, y = depth)) +
  geom_point(alpha = 1 / 100) +
  scale_x_continuous(limits = range(diamonds$price, na.rm = TRUE)) +
  scale_y_continuous(breaks = depth_breaks)


Typical Depth Range

Notes: Based on the scatterplot of depth vs. price, most diamonds have a depth between 60 and 64.


Correlation - price and depth

# Pearson correlation test between price and depth.
with(diamonds, cor.test(price, depth))
## 
##  Pearson's product-moment correlation
## 
## data:  price and depth
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.019084756 -0.002208537
## sample estimates:
##        cor 
## -0.0106474

Notes: Depth is not a good indicator of price because there are many different prices for the same value of depth.


price vs. carat

# Scatterplot of carat against price with both axes cut off at their 99th
# percentile. Points beyond either limit are dropped by the scales, which is
# what triggers the "Removed ... rows" warning.
ggplot(data = diamonds, mapping = aes(x = price, y = carat)) +
  geom_point() +
  scale_x_continuous(limits = c(0, quantile(diamonds$price, 0.99))) +
  scale_y_continuous(limits = c(0, quantile(diamonds$carat, 0.99)))
## Warning: Removed 926 rows containing missing values (geom_point).


price vs. volume

# Store the product of the x, y and z columns as a new volume column.
diamonds$volume <- with(diamonds, x * y * z)

# Scatterplot with price on the horizontal axis and volume on the vertical axis.
ggplot(data = diamonds, mapping = aes(x = price, y = volume)) +
  geom_point()


Findings - price vs. volume

Notes: There is one extreme value with a volume of around 4000, and there are multiple values with a volume of 0; both are highly unlikely.


Correlations on Subsets

# Keep only rows whose volume is strictly between 0 and 800. which() drops
# rows where the condition is NA, matching what subset() does.
vol_sub <- diamonds[with(diamonds, which(volume > 0 & volume < 800)), ]

# Pearson correlation test between volume and price on the trimmed data.
with(vol_sub, cor.test(volume, price))
## 
##  Pearson's product-moment correlation
## 
## data:  volume and price
## t = 559.19, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9222944 0.9247772
## sample estimates:
##       cor 
## 0.9235455

Adjustments - price vs. volume

# Scatterplot of the trimmed subset with heavy transparency, overlaid with a
# red linear-model fit.
ggplot(data = vol_sub, mapping = aes(x = price, y = volume)) +
  geom_point(alpha = 1 / 100) +
  geom_smooth(method = "lm", color = "red")

Notes: Volume can be used as an indicator of price, but only with caution. Price generally increases with volume, but, as the horizontal bands of points show, price varies widely within each tier of volume.


Mean Price by Clarity

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Group the diamonds by clarity, then collapse each group into summary price
# statistics (mean, median, min, max) plus the number of rows in the group.
diamonds_by_clarity <- diamonds %>% group_by(clarity)

diamondsByClarity <- diamonds_by_clarity %>%
  summarise(
    mean_price = mean(price),
    median_price = median(price),
    min_price = min(price),
    max_price = max(price),
    n = n()
  )

Bar Charts of Mean Price

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Group the diamonds by color, then collapse each group into summary price
# statistics (mean, median, min, max) plus the number of rows in the group.
diamonds_by_color <- diamonds %>% group_by(color)

diamondsByColor <- diamonds_by_color %>%
  summarise(
    mean_price = mean(price),
    median_price = median(price),
    min_price = min(price),
    max_price = max(price),
    n = n()
  )

# Bar chart of mean price for each clarity level.
p1 <- ggplot(data = diamondsByClarity,
             mapping = aes(x = clarity, y = mean_price)) +
  geom_bar(stat = "identity")

# Bar chart of mean price for each color level.
p2 <- ggplot(data = diamondsByColor,
             mapping = aes(x = color, y = mean_price)) +
  geom_bar(stat = "identity")

# Stack the two charts in a single column.
grid.arrange(p1, p2, ncol = 1)