# Attach ggplot2 before requesting the data set: `diamonds` ships with
# ggplot2, so data(diamonds) cannot find it until the package is attached.
library(ggplot2)
data(diamonds)
# Scatterplot of the x column (vertical axis) against price (horizontal axis).
ggplot(diamonds, aes(x = price, y = x)) +
  geom_point()
Notes: Some diamonds have x = 0, which is not physically possible for a dimension and should be accounted for (e.g., treated as missing). The spread of x values widens as price increases. The plot resembles an exponential curve.
# Pearson correlation test between price and x (formula interface).
cor.test(~ price + x, data = diamonds)
##
## Pearson's product-moment correlation
##
## data: price and x
## t = 440.16, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8825835 0.8862594
## sample estimates:
## cor
## 0.8844352
# Pearson correlation test between price and y (formula interface).
cor.test(~ price + y, data = diamonds)
##
## Pearson's product-moment correlation
##
## data: price and y
## t = 401.14, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8632867 0.8675241
## sample estimates:
## cor
## 0.8654209
# Pearson correlation test between price and z (formula interface).
cor.test(~ price + z, data = diamonds)
##
## Pearson's product-moment correlation
##
## data: price and z
## t = 393.6, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8590541 0.8634131
## sample estimates:
## cor
## 0.8612494
# Scatterplot of depth (vertical axis) against price (horizontal axis).
ggplot(diamonds, aes(x = price, y = depth)) +
  geom_point()
# Smallest and largest price in the data set.
with(diamonds, range(price))
## [1] 326 18823
# Depth vs. price, drawn with heavy transparency so that dense regions of
# overlapping points stand out.
# The x axis is labelled every 2,000 price units. A step of 2 across the
# 326-18,823 price range would request more than 9,000 breaks, which makes
# the axis illegible and slows rendering considerably.
ggplot(diamonds, aes(x = price, y = depth)) +
  geom_point(alpha = 1 / 100) +
  scale_x_continuous(
    breaks = seq(2000, 18000, 2000),
    limits = c(326, 18823)
  )
Notes: Based on the scatterplot of depth vs. price, most diamonds have a depth between 60 and 64.
# Pearson correlation test between price and depth (formula interface).
cor.test(~ price + depth, data = diamonds)
##
## Pearson's product-moment correlation
##
## data: price and depth
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.019084756 -0.002208537
## sample estimates:
## cor
## -0.0106474
Notes: Depth is not a good indicator of price: the correlation is only about -0.01, and diamonds with the same depth span a wide range of prices.
# Carat vs. price, with both axes cut off at the 99th percentile of the
# respective variable; points beyond either limit are dropped from the plot.
ggplot(diamonds, aes(x = price, y = carat)) +
  geom_point() +
  xlim(0, quantile(diamonds$price, probs = 0.99)) +
  ylim(0, quantile(diamonds$carat, probs = 0.99))
## Warning: Removed 926 rows containing missing values (geom_point).
# Add a volume column as the product of the x, y and z columns, then plot it
# against price.
diamonds$volume <- with(diamonds, x * y * z)
ggplot(diamonds, aes(x = price, y = volume)) +
  geom_point()
Notes: There is one extreme value with a volume of around 4000, and there are multiple values with a volume of 0; both are implausible for real diamonds.
# Keep only rows whose volume is strictly between 0 and 800 (which() also
# discards any rows where the comparison is NA), then test the correlation
# between volume and price on that subset.
vol_sub <- diamonds[with(diamonds, which(volume > 0 & volume < 800)), ]
cor.test(~ volume + price, data = vol_sub)
##
## Pearson's product-moment correlation
##
## data: volume and price
## t = 559.19, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9222944 0.9247772
## sample estimates:
## cor
## 0.9235455
# Volume vs. price on the filtered subset: near-transparent points plus a
# red linear-model fit line.
ggplot(vol_sub, aes(x = price, y = volume)) +
  geom_point(alpha = 1 / 100) +
  geom_smooth(method = "lm", color = "red")
Notes: Volume can be used, with caution, as an indicator of price. Price generally rises as volume increases, but, as the horizontal bands in the plot show, price varies widely within each tier of volume.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Group the diamonds by clarity, then collapse each group to its price
# summary statistics and row count.
diamonds_by_clarity <- diamonds %>%
  group_by(clarity)

diamondsByClarity <- diamonds_by_clarity %>%
  summarise(
    mean_price = mean(price),
    median_price = median(price),
    min_price = min(price),
    max_price = max(price),
    n = n()
  )
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Group the diamonds by color, then collapse each group to its price
# summary statistics and row count.
diamonds_by_color <- diamonds %>%
  group_by(color)

diamondsByColor <- diamonds_by_color %>%
  summarise(
    mean_price = mean(price),
    median_price = median(price),
    min_price = min(price),
    max_price = max(price),
    n = n()
  )
# Bar chart of mean price per clarity level.
p1 <- ggplot(diamondsByClarity, aes(x = clarity, y = mean_price)) +
  geom_bar(stat = "identity")

# Bar chart of mean price per color level.
p2 <- ggplot(diamondsByColor, aes(x = color, y = mean_price)) +
  geom_bar(stat = "identity")

# Stack the two charts vertically in a single column.
grid.arrange(p1, p2, ncol = 1)
Notes: Mean price tends to decrease as clarity improves. The same can be said for color.