Lesson 9: Diamonds & Price Predictions

Scatterplot Review

library(ggplot2)
data("diamonds")
ggplot(data = diamonds, aes(x = carat, y = price)) + 
  coord_cartesian(xlim = c(0, quantile(diamonds$carat, prob=0.99)), 
                  ylim = c(0, quantile(diamonds$price, prob=0.99))) +
  geom_point(fill = I('#F79420'), color = I('black'), shape = 21)

Price and Carat Relationship: Add on Linear Model

ggplot(data = diamonds, aes(x = carat, y = price)) + 
  coord_cartesian(xlim = c(0, quantile(diamonds$carat, prob=0.99)), 
                  ylim = c(0, quantile(diamonds$price, prob=0.99))) +
  geom_point(color = I('#F79420'), alpha = 1/4) + 
  stat_smooth(method = 'lm')

ggpairs Function

# load the ggplot graphics package and the others
library(ggplot2)
library(GGally)
library(scales)
library(memisc)
## Loading required package: lattice
## Loading required package: MASS
## 
## Attaching package: 'memisc'
## The following object is masked from 'package:scales':
## 
##     percent
## The following objects are masked from 'package:stats':
## 
##     contr.sum, contr.treatment, contrasts
## The following object is masked from 'package:base':
## 
##     as.array
# sample 10,000 diamonds from the data set
set.seed(20022012)
diamond_samp <- diamonds[sample(1:length(diamonds$price), 10000), ]
ggpairs(diamond_samp, 
  lower = list(continuous = wrap("points", shape = I('.'))), 
  upper = list(combo = wrap("box", outlier.shape = I('.'))))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The Demand of Diamonds

plot1 <- ggplot(data = diamonds, aes(x = price)) + 
  ggtitle("Price") +
  geom_histogram(binwidth = 100, fill = I("#099DD9"))

plot2 <- ggplot(data = diamonds, aes(x = price)) +
  geom_histogram( binwidth = 0.01, fill = I("#F79420")) +
  ggtitle("Price (log10)") + 
  scale_x_log10()

library(gridExtra)
library(grid)
grid.arrange(plot1, plot2, ncol=2)

Scatterplot Transformation

qplot(carat, price, data = diamonds) +
  scale_y_continuous(trans = log10_trans()) +
  ggtitle("Price (log10) by Carat")

Create a new function to transform the carat variable

cuberoot_trans = function() trans_new('cuberoot', transform = function(x) x^(1/3),
                                      inverse = function(x) x^3)

Use the cuberoot_trans function

ggplot(aes(carat, price), data = diamonds) + 
  geom_point() + 
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat')
## Warning: Removed 1683 rows containing missing values (geom_point).

Overplotting Revisited

head(sort(table(diamonds$carat), decreasing = T))
## 
##  0.3 0.31 1.01  0.7 0.32    1 
## 2604 2249 2242 1981 1840 1558
head(sort(table(diamonds$price), decreasing = T))
## 
## 605 802 625 828 776 698 
## 132 127 126 125 124 121
ggplot(aes(carat, price), data = diamonds) + 
  geom_point(alpha = 0.5, size = 0.75, position = 'jitter') + 
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat')
## Warning: Removed 1691 rows containing missing values (geom_point).

Price vs. Carat and Clarity

library(RColorBrewer)

ggplot(aes(x = carat, y = price, color = clarity), data = diamonds) + 
  geom_point(alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
    guide = guide_legend(title = 'Clarity', reverse = T,
    override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
    breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
    breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Clarity')
## Warning: Removed 1693 rows containing missing values (geom_point).

Price vs. Carat and Cut

ggplot(aes(x = carat, y = price, color = cut), data = diamonds) + 
  geom_point(alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = 'Cut', reverse = T,
                                          override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Cut')
## Warning: Removed 1696 rows containing missing values (geom_point).

Price vs. Carat and Color

ggplot(aes(x = carat, y = price, color = color), data = diamonds) + 
  geom_point(alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = 'Color',
                                          reverse = FALSE,
                                          override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Color')
## Warning: Removed 1688 rows containing missing values (geom_point).

Building the Linear Model

m1 <- lm(I(log(price)) ~ I(carat^(1/3)), data = diamonds)
m2 <- lm(I(log(price)) ~ I(carat^(1/3)) + carat, data = diamonds)
m3 <- lm(I(log(price)) ~ I(carat^(1/3)) + carat + cut, data = diamonds)
m4 <- lm(I(log(price)) ~ I(carat^(1/3)) + carat + cut + color, data = diamonds)
m5 <- lm(I(log(price)) ~ I(carat^(1/3)) + carat + cut + color + clarity, data = diamonds)
mtable(m1, m2, m3, m4, m5, sdigits = 3)
## 
## Calls:
## m1: lm(formula = I(log(price)) ~ I(carat^(1/3)), data = diamonds)
## m2: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat, data = diamonds)
## m3: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut, data = diamonds)
## m4: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color, 
##     data = diamonds)
## m5: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color + 
##     clarity, data = diamonds)
## 
## =============================================================================
##                       m1          m2          m3         m4          m5      
## -----------------------------------------------------------------------------
##   (Intercept)      2.821***    1.039***    0.874***    0.932***   0.415***   
##                   (0.006)     (0.019)     (0.019)     (0.017)    (0.010)     
##   I(carat^(1/3))   5.558***    8.568***    8.703***    8.438***   9.144***   
##                   (0.007)     (0.032)     (0.031)     (0.028)    (0.016)     
##   carat                       -1.137***   -1.163***   -0.992***  -1.093***   
##                               (0.012)     (0.011)     (0.010)    (0.006)     
##   cut: .L                                  0.224***    0.224***   0.120***   
##                                           (0.004)     (0.004)    (0.002)     
##   cut: .Q                                 -0.062***   -0.062***  -0.031***   
##                                           (0.004)     (0.003)    (0.002)     
##   cut: .C                                  0.051***    0.052***   0.014***   
##                                           (0.003)     (0.003)    (0.002)     
##   cut: ^4                                  0.018***    0.018***  -0.002      
##                                           (0.003)     (0.002)    (0.001)     
##   color: .L                                           -0.373***  -0.441***   
##                                                       (0.003)    (0.002)     
##   color: .Q                                           -0.129***  -0.093***   
##                                                       (0.003)    (0.002)     
##   color: .C                                            0.001     -0.013***   
##                                                       (0.003)    (0.002)     
##   color: ^4                                            0.029***   0.012***   
##                                                       (0.003)    (0.002)     
##   color: ^5                                           -0.016***  -0.003*     
##                                                       (0.003)    (0.001)     
##   color: ^6                                           -0.023***   0.001      
##                                                       (0.002)    (0.001)     
##   clarity: .L                                                     0.907***   
##                                                                  (0.003)     
##   clarity: .Q                                                    -0.240***   
##                                                                  (0.003)     
##   clarity: .C                                                     0.131***   
##                                                                  (0.003)     
##   clarity: ^4                                                    -0.063***   
##                                                                  (0.002)     
##   clarity: ^5                                                     0.026***   
##                                                                  (0.002)     
##   clarity: ^6                                                    -0.002      
##                                                                  (0.002)     
##   clarity: ^7                                                     0.032***   
##                                                                  (0.001)     
## -----------------------------------------------------------------------------
##   R-squared            0.924       0.935       0.939      0.951       0.984  
##   adj. R-squared       0.924       0.935       0.939      0.951       0.984  
##   sigma                0.280       0.259       0.250      0.224       0.129  
##   F               652012.063  387489.366  138654.523  87959.467  173791.084  
##   p                    0.000       0.000       0.000      0.000       0.000  
##   Log-likelihood   -7962.499   -3631.319   -1837.416   4235.240   34091.272  
##   Deviance          4242.831    3613.360    3380.837   2699.212     892.214  
##   AIC              15930.999    7270.637    3690.832  -8442.481  -68140.544  
##   BIC              15957.685    7306.220    3761.997  -8317.942  -67953.736  
##   N                53940       53940       53940      53940       53940      
## =============================================================================

A Bigger, Better Data Set

Notes:

library('bitops')
library('RCurl')

#diamondsurl = getBinaryURL("https://raw.github.com/solomonm/diamonds-data/master/BigDiamonds.Rda")
#load(rawConnection(diamondsurl))
setwd('C:/Users/User/Desktop/LessonR/lesson6')
diamondsBig <- read.csv('diamondsbig.csv')

The code used to obtain the data is available here: https://github.com/solomonm/diamonds-data

Building a Model Using the Big Diamonds Data Set

Notes:

diamondsBig$logprice = log(diamondsBig$price)
m1 <- lm(formula = logprice ~ I(carat^(1/3)), data = diamondsBig[diamondsBig$price < 10000 & diamondsBig$cert == "GIA",])
m2 <- lm(formula = logprice ~ I(carat^(1/3)) + carat, data = diamondsBig[diamondsBig$price < 10000 & diamondsBig$cert == "GIA",])
m3 <- lm(formula = logprice ~ I(carat^(1/3)) + carat + cut, data = diamondsBig[diamondsBig$price < 10000 & diamondsBig$cert == "GIA",])
m4 <- lm(formula = logprice ~ I(carat^(1/3)) + carat + cut + color, data = diamondsBig[diamondsBig$price < 10000 & diamondsBig$cert == "GIA",])
m5 <- lm(formula = logprice ~ I(carat^(1/3)) + carat + cut + color + clarity, data = diamondsBig[diamondsBig$price < 10000 & diamondsBig$cert == "GIA",])
mtable(m1, m2, m3, m4, m5, sdigits = 3)
## 
## Calls:
## m1: lm(formula = logprice ~ I(carat^(1/3)), data = diamondsBig[diamondsBig$price < 
##     10000 & diamondsBig$cert == "GIA", ])
## m2: lm(formula = logprice ~ I(carat^(1/3)) + carat, data = diamondsBig[diamondsBig$price < 
##     10000 & diamondsBig$cert == "GIA", ])
## m3: lm(formula = logprice ~ I(carat^(1/3)) + carat + cut, data = diamondsBig[diamondsBig$price < 
##     10000 & diamondsBig$cert == "GIA", ])
## m4: lm(formula = logprice ~ I(carat^(1/3)) + carat + cut + color, 
##     data = diamondsBig[diamondsBig$price < 10000 & diamondsBig$cert == 
##         "GIA", ])
## m5: lm(formula = logprice ~ I(carat^(1/3)) + carat + cut + color + 
##     clarity, data = diamondsBig[diamondsBig$price < 10000 & diamondsBig$cert == 
##     "GIA", ])
## 
## =================================================================================
##                       m1           m2           m3          m4          m5       
## ---------------------------------------------------------------------------------
##   (Intercept)       2.671***     1.333***    0.949***    1.341***     0.665***   
##                    (0.003)      (0.012)     (0.012)     (0.010)      (0.007)     
##   I(carat^(1/3))    5.839***     8.243***    8.633***    8.110***     8.320***   
##                    (0.004)      (0.022)     (0.021)     (0.017)      (0.012)     
##   carat                         -1.061***   -1.223***   -0.782***    -0.763***   
##                                 (0.009)     (0.009)     (0.007)      (0.005)     
##   cut: Ideal                                 0.211***    0.181***     0.131***   
##                                             (0.002)     (0.001)      (0.001)     
##   cut: V.Good                                0.120***    0.090***     0.071***   
##                                             (0.002)     (0.001)      (0.001)     
##   color: E/D                                            -0.083***    -0.071***   
##                                                         (0.001)      (0.001)     
##   color: F/D                                            -0.125***    -0.105***   
##                                                         (0.001)      (0.001)     
##   color: G/D                                            -0.178***    -0.162***   
##                                                         (0.001)      (0.001)     
##   color: H/D                                            -0.243***    -0.225***   
##                                                         (0.002)      (0.001)     
##   color: I/D                                            -0.361***    -0.358***   
##                                                         (0.002)      (0.001)     
##   color: J/D                                            -0.500***    -0.509***   
##                                                         (0.002)      (0.001)     
##   color: K/D                                            -0.689***    -0.710***   
##                                                         (0.002)      (0.002)     
##   color: L/D                                            -0.812***    -0.827***   
##                                                         (0.003)      (0.002)     
##   clarity: I2                                                        -0.301***   
##                                                                      (0.006)     
##   clarity: IF                                                         0.751***   
##                                                                      (0.002)     
##   clarity: SI1                                                        0.426***   
##                                                                      (0.002)     
##   clarity: SI2                                                        0.306***   
##                                                                      (0.002)     
##   clarity: VS1                                                        0.590***   
##                                                                      (0.002)     
##   clarity: VS2                                                        0.534***   
##                                                                      (0.002)     
##   clarity: VVS1                                                       0.693***   
##                                                                      (0.002)     
##   clarity: VVS2                                                       0.633***   
##                                                                      (0.002)     
## ---------------------------------------------------------------------------------
##   R-squared             0.888        0.892       0.899       0.937        0.969  
##   adj. R-squared        0.888        0.892       0.899       0.937        0.969  
##   sigma                 0.289        0.284       0.275       0.216        0.154  
##   F               2700903.714  1406538.330  754405.425  423311.488   521161.443  
##   p                     0.000        0.000       0.000       0.000        0.000  
##   Log-likelihood   -60137.791   -53996.269  -43339.818   37830.414   154124.270  
##   Deviance          28298.689    27291.534   25628.285   15874.910     7992.720  
##   AIC              120281.582   108000.539   86691.636  -75632.827  -308204.540  
##   BIC              120313.783   108043.473   86756.037  -75482.557  -307968.400  
##   N                338946       338946      338946      338946       338946      
## =================================================================================

Predictions

Example Diamond from BlueNile: Round 1.00 Very Good I VS1 $5,601

#Be sure you have loaded the library memisc and have m5 saved as an object in your workspace.
#library(memisc)

#thisDiamond = data.frame(carat = 1.00, cut = "Very Good",
#                         color = "I", clarity="VS1")
#modelEstimate = predict(m5, newdata = thisDiamond,
#                        interval="prediction", level = .95)
#exp(modelEstimate)