========================================================

Welcome

Notes:


Scatterplot Review

# Import Libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the dataset
data("diamonds")

#Spot Check
head(diamonds)
## Source: local data frame [6 x 10]
## 
##   carat       cut  color clarity depth table price     x     y     z
##   (dbl)    (fctr) (fctr)  (fctr) (dbl) (dbl) (int) (dbl) (dbl) (dbl)
## 1  0.23     Ideal      E     SI2  61.5    55   326  3.95  3.98  2.43
## 2  0.21   Premium      E     SI1  59.8    61   326  3.89  3.84  2.31
## 3  0.23      Good      E     VS1  56.9    65   327  4.05  4.07  2.31
## 4  0.29   Premium      I     VS2  62.4    58   334  4.20  4.23  2.63
## 5  0.31      Good      J     SI2  63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good      J    VVS2  62.8    57   336  3.94  3.96  2.48
# Check the Names
names(diamonds)
##  [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"  
##  [8] "x"       "y"       "z"
# Let's start by examining two variables in the data set.
# The scatterplot is a powerful tool to help you understand
# the relationship between two continuous variables.

# We can quickly see if the relationship is linear or not.
# In this case, we can use a variety of diamond
# characteristics to help us figure out whether
# the price advertised for any given diamond is 
# reasonable or a rip-off.

# Let's consider the price of a diamond and it's carat weight.
# Create a scatterplot of price (y) vs carat weight (x).

# Limit the x-axis and y-axis to omit the top 1% of values.

# ENTER YOUR CODE BELOW THIS LINE
# ================================================================


# QPLOT Syntax
#
qplot(data = diamonds, x = carat, y = price,
  xlim = c(0, quantile(diamonds$carat, 0.99)), 
  ylim = c(0, quantile(diamonds$price, 0.99))) +
  geom_point(fill = I('#F79420'), color = I('black'), shape = 21)
## Warning: Removed 926 rows containing missing values (geom_point).

## Warning: Removed 926 rows containing missing values (geom_point).

# GGPlot Syntax
#
ggplot(data = diamonds, aes(x = carat, y = price)) +
  scale_x_continuous(lim = c(0, quantile(diamonds$carat, 0.99))) +
  scale_y_continuous(lim = c(0, quantile(diamonds$price, 0.99))) +
  geom_point(fill = I('#F79420'), color = I('black'), shape = 21) + 
  stat_smooth(method = 'lm')
## Warning: Removed 926 rows containing non-finite values (stat_smooth).

## Warning: Removed 926 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_smooth).


Price and Carat Relationship

Response:


Frances Gerety

Notes:

A diamonds is

Forever.


The Rise of Diamonds

Notes:

De Beere Cartel …


ggpairs Function

Notes:

# install these if necessary
# install.packages('GGally')
# install.packages('scales')
# install.packages('memisc')
# install.packages('lattice')
# install.packages('MASS')
# install.packages('car')
# install.packages('reshape')
# install.packages('plyr')

# load the ggplot graphics package and the others
library(ggplot2)
library(GGally)
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
library(scales)
library(memisc)
## Loading required package: lattice
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Attaching package: 'memisc'
## The following object is masked from 'package:scales':
## 
##     percent
## The following objects are masked from 'package:dplyr':
## 
##     collect, query, rename
## The following objects are masked from 'package:stats':
## 
##     contr.sum, contr.treatment, contrasts
## The following object is masked from 'package:base':
## 
##     as.array
# sample 10,000 diamonds from the data set
set.seed(20022012)


diamond_samp <- diamonds[sample(1:length(diamonds$price), 10000), ]
# ggpairs(diamond_samp, params = c(shape = I('.'), outlier.shape = I('.')))  <- DEPRECATED !!!

ggpairs(diamond_samp, 
  lower = list(continuous = wrap("points", shape = I('.'))), 
  upper = list(combo = wrap("box", outlier.shape = I('.'))))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What are some things you notice in the ggpairs output? Response:

Diamond Carat Weight

Diamond Carat Weight


The Demand of Diamonds

Notes:

# Create two histograms of the price variable
# and place them side by side on one output image.

# We've put some code below to get you started.

# The first plot should be a histogram of price
# and the second plot should transform
# the price variable using log10.

# Set appropriate bin widths for each plot.
# ggtitle() will add a title to each histogram.

# You can self-assess your work with the plots
# in the solution video.

# ALTER THE CODE BELOW THIS LINE
# ==============================================

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# QPLOT VERSION
plot1 <- qplot(data = diamonds, x=price, binwidth = 100, fill = I('#099DD9')) + 
  ggtitle('Price')

plot2 <- qplot(data = diamonds, x=price, binwidth = 0.01, fill = I('#F79420')) + 
  scale_x_log10() +
  ggtitle('Price (log10)')

grid.arrange(plot1, plot2, ncol = 2)

# GGPLOT VERSION
p1 <- ggplot(data=diamonds, aes(x = price)) +
  geom_histogram(binwidth = 100, col="red", fill="green", alpha = .2) +
  ggtitle('Price')


p2 <- ggplot(data=diamonds, aes(x = price)) +
  geom_histogram(binwidth = 0.01, col="red", fill="green", alpha = .2) +
  scale_x_log10() +
  ggtitle('Price (log 10)')


grid.arrange(p1, p2, ncol = 2)


Connecting Demand and Price Distributions

Notes:

Sneak Preview Log 10

Sneak Preview Log 10


Scatterplot Transformation - put PRICE on a (log10) scale

# Replot on a LOG 10 scale X: Carat vs Y: Price
ggplot(data=diamonds, aes(x = carat, y = price)) +
  geom_point(position='jitter',size=0.6,alpha=1/2) +
  scale_y_continuous(trans = log10_trans()) +
  ggtitle('Price (log10 by Carat)')

# Create a CUBE'd function
# cuberoot_trans = function() trans_new('cuberoot',
#                                      transform = function(x) x^(1/3),
#                                      inverse = function(x) x^3)



# ggplot(data=diamonds, aes(x = carat, y = price)) +
#  geom_point(position='jitter',size=0.6,alpha=1/2) +
#  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
#                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
#  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
#                     breaks = c(350, 1000, 5000, 10000, 15000)) +
#  ggtitle('Price (log10 by Carat)')

Create a new function to transform the carat variable

cuberoot_trans = function() trans_new('cuberoot', transform = function(x) x^(1/3),
                                      inverse = function(x) x^3)

Use the cuberoot_trans function

ggplot(aes(carat, price), data = diamonds) + 
  geom_point() + 
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat')
## Warning: Removed 1683 rows containing missing values (geom_point).


Overplotting Revisited

Notes:

head(sort(table(diamonds$carat), decreasing=T))
## 
##  0.3 0.31 1.01  0.7 0.32    1 
## 2604 2249 2242 1981 1840 1558
head(sort(table(diamonds$price), decreasing=T))
## 
## 605 802 625 828 776 698 
## 132 127 126 125 124 121

# Add a layer to adjust the features of the
# scatterplot. Set the transparency to one half,
# the size to three-fourths, and jitter the points.

# If you need hints, see the Instructor Notes.
# There are three hints so scroll down slowly if
# you don't want all the hints at once.

# ALTER THE CODE BELOW THIS LINE
# =======================================================================

ggplot(aes(carat, price), data = diamonds) + 
  # geom_point() + 
  geom_point(position='jitter',size=0.75, alpha=1/2) +
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat')
## Warning: Removed 1693 rows containing missing values (geom_point).

Other Qualitative Factors

Notes:


Price vs. Carat and Clarity

Alter the code below.

# install and load the RColorBrewer package
# install.packages('RColorBrewer', dependencies = TRUE) 
library(RColorBrewer)


# Start by visualizing CLARITY - Just add one parameter to our AES "color=clarity"
ggplot(aes(x = carat, y = price, color=clarity), data = diamonds) + 
  geom_point(alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
    guide = guide_legend(title = 'Clarity', reverse = T,
    override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
    breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
    breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Clarity')
## Warning: Removed 1696 rows containing missing values (geom_point).


Clarity and Price

Response:


Price vs. Carat and Cut

Alter the code below.

ggplot(aes(x = carat, y = price, color = cut), data = diamonds) + 
  geom_point(alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = 'Cut', reverse = T,
                                          override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Cut')
## Warning: Removed 1688 rows containing missing values (geom_point).


Cut and Price

Response:


Price vs. Carat and Color

Alter the code below.

ggplot(aes(x = carat, y = price, color = color), data = diamonds) + 
  geom_point(alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = 'Color', reverse = F,
                                          override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Color')
## Warning: Removed 1691 rows containing missing values (geom_point).


Color and Price

Response:

** Yes - because it creates a clear separation of price depending on the color. The better colors nearly always command a higher price, regardless of what the marketing people say.


Linear Models in R

Notes:

Response:


Building the Linear Model

Notes:

m1 <- lm(I(log(price)) ~ I(carat^(1/3)), data = diamonds)
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
mtable(m1, m2, m3, m4, m5)
## 
## Calls:
## m1: lm(formula = I(log(price)) ~ I(carat^(1/3)), data = diamonds)
## m2: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat, data = diamonds)
## m3: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut, data = diamonds)
## m4: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color, 
##     data = diamonds)
## m5: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color + 
##     clarity, data = diamonds)
## 
## =========================================================================
##                      m1         m2         m3         m4         m5      
## -------------------------------------------------------------------------
##   (Intercept)      2.821***   1.039***   0.874***   0.932***   0.415***  
##                   (0.006)    (0.019)    (0.019)    (0.017)    (0.010)    
##   I(carat^(1/3))   5.558***   8.568***   8.703***   8.438***   9.144***  
##                   (0.007)    (0.032)    (0.031)    (0.028)    (0.016)    
##   carat                      -1.137***  -1.163***  -0.992***  -1.093***  
##                              (0.012)    (0.011)    (0.010)    (0.006)    
##   cut: .L                                0.224***   0.224***   0.120***  
##                                         (0.004)    (0.004)    (0.002)    
##   cut: .Q                               -0.062***  -0.062***  -0.031***  
##                                         (0.004)    (0.003)    (0.002)    
##   cut: .C                                0.051***   0.052***   0.014***  
##                                         (0.003)    (0.003)    (0.002)    
##   cut: ^4                                0.018***   0.018***  -0.002     
##                                         (0.003)    (0.002)    (0.001)    
##   color: .L                                        -0.373***  -0.441***  
##                                                    (0.003)    (0.002)    
##   color: .Q                                        -0.129***  -0.093***  
##                                                    (0.003)    (0.002)    
##   color: .C                                         0.001     -0.013***  
##                                                    (0.003)    (0.002)    
##   color: ^4                                         0.029***   0.012***  
##                                                    (0.003)    (0.002)    
##   color: ^5                                        -0.016***  -0.003*    
##                                                    (0.003)    (0.001)    
##   color: ^6                                        -0.023***   0.001     
##                                                    (0.002)    (0.001)    
##   clarity: .L                                                  0.907***  
##                                                               (0.003)    
##   clarity: .Q                                                 -0.240***  
##                                                               (0.003)    
##   clarity: .C                                                  0.131***  
##                                                               (0.003)    
##   clarity: ^4                                                 -0.063***  
##                                                               (0.002)    
##   clarity: ^5                                                  0.026***  
##                                                               (0.002)    
##   clarity: ^6                                                 -0.002     
##                                                               (0.002)    
##   clarity: ^7                                                  0.032***  
##                                                               (0.001)    
## -------------------------------------------------------------------------
##   R-squared            0.9        0.9        0.9        1.0        1.0   
##   adj. R-squared       0.9        0.9        0.9        1.0        1.0   
##   sigma                0.3        0.3        0.3        0.2        0.1   
##   F               652012.1   387489.4   138654.5    87959.5   173791.1   
##   p                    0.0        0.0        0.0        0.0        0.0   
##   Log-likelihood   -7962.5    -3631.3    -1837.4     4235.2    34091.3   
##   Deviance          4242.8     3613.4     3380.8     2699.2      892.2   
##   AIC              15931.0     7270.6     3690.8    -8442.5   -68140.5   
##   BIC              15957.7     7306.2     3762.0    -8317.9   -67953.7   
##   N                53940      53940      53940      53940      53940     
## =========================================================================

Notice how adding cut to our model does not help explain much of the variance in the price of diamonds. This fits with out exploration earlier.


Model Problems

Video Notes:

Research: (Take some time to come up with 2-4 problems for the model) (You should spend 10-20 min on this)

Response:

Here’s the problem for the model.


A Bigger, Better Data Set

Notes:

# install.packages('bitops')
# install.packages('RCurl')

library('bitops')
library('RCurl')

diamondsurl = getBinaryURL("https://raw.github.com/solomonm/diamonds-data/master/BigDiamonds.Rda")
# load(rawConnection(diamondsurl))

load("BigDiamonds.rda")

The code used to obtain the data is available here: https://github.com/solomonm/diamonds-data

Building a Model Using the Big Diamonds Data Set

Notes:

# Your task is to build five linear models like Solomon
# did for the diamonds data set only this
# time you'll use a sample of diamonds from the
# diamondsbig data set.

# Be sure to make use of the same variables
# (logprice, carat, etc.) and model
# names (m1, m2, m3, m4, m5).

# To get the diamondsbig data into RStudio
# on your machine, copy, paste, and run the
# code in the Instructor Notes. There's
# 598,024 diamonds in this data set!

# Since the data set is so large,
# you are going to use a sample of the
# data set to compute the models. You can use
# the entire data set on your machine which
# will produce slightly different coefficients
# and statistics for the models.

# This exercise WILL BE automatically graded.

# You can leave off the code to load in the data.
# We've sampled the data for you.
# You also don't need code to create the table output of the models.
# We'll do that for you and check your model summaries (R^2 values, AIC, etc.)

# Your task is to write the code to create the models.

# DO NOT ALTER THE CODE BELOW THIS LINE (Reads in a sample of the diamondsbig data set)
#===========================================================================================
#
#diamondsBigSample <- read.csv('diamondsBigSample.csv')

# diamondsBigSample <- load("BigDiamonds.rda")

# diamondsBigSample <- data.frame(load("BigDiamonds.rda"))

# head(diamondsBigSample)

# ENTER YOUR CODE BELOW THIS LINE. (Create the five models)
#===========================================================================================


# diamondsBigSample$logprice = log(diamondsBigSample$price)

# m1 <- lm(logprice ~ I(carat^(1/3)), 
#          data = diamondsBigSample[diamondsBigSample$price < 10000 & 
#                              diamondsBigSample$cert == "GIA",])
# m1 <- lm(I(log(price)) ~ I(carat^(1/3)), data = subset(diamondsBigSample, price < 10000 & cert == 'GIA' ))
# m2 <- update(m1, ~ . + carat)
# m3 <- update(m2, ~ . + cut)
# m4 <- update(m3, ~ . + color)
# m5 <- update(m4, ~ . + clarity)



# DO NOT ALTER THE CODE BELOW THIS LINE (Tables your models and pulls out the statistics)
#===========================================================================================
# suppressMessages(library(lattice))
# suppressMessages(library(MASS))
# suppressMessages(library(memisc))
# models <- mtable(m1, m2, m3, m4, m5)

Predictions

Example Diamond from BlueNile:

# Be sure you've loaded the library memisc and have m5 saved as an object in your workspace.


str(m5)
## List of 13
##  $ coefficients : Named num [1:20] 0.415 9.144 -1.093 0.12 -0.031 ...
##   ..- attr(*, "names")= chr [1:20] "(Intercept)" "I(carat^(1/3))" "carat" "cut.L" ...
##  $ residuals    :Class 'AsIs'  Named num [1:53940] 0.0828 0.0875 -0.2141 -0.2465 0.1509 ...
##   .. ..- attr(*, "names")= chr [1:53940] "1" "2" "3" "4" ...
##  $ effects      :Class 'AsIs'  Named num [1:53940] -1808.48 226.47 25.09 14.1 2.44 ...
##   .. ..- attr(*, "names")= chr [1:53940] "(Intercept)" "I(carat^(1/3))" "carat" "cut.L" ...
##  $ rank         : int 20
##  $ fitted.values:Class 'AsIs'  Named num [1:53940] 5.7 5.7 6 6.06 5.66 ...
##   .. ..- attr(*, "names")= chr [1:53940] "1" "2" "3" "4" ...
##  $ assign       : int [1:20] 0 1 2 3 3 3 3 4 4 4 ...
##  $ qr           :List of 5
##   ..$ qr   : num [1:53940, 1:20] -2.32e+02 4.31e-03 4.31e-03 4.31e-03 4.31e-03 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:53940] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:20] "(Intercept)" "I(carat^(1/3))" "carat" "cut.L" ...
##   .. ..- attr(*, "assign")= int [1:20] 0 1 2 3 3 3 3 4 4 4 ...
##   .. ..- attr(*, "contrasts")=List of 3
##   .. .. ..$ cut    : chr "contr.poly"
##   .. .. ..$ color  : chr "contr.poly"
##   .. .. ..$ clarity: chr "contr.poly"
##   ..$ qraux: num [1:20] 1 1.01 1.01 1 1 ...
##   ..$ pivot: int [1:20] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ tol  : num 1e-07
##   ..$ rank : int 20
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 53920
##  $ contrasts    :List of 3
##   ..$ cut    : chr "contr.poly"
##   ..$ color  : chr "contr.poly"
##   ..$ clarity: chr "contr.poly"
##  $ xlevels      :List of 3
##   ..$ cut    : chr [1:5] "Fair" "Good" "Very Good" "Premium" ...
##   ..$ color  : chr [1:7] "D" "E" "F" "G" ...
##   ..$ clarity: chr [1:8] "I1" "SI2" "SI1" "VS2" ...
##  $ call         : language lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color +      clarity, data = diamonds)
##  $ terms        :Classes 'terms', 'formula' length 3 I(log(price)) ~ I(carat^(1/3)) + carat + cut + color + clarity
##   .. ..- attr(*, "variables")= language list(I(log(price)), I(carat^(1/3)), carat, cut, color, clarity)
##   .. ..- attr(*, "factors")= int [1:6, 1:5] 0 1 0 0 0 0 0 0 1 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:6] "I(log(price))" "I(carat^(1/3))" "carat" "cut" ...
##   .. .. .. ..$ : chr [1:5] "I(carat^(1/3))" "carat" "cut" "color" ...
##   .. ..- attr(*, "term.labels")= chr [1:5] "I(carat^(1/3))" "carat" "cut" "color" ...
##   .. ..- attr(*, "order")= int [1:5] 1 1 1 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(I(log(price)), I(carat^(1/3)), carat, cut, color, clarity)
##   .. ..- attr(*, "dataClasses")= Named chr [1:6] "numeric" "numeric" "numeric" "ordered" ...
##   .. .. ..- attr(*, "names")= chr [1:6] "I(log(price))" "I(carat^(1/3))" "carat" "cut" ...
##  $ model        :'data.frame':   53940 obs. of  6 variables:
##   ..$ I(log(price)) :Class 'AsIs'  num [1:53940] 5.79 5.79 5.79 5.81 5.81 ...
##   ..$ I(carat^(1/3)):Class 'AsIs'  num [1:53940] 0.613 0.594 0.613 0.662 0.677 ...
##   ..$ carat         : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##   ..$ cut           : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##   ..$ color         : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##   ..$ clarity       : Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula' length 3 I(log(price)) ~ I(carat^(1/3)) + carat + cut + color + clarity
##   .. .. ..- attr(*, "variables")= language list(I(log(price)), I(carat^(1/3)), carat, cut, color, clarity)
##   .. .. ..- attr(*, "factors")= int [1:6, 1:5] 0 1 0 0 0 0 0 0 1 0 ...
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:6] "I(log(price))" "I(carat^(1/3))" "carat" "cut" ...
##   .. .. .. .. ..$ : chr [1:5] "I(carat^(1/3))" "carat" "cut" "color" ...
##   .. .. ..- attr(*, "term.labels")= chr [1:5] "I(carat^(1/3))" "carat" "cut" "color" ...
##   .. .. ..- attr(*, "order")= int [1:5] 1 1 1 1 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(I(log(price)), I(carat^(1/3)), carat, cut, color, clarity)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:6] "numeric" "numeric" "numeric" "ordered" ...
##   .. .. .. ..- attr(*, "names")= chr [1:6] "I(log(price))" "I(carat^(1/3))" "carat" "cut" ...
##  - attr(*, "class")= chr "lm"
# Create a Diamond:
# thisDiamond = data.frame(carat = 1.00, cut = "Good", color = "I", clarity="VS1")

# modelEstimate = predict(m5, newdata = thisDiamond, interval="prediction", level = .95)

# Estimate / predict it's value:
# modelEstimate = predict(m5, newdata = thisDiamond, interval="prediction", level = .95)

# Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) : 
#  factor cut has new level V.Good

# NOTE: The culpret is "cut = "V.Good".


# View(modelEstimate)

# modelEstimate

Evaluate how well the model predicts the BlueNile diamond’s price. Think about the fitted point estimate as well as the 95% CI.


Final Thoughts

Notes:


Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!