library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats

Create our cell and separate it randomly into a subset for building the model and for testing the model

cell = diamonds %>% 
  filter(cut=="Ideal",color=="G",clarity=="VS1")
cell$type = sample(c("Model","Test"),size=nrow(cell),prob=c(.5,.5),replace=TRUE)
glimpse(cell)
## Observations: 953
## Variables: 11
## $ carat   <dbl> 0.23, 0.74, 0.70, 0.30, 0.71, 0.70, 0.71, 0.71, 0.71, ...
## $ cut     <ord> Ideal, Ideal, Ideal, Ideal, Ideal, Ideal, Ideal, Ideal...
## $ color   <ord> G, G, G, G, G, G, G, G, G, G, G, G, G, G, G, G, G, G, ...
## $ clarity <ord> VS1, VS1, VS1, VS1, VS1, VS1, VS1, VS1, VS1, VS1, VS1,...
## $ depth   <dbl> 61.9, 61.5, 61.4, 62.3, 62.9, 61.3, 61.5, 62.7, 62.6, ...
## $ table   <dbl> 54, 55, 59, 56, 58, 57, 56, 57, 57, 55, 57, 55, 56, 55...
## $ price   <int> 404, 2780, 2780, 555, 2820, 2839, 2859, 2930, 2930, 29...
## $ x       <dbl> 3.93, 5.81, 5.64, 4.29, 5.66, 5.71, 5.74, 5.69, 5.67, ...
## $ y       <dbl> 3.95, 5.86, 5.73, 4.31, 5.69, 5.74, 5.78, 5.73, 5.70, ...
## $ z       <dbl> 2.44, 3.59, 3.49, 2.68, 3.57, 3.51, 3.54, 3.58, 3.56, ...
## $ type    <chr> "Model", "Model", "Model", "Test", "Model", "Model", "...
table(cell$type)
## 
## Model  Test 
##   480   473
Model = filter(cell,type=="Model")
Test = filter(cell,type=="Test")
glimpse(Model)
## Observations: 480
## Variables: 11
## $ carat   <dbl> 0.23, 0.74, 0.70, 0.71, 0.70, 0.71, 0.73, 0.71, 0.75, ...
## $ cut     <ord> Ideal, Ideal, Ideal, Ideal, Ideal, Ideal, Ideal, Ideal...
## $ color   <ord> G, G, G, G, G, G, G, G, G, G, G, G, G, G, G, G, G, G, ...
## $ clarity <ord> VS1, VS1, VS1, VS1, VS1, VS1, VS1, VS1, VS1, VS1, VS1,...
## $ depth   <dbl> 61.9, 61.5, 61.4, 62.9, 61.3, 62.6, 61.7, 62.2, 62.3, ...
## $ table   <dbl> 54, 55, 59, 58, 57, 57, 55, 56, 57, 59, 56, 56, 56, 56...
## $ price   <int> 404, 2780, 2780, 2820, 2839, 2930, 2948, 2962, 2973, 2...
## $ x       <dbl> 3.93, 5.81, 5.64, 5.66, 5.71, 5.67, 5.80, 5.69, 5.83, ...
## $ y       <dbl> 3.95, 5.86, 5.73, 5.69, 5.74, 5.70, 5.84, 5.72, 5.86, ...
## $ z       <dbl> 2.44, 3.59, 3.49, 3.57, 3.51, 3.56, 3.59, 3.55, 3.64, ...
## $ type    <chr> "Model", "Model", "Model", "Model", "Model", "Model", ...
glimpse(Test)
## Observations: 473
## Variables: 11
## $ carat   <dbl> 0.30, 0.71, 0.71, 0.71, 0.74, 0.32, 0.32, 0.76, 0.70, ...
## $ cut     <ord> Ideal, Ideal, Ideal, Ideal, Ideal, Ideal, Ideal, Ideal...
## $ color   <ord> G, G, G, G, G, G, G, G, G, G, G, G, G, G, G, G, G, G, ...
## $ clarity <ord> VS1, VS1, VS1, VS1, VS1, VS1, VS1, VS1, VS1, VS1, VS1,...
## $ depth   <dbl> 62.3, 61.5, 62.7, 62.4, 61.8, 62.3, 61.8, 59.4, 61.7, ...
## $ table   <dbl> 56.0, 56.0, 57.0, 57.0, 55.0, 55.0, 55.0, 57.0, 56.0, ...
## $ price   <int> 555, 2859, 2930, 2950, 2960, 559, 559, 2972, 2972, 303...
## $ x       <dbl> 4.29, 5.74, 5.69, 5.68, 5.85, 4.39, 4.42, 5.99, 5.64, ...
## $ y       <dbl> 4.31, 5.78, 5.73, 5.73, 5.80, 4.41, 4.45, 6.03, 5.71, ...
## $ z       <dbl> 2.68, 3.54, 3.58, 3.56, 3.60, 2.74, 2.74, 3.57, 3.50, ...
## $ type    <chr> "Test", "Test", "Test", "Test", "Test", "Test", "Test"...

Build a model and test it

lm1 = lm(price~carat,data = Model)
summary(lm1)  
## 
## Call:
## lm(formula = price ~ carat, data = Model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3218.8  -520.7  -161.4   576.3  3916.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3234.22      81.19  -39.84   <2e-16 ***
## carat       10182.55     101.60  100.22   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 851.4 on 478 degrees of freedom
## Multiple R-squared:  0.9546, Adjusted R-squared:  0.9545 
## F-statistic: 1.004e+04 on 1 and 478 DF,  p-value: < 2.2e-16
pred1 = predict(lm1,newdata=Test)
resid1 = Test$price - pred1
sd(resid1)
## [1] 910.441
lm2 = lm(price~poly(carat,2),data = Model)
summary(lm2)  
## 
## Call:
## lm(formula = price ~ poly(carat, 2), data = Model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4991.4  -196.0    20.1   202.3  2056.1 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      3910.11      24.83  157.48   <2e-16 ***
## poly(carat, 2)1 85327.16     544.00  156.85   <2e-16 ***
## poly(carat, 2)2 14328.93     544.00   26.34   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 544 on 477 degrees of freedom
## Multiple R-squared:  0.9815, Adjusted R-squared:  0.9814 
## F-statistic: 1.265e+04 on 2 and 477 DF,  p-value: < 2.2e-16
pred2 = predict(lm2,newdata=Test)
resid2 = Test$price - pred2
sd(resid2)
## [1] 543.4518
lm4 = lm(price~poly(carat,4),data = Model)
summary(lm4) 
## 
## Call:
## lm(formula = price ~ poly(carat, 4), data = Model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4737.9  -127.2   -20.2   137.1  2318.6 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3910.1       23.8 164.308  < 2e-16 ***
## poly(carat, 4)1  85327.2      521.4 163.658  < 2e-16 ***
## poly(carat, 4)2  14328.9      521.4  27.483  < 2e-16 ***
## poly(carat, 4)3  -3145.1      521.4  -6.032 3.26e-09 ***
## poly(carat, 4)4   1465.6      521.4   2.811  0.00514 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 521.4 on 475 degrees of freedom
## Multiple R-squared:  0.9831, Adjusted R-squared:  0.9829 
## F-statistic:  6896 on 4 and 475 DF,  p-value: < 2.2e-16
pred4 = predict(lm4,newdata=Test)
resid4 = Test$price - pred4
sd(resid4)
## [1] 535.1244