Initial Visualization

ggplot(diamonds, aes(cut,price)) + geom_boxplot()

ggplot(diamonds, aes(color,price)) + geom_boxplot()

ggplot(diamonds, aes(clarity,price)) + geom_boxplot()

ggplot(diamonds, aes(carat, price)) +
  geom_hex(bins=50)

Subset Data and replot

diamonds2 <- diamonds %>%
  filter(carat <= 2.5)  %>%
  mutate(lprice = log2(price), lcarat = log2(carat))

ggplot(diamonds2, aes(lcarat, lprice)) +
  geom_hex(bins=50)

Simple model and visualization

mod_diamond <- lm(lprice ~ lcarat, data = diamonds2)

grid <- diamonds2 %>%
  data_grid(carat = seq_range(carat, 20)) %>%
  mutate(lcarat = log2(carat)) %>%
  add_predictions(mod_diamond, "lprice") %>%
  mutate(price = 2 ^ lprice)

ggplot(diamonds2, aes(carat, price)) +
  geom_hex(bins = 50) +
  geom_line(data = grid, color = "green", size = 1)

Add residuals and plot

diamonds2 <- diamonds2 %>%
  add_residuals(mod_diamond, "lresid")

ggplot(diamonds2, aes(lcarat, lresid)) +
  geom_hex(bins = 50)

ggplot(diamonds2, aes(cut,lresid)) + geom_boxplot()

ggplot(diamonds2, aes(color,lresid)) + geom_boxplot()

ggplot(diamonds2, aes(clarity,lresid)) + geom_boxplot()

Four parameter model and visualization

mod_diamond2 <- lm(
  lprice ~ lcarat + color + cut + clarity, diamonds2
)

grid <- diamonds2 %>%
  data_grid(cut, .model = mod_diamond2) %>%
  add_predictions(mod_diamond2)
grid
## # A tibble: 5 x 5
##   cut       lcarat color clarity  pred
##   <ord>      <dbl> <chr> <chr>   <dbl>
## 1 Fair      -0.515 G     VS2      11.2
## 2 Good      -0.515 G     VS2      11.3
## 3 Very Good -0.515 G     VS2      11.4
## 4 Premium   -0.515 G     VS2      11.4
## 5 Ideal     -0.515 G     VS2      11.4
ggplot(grid, aes(cut, pred)) +
  geom_point()

Plot residuals of four parameter model

diamonds2 <- diamonds2 %>%
  add_residuals(mod_diamond2, "lresid2")

ggplot(diamonds2, aes(lcarat, lresid2)) +
  geom_hex(bins = 50)

diamonds2 %>%
  filter(abs(lresid2) > 1) %>%
  add_predictions(mod_diamond2) %>%
  mutate(pred = round(2^pred)) %>%
  select(price, pred, carat:table, x:z) %>%
  arrange(price)
## # A tibble: 16 x 11
##    price  pred carat cut       color clarity depth table     x     y     z
##    <int> <dbl> <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  1013   264 0.25  Fair      F     SI2      54.4    64  4.3   4.23  2.32
##  2  1186   284 0.25  Premium   G     SI2      59      60  5.33  5.28  3.12
##  3  1186   284 0.25  Premium   G     SI2      58.8    60  5.33  5.28  3.12
##  4  1262  2644 1.03  Fair      E     I1       78.2    54  5.72  5.59  4.42
##  5  1415   639 0.35  Fair      G     VS2      65.9    54  5.57  5.53  3.66
##  6  1415   639 0.35  Fair      G     VS2      65.9    54  5.57  5.53  3.66
##  7  1715   576 0.32  Fair      F     VS2      59.6    60  4.42  4.34  2.61
##  8  1776   412 0.290 Fair      F     SI1      55.8    60  4.48  4.41  2.48
##  9  2160   314 0.34  Fair      F     I1       55.8    62  4.72  4.6   2.6 
## 10  2366   774 0.3   Very Good D     VVS2     60.6    58  4.33  4.35  2.63
## 11  3360  1373 0.51  Premium   F     SI1      62.7    62  5.09  4.96  3.15
## 12  3807  1540 0.61  Good      F     SI2      62.5    65  5.36  5.29  3.33
## 13  3920  1705 0.51  Fair      F     VVS2     65.4    60  4.98  4.9   3.23
## 14  4368  1705 0.51  Fair      F     VVS2     60.7    66  5.21  5.11  3.13
## 15 10011  4048 1.01  Fair      D     SI2      64.6    58  6.25  6.2   4.02
## 16 10470 23622 2.46  Premium   E     SI2      59.7    59  8.82  8.76  5.25

Question #1

In the plot of lcarat vs. lprice, there are some bright vertical strips. What do they represent?

Ans : Bright means higher count. Hence, those bright strips means a high density of population is aggregated at some specific log of carat in respect to a specific price range. That strips are deemed to be the popular choices when people buying diamonds.

Question #2

If log(price) = a_0 + a_1 * log(carat), what does that say about the relationship between price and carat?

Ans : The equation describes a linear relationship between log of price and log of carat. log transformation means price and carat doesn’t have a linear relationship directly, but through a transformation. It means that 1% change in carat leads to a_1% change in price.

Question #3

Extract the diamonds that have very high and very low residuals. Is there anything unusual about these diamonds? Are they particularly bad or good, or do you think these are pricing errors?

# Use this chunk to place your code for extracting the high and low residuals
diamonds2 <- diamonds %>% 
  filter(carat <= 2.5) %>% 
  mutate(lprice = log2(price), lcarat = log2(carat))

mod_diamond <- lm(lprice ~ lcarat + color + clarity + cut, data = diamonds2)

diamonds2 <- diamonds2 %>%
  add_residuals(mod_diamond,'lresid')

summary(diamonds2$lresid)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.17388 -0.12437 -0.00094  0.00000  0.11920  2.78322
diamonds3 <- diamonds2 %>% filter(lresid > quantile(lresid)[[3]] | lresid < quantile(lresid)[[1]] )

table(diamonds3$cut)
## 
##      Fair      Good Very Good   Premium     Ideal 
##       780      2562      6020      7048     10497
table(diamonds3$clarity)
## 
##   I1  SI2  SI1  VS2  VS1 VVS2 VVS1   IF 
##  391 5032 6898 5879 3810 2395 1686  816
diamonds3 %>% 
  ggplot(aes(clarity,price))+
  geom_boxplot()+
  facet_grid(~cut)

diamonds2 <-
  diamonds %>% 
  mutate(lprice = log2(price),
         lcarat = log2(carat))
mod1 <- lm(lprice ~ lcarat + color + clarity + cut, data = diamonds2)
bottom <-
  diamonds2 %>% 
  add_residuals(mod1) %>% 
  arrange(resid) %>% 
  slice(1:10)
top <-
  diamonds2 %>% 
  add_residuals(mod1) %>% 
  arrange(-resid) %>% 
  slice(1:10)
bind_rows(bottom, top) %>% 
  select(price, carat, resid)
## # A tibble: 20 x 3
##    price carat  resid
##    <int> <dbl>  <dbl>
##  1  6512 3     -1.46 
##  2 10470 2.46  -1.17 
##  3 10453 3.05  -1.14 
##  4 14220 3.01  -1.12 
##  5  9925 3.01  -1.12 
##  6 18701 3.51  -1.09 
##  7  1262 1.03  -1.04 
##  8  8040 3.01  -1.02 
##  9 12587 3.5   -0.990
## 10  8044 3     -0.985
## 11  2160 0.34   2.81 
## 12  1776 0.290  2.10 
## 13  1186 0.25   2.06 
## 14  1186 0.25   2.06 
## 15  1013 0.25   1.94 
## 16  2366 0.3    1.61 
## 17  1715 0.32   1.57 
## 18  4368 0.51   1.36 
## 19 10011 1.01   1.31 
## 20  3807 0.61   1.31

Ans : There seems to be no pricing errors. Generally speaking, it is observed that high residuals occur mostly in low carat dimonds, while low residuals occur in high carat dimonds.

Question #4

Does the final model, mod_diamonds2, do a good job of predicting diamond prices? Would you trust it to tell you how much to spend if you were buying a diamond and why?

# Use this chunk to place your code for assessing how well the model predicts diamond prices
mod_diamond2 <- lm(
  lprice ~ lcarat + color + cut + clarity, diamonds2
)

diamonds4 <- diamonds2 %>%
  add_predictions(mod_diamond2)


ggplot(diamonds4, aes(lprice, pred)) +
  geom_point() +
  geom_abline(slope=1, color="red")

summary(mod_diamond2)
## 
## Call:
## lm(formula = lprice ~ lcarat + color + cut + clarity, data = diamonds2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.45867 -0.12459 -0.00033  0.12033  2.81005 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 12.200915   0.001685 7242.225  < 2e-16 ***
## lcarat       1.883718   0.001129 1668.750  < 2e-16 ***
## color.L     -0.634174   0.002925 -216.828  < 2e-16 ***
## color.Q     -0.137955   0.002687  -51.335  < 2e-16 ***
## color.C     -0.021328   0.002515   -8.481  < 2e-16 ***
## color^4      0.017098   0.002310    7.403 1.35e-13 ***
## color^5     -0.003176   0.002182   -1.455    0.146    
## color^6      0.003450   0.001984    1.739    0.082 .  
## cut.L        0.174154   0.003396   51.284  < 2e-16 ***
## cut.Q       -0.050660   0.002989  -16.950  < 2e-16 ***
## cut.C        0.019446   0.002595    7.494 6.77e-14 ***
## cut^4       -0.002253   0.002079   -1.084    0.278    
## clarity.L    1.322709   0.005161  256.274  < 2e-16 ***
## clarity.Q   -0.350630   0.004804  -72.982  < 2e-16 ***
## clarity.C    0.191013   0.004118   46.387  < 2e-16 ***
## clarity^4   -0.095368   0.003294  -28.955  < 2e-16 ***
## clarity^5    0.039556   0.002689   14.711  < 2e-16 ***
## clarity^6   -0.002624   0.002342   -1.120    0.263    
## clarity^7    0.048375   0.002066   23.412  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.193 on 53921 degrees of freedom
## Multiple R-squared:  0.9826, Adjusted R-squared:  0.9826 
## F-statistic: 1.693e+05 on 18 and 53921 DF,  p-value: < 2.2e-16

Ans : Based on the figure, the predicition of price is relative closed to the true value. And based on the summary table, most of the coefficients’ p-values are statistically significant at an alpha level of 0.05. The adjusted R-squared is 0.98, which is better than 0.93 from the simple model. However, the r-squared may indicate an overfittng model, thus furthur analysis needs to be conducted. For now, I would not depende on the model entirely for diamond purchasing.