Initial Visualization

# Price distribution within each level of the three categorical variables.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

ggplot(data = diamonds, mapping = aes(x = color, y = price)) +
  geom_boxplot()

ggplot(data = diamonds, mapping = aes(x = clarity, y = price)) +
  geom_boxplot()

# Hex-binned counts of price against carat.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_hex(bins = 50)

Subset Data and replot

# Keep diamonds of at most 2.5 carats and add log2-transformed price and carat.
diamonds2 <- diamonds %>%
  filter(carat <= 2.5) %>%
  mutate(
    lprice = log2(price),
    lcarat = log2(carat)
  )

# Hex-binned counts of log2 price against log2 carat.
ggplot(data = diamonds2, mapping = aes(x = lcarat, y = lprice)) +
  geom_hex(bins = 50)

Simple model and visualization

# Linear fit of log2 price on log2 carat.
mod_diamond <- lm(formula = lprice ~ lcarat, data = diamonds2)

# Evaluate the fit on 20 carat values spanning the observed range, then undo
# the log2 transform so the predictions are back in price units.
grid <- diamonds2 %>%
  data_grid(carat = seq_range(carat, n = 20)) %>%
  mutate(lcarat = log2(carat)) %>%
  add_predictions(model = mod_diamond, var = "lprice") %>%
  mutate(price = 2^lprice)

# Overlay the back-transformed fitted curve on the untransformed data.
ggplot(data = diamonds2, mapping = aes(x = carat, y = price)) +
  geom_hex(bins = 50) +
  geom_line(data = grid, color = "green", size = 1)

Add residuals and plot

# Attach the residuals of the carat-only model as column `lresid`.
diamonds2 <- add_residuals(diamonds2, model = mod_diamond, var = "lresid")

# Residuals against log2 carat.
ggplot(data = diamonds2, mapping = aes(x = lcarat, y = lresid)) +
  geom_hex(bins = 50)

# Residuals against each categorical variable not yet in the model.
ggplot(data = diamonds2, mapping = aes(x = cut, y = lresid)) +
  geom_boxplot()

ggplot(data = diamonds2, mapping = aes(x = color, y = lresid)) +
  geom_boxplot()

ggplot(data = diamonds2, mapping = aes(x = clarity, y = lresid)) +
  geom_boxplot()

Four parameter model and visualization

# Model log2 price with log2 carat plus the three categorical variables.
mod_diamond2 <- lm(
  formula = lprice ~ lcarat + color + cut + clarity,
  data = diamonds2
)

# One row per cut; `.model` lets data_grid() fill in the remaining predictors
# the model needs, and add_predictions() appends the default `pred` column.
grid <- diamonds2 %>%
  data_grid(cut, .model = mod_diamond2) %>%
  add_predictions(model = mod_diamond2)
grid

## # A tibble: 5 x 5
##   cut       lcarat color clarity  pred
##   <ord>      <dbl> <chr> <chr>   <dbl>
## 1 Fair      -0.515 G     VS2      11.2
## 2 Good      -0.515 G     VS2      11.3
## 3 Very Good -0.515 G     VS2      11.4
## 4 Premium   -0.515 G     VS2      11.4
## 5 Ideal     -0.515 G     VS2      11.4

# Predicted log2 price for each cut in the prediction grid.
ggplot(data = grid, mapping = aes(x = cut, y = pred)) +
  geom_point()

Plot residuals of four parameter model

# Attach the residuals of the four-predictor model as column `lresid2`.
diamonds2 <- add_residuals(diamonds2, model = mod_diamond2, var = "lresid2")

# Residuals against log2 carat.
ggplot(data = diamonds2, mapping = aes(x = lcarat, y = lresid2)) +
  geom_hex(bins = 50)

# Rows whose log2 residual is larger than 1 in magnitude, shown with the
# prediction converted back to price units and rounded, ordered by price.
diamonds2 %>%
  filter(lresid2 > 1 | lresid2 < -1) %>%
  add_predictions(model = mod_diamond2) %>%
  mutate(pred = round(2^pred)) %>%
  select(price, pred, carat:table, x:z) %>%
  arrange(price)

## # A tibble: 16 x 11
##    price  pred carat cut       color clarity depth table     x     y     z
##    <int> <dbl> <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  1013   264 0.25  Fair      F     SI2      54.4    64  4.3   4.23  2.32
##  2  1186   284 0.25  Premium   G     SI2      59      60  5.33  5.28  3.12
##  3  1186   284 0.25  Premium   G     SI2      58.8    60  5.33  5.28  3.12
##  4  1262  2644 1.03  Fair      E     I1       78.2    54  5.72  5.59  4.42
##  5  1415   639 0.35  Fair      G     VS2      65.9    54  5.57  5.53  3.66
##  6  1415   639 0.35  Fair      G     VS2      65.9    54  5.57  5.53  3.66
##  7  1715   576 0.32  Fair      F     VS2      59.6    60  4.42  4.34  2.61
##  8  1776   412 0.290 Fair      F     SI1      55.8    60  4.48  4.41  2.48
##  9  2160   314 0.34  Fair      F     I1       55.8    62  4.72  4.6   2.6 
## 10  2366   774 0.3   Very Good D     VVS2     60.6    58  4.33  4.35  2.63
## 11  3360  1373 0.51  Premium   F     SI1      62.7    62  5.09  4.96  3.15
## 12  3807  1540 0.61  Good      F     SI2      62.5    65  5.36  5.29  3.33
## 13  3920  1705 0.51  Fair      F     VVS2     65.4    60  4.98  4.9   3.23
## 14  4368  1705 0.51  Fair      F     VVS2     60.7    66  5.21  5.11  3.13
## 15 10011  4048 1.01  Fair      D     SI2      64.6    58  6.25  6.2   4.02
## 16 10470 23622 2.46  Premium   E     SI2      59.7    59  8.82  8.76  5.25

Question #1

In the plot of lcarat vs. lprice, there are some bright vertical strips. What do they represent? Bright strips mark bins that contain more observations than the darker areas around them. From our data we can infer that the price of a diamond varies from 326 dollars to 18,823 dollars (integers) depending on several factors, and the graphic confirms that carat is not the only one, because diamonds with similar carat differ in price. For example, observations 1 and 10 of diamonds both have a carat of 0.23, but the first is priced at 326 dollars and the tenth at 338 dollars; they differ in clarity and cut. We can also see that the brightest points lie between an lcarat of just under -2 and 0. Finally, the points appear to be more concentrated at regular steps along the lcarat axis, around -1.75, -1.25, -1, -0.50, and 0 (since lcarat = log2(carat), these correspond to carat values of roughly 0.3, 0.4, 0.5, 0.7, and 1).

Question #2

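If log(price) = a_0 + a_1 * log(carat), what does that say about the relationship between price and carat? After applying the log function to the data, the relationship between the variables is easier to see. A linear relationship between log(price) and log(carat) means that price follows a power law in carat, price = exp(a_0) * carat^a_1 (for natural logs; with log2, as used in the code above, the constant is 2^a_0). In other words, a 1% increase in carat is associated with approximately an a_1% increase in price.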

Question #3

Extract the diamonds that have very high and very low residuals. Is there anything unusual about these diamonds? Are they particularly bad or good, or do you think these are pricing errors?

# Rebuild the working data: diamonds of at most 2.5 carats with log2 columns.
diamond2 <- diamonds %>%
  filter(carat <= 2.5) %>%
  mutate(
    lprice = log2(price),
    lcarat = log2(carat)
  )

# Refit the four-predictor model on this data set.
mod_diamond2 <- lm(lprice ~ lcarat + color + clarity + cut, data = diamond2)

# Attach the model residuals as `lresid` and summarise their distribution.
diamond2 <- add_residuals(diamond2, model = mod_diamond2, var = "lresid")

summary(diamond2[["lresid"]])

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.17388 -0.12437 -0.00094  0.00000  0.11920  2.78322

# Keep only the extreme residuals: the lowest 1% and the highest 1%.
# With no `probs`, quantile() returns the 0/25/50/75/100% points, so indexing
# it with [[3]] and [[1]] yields the median and the minimum; that keeps every
# row above the median and nothing from the low tail. Explicit tail
# probabilities are used instead.
# NOTE(review): the printed output that follows in this document was produced
# with the median/minimum cut-offs and must be regenerated by re-knitting.
diamond3 <- diamond2 %>%
  filter(
    lresid < quantile(lresid, probs = 0.01) |
      lresid > quantile(lresid, probs = 0.99)
  )
table(diamond3$cut)

## 
##      Fair      Good Very Good   Premium     Ideal 
##       780      2562      6020      7048     10497

# Display the rows retained in diamond3.
diamond3

## # A tibble: 26,907 x 13
##    carat cut   color clarity depth table price     x     y     z lprice
##    <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>  <dbl>
##  1  0.23 Ideal E     SI2      61.5    55   326  3.95  3.98  2.43   8.35
##  2  0.21 Prem… E     SI1      59.8    61   326  3.89  3.84  2.31   8.35
##  3  0.31 Good  J     SI2      63.3    58   335  4.34  4.35  2.75   8.39
##  4  0.24 Very… J     VVS2     62.8    57   336  3.94  3.96  2.48   8.39
##  5  0.26 Very… H     SI1      61.9    55   337  4.07  4.11  2.53   8.40
##  6  0.22 Fair  E     VS2      65.1    61   337  3.87  3.78  2.49   8.40
##  7  0.23 Very… H     VS1      59.4    61   338  4     4.05  2.39   8.40
##  8  0.3  Good  J     SI1      64      55   339  4.25  4.28  2.73   8.41
##  9  0.23 Ideal J     VS1      62.8    56   340  3.93  3.9   2.46   8.41
## 10  0.22 Prem… F     SI1      60.4    61   342  3.88  3.84  2.33   8.42
## # … with 26,897 more rows, and 2 more variables: lcarat <dbl>,
## #   lresid <dbl>

# Price by clarity for the retained rows, one panel per cut.
ggplot(data = diamond3, mapping = aes(x = clarity, y = price)) +
  geom_boxplot() +
  facet_grid(. ~ cut)

From the graphic, it looks like diamonds with better clarity are priced lower in some cases, which could seem strange, but I wouldn’t say it is an error in the data: when I analyzed the table, I saw that diamonds with the same carat have different prices depending on depth, color, and cut, all of which affect pricing.

For example, on page 2 of the table I looked at two observations with a carat of 0.31: the first is priced at 344 and the other at 353; the first one is Ideal and the second is Very Good.

Question #4

Does the final model, mod_diamond2, do a good job of predicting diamond prices? Would you trust it to tell you how much to spend if you were buying a diamond and why?

# Draw the lm diagnostic plots in a 2 x 2 layout. par() returns the previous
# settings, which are restored afterwards so the grid layout does not leak
# into later base-graphics plots.
old_par <- par(mfrow = c(2, 2))
plot(mod_diamond2)
par(old_par)

# Coefficient table and fit statistics for the four-predictor model.
summary(mod_diamond2)

## 
## Call:
## lm(formula = lprice ~ lcarat + color + clarity + cut, data = diamond2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.17388 -0.12437 -0.00094  0.11920  2.78322 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 12.206978   0.001693 7211.806  < 2e-16 ***
## lcarat       1.886239   0.001124 1677.809  < 2e-16 ***
## color.L     -0.633998   0.002910 -217.872  < 2e-16 ***
## color.Q     -0.137580   0.002676  -51.409  < 2e-16 ***
## color.C     -0.022072   0.002503   -8.819  < 2e-16 ***
## color^4      0.016570   0.002297    7.213 5.54e-13 ***
## color^5     -0.002828   0.002169   -1.304    0.192    
## color^6      0.003533   0.001971    1.793    0.073 .  
## clarity.L    1.308155   0.005179  252.598  < 2e-16 ***
## clarity.Q   -0.334090   0.004839  -69.047  < 2e-16 ***
## clarity.C    0.178423   0.004140   43.093  < 2e-16 ***
## clarity^4   -0.088059   0.003298  -26.697  < 2e-16 ***
## clarity^5    0.035885   0.002680   13.389  < 2e-16 ***
## clarity^6   -0.001371   0.002327   -0.589    0.556    
## clarity^7    0.048221   0.002051   23.512  < 2e-16 ***
## cut.L        0.173866   0.003386   51.349  < 2e-16 ***
## cut.Q       -0.050346   0.002980  -16.897  < 2e-16 ***
## cut.C        0.019129   0.002583    7.407 1.31e-13 ***
## cut^4       -0.002410   0.002066   -1.166    0.243    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1916 on 53795 degrees of freedom
## Multiple R-squared:  0.9828, Adjusted R-squared:  0.9828 
## F-statistic: 1.706e+05 on 18 and 53795 DF,  p-value: < 2.2e-16

I would trust this model: judging by the p-values and the R-squared, the model does a good job of predicting price. I think this is a good model for getting a better understanding if you are not familiar with diamonds and want an idea of the price range when buying one, but if you want to do this as a business you might need additional inputs to create a better model.

DiamondsDataModel

Marivalentina Lizardo

2019-09-29