Initial Visualization

# Price distribution within each level of the three categorical grades.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

ggplot(data = diamonds, mapping = aes(x = color, y = price)) +
  geom_boxplot()

ggplot(data = diamonds, mapping = aes(x = clarity, y = price)) +
  geom_boxplot()

# Hex-binned counts of carat against price on the raw scale.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_hex(bins = 50)

Subset Data and replot

# Keep diamonds of at most 2.5 carat and add log2-transformed price and carat.
diamonds2 <- mutate(
  filter(diamonds, carat <= 2.5),
  lprice = log2(price),
  lcarat = log2(carat)
)

# Same hex plot as before, now on the log2-log2 scale.
ggplot(data = diamonds2, mapping = aes(x = lcarat, y = lprice)) +
  geom_hex(bins = 50)

Simple model and visualization

# Linear fit of log2(price) on log2(carat).
mod_diamond <- lm(
  lprice ~ lcarat,
  data = diamonds2,
  na.action = na.warn
)

# Build a 20-point carat grid, predict on the log2 scale, then back-transform
# the prediction to the price scale so it can be drawn over the raw data.
grid <- data_grid(diamonds2, carat = seq_range(carat, n = 20))
grid <- mutate(grid, lcarat = log2(carat))
grid <- add_predictions(grid, mod_diamond, var = "lprice")
grid <- mutate(grid, price = 2^lprice)

# Raw-scale hex plot with the back-transformed fit overlaid.
ggplot(data = diamonds2, mapping = aes(x = carat, y = price)) +
  geom_hex(bins = 50) +
  geom_line(data = grid, colour = "green", size = 1)

Add residuals and plot

# Attach the residuals of the simple model as column `lresid`.
diamonds2 <- add_residuals(diamonds2, mod_diamond, var = "lresid")

# Residuals against the predictor used in the model.
ggplot(data = diamonds2, mapping = aes(x = lcarat, y = lresid)) +
  geom_hex(bins = 50)

# Residuals against each categorical grade that the simple model leaves out.
ggplot(data = diamonds2, mapping = aes(x = cut, y = lresid)) +
  geom_boxplot()

ggplot(data = diamonds2, mapping = aes(x = color, y = lresid)) +
  geom_boxplot()

ggplot(data = diamonds2, mapping = aes(x = clarity, y = lresid)) +
  geom_boxplot()

Four parameter model and visualization

# Linear fit of log2(price) on log2(carat) plus the three categorical grades.
mod_diamond2 <- lm(
  lprice ~ lcarat + color + cut + clarity,
  data = diamonds2,
  na.action = na.warn
)

# One row per level of `cut`; `.model` lets data_grid fill in the remaining
# predictors the model needs. Predictions land in the default `pred` column.
grid <- data_grid(diamonds2, cut, .model = mod_diamond2)
grid <- add_predictions(grid, mod_diamond2)
grid

## # A tibble: 5 x 5
##   cut       lcarat color clarity  pred
##   <ord>      <dbl> <chr> <chr>   <dbl>
## 1 Fair      -0.515 G     VS2      11.2
## 2 Good      -0.515 G     VS2      11.3
## 3 Very Good -0.515 G     VS2      11.4
## 4 Premium   -0.515 G     VS2      11.4
## 5 Ideal     -0.515 G     VS2      11.4

# Predicted log2(price) for each level of cut.
ggplot(data = grid, mapping = aes(x = cut, y = pred)) +
  geom_point()

Plot residuals of four parameter model

# Attach the residuals of the four-predictor model as column `lresid2`.
diamonds2 <- add_residuals(diamonds2, mod_diamond2, var = "lresid2")

ggplot(data = diamonds2, mapping = aes(x = lcarat, y = lresid2)) +
  geom_hex(bins = 50)

# List the diamonds whose log2 residual exceeds 1 in magnitude, alongside the
# model's prediction converted back to the price scale, cheapest first.
diamonds2 %>%
  filter(lresid2 > 1 | lresid2 < -1) %>%
  add_predictions(mod_diamond2) %>%
  mutate(pred = round(2^pred)) %>%
  select(price, pred, carat:table, x:z) %>%
  arrange(price)

## # A tibble: 16 x 11
##    price  pred carat cut       color clarity depth table     x     y     z
##    <int> <dbl> <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  1013   264 0.25  Fair      F     SI2      54.4    64  4.3   4.23  2.32
##  2  1186   284 0.25  Premium   G     SI2      59      60  5.33  5.28  3.12
##  3  1186   284 0.25  Premium   G     SI2      58.8    60  5.33  5.28  3.12
##  4  1262  2644 1.03  Fair      E     I1       78.2    54  5.72  5.59  4.42
##  5  1415   639 0.35  Fair      G     VS2      65.9    54  5.57  5.53  3.66
##  6  1415   639 0.35  Fair      G     VS2      65.9    54  5.57  5.53  3.66
##  7  1715   576 0.32  Fair      F     VS2      59.6    60  4.42  4.34  2.61
##  8  1776   412 0.290 Fair      F     SI1      55.8    60  4.48  4.41  2.48
##  9  2160   314 0.34  Fair      F     I1       55.8    62  4.72  4.6   2.6 
## 10  2366   774 0.3   Very Good D     VVS2     60.6    58  4.33  4.35  2.63
## 11  3360  1373 0.51  Premium   F     SI1      62.7    62  5.09  4.96  3.15
## 12  3807  1540 0.61  Good      F     SI2      62.5    65  5.36  5.29  3.33
## 13  3920  1705 0.51  Fair      F     VVS2     65.4    60  4.98  4.9   3.23
## 14  4368  1705 0.51  Fair      F     VVS2     60.7    66  5.21  5.11  3.13
## 15 10011  4048 1.01  Fair      D     SI2      64.6    58  6.25  6.2   4.02
## 16 10470 23622 2.46  Premium   E     SI2      59.7    59  8.82  8.76  5.25

Question #1

In the plot of lcarat vs. lprice, there are some bright vertical strips. What do they represent?

# Use this chunk to answer question 1
# These vertical strips represent popular carat weights of the diamonds. A brighter color means a higher number of occurrences in that bin of the graph.

Question #2

If log(price) = a_0 + a_1 * log(carat), what does that say about the relationship between price and carat?

# Use this chunk to answer question 2
# The relationship means price = exp(a_0) * carat^(a_1), i.e. the relationship between carat and price is a power law rather than linear. If carat is multiplied by a factor n, then price is multiplied by n^(a_1).

Question #3

Extract the diamonds that have very high and very low residuals. Is there anything unusual about these diamonds? Are they particularly bad or good, or do you think these are pricing errors?

# Use this chunk to place your code for extracting the high and low residuals and answer question 3
# Keep only the rows whose `lresid` lies in the tails of its distribution.
#   data  : data frame with a numeric `lresid` column
#   probs : lower and upper quantile probabilities; rows strictly below the
#           lower cutoff or strictly above the upper cutoff are returned
# quantile() with its default probs returns the 0%, 25%, 50%, 75% and 100%
# points, so indexing that result with [[1]] / [[3]] selects the minimum and
# the median rather than the quartiles. The probabilities are requested
# explicitly here so both tails are actually kept.
keep_resid_tails <- function(data, probs = c(0.25, 0.75)) {
  cutoffs <- quantile(data$lresid, probs = probs, names = FALSE)
  data %>% filter(lresid < cutoffs[[1]] | lresid > cutoffs[[2]])
}

# Rebuild the carat <= 2.5 subset and refit the simple log2-log2 model.
diamonds2 <- diamonds %>%
  filter(carat <= 2.5) %>%
  mutate(lprice = log2(price), lcarat = log2(carat))

mod_diamond <- lm(lprice ~ lcarat, data = diamonds2)

diamonds2 <- diamonds2 %>%
  add_residuals(mod_diamond, "lresid")
summary(diamonds2$lresid)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -1.964068 -0.245488 -0.008442  0.000000  0.239301  1.934855

# Diamonds below the 1st quartile or above the 3rd quartile of the residuals.
diamonds3 <- keep_resid_tails(diamonds2)
table(diamonds3$cut)

# NOTE(review): the counts pasted below were produced by the previous filter,
# which kept every residual above the median (about half of the data) and none
# of the low residuals. Re-run this chunk to refresh them.
## 
##      Fair      Good Very Good   Premium     Ideal 
##       317      1651      5515      6477     12946

table(diamonds3$clarity)

# NOTE(review): stale for the same reason as the cut table above.
## 
##   I1  SI2  SI1  VS2  VS1 VVS2 VVS1   IF 
##    1  836 4321 7340 5304 4208 3225 1671

diamonds3 %>%
  ggplot(aes(clarity, price)) +
  geom_boxplot() +
  facet_grid(~cut)

# There must be some error, since the higher the cut quality and clarity, the lower the price of the diamond. Or it could be that most of the high-clarity diamonds are small ones, since we only filtered out the extra-large diamonds. If we keep only diamonds of at least 1.5 carat, the graph will be different and make more sense.

# Repeat the analysis on diamonds of at least 1.5 carat. This deliberately
# overwrites diamonds2, mod_diamond and diamonds3 with the large-diamond
# versions.
diamonds2 <- diamonds %>%
  filter(carat >= 1.5) %>%
  mutate(lprice = log2(price), lcarat = log2(carat))

mod_diamond <- lm(lprice ~ lcarat, data = diamonds2)

diamonds2 <- diamonds2 %>%
  add_residuals(mod_diamond, "lresid")
diamonds3 <- keep_resid_tails(diamonds2)

diamonds3 %>%
  ggplot(aes(clarity, price)) +
  geom_boxplot() +
  facet_grid(~cut)

Question #4

Does the final model, mod_diamond2, do a good job of predicting diamond prices? Would you trust it to tell you how much to spend if you were buying a diamond and why?

# Use this chunk to place your code for assessing how well the model predicts diamond prices and answer question 4

# Assess the final model, mod_diamond2, which is what the question asks about.
# At this point in the document mod_diamond is the one-predictor model refit
# on diamonds of at least 1.5 carat and diamonds2 holds only those diamonds,
# so the carat <= 2.5 data that mod_diamond2 was fit to is rebuilt here.
diamonds2 <- diamonds %>%
  filter(carat <= 2.5) %>%
  mutate(lprice = log2(price), lcarat = log2(carat)) %>%
  add_predictions(mod_diamond2) %>%
  # Convert the log2 prediction back to the price scale; err is the signed
  # prediction error in the same units as price.
  mutate(
    pred = round(2^pred),
    err = pred - price
  )

# Histogram of 2^|log2 residual|, i.e. the factor by which the prediction is
# off from the actual price (1 means a perfect prediction).
diamonds2 %>%
  add_residuals(mod_diamond2) %>%
  mutate(resid = 2^abs(resid)) %>%
  ggplot(aes(resid)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Based on the plots of the residuals, this is an average model. It does well at predicting lower (base) prices, but not so well for higher prices. I would not base a buying decision on the model, though.

ANLY 505 - DiamondsDataModel

Week 3

Luoxi Hao

2020-02-24