Initial Visualization

#install.packages('ggplot2')
#ggplot(diamonds, aes(cut,price)) + geom_boxplot()
#ggplot(diamonds, aes(color,price)) + geom_boxplot()
#ggplot(diamonds, aes(clarity,price)) + geom_boxplot()

#ggplot(diamonds, aes(carat, price)) +
#  geom_hex(bins=50)

Subset Data and replot

#diamonds2 <- diamonds %>%
  #filter(carat <= 2.5)  %>%
  #mutate(lprice = log2(price), lcarat = log2(carat))

#ggplot(diamonds2, aes(lcarat, lprice)) +
  #geom_hex(bins=50)

Simple model and visualization

#mod_diamond <- lm(lprice ~ lcarat, data = diamonds2, na.action = na.warn)

#grid <- diamonds2 %>%
#  data_grid(carat = seq_range(carat, 20)) %>%
#  mutate(lcarat = log2(carat)) %>%
#  add_predictions(mod_diamond, "lprice") %>%
#  mutate(price = 2 ^ lprice)

#ggplot(diamonds2, aes(carat, price)) +
#  geom_hex(bins = 50) +
#  geom_line(data = grid, color = "green", size = 1)
#

Add residuals and plot

#diamonds2 <- diamonds2 %>%
#  add_residuals(mod_diamond, "lresid")

#ggplot(diamonds2, aes(lcarat, lresid)) +
#  geom_hex(bins = 50)

#ggplot(diamonds2, aes(cut,lresid)) + geom_boxplot()
#ggplot(diamonds2, aes(color,lresid)) + geom_boxplot()
#ggplot(diamonds2, aes(clarity,lresid)) + geom_boxplot()
#

Four parameter model and visualization

#mod_diamond2 <- lm(
#  lprice ~ lcarat + color + cut + clarity, diamonds2, na.action = na.warn
#)

#grid <- diamonds2 %>%
#  data_grid(cut, .model = mod_diamond2) %>%
#  add_predictions(mod_diamond2)
#grid

#ggplot(grid, aes(cut, pred)) +
#  geom_point()
#

Plot residuals of four parameter model

#diamonds2 <- diamonds2 %>%
#  add_residuals(mod_diamond2, "lresid2")

#ggplot(diamonds2, aes(lcarat, lresid2)) +
#  geom_hex(bins = 50)

#diamonds2 %>%
#  filter(abs(lresid2) > 1) %>%
#  add_predictions(mod_diamond2) %>%
#  mutate(pred = round(2^pred)) %>%
#  select(price, pred, carat:table, x:z) %>%
#  arrange(price)
#

Question #1

In the plot of lcarat vs. lprice, there are some bright vertical strips. What do they represent?

# Use this chunk to answer question 1
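#The bright vertical strips show that `carat` values cluster at a small set of round, "human-friendly" numbers (e.g. 0.5, 1, 1.5, 2 carats) instead of being spread continuously, so many diamonds share exactly the same lcarat value.
#`carat` is not an integer variable; it is a continuous measurement that tends to be cut or reported at these round values. Across the strips, `lprice` still increases roughly linearly with `lcarat`.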

Question #2

If log(price) = a_0 + a_1 * log(carat), what does that say about the relationship between price and carat?

# Use this chunk to answer question 2
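#That the relationship between price and carat is linear on the log-log scale, i.e. price follows a power law in carat: price = exp(a_0) * carat^a_1.
#The slope a_1 is an elasticity: a 1% increase in carat is associated with approximately an a_1% increase in price (exactly 1% only if a_1 = 1).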

Question #3

Extract the diamonds that have very high and very low residuals. Is there anything unusual about these diamonds? Are they particularly bad or good, or do you think these are pricing errors?

# Use this chunk to place your code for extracting the high and low residuals and answer question 3
#diamonds2 <-
#  diamonds %>% 
#  mutate(lprice = log2(price),
#         lcarat = log2(carat))
#mod1 <- lm(lprice ~ lcarat + color + clarity + cut, data = diamonds2)
#bottom <-
#  diamonds2 %>% 
#  add_residuals(mod1) %>% 
#  arrange(resid) %>% 
#  slice(1:10)
#top <-
#  diamonds2 %>% 
#  add_residuals(mod1) %>% 
#  arrange(-resid) %>% 
#  slice(1:10)
#bind_rows(bottom, top) %>% 
#  select(price, carat, resid)

Question #4

Does the final model, mod_diamond2, do a good job of predicting diamond prices? Would you trust it to tell you how much to spend if you were buying a diamond and why?

# Use this chunk to place your code for assessing how well the model predicts diamond prices and answer question 4

#diamonds2 %>% 
#  add_predictions(mod1) %>% 
#  mutate(pred = 2 ^ pred) %>% 
#  select(price, pred) %>% 
#  mutate(se = predict(mod1, se.fit = TRUE)$se.fit,
#         low_ci = pred - se * 2,
#         upper_ci = pred + se * 2,
#         correct = if_else(price >= low_ci & price <= upper_ci, TRUE, FALSE)) %>% 
#  summarize(prop_correct = mean(correct))

ANLY 505 - DiamondsDataModel

Week 3

Zijun Jiang

2020-01-26