##install.packages("hexbin")
library(hexbin)
ggplot(diamonds, aes(cut,price)) + geom_boxplot()
ggplot(diamonds, aes(color,price)) + geom_boxplot()
ggplot(diamonds, aes(clarity,price)) + geom_boxplot()
ggplot(diamonds, aes(carat, price)) +
geom_hex(bins=50)
diamonds2 <- diamonds %>%
filter(carat <= 2.5) %>%
mutate(lprice = log2(price), lcarat = log2(carat))
ggplot(diamonds2, aes(lcarat, lprice)) +
geom_hex(bins=50)
mod_diamond <- lm(lprice ~ lcarat, data = diamonds2)
grid <- diamonds2 %>%
data_grid(carat = seq_range(carat, 20)) %>%
mutate(lcarat = log2(carat)) %>%
add_predictions(mod_diamond, "lprice") %>%
mutate(price = 2 ^ lprice)
ggplot(diamonds2, aes(carat, price)) +
geom_hex(bins = 50) +
geom_line(data = grid, color = "green", size = 1)
diamonds2 <- diamonds2 %>%
add_residuals(mod_diamond, "lresid")
ggplot(diamonds2, aes(lcarat, lresid)) +
geom_hex(bins = 50)
ggplot(diamonds2, aes(cut,lresid)) + geom_boxplot()
ggplot(diamonds2, aes(color,lresid)) + geom_boxplot()
ggplot(diamonds2, aes(clarity,lresid)) + geom_boxplot()
mod_diamond2 <- lm(
lprice ~ lcarat + color + cut + clarity, diamonds2
)
grid <- diamonds2 %>%
data_grid(cut, .model = mod_diamond2) %>%
add_predictions(mod_diamond2)
grid
## # A tibble: 5 x 5
## cut lcarat color clarity pred
## <ord> <dbl> <chr> <chr> <dbl>
## 1 Fair -0.515 G VS2 11.2
## 2 Good -0.515 G VS2 11.3
## 3 Very Good -0.515 G VS2 11.4
## 4 Premium -0.515 G VS2 11.4
## 5 Ideal -0.515 G VS2 11.4
ggplot(grid, aes(cut, pred)) +
geom_point()
diamonds2 <- diamonds2 %>%
add_residuals(mod_diamond2, "lresid2")
ggplot(diamonds2, aes(lcarat, lresid2)) +
geom_hex(bins = 50)
diamonds2 %>%
filter(abs(lresid2) > 1) %>%
add_predictions(mod_diamond2) %>%
mutate(pred = round(2^pred)) %>%
select(price, pred, carat:table, x:z) %>%
arrange(price)
## # A tibble: 16 x 11
## price pred carat cut color clarity depth table x y z
## <int> <dbl> <dbl> <ord> <ord> <ord> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1013 264 0.25 Fair F SI2 54.4 64 4.3 4.23 2.32
## 2 1186 284 0.25 Premium G SI2 59 60 5.33 5.28 3.12
## 3 1186 284 0.25 Premium G SI2 58.8 60 5.33 5.28 3.12
## 4 1262 2644 1.03 Fair E I1 78.2 54 5.72 5.59 4.42
## 5 1415 639 0.35 Fair G VS2 65.9 54 5.57 5.53 3.66
## 6 1415 639 0.35 Fair G VS2 65.9 54 5.57 5.53 3.66
## 7 1715 576 0.32 Fair F VS2 59.6 60 4.42 4.34 2.61
## 8 1776 412 0.290 Fair F SI1 55.8 60 4.48 4.41 2.48
## 9 2160 314 0.34 Fair F I1 55.8 62 4.72 4.6 2.6
## 10 2366 774 0.3 Very Good D VVS2 60.6 58 4.33 4.35 2.63
## 11 3360 1373 0.51 Premium F SI1 62.7 62 5.09 4.96 3.15
## 12 3807 1540 0.61 Good F SI2 62.5 65 5.36 5.29 3.33
## 13 3920 1705 0.51 Fair F VVS2 65.4 60 4.98 4.9 3.23
## 14 4368 1705 0.51 Fair F VVS2 60.7 66 5.21 5.11 3.13
## 15 10011 4048 1.01 Fair D SI2 64.6 58 6.25 6.2 4.02
## 16 10470 23622 2.46 Premium E SI2 59.7 59 8.82 8.76 5.25
In the plot of lcarat vs. lprice, there are some bright vertical strips. What do they represent?
##Answer: Brighter strps represent higher counts. Therefore, based on the plot of lcarat and lprice, besides the positive relationship between carat and price, there are some specific combinations of carat and price that are popular among customers.
If log(price) = a_0 + a_1 * log(carat), what does that say about the relationship between price and carat?
##Answer: It says that carat and price does not have a linear relationship, but there is a linear relationship between log price and log carat.
Extract the diamonds that have very high and very low residuals. Is there anything unusual about these diamonds? Are they particularly bad or good, or do you think these are pricing errors?
##Answer: There are indeed something usual about these diamonds that have very high and very low residuals. The price for these diamonds are either estremely low (i.e.2571 for 1.30 carat) or extremely high (i.e.18542 for 1.0 carat). I do not think these are pricing errors, because the price is also impacted factors that are not included in the model, i.e. brand.
# Use this chunk to place your code for extracting the high and low residuals
summary(diamonds2$lresid)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.964068 -0.245488 -0.008442 0.000000 0.239301 1.934855
extreme_high <- diamonds2 %>% filter(lresid > 1.5 )
extreme_low <- diamonds2 %>% filter(lresid < -1.5 )
extreme_high
## # A tibble: 38 x 14
## carat cut color clarity depth table price x y z lprice
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 0.51 Fair F VVS2 60.7 66 4368 5.21 5.11 3.13 12.1
## 2 0.63 Ideal D IF 62.5 55 6549 5.47 5.5 3.43 12.7
## 3 0.63 Ideal D IF 62.5 55 6607 5.5 5.47 3.43 12.7
## 4 1.04 Ideal D IF 61.8 57 14494 6.49 6.52 4.02 13.8
## 5 1 Ideal D VVS1 60.7 56 14498 6.47 6.54 3.95 13.8
## 6 1.04 Ideal D IF 61.8 57 14626 6.52 6.49 4.02 13.8
## 7 1.01 Good D IF 63.2 59 15081 6.34 6.38 4.02 13.9
## 8 1.01 Good D IF 63.4 59 15081 6.26 6.39 4.01 13.9
## 9 1.01 Very… D IF 63.4 59 15219 6.39 6.26 4.01 13.9
## 10 1.02 Prem… D IF 61.5 60 15231 6.45 6.52 3.99 13.9
## # … with 28 more rows, and 3 more variables: lcarat <dbl>, lresid <dbl>,
## # lresid2 <dbl>
extreme_low
## # A tibble: 21 x 14
## carat cut color clarity depth table price x y z lprice
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1.5 Fair H I1 65.6 54 2964 7.26 7.09 4.7 11.5
## 2 1.52 Good E I1 57.3 58 3105 7.53 7.42 4.28 11.6
## 3 1.52 Good E I1 57.3 58 3105 7.53 7.42 4.28 11.6
## 4 1.5 Fair H I1 69.3 61 3175 6.99 6.81 4.78 11.6
## 5 1.5 Good G I1 57.4 62 3179 7.56 7.39 4.29 11.6
## 6 1.95 Prem… H I1 60.3 59 5045 8.1 8.05 4.87 12.3
## 7 2 Prem… J I1 61.5 59 5051 8.11 8.06 4.97 12.3
## 8 2.06 Prem… J I1 61.2 58 5203 8.1 8.07 4.95 12.3
## 9 2.14 Fair J I1 69.4 57 5405 7.74 7.7 5.36 12.4
## 10 2.15 Fair J I1 65.5 57 5430 8.01 7.95 5.23 12.4
## # … with 11 more rows, and 3 more variables: lcarat <dbl>, lresid <dbl>,
## # lresid2 <dbl>
Does the final model, mod_diamonds2, do a good job of predicting diamond prices? Would you trust it to tell you how much to spend if you were buying a diamond and why?
##Answer: Based on the adjusted r square (i.e.0.98), this model does a good job of predicting diamond prices. Given the Pr(>|t|) for most of the variables are below 0.05, i would use this model as a general guide when purchasing dismonds. However, i would not solely depend on the model, because there are also other factors impactig diamond price that might not be captured in the model.
# Use this chunk to place your code for assessing how well the model predicts diamond prices
summary(mod_diamond2)
##
## Call:
## lm(formula = lprice ~ lcarat + color + cut + clarity, data = diamonds2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.17388 -0.12437 -0.00094 0.11920 2.78322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.206978 0.001693 7211.806 < 2e-16 ***
## lcarat 1.886239 0.001124 1677.809 < 2e-16 ***
## color.L -0.633998 0.002910 -217.872 < 2e-16 ***
## color.Q -0.137580 0.002676 -51.409 < 2e-16 ***
## color.C -0.022072 0.002503 -8.819 < 2e-16 ***
## color^4 0.016570 0.002297 7.213 5.54e-13 ***
## color^5 -0.002828 0.002169 -1.304 0.192
## color^6 0.003533 0.001971 1.793 0.073 .
## cut.L 0.173866 0.003386 51.349 < 2e-16 ***
## cut.Q -0.050346 0.002980 -16.897 < 2e-16 ***
## cut.C 0.019129 0.002583 7.407 1.31e-13 ***
## cut^4 -0.002410 0.002066 -1.166 0.243
## clarity.L 1.308155 0.005179 252.598 < 2e-16 ***
## clarity.Q -0.334090 0.004839 -69.047 < 2e-16 ***
## clarity.C 0.178423 0.004140 43.093 < 2e-16 ***
## clarity^4 -0.088059 0.003298 -26.697 < 2e-16 ***
## clarity^5 0.035885 0.002680 13.389 < 2e-16 ***
## clarity^6 -0.001371 0.002327 -0.589 0.556
## clarity^7 0.048221 0.002051 23.512 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1916 on 53795 degrees of freedom
## Multiple R-squared: 0.9828, Adjusted R-squared: 0.9828
## F-statistic: 1.706e+05 on 18 and 53795 DF, p-value: < 2.2e-16