============================================================================================================
Reference: Wickham, H., & Grolemund, G. (2016). R for data science:
import, tidy, transform, visualize, and model data. “O’Reilly Media,
Inc.”. 376-384.
Data dictionary: https://ggplot2.tidyverse.org/reference/diamonds.html
p1 <- ggplot(diamonds, aes(cut,price)) + geom_boxplot() + theme(axis.text.x=element_text(angle=45, hjust=1)) + labs(x="Cut", y="Price (USD)") + ggtitle("Boxplot 1")
p2 <- ggplot(diamonds, aes(color,price)) + geom_boxplot() + labs(x="Color", y="Price (USD)") + ggtitle("Boxplot 2") + scale_x_discrete(limits=rev(levels(diamonds$color)))
p3 <- ggplot(diamonds, aes(clarity,price)) + geom_boxplot() + theme(axis.text.x=element_text(angle=45, hjust=1)) + labs(x="Clarity", y="Price (USD)") + ggtitle("Boxplot 3")
p4 <- ggplot(diamonds, aes(carat, price)) + geom_hex(bins=50) + labs(x="Carat", y="Price (USD)") + ggtitle("Scatter plot 1")
gridExtra::grid.arrange(p1,p2,p3,p4, nrow=2, bottom="Initial Visualization")
diamonds2 <- diamonds %>%
filter(carat <= 2.5) %>%
mutate(lprice = log2(price), lcarat = log2(carat),
color = factor(color, ordered = F),
cut = factor(cut, ordered = F),
clarity = factor(clarity, ordered = F))
ggplot(diamonds2, aes(lcarat, lprice)) + geom_hex(bins=50) + labs(x="log2(Carat)", y="log2(Price)") + ggtitle("Scatter plot 2")
mod_diamond <- lm(lprice ~ lcarat, data = diamonds2) #linear model of log2(price) and log2(carat)
grid <- diamonds2 %>%
data_grid(carat = seq_range(carat, 20)) %>%
mutate(lcarat = log2(carat)) %>%
add_predictions(mod_diamond, "lprice") %>%
mutate(price = 2 ^ lprice) #back transformation of logarithm
ggplot(diamonds2, aes(carat, price)) +
geom_hex(bins = 50) +
geom_line(data = grid, color = "green", size = 1) +
labs(x="Carat", y="Price (USD)") + ggtitle("Scatter plot and regression")
diamonds2 <- diamonds2 %>%
add_residuals(mod_diamond, "lresid")
f1 <- ggplot(diamonds2, aes(lcarat, lresid)) + geom_hex(bins = 50) + labs(x="log2(Carat)", y="Residuals") + ggtitle("Scatter plot 3")
f2 <- ggplot(diamonds2, aes(cut, lresid)) + geom_boxplot() + theme(axis.text.x=element_text(angle=45, hjust=1)) + labs(x="Cut", y="Residuals") + ggtitle("Boxplot 4")
f3 <- ggplot(diamonds2, aes(color, lresid)) + geom_boxplot() + labs(x="Color", y="Residuals") + ggtitle("Boxplot 5") + scale_x_discrete(limits=rev(levels(diamonds$color)))
f4 <- ggplot(diamonds2, aes(clarity, lresid)) + geom_boxplot() + theme(axis.text.x=element_text(angle=45, hjust=1)) + labs(x="Clarity", y="Residuals") + ggtitle("Boxplot 6")
gridExtra::grid.arrange(f1,f2,f3,f4, nrow=2, bottom="Visualization")
mod_diamond2 <- lm(lprice ~ lcarat + color + cut + clarity, diamonds2) #linear model of log2(price) and log2(carat), color, cut, and clarity
#take "cut" as an example for prediction in log2(price)
grid2 <- diamonds2 %>%
data_grid(cut, .model = mod_diamond2) %>%
add_predictions(mod_diamond2)
grid2 %>% kable() %>% kable_styling()
| cut | lcarat | color | clarity | pred |
|---|---|---|---|---|
| Fair | -0.5145732 | G | SI1 | 10.98985 |
| Good | -0.5145732 | G | SI1 | 11.10479 |
| Very Good | -0.5145732 | G | SI1 | 11.15824 |
| Premium | -0.5145732 | G | SI1 | 11.19055 |
| Ideal | -0.5145732 | G | SI1 | 11.22187 |
ggplot(grid2, aes(cut, pred)) + geom_point() + labs(x="Cut", y="Predicted log2(Price)") + ggtitle("Scatter plot 4")
diamonds2 <- diamonds2 %>%
add_residuals(mod_diamond2, "lresid2")
ggplot(diamonds2, aes(lcarat, lresid2)) + geom_hex(bins = 50) + labs(x="log2(Carat)", y="Residuals") + ggtitle("Scatter plot 5")
diamonds2b <- diamonds2 %>%
filter(abs(lresid2) > 1) %>%
add_predictions(mod_diamond2, "pred2") %>%
mutate(pred2=round(2^pred2)) %>%
select(price, pred2, lresid2, carat:table, x:z) %>%
arrange(price)
diamonds2b %>% kable() %>% kable_styling()
| price | pred2 | lresid2 | carat | cut | color | clarity | depth | table | x | y | z |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1013 | 264 | 1.938203 | 0.25 | Fair | F | SI2 | 54.4 | 64 | 4.30 | 4.23 | 2.32 |
| 1186 | 284 | 2.059699 | 0.25 | Premium | G | SI2 | 59.0 | 60 | 5.33 | 5.28 | 3.12 |
| 1186 | 284 | 2.059699 | 0.25 | Premium | G | SI2 | 58.8 | 60 | 5.33 | 5.28 | 3.12 |
| 1262 | 2644 | -1.067148 | 1.03 | Fair | E | I1 | 78.2 | 54 | 5.72 | 5.59 | 4.42 |
| 1415 | 639 | 1.147258 | 0.35 | Fair | G | VS2 | 65.9 | 54 | 5.57 | 5.53 | 3.66 |
| 1415 | 639 | 1.147258 | 0.35 | Fair | G | VS2 | 65.9 | 54 | 5.57 | 5.53 | 3.66 |
| 1715 | 576 | 1.573796 | 0.32 | Fair | F | VS2 | 59.6 | 60 | 4.42 | 4.34 | 2.61 |
| 1776 | 412 | 2.107816 | 0.29 | Fair | F | SI1 | 55.8 | 60 | 4.48 | 4.41 | 2.48 |
| 2160 | 314 | 2.783220 | 0.34 | Fair | F | I1 | 55.8 | 62 | 4.72 | 4.60 | 2.60 |
| 2366 | 774 | 1.611117 | 0.30 | Very Good | D | VVS2 | 60.6 | 58 | 4.33 | 4.35 | 2.63 |
| 3360 | 1373 | 1.290708 | 0.51 | Premium | F | SI1 | 62.7 | 62 | 5.09 | 4.96 | 3.15 |
| 3807 | 1540 | 1.305923 | 0.61 | Good | F | SI2 | 62.5 | 65 | 5.36 | 5.29 | 3.33 |
| 3920 | 1705 | 1.201301 | 0.51 | Fair | F | VVS2 | 65.4 | 60 | 4.98 | 4.90 | 3.23 |
| 4368 | 1705 | 1.357420 | 0.51 | Fair | F | VVS2 | 60.7 | 66 | 5.21 | 5.11 | 3.13 |
| 10011 | 4048 | 1.306158 | 1.01 | Fair | D | SI2 | 64.6 | 58 | 6.25 | 6.20 | 4.02 |
| 10470 | 23622 | -1.173882 | 2.46 | Premium | E | SI2 | 59.7 | 59 | 8.82 | 8.76 | 5.25 |
In the plot of lcarat vs. lprice, there are some bright vertical strips. What do they represent?
ANSWER: ① The brighter color (lighter blue) is the hexagon, the more counts are there. ② The vertical line means that for the same x-value of “lcarat”, there are numerous y-values of “lprice”. The latter finding might be due to logarithmic transformation since the price range [$326, $18,823] narrows down to [8.348728, 14.200209]. ③ The x-values of these bright vertical stripes indicate most preferred carat cuts, e.g., one carat. In all, many similar priced diamonds have the same carat.
Hence, these bright vertical stripes show that “lcarat” may not be the single explanatory variable for “lprice”. For example, as for the same carat of 0.25, the price can go as the highest (with two records) as $1,186 when cut, color, and clarity is respectively Premium, G, and SI2, while as the lowest (with one record) as $357 when cut, color and clarity is respectively Ideal, H, and SI1.
Moreover, if only one variable is taken into consideration, then these bright vertical stripes may represent an issue of oversampling.
If \(\log_2 (price) = a_0 + a_1 \cdot \log_2 (carat)\), what does that say about the relationship between price and carat?
summary(mod_diamond)
##
## Call:
## lm(formula = lprice ~ lcarat, data = diamonds2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.96407 -0.24549 -0.00844 0.23930 1.93486
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.193863 0.001969 6194.5 <2e-16 ***
## lcarat 1.681371 0.001936 868.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3767 on 53812 degrees of freedom
## Multiple R-squared: 0.9334, Adjusted R-squared: 0.9334
## F-statistic: 7.542e+05 on 1 and 53812 DF, p-value: < 2.2e-16
ANSWER: As \(\log_2 (carat)\) increase one unit, then \(\log_2 (price)\) will change \(a_1\). Based on the subset data, the estimated \(a_0\) and \(a_1\) is respectively 12.193863 and 1.681371 while the \(p\)-values are both statistically significant at an alpha level of 0.05. The adjusted R-squared is 0.9334 indicating the simple model may fit very well.
After back transformation, \(price = 2^{a_0} \cdot carat^{a_1}\). A 1% increase in carat is associated with a change of \(1.01^{a_1}-1=1.687093\%\) in price, given \(\hat{a_1}=1.681371\) .
Extract the diamonds that have very high and very low residuals. Is there anything unusual about these diamonds? Are they particularly bad or good, or do you think these are pricing errors?
diamonds2a <- diamonds2 %>%
filter(abs(lresid) > 1) %>%
add_predictions(mod_diamond, "pred") %>%
mutate(pred=round(2^pred)) %>%
select(price, pred, lresid, carat:table, x:z) %>%
arrange(price)
ggplot(diamonds2a, aes(carat, price, color=lresid)) + geom_point() + labs(x="Carat", y="Price (USD)") + ggtitle("Scatterplot 6")
cut <- rbind(table(diamonds2a[diamonds2a$lresid>0,]$cut),table(diamonds2a[diamonds2a$lresid<0,]$cut))
rownames(cut) <- c("Positive residuals", "Negative residuals")
color <- rbind(table(diamonds2a[diamonds2a$lresid>0,]$color),table(diamonds2a[diamonds2a$lresid<0,]$color))
clarity <- rbind(table(diamonds2a[diamonds2a$lresid>0,]$clarity),table(diamonds2a[diamonds2a$lresid<0,]$clarity))
cbind(cut, color, clarity) %>% kable() %>% kable_styling()
| Fair | Good | Very Good | Premium | Ideal | D | E | F | G | H | I | J | I1 | SI2 | SI1 | VS2 | VS1 | VVS2 | VVS1 | IF | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Positive residuals | 9 | 14 | 100 | 68 | 226 | 199 | 113 | 98 | 7 | 0 | 0 | 0 | 1 | 4 | 2 | 3 | 3 | 127 | 138 | 139 |
| Negative residuals | 168 | 41 | 22 | 83 | 12 | 3 | 17 | 46 | 89 | 76 | 51 | 44 | 303 | 20 | 3 | 0 | 0 | 0 | 0 | 0 |
ANSWER: Extract the diamonds that have high or low residuals (from the simple model) whose absolute values are greater than 1. Given \(residual_i=\ log_2 (price_i) - \log_2 (prediction_i)\), then after back logarithm, \(2^{residual_i}=\frac{price_i}{prediction_i}\) is present. A residual value of -1 indicates that \(\log_2 (price)\) is one unit lower than the prediction. In other words, the price with the residual value of -1 is half of the predicted price while the price with the residual value of 1 is twice the predicted price.
As Scatterplot 6 shows that larger carat diamonds with the extreme residuals might be cheaper than the predictions, especially when the carat is greater than 1.5.
For those 743 observations with extreme residuals, the group of the positive residuals mainly has more diamonds with ideal cut, better color (D), and better clarity (IF). In contrast, the group of the negative residuals mainly has more diamonds with fair cut, worse color (J), and worse clarity (I1).
In conclusion, there is not much unusual or pricing error. “Nothing for nothing and very little for a halfpenny.” The higher is the price, the better is the quality of the merchandise.
Does the final model, mod_diamond2, do a good job of predicting diamond prices? Would you trust it to tell you how much to spend if you were buying a diamond and why?
summary(mod_diamond2)
##
## Call:
## lm(formula = lprice ~ lcarat + color + cut + clarity, data = diamonds2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.17388 -0.12437 -0.00094 0.11920 2.78322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.366696 0.008403 1352.76 <2e-16 ***
## lcarat 1.886239 0.001124 1677.81 <2e-16 ***
## colorE -0.078489 0.003034 -25.87 <2e-16 ***
## colorF -0.137370 0.003068 -44.78 <2e-16 ***
## colorG -0.232097 0.003003 -77.29 <2e-16 ***
## colorH -0.362063 0.003188 -113.56 <2e-16 ***
## colorI -0.537257 0.003573 -150.36 <2e-16 ***
## colorJ -0.737525 0.004417 -166.99 <2e-16 ***
## cutGood 0.114935 0.005596 20.54 <2e-16 ***
## cutVery Good 0.168393 0.005206 32.34 <2e-16 ***
## cutPremium 0.200701 0.005150 38.97 <2e-16 ***
## cutIdeal 0.232023 0.005105 45.45 <2e-16 ***
## claritySI2 0.589367 0.007572 77.84 <2e-16 ***
## claritySI1 0.825861 0.007525 109.75 <2e-16 ***
## clarityVS2 1.041576 0.007564 137.71 <2e-16 ***
## clarityVS1 1.142960 0.007674 148.94 <2e-16 ***
## clarityVVS2 1.338362 0.007898 169.45 <2e-16 ***
## clarityVVS1 1.441974 0.008117 177.65 <2e-16 ***
## clarityIF 1.579102 0.008758 180.30 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1916 on 53795 degrees of freedom
## Multiple R-squared: 0.9828, Adjusted R-squared: 0.9828
## F-statistic: 1.706e+05 on 18 and 53795 DF, p-value: < 2.2e-16
ANSWER: The estimated coefficients for the four parameters model are shown above. All the coefficients’ \(p\)-values are statistically significant at an alpha level of 0.05. The adjusted R-squared is 0.9828, which is better than 0.9334 from the simple model. In general, the four parameters model may fit very well and do a good job of predicting diamond prices. However, overfitting can still exist. I do not fully trust the predicted prices. The model only tells that “you get what you pay for”. Cluster analysis may improve the model.