library(ggplot2) # Data visualization
library(dplyr)
library(GGally)
library(boot)
diamond <- read.csv("diamonds.csv")
colSums(is.na(diamond))
## X carat cut color clarity depth table price x
## 0 0 0 0 0 0 0 0 0
## y z
## 0 0
str(diamond)
## 'data.frame': 53940 obs. of 11 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Factor w/ 5 levels "Fair","Good",..: 3 4 2 4 2 5 5 5 1 5 ...
## $ color : Factor w/ 7 levels "D","E","F","G",..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Factor w/ 8 levels "I1","IF","SI1",..: 4 3 5 6 4 8 7 3 6 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
GG Pairs
ggpairs(diamond[,2:11])

Data Visualization
ggplot(data = diamond, aes(x = cut, fill = cut)) +
geom_bar() +
labs(x='Cut', y='Number of Diamonds', title = 'Number of Diamonds by cut') +
theme(plot.title = element_text(hjust = 0.5)) +
theme(legend.position="none")

ggplot(data = diamond, aes(x = color, fill = color)) +
geom_bar() +
labs(x='Color', y='Number of Diamonds', title = 'Number of Diamonds by Color') +
theme(plot.title = element_text(hjust = 0.5)) +
theme(legend.position="none")

ggplot(data = diamond, aes(x = clarity, fill = clarity)) +
geom_bar() +
labs(x='clarity', y='Number of Diamonds', title = 'Number of Diamonds by clarity') +
theme(plot.title = element_text(hjust = 0.5)) +
theme(legend.position="none")

ggplot(diamonds,aes(x=price)) +
geom_density(aes(fill=factor(cut)),alpha=0.7) +
labs(title="Price grouped by cut",x="Price",fill="Cut") +
theme(plot.title = element_text(hjust = 0.5))

ggplot(diamonds,aes(x=price)) +
geom_density(aes(fill = factor(color)),alpha=0.7) +
labs(title="Price grouped by cut",x="Price",fill="Cut") +
theme(plot.title = element_text(hjust = 0.5))

ggplot(data = diamond, aes(x = carat, y = price, color = clarity)) +
geom_point() +
labs(x='Carat', y='Price', title = 'Price of Diamonds by Carat colored by Clarity') +
theme(plot.title = element_text(hjust = 0.5))

Regression Models
diamond_2 <- diamond[,2:11]
index <- sample(1:nrow(diamond_2), nrow(diamond_2) *0.8)
train <- diamond_2[index,]
test <- diamond_2[-index,]
Linear Regression
lm_fit <- lm(price ~ ., data = train)
summary(lm_fit)
##
## Call:
## lm(formula = price ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21119.1 -594.1 -186.7 375.5 10714.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2008.380 458.512 4.380 1.19e-05 ***
## carat 11134.891 55.027 202.353 < 2e-16 ***
## cutGood 576.300 37.842 15.229 < 2e-16 ***
## cutIdeal 822.393 37.621 21.860 < 2e-16 ***
## cutPremium 764.806 36.227 21.111 < 2e-16 ***
## cutVery Good 716.964 36.294 19.754 < 2e-16 ***
## colorE -195.396 20.093 -9.724 < 2e-16 ***
## colorF -257.331 20.316 -12.666 < 2e-16 ***
## colorG -481.255 19.905 -24.177 < 2e-16 ***
## colorH -968.638 21.210 -45.670 < 2e-16 ***
## colorI -1444.287 23.807 -60.667 < 2e-16 ***
## colorJ -2364.509 29.475 -80.220 < 2e-16 ***
## clarityIF 5316.168 56.966 93.322 < 2e-16 ***
## claritySI1 3650.570 48.545 75.200 < 2e-16 ***
## claritySI2 2698.181 48.766 55.329 < 2e-16 ***
## clarityVS1 4572.871 49.580 92.232 < 2e-16 ***
## clarityVS2 4257.520 48.791 87.261 < 2e-16 ***
## clarityVVS1 5012.298 52.534 95.410 < 2e-16 ***
## clarityVVS2 4964.573 51.052 97.245 < 2e-16 ***
## depth -62.812 5.036 -12.473 < 2e-16 ***
## table -27.701 3.295 -8.407 < 2e-16 ***
## x -1028.810 47.160 -21.815 < 2e-16 ***
## y 71.978 38.647 1.862 0.0625 .
## z -36.287 34.130 -1.063 0.2877
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1136 on 43128 degrees of freedom
## Multiple R-squared: 0.9187, Adjusted R-squared: 0.9187
## F-statistic: 2.12e+04 on 23 and 43128 DF, p-value: < 2.2e-16
test$price_pred <- predict(lm_fit, newdata = test)
residual <- test$price_pred - test$price
rmse1 <- sqrt(mean(residual^2))
rmse1
## [1] 1106.045