library(ggplot2) # Data visualization
library(dplyr)
library(GGally)
library(boot)
diamond <- read.csv("diamonds.csv")
colSums(is.na(diamond))
##       X   carat     cut   color clarity   depth   table   price       x 
##       0       0       0       0       0       0       0       0       0 
##       y       z 
##       0       0
str(diamond)
## 'data.frame':    53940 obs. of  11 variables:
##  $ X      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Factor w/ 5 levels "Fair","Good",..: 3 4 2 4 2 5 5 5 1 5 ...
##  $ color  : Factor w/ 7 levels "D","E","F","G",..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Factor w/ 8 levels "I1","IF","SI1",..: 4 3 5 6 4 8 7 3 6 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

GG Pairs

ggpairs(diamond[,2:11])

Data Visualization

ggplot(data = diamond, aes(x = cut, fill = cut)) +
  geom_bar() +
  labs(x='Cut', y='Number of Diamonds', title = 'Number of Diamonds by cut') +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(legend.position="none")

ggplot(data = diamond, aes(x = color, fill = color)) +
  geom_bar() +
  labs(x='Color', y='Number of Diamonds', title = 'Number of Diamonds by Color') +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(legend.position="none")

ggplot(data = diamond, aes(x = clarity, fill = clarity)) +
  geom_bar() +
  labs(x='clarity', y='Number of Diamonds', title = 'Number of Diamonds by clarity') +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(legend.position="none")

ggplot(diamonds,aes(x=price)) +
  geom_density(aes(fill=factor(cut)),alpha=0.7) +
  labs(title="Price grouped by cut",x="Price",fill="Cut") +
  theme(plot.title = element_text(hjust = 0.5))

ggplot(diamonds,aes(x=price)) +
  geom_density(aes(fill = factor(color)),alpha=0.7) +
  labs(title="Price grouped by cut",x="Price",fill="Cut") +
  theme(plot.title = element_text(hjust = 0.5))

ggplot(data = diamond, aes(x = carat, y = price, color = clarity)) +
  geom_point() +
  labs(x='Carat', y='Price', title = 'Price of Diamonds by Carat colored by Clarity') +
  theme(plot.title = element_text(hjust = 0.5))

Regression Models

diamond_2 <- diamond[,2:11]
index <- sample(1:nrow(diamond_2), nrow(diamond_2) *0.8)
train <- diamond_2[index,]
test <- diamond_2[-index,]

Linear Regression

lm_fit <- lm(price ~ ., data = train)
summary(lm_fit)
## 
## Call:
## lm(formula = price ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21119.1   -594.1   -186.7    375.5  10714.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2008.380    458.512   4.380 1.19e-05 ***
## carat        11134.891     55.027 202.353  < 2e-16 ***
## cutGood        576.300     37.842  15.229  < 2e-16 ***
## cutIdeal       822.393     37.621  21.860  < 2e-16 ***
## cutPremium     764.806     36.227  21.111  < 2e-16 ***
## cutVery Good   716.964     36.294  19.754  < 2e-16 ***
## colorE        -195.396     20.093  -9.724  < 2e-16 ***
## colorF        -257.331     20.316 -12.666  < 2e-16 ***
## colorG        -481.255     19.905 -24.177  < 2e-16 ***
## colorH        -968.638     21.210 -45.670  < 2e-16 ***
## colorI       -1444.287     23.807 -60.667  < 2e-16 ***
## colorJ       -2364.509     29.475 -80.220  < 2e-16 ***
## clarityIF     5316.168     56.966  93.322  < 2e-16 ***
## claritySI1    3650.570     48.545  75.200  < 2e-16 ***
## claritySI2    2698.181     48.766  55.329  < 2e-16 ***
## clarityVS1    4572.871     49.580  92.232  < 2e-16 ***
## clarityVS2    4257.520     48.791  87.261  < 2e-16 ***
## clarityVVS1   5012.298     52.534  95.410  < 2e-16 ***
## clarityVVS2   4964.573     51.052  97.245  < 2e-16 ***
## depth          -62.812      5.036 -12.473  < 2e-16 ***
## table          -27.701      3.295  -8.407  < 2e-16 ***
## x            -1028.810     47.160 -21.815  < 2e-16 ***
## y               71.978     38.647   1.862   0.0625 .  
## z              -36.287     34.130  -1.063   0.2877    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1136 on 43128 degrees of freedom
## Multiple R-squared:  0.9187, Adjusted R-squared:  0.9187 
## F-statistic: 2.12e+04 on 23 and 43128 DF,  p-value: < 2.2e-16
test$price_pred <- predict(lm_fit, newdata = test)
residual <- test$price_pred - test$price
rmse1 <- sqrt(mean(residual^2))
rmse1
## [1] 1106.045

10 Fold Cross Validation

set.seed(1234)
# Fit lm model using 10-fold CV: model
lm_fit <- glm(price ~ ., data = train)
cv_error <- cv.glm(data = train, glmfit = lm_fit, K = 10)
cv_error$delta
## [1] 1295034 1294752
plot(lm_fit)