Analysis of the diamond data Set in R

Mean Centering the Predictor variable to make more sense of the coefficients

This will be printing the coefficients beta0, beta1 for the linear regression model with the mean centered

  • Getting the value in carat/10
fit2<-lm(price ~ I((carat-mean(carat))*10), data = diamond)
coef(fit2)
##                   (Intercept) I((carat - mean(carat)) * 10) 
##                      500.0833                      372.1025

Illustrating the Residual variation throgh a residual plot

data(diamond)
y<-diamond$price; x<-diamond$carat; n<-length(y)

#Creating the linear regression object that is created by lm
fitval<-lm(y~x)


#Summary of Fit and elements of fit
summary(fitval)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -85.159 -21.448  -0.869  18.972  79.370 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -259.63      17.32  -14.99   <2e-16 ***
## x            3721.02      81.79   45.50   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.84 on 46 degrees of freedom
## Multiple R-squared:  0.9783, Adjusted R-squared:  0.9778 
## F-statistic:  2070 on 1 and 46 DF,  p-value: < 2.2e-16
e<-resid(fitval)

#Vector of the Prediction of the observed x values ie, carat values 
yhat<-predict(fitval)

# Finding the maximum of the residual variation through the formula as well as manually
max(abs(e-(y-yhat)))
## [1] 9.485746e-13
max(abs(e-(y-coef(fitval)[1] - coef(fitval)[2]*x)))
## [1] 9.485746e-13
#Plot-1
plot(y, x, xlab = "Carats (Weight)", ylab = "Price ($)", bg="lightblue", col="black", cex=1.1, pch=21, frame=FALSE, title(main = "Residuals seen in terms of the linear fit regression line"))
abline(fitval,lwd=2)
for(i in 1:n)
  lines(c(x[i], x[i]), c(y[i], yhat[i]), col="purple", lwd=2)

#Plot-2
plot(x, e, xlab = "Carats (Weight)", ylab = "Price ($)", bg="lightblue", col="black", cex=2, pch=21, frame=FALSE, title(main = "Residual True representation"))
abline(h=0, lwd=2)
for(i in 1 :n)
  lines(c(x[i],x[i]), c(e[i], 0), col="red", lwd=2)

# Sum of Residuals 
sum(e)
## [1] -1.865175e-14
# Sum of Residuals multiplied with the predictors
sum(e*x)
## [1] 6.959711e-15

Heteroskedsticity

e<-c(resid(lm(price ~ 1, data=diamond)), resid(lm(price~carat, data=diamond)))

fitval1<-factor(c(rep("Itc", nrow(diamond)),
                  rep("Itc, Slope", nrow(diamond))))

library(ggplot2)

g<-ggplot(data.frame(e=e, fit=fitval1),aes(y=e, x=fitval1, fill=fitval1))
g<-g+ geom_dotplot(binaxis = "y", size=2, stackdir = "center", binwidth = 20)
g<-g + xlab("Fitting approach") + ylab("Residual Price")

g