Analysis of the diamond data Set in R

Mean Centering the Predictor variable to make more sense of the coefficients
This will be printing the coefficients beta0, beta1 for the linear regression model with the mean centered
- Getting the value in carat/10
fit2<-lm(price ~ I((carat-mean(carat))*10), data = diamond)
coef(fit2)
## (Intercept) I((carat - mean(carat)) * 10)
## 500.0833 372.1025
Illustrating the Residual variation throgh a residual plot
- Using the Predict funtion
- Residuals have to sum to 0 and so they have to lie above and below the line y=0
data(diamond)
y<-diamond$price; x<-diamond$carat; n<-length(y)
#Creating the linear regression object that is created by lm
fitval<-lm(y~x)
#Summary of Fit and elements of fit
summary(fitval)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.159 -21.448 -0.869 18.972 79.370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -259.63 17.32 -14.99 <2e-16 ***
## x 3721.02 81.79 45.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.84 on 46 degrees of freedom
## Multiple R-squared: 0.9783, Adjusted R-squared: 0.9778
## F-statistic: 2070 on 1 and 46 DF, p-value: < 2.2e-16
e<-resid(fitval)
#Vector of the Prediction of the observed x values ie, carat values
yhat<-predict(fitval)
# Finding the maximum of the residual variation through the formula as well as manually
max(abs(e-(y-yhat)))
## [1] 9.485746e-13
max(abs(e-(y-coef(fitval)[1] - coef(fitval)[2]*x)))
## [1] 9.485746e-13
#Plot-1
plot(y, x, xlab = "Carats (Weight)", ylab = "Price ($)", bg="lightblue", col="black", cex=1.1, pch=21, frame=FALSE, title(main = "Residuals seen in terms of the linear fit regression line"))
abline(fitval,lwd=2)
for(i in 1:n)
lines(c(x[i], x[i]), c(y[i], yhat[i]), col="purple", lwd=2)

#Plot-2
plot(x, e, xlab = "Carats (Weight)", ylab = "Price ($)", bg="lightblue", col="black", cex=2, pch=21, frame=FALSE, title(main = "Residual True representation"))
abline(h=0, lwd=2)
for(i in 1 :n)
lines(c(x[i],x[i]), c(e[i], 0), col="red", lwd=2)

# Sum of Residuals
sum(e)
## [1] -1.865175e-14
# Sum of Residuals multiplied with the predictors
sum(e*x)
## [1] 6.959711e-15
Heteroskedsticity
- The variability of the data incresing along the horizontal line , y=0 , as x increases
- This porperty can be seen clearly with the help of a residual graph
e<-c(resid(lm(price ~ 1, data=diamond)), resid(lm(price~carat, data=diamond)))
fitval1<-factor(c(rep("Itc", nrow(diamond)),
rep("Itc, Slope", nrow(diamond))))
library(ggplot2)
g<-ggplot(data.frame(e=e, fit=fitval1),aes(y=e, x=fitval1, fill=fitval1))
g<-g+ geom_dotplot(binaxis = "y", size=2, stackdir = "center", binwidth = 20)
g<-g + xlab("Fitting approach") + ylab("Residual Price")
g
