library("ggplot2")
library("GGally")

library("gridExtra")
#Part B: Residuals and Transforms
#Exercise 1: Residuals Plot
#a - Obtain plot
#Obtain the Residuals vs Fitted Plot of the fitted model with an added horizontal line at 
#y=0
#Do the points look randomly distributed about the line?

ToyotaPrices <- read.csv("C:/Users/aksha/Downloads/ToyotaPrices.csv")
names(ToyotaPrices)
##  [1] "Id"               "Price"            "Age_08_04"       
##  [4] "Mfg_Month"        "Mfg_Year"         "KM"              
##  [7] "HP"               "Automatic"        "cc"              
## [10] "Doors"            "Cylinders"        "Gears"           
## [13] "Quarterly_Tax"    "Weight"           "Mfr_Guarantee"   
## [16] "BOVAG_Guarantee"  "Guarantee_Period" "ABS"             
## [19] "Airbag_1"         "Airbag_2"         "Airco"           
## [22] "Automatic_airco"  "Boardcomputer"    "CD_Player"       
## [25] "Central_Lock"     "Powered_Windows"  "Power_Steering"  
## [28] "Radio"            "Mistlamps"        "Sport_Model"     
## [31] "Backseat_Divider" "Metallic_Rim"     "Radio_cassette"  
## [34] "Tow_Bar"
myData_PKWT = subset(ToyotaPrices, select = c(Price, KM, Weight, Tow_Bar))

#Exercise 1
fit = lm(Price~ KM + Weight + Tow_Bar, data= myData_PKWT)
plot(fit$fitted.values,fit$residuals)
abline(h=0,v=NULL)

#yes the points look randomly distributed about the line.

#Exercise 2
# b - Obtain plot using the z-scores of the resiudals
# Repeat the Residuals vs Fitted Plot using z-scores of the residuals. 
# Add empirical rule horizontal lines at 
# +2 and ???2. Use these lines to judge whether or not the residuals are normal or
# there are outliers. Point out any outliers. Point out any floor or ceiling effects. Do you think the residuals are normal?

x=fit$residuals
mean(x)
## [1] 1.60432e-12
y=(x-mean(x))/sd(x)
plot(y,fit$fitted.values)
abline(h=NULL,v=-2)
abline(h=NULL,v=2)

mod= fortify(fit)
plot1= qplot(.stdresid, data= mod , geom= "histogram")
plot2= qplot(.stdresid, data= mod , geom= "density")
plot3= qplot(sample=.stdresid, data= mod , geom= "qq") + geom_abline()

grid.arrange(plot1,plot2,plot3,nrow=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#The residuals  normal and there are outliers that are present as well.


plot(floor(y),fit$fitted.values)

#If we have empirical rule between +2 and -2 then we have many outliers.


plot(ceiling(y),fit$fitted.values)

#If we have empirical rule betwwen +2 and -2 then we just have less outliers.

#Exercise 2

#2a
#a - The Plot
#Obtain the normal probability QQ-Plot of the residuals.

qqnorm(fit$residuals) 
qqline(fit$residuals) 

#2b - Normality
#Do the residuals look normal?
#the residuals look normal but there are some outliers


#Exercise 3

#3a - Obtain the composite goodness-of-fit plots
#Obtain the composite goodness-of-fit plots for the fitted model. What plots involve the residuals? Do the Residuals vs Fitted Plot and the Normal QQ-Plot look about the same as those obtained earlier?

plot(fit)

#Residuals vs Fitted Plots & Residuals vs leverage Plot involves residuals
#the Residuals vs Fitted Plot and the Normal QQ-PLot are the same as those obtained earlier

#3b-b - Outliers
#We examine the Residuals vs Leverage Plot in the composite goodness-of-fit plots. Outliers points will be identified by their row name. Are there any outliers? If so, what are their row names.
#Yes there are outliers
#602,961,222 are the row names of the outliers