#This problem involves the use of multiple
#linear regression on the Auto data set located in the ISLR package.
library(ISLR)
# Keep a working copy of the Auto data set shipped with the ISLR package.
data <- ISLR::Auto
# Per-column summary statistics of the working copy.
summary(object = data)
## mpg cylinders displacement horsepower
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0
##
## weight acceleration year origin
## Min. :1613 Min. : 8.00 Min. :70.00 Min. :1.000
## 1st Qu.:2225 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000
## Median :2804 Median :15.50 Median :76.00 Median :1.000
## Mean :2978 Mean :15.54 Mean :75.98 Mean :1.577
## 3rd Qu.:3615 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :5140 Max. :24.80 Max. :82.00 Max. :3.000
##
## name
## amc matador : 5
## ford pinto : 5
## toyota corolla : 5
## amc gremlin : 4
## amc hornet : 4
## chevrolet chevette: 4
## (Other) :365
#1. (5 pts.) Compute the correlation matrix between the variables using the function cor().
#You will need to exclude the name variable, which is qualitative.
#[Hint: To exclude the name variable, take a look at the subset() function.]
# Drop the qualitative `name` column so that only numeric variables remain.
df <- subset(data, select = -name)
# Pairwise Pearson correlations between all remaining variables.
cor(df, method = "pearson")
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
#Perform a multiple linear regression with mpg as the response and all other variables except name
#as the predictors. Use the summary() function to print the results. Comment on the output.
#(a) Is there a relationship between the predictors and the response?
#(b) Which predictors appear to have a statistically significant relationship to the response?
#(c) What does the coefficient for the year variable suggest?
# Regress mpg on every other column of df (`.` stands for all remaining
# variables).
lrm <- lm(formula = mpg ~ ., data = df)
# Coefficient table, residual summary, R-squared and overall F-test.
summary(lrm)
##
## Call:
## lm(formula = mpg ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
#(a) Is there a relationship between the predictors and the response?
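#Yes. The F-statistic is 252.4 on 7 and 384 DF with a p-value < 2.2e-16, so we reject the null
#hypothesis that all coefficients are zero: at least one predictor is related to mpg.
#The model explains about 82% of the variance in mpg (Multiple R-squared: 0.8215).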
#(b) Which predictors appear to have a statistically significant relationship to the response?
#displacement, weight, year and origin are statistically significant for mpg (p-value < 0.05);
#cylinders, horsepower and acceleration are not (p-value > 0.05).
#(c) What does the coefficient for the year variable suggest?
#The year coefficient is positive (0.750773) and significant (p-value < 2e-16): holding the other
#predictors fixed, each additional model year is associated with about 0.75 more mpg,
#i.e. newer cars tend to be more fuel efficient.
#Produce diagnostic plots of the linear regression fit.
#Comment on any problems you see with the fit. Do the residual plots suggest a non-linear relationship?
#Do the residual plots suggest any unusually large outliers?
# Arrange the lm diagnostic plots in a 2 x 2 grid. par() returns the previous
# settings, which are restored afterwards so that later plots are not left in
# the grid layout by accident.
old_par <- par(mfrow = c(2, 2))
plot(lrm)
par(old_par)

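#1st plot (residuals vs fitted) shows a curved pattern, which suggests a non-linear relationship
#that the linear model does not capture.
#2nd plot (Q-Q plot): the residuals are roughly normal but right-skewed in the upper tail, and
#some outliers exist (the largest residual, 13.06, is almost 4 residual standard errors).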
#3rd plot shows the residuals appear randomly spread.
#4th plot shows the typical look when there is no influential case, or cases.
#You can barely see Cook’s distance lines (a red dashed line) because all cases are well
#inside of the Cook’s distance lines.
#Let’s try to improve our model. Use the * and : symbols to fit linear regression
#models with interaction effects.Do any interactions appear to be statistically significant?
#From the correlation matrix, we obtained the two highest correlated pairs and used them in
#picking interaction effects.
# Interaction model: `a * b` expands to a + b + a:b, so this fits the main
# effects of cylinders, displacement and weight plus the
# cylinders:displacement and displacement:weight interactions.
lrm2 <- lm(
  mpg ~ cylinders * displacement + displacement * weight,
  data = df
)
# Coefficient table used to judge whether the interaction terms matter.
summary(lrm2)
##
## Call:
## lm(formula = mpg ~ cylinders * displacement + displacement *
## weight, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.2934 -2.5184 -0.3476 1.8399 17.7723
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.262e+01 2.237e+00 23.519 < 2e-16 ***
## cylinders 7.606e-01 7.669e-01 0.992 0.322
## displacement -7.351e-02 1.669e-02 -4.403 1.38e-05 ***
## weight -9.888e-03 1.329e-03 -7.438 6.69e-13 ***
## cylinders:displacement -2.986e-03 3.426e-03 -0.872 0.384
## displacement:weight 2.128e-05 5.002e-06 4.254 2.64e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.103 on 386 degrees of freedom
## Multiple R-squared: 0.7272, Adjusted R-squared: 0.7237
## F-statistic: 205.8 on 5 and 386 DF, p-value: < 2.2e-16
#As the p-values show, the interaction between displacement and weight is statistically
#significant, while the interaction between cylinders and displacement is not.
#5. Try a few different transformations of the variables, such as log(x), sqrt(x), x^2.
#Did any of your transformations yield a better fitting model? Comment on your findings
# Plot mpg against three transformations of horsepower in one 2 x 2 grid (the
# fourth panel stays empty), then restore the previous par() settings so the
# grid layout does not leak into later plots.
old_par <- par(mfrow = c(2, 2))
plot(log(Auto$horsepower), Auto$mpg)
plot(sqrt(Auto$horsepower), Auto$mpg)
plot((Auto$horsepower)^2, Auto$mpg)
par(old_par)
#We limit ourselves to examining “horsepower” as sole predictor.
#It seems that the log transformation gives the most linear looking plot.
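#More horsepower goes with lower mpg and less horsepower with higher mpg: the relationship is
#negative (inverse), consistent with the correlation of -0.778 between mpg and horsepower.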
