#This problem involves the use of multiple
#linear regression on the Auto data set located in the ISLR package.
library(ISLR)
#Copy the Auto data set (provided by the ISLR package) into the variable `data`.
data <- Auto
#Print per-column summary statistics for the data set.
summary(data)
##       mpg          cylinders      displacement     horsepower   
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0  
##                                                                 
##      weight      acceleration        year           origin     
##  Min.   :1613   Min.   : 8.00   Min.   :70.00   Min.   :1.000  
##  1st Qu.:2225   1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000  
##  Median :2804   Median :15.50   Median :76.00   Median :1.000  
##  Mean   :2978   Mean   :15.54   Mean   :75.98   Mean   :1.577  
##  3rd Qu.:3615   3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000  
##  Max.   :5140   Max.   :24.80   Max.   :82.00   Max.   :3.000  
##                                                                
##                  name    
##  amc matador       :  5  
##  ford pinto        :  5  
##  toyota corolla    :  5  
##  amc gremlin       :  4  
##  amc hornet        :  4  
##  chevrolet chevette:  4  
##  (Other)           :365
#1. (5 pts.)  Compute the correlation matrix between the variables using the function cor().  
#You will need to exclude the name variable, which is qualitative. 
#[Hint:  To exclude the name variable, take a look at the subset() function.]
#Drop the qualitative `name` column so only the numeric variables remain,
#then print their pairwise Pearson correlation matrix.
df <- subset(data, select = -name)
cor(df, method = "pearson")
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
##              acceleration       year     origin
## mpg             0.4233285  0.5805410  0.5652088
## cylinders      -0.5046834 -0.3456474 -0.5689316
## displacement   -0.5438005 -0.3698552 -0.6145351
## horsepower     -0.6891955 -0.4163615 -0.4551715
## weight         -0.4168392 -0.3091199 -0.5850054
## acceleration    1.0000000  0.2903161  0.2127458
## year            0.2903161  1.0000000  0.1815277
## origin          0.2127458  0.1815277  1.0000000
#Perform a multiple linear regression with mpg as the response and all other variables except name 
#as the predictors.Use the summary() function to print the results.  Comment on the output.
#(a)    Is there a relationship between the predictors and the response?
#(b)    Which predictors appear to have a statistically significant relationship to the response?
#(c)    What does the coefficient for the year variable suggest?
#Regress mpg on every other column of df (the `.` on the right-hand side stands
#for all remaining variables) and print the fitted model's summary.
lrm <- lm(formula = mpg ~ ., data = df)
summary(lrm)
## 
## Call:
## lm(formula = mpg ~ ., data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5903 -2.1565 -0.1169  1.8690 13.0604 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -17.218435   4.644294  -3.707  0.00024 ***
## cylinders     -0.493376   0.323282  -1.526  0.12780    
## displacement   0.019896   0.007515   2.647  0.00844 ** 
## horsepower    -0.016951   0.013787  -1.230  0.21963    
## weight        -0.006474   0.000652  -9.929  < 2e-16 ***
## acceleration   0.080576   0.098845   0.815  0.41548    
## year           0.750773   0.050973  14.729  < 2e-16 ***
## origin         1.426141   0.278136   5.127 4.67e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared:  0.8215, Adjusted R-squared:  0.8182 
## F-statistic: 252.4 on 7 and 384 DF,  p-value: < 2.2e-16
#(a)    Is there a relationship between the predictors and the response?
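#Yes. The overall F-statistic is 252.4 on 7 and 384 DF with p-value < 2.2e-16, so we reject the
#hypothesis that all slope coefficients are zero: at least one predictor is related to mpg.
#Individually, displacement, weight, year and origin have p-values below 0.05, whereas
#cylinders, horsepower and acceleration have p-values greater than 0.05.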
#(b)    Which predictors appear to have a statistically significant relationship to the response?
#displacement, weight, year and origin are statistically significant for mpg (p-value < 0.05);
#cylinders, horsepower and acceleration are not.
#(c)    What does the coefficient for the year variable suggest?
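#The year coefficient is positive (0.750773) and its p-value is far below 0.05, so it is
#significant for mpg: holding the other predictors fixed, each additional model year is
#associated with about 0.75 more mpg. mpg increases with year (newer cars are more fuel efficient).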

#Produce diagnostic plots of the linear regression fit.
#Comment on any problems you see with the fit. Do the residual plots suggest a non-linear relationship?  
#Do the residual plots suggest any unusually large outliers?
#Lay the device out as a 2 x 2 grid so the four default lm diagnostic panels
#(residuals vs fitted, normal Q-Q, scale-location, residuals vs leverage)
#are drawn on a single page.
par(mfrow = c(2, 2))
plot(lrm, which = c(1, 2, 3, 5))

#1st plot (residuals vs fitted): the residuals follow a curve, which suggests a non-linear
#relationship that the linear fit does not capture.
#2nd plot (normal Q-Q): the residuals look fairly normal overall but are right-skewed,
#and some outliers do exist.
#3rd plot shows the residuals appear randomly spread.
#4th plot shows the typical look when there is no influential case, or cases.
#You can barely see Cook’s distance lines (a red dashed line) because all cases are well 
#inside of the Cook’s distance lines.

#Let’s try to improve our model.  Use the * and : symbols to fit linear regression 
#models with interaction effects.Do any interactions appear to be statistically significant?
#From the correlation matrix, we obtained the two highest correlated pairs and used them in 
#picking interaction effects.
#Fit mpg on cylinders, displacement and weight together with the
#cylinders:displacement and displacement:weight interactions
#(`a * b` in a formula expands to a + b + a:b), then print the summary.
lrm2 <- lm(
  formula = mpg ~ cylinders * displacement + displacement * weight,
  data = df
)
summary(lrm2)
## 
## Call:
## lm(formula = mpg ~ cylinders * displacement + displacement * 
##     weight, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.2934  -2.5184  -0.3476   1.8399  17.7723 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             5.262e+01  2.237e+00  23.519  < 2e-16 ***
## cylinders               7.606e-01  7.669e-01   0.992    0.322    
## displacement           -7.351e-02  1.669e-02  -4.403 1.38e-05 ***
## weight                 -9.888e-03  1.329e-03  -7.438 6.69e-13 ***
## cylinders:displacement -2.986e-03  3.426e-03  -0.872    0.384    
## displacement:weight     2.128e-05  5.002e-06   4.254 2.64e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.103 on 386 degrees of freedom
## Multiple R-squared:  0.7272, Adjusted R-squared:  0.7237 
## F-statistic: 205.8 on 5 and 386 DF,  p-value: < 2.2e-16
#As the p-values show, the interaction between displacement and weight is
#statistically significant, while the interaction between cylinders and displacement is not.

#5. Try a few different transformations of the variables, such as log(x), sqrt(x), x^2.
#Did any of your transformations yield a better fitting model?  Comment on your findings
#Scatter mpg against three transformations of horsepower (log, square root,
#square) on a shared 2 x 2 plotting grid; the fourth cell is left unused.
par(mfrow = c(2, 2))
plot(x = log(Auto$horsepower), y = Auto$mpg)
plot(x = sqrt(Auto$horsepower), y = Auto$mpg)
plot(x = (Auto$horsepower)^2, y = Auto$mpg)
#We limit ourselves to examining “horsepower” as sole predictor. 
#It seems that the log transformation gives the most linear looking plot.
#mpg falls as horsepower rises (their correlation is -0.778): more horsepower means less mpg,
#so the relationship is inverse rather than direct.