#Greeshma Ganji
#ISTE 780
#Summer 2023

#PART-I
# Load the Auto data set from the local Lab 2 folder.
Auto <- read.csv(
  file = "/Users/greeshmaganji/RIT/ISTE780/Lab2/Auto.csv",
  header = TRUE
)
# Optional interactive checks, left disabled:
# View(Auto)
# names(Auto)
# Per-column summary of the freshly loaded data.
summary(object = Auto)
##       mpg          cylinders      displacement    horsepower       
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Length:397        
##  1st Qu.:17.50   1st Qu.:4.000   1st Qu.:104.0   Class :character  
##  Median :23.00   Median :4.000   Median :146.0   Mode  :character  
##  Mean   :23.52   Mean   :5.458   Mean   :193.5                     
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:262.0                     
##  Max.   :46.60   Max.   :8.000   Max.   :455.0                     
##      weight      acceleration        year           origin     
##  Min.   :1613   Min.   : 8.00   Min.   :70.00   Min.   :1.000  
##  1st Qu.:2223   1st Qu.:13.80   1st Qu.:73.00   1st Qu.:1.000  
##  Median :2800   Median :15.50   Median :76.00   Median :1.000  
##  Mean   :2970   Mean   :15.56   Mean   :75.99   Mean   :1.574  
##  3rd Qu.:3609   3rd Qu.:17.10   3rd Qu.:79.00   3rd Qu.:2.000  
##  Max.   :5140   Max.   :24.80   Max.   :82.00   Max.   :3.000  
##      name          
##  Length:397        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Encode the car name as integer codes so that every column is numeric.
# The codes are arbitrary labels (one per distinct name); `name` is excluded
# from the correlation matrix and from every model fitted below.
Auto[, 9] <- as.numeric(factor(Auto[, 9]))

# horsepower was read as character (see the summary above), so at least one
# entry is not a plain number -- presumably the "?" missing-value markers of
# the ISLR Auto.csv; confirm against the file.
# as.numeric(factor(x)) must NOT be used here: it returns the position of each
# value among the sorted factor levels, not the horsepower itself, which
# scrambles the variable. Parse the text as numbers instead; entries that
# cannot be parsed become NA (the coercion warning is expected, so it is
# silenced for this one call only).
Auto[, 4] <- suppressWarnings(as.numeric(as.character(Auto[, 4])))

# Drop the rows with unknown horsepower so that cor() and the plots below
# work on complete cases (lm() would discard these rows anyway).
# NOTE(review): the pasted `##` output further down was produced with the old
# level-code encoding on all 397 rows and has to be regenerated.
Auto <- Auto[!is.na(Auto[, 4]), ]

# 1a) Scatterplot matrix covering every column of the data set.
pairs(x = Auto)

# View(Auto)

# 1b) Pairwise correlation matrix, via cor(), of all columns except `name`.
cor(x = Auto[, setdiff(names(Auto), "name")])
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7762599   -0.8044430  0.4228227 -0.8317389
## cylinders    -0.7762599  1.0000000    0.9509199 -0.5466585  0.8970169
## displacement -0.8044430  0.9509199    1.0000000 -0.4820705  0.9331044
## horsepower    0.4228227 -0.5466585   -0.4820705  1.0000000 -0.4821507
## weight       -0.8317389  0.8970169    0.9331044 -0.4821507  1.0000000
## acceleration  0.4222974 -0.5040606   -0.5441618  0.2662877 -0.4195023
## year          0.5814695 -0.3467172   -0.3698041  0.1274167 -0.3079004
## origin        0.5636979 -0.5649716   -0.6106643  0.2973734 -0.5812652
##              acceleration       year     origin
## mpg             0.4222974  0.5814695  0.5636979
## cylinders      -0.5040606 -0.3467172 -0.5649716
## displacement   -0.5441618 -0.3698041 -0.6106643
## horsepower      0.2662877  0.1274167  0.2973734
## weight         -0.4195023 -0.3079004 -0.5812652
## acceleration    1.0000000  0.2829009  0.2100836
## year            0.2829009  1.0000000  0.1843141
## origin          0.2100836  0.1843141  1.0000000
# 1c) Multiple linear regression: mpg is the response and every other column
#     except name is a predictor.
Auto_fit_1 <- lm(formula = mpg ~ . - name, data = Auto)
summary(object = Auto_fit_1)
## 
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.629 -2.034 -0.046  1.801 13.010 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -2.128e+01  4.259e+00  -4.998 8.78e-07 ***
## cylinders    -2.927e-01  3.382e-01  -0.865   0.3874    
## displacement  1.603e-02  7.284e-03   2.201   0.0283 *  
## horsepower    7.942e-03  6.809e-03   1.166   0.2442    
## weight       -6.870e-03  5.799e-04 -11.846  < 2e-16 ***
## acceleration  1.539e-01  7.750e-02   1.986   0.0477 *  
## year          7.734e-01  4.939e-02  15.661  < 2e-16 ***
## origin        1.346e+00  2.691e-01   5.004 8.52e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.331 on 389 degrees of freedom
## Multiple R-squared:  0.822,  Adjusted R-squared:  0.8188 
## F-statistic: 256.7 on 7 and 389 DF,  p-value: < 2.2e-16
#   (i) Yes, there is a relationship between the predictors and the response.
#        The R-squared value of 0.822 indicates that the model explains 82.2% of the variance in mpg.
#        The F-statistic of 256.7 (p-value < 2.2e-16) also indicates a relationship between the predictors and the response.
#   (ii) displacement, weight, acceleration, year and origin appear to have a statistically
#        significant relationship to the response, as their p-values are less than 0.05.
#   (iii) The coefficient for the year variable (7.734e-01) suggests that the average effect of an increase of 1 year
#        is an increase of about 0.7734 in "mpg", holding the other predictors fixed; cars become more fuel efficient over time.


# 1d) Diagnostic plots of the linear regression fit, drawn with plot().
# A 2 x 2 panel layout lets the diagnostic plots share one page.
par(mfrow = c(2, 2))
plot(x = Auto_fit_1)

# A few observations stand out as unusual in the plots (e.g. 323, 320, 394).

# Leverage plot (which = 5).
plot(x = Auto_fit_1, which = 5)


# 1e) Linear regression models with interaction effects, built with the
#     * and : formula operators.
# cylinders * displacement expands to both main effects plus their
# interaction; displacement:weight contributes the interaction term only.
Auto_fit_2 <- lm(
  formula = mpg ~ cylinders * displacement + displacement:weight,
  data = Auto[, 1:8]
)
summary(object = Auto_fit_2)
## 
## Call:
## lm(formula = mpg ~ cylinders * displacement + displacement:weight, 
##     data = Auto[, 1:8])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.2630  -2.5881  -0.2766   2.1773  19.7551 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             5.028e+01  2.357e+00  21.336  < 2e-16 ***
## cylinders              -3.305e+00  5.766e-01  -5.732 1.98e-08 ***
## displacement           -1.150e-01  1.690e-02  -6.804 3.82e-11 ***
## cylinders:displacement  1.665e-02  2.376e-03   7.007 1.07e-11 ***
## displacement:weight    -1.044e-05  2.803e-06  -3.725 0.000224 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.396 on 392 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.6844 
## F-statistic: 215.7 on 4 and 392 DF,  p-value: < 2.2e-16
# All predictors except name, plus the displacement:weight interaction.
Auto_fit_3 <- lm(formula = mpg ~ . - name + displacement:weight, data = Auto)
summary(object = Auto_fit_3)
## 
## Call:
## lm(formula = mpg ~ . - name + displacement:weight, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.8561 -1.8167 -0.0141  1.7027 12.1594 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.130e+01  3.948e+00  -2.863  0.00443 ** 
## cylinders            2.463e-01  3.079e-01   0.800  0.42424    
## displacement        -7.153e-02  1.104e-02  -6.479  2.8e-10 ***
## horsepower           2.114e-03  6.129e-03   0.345  0.73029    
## weight              -1.127e-02  6.854e-04 -16.437  < 2e-16 ***
## acceleration         2.100e-01  6.966e-02   3.014  0.00275 ** 
## year                 8.181e-01  4.448e-02  18.394  < 2e-16 ***
## origin               4.428e-01  2.580e-01   1.716  0.08687 .  
## displacement:weight  2.212e-05  2.249e-06   9.833  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.984 on 388 degrees of freedom
## Multiple R-squared:  0.8575, Adjusted R-squared:  0.8546 
## F-statistic: 291.9 on 8 and 388 DF,  p-value: < 2.2e-16
#The interaction between displacement and weight is statistically significant (p-value < 2e-16).


# 1f) Different transformations of a variable: log(X), X^0.5 and X^2.
#     X is horsepower here; each transformed version is plotted against mpg.
par(mfrow = c(2, 2))

plot(
  x = log(Auto[["horsepower"]]), y = Auto[["mpg"]],
  xlab = "Log(X)", ylab = "mpg"
)
plot(
  x = sqrt(Auto[["horsepower"]]), y = Auto[["mpg"]],
  xlab = "sqrt(X)", ylab = "mpg"
)
plot(
  x = Auto[["horsepower"]]^2, y = Auto[["mpg"]],
  xlab = "X Square", ylab = "mpg"
)

#################################################################################################################################################
#PART-II
# Load the Carseats data set shipped with the ISLR package.
data(list = "Carseats", package = "ISLR")

# 2a) Multiple regression model predicting Sales from Price, Urban and US.
Carseats_1 <- lm(formula = Sales ~ Price + Urban + US, data = Carseats)
summary(object = Carseats_1)
## 
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9206 -1.6220 -0.0564  1.5786  7.0581 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.043469   0.651012  20.036  < 2e-16 ***
## Price       -0.054459   0.005242 -10.389  < 2e-16 ***
## UrbanYes    -0.021916   0.271650  -0.081    0.936    
## USYes        1.200573   0.259042   4.635 4.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2335 
## F-statistic: 41.52 on 3 and 396 DF,  p-value: < 2.2e-16
# 2b) The coefficients can be interpreted as follows (Sales is measured in thousands of units):
#     the average effect of a price increase of 1 dollar is a decrease of 54.459 units in sales;
#     on average, unit sales in an urban location are 21.916 units lower than in a rural location;
#     on average, unit sales in a US store are 1200.573 units higher than in a non-US store.
# The model can be written as follows:

# 2c) Sales = 13.043469 + (−0.054459) × Price + (−0.021916) × Urban + (1.200573) × US + e, where Urban and US equal 1 for "Yes" and 0 for "No"

# 2d) The null hypothesis can be rejected for Price and US (p-values below 0.05). For Urban the p-value (0.936) is greater than 0.05, so we cannot reject it.

# 2e) Smaller model that keeps only the predictors Price and US.
Carseats_2 <- lm(formula = Sales ~ Price + US, data = Carseats)
summary(object = Carseats_2)
## 
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9269 -1.6286 -0.0574  1.5766  7.0515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.03079    0.63098  20.652  < 2e-16 ***
## Price       -0.05448    0.00523 -10.416  < 2e-16 ***
## USYes        1.19964    0.25846   4.641 4.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2354 
## F-statistic: 62.43 on 2 and 397 DF,  p-value: < 2.2e-16
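# 2f) Both models are statistically significant overall (F-test p-value < 2.2e-16), but each explains only about 24% of the variance in Sales (R2 = 0.2393). The smaller model is slightly better than the bigger model, as it has a higher adjusted R2 (0.2354 vs 0.2335) and a lower residual standard error (2.469 vs 2.472).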

# 2g) 95% confidence intervals for the coefficients of the smaller model.
confint(object = Carseats_2, level = 0.95)
##                   2.5 %      97.5 %
## (Intercept) 11.79032020 14.27126531
## Price       -0.06475984 -0.04419543
## USYes        0.69151957  1.70776632