#(6.1) Fit a model to evaluate whether the relationship between city.mpg and wt is the same in domestic and foreign cars.
# readr supplies read_csv(), which loads the analysis dataset from the URL below.
library(readr)
car_DF <- read_csv(
  file = "http://www.users.miamioh.edu/baileraj/classes/sta363/cars-csv.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   manuf = col_character(),
##   model = col_character(),
##   type = col_character(),
##   min.price = col_double(),
##   mid.price = col_double(),
##   max.price = col_double(),
##   eng.size = col_double(),
##   fuel.tank = col_double(),
##   rear.seat = col_double()
## )
## See spec(...) for full column specifications.
# Confirm the column headings of the imported dataset.
colnames(car_DF)
##  [1] "obs"          "manuf"        "model"        "type"        
##  [5] "min.price"    "mid.price"    "max.price"    "city.mpg"    
##  [9] "hwy.mpg"      "air.bags"     "drive"        "n.cyl"       
## [13] "eng.size"     "horse.pwr"    "rpm"          "eng.rev"     
## [17] "man.tran"     "fuel.tank"    "n.passengers" "length"      
## [21] "wheelbase"    "width"        "Uturn"        "rear.seat"   
## [25] "luggage"      "wt"           "domestic"
library(dplyr)#Load a package from which the mutate function can be used
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Note: `domestic` is already numeric in the data frame; an explicit indicator
# variable is still derived from it here.
#   Idomestic = 1 when domestic == 1, otherwise 0
#   IDwt      = Idomestic * wt (indicator-by-weight interaction term)
# na.omit() then removes every row that has a missing value in ANY column,
# not only in the columns used by the models below.
car_DF2 <- na.omit(
  mutate(
    car_DF,
    Idomestic = ifelse(domestic %in% 1, 1, 0),
    IDwt = Idomestic * wt
  )
)
# Inspect the augmented data frame in the data viewer. View() needs an
# interactive front end, so it is skipped when the script runs
# non-interactively (e.g. Rscript or knitting), where it may fail.
if (interactive()) {
  View(car_DF2)
}
# Modeling
# Reduced model: a single intercept and a single wt slope shared by all cars,
# regardless of origin.
car_fit1 <- lm(formula = city.mpg ~ wt, data = car_DF2)
# Full model: Idomestic (1 = domestic) shifts the intercept and
# IDwt (= Idomestic * wt) shifts the wt slope, so domestic and foreign cars may
# follow different lines. Comparing it with the reduced model tests whether the
# city.mpg-wt relationship is the same for both origins.
car_fit2 <- lm(formula = city.mpg ~ wt + Idomestic + IDwt, data = car_DF2)
# Report the summary of the reduced model.
summary(car_fit1)
## 
## Call:
## lm(formula = city.mpg ~ wt, data = car_DF2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5571 -1.9507 -0.0602  1.1697 13.6431 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 47.7686025  1.8451263   25.89   <2e-16 ***
## wt          -0.0082603  0.0006068  -13.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.091 on 80 degrees of freedom
## Multiple R-squared:  0.6985, Adjusted R-squared:  0.6947 
## F-statistic: 185.3 on 1 and 80 DF,  p-value: < 2.2e-16
# Report the summary of the full model (separate intercept and slope by origin).
summary(object = car_fit2)
## 
## Call:
## lm(formula = city.mpg ~ wt + Idomestic + IDwt, data = car_DF2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.0605 -1.2179 -0.1448  1.2378 12.3033 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.338e+01  2.495e+00  21.396  < 2e-16 ***
## wt          -1.008e-02  8.609e-04 -11.707  < 2e-16 ***
## Idomestic   -1.217e+01  3.577e+00  -3.403  0.00105 ** 
## IDwt         3.787e-03  1.181e-03   3.207  0.00195 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.912 on 78 degrees of freedom
## Multiple R-squared:  0.739,  Adjusted R-squared:  0.729 
## F-statistic: 73.63 on 3 and 78 DF,  p-value: < 2.2e-16
# Partial F-test of the reduced model against the full model: do Idomestic and
# IDwt jointly improve the fit for city.mpg beyond wt alone?
anova(car_fit1, car_fit2, test = "F")
## Analysis of Variance Table
## 
## Model 1: city.mpg ~ wt
## Model 2: city.mpg ~ wt + Idomestic + IDwt
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)   
## 1     80 764.24                                
## 2     78 661.41  2    102.83 6.0631 0.003569 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
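#Model: city.mpg = \beta_0 + \beta_1 wt + \beta_2 Idomestic + \beta_3 IDwt + \epsilon, where IDwt = Idomestic*wt
#Thus, \beta_{0F} = \beta_0, \beta_{1F} = \beta_1, and \beta_{0D} = \beta_0 + \beta_2, \beta_{1D} = \beta_1 + \beta_3 (F = Foreign cars, D = Domestic cars)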
#Hypothesis
#H0: \beta_2 = \beta_3 = 0
#H1: at least one of \beta_2, \beta_3 \neq 0
#Test statistic: F(2, 78) = 6.063, p-value = 0.0036 (< 0.005)
#Result: The small p-value shows that at least one of the additional predictors (Idomestic, IDwt) is needed in a model that already contains wt; that is, \beta_2 and \beta_3 are not both 0. I therefore reject the null hypothesis and conclude that the relationship between city.mpg and wt is not the same for foreign and domestic cars.


#(6.2) Use backwards elimination to select a model to predict city.mpg as a function of n.cyl, eng.size, horse.pwr, rpm, and wt.
# Full model with all five candidate predictors; backward elimination starts
# from this fit.
full_model <- lm(
  formula = city.mpg ~ n.cyl + eng.size + horse.pwr + rpm + wt,
  data = car_DF2
)

# Intercept-only model. Backward elimination does not need it; it is fitted
# only so its output can be inspected.
null_model <- lm(formula = city.mpg ~ 1, data = car_DF2)
summary(full_model)
## 
## Call:
## lm(formula = city.mpg ~ n.cyl + eng.size + horse.pwr + rpm + 
##     wt, data = car_DF2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3066 -1.5760 -0.1778  1.0882 13.3889 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 40.965319   7.998928   5.121 2.23e-06 ***
## n.cyl       -0.058885   0.632454  -0.093    0.926    
## eng.size     1.778100   1.315093   1.352    0.180    
## horse.pwr   -0.012842   0.021599  -0.595    0.554    
## rpm          0.001502   0.001169   1.285    0.203    
## wt          -0.009505   0.001970  -4.824 7.09e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.119 on 76 degrees of freedom
## Multiple R-squared:  0.7082, Adjusted R-squared:  0.689 
## F-statistic: 36.89 on 5 and 76 DF,  p-value: < 2.2e-16
# Report the summary of the intercept-only model.
summary(object = null_model)
## 
## Call:
## lm(formula = city.mpg ~ 1, data = car_DF2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.085 -4.085 -1.085  1.915 22.915 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  23.0854     0.6177   37.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.594 on 81 degrees of freedom
# Backward elimination: starting from full_model, step() compares the AIC of
# dropping each term, removes the term giving the lowest AIC, and stops once no
# removal lowers the AIC. The result is not assigned, so the selected model is
# printed after the trace.
step(object = full_model, direction = "backward")
## Start:  AIC=192.34
## city.mpg ~ n.cyl + eng.size + horse.pwr + rpm + wt
## 
##             Df Sum of Sq    RSS    AIC
## - n.cyl      1     0.084 739.63 190.35
## - horse.pwr  1     3.440 742.99 190.72
## - rpm        1    16.063 755.61 192.11
## - eng.size   1    17.789 757.34 192.29
## <none>                   739.55 192.34
## - wt         1   226.422 965.97 212.25
## 
## Step:  AIC=190.35
## city.mpg ~ eng.size + horse.pwr + rpm + wt
## 
##             Df Sum of Sq    RSS    AIC
## - horse.pwr  1     4.217 743.85 188.82
## - rpm        1    16.429 756.06 190.16
## <none>                   739.63 190.35
## - eng.size   1    20.985 760.62 190.65
## - wt         1   231.564 971.19 210.69
## 
## Step:  AIC=188.82
## city.mpg ~ eng.size + rpm + wt
## 
##            Df Sum of Sq     RSS    AIC
## - rpm       1     14.69  758.54 188.42
## - eng.size  1     16.79  760.63 188.65
## <none>                   743.85 188.82
## - wt        1    360.62 1104.47 219.23
## 
## Step:  AIC=188.42
## city.mpg ~ eng.size + wt
## 
##            Df Sum of Sq     RSS    AIC
## - eng.size  1      5.70  764.24 187.04
## <none>                   758.54 188.42
## - wt        1    360.88 1119.42 218.34
## 
## Step:  AIC=187.04
## city.mpg ~ wt
## 
##        Df Sum of Sq     RSS    AIC
## <none>               764.24 187.04
## - wt    1    1770.2 2534.40 283.34
## 
## Call:
## lm(formula = city.mpg ~ wt, data = car_DF2)
## 
## Coefficients:
## (Intercept)           wt  
##    47.76860     -0.00826
#The final model selected by backward elimination has the equation:
#mean city.mpg = 47.7686 - 0.00826*wt (AIC = 187.04)