# (6.1) Fit a model to evaluate whether the relationship between city.mpg and wt is the same for domestic and foreign cars
# Load readr and read in the cars dataset used for the analysis.
library(readr)
car_DF <- read_csv(
  file = "http://www.users.miamioh.edu/baileraj/classes/sta363/cars-csv.csv"
)
## Parsed with column specification:
## cols(
## .default = col_integer(),
## manuf = col_character(),
## model = col_character(),
## type = col_character(),
## min.price = col_double(),
## mid.price = col_double(),
## max.price = col_double(),
## eng.size = col_double(),
## fuel.tank = col_double(),
## rear.seat = col_double()
## )
## See spec(...) for full column specifications.
# Check the column headings available in the dataset.
colnames(car_DF)
## [1] "obs" "manuf" "model" "type"
## [5] "min.price" "mid.price" "max.price" "city.mpg"
## [9] "hwy.mpg" "air.bags" "drive" "n.cyl"
## [13] "eng.size" "horse.pwr" "rpm" "eng.rev"
## [17] "man.tran" "fuel.tank" "n.passengers" "length"
## [21] "wheelbase" "width" "Uturn" "rear.seat"
## [25] "luggage" "wt" "domestic"
library(dplyr)#Load a package from which the mutate function can be used
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Build the analysis data set from car_DF by adding two columns:
#   Idomestic - 0/1 indicator created from the (already numeric) `domestic`
#               column: 1 where domestic is 1, otherwise 0.
#   IDwt      - the product Idomestic * wt, used later as the interaction term.
# NOTE(review): na.omit() drops every row that has a missing value in ANY
# column, not only the columns used in the models below -- confirm this is
# the intended sample.
car_DF2 <- car_DF %>%
  mutate(
    Idomestic = ifelse(domestic %in% c(1), 1, 0),
    IDwt = Idomestic * wt
  ) %>%
  na.omit()
# View() opens a data viewer, which may be unavailable or fail when the script
# is run non-interactively (e.g. Rscript or knitting), so only call it in an
# interactive session.
if (interactive()) {
  View(car_DF2)
}
# Modeling ----
# Reduced model: a single intercept and a single wt slope shared by all cars.
car_fit1 <- lm(formula = city.mpg ~ wt, data = car_DF2)
# Full model: Idomestic lets the intercept, and IDwt lets the wt slope, differ
# for cars coded Idomestic = 1, so the two origins can be compared.
car_fit2 <- lm(formula = city.mpg ~ wt + Idomestic + IDwt, data = car_DF2)
# Report the model summary for the reduced model.
summary(car_fit1)
##
## Call:
## lm(formula = city.mpg ~ wt, data = car_DF2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.5571 -1.9507 -0.0602 1.1697 13.6431
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.7686025 1.8451263 25.89 <2e-16 ***
## wt -0.0082603 0.0006068 -13.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.091 on 80 degrees of freedom
## Multiple R-squared: 0.6985, Adjusted R-squared: 0.6947
## F-statistic: 185.3 on 1 and 80 DF, p-value: < 2.2e-16
# Report the model summary for the full model.
summary(object = car_fit2)
##
## Call:
## lm(formula = city.mpg ~ wt + Idomestic + IDwt, data = car_DF2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.0605 -1.2179 -0.1448 1.2378 12.3033
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.338e+01 2.495e+00 21.396 < 2e-16 ***
## wt -1.008e-02 8.609e-04 -11.707 < 2e-16 ***
## Idomestic -1.217e+01 3.577e+00 -3.403 0.00105 **
## IDwt 3.787e-03 1.181e-03 3.207 0.00195 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.912 on 78 degrees of freedom
## Multiple R-squared: 0.739, Adjusted R-squared: 0.729
## F-statistic: 73.63 on 3 and 78 DF, p-value: < 2.2e-16
# Compare the reduced and full models to check whether the additional
# predictors (Idomestic, IDwt) have an effect on city.mpg.
anova(car_fit1, car_fit2)
## Analysis of Variance Table
##
## Model 1: city.mpg ~ wt
## Model 2: city.mpg ~ wt + Idomestic + IDwt
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 80 764.24
## 2 78 661.41 2 102.83 6.0631 0.003569 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model: city.mpg = \beta_0 + \beta_1 wt + \beta_2 Idomestic + \beta_3 IDwt + \epsilon
# Thus, \beta_{0F} = \beta_0, \beta_{1F} = \beta_1, and \beta_{0D} = \beta_0 + \beta_2, \beta_{1D} = \beta_1 + \beta_3 (F = foreign cars, D = domestic cars)
# Hypotheses
# H0: \beta_2 = \beta_3 = 0
# H1: \beta_2 \neq 0 or \beta_3 \neq 0 (at least one is nonzero)
# Test statistic: F = 6.063 on 2 and 78 df, p-value = 0.0036 (< 0.005)
# Result: The p-value of the test statistic shows that at least one of the additional predictors is needed in a model that already contains wt, i.e. \beta_2 and \beta_3 are not both zero. I therefore reject the null hypothesis and conclude that the relationship between city.mpg and wt is not the same for foreign and domestic cars.
#(6.2) Use backwards elimination to select a model to predict city.mpg as a function of n.cyl, eng.size, horse.pwr, rpm, and wt.
# Full model: the starting point to which backward selection is applied.
full_model <- lm(
  formula = city.mpg ~ n.cyl + eng.size + horse.pwr + rpm + wt,
  data = car_DF2
)
# Intercept-only model; not needed for backward elimination, fitted only so
# its output can be inspected.
null_model <- lm(formula = city.mpg ~ 1, data = car_DF2)
summary(full_model)
##
## Call:
## lm(formula = city.mpg ~ n.cyl + eng.size + horse.pwr + rpm +
## wt, data = car_DF2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3066 -1.5760 -0.1778 1.0882 13.3889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.965319 7.998928 5.121 2.23e-06 ***
## n.cyl -0.058885 0.632454 -0.093 0.926
## eng.size 1.778100 1.315093 1.352 0.180
## horse.pwr -0.012842 0.021599 -0.595 0.554
## rpm 0.001502 0.001169 1.285 0.203
## wt -0.009505 0.001970 -4.824 7.09e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.119 on 76 degrees of freedom
## Multiple R-squared: 0.7082, Adjusted R-squared: 0.689
## F-statistic: 36.89 on 5 and 76 DF, p-value: < 2.2e-16
# Report the model summary for the intercept-only model.
summary(object = null_model)
##
## Call:
## lm(formula = city.mpg ~ 1, data = car_DF2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.085 -4.085 -1.085 1.915 22.915
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.0854 0.6177 37.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.594 on 81 degrees of freedom
# Backward elimination, starting from the full model; the trace printed by
# step() reports the AIC of each candidate model.
step(object = full_model, direction = "backward")
## Start: AIC=192.34
## city.mpg ~ n.cyl + eng.size + horse.pwr + rpm + wt
##
## Df Sum of Sq RSS AIC
## - n.cyl 1 0.084 739.63 190.35
## - horse.pwr 1 3.440 742.99 190.72
## - rpm 1 16.063 755.61 192.11
## - eng.size 1 17.789 757.34 192.29
## <none> 739.55 192.34
## - wt 1 226.422 965.97 212.25
##
## Step: AIC=190.35
## city.mpg ~ eng.size + horse.pwr + rpm + wt
##
## Df Sum of Sq RSS AIC
## - horse.pwr 1 4.217 743.85 188.82
## - rpm 1 16.429 756.06 190.16
## <none> 739.63 190.35
## - eng.size 1 20.985 760.62 190.65
## - wt 1 231.564 971.19 210.69
##
## Step: AIC=188.82
## city.mpg ~ eng.size + rpm + wt
##
## Df Sum of Sq RSS AIC
## - rpm 1 14.69 758.54 188.42
## - eng.size 1 16.79 760.63 188.65
## <none> 743.85 188.82
## - wt 1 360.62 1104.47 219.23
##
## Step: AIC=188.42
## city.mpg ~ eng.size + wt
##
## Df Sum of Sq RSS AIC
## - eng.size 1 5.70 764.24 187.04
## <none> 758.54 188.42
## - wt 1 360.88 1119.42 218.34
##
## Step: AIC=187.04
## city.mpg ~ wt
##
## Df Sum of Sq RSS AIC
## <none> 764.24 187.04
## - wt 1 1770.2 2534.40 283.34
##
## Call:
## lm(formula = city.mpg ~ wt, data = car_DF2)
##
## Coefficients:
## (Intercept) wt
## 47.76860 -0.00826
# The final model selected by the backward elimination method has the fitted equation:
# mean city.mpg = 47.7686 - 0.00826 wt (AIC = 187.04)