library(readr)
## Warning: package 'readr' was built under R version 3.2.5
C3.Cars <- read_csv("P:/SUNY-Brockport/SUNY Brockport_1/teaching/Staitstical Methods/Spring 2017/R_Projects_in_class_spring17/Ch3/C3 Cars.csv")
## Parsed with column specification:
## cols(
## Mileage = col_integer(),
## Price = col_double(),
## Make = col_character(),
## Model = col_character(),
## Trim = col_character(),
## Type = col_character(),
## Cyl = col_integer(),
## Liter = col_double(),
## Doors = col_integer(),
## Cruise = col_integer(),
## Sound = col_integer(),
## Leather = col_integer()
## )
View(C3.Cars)
require(mosaic)
## Loading required package: mosaic
## Warning: package 'mosaic' was built under R version 3.2.5
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.2.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.5
## Loading required package: mosaicData
## Warning: package 'mosaicData' was built under R version 3.2.5
## Loading required package: Matrix
##
## The 'mosaic' package masks several functions from core packages in order to add additional features.
## The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cov, D, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
require(leaps)
## Loading required package: leaps
## Warning: package 'leaps' was built under R version 3.2.5
require(graphics)
##Packages lattice and dplyr are automatically added with mosaic
attach(C3.Cars)
#Create a scatterplot between Mileage and Price:
xyplot( Price ~ Mileage, data=C3.Cars, main="")

#fitting LM line or any arbitrary line using abline,
#abline(h=4000)
plot(Mileage,Price)
abline(lm(Mileage~Price),col="red")
#fitting a lowess curve
lines(lowess(C3.Cars),col="green")
## Warning in data.matrix(x): NAs introduced by coercion
## Warning in data.matrix(x): NAs introduced by coercion
## Warning in data.matrix(x): NAs introduced by coercion
## Warning in data.matrix(x): NAs introduced by coercion

#The scatterplot( ) function in the car package
#offers many enhanced features, including fit lines, marginal box plots,
#conditioning on a factor, and interactive point identification.
# Enhanced Scatterplot of MPG vs. price
# by Number of Car Cylinders
library(car)
## Warning: package 'car' was built under R version 3.2.5
##
## Attaching package: 'car'
## The following objects are masked from 'package:mosaic':
##
## deltaMethod, logit
## The following object is masked from 'package:dplyr':
##
## recode
scatterplot(Mileage~Price |Cyl , data=C3.Cars,
xlab="Mileage", ylab="Price",
main="Scatter Plot",
labels=row.names(C3.Cars))

# for more on scatter plots refer to the below url
# http://www.statmethods.net/graphs/scatterplot.html
#subsetting data to look further into it
car.subset <- C3.Cars[C3.Cars$Price> 52000,]
head(car.subset)
## # A tibble: 6 × 12
## Mileage Price Make Model Trim Type Cyl Liter
## <int> <dbl> <chr> <chr> <chr> <chr> <int> <dbl>
## 1 583 70755.47 Cadillac XLR-V8 Hardtop Conv 2D Convertible 8 4.6
## 2 6420 68566.19 Cadillac XLR-V8 Hardtop Conv 2D Convertible 8 4.6
## 3 7892 69133.73 Cadillac XLR-V8 Hardtop Conv 2D Convertible 8 4.6
## 4 12021 66374.31 Cadillac XLR-V8 Hardtop Conv 2D Convertible 8 4.6
## 5 15600 65281.48 Cadillac XLR-V8 Hardtop Conv 2D Convertible 8 4.6
## 6 18200 63913.12 Cadillac XLR-V8 Hardtop Conv 2D Convertible 8 4.6
## # ... with 4 more variables: Doors <int>, Cruise <int>, Sound <int>,
## # Leather <int>
#Create a "regression" object, named cars.lm, that will contain
#summary information about the fitted
#regression model of Price on Mileage:
#subset model
C3.Cars.subset <- lm(Mileage~Price,data=car.subset)
summary(C3.Cars.subset)
##
## Call:
## lm(formula = Mileage ~ Price, data = car.subset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2216.47 -1204.31 -72.41 1544.19 1779.85
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.473e+05 5.448e+03 27.04 3.77e-09 ***
## Price -2.043e+00 8.617e-02 -23.71 1.07e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1638 on 8 degrees of freedom
## Multiple R-squared: 0.986, Adjusted R-squared: 0.9842
## F-statistic: 562 on 1 and 8 DF, p-value: 1.067e-08
#full data model
cars.lm = lm(C3.Cars$Price~C3.Cars$Mileage,data=C3.Cars)
#The estimated coefficients, R^2, t-statistics, and other results
#about the model can be extracted from cars.lm
#using the summary()command:
summary(cars.lm)
##
## Call:
## lm(formula = C3.Cars$Price ~ C3.Cars$Mileage, data = C3.Cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13905 -7254 -3520 5188 46091
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.476e+04 9.044e+02 27.383 < 2e-16 ***
## C3.Cars$Mileage -1.725e-01 4.215e-02 -4.093 4.68e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9789 on 802 degrees of freedom
## Multiple R-squared: 0.02046, Adjusted R-squared: 0.01924
## F-statistic: 16.75 on 1 and 802 DF, p-value: 4.685e-05
#ANOVA
anova(cars.lm) # anova table also is useful
## Analysis of Variance Table
##
## Response: C3.Cars$Price
## Df Sum Sq Mean Sq F value Pr(>F)
## C3.Cars$Mileage 1 1.6056e+09 1605590375 16.755 4.685e-05 ***
## Residuals 802 7.6856e+10 95830165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Create a vector of residuals from the cars.lm model:
resid1 <- cars.lm$res
#residuals can be found using cars.lm$res or residuals(cars.lm)
#fitted/predicted values can be found using fitted(cars.lm)
fit<-fitted(cars.lm)
#Residual for the first car (Buick Century):
resid1[1] # to show the first element of the vector of residuals
## 1
## -6032.165
#It is often helpful to attach the residuals and fitted values to data frame.
Cars<-mutate(C3.Cars, resid1,fit)
Cars[1,c(1:3,13:14)] #to show the first row and selected columns of the Cars data frame
## # A tibble: 1 × 5
## Mileage Price Make resid1 fit
## <int> <dbl> <chr> <dbl> <dbl>
## 1 8221 17314.1 Buick -6032.165 23346.27
#############Stepwise regression########
#Note that the stepwise regression method implemented in R does not use
#the improvement in R^2 as the basis for adding a new variable to
#the model.
#It uses a criterion called AIC.
#Models with smaller AIC values are preferred.
#To perform stepwise regression for the cars data in R,
#first create a regression object that contains all relevant explanatory variables
################################################################
full.model<-lm(Price~Cyl+Liter+Doors+Cruise+Sound+Leather+Mileage, data = Cars)
#Now use the step function, and store the results in the object step.model:
step.model <- step(full.model)
## Start: AIC=14331.39
## Price ~ Cyl + Liter + Doors + Cruise + Sound + Leather + Mileage
##
## Df Sum of Sq RSS AIC
## - Liter 1 44992316 4.3492e+10 14330
## <none> 4.3447e+10 14331
## - Sound 1 663674405 4.4111e+10 14342
## - Doors 1 1265022645 4.4712e+10 14352
## - Mileage 1 1548003060 4.4995e+10 14358
## - Cyl 1 1681894212 4.5129e+10 14360
## - Leather 1 1714076275 4.5161e+10 14360
## - Cruise 1 4986166639 4.8433e+10 14417
##
## Step: AIC=14330.22
## Price ~ Cyl + Doors + Cruise + Sound + Leather + Mileage
##
## Df Sum of Sq RSS AIC
## <none> 4.3492e+10 14330
## - Sound 1 6.8659e+08 4.4178e+10 14341
## - Doors 1 1.2297e+09 4.4722e+10 14351
## - Mileage 1 1.5632e+09 4.5055e+10 14357
## - Leather 1 1.6943e+09 4.5186e+10 14359
## - Cruise 1 4.9514e+09 4.8443e+10 14415
## - Cyl 1 1.3563e+10 5.7055e+10 14546
#To see the suggested final model, apply the summary command to step.model:
summary(step.model)
##
## Call:
## lm(formula = Price ~ Cyl + Doors + Cruise + Sound + Leather +
## Mileage, data = Cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13104 -5566 -1544 3877 33349
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.323e+03 1.771e+03 4.135 3.92e-05 ***
## Cyl 3.200e+03 2.030e+02 15.765 < 2e-16 ***
## Doors -1.463e+03 3.083e+02 -4.747 2.45e-06 ***
## Cruise 6.206e+03 6.515e+02 9.525 < 2e-16 ***
## Sound -2.024e+03 5.707e+02 -3.547 0.000412 ***
## Leather 3.327e+03 5.971e+02 5.572 3.45e-08 ***
## Mileage -1.705e-01 3.186e-02 -5.352 1.14e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7387 on 797 degrees of freedom
## Multiple R-squared: 0.4457, Adjusted R-squared: 0.4415
## F-statistic: 106.8 on 6 and 797 DF, p-value: < 2.2e-16
#Note that the final model left out one of the original seven variables.
######Best Subset Regression###################
#Requires loading the R package leaps.
#The following code can be used to select the "best" models based on the lowest value of Cp.
#You can find additional information about various options for the leaps function by using the help function
##################################################
require(leaps)
help(leaps)
## starting httpd help server ...
## done
#First, create a "design matrix" whose columns contain the values of the explanatory variables of interest
full.data=cbind(Cars$Mileage,Cars$Cyl,Cars$Liter,Cars$Doors,Cars$Cruise,Cars$Sound,Cars$Leather)
head(full.data)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 8221 6 3.1 4 1 1 1
## [2,] 9135 6 3.1 4 1 1 0
## [3,] 13196 6 3.1 4 1 1 0
## [4,] 16342 6 3.1 4 1 0 0
## [5,] 19832 6 3.1 4 1 0 1
## [6,] 22236 6 3.1 4 1 1 0
#Now use the leaps function, and store the results in the object best.lm:
best.lm <- leaps(full.data,Cars$Price, method="Cp", names = c('Mileage','Cyl','Liter','Doors','Cruise','Sound','Leather'), nbest=3)
#method=c("Cp", "adjr2", "r2") # u can pick any of them
names(best.lm)
## [1] "which" "label" "size" "Cp"
data1<-cbind(best.lm$which,best.lm$Cp)
head(data1)
## Mileage Cyl Liter Doors Cruise Sound Leather
## 1 0 1 0 0 0 0 0 171.95875
## 1 0 0 1 0 0 0 0 189.68653
## 1 0 0 0 0 1 0 0 370.65991
## 2 0 1 0 0 1 0 0 87.57869
## 2 0 0 1 0 1 0 0 110.43981
## 2 0 1 0 1 0 0 0 145.78142
#####Cp statistic##########
# is a measure of error in the best subset model, relative to incorporating all variables
#Adequate models have Cp roughly = parameters i nthe model( including the constant/intercept)
#and/or Cp is at its minimum
#####################################
#To identify the best models, examine the output of the output matrix we have called data1.
#Each row of the matrix provides a value of 1 if the explanatory variable
#was included, and a value of 0 if it was not. For example,
#the best model with one explanatory variable contains the predictor Cyl, and has an adjusted R-squared value of 0.323.
#We will use the regression model that contains all explanatory variables except Liter:
cars.lm2 <- lm(Price~Cyl+Doors+Cruise+Sound+Leather+Mileage)
#Create a vector of predicted (fitted) values of Price:
fit2 <- cars.lm2$fitted
#Create a vector of residuals:
resid2 <- cars.lm2$res
Cars<-mutate(Cars, resid2, fit2)
tail(Cars,10) # shows the 10 rows with the largest residuals.
## # A tibble: 10 × 16
## Mileage Price Make Model Trim Type Cyl Liter Doors
## <int> <dbl> <chr> <chr> <chr> <chr> <int> <dbl> <int>
## 1 5826 18173.98 Saturn L Series L300 Sedan 4D Sedan 6 3 4
## 2 7755 18490.98 Saturn L Series L300 Sedan 4D Sedan 6 3 4
## 3 10102 17322.08 Saturn L Series L300 Sedan 4D Sedan 6 3 4
## 4 10986 17978.36 Saturn L Series L300 Sedan 4D Sedan 6 3 4
## 5 14242 16425.17 Saturn L Series L300 Sedan 4D Sedan 6 3 4
## 6 16229 16507.07 Saturn L Series L300 Sedan 4D Sedan 6 3 4
## 7 19095 16175.96 Saturn L Series L300 Sedan 4D Sedan 6 3 4
## 8 20484 15731.13 Saturn L Series L300 Sedan 4D Sedan 6 3 4
## 9 25979 15118.89 Saturn L Series L300 Sedan 4D Sedan 6 3 4
## 10 35662 13585.64 Saturn L Series L300 Sedan 4D Sedan 6 3 4
## # ... with 7 more variables: Cruise <int>, Sound <int>, Leather <int>,
## # resid1 <dbl>, fit <dbl>, resid2 <dbl>, fit2 <dbl>
# which Make and Type correspond to these values?
####################################
# model diagnostics using library car
#####################################
library(car)
# Assessing Outliers
outlierTest(cars.lm2) # Bonferonni p-value for most extreme obs
## rstudent unadjusted p-value Bonferonni p
## 151 4.612092 4.6435e-06 0.0037334
## 153 4.549033 6.2291e-06 0.0050082
## 152 4.434826 1.0510e-05 0.0084498
## 154 4.255602 2.3331e-05 0.0187580
## 155 4.186102 3.1543e-05 0.0253610
## 156 4.055701 5.4902e-05 0.0441410
# Normality of Residuals
# qq plot for studentized resid
qqPlot(cars.lm2, main="QQ Plot")

# distribution of studentized residuals
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
sresid <- studres(cars.lm2)
hist(sresid, freq=FALSE,
main="Distribution of Studentized Residuals")
xfit<-seq(min(sresid),max(sresid),length=40)
yfit<-dnorm(xfit)
lines(xfit, yfit)

# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(cars.lm2)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 139.5819 Df = 1 p = 3.285834e-32
# plot studentized residuals vs. fitted values
spreadLevelPlot(cars.lm2)

##
## Suggested power transformation: -0.1152622
#pearsons residuals vs fitted value plus curvature test
residualPlot(cars.lm2)

##############################
# Evaluate MultiCollinearity
###############################
#initial diagnostics
#Plot Cyl versus Liter:
par(mfrow=c(1,1))
plot(C3.Cars$Liter,C3.Cars$Cyl)

#Correlation between Cyl and Liter:
cor(C3.Cars$Liter,C3.Cars$Cyl)
## [1] 0.9578966
vif(cars.lm2) # variance inflation factors
## Cyl Doors Cruise Sound Leather Mileage
## 1.167272 1.010767 1.164616 1.045804 1.049990 1.003421
sqrt(vif(cars.lm2)) > 2 # problem?
## Cyl Doors Cruise Sound Leather Mileage
## FALSE FALSE FALSE FALSE FALSE FALSE
#############################
# Evaluate Nonlinearity
# component + residual plot
##############################
crPlots(cars.lm2)

###############################
# Test for Autocorrelated Errors
#################################
durbinWatsonTest(cars.lm2)
## lag Autocorrelation D-W Statistic p-value
## 1 0.8462678 0.3042106 0
## Alternative hypothesis: rho != 0
#another way of getting diagnostic curves
par(mfrow=c(2,2))
plot(cars.lm2)

##################################################
########Transforming the response ################
#################################################
#Create regression models with the log-transformed (log base 10) and square root-transformed Price:
cars.log <- lm(log10(Price)~Cyl+Doors+Cruise+Sound+Leather+Mileage, data = Cars)
cars.sqrt <- lm(sqrt(Price)~Cyl+Doors+Cruise+Sound+Leather+Mileage, data = Cars)
###############################################
#######Dummy Variable Creation################
#################################################
attach(Cars)
## The following objects are masked _by_ .GlobalEnv:
##
## fit, fit2, resid1, resid2
## The following objects are masked from C3.Cars:
##
## Cruise, Cyl, Doors, Leather, Liter, Make, Mileage, Model,
## Price, Sound, Trim, Type
# Create indicator variables for the categorical variable Make:
DM1 <- (Make=="Buick")*1
DM2 <- (Make=="Cadillac")*1
DM3 <- (Make=="Chevrolet")*1
DM4 <- (Make=="Pontiac")*1
DM5 <- (Make=="SAAB")*1
DM6 <- (Make=="Saturn")*1
#Construct boxplots of TPrice versus the categorical explanatory variables Make, Model, Trim, and Type:
TPrice <- log10(C3.Cars$Price)
par(mfrow=c(2,2))
boxplot(TPrice~C3.Cars$Make)
boxplot(TPrice~C3.Cars$Model)
boxplot(TPrice~C3.Cars$Trim)
boxplot(TPrice~C3.Cars$Type)

################################
#DIY
############################
#Construct regression model of Price on Mileage for Cadillac, and SAAB only
# cyl.lm3 <- lm(Price~Mileage+Cadillac+SAAB,data = ...)
# Construct regression model of Price on Mileage, Cadillac, SAAB, and the interaction terms
# between Mileage and Cadillac, and Mileage and SAAB
# cyl.lm4 <- lm(Price~Mileage + Cadillac+ SAAB + Cadillac*Mileage + SAAB*Mileage,data=...)
#Use the anova() command to compare models cyl.lm3 and cyl.lm4 to determine if the interaction terms are significant.
#anova(cyl.lm3,cyl.lm4)