library(readr)
## Warning: package 'readr' was built under R version 3.2.5
C3.Cars <- read_csv("P:/SUNY-Brockport/SUNY Brockport_1/teaching/Staitstical Methods/Spring 2017/R_Projects_in_class_spring17/Ch3/C3 Cars.csv")
## Parsed with column specification:
## cols(
##   Mileage = col_integer(),
##   Price = col_double(),
##   Make = col_character(),
##   Model = col_character(),
##   Trim = col_character(),
##   Type = col_character(),
##   Cyl = col_integer(),
##   Liter = col_double(),
##   Doors = col_integer(),
##   Cruise = col_integer(),
##   Sound = col_integer(),
##   Leather = col_integer()
## )
View(C3.Cars)

require(mosaic)
## Loading required package: mosaic
## Warning: package 'mosaic' was built under R version 3.2.5
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.2.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.5
## Loading required package: mosaicData
## Warning: package 'mosaicData' was built under R version 3.2.5
## Loading required package: Matrix
## 
## The 'mosaic' package masks several functions from core packages in order to add additional features.  
## The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cov, D, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
require(leaps)
## Loading required package: leaps
## Warning: package 'leaps' was built under R version 3.2.5
require(graphics)

#The lattice, dplyr, and ggplot2 packages are loaded automatically with mosaic
attach(C3.Cars)

#Create a scatterplot between Mileage and Price:
xyplot( Price ~ Mileage, data=C3.Cars, main="")
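
#An alternative sketch of the same plot using ggplot2 (loaded with mosaic),
#with a least-squares line added by geom_smooth():
ggplot(C3.Cars, aes(x = Mileage, y = Price)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)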

#Add the least-squares regression line, or any arbitrary line, using abline():

#abline(h=4000)
plot(Mileage, Price)
abline(lm(Price~Mileage), col="red")
#Add a lowess smoother; fit it to the two plotted variables rather than the
#whole data frame (which contains character columns)
lines(lowess(Mileage, Price), col="green")

#The scatterplot() function in the car package 
#offers many enhanced features, including fit lines, marginal box plots,
#conditioning on a factor, and interactive point identification. 

# Enhanced scatterplot of Price vs. Mileage
# by number of car cylinders
library(car)
## Warning: package 'car' was built under R version 3.2.5
## 
## Attaching package: 'car'
## The following objects are masked from 'package:mosaic':
## 
##     deltaMethod, logit
## The following object is masked from 'package:dplyr':
## 
##     recode
scatterplot(Price~Mileage | Cyl, data=C3.Cars,
            xlab="Mileage", ylab="Price",
            main="Scatter Plot",
            labels=row.names(C3.Cars)) 

# for more on scatter plots refer to the below url
# http://www.statmethods.net/graphs/scatterplot.html

#Subset the data (cars priced above $52,000) to look at them more closely
car.subset <- C3.Cars[C3.Cars$Price> 52000,]
head(car.subset)
## # A tibble: 6 × 12
##   Mileage    Price     Make  Model            Trim        Type   Cyl Liter
##     <int>    <dbl>    <chr>  <chr>           <chr>       <chr> <int> <dbl>
## 1     583 70755.47 Cadillac XLR-V8 Hardtop Conv 2D Convertible     8   4.6
## 2    6420 68566.19 Cadillac XLR-V8 Hardtop Conv 2D Convertible     8   4.6
## 3    7892 69133.73 Cadillac XLR-V8 Hardtop Conv 2D Convertible     8   4.6
## 4   12021 66374.31 Cadillac XLR-V8 Hardtop Conv 2D Convertible     8   4.6
## 5   15600 65281.48 Cadillac XLR-V8 Hardtop Conv 2D Convertible     8   4.6
## 6   18200 63913.12 Cadillac XLR-V8 Hardtop Conv 2D Convertible     8   4.6
## # ... with 4 more variables: Doors <int>, Cruise <int>, Sound <int>,
## #   Leather <int>
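#The same subset can be taken with dplyr's filter(), loaded with mosaic
#(a sketch; car.subset.alt is just an alternative name, not used below):
car.subset.alt <- filter(C3.Cars, Price > 52000)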
#Create a "regression" object that will contain summary information
#about a fitted regression model.
#Subset model (high-priced cars only; note that Mileage is regressed on Price here):
C3.Cars.subset <- lm(Mileage~Price, data=car.subset)
summary(C3.Cars.subset)
## 
## Call:
## lm(formula = Mileage ~ Price, data = car.subset)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2216.47 -1204.31   -72.41  1544.19  1779.85 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.473e+05  5.448e+03   27.04 3.77e-09 ***
## Price       -2.043e+00  8.617e-02  -23.71 1.07e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1638 on 8 degrees of freedom
## Multiple R-squared:  0.986,  Adjusted R-squared:  0.9842 
## F-statistic:   562 on 1 and 8 DF,  p-value: 1.067e-08
#Full-data model: the object cars.lm contains the fitted regression
#of Price on Mileage for all 804 cars

cars.lm <- lm(C3.Cars$Price~C3.Cars$Mileage, data=C3.Cars)

#The estimated coefficients, R^2, t-statistics, and other results
#about the model can be extracted from cars.lm
#using the summary() command:
summary(cars.lm)
## 
## Call:
## lm(formula = C3.Cars$Price ~ C3.Cars$Mileage, data = C3.Cars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -13905  -7254  -3520   5188  46091 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.476e+04  9.044e+02  27.383  < 2e-16 ***
## C3.Cars$Mileage -1.725e-01  4.215e-02  -4.093 4.68e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9789 on 802 degrees of freedom
## Multiple R-squared:  0.02046,    Adjusted R-squared:  0.01924 
## F-statistic: 16.75 on 1 and 802 DF,  p-value: 4.685e-05
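#Individual pieces can also be pulled out directly (base R accessors; a sketch):
coef(cars.lm)                  # estimated intercept and slope
summary(cars.lm)$r.squared     # R^2
confint(cars.lm)               # 95% confidence intervals for the coefficients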
#ANOVA 
anova(cars.lm)  # the ANOVA table is also useful
## Analysis of Variance Table
## 
## Response: C3.Cars$Price
##                  Df     Sum Sq    Mean Sq F value    Pr(>F)    
## C3.Cars$Mileage   1 1.6056e+09 1605590375  16.755 4.685e-05 ***
## Residuals       802 7.6856e+10   95830165                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Create a vector of residuals from the cars.lm model:
resid1 <- cars.lm$res

#residuals can be found using cars.lm$res or residuals(cars.lm)
#fitted/predicted values can be found using fitted(cars.lm)
fit<-fitted(cars.lm)
#Residual for the first car (Buick Century):
resid1[1]  # to show the first element of the vector of residuals
##         1 
## -6032.165
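#As a check, a residual is just the observed value minus the fitted value (a sketch):
C3.Cars$Price[1] - fit[1]   # should match resid1[1] above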
#It is often helpful to attach the residuals and fitted values to the data frame.
Cars<-mutate(C3.Cars, resid1,fit)
Cars[1,c(1:3,13:14)] #to show the first row and selected columns of the Cars data frame
## # A tibble: 1 × 5
##   Mileage   Price  Make    resid1      fit
##     <int>   <dbl> <chr>     <dbl>    <dbl>
## 1    8221 17314.1 Buick -6032.165 23346.27
#############Stepwise regression########
#Note that the stepwise regression method implemented in R does not use
#the improvement in R^2 as the basis for adding or dropping a variable.
#It uses a criterion called AIC.
#Models with smaller AIC values are preferred.
#To perform stepwise regression for the cars data in R,
#first create a regression object that contains all relevant explanatory variables.
################################################################
full.model<-lm(Price~Cyl+Liter+Doors+Cruise+Sound+Leather+Mileage, data = Cars)

#Now use the step function, and store the results in the object step.model:
step.model <- step(full.model)
## Start:  AIC=14331.39
## Price ~ Cyl + Liter + Doors + Cruise + Sound + Leather + Mileage
## 
##           Df  Sum of Sq        RSS   AIC
## - Liter    1   44992316 4.3492e+10 14330
## <none>                  4.3447e+10 14331
## - Sound    1  663674405 4.4111e+10 14342
## - Doors    1 1265022645 4.4712e+10 14352
## - Mileage  1 1548003060 4.4995e+10 14358
## - Cyl      1 1681894212 4.5129e+10 14360
## - Leather  1 1714076275 4.5161e+10 14360
## - Cruise   1 4986166639 4.8433e+10 14417
## 
## Step:  AIC=14330.22
## Price ~ Cyl + Doors + Cruise + Sound + Leather + Mileage
## 
##           Df  Sum of Sq        RSS   AIC
## <none>                  4.3492e+10 14330
## - Sound    1 6.8659e+08 4.4178e+10 14341
## - Doors    1 1.2297e+09 4.4722e+10 14351
## - Mileage  1 1.5632e+09 4.5055e+10 14357
## - Leather  1 1.6943e+09 4.5186e+10 14359
## - Cruise   1 4.9514e+09 4.8443e+10 14415
## - Cyl      1 1.3563e+10 5.7055e+10 14546
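#The AIC values printed by step() can be reproduced with extractAIC(); for a
#linear model it uses n*log(RSS/n) + 2*p, where p is the number of estimated
#coefficients (a sketch, as a check on the table above):
extractAIC(step.model)         # edf and AIC of the selected model
n <- nrow(Cars)
rss <- sum(resid(step.model)^2)
p <- length(coef(step.model))
n * log(rss / n) + 2 * p       # should match the AIC above (about 14330)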
#To see the suggested final model, apply the summary command to step.model:
summary(step.model)
## 
## Call:
## lm(formula = Price ~ Cyl + Doors + Cruise + Sound + Leather + 
##     Mileage, data = Cars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -13104  -5566  -1544   3877  33349 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.323e+03  1.771e+03   4.135 3.92e-05 ***
## Cyl          3.200e+03  2.030e+02  15.765  < 2e-16 ***
## Doors       -1.463e+03  3.083e+02  -4.747 2.45e-06 ***
## Cruise       6.206e+03  6.515e+02   9.525  < 2e-16 ***
## Sound       -2.024e+03  5.707e+02  -3.547 0.000412 ***
## Leather      3.327e+03  5.971e+02   5.572 3.45e-08 ***
## Mileage     -1.705e-01  3.186e-02  -5.352 1.14e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7387 on 797 degrees of freedom
## Multiple R-squared:  0.4457, Adjusted R-squared:  0.4415 
## F-statistic: 106.8 on 6 and 797 DF,  p-value: < 2.2e-16
#Note that the final model left out one of the original seven variables (Liter).

######Best Subset Regression###################
#Requires loading the R package leaps. 
#The following code can be used to select the "best" models based on the lowest value of Cp. 
#You can find additional information about the various options of the leaps() function by using the help() function.
##################################################
require(leaps)
help(leaps)
## starting httpd help server ...
##  done
#First, create a "design matrix" whose columns contain the values of the explanatory variables of interest

full.data=cbind(Cars$Mileage,Cars$Cyl,Cars$Liter,Cars$Doors,Cars$Cruise,Cars$Sound,Cars$Leather)
head(full.data)
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,]  8221    6  3.1    4    1    1    1
## [2,]  9135    6  3.1    4    1    1    0
## [3,] 13196    6  3.1    4    1    1    0
## [4,] 16342    6  3.1    4    1    0    0
## [5,] 19832    6  3.1    4    1    0    1
## [6,] 22236    6  3.1    4    1    1    0
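#An equivalent way to build this matrix while keeping the column names
#(a sketch; the object full.data.named is new and not used below):
full.data.named <- as.matrix(Cars[, c("Mileage","Cyl","Liter","Doors","Cruise","Sound","Leather")])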
#Now use the leaps function, and store the results in the object best.lm:

best.lm <- leaps(full.data,Cars$Price, method="Cp", names = c('Mileage','Cyl','Liter','Doors','Cruise','Sound','Leather'), nbest=3)
#method can be "Cp", "adjr2", or "r2"; you can pick any of them
names(best.lm)  
## [1] "which" "label" "size"  "Cp"
data1<-cbind(best.lm$which,best.lm$Cp)
head(data1)
##   Mileage Cyl Liter Doors Cruise Sound Leather          
## 1       0   1     0     0      0     0       0 171.95875
## 1       0   0     1     0      0     0       0 189.68653
## 1       0   0     0     0      1     0       0 370.65991
## 2       0   1     0     0      1     0       0  87.57869
## 2       0   0     1     0      1     0       0 110.43981
## 2       0   1     0     1      0     0       0 145.78142
#####Cp statistic##########
#Mallows' Cp is a measure of error in the best subset model, relative to the
#model incorporating all variables.
#Adequate models have Cp roughly equal to the number of parameters in the model
#(including the constant/intercept) and/or Cp at its minimum.
#####################################
#To identify the best models, examine the output matrix we have called data1.
#Each row of the matrix provides a value of 1 if the explanatory variable
#was included, and a value of 0 if it was not. For example,
#the best model with one explanatory variable contains the predictor Cyl
#(Cp = 171.96 above; rerunning leaps with method="adjr2" reports its adjusted R-squared, 0.323).
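
#The minimum-Cp model can also be picked out programmatically from the
#components of best.lm (a sketch; best.row is a new helper name):
best.row <- which.min(best.lm$Cp)
best.lm$which[best.row, ]   # predictors included in the minimum-Cp model
best.lm$Cp[best.row]        # its Cp value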


#We will use the regression model that contains all explanatory variables except Liter:
cars.lm2 <- lm(Price~Cyl+Doors+Cruise+Sound+Leather+Mileage, data=Cars)

#Create a vector of predicted (fitted) values of Price:
fit2 <- cars.lm2$fitted

#Create a vector of residuals:
resid2 <- cars.lm2$res

Cars<-mutate(Cars, resid2, fit2)

tail(Cars,10)  # shows the last 10 rows of the augmented data frame
## # A tibble: 10 × 16
##    Mileage    Price   Make    Model          Trim  Type   Cyl Liter Doors
##      <int>    <dbl>  <chr>    <chr>         <chr> <chr> <int> <dbl> <int>
## 1     5826 18173.98 Saturn L Series L300 Sedan 4D Sedan     6     3     4
## 2     7755 18490.98 Saturn L Series L300 Sedan 4D Sedan     6     3     4
## 3    10102 17322.08 Saturn L Series L300 Sedan 4D Sedan     6     3     4
## 4    10986 17978.36 Saturn L Series L300 Sedan 4D Sedan     6     3     4
## 5    14242 16425.17 Saturn L Series L300 Sedan 4D Sedan     6     3     4
## 6    16229 16507.07 Saturn L Series L300 Sedan 4D Sedan     6     3     4
## 7    19095 16175.96 Saturn L Series L300 Sedan 4D Sedan     6     3     4
## 8    20484 15731.13 Saturn L Series L300 Sedan 4D Sedan     6     3     4
## 9    25979 15118.89 Saturn L Series L300 Sedan 4D Sedan     6     3     4
## 10   35662 13585.64 Saturn L Series L300 Sedan 4D Sedan     6     3     4
## # ... with 7 more variables: Cruise <int>, Sound <int>, Leather <int>,
## #   resid1 <dbl>, fit <dbl>, resid2 <dbl>, fit2 <dbl>
# which Make and Type correspond to these values?

####################################
# model diagnostics using library car
#####################################

library(car)

# Assessing Outliers
outlierTest(cars.lm2) # Bonferroni p-value for the most extreme observations
##     rstudent unadjusted p-value Bonferonni p
## 151 4.612092         4.6435e-06    0.0037334
## 153 4.549033         6.2291e-06    0.0050082
## 152 4.434826         1.0510e-05    0.0084498
## 154 4.255602         2.3331e-05    0.0187580
## 155 4.186102         3.1543e-05    0.0253610
## 156 4.055701         5.4902e-05    0.0441410
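#To see which cars the flagged observations are (row numbers taken from the
#outlierTest output above; a sketch):
Cars[151:156, c("Mileage", "Price", "Make", "Model")]
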
# Normality of Residuals
# qq plot for studentized resid
qqPlot(cars.lm2, main="QQ Plot")

# distribution of studentized residuals

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
sresid <- studres(cars.lm2)
hist(sresid, freq=FALSE,
     main="Distribution of Studentized Residuals")
xfit<-seq(min(sresid),max(sresid),length=40)
yfit<-dnorm(xfit)
lines(xfit, yfit) 

# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(cars.lm2)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 139.5819    Df = 1     p = 3.285834e-32
# plot studentized residuals vs. fitted values
spreadLevelPlot(cars.lm2)

## 
## Suggested power transformation:  -0.1152622
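#A suggested power near 0 corresponds roughly to a log transformation of the
#response; a sketch of refitting with log(Price) and re-checking the variance
#(not part of the original analysis):
cars.log <- lm(log(Price) ~ Cyl + Doors + Cruise + Sound + Leather + Mileage, data = Cars)
ncvTest(cars.log)
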
#Pearson residuals vs. fitted values, plus a curvature test
residualPlot(cars.lm2)

##############################
# Evaluate MultiCollinearity
###############################
#initial diagnostics 
#Plot Cyl versus Liter:
par(mfrow=c(1,1))
plot(C3.Cars$Liter,C3.Cars$Cyl)

#Correlation between Cyl and Liter:
cor(C3.Cars$Liter,C3.Cars$Cyl)
## [1] 0.9578966
vif(cars.lm2) # variance inflation factors
##      Cyl    Doors   Cruise    Sound  Leather  Mileage 
## 1.167272 1.010767 1.164616 1.045804 1.049990 1.003421
sqrt(vif(cars.lm2)) > 2 # problem?
##     Cyl   Doors  Cruise   Sound Leather Mileage 
##   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE
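#For contrast, the full model that keeps both Cyl and Liter (correlated at 0.96
#above) should show much larger VIFs for those two terms (a sketch):
vif(full.model)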
#############################
# Evaluate Nonlinearity
# component + residual plot
##############################
crPlots(cars.lm2)

###############################
# Test for Autocorrelated Errors
#################################
durbinWatsonTest(cars.lm2)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.8462678     0.3042106       0
##  Alternative hypothesis: rho != 0
#Another way of getting the standard diagnostic plots
par(mfrow=c(2,2))
plot(cars.lm2)