library(readr)
Cars <- read_csv("P:/SUNY-Brockport/SUNY Brockport_1/teaching/Staitstical Methods/Text CD/Chapter 03/CSV Files/C3 Cars.csv")
Cars<-Cars[,c(1:12)]
#glimpse(Cars)
#head(Cars)# will do same stuff as glimpse
str(Cars)
## Classes 'tbl_df', 'tbl' and 'data.frame': 804 obs. of 12 variables:
## $ Mileage: int 8221 9135 13196 16342 19832 22236 22576 22964 24021 27325 ...
## $ Price : num 17314 17542 16219 16337 16339 ...
## $ Make : chr "Buick" "Buick" "Buick" "Buick" ...
## $ Model : chr "Century" "Century" "Century" "Century" ...
## $ Trim : chr "Sedan 4D" "Sedan 4D" "Sedan 4D" "Sedan 4D" ...
## $ Type : chr "Sedan" "Sedan" "Sedan" "Sedan" ...
## $ Cyl : int 6 6 6 6 6 6 6 6 6 6 ...
## $ Liter : num 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 ...
## $ Doors : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Cruise : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Sound : int 1 1 1 0 0 1 1 1 0 1 ...
## $ Leather: int 1 0 0 0 1 0 0 0 1 1 ...
tally(Cyl~ Make, data=Cars)
## Make
## Cyl Buick Cadillac Chevrolet Pontiac SAAB Saturn
## 4 0 0 180 50 114 50
## 6 80 20 120 80 0 10
## 8 0 60 20 20 0 0
#Create a scatterplot between Mileage and Price:
#xyplot( Price ~ Mileage, data=Cars, main="Scatter Plot of Mileage vs. Price")
#fitting LM line or any arbitrary line using abline,
attach(Cars)
#abline(h=4000)
plot(Mileage,Price)
abline(lm(Mileage~Price),col="red")
#fitting a lowess curve
lines(lowess(Cars),col="green")
Use the scatterplot( ) function in the car package, it offers many enhanced features, including fit lines, marginal box plots,conditioning on a factor, and interactive point identification.
For more on scatter plots go to http://www.statmethods.net/graphs/scatterplot.html
library(car)
scatterplot(Price~Mileage|Cyl , data=Cars,
xlab="Mileage", ylab="Price",
main="",
labels=row.names(Cars))
scatterplot(Price~Mileage|Make , data=Cars,
xlab="Mileage", ylab="Price",
main="",
labels=row.names(Cars))
#cadil<-Cars[which(Cars$Make=="Cadillac"),] # another way of subsetting
cadil <- subset(Cars, Price >= 60000 & Cars$Make=="Cadillac", select=c(Price, Mileage,Make,Trim,Model))
head(cadil)
## # A tibble: 6 x 5
## Price Mileage Make Trim Model
## <dbl> <int> <chr> <chr> <chr>
## 1 70755 583 Cadillac Hardtop Conv 2D XLR-V8
## 2 68566 6420 Cadillac Hardtop Conv 2D XLR-V8
## 3 69134 7892 Cadillac Hardtop Conv 2D XLR-V8
## 4 66374 12021 Cadillac Hardtop Conv 2D XLR-V8
## 5 65281 15600 Cadillac Hardtop Conv 2D XLR-V8
## 6 63913 18200 Cadillac Hardtop Conv 2D XLR-V8
#tail(cadil) # use this if u want to see last few rows
Cars_sub<- subset(Cars, Price < 50000 , select=c(Price,Mileage,Make,Trim,Model,Cyl))
library(car)
scatterplot(Price~Mileage|Cyl , data=Cars_sub,
xlab="Mileage", ylab="Price",
main="",
labels=row.names(Cars_sub))
scatterplot(Price~Mileage|Make , data=Cars_sub,
xlab="Mileage", ylab="Price",
main="",
labels=row.names(Cars_sub))
carmodel<- lm(Price ~ Mileage + Make+Cyl,data=Cars)
#coef(carmodel) just to print coefficients
summary(carmodel)
##
## Call:
## lm(formula = Price ~ Mileage + Make + Cyl, data = Cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11054 -2630 -26 1651 24612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.21e+02 1.02e+03 0.51 0.611
## Mileage -1.76e-01 1.76e-02 -9.99 <2e-16 ***
## MakeCadillac 1.39e+04 6.78e+02 20.48 <2e-16 ***
## MakeChevrolet -5.43e+02 5.28e+02 -1.03 0.304
## MakePontiac -1.01e+03 5.66e+02 -1.78 0.076 .
## MakeSAAB 1.67e+04 6.57e+02 25.46 <2e-16 ***
## MakeSaturn -2.19e+02 7.34e+02 -0.30 0.765
## Cyl 3.98e+03 1.41e+02 28.17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4070 on 796 degrees of freedom
## Multiple R-squared: 0.832, Adjusted R-squared: 0.831
## F-statistic: 564 on 7 and 796 DF, p-value: <2e-16
#aov(carmodel)# u can print ANOVA table too, next chapter
carmodel_sub<- lm(Price ~ Mileage + Make+Cyl,data=Cars_sub)
coef(carmodel_sub)
## (Intercept) Mileage MakeCadillac MakeChevrolet MakePontiac
## 1134.670 -0.163 11132.617 -678.379 -1049.510
## MakeSAAB MakeSaturn Cyl
## 16437.087 -459.593 3835.060
summary(carmodel_sub)
##
## Call:
## lm(formula = Price ~ Mileage + Make + Cyl, data = Cars_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8046 -2485 -7 1637 16782
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.13e+03 8.24e+02 1.38 0.169
## Mileage -1.63e-01 1.43e-02 -11.37 <2e-16 ***
## MakeCadillac 1.11e+04 5.60e+02 19.86 <2e-16 ***
## MakeChevrolet -6.78e+02 4.24e+02 -1.60 0.110
## MakePontiac -1.05e+03 4.55e+02 -2.31 0.021 *
## MakeSAAB 1.64e+04 5.28e+02 31.15 <2e-16 ***
## MakeSaturn -4.60e+02 5.89e+02 -0.78 0.435
## Cyl 3.84e+03 1.14e+02 33.76 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3260 on 785 degrees of freedom
## Multiple R-squared: 0.86, Adjusted R-squared: 0.858
## F-statistic: 687 on 7 and 785 DF, p-value: <2e-16
aov(carmodel_sub)
## Call:
## aov(formula = carmodel_sub)
##
## Terms:
## Mileage Make Cyl Residuals
## Sum of Squares 1.10e+09 3.80e+10 1.21e+10 8.37e+09
## Deg. of Freedom 1 5 1 785
##
## Residual standard error: 3265
## Estimated effects may be unbalanced
The P-value associated with the t-test for significance of \[H_{0}: \beta_1 =0\] is the same as the P-value associated with the analysis of variance F-test. This will always be true for the simple linear regression model The P-values are the same because of a well-known relationship between a t random variable and an F random variable that has 1 numerator degree of freedom. Namely:
\[t^{2}_{(n-2)}=F_{(1,n-2)}\] Thus for a given significance level \[\alpha\], the F-test of \[H_{0}: \beta_1 =0\] versus \[H_{a}: \beta_1 \neq 0\] is algebraically equivalent to the two-tailed t-test. We will get exactly the same P-values (Note: (-4.093)^{2}=16.75265) * If one test rejects H0, then so will the other. * If one test does not reject H0, then so will the other.
The natural question then is when should we use the F-test and when should we use the t-test?
The F-test is only appropriate for testing that the slope differs from 0. \[H_{a}:\beta_{1}\neq 0\] Use the t-test to test that the slope is positive \[H_{a}:\beta_{1}> 0\] or negative \[H_{a}:\beta_{1}< 0\]
mod1<- lm(Price ~ Mileage,data=Cars)
summary(mod1)
##
## Call:
## lm(formula = Price ~ Mileage, data = Cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13905 -7254 -3520 5188 46091
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.48e+04 9.04e+02 27.38 < 2e-16 ***
## Mileage -1.72e-01 4.21e-02 -4.09 4.7e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9790 on 802 degrees of freedom
## Multiple R-squared: 0.0205, Adjusted R-squared: 0.0192
## F-statistic: 16.8 on 1 and 802 DF, p-value: 4.68e-05
msummary(carmodel)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.21e+02 1.02e+03 0.51 0.611
## Mileage -1.76e-01 1.76e-02 -9.99 <2e-16 ***
## MakeCadillac 1.39e+04 6.78e+02 20.48 <2e-16 ***
## MakeChevrolet -5.43e+02 5.28e+02 -1.03 0.304
## MakePontiac -1.01e+03 5.66e+02 -1.78 0.076 .
## MakeSAAB 1.67e+04 6.57e+02 25.46 <2e-16 ***
## MakeSaturn -2.19e+02 7.34e+02 -0.30 0.765
## Cyl 3.98e+03 1.41e+02 28.17 <2e-16 ***
##
## Residual standard error: 4070 on 796 degrees of freedom
## Multiple R-squared: 0.832, Adjusted R-squared: 0.831
## F-statistic: 564 on 7 and 796 DF, p-value: <2e-16
rsquared(carmodel)
## [1] 0.832
confint(carmodel)
## 2.5 % 97.5 %
## (Intercept) -1489.14 2530.716
## Mileage -0.21 -0.141
## MakeCadillac 12553.59 15214.841
## MakeChevrolet -1579.32 493.323
## MakePontiac -2117.16 106.384
## MakeSAAB 15443.74 18024.259
## MakeSaturn -1658.98 1221.099
## Cyl 3702.83 4257.468
anova(carmodel)
## Analysis of Variance Table
##
## Response: Price
## Df Sum Sq Mean Sq F value Pr(>F)
## Mileage 1 1.61e+09 1.61e+09 97 <2e-16 ***
## Make 5 5.05e+10 1.01e+10 611 <2e-16 ***
## Cyl 1 1.31e+10 1.31e+10 794 <2e-16 ***
## Residuals 796 1.32e+10 1.66e+07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
error<-residuals(carmodel)
residualPlot(carmodel)
fit1<-fitted(carmodel)
#It is often helpful to attach the residuals and fitted values to data frame. note we use mutate
Cars<-mutate(Cars,error, fit1)
xyplot(Price ~ Mileage,xlab="Mileage", group=Make,auto.key=TRUE,lwd=2,type=c("p", "r", "smooth"), data=Cars) # superimposed line and a smoother
tally(Cyl~ Make, data=Cars)
## Make
## Cyl Buick Cadillac Chevrolet Pontiac SAAB Saturn
## 4 0 0 180 50 114 50
## 6 80 20 120 80 0 10
## 8 0 60 20 20 0 0
xyplot(resid(carmodel) ~ Mileage, xlab="Mileage",
type=c("p", "r", "smooth"), data=Cars) # resiudal vs x
xyplot(resid(carmodel) ~ fitted(carmodel), xlab="Predicted values",
ylab="Residuals",
type=c("p", "r", "smooth"), data=Cars) #residual vs predicted value
xqqmath(~ resid(carmodel))
histogram(~ resid(carmodel)) #histogram of residuals
Interpretation: The residual versus fitted value plot indicate a good fit if the variance is roughly the same all the way across and there are no worrisome patterns
If plot of residuals versus fits shows that the residual variance (vertical spread, funnel shape) increases as the fitted values (predicted values of sale price) increase. This violates the assumption of constant error variance.
If the pattern is curved it indicates that the wrong type of equation was used.
The two research questions that we deal with are
What is the mean response \[E(Y)\] when the predictor value is \[x_{h}\]?
What value will a new response \[y_{new}\] take when the predictor value is \[x_{h}\]?
confpred <- predict(carmodel, interval="confidence")
head(confpred)
## fit lwr upr
## 1 22958 21971 23945
## 2 22798 21824 23772
## 3 22085 21158 23012
## 4 21533 20629 22436
## 5 20920 20027 21813
## 6 20498 19603 21393
intpred <- predict(carmodel, interval="prediction")
head(intpred)
## fit lwr upr
## 1 22958 14912 31005
## 2 22798 14753 30843
## 3 22085 14045 30124
## 4 21533 13496 29569
## 5 20920 12884 28955
## 6 20498 12462 28534
xyplot(Price ~ Mileage, xlab="Mileage",
panel=panel.lmbands, lwd=2, cex=0.2, data=Cars)
The stepwise regression method implemented in R uses AIC (not \[R^{2}\] as the basis for adding a new variable to the model.
Models with smaller AIC values are preferred.
full.model<-lm(Price~Make+Cyl+Liter+Doors+Cruise+Sound+Leather+Mileage, data = Cars)
#Now use the step function, and store the results in the object step.model:
step.model <- step(full.model)
## Start: AIC=13118
## Price ~ Make + Cyl + Liter + Doors + Cruise + Sound + Leather +
## Mileage
##
## Df Sum of Sq RSS AIC
## - Leather 1 8.65e+04 9.49e+09 13116
## - Sound 1 3.03e+05 9.49e+09 13116
## - Cyl 1 3.40e+06 9.49e+09 13117
## <none> 9.49e+09 13118
## - Cruise 1 2.53e+07 9.52e+09 13118
## - Liter 1 1.29e+09 1.08e+10 13218
## - Doors 1 1.46e+09 1.09e+10 13231
## - Mileage 1 1.74e+09 1.12e+10 13252
## - Make 5 3.40e+10 4.34e+10 14331
##
## Step: AIC=13116
## Price ~ Make + Cyl + Liter + Doors + Cruise + Sound + Mileage
##
## Df Sum of Sq RSS AIC
## - Sound 1 2.65e+05 9.49e+09 13114
## - Cyl 1 3.59e+06 9.49e+09 13115
## <none> 9.49e+09 13116
## - Cruise 1 2.60e+07 9.52e+09 13117
## - Liter 1 1.32e+09 1.08e+10 13219
## - Doors 1 1.46e+09 1.09e+10 13229
## - Mileage 1 1.74e+09 1.12e+10 13250
## - Make 5 3.57e+10 4.52e+10 14360
##
## Step: AIC=13114
## Price ~ Make + Cyl + Liter + Doors + Cruise + Mileage
##
## Df Sum of Sq RSS AIC
## - Cyl 1 3.45e+06 9.49e+09 13113
## <none> 9.49e+09 13114
## - Cruise 1 2.59e+07 9.52e+09 13115
## - Liter 1 1.32e+09 1.08e+10 13217
## - Doors 1 1.46e+09 1.09e+10 13227
## - Mileage 1 1.74e+09 1.12e+10 13248
## - Make 5 3.60e+10 4.55e+10 14365
##
## Step: AIC=13113
## Price ~ Make + Liter + Doors + Cruise + Mileage
##
## Df Sum of Sq RSS AIC
## <none> 9.49e+09 13113
## - Cruise 1 2.64e+07 9.52e+09 13113
## - Doors 1 1.55e+09 1.10e+10 13232
## - Mileage 1 1.74e+09 1.12e+10 13246
## - Liter 1 1.12e+10 2.07e+10 13739
## - Make 5 3.79e+10 4.73e+10 14395
#To see the suggested final model, apply the summary command to step.model:
summary(step.model)
##
## Call:
## lm(formula = Price ~ Make + Liter + Doors + Cruise + Mileage,
## data = Cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9576 -1909 -183 1337 22534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.53e+04 1.02e+03 15.08 < 2e-16 ***
## MakeCadillac 1.61e+04 5.58e+02 28.93 < 2e-16 ***
## MakeChevrolet -2.21e+03 4.72e+02 -4.69 3.2e-06 ***
## MakePontiac -1.78e+03 4.92e+02 -3.61 0.00032 ***
## MakeSAAB 1.47e+04 5.63e+02 26.15 < 2e-16 ***
## MakeSaturn -2.26e+03 6.45e+02 -3.51 0.00048 ***
## Liter 4.53e+03 1.48e+02 30.66 < 2e-16 ***
## Doors -1.73e+03 1.52e+02 -11.37 < 2e-16 ***
## Cruise -5.11e+02 3.44e+02 -1.49 0.13779
## Mileage -1.80e-01 1.49e-02 -12.07 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3460 on 794 degrees of freedom
## Multiple R-squared: 0.879, Adjusted R-squared: 0.878
## F-statistic: 641 on 9 and 794 DF, p-value: <2e-16
summary(carmodel)
##
## Call:
## lm(formula = Price ~ Mileage + Make + Cyl, data = Cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11054 -2630 -26 1651 24612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.21e+02 1.02e+03 0.51 0.611
## Mileage -1.76e-01 1.76e-02 -9.99 <2e-16 ***
## MakeCadillac 1.39e+04 6.78e+02 20.48 <2e-16 ***
## MakeChevrolet -5.43e+02 5.28e+02 -1.03 0.304
## MakePontiac -1.01e+03 5.66e+02 -1.78 0.076 .
## MakeSAAB 1.67e+04 6.57e+02 25.46 <2e-16 ***
## MakeSaturn -2.19e+02 7.34e+02 -0.30 0.765
## Cyl 3.98e+03 1.41e+02 28.17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4070 on 796 degrees of freedom
## Multiple R-squared: 0.832, Adjusted R-squared: 0.831
## F-statistic: 564 on 7 and 796 DF, p-value: <2e-16
When the regression function is not linear and the error terms are not normal and have unequal variances, we can use transfomrations. In general,although not always!:
Transforming the y values corrects problems with the error terms (and may help the non-linearity).
Transforming the x values primarily corrects the non-linearity.
carmodel_log<- lm(log(Price) ~ Mileage + Make+Cyl,data=Cars)
summary(carmodel_log)
##
## Call:
## lm(formula = log(Price) ~ Mileage + Make + Cyl, data = Cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3438 -0.0993 -0.0003 0.0836 0.4287
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.94e+00 3.53e-02 253.40 < 2e-16 ***
## Mileage -7.88e-06 6.05e-07 -13.03 < 2e-16 ***
## MakeCadillac 3.60e-01 2.34e-02 15.42 < 2e-16 ***
## MakeChevrolet -1.04e-01 1.82e-02 -5.69 1.8e-08 ***
## MakePontiac -6.69e-02 1.95e-02 -3.43 0.00064 ***
## MakeSAAB 7.40e-01 2.27e-02 32.67 < 2e-16 ***
## MakeSaturn -7.68e-02 2.53e-02 -3.04 0.00246 **
## Cyl 1.92e-01 4.87e-03 39.49 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.14 on 796 degrees of freedom
## Multiple R-squared: 0.884, Adjusted R-squared: 0.883
## F-statistic: 868 on 7 and 796 DF, p-value: <2e-16
residualPlot(carmodel_log)
xyplot(resid(carmodel_log) ~ Mileage, xlab="Mileage",
type=c("p", "r", "smooth"), data=Cars) # resiudal vs x
xyplot(resid(carmodel_log) ~ fitted(carmodel), xlab="Predicted values",
ylab="Residuals",
type=c("p", "r", "smooth"), data=Cars) #residual vs predicted value
xqqmath(~ resid(carmodel_log))
histogram(~ resid(carmodel_log)) #histogram of residuals
xyplot(Price ~ Mileage,xlab="Mileage", group=Make,auto.key=TRUE,lwd=2,type=c("p", "r", "smooth"), data=Cars_sub) # superimposed line and a smoother
# another way of graphing
model <- lm(Price ~ Mileage + Make,data=Cars_sub)
wt <- makeFun(model)
xyplot(Price ~ Mileage, groups=Make, data = Cars_sub, col=c("blue", "purple" ,"gray" , "red", "orange","green"),auto.key=list(space="top",columns=4,title="Cars", cex.title=1,lwd=2,type=c("p", "r", "smooth"), points=FALSE,col=c("blue", "purple" ,"gray" , "red", "orange","green")))
plotFun(wt(h,Make="Buick") ~ h, add=TRUE,
col="blue")
plotFun(wt(h,Make="Cadillac") ~ h, add=TRUE,
col="purple")
plotFun(wt(h,Make="Chevrolet") ~ h, add=TRUE,
col="gray")
plotFun(wt(h,Make="Pontiac") ~ h, add=TRUE,
col="red")
plotFun(wt(h,Make="SAAB") ~ h, add=TRUE,
col="orange")
plotFun(wt(h,Make="Saturn") ~ h, add=TRUE,
col="green")
Below figure displays the scatterplot stratified by what shelf (category) it is displayed on at the store, it also shows how you can add a factor variable ot a data
Cereals <- read.csv("http://nhorton.people.amherst.edu/sdm4/data/Cereals.csv")
Cereals[c(1,2),c(1,2,3)]
## name mfr calories
## 1 100%_Bran N 70
## 2 100%_Natural_Bran Q 120
tally(~ shelf, data=Cereals)
## shelf
## 1 2 3
## 20 21 36
# adding a factor variable to data
Cereals <- mutate(Cereals, shelfgrp = derivedFactor(
bottomshelf = shelf==1,
middleshelf = shelf==2,
topshelf = shelf==3
))
tally(~ shelfgrp, data=Cereals)
## shelfgrp
## bottomshelf middleshelf topshelf
## 20 21 36
xyplot(calories ~ sugars, group=shelfgrp, auto.key=TRUE,
lwd=2, type=c("p", "r"), data=Cereals)
data(Births78)
if (require(lattice)) {
xyplot(births ~ date, Births78)
xyplot(births ~ date, Births78, groups = wday, main="Births by Weekday ",t="l", auto.key=list(space="top",columns=4,
title="Weekday", cex.title=1,
lines=TRUE, points=FALSE))
}