Exploring the data

library(readr)
Cars <- read_csv("P:/SUNY-Brockport/SUNY Brockport_1/teaching/Staitstical Methods/Text CD/Chapter 03/CSV Files/C3 Cars.csv")

Cars<-Cars[,c(1:12)]
cyl1<-as.character(Cars$Cyl)
Cars<-mutate(Cars,cyl1)
#glimpse(Cars)
#head(Cars)# will do same stuff as glimpse
str(Cars)
## Classes 'tbl_df', 'tbl' and 'data.frame':    804 obs. of  13 variables:
##  $ Mileage: int  8221 9135 13196 16342 19832 22236 22576 22964 24021 27325 ...
##  $ Price  : num  17314 17542 16219 16337 16339 ...
##  $ Make   : chr  "Buick" "Buick" "Buick" "Buick" ...
##  $ Model  : chr  "Century" "Century" "Century" "Century" ...
##  $ Trim   : chr  "Sedan 4D" "Sedan 4D" "Sedan 4D" "Sedan 4D" ...
##  $ Type   : chr  "Sedan" "Sedan" "Sedan" "Sedan" ...
##  $ Cyl    : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ Liter  : num  3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 ...
##  $ Doors  : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ Cruise : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Sound  : int  1 1 1 0 0 1 1 1 0 1 ...
##  $ Leather: int  1 0 0 0 1 0 0 0 1 1 ...
##  $ cyl1   : chr  "6" "6" "6" "6" ...
tally(Cyl~ Make, data=Cars)
##    Make
## Cyl Buick Cadillac Chevrolet Pontiac SAAB Saturn
##   4     0        0       180      50  114     50
##   6    80       20       120      80    0     10
##   8     0       60        20      20    0      0

Graphical Exploration

#Create a scatterplot between Mileage and Price:
#xyplot( Price ~ Mileage, data=Cars, main="Scatter Plot of Mileage vs. Price")

#fitting LM line or any arbitrary line using abline, 
attach(Cars)
#abline(h=4000)
plot(Mileage,Price)
abline(lm(Mileage~Price),col="red")
#fitting a lowess curve
lines(lowess(Cars),col="green")

Another way of constructing fancier scatter plots

  • Use the scatterplot( ) function in the car package, it offers many enhanced features, including fit lines, marginal box plots,conditioning on a factor, and interactive point identification.

  • For more on scatter plots go to http://www.statmethods.net/graphs/scatterplot.html

library(car)
scatterplot(Price~Mileage|cyl1 , data=Cars,
            xlab="Mileage", ylab="Price",
            main="",
            labels=row.names(Cars)) 

scatterplot(Price~Mileage|Make , data=Cars,
            xlab="Mileage", ylab="Price",
            main="",
            labels=row.names(Cars)) 

Subsetting Data to see outliers, cadillac hardtop convertibles

#cadil<-Cars[which(Cars$Make=="Cadillac"),] # another way of subsetting

cadil <- subset(Cars, Price >= 60000 & Cars$Make=="Cadillac", select=c(Price, Mileage,Make,Trim,Model))

head(cadil)
## # A tibble: 6 x 5
##   Price Mileage     Make            Trim  Model
##   <dbl>   <int>    <chr>           <chr>  <chr>
## 1 70755     583 Cadillac Hardtop Conv 2D XLR-V8
## 2 68566    6420 Cadillac Hardtop Conv 2D XLR-V8
## 3 69134    7892 Cadillac Hardtop Conv 2D XLR-V8
## 4 66374   12021 Cadillac Hardtop Conv 2D XLR-V8
## 5 65281   15600 Cadillac Hardtop Conv 2D XLR-V8
## 6 63913   18200 Cadillac Hardtop Conv 2D XLR-V8
#tail(cadil) # use this if u want to see last few rows

Checking out model for cars priced less than 50K

Cars_sub<- subset(Cars, Price < 50000 , select=c(Price,Mileage,Make,Trim,Model,cyl1))


library(car)
scatterplot(Price~Mileage|cyl1 , data=Cars_sub,
            xlab="Mileage", ylab="Price",
            main="",
            labels=row.names(Cars_sub)) 

scatterplot(Price~Mileage|Make , data=Cars_sub,
            xlab="Mileage", ylab="Price",
            main="",
            labels=row.names(Cars_sub)) 

Model Fit

  • What happens to price, in general, when there is one or more mile on the car?
  • Does the fact that b1 is small means mielage is not improtant?
  • Does mileage help you predict price?
  • What does R-square value tell you?
carmodel<- lm(Price ~ Mileage + Make+cyl1,data=Cars)
#coef(carmodel) just to print coefficients
summary(carmodel)
## 
## Call:
## lm(formula = Price ~ Mileage + Make + cyl1, data = Cars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -12236  -1785   -144   1524  23330 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.89e+04   6.56e+02   28.83  < 2e-16 ***
## Mileage       -1.80e-01   1.66e-02  -10.85  < 2e-16 ***
## MakeCadillac   1.03e+04   7.36e+02   13.97  < 2e-16 ***
## MakeChevrolet -2.19e+03   5.25e+02   -4.18  3.3e-05 ***
## MakePontiac   -2.45e+03   5.54e+02   -4.42  1.1e-05 ***
## MakeSAAB       1.43e+04   6.65e+02   21.55  < 2e-16 ***
## MakeSaturn    -2.21e+03   7.21e+02   -3.07   0.0022 ** 
## cyl16          5.57e+03   3.59e+02   15.49  < 2e-16 ***
## cyl18          1.83e+04   5.86e+02   31.29  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3840 on 795 degrees of freedom
## Multiple R-squared:  0.851,  Adjusted R-squared:  0.849 
## F-statistic:  566 on 8 and 795 DF,  p-value: <2e-16
#aov(carmodel)# u can print ANOVA table too, next chapter

Fitting a seperate model for <50k cars imprvoes model fit

carmodel_sub<- lm(Price ~ Mileage + Make+cyl1,data=Cars_sub)
coef(carmodel_sub)
##   (Intercept)       Mileage  MakeCadillac MakeChevrolet   MakePontiac 
##     18485.953        -0.167      8350.489     -2009.819     -2217.745 
##      MakeSAAB    MakeSaturn         cyl16         cyl18 
##     14512.439     -2065.687      5743.264     17323.781
summary(carmodel_sub)
## 
## Call:
## lm(formula = Price ~ Mileage + Make + cyl1, data = Cars_sub)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -9163  -1789    -79   1518  14141 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.85e+04   5.29e+02   34.96  < 2e-16 ***
## Mileage       -1.67e-01   1.35e-02  -12.36  < 2e-16 ***
## MakeCadillac   8.35e+03   5.97e+02   13.99  < 2e-16 ***
## MakeChevrolet -2.01e+03   4.21e+02   -4.77  2.2e-06 ***
## MakePontiac   -2.22e+03   4.44e+02   -4.99  7.3e-07 ***
## MakeSAAB       1.45e+04   5.33e+02   27.21  < 2e-16 ***
## MakeSaturn    -2.07e+03   5.78e+02   -3.58  0.00037 ***
## cyl16          5.74e+03   2.88e+02   19.93  < 2e-16 ***
## cyl18          1.73e+04   4.72e+02   36.70  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3080 on 784 degrees of freedom
## Multiple R-squared:  0.876,  Adjusted R-squared:  0.874 
## F-statistic:  690 on 8 and 784 DF,  p-value: <2e-16
aov(carmodel_sub)
## Call:
##    aov(formula = carmodel_sub)
## 
## Terms:
##                  Mileage     Make     cyl1 Residuals
## Sum of Squares  1.10e+09 3.80e+10 1.31e+10  7.42e+09
## Deg. of Freedom        1        5        2       784
## 
## Residual standard error: 3077
## Estimated effects may be unbalanced

Assumptions and conditions

The P-value associated with the t-test for significance of \[H_{0}: \beta_1 =0\] is the same as the P-value associated with the analysis of variance F-test. This will always be true for the simple linear regression model The P-values are the same because of a well-known relationship between a t random variable and an F random variable that has 1 numerator degree of freedom. Namely:

\[t^{2}_{(n-2)}=F_{(1,n-2)}\] Thus for a given significance level \[\alpha\], the F-test of \[H_{0}: \beta_1 =0\] versus \[H_{a}: \beta_1 \neq 0\] is algebraically equivalent to the two-tailed t-test. We will get exactly the same P-values (Note: (-4.093)^{2}=16.75265) * If one test rejects H0, then so will the other. * If one test does not reject H0, then so will the other.

The natural question then is when should we use the F-test and when should we use the t-test?

The F-test is only appropriate for testing that the slope differs from 0. \[H_{a}:\beta_{1}\neq 0\] Use the t-test to test that the slope is positive \[H_{a}:\beta_{1}> 0\] or negative \[H_{a}:\beta_{1}< 0\]

  • The F-test is more useful for the multiple regression model when we want to test that more than one slope parameter is 0.
mod1<- lm(Price ~ Mileage,data=Cars)
summary(mod1)
## 
## Call:
## lm(formula = Price ~ Mileage, data = Cars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -13905  -7254  -3520   5188  46091 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.48e+04   9.04e+02   27.38  < 2e-16 ***
## Mileage     -1.72e-01   4.21e-02   -4.09  4.7e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9790 on 802 degrees of freedom
## Multiple R-squared:  0.0205, Adjusted R-squared:  0.0192 
## F-statistic: 16.8 on 1 and 802 DF,  p-value: 4.68e-05
msummary(carmodel)
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.89e+04   6.56e+02   28.83  < 2e-16 ***
## Mileage       -1.80e-01   1.66e-02  -10.85  < 2e-16 ***
## MakeCadillac   1.03e+04   7.36e+02   13.97  < 2e-16 ***
## MakeChevrolet -2.19e+03   5.25e+02   -4.18  3.3e-05 ***
## MakePontiac   -2.45e+03   5.54e+02   -4.42  1.1e-05 ***
## MakeSAAB       1.43e+04   6.65e+02   21.55  < 2e-16 ***
## MakeSaturn    -2.21e+03   7.21e+02   -3.07   0.0022 ** 
## cyl16          5.57e+03   3.59e+02   15.49  < 2e-16 ***
## cyl18          1.83e+04   5.86e+02   31.29  < 2e-16 ***
## 
## Residual standard error: 3840 on 795 degrees of freedom
## Multiple R-squared:  0.851,  Adjusted R-squared:  0.849 
## F-statistic:  566 on 8 and 795 DF,  p-value: <2e-16
rsquared(carmodel)
## [1] 0.851
confint(carmodel)   
##                   2.5 %    97.5 %
## (Intercept)   17637.337 20214.455
## Mileage          -0.213    -0.147
## MakeCadillac   8832.093 11720.153
## MakeChevrolet -3224.171 -1162.266
## MakePontiac   -3535.868 -1361.270
## MakeSAAB      13036.078 15648.431
## MakeSaturn    -3629.528  -799.359
## cyl16          4860.666  6271.698
## cyl18         17178.395 19478.206
anova(carmodel)
## Analysis of Variance Table
## 
## Response: Price
##            Df   Sum Sq  Mean Sq F value Pr(>F)    
## Mileage     1 1.61e+09 1.61e+09     109 <2e-16 ***
## Make        5 5.05e+10 1.01e+10     686 <2e-16 ***
## cyl1        2 1.46e+10 7.30e+09     495 <2e-16 ***
## Residuals 795 1.17e+10 1.47e+07                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  • The residual standard deviation S is and this estimates the standard deviation of the errors. r2 = (SSTO-SSE) / SSTO = SSR / (SSR+SSE) = 6.52/ (6.52+7.84) = .84 The interpretation is that mileage,make and cylinder together explain 84% of the variation in price of used cars

Extracting residuals and fitted values

  • Residuals can be found using carmodel$res or residuals(carmodel)
  • Fitted/predicted values can be found using fitted(carmodel)
error<-residuals(carmodel)
residualPlot(carmodel)

fit1<-fitted(carmodel)

#It is often helpful to attach the residuals and fitted values to data frame. note we use mutate

Cars<-mutate(Cars,error, fit1)

Examining residuals graphically, also checking model assumptions

xyplot(Price ~ Mileage,xlab="Mileage", group=Make,auto.key=TRUE,lwd=2,type=c("p", "r", "smooth"), data=Cars) # superimposed line and a smoother

tally(Cyl~ Make, data=Cars)
##    Make
## Cyl Buick Cadillac Chevrolet Pontiac SAAB Saturn
##   4     0        0       180      50  114     50
##   6    80       20       120      80    0     10
##   8     0       60        20      20    0      0
xyplot(resid(carmodel) ~ Mileage, xlab="Mileage", 
       type=c("p", "r", "smooth"), data=Cars)  # resiudal vs x