Exploring the data

library(readr)
Cars <- read_csv("P:/SUNY-Brockport/SUNY Brockport_1/teaching/Staitstical Methods/Text CD/Chapter 03/CSV Files/C3 Cars.csv")

Cars<-Cars[,c(1:12)]
cyl1<-as.character(Cars$Cyl)
Cars<-mutate(Cars,cyl1)
#glimpse(Cars)
#head(Cars)# will do same stuff as glimpse
str(Cars)

## Classes 'tbl_df', 'tbl' and 'data.frame':    804 obs. of  13 variables:
##  $ Mileage: int  8221 9135 13196 16342 19832 22236 22576 22964 24021 27325 ...
##  $ Price  : num  17314 17542 16219 16337 16339 ...
##  $ Make   : chr  "Buick" "Buick" "Buick" "Buick" ...
##  $ Model  : chr  "Century" "Century" "Century" "Century" ...
##  $ Trim   : chr  "Sedan 4D" "Sedan 4D" "Sedan 4D" "Sedan 4D" ...
##  $ Type   : chr  "Sedan" "Sedan" "Sedan" "Sedan" ...
##  $ Cyl    : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ Liter  : num  3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 3.1 ...
##  $ Doors  : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ Cruise : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Sound  : int  1 1 1 0 0 1 1 1 0 1 ...
##  $ Leather: int  1 0 0 0 1 0 0 0 1 1 ...
##  $ cyl1   : chr  "6" "6" "6" "6" ...

tally(Cyl~ Make, data=Cars)

##    Make
## Cyl Buick Cadillac Chevrolet Pontiac SAAB Saturn
##   4     0        0       180      50  114     50
##   6    80       20       120      80    0     10
##   8     0       60        20      20    0      0

Graphical Exploration

#Create a scatterplot between Mileage and Price:
#xyplot( Price ~ Mileage, data=Cars, main="Scatter Plot of Mileage vs. Price")

#fitting LM line or any arbitrary line using abline, 
attach(Cars)
#abline(h=4000)
plot(Mileage,Price)
abline(lm(Mileage~Price),col="red")
#fitting a lowess curve
lines(lowess(Cars),col="green")

Another way of constructing fancier scatter plots

Use the scatterplot( ) function in the car package, it offers many enhanced features, including fit lines, marginal box plots,conditioning on a factor, and interactive point identification.
For more on scatter plots go to http://www.statmethods.net/graphs/scatterplot.html

library(car)
scatterplot(Price~Mileage|cyl1 , data=Cars,
            xlab="Mileage", ylab="Price",
            main="",
            labels=row.names(Cars))

scatterplot(Price~Mileage|Make , data=Cars,
            xlab="Mileage", ylab="Price",
            main="",
            labels=row.names(Cars))

Subsetting Data to see outliers, cadillac hardtop convertibles

#cadil<-Cars[which(Cars$Make=="Cadillac"),] # another way of subsetting

cadil <- subset(Cars, Price >= 60000 & Cars$Make=="Cadillac", select=c(Price, Mileage,Make,Trim,Model))

head(cadil)

## # A tibble: 6 x 5
##   Price Mileage     Make            Trim  Model
##   <dbl>   <int>    <chr>           <chr>  <chr>
## 1 70755     583 Cadillac Hardtop Conv 2D XLR-V8
## 2 68566    6420 Cadillac Hardtop Conv 2D XLR-V8
## 3 69134    7892 Cadillac Hardtop Conv 2D XLR-V8
## 4 66374   12021 Cadillac Hardtop Conv 2D XLR-V8
## 5 65281   15600 Cadillac Hardtop Conv 2D XLR-V8
## 6 63913   18200 Cadillac Hardtop Conv 2D XLR-V8

#tail(cadil) # use this if u want to see last few rows

Checking out model for cars priced less than 50K

Cars_sub<- subset(Cars, Price < 50000 , select=c(Price,Mileage,Make,Trim,Model,cyl1))


library(car)
scatterplot(Price~Mileage|cyl1 , data=Cars_sub,
            xlab="Mileage", ylab="Price",
            main="",
            labels=row.names(Cars_sub))

scatterplot(Price~Mileage|Make , data=Cars_sub,
            xlab="Mileage", ylab="Price",
            main="",
            labels=row.names(Cars_sub))

Model Fit

What happens to price, in general, when there is one or more mile on the car?
Does the fact that b1 is small means mielage is not improtant?
Does mileage help you predict price?
What does R-square value tell you?

carmodel<- lm(Price ~ Mileage + Make+cyl1,data=Cars)
#coef(carmodel) just to print coefficients
summary(carmodel)

## 
## Call:
## lm(formula = Price ~ Mileage + Make + cyl1, data = Cars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -12236  -1785   -144   1524  23330 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.89e+04   6.56e+02   28.83  < 2e-16 ***
## Mileage       -1.80e-01   1.66e-02  -10.85  < 2e-16 ***
## MakeCadillac   1.03e+04   7.36e+02   13.97  < 2e-16 ***
## MakeChevrolet -2.19e+03   5.25e+02   -4.18  3.3e-05 ***
## MakePontiac   -2.45e+03   5.54e+02   -4.42  1.1e-05 ***
## MakeSAAB       1.43e+04   6.65e+02   21.55  < 2e-16 ***
## MakeSaturn    -2.21e+03   7.21e+02   -3.07   0.0022 ** 
## cyl16          5.57e+03   3.59e+02   15.49  < 2e-16 ***
## cyl18          1.83e+04   5.86e+02   31.29  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3840 on 795 degrees of freedom
## Multiple R-squared:  0.851,  Adjusted R-squared:  0.849 
## F-statistic:  566 on 8 and 795 DF,  p-value: <2e-16

#aov(carmodel)# u can print ANOVA table too, next chapter

Fitting a seperate model for <50k cars imprvoes model fit

carmodel_sub<- lm(Price ~ Mileage + Make+cyl1,data=Cars_sub)
coef(carmodel_sub)

##   (Intercept)       Mileage  MakeCadillac MakeChevrolet   MakePontiac 
##     18485.953        -0.167      8350.489     -2009.819     -2217.745 
##      MakeSAAB    MakeSaturn         cyl16         cyl18 
##     14512.439     -2065.687      5743.264     17323.781

summary(carmodel_sub)

## 
## Call:
## lm(formula = Price ~ Mileage + Make + cyl1, data = Cars_sub)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -9163  -1789    -79   1518  14141 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.85e+04   5.29e+02   34.96  < 2e-16 ***
## Mileage       -1.67e-01   1.35e-02  -12.36  < 2e-16 ***
## MakeCadillac   8.35e+03   5.97e+02   13.99  < 2e-16 ***
## MakeChevrolet -2.01e+03   4.21e+02   -4.77  2.2e-06 ***
## MakePontiac   -2.22e+03   4.44e+02   -4.99  7.3e-07 ***
## MakeSAAB       1.45e+04   5.33e+02   27.21  < 2e-16 ***
## MakeSaturn    -2.07e+03   5.78e+02   -3.58  0.00037 ***
## cyl16          5.74e+03   2.88e+02   19.93  < 2e-16 ***
## cyl18          1.73e+04   4.72e+02   36.70  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3080 on 784 degrees of freedom
## Multiple R-squared:  0.876,  Adjusted R-squared:  0.874 
## F-statistic:  690 on 8 and 784 DF,  p-value: <2e-16

aov(carmodel_sub)

## Call:
##    aov(formula = carmodel_sub)
## 
## Terms:
##                  Mileage     Make     cyl1 Residuals
## Sum of Squares  1.10e+09 3.80e+10 1.31e+10  7.42e+09
## Deg. of Freedom        1        5        2       784
## 
## Residual standard error: 3077
## Estimated effects may be unbalanced

Assumptions and conditions

The P-value associated with the t-test for significance of \[H_{0}: \beta_1 =0\] is the same as the P-value associated with the analysis of variance F-test. This will always be true for the simple linear regression model The P-values are the same because of a well-known relationship between a t random variable and an F random variable that has 1 numerator degree of freedom. Namely:

\[t^{2}_{(n-2)}=F_{(1,n-2)}\] Thus for a given significance level \[\alpha\], the F-test of \[H_{0}: \beta_1 =0\] versus \[H_{a}: \beta_1 \neq 0\] is algebraically equivalent to the two-tailed t-test. We will get exactly the same P-values (Note: (-4.093)^{2}=16.75265) * If one test rejects H0, then so will the other. * If one test does not reject H0, then so will the other.

The natural question then is when should we use the F-test and when should we use the t-test?

The F-test is only appropriate for testing that the slope differs from 0. \[H_{a}:\beta_{1}\neq 0\] Use the t-test to test that the slope is positive \[H_{a}:\beta_{1}> 0\] or negative \[H_{a}:\beta_{1}< 0\]

The F-test is more useful for the multiple regression model when we want to test that more than one slope parameter is 0.

mod1<- lm(Price ~ Mileage,data=Cars)
summary(mod1)

## 
## Call:
## lm(formula = Price ~ Mileage, data = Cars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -13905  -7254  -3520   5188  46091 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.48e+04   9.04e+02   27.38  < 2e-16 ***
## Mileage     -1.72e-01   4.21e-02   -4.09  4.7e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9790 on 802 degrees of freedom
## Multiple R-squared:  0.0205, Adjusted R-squared:  0.0192 
## F-statistic: 16.8 on 1 and 802 DF,  p-value: 4.68e-05

msummary(carmodel)

##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.89e+04   6.56e+02   28.83  < 2e-16 ***
## Mileage       -1.80e-01   1.66e-02  -10.85  < 2e-16 ***
## MakeCadillac   1.03e+04   7.36e+02   13.97  < 2e-16 ***
## MakeChevrolet -2.19e+03   5.25e+02   -4.18  3.3e-05 ***
## MakePontiac   -2.45e+03   5.54e+02   -4.42  1.1e-05 ***
## MakeSAAB       1.43e+04   6.65e+02   21.55  < 2e-16 ***
## MakeSaturn    -2.21e+03   7.21e+02   -3.07   0.0022 ** 
## cyl16          5.57e+03   3.59e+02   15.49  < 2e-16 ***
## cyl18          1.83e+04   5.86e+02   31.29  < 2e-16 ***
## 
## Residual standard error: 3840 on 795 degrees of freedom
## Multiple R-squared:  0.851,  Adjusted R-squared:  0.849 
## F-statistic:  566 on 8 and 795 DF,  p-value: <2e-16

rsquared(carmodel)

## [1] 0.851

confint(carmodel)

##                   2.5 %    97.5 %
## (Intercept)   17637.337 20214.455
## Mileage          -0.213    -0.147
## MakeCadillac   8832.093 11720.153
## MakeChevrolet -3224.171 -1162.266
## MakePontiac   -3535.868 -1361.270
## MakeSAAB      13036.078 15648.431
## MakeSaturn    -3629.528  -799.359
## cyl16          4860.666  6271.698
## cyl18         17178.395 19478.206

anova(carmodel)

## Analysis of Variance Table
## 
## Response: Price
##            Df   Sum Sq  Mean Sq F value Pr(>F)    
## Mileage     1 1.61e+09 1.61e+09     109 <2e-16 ***
## Make        5 5.05e+10 1.01e+10     686 <2e-16 ***
## cyl1        2 1.46e+10 7.30e+09     495 <2e-16 ***
## Residuals 795 1.17e+10 1.47e+07                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The residual standard deviation S is and this estimates the standard deviation of the errors. r2 = (SSTO-SSE) / SSTO = SSR / (SSR+SSE) = 6.52/ (6.52+7.84) = .84 The interpretation is that mileage,make and cylinder together explain 84% of the variation in price of used cars

Extracting residuals and fitted values

Residuals can be found using carmodel$res or residuals(carmodel)
Fitted/predicted values can be found using fitted(carmodel)

error<-residuals(carmodel)
residualPlot(carmodel)

fit1<-fitted(carmodel)

#It is often helpful to attach the residuals and fitted values to data frame. note we use mutate

Cars<-mutate(Cars,error, fit1)

Examining residuals graphically, also checking model assumptions

xyplot(Price ~ Mileage,xlab="Mileage", group=Make,auto.key=TRUE,lwd=2,type=c("p", "r", "smooth"), data=Cars) # superimposed line and a smoother

tally(Cyl~ Make, data=Cars)

##    Make
## Cyl Buick Cadillac Chevrolet Pontiac SAAB Saturn
##   4     0        0       180      50  114     50
##   6    80       20       120      80    0     10
##   8     0       60        20      20    0      0

xyplot(resid(carmodel) ~ Mileage, xlab="Mileage", 
       type=c("p", "r", "smooth"), data=Cars)  # resiudal vs x

xyplot(resid(carmodel) ~ fitted(carmodel), xlab="Predicted values", 
       ylab="Residuals",
       type=c("p", "r", "smooth"), data=Cars) #residual vs predicted value

xqqmath(~ resid(carmodel))

histogram(~ resid(carmodel)) #histogram of residuals

Interpretation: The residual versus fitted value plot indicate a good fit if the variance is roughly the same all the way across and there are no worrisome patterns
If plot of residuals versus fits shows that the residual variance (vertical spread, funnel shape) increases as the fitted values (predicted values of sale price) increase. This violates the assumption of constant error variance.
If the pattern is curved it indicates that the wrong type of equation was used.

Estimation and Prediction & Prediction intervals using the panel.lmbands() function.

The two research questions that we deal with are

What is the mean response \[E(Y)\] when the predictor value is \[x_{h}\]?

What value will a new response \[y_{new}\] take when the predictor value is \[x_{h}\]?

confpred <- predict(carmodel, interval="confidence")
head(confpred)

##     fit   lwr   upr
## 1 23012 22081 23944
## 2 22848 21928 23767
## 3 22117 21242 22992
## 4 21551 20698 22404
## 5 20922 20080 21765
## 6 20490 19645 21334

intpred <- predict(carmodel, interval="prediction")
head(intpred)

##     fit   lwr   upr
## 1 23012 15418 30606
## 2 22848 15255 30440
## 3 22117 14529 29704
## 4 21551 13966 29135
## 5 20922 13339 28506
## 6 20490 12906 28074

xyplot(Price ~ Mileage, xlab="Mileage", 
       panel=panel.lmbands, lwd=2, cex=0.2, data=Cars)

Stepwise regression

The stepwise regression method implemented in R uses AIC (not \[R^{2}\] as the basis for adding a new variable to the model.
Models with smaller AIC values are preferred.

full.model<-lm(Price~Make+cyl1+Liter+Doors+Cruise+Sound+Leather+Mileage, data = Cars)
#Now use the step function, and store the results in the object step.model:
step.model <- step(full.model)

## Start:  AIC=13067
## Price ~ Make + cyl1 + Liter + Doors + Cruise + Sound + Leather + 
##     Mileage
## 
##           Df Sum of Sq      RSS   AIC
## - Cruise   1  2.02e+06 8.88e+09 13065
## - Sound    1  4.25e+06 8.88e+09 13065
## - Leather  1  1.14e+07 8.89e+09 13066
## <none>                 8.88e+09 13067
## - cyl1     2  6.15e+08 9.49e+09 13117
## - Doors    1  1.01e+09 9.89e+09 13152
## - Liter    1  1.13e+09 1.00e+10 13161
## - Mileage  1  1.79e+09 1.07e+10 13212
## - Make     5  2.54e+10 3.43e+10 14143
## 
## Step:  AIC=13065
## Price ~ Make + cyl1 + Liter + Doors + Sound + Leather + Mileage
## 
##           Df Sum of Sq      RSS   AIC
## - Sound    1  4.37e+06 8.89e+09 13063
## - Leather  1  1.06e+07 8.89e+09 13064
## <none>                 8.88e+09 13065
## - cyl1     2  6.38e+08 9.52e+09 13117
## - Doors    1  1.02e+09 9.90e+09 13150
## - Liter    1  1.14e+09 1.00e+10 13160
## - Mileage  1  1.79e+09 1.07e+10 13210
## - Make     5  3.12e+10 4.00e+10 14266
## 
## Step:  AIC=13063
## Price ~ Make + cyl1 + Liter + Doors + Leather + Mileage
## 
##           Df Sum of Sq      RSS   AIC
## - Leather  1  8.85e+06 8.89e+09 13062
## <none>                 8.89e+09 13063
## - cyl1     2  6.34e+08 9.52e+09 13115
## - Doors    1  1.03e+09 9.91e+09 13149
## - Liter    1  1.15e+09 1.00e+10 13159
## - Mileage  1  1.79e+09 1.07e+10 13209
## - Make     5  3.14e+10 4.03e+10 14268
## 
## Step:  AIC=13062
## Price ~ Make + cyl1 + Liter + Doors + Mileage
## 
##           Df Sum of Sq      RSS   AIC
## <none>                 8.89e+09 13062
## - cyl1     2  6.26e+08 9.52e+09 13113
## - Doors    1  1.03e+09 9.93e+09 13149
## - Liter    1  1.14e+09 1.00e+10 13157
## - Mileage  1  1.80e+09 1.07e+10 13208
## - Make     5  3.14e+10 4.03e+10 14267

#To see the suggested final model, apply the summary command to step.model:
summary(step.model)

## 
## Call:
## lm(formula = Price ~ Make + cyl1 + Liter + Doors + Mileage, data = Cars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -10431  -1747     28   1310  22134 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.58e+04   1.31e+03   12.04  < 2e-16 ***
## MakeCadillac   1.38e+04   7.00e+02   19.71  < 2e-16 ***
## MakeChevrolet -3.01e+03   4.65e+02   -6.48  1.6e-10 ***
## MakePontiac   -2.54e+03   4.86e+02   -5.24  2.1e-07 ***
## MakeSAAB       1.32e+04   5.85e+02   22.47  < 2e-16 ***
## MakeSaturn    -3.24e+03   6.35e+02   -5.10  4.2e-07 ***
## cyl16         -1.54e+03   7.86e+02   -1.97     0.05 *  
## cyl18          1.76e+03   1.59e+03    1.11     0.27    
## Liter          4.42e+03   4.38e+02   10.09  < 2e-16 ***
## Doors         -1.47e+03   1.53e+02   -9.60  < 2e-16 ***
## Mileage       -1.83e-01   1.45e-02  -12.65  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3350 on 793 degrees of freedom
## Multiple R-squared:  0.887,  Adjusted R-squared:  0.885 
## F-statistic:  620 on 10 and 793 DF,  p-value: <2e-16

summary(carmodel)

## 
## Call:
## lm(formula = Price ~ Mileage + Make + cyl1, data = Cars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -12236  -1785   -144   1524  23330 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.89e+04   6.56e+02   28.83  < 2e-16 ***
## Mileage       -1.80e-01   1.66e-02  -10.85  < 2e-16 ***
## MakeCadillac   1.03e+04   7.36e+02   13.97  < 2e-16 ***
## MakeChevrolet -2.19e+03   5.25e+02   -4.18  3.3e-05 ***
## MakePontiac   -2.45e+03   5.54e+02   -4.42  1.1e-05 ***
## MakeSAAB       1.43e+04   6.65e+02   21.55  < 2e-16 ***
## MakeSaturn    -2.21e+03   7.21e+02   -3.07   0.0022 ** 
## cyl16          5.57e+03   3.59e+02   15.49  < 2e-16 ***
## cyl18          1.83e+04   5.86e+02   31.29  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3840 on 795 degrees of freedom
## Multiple R-squared:  0.851,  Adjusted R-squared:  0.849 
## F-statistic:  566 on 8 and 795 DF,  p-value: <2e-16

Note that the final model left out some of the original seven variables. Comments!
Best Subset Regression: another way of automated model selection, can be doen using leaps package

Finding a Good Re-expression

When the regression function is not linear and the error terms are not normal and have unequal variances, we can use transfomrations. In general,although not always!:
Transforming the y values corrects problems with the error terms (and may help the non-linearity).
Transforming the x values primarily corrects the non-linearity.

carmodel_log<- lm(log(Price) ~ Mileage + Make+cyl1,data=Cars)
summary(carmodel_log)

## 
## Call:
## lm(formula = log(Price) ~ Mileage + Make + cyl1, data = Cars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3537 -0.0884  0.0004  0.0853  0.3851 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.74e+00   2.38e-02  409.86  < 2e-16 ***
## Mileage       -7.95e-06   6.01e-07  -13.23  < 2e-16 ***
## MakeCadillac   3.11e-01   2.66e-02   11.67  < 2e-16 ***
## MakeChevrolet -1.26e-01   1.90e-02   -6.63  6.2e-11 ***
## MakePontiac   -8.66e-02   2.01e-02   -4.32  1.8e-05 ***
## MakeSAAB       7.07e-01   2.41e-02   29.35  < 2e-16 ***
## MakeSaturn    -1.04e-01   2.61e-02   -3.99  7.3e-05 ***
## cyl16          3.52e-01   1.30e-02   27.02  < 2e-16 ***
## cyl18          8.02e-01   2.12e-02   37.80  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.139 on 795 degrees of freedom
## Multiple R-squared:  0.886,  Adjusted R-squared:  0.885 
## F-statistic:  774 on 8 and 795 DF,  p-value: <2e-16

residualPlot(carmodel_log)

xyplot(resid(carmodel_log) ~ Mileage, xlab="Mileage", 
       type=c("p", "r", "smooth"), data=Cars)  # resiudal vs x

xyplot(resid(carmodel_log) ~ fitted(carmodel), xlab="Predicted values", 
       ylab="Residuals",
       type=c("p", "r", "smooth"), data=Cars) #residual vs predicted value

xqqmath(~ resid(carmodel_log))

histogram(~ resid(carmodel_log)) #histogram of residuals

Bells and whiskers of visual displays using diffrent data

xyplot(Price ~ Mileage,xlab="Mileage", group=Make,auto.key=TRUE,lwd=2,type=c("p", "r", "smooth"), data=Cars_sub) # superimposed line and a smoother

# another way of graphing
model <- lm(Price ~ Mileage + Make,data=Cars_sub)
wt <- makeFun(model)
xyplot(Price ~ Mileage, groups=Make, data = Cars_sub, col=c("blue", "purple" ,"gray" , "red", "orange","green"),auto.key=list(space="top",columns=4,title="Cars", cex.title=1,lwd=2,type=c("p", "r", "smooth"), points=FALSE,col=c("blue", "purple" ,"gray" , "red", "orange","green")))

plotFun(wt(h,Make="Buick") ~ h, add=TRUE,
col="blue")

plotFun(wt(h,Make="Cadillac") ~ h, add=TRUE,
col="purple")

plotFun(wt(h,Make="Chevrolet") ~ h, add=TRUE,
col="gray")

plotFun(wt(h,Make="Pontiac") ~ h, add=TRUE,
col="red")

plotFun(wt(h,Make="SAAB") ~ h, add=TRUE,
col="orange")

plotFun(wt(h,Make="Saturn") ~ h, add=TRUE,
col="green")

Below figure displays the scatterplot stratified by what shelf (category) it is displayed on at the store, it also shows how you can add a factor variable ot a data

Cereals <- read.csv("http://nhorton.people.amherst.edu/sdm4/data/Cereals.csv")
Cereals[c(1,2),c(1,2,3)]

##                name mfr calories
## 1         100%_Bran   N       70
## 2 100%_Natural_Bran   Q      120

tally(~ shelf, data=Cereals)

## shelf
##  1  2  3 
## 20 21 36

# adding a factor variable to data
Cereals <- mutate(Cereals, shelfgrp = derivedFactor(
  bottomshelf = shelf==1,
  middleshelf = shelf==2,
  topshelf = shelf==3
))
tally(~ shelfgrp, data=Cereals)

## shelfgrp
## bottomshelf middleshelf    topshelf 
##          20          21          36

xyplot(calories ~ sugars, group=shelfgrp, auto.key=TRUE, 
  lwd=2, type=c("p", "r"), data=Cereals)

data(Births78)
if (require(lattice)) {
  xyplot(births ~ date, Births78)
  xyplot(births ~ date, Births78, groups = wday, main="Births by Weekday ",t="l", auto.key=list(space="top",columns=4, 
                     title="Weekday", cex.title=1,
                     lines=TRUE, points=FALSE))
  
}

Regression

TZaihra

March 1, 2018