##           installed_and_loaded.packages.
## prettydoc                           TRUE
## dplyr                               TRUE
## gvlma                               TRUE
## MASS                                TRUE
## lmtest                              TRUE
## car                                 TRUE

Using R, build a multiple regression model for data that interests you.

Include in this model at least one quadratic term, one dichotomous term, and one dichotomous vs. quantitative interaction term.

Interpret all coefficients.
Conduct residual analysis.
Was the linear model appropriate? Why or why not?

Multiple LR with Wine Data Set

Last week, I created a univariate LM using the wine’s color intensity to predict the alcohol content, but it only accounted for 30% of the variation. Let’s see if we can find a better multiple linear model for a wine’s alcohol content.

Background Info on Data

“These data are the results of a chemical analysis of wines grown in the same region in Italy but derived from three different cultivars. The analysis determined the quantities of 13 constituents found in each of the three types of wines.” UCI Machine Learning Repository

# Location of the UCI wine data file that is read below.
url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"

# The file is read without a header row. `FALSE` is spelled out because `F` is
# an ordinary variable that can be reassigned.
wine_df <- read.csv(url, header = FALSE)

# Assign descriptive column names: the cultivar identifier followed by the 13
# measured constituents.
names(wine_df) <- c("cultivars", "Alcohol", "Malic_acid", "Ash", "Alcalinity_of_ash", 
    "Magnesium", "Total_phenols", "Flavanoids", "Nonflavanoid_phenols", "Proanthocyanins", 
    "Color_intensity", "Hue", "OD280_OD315_of_dilutedwines", "Proline")

# Compact overview of column types and leading values (dplyr).
glimpse(wine_df)

## Observations: 178
## Variables: 14
## $ cultivars                   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ Alcohol                     <dbl> 14.23, 13.20, 13.16, 14.37, 13.24,...
## $ Malic_acid                  <dbl> 1.71, 1.78, 2.36, 1.95, 2.59, 1.76...
## $ Ash                         <dbl> 2.43, 2.14, 2.67, 2.50, 2.87, 2.45...
## $ Alcalinity_of_ash           <dbl> 15.6, 11.2, 18.6, 16.8, 21.0, 15.2...
## $ Magnesium                   <int> 127, 100, 101, 113, 118, 112, 96, ...
## $ Total_phenols               <dbl> 2.80, 2.65, 2.80, 3.85, 2.80, 3.27...
## $ Flavanoids                  <dbl> 3.06, 2.76, 3.24, 3.49, 2.69, 3.39...
## $ Nonflavanoid_phenols        <dbl> 0.28, 0.26, 0.30, 0.24, 0.39, 0.34...
## $ Proanthocyanins             <dbl> 2.29, 1.28, 2.81, 2.18, 1.82, 1.97...
## $ Color_intensity             <dbl> 5.64, 4.38, 5.68, 7.80, 4.32, 6.75...
## $ Hue                         <dbl> 1.04, 1.05, 1.03, 0.86, 1.04, 1.05...
## $ OD280_OD315_of_dilutedwines <dbl> 3.92, 3.40, 3.17, 3.45, 2.93, 2.85...
## $ Proline                     <int> 1065, 1050, 1185, 1480, 735, 1450,...

Before proceeding with the model, let’s get R to treat the cultivars like the categorical variable that it is by converting it to a factor.

# Convert the cultivar identifier column to a factor (categorical variable).
wine_df[["cultivars"]] <- factor(wine_df[["cultivars"]])

Pairs

# Scatterplot matrix of every column in the data set.
graphics::pairs(wine_df)

Model Summary

# Full model: Alcohol regressed on the cultivar factor plus the twelve other
# measured constituents.
# Disabled experiment, kept for reference:
#   wine_df$Color_intensity <- log(wine_df$Color_intensity)
wine_model <- lm(
  Alcohol ~ cultivars + Malic_acid + Ash + Alcalinity_of_ash +
    Magnesium + Total_phenols + Flavanoids + Nonflavanoid_phenols +
    Proanthocyanins + Color_intensity + Hue +
    OD280_OD315_of_dilutedwines + Proline,
  data = wine_df
)

# Coefficient table and fit statistics for the full model.
summary(wine_model)

## 
## Call:
## lm(formula = Alcohol ~ cultivars + Malic_acid + Ash + Alcalinity_of_ash + 
##     Magnesium + Total_phenols + Flavanoids + Nonflavanoid_phenols + 
##     Proanthocyanins + Color_intensity + Hue + OD280_OD315_of_dilutedwines + 
##     Proline, data = wine_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.28644 -0.32082 -0.01968  0.32130  1.55798 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 12.8897743  0.6520965  19.767  < 2e-16 ***
## cultivars2                  -1.1190123  0.1977998  -5.657 6.76e-08 ***
## cultivars3                  -0.7246470  0.2977755  -2.434   0.0160 *  
## Malic_acid                   0.0745271  0.0437377   1.704   0.0903 .  
## Ash                         -0.2857993  0.2129813  -1.342   0.1815    
## Alcalinity_of_ash            0.0043847  0.0180854   0.242   0.8087    
## Magnesium                   -0.0001002  0.0030792  -0.033   0.9741    
## Total_phenols                0.0987638  0.1245065   0.793   0.4288    
## Flavanoids                  -0.0504991  0.1152621  -0.438   0.6619    
## Nonflavanoid_phenols        -0.0372904  0.4047649  -0.092   0.9267    
## Proanthocyanins             -0.0671147  0.0912196  -0.736   0.4629    
## Color_intensity              0.1243005  0.0300102   4.142 5.52e-05 ***
## Hue                          0.3521632  0.2630523   1.339   0.1825    
## OD280_OD315_of_dilutedwines  0.0307750  0.1090821   0.282   0.7782    
## Proline                      0.0001729  0.0002380   0.726   0.4687    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4912 on 163 degrees of freedom
## Multiple R-squared:  0.6628, Adjusted R-squared:  0.6339 
## F-statistic: 22.89 on 14 and 163 DF,  p-value: < 2.2e-16

Backward Elimination

With all the variables, our Adjusted R-squared is 0.6339, but we have some variables that are not relevant to the model. Let’s start by removing Nonflavanoid_phenols, which has one of the largest p-values (0.9267), far above our significance level of .05.

# Refit the current model without Nonflavanoid_phenols, then show the new fit.
wine_model <- update(
  wine_model,
  formula. = . ~ . - Nonflavanoid_phenols,
  data = wine_df
)
summary(wine_model)

## 
## Call:
## lm(formula = Alcohol ~ cultivars + Malic_acid + Ash + Alcalinity_of_ash + 
##     Magnesium + Total_phenols + Flavanoids + Proanthocyanins + 
##     Color_intensity + Hue + OD280_OD315_of_dilutedwines + Proline, 
##     data = wine_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.29054 -0.32101 -0.01795  0.31986  1.56541 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  1.288e+01  6.375e-01  20.200  < 2e-16 ***
## cultivars2                  -1.120e+00  1.971e-01  -5.681 5.97e-08 ***
## cultivars3                  -7.211e-01  2.944e-01  -2.449   0.0154 *  
## Malic_acid                   7.404e-02  4.328e-02   1.711   0.0890 .  
## Ash                         -2.917e-01  2.025e-01  -1.440   0.1516    
## Alcalinity_of_ash            4.376e-03  1.803e-02   0.243   0.8085    
## Magnesium                   -1.947e-05  2.943e-03  -0.007   0.9947    
## Total_phenols                9.773e-02  1.236e-01   0.791   0.4304    
## Flavanoids                  -4.741e-02  1.099e-01  -0.431   0.6669    
## Proanthocyanins             -6.749e-02  9.085e-02  -0.743   0.4586    
## Color_intensity              1.240e-01  2.969e-02   4.175 4.83e-05 ***
## Hue                          3.493e-01  2.605e-01   1.341   0.1817    
## OD280_OD315_of_dilutedwines  3.284e-02  1.064e-01   0.309   0.7581    
## Proline                      1.736e-04  2.372e-04   0.732   0.4652    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4897 on 164 degrees of freedom
## Multiple R-squared:  0.6628, Adjusted R-squared:  0.6361 
## F-statistic:  24.8 on 13 and 164 DF,  p-value: < 2.2e-16

Our Adjusted R-squared is 0.6361. Next, let’s remove Magnesium, which has the next largest p-value greater than our significance level.

# Refit the current model without Magnesium, then show the new fit.
wine_model <- update(
  wine_model,
  formula. = . ~ . - Magnesium,
  data = wine_df
)
summary(wine_model)

## 
## Call:
## lm(formula = Alcohol ~ cultivars + Malic_acid + Ash + Alcalinity_of_ash + 
##     Total_phenols + Flavanoids + Proanthocyanins + Color_intensity + 
##     Hue + OD280_OD315_of_dilutedwines + Proline, data = wine_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.29034 -0.32130 -0.01807  0.31952  1.56564 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 12.8767316  0.6057498  21.258  < 2e-16 ***
## cultivars2                  -1.1196049  0.1964431  -5.699 5.41e-08 ***
## cultivars3                  -0.7211308  0.2934733  -2.457   0.0150 *  
## Malic_acid                   0.0740501  0.0430891   1.719   0.0876 .  
## Ash                         -0.2919628  0.1979999  -1.475   0.1422    
## Alcalinity_of_ash            0.0043819  0.0179556   0.244   0.8075    
## Total_phenols                0.0977106  0.1232245   0.793   0.4289    
## Flavanoids                  -0.0473888  0.1095632  -0.433   0.6659    
## Proanthocyanins             -0.0676042  0.0889758  -0.760   0.4485    
## Color_intensity              0.1239667  0.0295861   4.190 4.53e-05 ***
## Hue                          0.3493289  0.2596745   1.345   0.1804    
## OD280_OD315_of_dilutedwines  0.0328908  0.1058234   0.311   0.7563    
## Proline                      0.0001734  0.0002336   0.742   0.4591    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4882 on 165 degrees of freedom
## Multiple R-squared:  0.6628, Adjusted R-squared:  0.6383 
## F-statistic: 27.03 on 12 and 165 DF,  p-value: < 2.2e-16

Our Adjusted R-squared is 0.6383. Next, let’s remove Proanthocyanins, whose p-value (0.4485) is well above our significance level.

# Refit the current model without Proanthocyanins, then show the new fit.
wine_model <- update(
  wine_model,
  formula. = . ~ . - Proanthocyanins,
  data = wine_df
)
summary(wine_model)

## 
## Call:
## lm(formula = Alcohol ~ cultivars + Malic_acid + Ash + Alcalinity_of_ash + 
##     Total_phenols + Flavanoids + Color_intensity + Hue + OD280_OD315_of_dilutedwines + 
##     Proline, data = wine_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.31957 -0.32146 -0.00911  0.31488  1.56805 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 12.8747746  0.6049726  21.282  < 2e-16 ***
## cultivars2                  -1.1432034  0.1937252  -5.901 1.97e-08 ***
## cultivars3                  -0.7353523  0.2925027  -2.514   0.0129 *  
## Malic_acid                   0.0711548  0.0428656   1.660   0.0988 .  
## Ash                         -0.2740977  0.1963484  -1.396   0.1646    
## Alcalinity_of_ash            0.0035524  0.0178995   0.198   0.8429    
## Total_phenols                0.0894779  0.1225908   0.730   0.4665    
## Flavanoids                  -0.0698754  0.1053560  -0.663   0.5081    
## Color_intensity              0.1207591  0.0292460   4.129 5.76e-05 ***
## Hue                          0.3528511  0.2593024   1.361   0.1754    
## OD280_OD315_of_dilutedwines  0.0257211  0.1052676   0.244   0.8073    
## Proline                      0.0001525  0.0002317   0.658   0.5114    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4876 on 166 degrees of freedom
## Multiple R-squared:  0.6616, Adjusted R-squared:  0.6392 
## F-statistic: 29.51 on 11 and 166 DF,  p-value: < 2.2e-16

Our Adjusted R-squared is 0.6392. Next, let’s remove OD280_OD315_of_dilutedwines, which has one of the largest p-values (0.8073), well above our significance level.

# Refit the current model without OD280_OD315_of_dilutedwines, then show the
# new fit.
wine_model <- update(
  wine_model,
  formula. = . ~ . - OD280_OD315_of_dilutedwines,
  data = wine_df
)
summary(wine_model)

## 
## Call:
## lm(formula = Alcohol ~ cultivars + Malic_acid + Ash + Alcalinity_of_ash + 
##     Total_phenols + Flavanoids + Color_intensity + Hue + Proline, 
##     data = wine_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.32097 -0.32235 -0.01811  0.31924  1.56915 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       12.9414239  0.5384655  24.034  < 2e-16 ***
## cultivars2        -1.1561527  0.1858098  -6.222 3.81e-09 ***
## cultivars3        -0.7616554  0.2712065  -2.808  0.00557 ** 
## Malic_acid         0.0716138  0.0427037   1.677  0.09541 .  
## Ash               -0.2781878  0.1950820  -1.426  0.15573    
## Alcalinity_of_ash  0.0042375  0.0176287   0.240  0.81033    
## Total_phenols      0.0960091  0.1193039   0.805  0.42211    
## Flavanoids        -0.0658263  0.1037512  -0.634  0.52665    
## Color_intensity    0.1191907  0.0284525   4.189 4.53e-05 ***
## Hue                0.3504651  0.2583879   1.356  0.17682    
## Proline            0.0001459  0.0002295   0.636  0.52586    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4862 on 167 degrees of freedom
## Multiple R-squared:  0.6615, Adjusted R-squared:  0.6412 
## F-statistic: 32.64 on 10 and 167 DF,  p-value: < 2.2e-16

Our Adjusted R-squared is 0.6412. Next, let’s remove Proline, whose p-value (0.5259) is well above our significance level.

# Refit the current model without Proline, then show the new fit.
wine_model <- update(
  wine_model,
  formula. = . ~ . - Proline,
  data = wine_df
)
summary(wine_model)

## 
## Call:
## lm(formula = Alcohol ~ cultivars + Malic_acid + Ash + Alcalinity_of_ash + 
##     Total_phenols + Flavanoids + Color_intensity + Hue, data = wine_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.35224 -0.31654 -0.00908  0.33739  1.54308 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       13.05249    0.50842  25.672  < 2e-16 ***
## cultivars2        -1.23177    0.14249  -8.644 4.12e-15 ***
## cultivars3        -0.83233    0.24693  -3.371  0.00093 ***
## Malic_acid         0.06916    0.04245   1.629  0.10519    
## Ash               -0.27601    0.19471  -1.418  0.15817    
## Alcalinity_of_ash  0.00385    0.01759   0.219  0.82700    
## Total_phenols      0.10348    0.11851   0.873  0.38383    
## Flavanoids        -0.07294    0.10296  -0.708  0.47967    
## Color_intensity    0.12400    0.02738   4.529 1.12e-05 ***
## Hue                0.37991    0.25375   1.497  0.13622    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4854 on 168 degrees of freedom
## Multiple R-squared:  0.6607, Adjusted R-squared:  0.6425 
## F-statistic: 36.35 on 9 and 168 DF,  p-value: < 2.2e-16

Our Adjusted R-squared is 0.6425. Next, let’s remove Alcalinity_of_ash, which has the next largest p-value greater than our significance level.

# Refit the current model without Alcalinity_of_ash, then show the new fit.
wine_model <- update(
  wine_model,
  formula. = . ~ . - Alcalinity_of_ash,
  data = wine_df
)
summary(wine_model)

## 
## Call:
## lm(formula = Alcohol ~ cultivars + Malic_acid + Ash + Total_phenols + 
##     Flavanoids + Color_intensity + Hue, data = wine_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.34412 -0.30791 -0.00739  0.33559  1.55061 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     13.05188    0.50698  25.744  < 2e-16 ***
## cultivars2      -1.21371    0.11586 -10.475  < 2e-16 ***
## cultivars3      -0.81491    0.23308  -3.496 0.000603 ***
## Malic_acid       0.07005    0.04214   1.662 0.098296 .  
## Ash             -0.24829    0.14750  -1.683 0.094153 .  
## Total_phenols    0.10267    0.11812   0.869 0.385977    
## Flavanoids      -0.07171    0.10252  -0.699 0.485224    
## Color_intensity  0.12368    0.02726   4.537 1.08e-05 ***
## Hue              0.37689    0.25266   1.492 0.137650    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.484 on 169 degrees of freedom
## Multiple R-squared:  0.6606, Adjusted R-squared:  0.6445 
## F-statistic: 41.12 on 8 and 169 DF,  p-value: < 2.2e-16

Our Adjusted R-squared is 0.6445. Next, let’s remove Flavanoids, which has the next largest p-value greater than our significance level.

# Refit the current model without Flavanoids, then show the new fit.
wine_model <- update(
  wine_model,
  formula. = . ~ . - Flavanoids,
  data = wine_df
)
summary(wine_model)

## 
## Call:
## lm(formula = Alcohol ~ cultivars + Malic_acid + Ash + Total_phenols + 
##     Color_intensity + Hue, data = wine_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.34724 -0.30279 -0.02136  0.31395  1.54046 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     13.06056    0.50607  25.808  < 2e-16 ***
## cultivars2      -1.19500    0.11256 -10.616  < 2e-16 ***
## cultivars3      -0.71099    0.17933  -3.965 0.000108 ***
## Malic_acid       0.06924    0.04206   1.646 0.101539    
## Ash             -0.26666    0.14492  -1.840 0.067512 .  
## Total_phenols    0.04809    0.08854   0.543 0.587768    
## Color_intensity  0.11946    0.02655   4.500 1.26e-05 ***
## Hue              0.37924    0.25226   1.503 0.134595    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4833 on 170 degrees of freedom
## Multiple R-squared:  0.6596, Adjusted R-squared:  0.6456 
## F-statistic: 47.06 on 7 and 170 DF,  p-value: < 2.2e-16

Our Adjusted R-squared is 0.6456. Next, let’s remove Total_phenols, which has the next largest p-value greater than our significance level.

# Refit the current model without Total_phenols, then show the new fit.
wine_model <- update(
  wine_model,
  formula. = . ~ . - Total_phenols,
  data = wine_df
)
summary(wine_model)

## 
## Call:
## lm(formula = Alcohol ~ cultivars + Malic_acid + Ash + Color_intensity + 
##     Hue, data = wine_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.33440 -0.30934 -0.01978  0.32788  1.56680 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     13.14205    0.48231  27.248  < 2e-16 ***
## cultivars2      -1.21023    0.10879 -11.124  < 2e-16 ***
## cultivars3      -0.77362    0.13705  -5.645 6.75e-08 ***
## Malic_acid       0.06962    0.04197   1.659   0.0990 .  
## Ash             -0.25578    0.14324  -1.786   0.0759 .  
## Color_intensity  0.12371    0.02531   4.887 2.34e-06 ***
## Hue              0.38310    0.25164   1.522   0.1298    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4823 on 171 degrees of freedom
## Multiple R-squared:  0.659,  Adjusted R-squared:  0.6471 
## F-statistic: 55.08 on 6 and 171 DF,  p-value: < 2.2e-16

Our Adjusted R-squared is 0.6471. Next, let’s remove Hue, which has the next largest p-value greater than our significance level.

# Refit the current model without Hue, then show the new fit.
wine_model <- update(
  wine_model,
  formula. = . ~ . - Hue,
  data = wine_df
)
summary(wine_model)

## 
## Call:
## lm(formula = Alcohol ~ cultivars + Malic_acid + Ash + Color_intensity, 
##     data = wine_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10320 -0.31346 -0.00365  0.32578  1.66758 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     13.59045    0.38340  35.447  < 2e-16 ***
## cultivars2      -1.23245    0.10822 -11.388  < 2e-16 ***
## cultivars3      -0.87260    0.12111  -7.205 1.74e-11 ***
## Malic_acid       0.04817    0.03968   1.214    0.226    
## Ash             -0.23398    0.14306  -1.635    0.104    
## Color_intensity  0.11432    0.02464   4.639 6.91e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4841 on 172 degrees of freedom
## Multiple R-squared:  0.6544, Adjusted R-squared:  0.6444 
## F-statistic: 65.14 on 5 and 172 DF,  p-value: < 2.2e-16

Our Adjusted R-squared is 0.6444. Next, let’s remove Malic_acid, which has the next largest p-value greater than our significance level.

# Refit the current model without Malic_acid, then show the new fit.
wine_model <- update(
  wine_model,
  formula. = . ~ . - Malic_acid,
  data = wine_df
)
summary(wine_model)

## 
## Call:
## lm(formula = Alcohol ~ cultivars + Ash + Color_intensity, data = wine_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.12938 -0.30689 -0.02893  0.31972  1.64186 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     13.67754    0.37714  36.266  < 2e-16 ***
## cultivars2      -1.24623    0.10777 -11.564  < 2e-16 ***
## cultivars3      -0.79825    0.10462  -7.630 1.51e-12 ***
## Ash             -0.21756    0.14262  -1.525    0.129    
## Color_intensity  0.10880    0.02425   4.486 1.32e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4848 on 173 degrees of freedom
## Multiple R-squared:  0.6514, Adjusted R-squared:  0.6434 
## F-statistic: 80.83 on 4 and 173 DF,  p-value: < 2.2e-16

Our Adjusted R-squared is 0.6434. Next, let’s remove Ash, which has the next largest p-value greater than our significance level.

# Refit the current model without Ash, then show the new fit.
wine_model <- update(
  wine_model,
  formula. = . ~ . - Ash,
  data = wine_df
)
summary(wine_model)

## 
## Call:
## lm(formula = Alcohol ~ cultivars + Color_intensity, data = wine_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.12074 -0.32721 -0.04133  0.34799  1.54962 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     13.14845    0.14871  88.417  < 2e-16 ***
## cultivars2      -1.20265    0.10431 -11.530  < 2e-16 ***
## cultivars3      -0.79248    0.10495  -7.551 2.33e-12 ***
## Color_intensity  0.10786    0.02434   4.432 1.65e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4866 on 174 degrees of freedom
## Multiple R-squared:  0.6468, Adjusted R-squared:  0.6407 
## F-statistic: 106.2 on 3 and 174 DF,  p-value: < 2.2e-16

Interpretation

Our remaining 3 coefficients (cultivars2, cultivars3 and Color_intensity) now have p-values less than our significance threshold.
The stepAIC function agrees with this model.

# Run AIC-based stepwise selection starting from the reduced model.
MASS::stepAIC(wine_model)

## Start:  AIC=-252.45
## Alcohol ~ cultivars + Color_intensity
## 
##                   Df Sum of Sq    RSS     AIC
## <none>                         41.207 -252.44
## - Color_intensity  1     4.652 45.859 -235.41
## - cultivars        2    40.624 81.831 -134.33

## 
## Call:
## lm(formula = Alcohol ~ cultivars + Color_intensity, data = wine_df)
## 
## Coefficients:
##     (Intercept)       cultivars2       cultivars3  Color_intensity  
##         13.1484          -1.2026          -0.7925           0.1079

The coefficient for cultivars2 is -1.2026, which means that on average we would expect the alcohol content to be 1.2026 less for cultivar #2 than for cultivar #1 (the reference level), holding all other variables constant.
The coefficient for cultivars3 is -0.7925, which means that on average we would expect the alcohol content to be 0.7925 less for cultivar #3 than for cultivar #1 (the reference level), holding all other variables constant.
The coefficient for Color_intensity is 0.1079, which means that on average we would expect the alcohol content to be 0.1079 greater for every one-unit increase in color intensity, holding all other variables constant.
The y-intercept’s p-value is near zero, which means that there is very little chance that it is not relevant to the model.
In this model, multiple \(R^2\) is \(0.6468\), which means that the model accounts for approximately \(65\%\) of the variation in the alcohol content.

Model Diagnostics

Let’s assess if this model is reliable.

Linearity: Do the variables have a linear relationship?

The plots of the variables appear to show linear relationships.

# Component + residual plots for the terms of the final model (car).
car::crPlots(wine_model)

Nearly normal residuals: Are the model’s residuals distributed normally?

Yes, per the histogram and Q-Q plot, the residuals are fairly normally distributed.

# Studentized residuals of the final model (studres comes from MASS).
sresid <- studres(wine_model)
# Density-scaled histogram (freq = FALSE) so the normal curve drawn below is on
# the same vertical scale.
hist(sresid, freq = FALSE, main = "Distribution of Studentized Residuals")
# 40 evenly spaced points across the residual range. `length.out` is spelled
# out rather than relying on partial matching of `length`.
xfit <- seq(min(sresid), max(sresid), length.out = 40)
# Standard normal density evaluated on that grid, overlaid on the histogram.
yfit <- dnorm(xfit)
lines(xfit, yfit)

# Q-Q plot of the model residuals.
plot(wine_model, which = 2)

Homoscedasticity: Is there constant variability among the residuals?

Based on the scatter plot of the residuals shown below, the residuals appear evenly distributed.

The Non-constant Variance Score Test has a p-value of >.05, which means that we fail to reject the null hypothesis of homoscedasticity.

# Residual scatter plot for the final model.
plot(x = wine_model, which = 1)

# Score test for non-constant error variance (car).
car::ncvTest(wine_model)

## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 1.673241    Df = 1     p = 0.195825

Independent observations: Are the data from a random sample and not from a time series?

The Durbin Watson test’s p-value is >.05. Therefore, we fail to reject the null hypothesis of independence (no autocorrelation).

# Durbin-Watson test for autocorrelation in the model residuals (car).
car::durbinWatsonTest(wine_model)

##  lag Autocorrelation D-W Statistic p-value
##    1      0.06118636      1.857365   0.224
##  Alternative hypothesis: rho != 0

Conclusion

Based upon my diagnostics and the gvlma function, the conditions for linear regression have been met.

# Global validation of the linear model assumptions for the final model.
# The function is left unqualified so the printed call reads `gvlma(x = wine_model)`.
gvlma(x = wine_model)

## 
## Call:
## lm(formula = Alcohol ~ cultivars + Color_intensity, data = wine_df)
## 
## Coefficients:
##     (Intercept)       cultivars2       cultivars3  Color_intensity  
##         13.1484          -1.2026          -0.7925           0.1079  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = wine_model) 
## 
##                        Value p-value                Decision
## Global Stat        3.3040112  0.5083 Assumptions acceptable.
## Skewness           2.7000824  0.1003 Assumptions acceptable.
## Kurtosis           0.1231648  0.7256 Assumptions acceptable.
## Link Function      0.0002385  0.9877 Assumptions acceptable.
## Heteroscedasticity 0.4805255  0.4882 Assumptions acceptable.

DATA 605 FUNDAMENTALS OF COMPUTATIONAL MATHEMATICS

Discussion 12: Multiple Linear Regression

Kyle Gilde

11/15/2017