1 Introduction

The models so far all assume that a straight line describes the relationship. But there’s nothing special about straight lines, aside from their simplicity.

winequality <- read.csv("//Users/mikelapika/Downloads/winequality-red.csv", header = T, sep = ",")
head(winequality)

str(winequality)

## 'data.frame':    1599 obs. of  12 variables:
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...

Plot

m <- lm(winequality$citric.acid ~ winequality$fixed.acidity)
plot( citric.acid ~fixed.acidity, data = winequality)
abline(mean(winequality$citric.acid),0,col="red")

#The relationship is visibly curved
#Looking how the model fits the data is more than enough for concluding that this model is misspecified.

2 The Model

2.1 Standardize

The first thing to do is to STANDARDIZE the predictor variable. This means to first center the variable and then divide it by its standard deviation.

So to standardize:

 #winequality$citric.acid.s <- plot( citric.acid ~fixed.acidity, data = winequality)
 winequality$citric.acid.std <- (winequality$citric.acid - mean(winequality$citric.acid)) /sd(winequality$citric.acid)

The new variable winequality$citric.acid.std has mean zero and standard deviation 1. No information has been lost in this procedure. Let’s plot to see that the relationship between the variable is unaltered, but now with different range on the horizontal axis:

plot(winequality$citric.acid.std ~ winequality$fixed.acidity, data = winequality)

3 Fitting the model

3.1 Let’s fit the model:

# library(magrittr)
# library(tibble)
# library(purrr)

 winequality$citric.acid.std2  <- winequality$citric.acid.std^2
# wine-red <- map(alist(winequality$fixed.acidity ~ dnorm(mu, sigma),
#     mu <- a + b1*winequality$citric.acid.std + b2*winequality$citric.acid.std2,
#     a ~ dnorm(178, 100),b1 ~ dnorm(0, 10),b2 ~ dnorm(0, 10),sigma ~ dunif(0, 50)), data = winequality)
# # Let’s take a look at the table of summary:
# 
# precis(wine-red, corr = TRUE)

## degree = 1
plot(winequality$citric.acid.std ~ winequality$fixed.acidity, data = winequality)
m <- lm(winequality$citric.acid ~ winequality$fixed.acidity)
summary(m)

## 
## Call:
## lm(formula = winequality$citric.acid ~ winequality$fixed.acidity)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33302 -0.11017 -0.00938  0.09317  0.71341 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -0.354270   0.017629  -20.09   <2e-16 ***
## winequality$fixed.acidity  0.075153   0.002074   36.23   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1444 on 1597 degrees of freedom
## Multiple R-squared:  0.4512, Adjusted R-squared:  0.4508 
## F-statistic:  1313 on 1 and 1597 DF,  p-value: < 2.2e-16

abline(m,lwd=2,col="green")

library(car)
## Loading required package: carData
shapiro.test(m$residuals)

## 
##  Shapiro-Wilk normality test
## 
## data:  m$residuals
## W = 0.98341, p-value = 1.259e-12

As p-value is < 0.05 we can accept null hypothesis. Hence Residual data is not normally distributed.

## degree = 2
m1 <- lm(winequality$citric.acid ~ poly(winequality$fixed.acidity,2))
summary(m1)

## 
## Call:
## lm(formula = winequality$citric.acid ~ poly(winequality$fixed.acidity, 
##     2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33657 -0.10879 -0.00831  0.09073  0.73010 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)                          0.270976   0.003605  75.160   <2e-16
## poly(winequality$fixed.acidity, 2)1  5.230671   0.144167  36.282   <2e-16
## poly(winequality$fixed.acidity, 2)2 -0.329575   0.144167  -2.286   0.0224
##                                        
## (Intercept)                         ***
## poly(winequality$fixed.acidity, 2)1 ***
## poly(winequality$fixed.acidity, 2)2 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1442 on 1596 degrees of freedom
## Multiple R-squared:  0.453,  Adjusted R-squared:  0.4523 
## F-statistic: 660.8 on 2 and 1596 DF,  p-value: < 2.2e-16

pred <- predict(m1,data=winequality)
plot(winequality$fixed.acidity, winequality$citric.acid)
lines(sort(winequality$fixed.acidity), pred[order(winequality$fixed.acidity)], lwd = 2, col = "blue")

library(car)
## Loading required package: carData
shapiro.test(m1$residuals)

## 
##  Shapiro-Wilk normality test
## 
## data:  m1$residuals
## W = 0.98372, p-value = 1.777e-12

As p-value is < 0.05 we can accept null hypothesis. Hence Residual data is not normally distributed.

4 Fit the ANOVA

anova(m,m1)

Polynomial Regression

Wei Wang, Mike Lapika