Simple Linear Regression

Pre-processing

df$conc_fed <- factor(df$conc_fed)
df$protein_whey <- df$protein - df$casein
df$log_scc <- log(df$scc)

Exploratory Data Analysis

as.array(summary(df$scc)) %>%
  kable(format="html", booktabs = T, linesep = "", digits = 2, caption = "Summary Somatic Cell Count") %>%
  kable_styling(c("bordered", "condensed"), full_width = F, font_size = 7, latex_options = c("HOLD_position"))

Summary Somatic Cell Count
Var1	Freq
Min.	8.00
1st Qu.	36.00
Median	74.00
Mean	145.89
3rd Qu.	144.00
Max.	3447.00

plot1 <- ggplot(df, aes(x=scc)) + 
  geom_histogram(binwidth = 40) + 
  # ggtitle("Histogram scc") + 
  labs(x = "Somatic Cell Count (cells/L)", y = "Frequency") +
  theme(plot.title = element_text(size=10),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8))

plot2 <- ggplot(df, aes(y=scc)) + 
  geom_boxplot(lwd=0.1, outlier.size = 0.1) + 
  coord_flip() + 
  # ggtitle("Boxplot scc") + 
  labs(y = "Somatic Cell Count (cells/L)") +
  theme(plot.title = element_text(size=10),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8))

g1grob <- ggplotGrob(plot1)
g2grob <- ggplotGrob(plot2)
grid.arrange(g1grob, g2grob, ncol=2)

$\label{fig:figs} Somatic Cell Count Distribution$

Somatic Cell Count Distribution

2. Log of Somatic Cell Count

as.array(summary(df$log_scc)) %>%
  kable(format="html", booktabs = T, linesep = "", digits = 2, caption = "Summary log Somatic Cell Count") %>%
  kable_styling(c("bordered", "condensed"), full_width = F, font_size = 7, latex_options = c("hold_position"))

Summary log Somatic Cell Count
Var1	Freq
Min.	2.08
1st Qu.	3.58
Median	4.30
Mean	4.36
3rd Qu.	4.97
Max.	8.15

plot1 <- ggplot(df, aes(x=log_scc)) + 
  geom_histogram() + 
  labs(x = "log Somatic Cell Count (cells/L)", y = "Frequency") +
  theme(plot.title = element_text(size=10),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8))

plot2 <- ggplot(df, aes(y=log_scc)) + 
  geom_boxplot(lwd=0.1, outlier.size = 0.1) + 
  coord_flip() + 
  labs(y = "log Somatic Cell Count (cells/L)") +
  theme(plot.title = element_text(size=10),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8))

g1grob <- ggplotGrob(plot1)
g2grob <- ggplotGrob(plot2)
grid.arrange(g1grob, g2grob, ncol=2)

$\label{fig:figs}log Somatic Cell Count Distribution$

log Somatic Cell Count Distribution

3. Whey Protein (%g/L)

as.array(summary(df$protein_whey)) %>%
  kable(format="html", booktabs = T, linesep = "", digits = 2, caption = "Summary protein") %>%
  kable_styling(c("bordered", "condensed"), full_width = F, font_size = 7, latex_options = c("hold_position"))

Summary protein
Var1	Freq
Min.	0.78
1st Qu.	0.96
Median	1.01
Mean	1.02
3rd Qu.	1.07
Max.	1.31

plot1 <- ggplot(df, aes(x=protein_whey)) + 
  geom_histogram() + 
  labs(x = "Whey Protein (%g/L)") +
  theme(plot.title = element_text(size=10),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8))

plot2 <- ggplot(df, aes(y=protein_whey)) + 
  geom_boxplot(lwd=0.1, outlier.size = 0.1) + 
  coord_flip() + 
  labs(y = "Whey Protein (%g/L)") +
  theme(plot.title = element_text(size=10),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8)
        )

g1grob <- ggplotGrob(plot1)
g2grob <- ggplotGrob(plot2)
grid.arrange(g1grob, g2grob, ncol=2)

$\label{fig:figs}Whey Protein Distribution$

Whey Protein Distribution

4. Casein (%g/L)

as.array(summary(df$casein)) %>%
  kable(format="html", booktabs = T, linesep = "", digits = 2, caption = "Summary casein") %>%
  kable_styling(c("bordered", "condensed"), full_width = F, font_size = 7, latex_options = c("hold_position"))

Summary casein
Var1	Freq
Min.	1.40
1st Qu.	3.15
Median	3.37
Mean	3.37
3rd Qu.	3.63
Max.	4.79

plot1 <- ggplot(df, aes(x=casein)) + 
  geom_histogram() + 
  labs(x = "Casein (%g/L)") +
  theme(plot.title = element_text(size=10),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8))


plot2 <- ggplot(df, aes(y=casein)) + 
  geom_boxplot(lwd=0.1, outlier.size = 0.1) + 
  coord_flip() +
  labs(y = "Casein (%g/L)") +
  theme(plot.title = element_text(size=10),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8)
        )

g1grob <- ggplotGrob(plot1)
g2grob <- ggplotGrob(plot2)
grid.arrange(g1grob, g2grob, ncol=2)

$\label{fig:figs}Casein Distribution$

Casein Distribution

5. Concentrate Feed Frequency and Proportions

df$conc_fed <- factor(df$conc_fed)

tab = table(df$conc_fed)
ptab = prop.table(table(df$conc_fed))

cbind(tab, ptab) %>%
  kable(format="html", booktabs = T, linesep = "", digits = 2, caption = "Summary Concentrate Feed",  col.names = c("Nbr. Cows", "%")) %>%
  kable_styling(c("bordered", "condensed"), full_width = F, font_size = 7,
                latex_options = c("hold_position"))

Summary Concentrate Feed
	Nbr. Cows	%
0	249	0.47
1.5	158	0.30
2.5	53	0.10
3.5	72	0.14

ggplot(df, aes(x=as.factor(conc_fed))) + 
  geom_bar() + 
  labs(x = "Concentrate Feed (%)", y = "Nbr. Cows") +
  theme(plot.title = element_text(size=10),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8))

$\label{fig:figs}Concentrate Feed Frequency and Proportions$

Concentrate Feed Frequency and Proportions

6. Log of Somatic Cell Count vs. Percent Concentrate Feed

list <- describeBy(df$log_scc, df$conc_fed, digits = 2)
tmp <- data.frame(t(sapply(list,c)))
tmp <- format.data.frame(tmp, digits = 2)

tmp[ , -which(names(tmp) %in% c("item","vars"))] %>%
  kable(format="html", booktabs = T, linesep = "", digits = 2, 
        caption = "Summary log(scc) by Concentrate Feed Perc.") %>%
  kable_styling(c("bordered", "condensed"), full_width = F, font_size = 7, 
                latex_options = c("scale_down", "HOLD_position"))

Summary log(scc) by Concentrate Feed Perc.
	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
0	249	4.4	1.2	4.4	4.4	1.2	2.1	8.1	6.1	0.46	0.099	0.073
1.5	158	4.4	0.89	4.4	4.3	0.99	2.8	6.4	3.6	0.2	-0.72	0.07
2.5	53	4.1	1	4	4.1	0.79	2.6	7.3	4.7	0.82	0.5	0.14
3.5	72	4.2	0.83	4.2	4.2	0.86	2.8	6.5	3.8	0.28	-0.36	0.098

ggplot(df, aes(x=conc_fed, y=log_scc)) +
  geom_boxplot(lwd=0.1, outlier.size = 0.1) +
  labs(x = "Contentrate Feed Perc.", y = "log Somatic Cell Count") +
  theme(plot.title = element_text(size=10),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8))

$\label{fig:figs}log(scc) by Concentrate Feed Perc.$

log(scc) by Concentrate Feed Perc.

7. Correlation between log(scc) and predictor variables

corr <- cor(select(df, log_scc, casein, protein_whey))

corr %>%
  kable(format="html", booktabs = T, linesep = "", digits = 2,
        caption = "Correlation of log of scc with Predictors") %>%
  kable_styling(c("bordered", "condensed"), full_width = F, font_size = 7, latex_options = c("hold_position"))

Correlation of log of scc with Predictors
	log_scc	casein	protein_whey
log_scc	1.00	0.08	0.3
casein	0.08	1.00	0.6
protein_whey	0.30	0.60	1.0

plot2 <- ggplot(df, aes(x = casein, y = log_scc)) + 
  geom_point(size=0.5) + 
  geom_smooth(method=lm, se=FALSE,size=0.4) +
  labs(y = "log Somatic Cell Count", x = "Casein") +
  theme(plot.title = element_text(size=10),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8))

plot3 <- ggplot(df, aes(x = protein_whey, y = log_scc)) + 
  geom_point(size=0.5) + 
  geom_smooth(method=lm, se=FALSE, size=0.4) + 
    labs(y = "log Somatic Cell Count", x = "Whey Protein") +
  theme(plot.title = element_text(size=10),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8))

g2grob <- ggplotGrob(plot2)
g3grob <- ggplotGrob(plot3)
grid.arrange(g2grob, g3grob, ncol=2)

$\label{fig:figs}Whey Protein and Casein correlation with scc$

Whey Protein and Casein correlation with scc

Regression Model

1. Fit linear model with log(scc)

linreg <- lm(log_scc ~ protein_whey, data=df)

The summary of the fitted model is shown in Table .

Linear Model Summary
term	estimate	std.error	statistic	p.value
(Intercept)	0.73	0.51	1.45	0.15
protein_whey	3.56	0.50	7.17	0.00

4. Variance of $\beta_0$ and $\beta_1$

N   = length(df$protein_whey)
MSE = sum(linreg$residuals^2 / (N-2))
SXX = sum((df$protein_whey - mean(df$protein_whey))^2)
VARB0 = ( sum(df$protein_whey^2) * MSE ) / ( N * SXX )
VARB1 = MSE / SXX

$\widehat{\text{Var}(\beta_0)} = \frac{\sum{x_i^2 \ MSE}}{n \ SXX} = 0.2572937$

and

$\widehat{\text{Var}(\beta_1)} = \frac{MSE}{SXX} = 0.2471896$

5. Confidence Interval for $\beta_0$

alpha=0.05
beta0 = linreg$coefficients[1]
CILOW <- beta0 - qt(1-alpha/2,N-2)*sqrt(VARB0)
CIUPP <- beta0 + qt(1-alpha/2,N-2)*sqrt(VARB0)

\[\hat{\beta}_0 \pm t_{\frac{\alpha}{2}, N-2}\sqrt{\text{Var}(\hat{\beta}_0)} = [-0.26, 1.73]\]

We are 95% confident that $\beta_0$ lies between -0.26 < $\beta_0$ < 1.73

6. Confidence Interval for $\beta_1$

alpha=0.05
beta1= linreg$coefficients[2]
CILOW <- beta1 - qt(1-alpha/2,N-2)*sqrt( VARB1)
CIUPP <- beta1 + qt(1-alpha/2,N-2)*sqrt( VARB1)

\[\hat{\beta}_1 \pm t_{\frac{\alpha}{2}, N-2}\sqrt{\text{Var}(\hat{\beta}_1)} = [2.59, 4.54]\]

We are 95% confident that $\beta_1$ lies between 2.59 < $\beta_0$ < 4.54.

7. Test the hypothesis $H_0: \beta_0=0 \text{ against } H_A:\beta_0\neq0$

T = (linreg$coefficients[1]-0)/sqrt(VARB0) 
TDIST = qt( 1-0.05/2, N-2 )
PVALUE = 2 *( 1- pt(abs(T), df = N - 2))

$T = \frac{\beta_0 - 0}{\sqrt{\beta_0}} = 1.447$

To get the critical value for the T distribution with 2 degrees of freedom

We find the the critical value $t_{0.975, N-2} = 1.964$

Since the sample t-statistic, 1.447, is not greater than the critical value 1.964, we cannot reject the null hypothesis that the intercept is zero.

We find the corresponding p-value is equal to 0.1483575.

At the 5% level of significance, the probability that $\beta_0 = 0$ is 0.148. This is not smaller than 0.05 thus we cannot reject $H_0$.

Note how the confidence interval does not contain the value 1, which suggests that the model may be biased.

8. Test the hypothesis $H_0: \beta_1=0 \text{ against } H_A:\beta_1\neq0$

T = (linreg$coefficients[2]-0)/sqrt(VARB1) 
TDIST = qt( 1-0.05/2, N-2 )
PVALUE = 2*(1-pt(abs(T), df = N - 2))

$T = \frac{\beta_1 - 0}{\sqrt{\beta_1}} = 7.168$

To get the critical value for the T distribution with 2 degrees of freedom

We find the the critical value $t_{0.975, N-2} = 1.964$

$|T| \simeq 7.168 \geq 1.964$ so reject $H_0: \beta_1=0$. At the 5% level of significance, the evidence is not strong enough to indicate the $\beta_1=0$. Indicating that a relation exists between %g of whey protein per litre of milk and somatic cell count per litre of milk.

We find the corresponding p-value is equal to 2.564393110^{-12}.

At the 5% level of significance, the probability that $\beta_1 = 0$ is 2.564393110^{-12}. This is much smaller than 0.05 thus we reject $H_0$.

9. F-statistic

x <- summary(linreg)
F <- round(x$fstatistic[1], 2)
FDIST <- round(qf(1-alpha, 1, N-2), 2)
PVALUE <- pf(F, 1, N-2, lower.tail=FALSE)

The F-statistic is used to test the hypothesis $H_0: Y_i = \hat{\beta_0} + \epsilon_i$ against $H_0: Y_i = \hat{\beta_0} + \hat{\beta_1}X_i + \epsilon_i$.

F-statistic = 51.38

F-critical $F_{0.95, N-2} = 3.86$

As 51.38 > 3.86 reject $H_0$. At the 5% level of significance the evidence is not strong enough to indicate that $\beta_1=0$. Indicating that a relation exists between %g protein per litre milk and somatic cell count per litre milk.

The probability of the F-distribution taking the value 51.38 is 2.567816610^{-12}. As this p-value is smaller than 0.05 we reject $H_0$ that the null model without explanatory variable is better.

10. R-squared

R2 <- summary(linreg)$r.squared
R2

## [1] 0.08838046

The coefficient of determination $R^2$ represents the proportion of the variation in Y that is explained by X. In the present case, for 1 litre of milk, $R^2= 8.84$% of the variation in somatic cell count is explained by the percentage gram whey protein content.

11. Residual Standard Error

SSE <- sum(linreg$residuals^2)
RMSE <- sqrt(SSE/(N-2))

\[RMSE = \sqrt{\frac{SSE}{N-2}} = 0.98\]

The standard error is generally an easy to interpret measure. It is measured in the units of the response variable and represents a type of “average” residual. In the present case the standard error of the regression is on average, log(scc) = 0.98 which means that given the measured %g/L quantity of whey protein. predictions of scc are on average off by approx 1%.

12. Confidence Interval for $\hat{Y}$

SXX = sum((df$protein_whey - mean(df$protein_whey))^2)
MSE = sum(linreg$residuals^2 / (N-2))
VAR_Y = MSE*(1/N+(df$protein_whey-mean(df$protein_whey))^2/SXX)
Yhat <- fitted(linreg)
CI <- cbind(Yhat- qt(1-alpha/2,N-2)*sqrt(VAR_Y), Yhat + qt(1-alpha/2,N-2)*sqrt(VAR_Y))

\[\hat{Y}_i \pm t_{\frac{\alpha}{2}, N-2} \sqrt{Var(\hat{Y})}\]

\[Var(\hat{Y}) = MSE \ [\frac{1}{N} + \frac{(X - \bar{X})^2} {SXX} ]\]

The confidence interval of the predicted values shown in Figure appears to be quite narrow, suggesting that predictions should be relatively accurate.

ggplot(df, aes(x = protein_whey, y = log_scc)) + 
  geom_point(size=0.5) + 
  geom_smooth(method=lm, se=TRUE, size=0.4) +
  labs(y = "log Somatic Cell Count", x = "Whey Protein (%g/L)") +
  theme(plot.title = element_text(hjust = 0.5, size=10),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8))

$\label{fig:figs}Confidence Interval for estimated values of Y$

Confidence Interval for estimated values of Y

13. Linear Model Assumptions for Small Sample Inference

The key assumptions of the model concern the behaviour of the errors $\epsilon_{i}$[see @fox_applied_2016 p112]:

Linearity: $E(\epsilon_i) = 0$
Constant variance aka homoscedasticity : $\text{Var}(\epsilon_i) = \sigma^2$
Normality: $\epsilon_i \sim N( 0, \sigma^2 )$
Independence: $\epsilon_i$; $\epsilon_j$ are independent for $i \neq j$
The X values are fixed or, if random, are measured without error and are independent of the errors.

14. Residuals of the Regression Model

Zero Conditional Mean

One of the assumptions, or requirements, of the regression model is that the residuals have, on average a mean equal to zero. As can be seen in Table , this requirement is indeed satisfied. Further, Figure also shows that the residuals appear randomly scattered about the zero mean line.

Summary Residuals
Var1	Freq
Min.	-2.09
1st Qu.	-0.69
Median	-0.05
Mean	0.00
3rd Qu.	0.58
Max.	3.74

Constant variance aka Homoskedasticity

Although the least-squares estimator is unbiased and consistent even when the error variance is not constant, the efficiency of the least-squares estimator is impaired, and the usual formulas for coefficient standard errors are inaccurate. For this reason it is necessary to verify that the residuals follow a normal distribution. It is common to check this assumption by plotting the residuals against the predicted values $\hat{Y}$. For the assumption to be met the plot should not exhibit any pattern.

In the present case there is no strong visible pattern. Most of the data is captured above and below the residual values -2 and 2 respectively. However the variability appear to decrease as the fitted values get larger. Specifically, when the predicted log(scc) per litre of milk is greater than 5, the error decreases. This means that better predictions are obtained for higher somatic cell counts. This suggests that the variance may not, in this case, be constant.

ggplot(linreg, aes(x = .fitted, y = .resid)) + 
  geom_point(size=1) + 
  geom_hline(yintercept=0,linetype="dashed", color = "black", size=0.3) + 
  geom_hline(yintercept=-2,linetype="dashed", color = "blue", size=0.3) + 
  geom_hline(yintercept=2,linetype="dashed", color = "blue", size=0.3) + 
  labs(y = "Residuals", x = "Fitted") +
  theme(plot.title = element_text(hjust = 0.5, size=10),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8))

$\label{fig:figs}Residuals vs. Fitted$

Residuals vs. Fitted

Checking whether errors are normally distributed

To perform model inference (T-test, F-test, CI, PI) the errors must be normally distributed or at least approximately normally distributed. This requirement can be inspected by plotting the distribution and density of the residuals.

Figure shows that there are outliers in the residuals. Specifically one can see several outliers above the upper whisker of the boxplot and one can observe a long right tail in the density plot. This means that there are errors that are unusually large.

The density plot is relatively symmetric but with a noticeable right tail. However inspection of the boxplot shows that the median appears to split the box into almost equal halves suggesting symmetric distribution. Nevertheless the noticeable right tail of the density plot indicates that the residuals are not normally distributed.

plot1  <- ggplot(df, aes(y=residuals(linreg))) +
  geom_boxplot(lwd=0.1, outlier.size = 0.1) +
  labs(y = "Residuals", x = "") +
  theme(plot.title = element_text(size=10),
        axis.ticks.x = element_blank(),
        # axis.text.x = element_blank(),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8))

plot2 <- ggplot(df, aes(x=residuals(linreg))) +
  geom_density(size=0.5, color="darkgrey", fill="lightgrey") +
  labs(x = "Residuals", y = "Density") +
  theme(plot.title = element_text(size=10),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8))

g1grob <- ggplotGrob(plot1)
g2grob <- ggplotGrob(plot2)
grid.arrange(g1grob, g2grob, ncol=2)

$\label{fig:figs}Distribution and Density of Residuals$

Distribution and Density of Residuals

QQ plot

Another common way for verifying whether the residuals are normally distributed is to look at the quantiles relative to the quantiles of a normal distribution. If the residuals follow a normal distribution they will line up with the distribution of the sample quantiles of a normal distribution, represented by the diagonal line in the QQ plot. Note that because of sampling, the tails will differ in terms of alignment of the two distributions in the QQ plot. In other words it is expected to see that at the tails, the distribution of the residuals will deviate slightly from the distribution of the sample quantiles of a normal distribution.

Figure shows that although the quantiles of the residuals are largely aligned with the quantiles of a theoretical quantiles of a normal distribution, they do deviate more than just slightly at the tails indicating that the residuals are not normally distributed.

ggplot(df, aes(sample = residuals(linreg))) + 
  stat_qq(size=0.4) + 
  stat_qq_line(size=0.4) +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles") +
  theme(plot.title = element_text(size=10),
        axis.title.x = element_text(size=8),
        axis.title.y = element_text(size=8))

$\label{fig:figs}QQ Plot$

QQ Plot

Shapiro-Wilk Normality Test

Shapiro-Wilk Test
statistic	p.value	method
0.9825459	5.4e-06	Shapiro-Wilk normality test

The p-value 5.370401610^{-6} < 0.05 implying that the distribution of the data is significantly different from a normal distribution.

In summary, we cannot assume that the residuals are normally distributed. This means that the results of the F-test, T-test, confidence intervals and prediction intervals may not be valid because the residuals do not meet the requirement that residuals must be normally distributed.

Simple Linear Regression

Example