df$conc_fed <- factor(df$conc_fed)
df$protein_whey <- df$protein - df$casein
df$log_scc <- log(df$scc)
as.array(summary(df$scc)) %>%
kable(format="html", booktabs = T, linesep = "", digits = 2, caption = "Summary Somatic Cell Count") %>%
kable_styling(c("bordered", "condensed"), full_width = F, font_size = 7, latex_options = c("HOLD_position"))
| Var1 | Freq |
|---|---|
| Min. | 8.00 |
| 1st Qu. | 36.00 |
| Median | 74.00 |
| Mean | 145.89 |
| 3rd Qu. | 144.00 |
| Max. | 3447.00 |
plot1 <- ggplot(df, aes(x=scc)) +
geom_histogram(binwidth = 40) +
# ggtitle("Histogram scc") +
labs(x = "Somatic Cell Count (cells/L)", y = "Frequency") +
theme(plot.title = element_text(size=10),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8))
plot2 <- ggplot(df, aes(y=scc)) +
geom_boxplot(lwd=0.1, outlier.size = 0.1) +
coord_flip() +
# ggtitle("Boxplot scc") +
labs(y = "Somatic Cell Count (cells/L)") +
theme(plot.title = element_text(size=10),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8))
g1grob <- ggplotGrob(plot1)
g2grob <- ggplotGrob(plot2)
grid.arrange(g1grob, g2grob, ncol=2)
Somatic Cell Count Distribution
as.array(summary(df$log_scc)) %>%
kable(format="html", booktabs = T, linesep = "", digits = 2, caption = "Summary log Somatic Cell Count") %>%
kable_styling(c("bordered", "condensed"), full_width = F, font_size = 7, latex_options = c("hold_position"))
| Var1 | Freq |
|---|---|
| Min. | 2.08 |
| 1st Qu. | 3.58 |
| Median | 4.30 |
| Mean | 4.36 |
| 3rd Qu. | 4.97 |
| Max. | 8.15 |
plot1 <- ggplot(df, aes(x=log_scc)) +
geom_histogram() +
labs(x = "log Somatic Cell Count (cells/L)", y = "Frequency") +
theme(plot.title = element_text(size=10),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8))
plot2 <- ggplot(df, aes(y=log_scc)) +
geom_boxplot(lwd=0.1, outlier.size = 0.1) +
coord_flip() +
labs(y = "log Somatic Cell Count (cells/L)") +
theme(plot.title = element_text(size=10),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8))
g1grob <- ggplotGrob(plot1)
g2grob <- ggplotGrob(plot2)
grid.arrange(g1grob, g2grob, ncol=2)
log Somatic Cell Count Distribution
as.array(summary(df$protein_whey)) %>%
kable(format="html", booktabs = T, linesep = "", digits = 2, caption = "Summary protein") %>%
kable_styling(c("bordered", "condensed"), full_width = F, font_size = 7, latex_options = c("hold_position"))
| Var1 | Freq |
|---|---|
| Min. | 0.78 |
| 1st Qu. | 0.96 |
| Median | 1.01 |
| Mean | 1.02 |
| 3rd Qu. | 1.07 |
| Max. | 1.31 |
plot1 <- ggplot(df, aes(x=protein_whey)) +
geom_histogram() +
labs(x = "Whey Protein (%g/L)") +
theme(plot.title = element_text(size=10),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8))
plot2 <- ggplot(df, aes(y=protein_whey)) +
geom_boxplot(lwd=0.1, outlier.size = 0.1) +
coord_flip() +
labs(y = "Whey Protein (%g/L)") +
theme(plot.title = element_text(size=10),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8)
)
g1grob <- ggplotGrob(plot1)
g2grob <- ggplotGrob(plot2)
grid.arrange(g1grob, g2grob, ncol=2)
Whey Protein Distribution
as.array(summary(df$casein)) %>%
kable(format="html", booktabs = T, linesep = "", digits = 2, caption = "Summary casein") %>%
kable_styling(c("bordered", "condensed"), full_width = F, font_size = 7, latex_options = c("hold_position"))
| Var1 | Freq |
|---|---|
| Min. | 1.40 |
| 1st Qu. | 3.15 |
| Median | 3.37 |
| Mean | 3.37 |
| 3rd Qu. | 3.63 |
| Max. | 4.79 |
plot1 <- ggplot(df, aes(x=casein)) +
geom_histogram() +
labs(x = "Casein (%g/L)") +
theme(plot.title = element_text(size=10),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8))
plot2 <- ggplot(df, aes(y=casein)) +
geom_boxplot(lwd=0.1, outlier.size = 0.1) +
coord_flip() +
labs(y = "Casein (%g/L)") +
theme(plot.title = element_text(size=10),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8)
)
g1grob <- ggplotGrob(plot1)
g2grob <- ggplotGrob(plot2)
grid.arrange(g1grob, g2grob, ncol=2)
Casein Distribution
df$conc_fed <- factor(df$conc_fed)
tab = table(df$conc_fed)
ptab = prop.table(table(df$conc_fed))
cbind(tab, ptab) %>%
kable(format="html", booktabs = T, linesep = "", digits = 2, caption = "Summary Concentrate Feed", col.names = c("Nbr. Cows", "%")) %>%
kable_styling(c("bordered", "condensed"), full_width = F, font_size = 7,
latex_options = c("hold_position"))
| Nbr. Cows | % | |
|---|---|---|
| 0 | 249 | 0.47 |
| 1.5 | 158 | 0.30 |
| 2.5 | 53 | 0.10 |
| 3.5 | 72 | 0.14 |
ggplot(df, aes(x=as.factor(conc_fed))) +
geom_bar() +
labs(x = "Concentrate Feed (%)", y = "Nbr. Cows") +
theme(plot.title = element_text(size=10),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8))
Concentrate Feed Frequency and Proportions
list <- describeBy(df$log_scc, df$conc_fed, digits = 2)
tmp <- data.frame(t(sapply(list,c)))
tmp <- format.data.frame(tmp, digits = 2)
tmp[ , -which(names(tmp) %in% c("item","vars"))] %>%
kable(format="html", booktabs = T, linesep = "", digits = 2,
caption = "Summary log(scc) by Concentrate Feed Perc.") %>%
kable_styling(c("bordered", "condensed"), full_width = F, font_size = 7,
latex_options = c("scale_down", "HOLD_position"))
| n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 249 | 4.4 | 1.2 | 4.4 | 4.4 | 1.2 | 2.1 | 8.1 | 6.1 | 0.46 | 0.099 | 0.073 |
| 1.5 | 158 | 4.4 | 0.89 | 4.4 | 4.3 | 0.99 | 2.8 | 6.4 | 3.6 | 0.2 | -0.72 | 0.07 |
| 2.5 | 53 | 4.1 | 1 | 4 | 4.1 | 0.79 | 2.6 | 7.3 | 4.7 | 0.82 | 0.5 | 0.14 |
| 3.5 | 72 | 4.2 | 0.83 | 4.2 | 4.2 | 0.86 | 2.8 | 6.5 | 3.8 | 0.28 | -0.36 | 0.098 |
ggplot(df, aes(x=conc_fed, y=log_scc)) +
geom_boxplot(lwd=0.1, outlier.size = 0.1) +
labs(x = "Contentrate Feed Perc.", y = "log Somatic Cell Count") +
theme(plot.title = element_text(size=10),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8))
log(scc) by Concentrate Feed Perc.
corr <- cor(select(df, log_scc, casein, protein_whey))
corr %>%
kable(format="html", booktabs = T, linesep = "", digits = 2,
caption = "Correlation of log of scc with Predictors") %>%
kable_styling(c("bordered", "condensed"), full_width = F, font_size = 7, latex_options = c("hold_position"))
| log_scc | casein | protein_whey | |
|---|---|---|---|
| log_scc | 1.00 | 0.08 | 0.3 |
| casein | 0.08 | 1.00 | 0.6 |
| protein_whey | 0.30 | 0.60 | 1.0 |
plot2 <- ggplot(df, aes(x = casein, y = log_scc)) +
geom_point(size=0.5) +
geom_smooth(method=lm, se=FALSE,size=0.4) +
labs(y = "log Somatic Cell Count", x = "Casein") +
theme(plot.title = element_text(size=10),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8))
plot3 <- ggplot(df, aes(x = protein_whey, y = log_scc)) +
geom_point(size=0.5) +
geom_smooth(method=lm, se=FALSE, size=0.4) +
labs(y = "log Somatic Cell Count", x = "Whey Protein") +
theme(plot.title = element_text(size=10),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8))
g2grob <- ggplotGrob(plot2)
g3grob <- ggplotGrob(plot3)
grid.arrange(g2grob, g3grob, ncol=2)
Whey Protein and Casein correlation with scc
linreg <- lm(log_scc ~ protein_whey, data=df)
The summary of the fitted model is shown in Table .
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.73 | 0.51 | 1.45 | 0.15 |
| protein_whey | 3.56 | 0.50 | 7.17 | 0.00 |
N = length(df$protein_whey)
MSE = sum(linreg$residuals^2 / (N-2))
SXX = sum((df$protein_whey - mean(df$protein_whey))^2)
VARB0 = ( sum(df$protein_whey^2) * MSE ) / ( N * SXX )
VARB1 = MSE / SXX
\(\widehat{\text{Var}(\beta_0)} = \frac{\sum{x_i^2 \ MSE}}{n \ SXX} = 0.2572937\)
and
\(\widehat{\text{Var}(\beta_1)} = \frac{MSE}{SXX} = 0.2471896\)
alpha=0.05
beta0 = linreg$coefficients[1]
CILOW <- beta0 - qt(1-alpha/2,N-2)*sqrt(VARB0)
CIUPP <- beta0 + qt(1-alpha/2,N-2)*sqrt(VARB0)
\[\hat{\beta}_0 \pm t_{\frac{\alpha}{2}, N-2}\sqrt{\text{Var}(\hat{\beta}_0)} = [-0.26, 1.73]\]
We are 95% confident that \(\beta_0\) lies between -0.26 < \(\beta_0\) < 1.73
alpha=0.05
beta1= linreg$coefficients[2]
CILOW <- beta1 - qt(1-alpha/2,N-2)*sqrt( VARB1)
CIUPP <- beta1 + qt(1-alpha/2,N-2)*sqrt( VARB1)
\[\hat{\beta}_1 \pm t_{\frac{\alpha}{2}, N-2}\sqrt{\text{Var}(\hat{\beta}_1)} = [2.59, 4.54]\]
We are 95% confident that \(\beta_1\) lies between 2.59 < \(\beta_0\) < 4.54.
T = (linreg$coefficients[1]-0)/sqrt(VARB0)
TDIST = qt( 1-0.05/2, N-2 )
PVALUE = 2 *( 1- pt(abs(T), df = N - 2))
\(T = \frac{\beta_0 - 0}{\sqrt{\beta_0}} = 1.447\)
To get the critical value for the T distribution with 2 degrees of freedom
We find the the critical value \(t_{0.975, N-2} = 1.964\)
Since the sample t-statistic, 1.447, is not greater than the critical value 1.964, we cannot reject the null hypothesis that the intercept is zero.
We find the corresponding p-value is equal to 0.1483575.
At the 5% level of significance, the probability that \(\beta_0 = 0\) is 0.148. This is not smaller than 0.05 thus we cannot reject \(H_0\).
Note how the confidence interval does not contain the value 1, which suggests that the model may be biased.
T = (linreg$coefficients[2]-0)/sqrt(VARB1)
TDIST = qt( 1-0.05/2, N-2 )
PVALUE = 2*(1-pt(abs(T), df = N - 2))
\(T = \frac{\beta_1 - 0}{\sqrt{\beta_1}} = 7.168\)
To get the critical value for the T distribution with 2 degrees of freedom
We find the the critical value \(t_{0.975, N-2} = 1.964\)
\(|T| \simeq 7.168 \geq 1.964\) so reject \(H_0: \beta_1=0\). At the 5% level of significance, the evidence is not strong enough to indicate the \(\beta_1=0\). Indicating that a relation exists between %g of whey protein per litre of milk and somatic cell count per litre of milk.
We find the corresponding p-value is equal to 2.564393110^{-12}.
At the 5% level of significance, the probability that \(\beta_1 = 0\) is 2.564393110^{-12}. This is much smaller than 0.05 thus we reject \(H_0\).
x <- summary(linreg)
F <- round(x$fstatistic[1], 2)
FDIST <- round(qf(1-alpha, 1, N-2), 2)
PVALUE <- pf(F, 1, N-2, lower.tail=FALSE)
The F-statistic is used to test the hypothesis \(H_0: Y_i = \hat{\beta_0} + \epsilon_i\) against \(H_0: Y_i = \hat{\beta_0} + \hat{\beta_1}X_i + \epsilon_i\).
F-statistic = 51.38
F-critical \(F_{0.95, N-2} = 3.86\)
As 51.38 > 3.86 reject \(H_0\). At the 5% level of significance the evidence is not strong enough to indicate that \(\beta_1=0\). Indicating that a relation exists between %g protein per litre milk and somatic cell count per litre milk.
The probability of the F-distribution taking the value 51.38 is 2.567816610^{-12}. As this p-value is smaller than 0.05 we reject \(H_0\) that the null model without explanatory variable is better.
R2 <- summary(linreg)$r.squared
R2
## [1] 0.08838046
The coefficient of determination \(R^2\) represents the proportion of the variation in Y that is explained by X. In the present case, for 1 litre of milk, \(R^2= 8.84\)% of the variation in somatic cell count is explained by the percentage gram whey protein content.
SSE <- sum(linreg$residuals^2)
RMSE <- sqrt(SSE/(N-2))
\[RMSE = \sqrt{\frac{SSE}{N-2}} = 0.98\]
The standard error is generally an easy to interpret measure. It is measured in the units of the response variable and represents a type of “average” residual. In the present case the standard error of the regression is on average, log(scc) = 0.98 which means that given the measured %g/L quantity of whey protein. predictions of scc are on average off by approx 1%.
SXX = sum((df$protein_whey - mean(df$protein_whey))^2)
MSE = sum(linreg$residuals^2 / (N-2))
VAR_Y = MSE*(1/N+(df$protein_whey-mean(df$protein_whey))^2/SXX)
Yhat <- fitted(linreg)
CI <- cbind(Yhat- qt(1-alpha/2,N-2)*sqrt(VAR_Y), Yhat + qt(1-alpha/2,N-2)*sqrt(VAR_Y))
\[\hat{Y}_i \pm t_{\frac{\alpha}{2}, N-2} \sqrt{Var(\hat{Y})}\]
\[Var(\hat{Y}) = MSE \ [\frac{1}{N} + \frac{(X - \bar{X})^2} {SXX} ]\]
The confidence interval of the predicted values shown in Figure appears to be quite narrow, suggesting that predictions should be relatively accurate.
ggplot(df, aes(x = protein_whey, y = log_scc)) +
geom_point(size=0.5) +
geom_smooth(method=lm, se=TRUE, size=0.4) +
labs(y = "log Somatic Cell Count", x = "Whey Protein (%g/L)") +
theme(plot.title = element_text(hjust = 0.5, size=10),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8))
Confidence Interval for estimated values of Y
The key assumptions of the model concern the behaviour of the errors \(\epsilon_{i}\)[see @fox_applied_2016 p112]:
Zero Conditional Mean
One of the assumptions, or requirements, of the regression model is that the residuals have, on average a mean equal to zero. As can be seen in Table , this requirement is indeed satisfied. Further, Figure also shows that the residuals appear randomly scattered about the zero mean line.
| Var1 | Freq |
|---|---|
| Min. | -2.09 |
| 1st Qu. | -0.69 |
| Median | -0.05 |
| Mean | 0.00 |
| 3rd Qu. | 0.58 |
| Max. | 3.74 |
Constant variance aka Homoskedasticity
Although the least-squares estimator is unbiased and consistent even when the error variance is not constant, the efficiency of the least-squares estimator is impaired, and the usual formulas for coefficient standard errors are inaccurate. For this reason it is necessary to verify that the residuals follow a normal distribution. It is common to check this assumption by plotting the residuals against the predicted values \(\hat{Y}\). For the assumption to be met the plot should not exhibit any pattern.
In the present case there is no strong visible pattern. Most of the data is captured above and below the residual values -2 and 2 respectively. However the variability appear to decrease as the fitted values get larger. Specifically, when the predicted log(scc) per litre of milk is greater than 5, the error decreases. This means that better predictions are obtained for higher somatic cell counts. This suggests that the variance may not, in this case, be constant.
ggplot(linreg, aes(x = .fitted, y = .resid)) +
geom_point(size=1) +
geom_hline(yintercept=0,linetype="dashed", color = "black", size=0.3) +
geom_hline(yintercept=-2,linetype="dashed", color = "blue", size=0.3) +
geom_hline(yintercept=2,linetype="dashed", color = "blue", size=0.3) +
labs(y = "Residuals", x = "Fitted") +
theme(plot.title = element_text(hjust = 0.5, size=10),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8))
Residuals vs. Fitted
Checking whether errors are normally distributed
To perform model inference (T-test, F-test, CI, PI) the errors must be normally distributed or at least approximately normally distributed. This requirement can be inspected by plotting the distribution and density of the residuals.
Figure shows that there are outliers in the residuals. Specifically one can see several outliers above the upper whisker of the boxplot and one can observe a long right tail in the density plot. This means that there are errors that are unusually large.
The density plot is relatively symmetric but with a noticeable right tail. However inspection of the boxplot shows that the median appears to split the box into almost equal halves suggesting symmetric distribution. Nevertheless the noticeable right tail of the density plot indicates that the residuals are not normally distributed.
plot1 <- ggplot(df, aes(y=residuals(linreg))) +
geom_boxplot(lwd=0.1, outlier.size = 0.1) +
labs(y = "Residuals", x = "") +
theme(plot.title = element_text(size=10),
axis.ticks.x = element_blank(),
# axis.text.x = element_blank(),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8))
plot2 <- ggplot(df, aes(x=residuals(linreg))) +
geom_density(size=0.5, color="darkgrey", fill="lightgrey") +
labs(x = "Residuals", y = "Density") +
theme(plot.title = element_text(size=10),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8))
g1grob <- ggplotGrob(plot1)
g2grob <- ggplotGrob(plot2)
grid.arrange(g1grob, g2grob, ncol=2)
Distribution and Density of Residuals
QQ plot
Another common way for verifying whether the residuals are normally distributed is to look at the quantiles relative to the quantiles of a normal distribution. If the residuals follow a normal distribution they will line up with the distribution of the sample quantiles of a normal distribution, represented by the diagonal line in the QQ plot. Note that because of sampling, the tails will differ in terms of alignment of the two distributions in the QQ plot. In other words it is expected to see that at the tails, the distribution of the residuals will deviate slightly from the distribution of the sample quantiles of a normal distribution.
Figure shows that although the quantiles of the residuals are largely aligned with the quantiles of a theoretical quantiles of a normal distribution, they do deviate more than just slightly at the tails indicating that the residuals are not normally distributed.
ggplot(df, aes(sample = residuals(linreg))) +
stat_qq(size=0.4) +
stat_qq_line(size=0.4) +
labs(x = "Theoretical Quantiles", y = "Sample Quantiles") +
theme(plot.title = element_text(size=10),
axis.title.x = element_text(size=8),
axis.title.y = element_text(size=8))
QQ Plot
Shapiro-Wilk Normality Test
| statistic | p.value | method |
|---|---|---|
| 0.9825459 | 5.4e-06 | Shapiro-Wilk normality test |
The p-value 5.370401610^{-6} < 0.05 implying that the distribution of the data is significantly different from a normal distribution.
In summary, we cannot assume that the residuals are normally distributed. This means that the results of the F-test, T-test, confidence intervals and prediction intervals may not be valid because the residuals do not meet the requirement that residuals must be normally distributed.