Data Managment
attach(milk2mth)
names(milk2mth)
str(milk2mth)
as.factor(breed)
str(milk2mth)
head(milk2mth)
tail(milk2mth)
summary(milk2mth)
# Replace missing values of each numeric column with the mean
milk2mth[] = lapply(milk2mth, function(x) {
if(is.numeric(x)) {
x[is.na(x)] <- mean(x, na.rm = TRUE)
}
return(x)
})
# Check for missing values in the dataset
check = is.na(milk2mth)
num_missing = sum(check)
print(num_missing)
#Ans to the question no 1 ##Descriptive Analysis
#Descriptive Analysis by Num_var
library(psych)
describe(milk2mth[,1:9])
#Describe Breed Wise
describe.by(milk2mth,milk2mth$breed)
#Descriptive Analysis of Cat
Frequency_C = table(milk2mth$breed)
Frequency_C
Proporation = prop.table(Frequency_C)
Proporation
Percentage = Proporation*100
Percentage
#Check for normality
ks.test(milk2mth$scc2, "pnorm", mean(milk2mth$scc2), sd(milk2mth$scc2)) #Data is not normally distributed
#Curve
hist(milk2mth$scc2, probability = TRUE, main = "Histogram with Density Curve")
lines(density(milk2mth$scc2), col = "blue")
qqnorm(milk2mth$scc2)
qqline(milk2mth$scc2, col = "red")
#Ans to the question no 2
#correlation between scc2 and the continuous risk factors (milk1,milk2,parity,pctprot1,pctprot2,pctfat1,pctfat2)
names(milk2mth)
[1] "milk1" "milk2" "parity" "pctprot1" "pctprot2" "scc1" "scc2" "pctfat1" "pctfat2" "breed"
[11] "cowno" "herdno" "log_scc2"
#cor.test(scc2, milk1, method = "pearson")
#cor.test(scc2, milk2, method = "pearson")
#cor.test(scc2, parity, method = "pearson")
#Shortcup
corr= cor(milk2mth[ , c("scc2" , "milk1" , "milk2" , "parity" ,"pctprot1" ,"pctprot2" , "pctfat1" ,"pctfat2")],method = "spearman")
corr
scc2 milk1 milk2 parity pctprot1 pctprot2 pctfat1 pctfat2
scc2 1.0000000 -0.1796812 0.198598024 0.135205021 -0.12259215 0.42797447 -0.14313063 0.33175013
milk1 -0.1796812 1.0000000 0.586481275 0.031080499 0.70559972 -0.25410367 0.73281821 -0.11552890
milk2 0.1985980 0.5864813 1.000000000 -0.003085303 0.36244947 0.02649548 0.36433698 0.12214990
parity 0.1352050 0.0310805 -0.003085303 1.000000000 -0.07074898 -0.04888924 -0.03356184 -0.07513782
pctprot1 -0.1225921 0.7055997 0.362449469 -0.070748977 1.00000000 0.09410705 0.94334528 0.14729352
pctprot2 0.4279745 -0.2541037 0.026495476 -0.048889244 0.09410705 1.00000000 0.04961559 0.76418897
pctfat1 -0.1431306 0.7328182 0.364336979 -0.033561838 0.94334528 0.04961559 1.00000000 0.17768677
pctfat2 0.3317501 -0.1155289 0.122149901 -0.075137815 0.14729352 0.76418897 0.17768677 1.00000000
scc2_corr = corr["scc2",]
scc2_corr
scc2 milk1 milk2 parity pctprot1 pctprot2 pctfat1 pctfat2
1.0000000 -0.1796812 0.1985980 0.1352050 -0.1225921 0.4279745 -0.1431306 0.3317501
#Heat Map
library(ggcorrplot)
Warning: package ‘ggcorrplot’ was built under R version 4.3.3
ggcorrplot(corr)
#library(ggstatsplot) - DO not work
#corrrltn co-effecient
library(metan)
corr_coeff = corr_coef(data=milk2mth,scc2,milk1,milk2,parity,pctprot1,pctprot2,pctfat1,pctfat2)
corr_coeff
---------------------------------------------------------------------------
Pearson's correlation coefficient
---------------------------------------------------------------------------
scc2 milk1 milk2 parity pctprot1 pctprot2 pctfat1 pctfat2
scc2 1.0000 -0.1523 0.0424 0.2186 -0.1692 0.2085 -0.1653 0.1557
milk1 -0.1523 1.0000 0.5446 0.0381 0.8209 -0.0592 0.7887 -0.0463
milk2 0.0424 0.5446 1.0000 0.0145 0.4071 0.5002 0.3688 0.4285
parity 0.2186 0.0381 0.0145 1.0000 -0.0558 -0.0979 -0.0318 -0.0957
pctprot1 -0.1692 0.8209 0.4071 -0.0558 1.0000 0.1008 0.9748 0.1148
pctprot2 0.2085 -0.0592 0.5002 -0.0979 0.1008 1.0000 0.0997 0.9170
pctfat1 -0.1653 0.7887 0.3688 -0.0318 0.9748 0.0997 1.0000 0.1447
pctfat2 0.1557 -0.0463 0.4285 -0.0957 0.1148 0.9170 0.1447 1.0000
---------------------------------------------------------------------------
p-values for the correlation coefficients
---------------------------------------------------------------------------
scc2 milk1 milk2 parity pctprot1 pctprot2 pctfat1 pctfat2
scc2 0.00e+00 3.82e-12 5.45e-02 1.18e-23 1.14e-14 1.24e-21 4.69e-14 1.24e-12
milk1 3.82e-12 0.00e+00 3.65e-159 8.41e-02 0.00e+00 7.22e-03 0.00e+00 3.56e-02
milk2 5.45e-02 3.65e-159 0.00e+00 5.11e-01 6.54e-83 1.22e-130 2.87e-67 1.29e-92
parity 1.18e-23 8.41e-02 5.11e-01 0.00e+00 1.14e-02 8.78e-06 1.50e-01 1.37e-05
pctprot1 1.14e-14 0.00e+00 6.54e-83 1.14e-02 0.00e+00 4.65e-06 0.00e+00 1.79e-07
pctprot2 1.24e-21 7.22e-03 1.22e-130 8.78e-06 4.65e-06 0.00e+00 5.87e-06 0.00e+00
pctfat1 4.69e-14 0.00e+00 2.87e-67 1.50e-01 0.00e+00 5.87e-06 0.00e+00 4.41e-11
pctfat2 1.24e-12 3.56e-02 1.29e-92 1.37e-05 1.79e-07 0.00e+00 4.41e-11 0.00e+00
plot(corr_coeff)
NA
NA
NA
#Ans to the question no 3
#Linearity of scc2
lm_output1 = lm(scc2~milk1, data=milk2mth)
summary(lm_output1)
Call:
lm(formula = scc2 ~ milk1, data = milk2mth)
Residuals:
Min 1Q Median 3Q Max
-556.5 -346.5 -218.5 3.3 8183.5
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 556.473 27.334 20.359 < 2e-16 ***
milk1 -12.218 1.749 -6.985 3.82e-12 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 874 on 2054 degrees of freedom
Multiple R-squared: 0.02321, Adjusted R-squared: 0.02273
F-statistic: 48.8 on 1 and 2054 DF, p-value: 3.82e-12
#plot
library(ggplot2)
ggplot(data=milk2mth, aes(x=milk1, y=scc2))+
geom_point()+
geom_smooth(method ="lm")+
theme_minimal()
##
lm_output3 = lm(scc2~pctprot1, data=milk2mth)
summary(lm_output3)
Call:
lm(formula = scc2 ~ pctprot1, data = milk2mth)
Residuals:
Min 1Q Median 3Q Max
-584.1 -328.4 -215.4 0.0 8155.9
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 584.10 28.43 20.54 < 2e-16 ***
pctprot1 -85.38 10.97 -7.78 1.14e-14 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 871.6 on 2054 degrees of freedom
Multiple R-squared: 0.02862, Adjusted R-squared: 0.02815
F-statistic: 60.53 on 1 and 2054 DF, p-value: 1.141e-14
#plot
ggplot(data=milk2mth, aes(x=pctprot1, y=scc2))+
geom_point(color="purple")+
geom_smooth(method ="lm")+
theme_minimal()
##
lm_output4 = lm(scc2~pctprot2, data=milk2mth)
summary(lm_output4)
Call:
lm(formula = scc2 ~ pctprot2, data = milk2mth)
Residuals:
Min 1Q Median 3Q Max
-654.3 -387.6 -223.8 2.8 8306.0
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.83 47.84 -0.059 0.953
pctprot2 136.50 14.12 9.663 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 864.9 on 2054 degrees of freedom
Multiple R-squared: 0.04348, Adjusted R-squared: 0.04302
F-statistic: 93.38 on 1 and 2054 DF, p-value: < 2.2e-16
#plot
ggplot(data=milk2mth, aes(x=pctprot2, y=scc2))+
geom_point(color="purple")+
geom_smooth(method ="lm")+
theme_minimal()
##
lm_output5 = lm(scc2~pctfat1, data=milk2mth)
summary(lm_output5)
Call:
lm(formula = scc2 ~ pctfat1, data = milk2mth)
Residuals:
Min 1Q Median 3Q Max
-575.5 -342.7 -215.4 -2.1 8164.5
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 575.535 27.993 20.560 < 2e-16 ***
pctfat1 -61.570 8.108 -7.594 4.69e-14 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 872.2 on 2054 degrees of freedom
Multiple R-squared: 0.02731, Adjusted R-squared: 0.02683
F-statistic: 57.67 on 1 and 2054 DF, p-value: 4.686e-14
#plot
ggplot(data=milk2mth, aes(x=pctfat1, y=scc2))+
geom_point(color="violet")+
geom_smooth(method ="lm")+
theme_minimal()
##
lm_output6 = lm(scc2~pctfat2, data=milk2mth)
summary(lm_output5)
Call:
lm(formula = scc2 ~ pctfat1, data = milk2mth)
Residuals:
Min 1Q Median 3Q Max
-575.5 -342.7 -215.4 -2.1 8164.5
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 575.535 27.993 20.560 < 2e-16 ***
pctfat1 -61.570 8.108 -7.594 4.69e-14 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 872.2 on 2054 degrees of freedom
Multiple R-squared: 0.02731, Adjusted R-squared: 0.02683
F-statistic: 57.67 on 1 and 2054 DF, p-value: 4.686e-14
#plot
ggplot(data=milk2mth, aes(x=pctfat2, y=scc2))+
geom_point(color="violet")+
geom_smooth(method ="lm")+
theme_minimal()
#Ans to the question no 4
lm_output2 = lm(scc2~milk2, data=milk2mth)
summary(lm_output2)
Call:
lm(formula = scc2 ~ milk2, data = milk2mth)
Residuals:
Min 1Q Median 3Q Max
-508.1 -357.7 -286.8 0.0 8350.2
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 357.675 38.287 9.342 <2e-16 ***
milk2 4.115 2.139 1.924 0.0545 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 883.5 on 2054 degrees of freedom
Multiple R-squared: 0.001799, Adjusted R-squared: 0.001313
F-statistic: 3.703 on 1 and 2054 DF, p-value: 0.05446
#plot
ggplot(data=milk2mth, aes(x=milk2, y=scc2))+
geom_point(color="red")+
geom_smooth(method ="lm")+
theme_minimal()
confint(lm_output2 )
2.5 % 97.5 %
(Intercept) 282.5886129 432.760611
milk2 -0.0788472 8.309708
#No significant linear Relation
#Ans to the question no 5
# Fit a multivariable linear regression model
multi_lm_model = lm(scc2~milk1+parity+pctprot1+pctprot2+pctfat1+pctfat2 ,data = milk2mth)
summary(multi_lm_model)
Call:
lm(formula = scc2 ~ milk1 + parity + pctprot1 + pctprot2 + pctfat1 +
pctfat2, data = milk2mth)
Residuals:
Min 1Q Median 3Q Max
-1211.2 -353.1 -131.9 88.1 8082.6
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -264.3969 60.5843 -4.364 1.34e-05 ***
milk1 0.1316 3.0147 0.044 0.965181
parity 146.8230 13.2752 11.060 < 2e-16 ***
pctprot1 -90.3943 53.7136 -1.683 0.092548 .
pctprot2 276.9606 35.7022 7.758 1.36e-14 ***
pctfat1 0.2410 37.0373 0.007 0.994809
pctfat2 -85.2491 24.7864 -3.439 0.000595 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 821.4 on 2049 degrees of freedom
Multiple R-squared: 0.1393, Adjusted R-squared: 0.1368
F-statistic: 55.28 on 6 and 2049 DF, p-value: < 2.2e-16
multi_lm_model2 = lm(scc2~milk1+parity+pctprot1+pctprot2+pctfat2 ,data = milk2mth)
summary(multi_lm_model2)
Call:
lm(formula = scc2 ~ milk1 + parity + pctprot1 + pctprot2 + pctfat2,
data = milk2mth)
Residuals:
Min 1Q Median 3Q Max
-1211.3 -353.0 -132.0 88.1 8082.5
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -264.4177 60.4850 -4.372 1.29e-05 ***
milk1 0.1297 3.0002 0.043 0.965510
parity 146.8339 13.1657 11.153 < 2e-16 ***
pctprot1 -90.0674 18.9759 -4.746 2.21e-06 ***
pctprot2 276.8840 33.6978 8.217 3.66e-16 ***
pctfat2 -85.1920 23.1730 -3.676 0.000243 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 821.2 on 2050 degrees of freedom
Multiple R-squared: 0.1393, Adjusted R-squared: 0.1372
F-statistic: 66.37 on 5 and 2050 DF, p-value: < 2.2e-16
multi_lm_model3 = lm(scc2~parity+pctprot1+pctprot2+pctfat2 ,data = milk2mth) #Final Model
summary(multi_lm_model3)
Call:
lm(formula = scc2 ~ parity + pctprot1 + pctprot2 + pctfat2, data = milk2mth)
Residuals:
Min 1Q Median 3Q Max
-1211.4 -353.2 -132.1 88.0 8082.4
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -264.01 59.74 -4.419 1.04e-05 ***
parity 146.91 13.05 11.254 < 2e-16 ***
pctprot1 -89.38 10.42 -8.579 < 2e-16 ***
pctprot2 276.80 33.64 8.229 3.31e-16 ***
pctfat2 -85.24 23.14 -3.683 0.000236 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 821 on 2051 degrees of freedom
Multiple R-squared: 0.1393, Adjusted R-squared: 0.1376
F-statistic: 83 on 4 and 2051 DF, p-value: < 2.2e-16
attributes(multi_lm_model3)
$names
[1] "coefficients" "residuals" "effects" "rank" "fitted.values" "assign" "qr"
[8] "df.residual" "xlevels" "call" "terms" "model"
$class
[1] "lm"
#Diagnosis of our model
plot(multi_lm_model3)
# Plot the residuals
par(mfrow = c(2, 2))
plot(multi_lm_model3)
#Perform normality test on residuals
shapiro.test(residuals(multi_lm_model3)) #(p-value < 0.05), so data is non-normal
Shapiro-Wilk normality test
data: residuals(multi_lm_model3)
W = 0.5793, p-value < 2.2e-16
#log transformation
milk2mth$log_scc2 = log(milk2mth$scc2 + 1) # Adding 1 to avoid log(0)
# Refit the model with log-transformed scc2
log_lm_model = lm(log_scc2 ~ parity+pctprot1+pctprot2+pctfat2 ,data = milk2mth)
summary(log_lm_model)
Call:
lm(formula = log_scc2 ~ parity + pctprot1 + pctprot2 + pctfat2,
data = milk2mth)
Residuals:
Min 1Q Median 3Q Max
-4.7772 -0.7713 -0.1252 0.8334 4.7822
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.13395 0.09511 1.408 0.159
parity 0.20214 0.02078 9.726 < 2e-16 ***
pctprot1 -0.13430 0.01659 -8.097 9.56e-16 ***
pctprot2 1.69871 0.05355 31.721 < 2e-16 ***
pctfat2 -0.27154 0.03684 -7.370 2.46e-13 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.307 on 2051 degrees of freedom
Multiple R-squared: 0.6571, Adjusted R-squared: 0.6564
F-statistic: 982.6 on 4 and 2051 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(log_lm_model)
shapiro.test(residuals(log_lm_model))
Shapiro-Wilk normality test
data: residuals(log_lm_model)
W = 0.99252, p-value = 9.425e-09
#Ans to the question no 1 #Final Model #log_lm_model = lm(log_scc2 ~ parity+pctprot1+pctprot2+pctfat2 ,data = milk2mth)