# Make sure you have java 6 runtime, this is required for xlsx library
# For Mac, find help here https://support.apple.com/kb/DL1572?locale=en_US
# install packages
install.packages("xlsx", repos="http://cran.rstudio.com/")
install.packages("usdm", repos="http://cran.rstudio.com/")
# load libraries
library(xlsx)
library(usdm)
# load data, my data is under datasets folder inside workspace
dairy_data = read.xlsx("/Users/uday/Documents/extra/R/workspace/datasets/DairyCattle_Model.xlsx", 1)
# Find out the independent or the predictor variables
indep_vars = colnames(dairy_data)[colnames(dairy_data) != "WaterConsumption"]
indep_vars = indep_vars[indep_vars != "CattleNumber"]
Lets find out the collinearity between the predictor variables, i.e, how closely a variable is related to another variables.
# calculate vif on predictors
vif(subset(dairy_data, select = (indep_vars)))
## Variables VIF
## 1 Weight 4.092005
## 2 BodyTemp 120.017028
## 3 BodyHumidity 2.898050
## 4 AmbientTemp 7.066412
## 5 AmbientHumdity 53.909861
## 6 Medication 1.208230
## 7 FoodIntake 95.787927
We see that variables like body temperatures, AmbientTemp, AmbientHumdity and FoodIntake have higher vif values. lets try vifcor and vifstep functions from usdm package or R to exclude highly collinear varibales.
Using vifcor with threshold equal to 0.4
vifcor(subset(dairy_data, select = (indep_vars)), th=0.4)
## 2 variables from the 7 input variables have collinearity problem:
##
## BodyTemp FoodIntake
##
## After excluding the collinear variables, the linear correlation coefficients ranges between:
## min correlation ( AmbientTemp ~ BodyHumidity ): 0.02762022
## max correlation ( AmbientHumdity ~ AmbientTemp ): 0.3638971
##
## ---------- VIFs of the remained variables --------
## Variables VIF
## 1 Weight 1.073443
## 2 BodyHumidity 1.138330
## 3 AmbientTemp 1.192287
## 4 AmbientHumdity 1.322952
## 5 Medication 1.169576
Using vifstep with threshold equal to 4
vifstep(subset(dairy_data, select = (indep_vars)), th=4)
## 2 variables from the 7 input variables have collinearity problem:
##
## BodyTemp FoodIntake
##
## After excluding the collinear variables, the linear correlation coefficients ranges between:
## min correlation ( AmbientTemp ~ BodyHumidity ): 0.02762022
## max correlation ( AmbientHumdity ~ AmbientTemp ): 0.3638971
##
## ---------- VIFs of the remained variables --------
## Variables VIF
## 1 Weight 1.073443
## 2 BodyHumidity 1.138330
## 3 AmbientTemp 1.192287
## 4 AmbientHumdity 1.322952
## 5 Medication 1.169576
We see that BodyTemp and FoodIntake are two varibles that can be excluded to get better results. Lets calculate the multi linear regression now.
## Remove BodyTemp and FoodIntake from independent varibles
indep_vars2 = indep_vars[indep_vars != "BodyTemp"]
indep_vars2 = indep_vars2[indep_vars2 != "FoodIntake"]
## get the formula for linear modelling
f = as.formula(paste('WaterConsumption ~', paste(indep_vars2, collapse='+')))
## perform multiple linear regression
dairy_data.lm = lm(f, data = dairy_data)
## show summary
summary(dairy_data.lm)
##
## Call:
## lm(formula = f, data = dairy_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4912 -0.5730 0.1407 0.5216 1.9122
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.222002 2.950953 0.414 0.683976
## Weight 0.011109 0.001547 7.180 1.54e-06 ***
## BodyHumidity -0.033961 0.061941 -0.548 0.590627
## AmbientTemp 0.111083 0.035627 3.118 0.006259 **
## AmbientHumdity -0.213626 0.071434 -2.991 0.008218 **
## Medication 2.890074 0.679122 4.256 0.000534 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.142 on 17 degrees of freedom
## Multiple R-squared: 0.8333, Adjusted R-squared: 0.7843
## F-statistic: 16.99 on 5 and 17 DF, p-value: 4.377e-06