Initialize the environment

# Make sure you have java 6 runtime, this is required for xlsx library
# For Mac, find help here https://support.apple.com/kb/DL1572?locale=en_US

# install packages
install.packages("xlsx", repos="http://cran.rstudio.com/")
install.packages("usdm", repos="http://cran.rstudio.com/")

# load libraries
library(xlsx)
library(usdm)

Read the data

# load data, my data is under datasets folder inside workspace
dairy_data = read.xlsx("/Users/uday/Documents/extra/R/workspace/datasets/DairyCattle_Model.xlsx", 1)

# Find out the independent or the predictor variables
indep_vars = colnames(dairy_data)[colnames(dairy_data) != "WaterConsumption"]
indep_vars = indep_vars[indep_vars != "CattleNumber"]

Variance Inflation Factors

Lets find out the collinearity between the predictor variables, i.e, how closely a variable is related to another variables.

# calculate vif on predictors
vif(subset(dairy_data, select = (indep_vars)))
##        Variables        VIF
## 1         Weight   4.092005
## 2       BodyTemp 120.017028
## 3   BodyHumidity   2.898050
## 4    AmbientTemp   7.066412
## 5 AmbientHumdity  53.909861
## 6     Medication   1.208230
## 7     FoodIntake  95.787927

We see that variables like body temperatures, AmbientTemp, AmbientHumdity and FoodIntake have higher vif values. lets try vifcor and vifstep functions from usdm package or R to exclude highly collinear varibales.

Using vifcor with threshold equal to 0.4

vifcor(subset(dairy_data, select = (indep_vars)), th=0.4)
## 2 variables from the 7 input variables have collinearity problem: 
##  
## BodyTemp FoodIntake 
## 
## After excluding the collinear variables, the linear correlation coefficients ranges between: 
## min correlation ( AmbientTemp ~ BodyHumidity ):  0.02762022 
## max correlation ( AmbientHumdity ~ AmbientTemp ):  0.3638971 
## 
## ---------- VIFs of the remained variables -------- 
##        Variables      VIF
## 1         Weight 1.073443
## 2   BodyHumidity 1.138330
## 3    AmbientTemp 1.192287
## 4 AmbientHumdity 1.322952
## 5     Medication 1.169576

Using vifstep with threshold equal to 4

vifstep(subset(dairy_data, select = (indep_vars)), th=4)
## 2 variables from the 7 input variables have collinearity problem: 
##  
## BodyTemp FoodIntake 
## 
## After excluding the collinear variables, the linear correlation coefficients ranges between: 
## min correlation ( AmbientTemp ~ BodyHumidity ):  0.02762022 
## max correlation ( AmbientHumdity ~ AmbientTemp ):  0.3638971 
## 
## ---------- VIFs of the remained variables -------- 
##        Variables      VIF
## 1         Weight 1.073443
## 2   BodyHumidity 1.138330
## 3    AmbientTemp 1.192287
## 4 AmbientHumdity 1.322952
## 5     Medication 1.169576

We see that BodyTemp and FoodIntake are two varibles that can be excluded to get better results. Lets calculate the multi linear regression now.

## Remove BodyTemp and FoodIntake from independent varibles
indep_vars2 = indep_vars[indep_vars != "BodyTemp"]
indep_vars2 = indep_vars2[indep_vars2 != "FoodIntake"]

## get the formula for linear modelling
f = as.formula(paste('WaterConsumption ~', paste(indep_vars2, collapse='+')))

## perform multiple linear regression
dairy_data.lm = lm(f, data = dairy_data)

## show summary
summary(dairy_data.lm)
## 
## Call:
## lm(formula = f, data = dairy_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4912 -0.5730  0.1407  0.5216  1.9122 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.222002   2.950953   0.414 0.683976    
## Weight          0.011109   0.001547   7.180 1.54e-06 ***
## BodyHumidity   -0.033961   0.061941  -0.548 0.590627    
## AmbientTemp     0.111083   0.035627   3.118 0.006259 ** 
## AmbientHumdity -0.213626   0.071434  -2.991 0.008218 ** 
## Medication      2.890074   0.679122   4.256 0.000534 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.142 on 17 degrees of freedom
## Multiple R-squared:  0.8333, Adjusted R-squared:  0.7843 
## F-statistic: 16.99 on 5 and 17 DF,  p-value: 4.377e-06