MLR: Cement data set

Data preparation

# Load the cement data; the first row of the CSV supplies the column names.
cementdata <- read.csv(
  "C:/Users/welcome/Downloads/BEST-Hald Cement data.csv",
  header = TRUE
)

# NOTE(review): attach() is kept only because later statements in this file
# refer to Y, X1 and X2 as bare names. Prefer passing `data = cementdata`
# and dropping this call once those statements are updated.
attach(cementdata)

# Design matrix: an intercept column plus every column other than Y.
x <- model.matrix(Y ~ ., data = cementdata)

# Response as a one-column matrix, selected by name so that it always agrees
# with the left-hand side of the formula regardless of column order.
y <- as.matrix(cementdata[["Y"]])

Test of hypothesis

Null hypothesis: B1 = B2 = B3 = B4 = 0.
Alternative hypothesis: At least one Bi != 0.

# Full model: regress Y on every other column of cementdata.
mlreg1 <- lm(formula = Y ~ ., data = cementdata)

# Sequential ANOVA table for the full model; each term is assessed after the
# terms listed above it.
modelcheck <- anova(object = mlreg1)

# Display the table.
print(modelcheck)

## Analysis of Variance Table
## 
## Response: Y
##           Df  Sum Sq Mean Sq  F value    Pr(>F)    
## X1         1 1450.08 1450.08 242.3679 2.888e-07 ***
## X2         1 1207.78 1207.78 201.8705 5.863e-07 ***
## X3         1    9.79    9.79   1.6370    0.2366    
## X4         1    0.25    0.25   0.0413    0.8441    
## Residuals  8   47.86    5.98                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Coefficient table and overall fit statistics for the full model (all four
# predictors). The overall F-statistic reported here is the test of
# H0: B1 = B2 = B3 = B4 = 0.
summary(mlreg1) # Adjusted R-square: 0.9736 and Residual standard error: 2.446

## 
## Call:
## lm(formula = Y ~ ., data = cementdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1750 -1.6709  0.2508  1.3783  3.9254 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  62.4054    70.0710   0.891   0.3991  
## X1            1.5511     0.7448   2.083   0.0708 .
## X2            0.5102     0.7238   0.705   0.5009  
## X3            0.1019     0.7547   0.135   0.8959  
## X4           -0.1441     0.7091  -0.203   0.8441  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.446 on 8 degrees of freedom
## Multiple R-squared:  0.9824, Adjusted R-squared:  0.9736 
## F-statistic: 111.5 on 4 and 8 DF,  p-value: 4.756e-07

Remove the insignificant variables (X3 and X4) from the model and refit.

# Reduced model with X1 and X2 only. The variables are taken from cementdata
# explicitly, so the fit does not depend on attach() or on any same-named
# objects in the global environment.
mlreg2 <- lm(Y ~ X1 + X2, data = cementdata)

# Sequential ANOVA table for the reduced model; it reaffirms that X1 and X2
# each explain a significant share of the variability in Y.
modelcheck2 <- anova(mlreg2)

print(modelcheck2)

## Analysis of Variance Table
## 
## Response: Y
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## X1         1 1450.1 1450.08  250.43 2.088e-08 ***
## X2         1 1207.8 1207.78  208.58 5.029e-08 ***
## Residuals 10   57.9    5.79                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Coefficient table and overall fit statistics for the reduced model
# (X1 and X2 only).
summary(mlreg2) # Adjusted R-Square: 0.9744 and Residual standard error: 2.406

## 
## Call:
## lm(formula = Y ~ X1 + X2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.893 -1.574 -1.302  1.363  4.048 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 52.57735    2.28617   23.00 5.46e-10 ***
## X1           1.46831    0.12130   12.11 2.69e-07 ***
## X2           0.66225    0.04585   14.44 5.03e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.406 on 10 degrees of freedom
## Multiple R-squared:  0.9787, Adjusted R-squared:  0.9744 
## F-statistic: 229.5 on 2 and 10 DF,  p-value: 4.407e-09

We select the refit model because it explains the variability in Y with fewer predictors while achieving a slightly higher adjusted R-squared (0.9744 vs. 0.9736).

Confidence intervals for the coefficients

# Two-sided 95% confidence limits for each coefficient of the reduced model
# (intercept, X1, X2); 0.95 is confint()'s default level, stated explicitly.
CI_coeff <- confint(object = mlreg2, level = 0.95)

# Display the lower (2.5 %) and upper (97.5 %) limits.
print(CI_coeff)

##                  2.5 %     97.5 %
## (Intercept) 47.4834350 57.6712627
## X1           1.1980304  1.7385810
## X2           0.5600798  0.7644212

Confidence interval for the mean response prediction

# 95% confidence intervals for the mean response at each observed (X1, X2)
# combination. The predictor columns are taken from cementdata directly
# rather than from attach()-ed bare names, which a global X1 or X2 could mask.
CI_meanresponse <- predict(
  mlreg2,
  newdata = cementdata[c("X1", "X2")],
  interval = "confidence",
  level = 0.95
)

# Columns: fit (point estimate), lwr and upr (interval limits).
print(CI_meanresponse)

##          fit       lwr       upr
## 1   80.07400  77.38679  82.76122
## 2   73.25092  70.50710  75.99473
## 3  105.81474 103.96593 107.66355
## 4   89.25848  86.61956  91.89740
## 5   97.29251  95.74212  98.84291
## 6  105.15249 103.33331 106.97167
## 7  104.00205 100.77704 107.22706
## 8   74.57542  71.94224  77.20860
## 9   91.27549  89.00610  93.54487
## 10 114.53754 110.56117 118.51391
## 11  80.53567  78.23565  82.83570
## 12 112.43724 110.05956 114.81493
## 13 112.29344 109.81199 114.77489

Prediction interval for an individual response

# 95% prediction intervals for an individual response at each observed
# (X1, X2) combination. Despite the "CI_" prefix in the variable name, these
# are prediction intervals, which are wider than the confidence intervals for
# the mean response. The predictor columns are taken from cementdata directly
# rather than from attach()-ed bare names, and the interval type is spelled
# out in full instead of relying on partial matching of "pred".
CI_indresponse <- predict(
  mlreg2,
  newdata = cementdata[c("X1", "X2")],
  interval = "prediction",
  level = 0.95
)

# Columns: fit (point estimate), lwr and upr (interval limits).
print(CI_indresponse)

##          fit       lwr       upr
## 1   80.07400  74.07664  86.07137
## 2   73.25092  67.22798  79.27386
## 3  105.81474 100.14329 111.48619
## 4   89.25848  83.28260  95.23436
## 5   97.29251  91.71121 102.87382
## 6  105.15249  99.49063 110.81435
## 7  104.00205  97.74522 110.25888
## 8   74.57542  68.60207  80.54877
## 9   91.27549  85.45334  97.09763
## 10 114.53754 107.86230 121.21278
## 11  80.53567  74.70152  86.36983
## 12 112.43724 106.57204 118.30245
## 13 112.29344 106.38540 118.20147

MLR: Cement data set

karthik

June 11, 2017