# Read the concrete dataset from its CSV file and report its dimensions.
# attach() is not used: every model and subset in this analysis receives the
# data frame explicitly (`data = Concrete.df` or `Concrete.df[...]`), so
# nothing relies on the columns being on the search path.
Concrete.df <- read.csv("Concrete_Data.csv")
dim(Concrete.df)
## [1] 1030   12

Column names of the data frame

# List the variable names held in the data frame
names(Concrete.df)
##  [1] "ID"               "Cement"           "BlastFurnace"     "FlyAsh"          
##  [5] "Water"            "Superplasticizer" "CoarseAggregate"  "FineAggregate"   
##  [9] "Age"              "Strength"         "new_input.1"      "new_input.2"

Summary Statistics of the data frame

# Per-column summary statistics (min, quartiles, mean, max) for every column
summary(Concrete.df)
##        ID             Cement       BlastFurnace       FlyAsh      
##  Min.   :   1.0   Min.   :102.0   Min.   :  0.0   Min.   :  0.00  
##  1st Qu.: 258.2   1st Qu.:192.4   1st Qu.:  0.0   1st Qu.:  0.00  
##  Median : 515.5   Median :272.9   Median : 22.0   Median :  0.00  
##  Mean   : 515.5   Mean   :281.2   Mean   : 73.9   Mean   : 54.19  
##  3rd Qu.: 772.8   3rd Qu.:350.0   3rd Qu.:142.9   3rd Qu.:118.30  
##  Max.   :1030.0   Max.   :540.0   Max.   :359.4   Max.   :200.10  
##      Water       Superplasticizer CoarseAggregate  FineAggregate  
##  Min.   :121.8   Min.   : 0.000   Min.   : 801.0   Min.   :594.0  
##  1st Qu.:164.9   1st Qu.: 0.000   1st Qu.: 932.0   1st Qu.:731.0  
##  Median :185.0   Median : 6.400   Median : 968.0   Median :779.5  
##  Mean   :181.6   Mean   : 6.205   Mean   : 972.9   Mean   :773.6  
##  3rd Qu.:192.0   3rd Qu.:10.200   3rd Qu.:1029.4   3rd Qu.:824.0  
##  Max.   :247.0   Max.   :32.200   Max.   :1145.0   Max.   :992.6  
##       Age            Strength      new_input.1    new_input.2    
##  Min.   :  1.00   Min.   : 2.33   Min.   :2074   Min.   : 33.60  
##  1st Qu.:  7.00   1st Qu.:23.71   1st Qu.:3457   1st Qu.: 60.71  
##  Median : 28.00   Median :34.45   Median :4081   Median : 84.87  
##  Mean   : 45.66   Mean   :35.82   Mean   :4013   Mean   : 87.35  
##  3rd Qu.: 56.00   3rd Qu.:46.13   3rd Qu.:4419   3rd Qu.:108.00  
##  Max.   :365.00   Max.   :82.60   Max.   :6679   Max.   :165.00

Q1 - Fit a multiple linear regression to the above data considering all the input variables. The information on the output variable is given in the information file.

Multiple Linear Regression Model

# Baseline model: regress Strength on the eight original mix inputs.
# ID and the two new_input columns are not part of this formula.
Model1 <- lm(
  Strength ~ Cement + BlastFurnace + FlyAsh + Water + Superplasticizer +
    CoarseAggregate + FineAggregate + Age,
  data = Concrete.df
)
summary(Model1)
## 
## Call:
## lm(formula = Strength ~ Cement + BlastFurnace + FlyAsh + Water + 
##     Superplasticizer + CoarseAggregate + FineAggregate + Age, 
##     data = Concrete.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.654  -6.302   0.703   6.569  34.450 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -23.331214  26.585504  -0.878 0.380372    
## Cement             0.119804   0.008489  14.113  < 2e-16 ***
## BlastFurnace       0.103866   0.010136  10.247  < 2e-16 ***
## FlyAsh             0.087934   0.012583   6.988 5.02e-12 ***
## Water             -0.149918   0.040177  -3.731 0.000201 ***
## Superplasticizer   0.292225   0.093424   3.128 0.001810 ** 
## CoarseAggregate    0.018086   0.009392   1.926 0.054425 .  
## FineAggregate      0.020190   0.010702   1.887 0.059491 .  
## Age                0.114222   0.005427  21.046  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.4 on 1021 degrees of freedom
## Multiple R-squared:  0.6155, Adjusted R-squared:  0.6125 
## F-statistic: 204.3 on 8 and 1021 DF,  p-value: < 2.2e-16

Q2 - Explain the results obtained from the statistical software along with the diagnostic results/plots. I expect your explanation to be detailed along with interpretations.

Assumptions and Diagnostics

Linearity of the Data

# Linearity check: first lm diagnostic plot (residuals against fitted values)
plot(Model1, which = 1)

Ideally, the residual plot will show no fitted pattern. That is, the red line should be approximately horizontal at zero. The presence of a pattern may indicate a problem with some aspect of the linear model.

In our example, there is no pattern in the residual plot. This suggests that we can assume a linear relationship between the predictors and the outcome variable.

Normality of Residuals

QQ Plot of Residuals

The QQ plot of residuals can be used to visually check the normality assumption. The normal probability plot of residuals should approximately follow a straight line.

In our example, all the points fall approximately along this reference line, so on visual inspection we can assume normality.

# Normality check: second lm diagnostic plot (normal Q-Q plot of residuals)
plot(Model1, which = 2)

Perform a Shapiro-Wilk Normality Test

library(MASS)
# Shapiro-Wilk normality test applied to the studentized residuals of Model1.
# studres() comes from the MASS package loaded just above.
sresid <- studres(Model1) 
shapiro.test(sresid)
## 
##  Shapiro-Wilk normality test
## 
## data:  sresid
## W = 0.99514, p-value = 0.002233

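The p-value (0.002233) is less than 0.05, so the test is significant and the residuals deviate from the normality assumption. Note, however, that W = 0.995 is very close to 1 and the sample is large (n = 1030), so the deviation is small in magnitude, which is why the QQ plot still looks close to a straight line.

Testing the Homoscedasticity Assumption

Scale-location plot

This assumption can be checked by examining the Scale-location plot, also known as the spread-location plot.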

# Homoscedasticity check: third lm diagnostic plot (scale-location)
plot(Model1, which = 3)

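This plot shows whether the residuals are spread equally along the range of fitted values. It is good if you see a horizontal line with equally spread points. In our example this appears to be the case, so from the plot alone we could assume homogeneity of variance; the formal tests that follow check this assumption more rigorously.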

ncvTest() For Homoscedasticity

library(car)
## Loading required package: carData

Loading required package: carData

# Score test for non-constant error variance (ncvTest from car). Called
# without a variance formula, so the variance is tested against the fitted
# values, as the "Variance formula" line of the output reports.
ncvTest(Model1)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 119.2972, Df = 1, p = < 2.22e-16

p < .05, so we reject the null hypothesis of constant error variance: the residuals are heteroscedastic.

Breusch-Pagan Test For Homoscedasticity

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Studentized Breusch-Pagan test for heteroscedasticity (bptest from lmtest)
bptest(Model1)
## 
##  studentized Breusch-Pagan test
## 
## data:  Model1
## BP = 137.2, df = 8, p-value < 2.2e-16

p < .05, so we again reject the null hypothesis of constant error variance: the residuals are heteroscedastic.

Goldfeld–Quandt Test For Homoscedasticity

library(lmtest)
# Goldfeld-Quandt test (gqtest from lmtest) with default arguments. As the
# output states, the alternative is one-sided: variance increases from
# segment 1 to segment 2.
gqtest(Model1)
## 
##  Goldfeld-Quandt test
## 
## data:  Model1
## GQ = 0.48746, df1 = 506, df2 = 506, p-value = 1
## alternative hypothesis: variance increases from segment 1 to 2

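p > .05, so this test finds no evidence that the variance increases from the first segment to the second. This does not by itself establish homoscedasticity: the alternative tested here is one-sided, and GQ = 0.487 is well below 1, which points to the variance decreasing across the ordered observations instead. Taken together with the ncvTest and Breusch-Pagan results above, the homoscedasticity assumption should be treated as violated.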

Testing the Independence (Autocorrelation) Assumption

# Durbin-Watson test for lag-1 autocorrelation of the Model1 residuals.
# durbinWatsonTest() is provided by the car package loaded earlier.
durbinWatsonTest(Model1)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.3559936      1.281638       0
##  Alternative hypothesis: rho != 0

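p < 0.05, so the errors are autocorrelated: the D-W statistic (1.28) is below 2 and the lag-1 autocorrelation is 0.36, indicating positive autocorrelation. We have violated the independence assumption.

Testing the Multicollinearity Assumption

Correlation Matrix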

library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Keep only the eight continuous explanatory variables used in Model1
expVar <- Concrete.df[, c(
  "Cement", "BlastFurnace", "FlyAsh", "Water",
  "Superplasticizer", "CoarseAggregate", "FineAggregate", "Age"
)]
# Pairwise correlations with their p-values (rcorr expects a matrix)
rcorr(as.matrix(expVar))
##                  Cement BlastFurnace FlyAsh Water Superplasticizer
## Cement             1.00        -0.28  -0.40 -0.08             0.09
## BlastFurnace      -0.28         1.00  -0.32  0.11             0.04
## FlyAsh            -0.40        -0.32   1.00 -0.26             0.38
## Water             -0.08         0.11  -0.26  1.00            -0.66
## Superplasticizer   0.09         0.04   0.38 -0.66             1.00
## CoarseAggregate   -0.11        -0.28  -0.01 -0.18            -0.27
## FineAggregate     -0.22        -0.28   0.08 -0.45             0.22
## Age                0.08        -0.04  -0.15  0.28            -0.19
##                  CoarseAggregate FineAggregate   Age
## Cement                     -0.11         -0.22  0.08
## BlastFurnace               -0.28         -0.28 -0.04
## FlyAsh                     -0.01          0.08 -0.15
## Water                      -0.18         -0.45  0.28
## Superplasticizer           -0.27          0.22 -0.19
## CoarseAggregate             1.00         -0.18  0.00
## FineAggregate              -0.18          1.00 -0.16
## Age                         0.00         -0.16  1.00
## 
## n= 1030 
## 
## 
## P
##                  Cement BlastFurnace FlyAsh Water  Superplasticizer
## Cement                  0.0000       0.0000 0.0088 0.0030          
## BlastFurnace     0.0000              0.0000 0.0006 0.1652          
## FlyAsh           0.0000 0.0000              0.0000 0.0000          
## Water            0.0088 0.0006       0.0000        0.0000          
## Superplasticizer 0.0030 0.1652       0.0000 0.0000                 
## CoarseAggregate  0.0004 0.0000       0.7495 0.0000 0.0000          
## FineAggregate    0.0000 0.0000       0.0111 0.0000 0.0000          
## Age              0.0085 0.1559       0.0000 0.0000 0.0000          
##                  CoarseAggregate FineAggregate Age   
## Cement           0.0004          0.0000        0.0085
## BlastFurnace     0.0000          0.0000        0.1559
## FlyAsh           0.7495          0.0111        0.0000
## Water            0.0000          0.0000        0.0000
## Superplasticizer 0.0000          0.0000        0.0000
## CoarseAggregate                  0.0000        0.9230
## FineAggregate    0.0000                        0.0000
## Age              0.9230          0.0000

Variance Inflation Factor for Multicollinearity

# Detect multicollinearity: variance inflation factor of each Model1
# predictor (vif from car). Larger values indicate a predictor that is more
# strongly explained by the other predictors.
library(car)
vif(Model1)
##           Cement     BlastFurnace           FlyAsh            Water 
##         7.488944         7.276963         6.170634         7.003957 
## Superplasticizer  CoarseAggregate    FineAggregate              Age 
##         2.963776         5.074617         7.005081         1.118367

Q3 - Based on your diagnostic plots' interpretations, do you want to recommend any changes in the model? If so, make the changes along with the reasons and fit the model again. If no change is required then support your arguments.

Variable Importance

library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
# Variable importance from caret for the lm fit; the scores printed below
# equal the absolute t statistics reported by summary(Model1).
varImp(Model1)
##                    Overall
## Cement           14.112863
## BlastFurnace     10.247419
## FlyAsh            6.988168
## Water             3.731446
## Superplasticizer  3.127937
## CoarseAggregate   1.925656
## FineAggregate     1.886652
## Age              21.046426

Taking the Most Important Variables in the Model (Also Reducing Multicollinearity)

# Reduced model: keep only Cement, BlastFurnace and Age, the three predictors
# with the largest importance scores for Model1.
Model2 <- lm(Strength ~ Cement + BlastFurnace + Age, data = Concrete.df)
summary(Model2)
## 
## Call:
## lm(formula = Strength ~ Cement + BlastFurnace + Age, data = Concrete.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.830  -9.095  -0.614   8.418  40.630 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.921151   1.340214    2.18   0.0295 *  
## Cement       0.088941   0.003980   22.34   <2e-16 ***
## BlastFurnace 0.058296   0.004810   12.12   <2e-16 ***
## Age          0.078438   0.006337   12.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.79 on 1026 degrees of freedom
## Multiple R-squared:  0.4151, Adjusted R-squared:  0.4134 
## F-statistic: 242.7 on 3 and 1026 DF,  p-value: < 2.2e-16

Q4 - In the same email, you will get another data set as “add columns”. Merge this new data with the previous data. Now fit a multiple linear regression on the merged data with all the input variables. Explain your results. Comment on the newly added variables and whether you want to keep them in the model or not. Justify your answer in either case.

Fitting Model With Extra Variables

# Reload the dataset; the CSV already holds the two added columns
# (new_input.1 and new_input.2) alongside the original ones.
# attach() is not used: the model below receives the data frame through
# `data =`, so nothing relies on the columns being on the search path.
Concrete.df <- read.csv("Concrete_Data.csv")
dim(Concrete.df)
## [1] 1030   12
# Extended model: the eight original inputs plus the two newly added columns
Model3 <- lm(
  Strength ~ Cement + BlastFurnace + FlyAsh + Water + Superplasticizer +
    CoarseAggregate + FineAggregate + Age + new_input.1 + new_input.2,
  data = Concrete.df
)
summary(Model3)
## 
## Call:
## lm(formula = Strength ~ Cement + BlastFurnace + FlyAsh + Water + 
##     Superplasticizer + CoarseAggregate + FineAggregate + Age + 
##     new_input.1 + new_input.2, data = Concrete.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.668  -6.309   0.693   6.609  34.527 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.131e+01  2.339e+02  -0.048  0.96146    
## Cement            1.463e+00  2.296e+01   0.064  0.94922    
## BlastFurnace      1.036e-01  1.033e-02  10.027  < 2e-16 ***
## FlyAsh            8.758e-02  1.287e-02   6.807  1.7e-11 ***
## Water            -1.266e-01  1.998e-01  -0.634  0.52655    
## Superplasticizer  2.940e-01  9.501e-02   3.095  0.00202 ** 
## CoarseAggregate   1.806e-02  9.445e-03   1.912  0.05612 .  
## FineAggregate     1.983e-02  1.104e-02   1.796  0.07273 .  
## Age               1.144e-01  5.629e-03  20.325  < 2e-16 ***
## new_input.1      -6.618e-04  5.508e-03  -0.120  0.90439    
## new_input.2      -4.474e+00  7.653e+01  -0.058  0.95339    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.41 on 1019 degrees of freedom
## Multiple R-squared:  0.6155, Adjusted R-squared:  0.6118 
## F-statistic: 163.1 on 10 and 1019 DF,  p-value: < 2.2e-16

Step AIC on Model 3

library(MASS)
# Backward elimination by AIC starting from Model3. No scope is supplied, so
# terms can only be dropped; the direction is spelled out for clarity.
StepAICModel <- stepAIC(Model3, direction = "backward")
## Start:  AIC=4836.89
## Strength ~ Cement + BlastFurnace + FlyAsh + Water + Superplasticizer + 
##     CoarseAggregate + FineAggregate + Age + new_input.1 + new_input.2
## 
##                    Df Sum of Sq    RSS    AIC
## - new_input.2       1         0 110412 4834.9
## - Cement            1         0 110412 4834.9
## - new_input.1       1         2 110413 4834.9
## - Water             1        43 110455 4835.3
## <none>                          110411 4836.9
## - FineAggregate     1       350 110761 4838.1
## - CoarseAggregate   1       396 110807 4838.6
## - Superplasticizer  1      1038 111449 4844.5
## - FlyAsh            1      5021 115432 4880.7
## - BlastFurnace      1     10895 121306 4931.8
## - Age               1     44759 155170 5185.4
## 
## Step:  AIC=4834.9
## Strength ~ Cement + BlastFurnace + FlyAsh + Water + Superplasticizer + 
##     CoarseAggregate + FineAggregate + Age + new_input.1
## 
##                    Df Sum of Sq    RSS    AIC
## - new_input.1       1         2 110413 4832.9
## - Water             1        43 110455 4833.3
## <none>                          110412 4834.9
## - FineAggregate     1       354 110766 4836.2
## - CoarseAggregate   1       402 110814 4836.6
## - Superplasticizer  1      1040 111452 4842.6
## - FlyAsh            1      5070 115481 4879.1
## - BlastFurnace      1     10910 121322 4930.0
## - Cement            1     17396 127807 4983.6
## - Age               1     44760 155171 5183.4
## 
## Step:  AIC=4832.91
## Strength ~ Cement + BlastFurnace + FlyAsh + Water + Superplasticizer + 
##     CoarseAggregate + FineAggregate + Age
## 
##                    Df Sum of Sq    RSS    AIC
## <none>                          110413 4832.9
## - FineAggregate     1       385 110798 4834.5
## - CoarseAggregate   1       401 110814 4834.6
## - Superplasticizer  1      1058 111471 4840.7
## - Water             1      1506 111919 4844.9
## - FlyAsh            1      5281 115694 4879.0
## - BlastFurnace      1     11356 121769 4931.7
## - Cement            1     21539 131952 5014.5
## - Age               1     47902 158315 5202.1
# Coefficient table and fit statistics of the model selected by stepAIC
summary(StepAICModel)
## 
## Call:
## lm(formula = Strength ~ Cement + BlastFurnace + FlyAsh + Water + 
##     Superplasticizer + CoarseAggregate + FineAggregate + Age, 
##     data = Concrete.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.654  -6.302   0.703   6.569  34.450 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -23.331214  26.585504  -0.878 0.380372    
## Cement             0.119804   0.008489  14.113  < 2e-16 ***
## BlastFurnace       0.103866   0.010136  10.247  < 2e-16 ***
## FlyAsh             0.087934   0.012583   6.988 5.02e-12 ***
## Water             -0.149918   0.040177  -3.731 0.000201 ***
## Superplasticizer   0.292225   0.093424   3.128 0.001810 ** 
## CoarseAggregate    0.018086   0.009392   1.926 0.054425 .  
## FineAggregate      0.020190   0.010702   1.887 0.059491 .  
## Age                0.114222   0.005427  21.046  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.4 on 1021 degrees of freedom
## Multiple R-squared:  0.6155, Adjusted R-squared:  0.6125 
## F-statistic: 204.3 on 8 and 1021 DF,  p-value: < 2.2e-16

Q5 - You have decided that you will only allow three input variables in the model to make it simple. Choose the most appropriate three input variables and justify your answer along with results.

Variable Importance

library(caret)
# Importance scores for the stepAIC-selected model; Age, Cement and
# BlastFurnace have the three largest values in the output below.
varImp(StepAICModel)
##                    Overall
## Cement           14.112863
## BlastFurnace     10.247419
## FlyAsh            6.988168
## Water             3.731446
## Superplasticizer  3.127937
## CoarseAggregate   1.925656
## FineAggregate     1.886652
## Age              21.046426

Taking the Most Important Three variables in the Model

# Three-variable model built from the predictors with the highest importance
# scores: Cement, BlastFurnace and Age.
Model4 <- lm(Strength ~ Cement + BlastFurnace + Age, data = Concrete.df)
summary(Model4)
## 
## Call:
## lm(formula = Strength ~ Cement + BlastFurnace + Age, data = Concrete.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.830  -9.095  -0.614   8.418  40.630 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.921151   1.340214    2.18   0.0295 *  
## Cement       0.088941   0.003980   22.34   <2e-16 ***
## BlastFurnace 0.058296   0.004810   12.12   <2e-16 ***
## Age          0.078438   0.006337   12.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.79 on 1026 degrees of freedom
## Multiple R-squared:  0.4151, Adjusted R-squared:  0.4134 
## F-statistic: 242.7 on 3 and 1026 DF,  p-value: < 2.2e-16