# Load the concrete dataset from its CSV file into a data frame.
Concrete.df <- read.csv(file = "Concrete_Data.csv")
# NOTE(review): attach() places the columns on the search path. The lm() calls
# visible in this script already pass `data = Concrete.df`; confirm nothing
# else relies on the attached names before considering its removal.
attach(what = Concrete.df)
# Print the dimensions (rows, columns) of the loaded data frame.
dim(Concrete.df)
## [1] 1030 12
## [1] "ID" "Cement" "BlastFurnace" "FlyAsh"
## [5] "Water" "SuperPlasticizer" "CoarseAggregate" "FineAggregate"
## [9] "Age" "Strength" "new_input.1" "new_input.2"
## ID Cement BlastFurnace FlyAsh
## Min. : 1.0 Min. :102.0 Min. : 0.0 Min. : 0.00
## 1st Qu.: 258.2 1st Qu.:192.4 1st Qu.: 0.0 1st Qu.: 0.00
## Median : 515.5 Median :272.9 Median : 22.0 Median : 0.00
## Mean : 515.5 Mean :281.2 Mean : 73.9 Mean : 54.19
## 3rd Qu.: 772.8 3rd Qu.:350.0 3rd Qu.:142.9 3rd Qu.:118.30
## Max. :1030.0 Max. :540.0 Max. :359.4 Max. :200.10
## Water SuperPlasticizer CoarseAggregate FineAggregate
## Min. :121.8 Min. : 0.000 Min. : 801.0 Min. :594.0
## 1st Qu.:164.9 1st Qu.: 0.000 1st Qu.: 932.0 1st Qu.:731.0
## Median :185.0 Median : 6.400 Median : 968.0 Median :779.5
## Mean :181.6 Mean : 6.205 Mean : 972.9 Mean :773.6
## 3rd Qu.:192.0 3rd Qu.:10.200 3rd Qu.:1029.4 3rd Qu.:824.0
## Max. :247.0 Max. :32.200 Max. :1145.0 Max. :992.6
## Age Strength new_input.1 new_input.2
## Min. : 1.00 Min. : 2.33 Min. :2074 Min. : 33.60
## 1st Qu.: 7.00 1st Qu.:23.71 1st Qu.:3457 1st Qu.: 60.71
## Median : 28.00 Median :34.45 Median :4081 Median : 84.87
## Mean : 45.66 Mean :35.82 Mean :4013 Mean : 87.35
## 3rd Qu.: 56.00 3rd Qu.:46.13 3rd Qu.:4419 3rd Qu.:108.00
## Max. :365.00 Max. :82.60 Max. :6679 Max. :165.00
# Baseline linear model: Strength regressed on the eight mix-component and
# age predictors, all taken from Concrete.df.
Model1 <- lm(
  formula = Strength ~ Cement + BlastFurnace + FlyAsh + Water +
    SuperPlasticizer + CoarseAggregate + FineAggregate + Age,
  data = Concrete.df
)
# Print the coefficient table and fit statistics for the baseline model.
summary(Model1)
##
## Call:
## lm(formula = Strength ~ Cement + BlastFurnace + FlyAsh + Water +
## SuperPlasticizer + CoarseAggregate + FineAggregate + Age,
## data = Concrete.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.654 -6.302 0.703 6.569 34.450
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -23.331214 26.585504 -0.878 0.380372
## Cement 0.119804 0.008489 14.113 < 2e-16 ***
## BlastFurnace 0.103866 0.010136 10.247 < 2e-16 ***
## FlyAsh 0.087934 0.012583 6.988 5.02e-12 ***
## Water -0.149918 0.040177 -3.731 0.000201 ***
## SuperPlasticizer 0.292225 0.093424 3.128 0.001810 **
## CoarseAggregate 0.018086 0.009392 1.926 0.054425 .
## FineAggregate 0.020190 0.010702 1.887 0.059491 .
## Age 0.114222 0.005427 21.046 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.4 on 1021 degrees of freedom
## Multiple R-squared: 0.6155, Adjusted R-squared: 0.6125
## F-statistic: 204.3 on 8 and 1021 DF, p-value: < 2.2e-16
Ideally, the residual plot will show no fitted pattern. That is, the red line should be approximately horizontal at zero. The presence of a pattern may indicate a problem with some aspect of the linear model.
In our example, there is no pattern in the residual plot. This suggests that we can assume a linear relationship between the predictors and the outcome variable.
The QQ plot of residuals can be used to visually check the normality assumption. The normal probability plot of residuals should approximately follow a straight line.
In our example, all the points fall approximately along this reference line, so we can assume normality.
library(MASS)
# Studentized residuals of Model1, kept in `sresid`.
sresid <- MASS::studres(Model1)
# Shapiro-Wilk normality test on the studentized residuals.
shapiro.test(x = sresid)
##
## Shapiro-Wilk normality test
##
## data: sresid
## W = 0.99514, p-value = 0.002233
The p-value is less than 0.05, which is significant, so we reject the null hypothesis of normality: the studentized residuals deviate from the normality assumption.
The homogeneity of variance assumption can be checked by examining the scale-location plot, also known as the spread-location plot.
This plot shows whether the residuals are spread equally along the ranges of the predictors. It is good if you see a horizontal line with equally spread points. In our example, this is the case, so we can assume homogeneity of variance.
## Loading required package: carData
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 119.2972, Df = 1, p = < 2.22e-16
p < .05, suggesting that our data is not homoscedastic.
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## studentized Breusch-Pagan test
##
## data: Model1
## BP = 137.2, df = 8, p-value < 2.2e-16
p < .05, suggesting that our data is not homoscedastic.
##
## Goldfeld-Quandt test
##
## data: Model1
## GQ = 0.48746, df1 = 506, df2 = 506, p-value = 1
## alternative hypothesis: variance increases from segment 1 to 2
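p > .05, so this test does not reject homoscedasticity. Note, however, that the alternative tested is one-sided (variance increases from segment 1 to 2); with GQ = 0.487 < 1 the variance appears, if anything, to decrease between the segments, which this one-sided alternative cannot detect. This result therefore does not contradict the two tests above.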
## lag Autocorrelation D-W Statistic p-value
## 1 0.3559936 1.281638 0
## Alternative hypothesis: rho != 0
p < 0.05, so the errors are autocorrelated. We have violated the independence assumption.
library(Hmisc)
# Keep only the eight continuous explanatory variables from the data frame.
expVar <- Concrete.df[
  c(
    "Cement",
    "BlastFurnace",
    "FlyAsh",
    "Water",
    "SuperPlasticizer",
    "CoarseAggregate",
    "FineAggregate",
    "Age"
  )
]
# Pairwise correlation matrix of the predictors; rcorr() is given a matrix,
# and its printed output lists the correlations, n, and the P values.
rcorr(x = as.matrix(expVar))
## Cement BlastFurnace FlyAsh Water SuperPlasticizer
## Cement 1.00 -0.28 -0.40 -0.08 0.09
## BlastFurnace -0.28 1.00 -0.32 0.11 0.04
## FlyAsh -0.40 -0.32 1.00 -0.26 0.38
## Water -0.08 0.11 -0.26 1.00 -0.66
## SuperPlasticizer 0.09 0.04 0.38 -0.66 1.00
## CoarseAggregate -0.11 -0.28 -0.01 -0.18 -0.27
## FineAggregate -0.22 -0.28 0.08 -0.45 0.22
## Age 0.08 -0.04 -0.15 0.28 -0.19
## CoarseAggregate FineAggregate Age
## Cement -0.11 -0.22 0.08
## BlastFurnace -0.28 -0.28 -0.04
## FlyAsh -0.01 0.08 -0.15
## Water -0.18 -0.45 0.28
## SuperPlasticizer -0.27 0.22 -0.19
## CoarseAggregate 1.00 -0.18 0.00
## FineAggregate -0.18 1.00 -0.16
## Age 0.00 -0.16 1.00
##
## n= 1030
##
##
## P
## Cement BlastFurnace FlyAsh Water SuperPlasticizer
## Cement 0.0000 0.0000 0.0088 0.0030
## BlastFurnace 0.0000 0.0000 0.0006 0.1652
## FlyAsh 0.0000 0.0000 0.0000 0.0000
## Water 0.0088 0.0006 0.0000 0.0000
## SuperPlasticizer 0.0030 0.1652 0.0000 0.0000
## CoarseAggregate 0.0004 0.0000 0.7495 0.0000 0.0000
## FineAggregate 0.0000 0.0000 0.0111 0.0000 0.0000
## Age 0.0085 0.1559 0.0000 0.0000 0.0000
## CoarseAggregate FineAggregate Age
## Cement 0.0004 0.0000 0.0085
## BlastFurnace 0.0000 0.0000 0.1559
## FlyAsh 0.7495 0.0111 0.0000
## Water 0.0000 0.0000 0.0000
## SuperPlasticizer 0.0000 0.0000 0.0000
## CoarseAggregate 0.0000 0.9230
## FineAggregate 0.0000 0.0000
## Age 0.9230 0.0000
## Cement BlastFurnace FlyAsh Water
## 7.488944 7.276963 6.170634 7.003957
## SuperPlasticizer CoarseAggregate FineAggregate Age
## 2.963776 5.074617 7.005081 1.118367
## Overall
## Cement 14.112863
## BlastFurnace 10.247419
## FlyAsh 6.988168
## Water 3.731446
## SuperPlasticizer 3.127937
## CoarseAggregate 1.925656
## FineAggregate 1.886652
## Age 21.046426
##
## Call:
## lm(formula = Strength ~ Cement + BlastFurnace + Age, data = Concrete.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.830 -9.095 -0.614 8.418 40.630
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.921151 1.340214 2.18 0.0295 *
## Cement 0.088941 0.003980 22.34 <2e-16 ***
## BlastFurnace 0.058296 0.004810 12.12 <2e-16 ***
## Age 0.078438 0.006337 12.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.79 on 1026 degrees of freedom
## Multiple R-squared: 0.4151, Adjusted R-squared: 0.4134
## F-statistic: 242.7 on 3 and 1026 DF, p-value: < 2.2e-16
# Extended linear model: the eight Model1 predictors plus the two additional
# columns new_input.1 and new_input.2, all taken from Concrete.df.
Model3 <- lm(
  formula = Strength ~ Cement + BlastFurnace + FlyAsh + Water +
    SuperPlasticizer + CoarseAggregate + FineAggregate + Age +
    new_input.1 + new_input.2,
  data = Concrete.df
)
# Print the coefficient table and fit statistics for the extended model.
summary(Model3)
##
## Call:
## lm(formula = Strength ~ Cement + BlastFurnace + FlyAsh + Water +
## SuperPlasticizer + CoarseAggregate + FineAggregate + Age +
## new_input.1 + new_input.2, data = Concrete.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.668 -6.309 0.693 6.609 34.527
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.131e+01 2.339e+02 -0.048 0.96146
## Cement 1.463e+00 2.296e+01 0.064 0.94922
## BlastFurnace 1.036e-01 1.033e-02 10.027 < 2e-16 ***
## FlyAsh 8.758e-02 1.287e-02 6.807 1.7e-11 ***
## Water -1.266e-01 1.998e-01 -0.634 0.52655
## SuperPlasticizer 2.940e-01 9.501e-02 3.095 0.00202 **
## CoarseAggregate 1.806e-02 9.445e-03 1.912 0.05612 .
## FineAggregate 1.983e-02 1.104e-02 1.796 0.07273 .
## Age 1.144e-01 5.629e-03 20.325 < 2e-16 ***
## new_input.1 -6.618e-04 5.508e-03 -0.120 0.90439
## new_input.2 -4.474e+00 7.653e+01 -0.058 0.95339
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.41 on 1019 degrees of freedom
## Multiple R-squared: 0.6155, Adjusted R-squared: 0.6118
## F-statistic: 163.1 on 10 and 1019 DF, p-value: < 2.2e-16
## Start: AIC=4836.89
## Strength ~ Cement + BlastFurnace + FlyAsh + Water + SuperPlasticizer +
## CoarseAggregate + FineAggregate + Age + new_input.1 + new_input.2
##
## Df Sum of Sq RSS AIC
## - new_input.2 1 0 110412 4834.9
## - Cement 1 0 110412 4834.9
## - new_input.1 1 2 110413 4834.9
## - Water 1 43 110455 4835.3
## <none> 110411 4836.9
## - FineAggregate 1 350 110761 4838.1
## - CoarseAggregate 1 396 110807 4838.6
## - SuperPlasticizer 1 1038 111449 4844.5
## - FlyAsh 1 5021 115432 4880.7
## - BlastFurnace 1 10895 121306 4931.8
## - Age 1 44759 155170 5185.4
##
## Step: AIC=4834.9
## Strength ~ Cement + BlastFurnace + FlyAsh + Water + SuperPlasticizer +
## CoarseAggregate + FineAggregate + Age + new_input.1
##
## Df Sum of Sq RSS AIC
## - new_input.1 1 2 110413 4832.9
## - Water 1 43 110455 4833.3
## <none> 110412 4834.9
## - FineAggregate 1 354 110766 4836.2
## - CoarseAggregate 1 402 110814 4836.6
## - SuperPlasticizer 1 1040 111452 4842.6
## - FlyAsh 1 5070 115481 4879.1
## - BlastFurnace 1 10910 121322 4930.0
## - Cement 1 17396 127807 4983.6
## - Age 1 44760 155171 5183.4
##
## Step: AIC=4832.91
## Strength ~ Cement + BlastFurnace + FlyAsh + Water + SuperPlasticizer +
## CoarseAggregate + FineAggregate + Age
##
## Df Sum of Sq RSS AIC
## <none> 110413 4832.9
## - FineAggregate 1 385 110798 4834.5
## - CoarseAggregate 1 401 110814 4834.6
## - SuperPlasticizer 1 1058 111471 4840.7
## - Water 1 1506 111919 4844.9
## - FlyAsh 1 5281 115694 4879.0
## - BlastFurnace 1 11356 121769 4931.7
## - Cement 1 21539 131952 5014.5
## - Age 1 47902 158315 5202.1
##
## Call:
## lm(formula = Strength ~ Cement + BlastFurnace + FlyAsh + Water +
## SuperPlasticizer + CoarseAggregate + FineAggregate + Age,
## data = Concrete.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.654 -6.302 0.703 6.569 34.450
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -23.331214 26.585504 -0.878 0.380372
## Cement 0.119804 0.008489 14.113 < 2e-16 ***
## BlastFurnace 0.103866 0.010136 10.247 < 2e-16 ***
## FlyAsh 0.087934 0.012583 6.988 5.02e-12 ***
## Water -0.149918 0.040177 -3.731 0.000201 ***
## SuperPlasticizer 0.292225 0.093424 3.128 0.001810 **
## CoarseAggregate 0.018086 0.009392 1.926 0.054425 .
## FineAggregate 0.020190 0.010702 1.887 0.059491 .
## Age 0.114222 0.005427 21.046 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.4 on 1021 degrees of freedom
## Multiple R-squared: 0.6155, Adjusted R-squared: 0.6125
## F-statistic: 204.3 on 8 and 1021 DF, p-value: < 2.2e-16
## Overall
## Cement 14.112863
## BlastFurnace 10.247419
## FlyAsh 6.988168
## Water 3.731446
## SuperPlasticizer 3.127937
## CoarseAggregate 1.925656
## FineAggregate 1.886652
## Age 21.046426
##
## Call:
## lm(formula = Strength ~ Cement + BlastFurnace + Age, data = Concrete.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.830 -9.095 -0.614 8.418 40.630
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.921151 1.340214 2.18 0.0295 *
## Cement 0.088941 0.003980 22.34 <2e-16 ***
## BlastFurnace 0.058296 0.004810 12.12 <2e-16 ***
## Age 0.078438 0.006337 12.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.79 on 1026 degrees of freedom
## Multiple R-squared: 0.4151, Adjusted R-squared: 0.4134
## F-statistic: 242.7 on 3 and 1026 DF, p-value: < 2.2e-16