#import data
library(readr)
data <- read_csv("D:/UMP/Sem 5/Statistical Modelling/Lab Report/Lab Report 3.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Column1 = col_double(),
## x1 = col_double(),
## x2 = col_double(),
## x3 = col_double(),
## x4 = col_double(),
## y = col_double()
## )
data
## # A tibble: 62 x 6
## Column1 x1 x2 x3 x4 y
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2.14 10 0.34 1 28.9
## 2 2 4.14 10 0.34 1 31
## 3 3 8.15 10 0.34 1 26.4
## 4 4 2.14 10 0.34 0.246 27.2
## 5 5 4.14 10 0.34 0.379 26.1
## 6 6 8.15 10 0.34 0.474 23.2
## 7 7 2.14 10 0.34 0.141 19.7
## 8 8 4.14 10 0.34 0.234 22.1
## 9 9 8.15 10 0.34 0.311 22.8
## 10 10 2.14 10 0.34 0.076 29.2
## # ... with 52 more rows
k2_model <- lm(formula = y ~ x1 + x2 +x3+ x4, data = data)
summary(k2_model)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9958 -3.3092 -0.2419 3.3924 10.5668
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.89453 4.32508 1.363 0.17828
## x1 -0.47790 0.34002 -1.406 0.16530
## x2 0.18271 0.01718 10.633 3.78e-15 ***
## x3 35.40284 11.09960 3.190 0.00232 **
## x4 5.84391 2.90978 2.008 0.04935 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.014 on 57 degrees of freedom
## Multiple R-squared: 0.6914, Adjusted R-squared: 0.6697
## F-statistic: 31.92 on 4 and 57 DF, p-value: 5.818e-14
Model Test
residuals <- resid(k2_model)
residuals
## 1 2 3 4 5 6 7
## 4.3201673 7.3759699 4.6923543 7.0264782 6.1050403 4.5662529 0.1400891
## 8 9 10 11 12 13 14
## 2.9524078 5.1188108 10.0199435 5.0484870 6.6609879 2.8426639 0.9679773
## 15 16 17 18 19 20 21
## 1.1817851 -2.7930904 -4.6606260 -3.3981066 -5.4113129 -2.2905364 -0.1267099
## 22 23 24 25 26 27 28
## 10.5667676 9.4225703 7.7389546 -2.3392739 -3.2905995 -3.8880229 -3.7211633
## 29 30 31 32 33 34 35
## -1.6282993 -4.1426010 -3.1192626 -3.3153791 -3.2829295 0.6747558 -9.4707259
## 36 37 38 39 40 41 42
## -2.3623054 -4.9707632 0.1885003 0.2430186 1.0514391 -0.3570187 -0.4522636
## 43 44 45 46 47 48 49
## -4.0276183 -0.7841343 -1.7925921 -0.1239430 -6.6941073 -5.0564672 -7.2999885
## 50 51 52 53 54 55 56
## 1.6539042 -0.9915775 -2.7831570 -4.7916148 0.3534173 -7.4095815 -9.9957737
## 57 58 59 60 61 62
## 1.4235069 -3.2440288 0.9184907 3.5390410 9.2260609 3.9957312
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.0.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
ggqqplot(residuals)

H0: The residuals are normally distributed
H1: The residuals are not normally distributed
P-Value = 0.2689
Since P-Value = 0.2689, alpha= 0.05, do not reject H0.
at alpha = 0.05, the residuals are normally distributed
#selecting the best transformation
library(MASS)
library(faraway)
## Warning: package 'faraway' was built under R version 4.0.5
k2_model<-lm(y ~ x1 + x2 +x3+ x4, data = data)
summary(k2_model)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9958 -3.3092 -0.2419 3.3924 10.5668
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.89453 4.32508 1.363 0.17828
## x1 -0.47790 0.34002 -1.406 0.16530
## x2 0.18271 0.01718 10.633 3.78e-15 ***
## x3 35.40284 11.09960 3.190 0.00232 **
## x4 5.84391 2.90978 2.008 0.04935 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.014 on 57 degrees of freedom
## Multiple R-squared: 0.6914, Adjusted R-squared: 0.6697
## F-statistic: 31.92 on 4 and 57 DF, p-value: 5.818e-14
new anova test model
plot(fitted(k2_model), resid(k2_model), col = "dodgerblue",pch = 20, cex = 1.5, xlab = "Fitted", ylab = "Residuals")
abline(h = 0, lty = 2, col = "darkorange", lwd = 2)

The plot is approximately normal because the data is scattered balance above and under the line
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.0.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.5
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(k2_model)
##
## studentized Breusch-Pagan test
##
## data: k2_model
## BP = 2.8042, df = 4, p-value = 0.5911
studentized Breusch-Pagan test
P-Value = 0.5911
H0= The error variance are all equal(homoscedasticity)
H1= The error variance are not equal(heteroscedasticity)
Since P-Value > alpha=0.05, we do not reject H0
at alpha = 0.05, the error variance are not equal(homoscedasticity)
shapiro.test(resid(k2_model))
##
## Shapiro-Wilk normality test
##
## data: resid(k2_model)
## W = 0.97616, p-value = 0.2689
H0= The residuals are normal distributed
H1= The residuals are not normally distributed
P-Value = 0.2689
Since P-Value = 0.2689, alpha= 0.05, do not reject H0.
at alpha = 0.05, the residuals are normally distributed
boxcox(k2_model, lambda = seq(-0.20, 1.5, by = 0.05), plotit = TRUE)

k2_model_cox1<-lm((((y ^ 0.3) - 1) / 0.3) ~ x1 + x2 + x3 + x4, data = data)
summary(k2_model_cox1)
##
## Call:
## lm(formula = (((y^0.3) - 1)/0.3) ~ x1 + x2 + x3 + x4, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.53200 -0.41523 0.01346 0.38533 1.16778
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.027431 0.499284 6.064 1.13e-07 ***
## x1 -0.044401 0.039252 -1.131 0.26271
## x2 0.017664 0.001984 8.905 2.21e-12 ***
## x3 4.449880 1.281331 3.473 0.00099 ***
## x4 0.651315 0.335903 1.939 0.05746 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5788 on 57 degrees of freedom
## Multiple R-squared: 0.6248, Adjusted R-squared: 0.5985
## F-statistic: 23.73 on 4 and 57 DF, p-value: 1.381e-11
New model test by using box-cox method
plot(fitted(k2_model_cox1), resid(k2_model_cox1), col = "dodgerblue",
pch = 20, cex = 1.5, xlab = "Fitted", ylab = "Residuals")
abline(h = 0, lty = 2, col = "darkorange", lwd = 2)

The plot is approximately normal because the data is balanced above and under the line.
library(lmtest)
bptest(k2_model_cox1)
##
## studentized Breusch-Pagan test
##
## data: k2_model_cox1
## BP = 3.9224, df = 4, p-value = 0.4166
studentized Breusch-Pagan test
P-Value = 0.4166
H0= The error variance are all equal(homoscedasticity)
H1= The error variance are not equal(heteroscedasticity)
Since P-Value > alpha=0.05, we do not reject H0
at alpha = 0.05, the error variance are all equal(homoscedasticity)
shapiro.test(resid(k2_model_cox1))
##
## Shapiro-Wilk normality test
##
## data: resid(k2_model_cox1)
## W = 0.98251, p-value = 0.5209
H0= The residuals are normal distributed
H1= The residuals are not normally distributed
P-Value = 0.5209
Since P-Value = 0.5209, alpha= 0.05, do not reject H0.
at alpha = 0.05, the residuals are normally distributed
k2_fit_log = lm(log(y) ~ x1 + x2 +x3+ x4, data = data)
summary(k2_fit_log)
##
## Call:
## lm(formula = log(y) ~ x1 + x2 + x3 + x4, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.71107 -0.16435 0.00625 0.15922 0.46617
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.2284886 0.2033430 10.959 1.18e-15 ***
## x1 -0.0161750 0.0159860 -1.012 0.315897
## x2 0.0065702 0.0008079 8.133 4.14e-11 ***
## x3 1.8501829 0.5218463 3.545 0.000791 ***
## x4 0.2547779 0.1368029 1.862 0.067708 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2357 on 57 degrees of freedom
## Multiple R-squared: 0.5894, Adjusted R-squared: 0.5606
## F-statistic: 20.45 on 4 and 57 DF, p-value: 1.712e-10
The plot is approximately normal because the data is scattered near the line
In conclusion, the log model is the best to test this dataset because the data that scattered in the normal probability plot is mostly near the line and grouped in close distance.