#import data
library(readr)
data <- read_csv("D:/UMP/Sem 5/Statistical Modelling/Lab Report/Lab Report 3.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Column1 = col_double(),
##   x1 = col_double(),
##   x2 = col_double(),
##   x3 = col_double(),
##   x4 = col_double(),
##   y = col_double()
## )
data
## # A tibble: 62 x 6
##    Column1    x1    x2    x3    x4     y
##      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1       1  2.14    10  0.34 1      28.9
##  2       2  4.14    10  0.34 1      31  
##  3       3  8.15    10  0.34 1      26.4
##  4       4  2.14    10  0.34 0.246  27.2
##  5       5  4.14    10  0.34 0.379  26.1
##  6       6  8.15    10  0.34 0.474  23.2
##  7       7  2.14    10  0.34 0.141  19.7
##  8       8  4.14    10  0.34 0.234  22.1
##  9       9  8.15    10  0.34 0.311  22.8
## 10      10  2.14    10  0.34 0.076  29.2
## # ... with 52 more rows
k2_model <- lm(formula = y ~ x1 + x2 +x3+ x4, data = data)
summary(k2_model)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.9958 -3.3092 -0.2419  3.3924 10.5668 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.89453    4.32508   1.363  0.17828    
## x1          -0.47790    0.34002  -1.406  0.16530    
## x2           0.18271    0.01718  10.633 3.78e-15 ***
## x3          35.40284   11.09960   3.190  0.00232 ** 
## x4           5.84391    2.90978   2.008  0.04935 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.014 on 57 degrees of freedom
## Multiple R-squared:  0.6914, Adjusted R-squared:  0.6697 
## F-statistic: 31.92 on 4 and 57 DF,  p-value: 5.818e-14

Model Test

residuals <- resid(k2_model)
residuals
##          1          2          3          4          5          6          7 
##  4.3201673  7.3759699  4.6923543  7.0264782  6.1050403  4.5662529  0.1400891 
##          8          9         10         11         12         13         14 
##  2.9524078  5.1188108 10.0199435  5.0484870  6.6609879  2.8426639  0.9679773 
##         15         16         17         18         19         20         21 
##  1.1817851 -2.7930904 -4.6606260 -3.3981066 -5.4113129 -2.2905364 -0.1267099 
##         22         23         24         25         26         27         28 
## 10.5667676  9.4225703  7.7389546 -2.3392739 -3.2905995 -3.8880229 -3.7211633 
##         29         30         31         32         33         34         35 
## -1.6282993 -4.1426010 -3.1192626 -3.3153791 -3.2829295  0.6747558 -9.4707259 
##         36         37         38         39         40         41         42 
## -2.3623054 -4.9707632  0.1885003  0.2430186  1.0514391 -0.3570187 -0.4522636 
##         43         44         45         46         47         48         49 
## -4.0276183 -0.7841343 -1.7925921 -0.1239430 -6.6941073 -5.0564672 -7.2999885 
##         50         51         52         53         54         55         56 
##  1.6539042 -0.9915775 -2.7831570 -4.7916148  0.3534173 -7.4095815 -9.9957737 
##         57         58         59         60         61         62 
##  1.4235069 -3.2440288  0.9184907  3.5390410  9.2260609  3.9957312
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.0.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
ggqqplot(residuals)

Based on the normal probability plot of the residuals, the plot is not normal. This is because the #### data is scattered far from the linear line makes it the outliers of the plots.

plot(k2_model)

shapiro.test(residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals
## W = 0.97616, p-value = 0.2689

H0: The residuals are normally distributed

H1: The residuals are not normally distributed

P-Value = 0.2689

Since P-Value = 0.2689, alpha= 0.05, do not reject H0.

at alpha = 0.05, the residuals are normally distributed

#selecting the best transformation
library(MASS)
library(faraway)
## Warning: package 'faraway' was built under R version 4.0.5
k2_model<-lm(y ~ x1 + x2 +x3+ x4, data = data)
summary(k2_model)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.9958 -3.3092 -0.2419  3.3924 10.5668 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.89453    4.32508   1.363  0.17828    
## x1          -0.47790    0.34002  -1.406  0.16530    
## x2           0.18271    0.01718  10.633 3.78e-15 ***
## x3          35.40284   11.09960   3.190  0.00232 ** 
## x4           5.84391    2.90978   2.008  0.04935 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.014 on 57 degrees of freedom
## Multiple R-squared:  0.6914, Adjusted R-squared:  0.6697 
## F-statistic: 31.92 on 4 and 57 DF,  p-value: 5.818e-14

new anova test model

plot(fitted(k2_model), resid(k2_model), col = "dodgerblue",pch = 20, cex = 1.5, xlab = "Fitted", ylab = "Residuals")
abline(h = 0, lty = 2, col = "darkorange", lwd = 2)

The plot is approximately normal because the data is scattered balance above and under the line

library(lmtest)
## Warning: package 'lmtest' was built under R version 4.0.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.5
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(k2_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  k2_model
## BP = 2.8042, df = 4, p-value = 0.5911

studentized Breusch-Pagan test

P-Value = 0.5911

H0= The error variance are all equal(homoscedasticity)

H1= The error variance are not equal(heteroscedasticity)

Since P-Value > alpha=0.05, we do not reject H0

at alpha = 0.05, the error variance are not equal(homoscedasticity)

shapiro.test(resid(k2_model))
## 
##  Shapiro-Wilk normality test
## 
## data:  resid(k2_model)
## W = 0.97616, p-value = 0.2689

H0= The residuals are normal distributed

H1= The residuals are not normally distributed

P-Value = 0.2689

Since P-Value = 0.2689, alpha= 0.05, do not reject H0.

at alpha = 0.05, the residuals are normally distributed

boxcox(k2_model, lambda = seq(-0.20, 1.5, by = 0.05), plotit = TRUE)

k2_model_cox1<-lm((((y ^ 0.3) - 1) / 0.3) ~ x1 + x2 + x3 + x4, data = data)
summary(k2_model_cox1)
## 
## Call:
## lm(formula = (((y^0.3) - 1)/0.3) ~ x1 + x2 + x3 + x4, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.53200 -0.41523  0.01346  0.38533  1.16778 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.027431   0.499284   6.064 1.13e-07 ***
## x1          -0.044401   0.039252  -1.131  0.26271    
## x2           0.017664   0.001984   8.905 2.21e-12 ***
## x3           4.449880   1.281331   3.473  0.00099 ***
## x4           0.651315   0.335903   1.939  0.05746 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5788 on 57 degrees of freedom
## Multiple R-squared:  0.6248, Adjusted R-squared:  0.5985 
## F-statistic: 23.73 on 4 and 57 DF,  p-value: 1.381e-11

New model test by using box-cox method

plot(fitted(k2_model_cox1), resid(k2_model_cox1), col = "dodgerblue",
     pch = 20, cex = 1.5, xlab = "Fitted", ylab = "Residuals")
abline(h = 0, lty = 2, col = "darkorange", lwd = 2)

The plot is approximately normal because the data is balanced above and under the line.

library(lmtest)

bptest(k2_model_cox1)
## 
##  studentized Breusch-Pagan test
## 
## data:  k2_model_cox1
## BP = 3.9224, df = 4, p-value = 0.4166

studentized Breusch-Pagan test

P-Value = 0.4166

H0= The error variance are all equal(homoscedasticity)

H1= The error variance are not equal(heteroscedasticity)

Since P-Value > alpha=0.05, we do not reject H0

at alpha = 0.05, the error variance are all equal(homoscedasticity)

shapiro.test(resid(k2_model_cox1))
## 
##  Shapiro-Wilk normality test
## 
## data:  resid(k2_model_cox1)
## W = 0.98251, p-value = 0.5209

H0= The residuals are normal distributed

H1= The residuals are not normally distributed

P-Value = 0.5209

Since P-Value = 0.5209, alpha= 0.05, do not reject H0.

at alpha = 0.05, the residuals are normally distributed

k2_fit_log = lm(log(y) ~ x1 + x2 +x3+ x4, data = data)
summary(k2_fit_log)
## 
## Call:
## lm(formula = log(y) ~ x1 + x2 + x3 + x4, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.71107 -0.16435  0.00625  0.15922  0.46617 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.2284886  0.2033430  10.959 1.18e-15 ***
## x1          -0.0161750  0.0159860  -1.012 0.315897    
## x2           0.0065702  0.0008079   8.133 4.14e-11 ***
## x3           1.8501829  0.5218463   3.545 0.000791 ***
## x4           0.2547779  0.1368029   1.862 0.067708 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2357 on 57 degrees of freedom
## Multiple R-squared:  0.5894, Adjusted R-squared:  0.5606 
## F-statistic: 20.45 on 4 and 57 DF,  p-value: 1.712e-10

New model using log scale

residuals <- resid(k2_fit_log)
residuals
##            1            2            3            4            5            6 
##  0.220425369  0.322920975  0.227159523  0.351903248  0.309086131  0.231960942 
##            7            8            9           10           11           12 
##  0.056056585  0.179671214  0.256097990  0.466167219  0.271327659  0.322940953 
##           13           14           15           16           17           18 
##  0.173142536  0.082870277  0.069992674 -0.103272849 -0.225415943 -0.187071316 
##           19           20           21           22           23           24 
## -0.289311742 -0.097705068 -0.006549519  0.425467015  0.393672890  0.337987528 
##           25           26           27           28           29           30 
## -0.095323234 -0.159594449 -0.206676426 -0.208280282 -0.046163437 -0.194025192 
##           31           32           33           34           35           36 
## -0.147616426 -0.165523145 -0.172312014  0.087077118 -0.711066336 -0.109082167 
##           37           38           39           40           41           42 
## -0.315155602  0.052137010  0.052892566  0.089044481  0.019103220  0.029545528 
##           43           44           45           46           47           48 
## -0.160848832  0.007871281 -0.047629296  0.053799334 -0.317752325 -0.222150317 
##           49           50           51           52           53           54 
## -0.386401978  0.004626644 -0.080166922 -0.145343259 -0.228356601 -0.001547481 
##           55           56           57           58           59           60 
## -0.213538452 -0.296938396  0.043140854 -0.084043421  0.025711344  0.117437228 
##           61           62 
##  0.228256264  0.115368826
library(ggpubr)
ggqqplot(residuals)

plot(k2_fit_log)

shapiro.test(residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals
## W = 0.97819, p-value = 0.3357

The plot is approximately normal because the data is scattered near the line

In conclusion, the log model is the best to test this dataset because the data that scattered in the normal probability plot is mostly near the line and grouped in close distance.