This demo illustrates variable selection for a multiple linear regression model used to assess satisfaction with land relocation in Thai Nguyen, Vietnam.
# Read the semicolon-separated H_Van_Thu table straight from GitHub
HVT <- read.csv(
  "https://raw.githubusercontent.com/tuyenhavan/Statistics/Son-Data/H_Van_Thu.csv",
  sep = ";"
)
# Show the first rows as a quick sanity check
head(HVT)
Data manipulation
Group A
#- Group A
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.3
## -- Attaching packages -------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.3.4 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## Warning: package 'tidyr' was built under R version 3.4.3
## Warning: package 'purrr' was built under R version 3.4.3
## Warning: package 'dplyr' was built under R version 3.4.3
## -- Conflicts ----------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(relaimpo)
## Warning: package 'relaimpo' was built under R version 3.4.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: boot
## Loading required package: survey
## Warning: package 'survey' was built under R version 3.4.3
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
##
## aml
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: mitools
## Warning: package 'mitools' was built under R version 3.4.3
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
# Keep the Group A items (A1 to A5) together with the response Y.TB
GroupA <- dplyr::select(HVT, A1:A5, Y.TB)
head(GroupA)
# Regress Y.TB on every Group A item
model_A <- lm(Y.TB ~ ., data = GroupA)
# Backward elimination; the printed AIC trace shows which items are kept
step(model_A, direction = "backward")
## Start: AIC=-123.79
## Y.TB ~ A1 + A2 + A3 + A4 + A5
##
## Df Sum of Sq RSS AIC
## - A4 1 0.00052 3.3082 -125.78
## - A1 1 0.05524 3.3629 -124.96
## <none> 3.3077 -123.79
## - A3 1 0.29425 3.6019 -121.53
## - A5 1 0.38111 3.6888 -120.34
## - A2 1 0.52027 3.8280 -118.48
##
## Step: AIC=-125.78
## Y.TB ~ A1 + A2 + A3 + A5
##
## Df Sum of Sq RSS AIC
## - A1 1 0.05553 3.3637 -126.95
## <none> 3.3082 -125.78
## - A5 1 0.38079 3.6890 -122.33
## - A2 1 0.52422 3.8324 -120.43
## - A3 1 1.10445 4.4127 -113.38
##
## Step: AIC=-126.95
## Y.TB ~ A2 + A3 + A5
##
## Df Sum of Sq RSS AIC
## <none> 3.3637 -126.95
## - A5 1 0.39427 3.7580 -123.41
## - A3 1 1.05389 4.4176 -115.32
## - A2 1 1.22483 4.5886 -113.42
##
## Call:
## lm(formula = Y.TB ~ A2 + A3 + A5, data = GroupA)
##
## Coefficients:
## (Intercept) A2 A3 A5
## 3.8988 0.4353 -0.5535 0.1639
# Items retained by the backward search for Group A: A2, A3 and A5
Group1 <- dplyr::select(GroupA, A2, A3, A5)
# Row-wise average of the retained items
Mean_A <- rowMeans(Group1)

# Group B: items B6 to B10 plus the response Y.TB
GroupB <- dplyr::select(HVT, B6:B10, Y.TB)
head(GroupB)
# Full Group B regression, then backward elimination
model_B <- lm(Y.TB ~ ., data = GroupB)
step(model_B, direction = "backward")
## Start: AIC=-120.26
## Y.TB ~ B6 + B7 + B8 + B9 + B10
##
## Df Sum of Sq RSS AIC
## - B8 1 0.08716 3.6370 -121.04
## <none> 3.5499 -120.25
## - B9 1 0.17297 3.7228 -119.88
## - B7 1 0.18977 3.7396 -119.65
## - B6 1 0.29022 3.8401 -118.33
## - B10 1 0.49536 4.0452 -115.72
##
## Step: AIC=-121.04
## Y.TB ~ B6 + B7 + B9 + B10
##
## Df Sum of Sq RSS AIC
## <none> 3.6370 -121.04
## - B9 1 0.15269 3.7897 -120.99
## - B7 1 0.19965 3.8367 -120.37
## - B6 1 0.37227 4.0093 -118.17
## - B10 1 0.60160 4.2386 -115.39
##
## Call:
## lm(formula = Y.TB ~ B6 + B7 + B9 + B10, data = GroupB)
##
## Coefficients:
## (Intercept) B6 B7 B9 B10
## 2.2980 0.1779 0.1442 -0.1706 0.2134
# Items retained by the backward search for Group B: B6, B7, B9 and B10
Group2 <- dplyr::select(GroupB, B6, B7, B9, B10)
# Row-wise average of the retained items
Mean_B <- rowMeans(Group2)

# Group C: items C11 to C14 plus the response Y.TB
GroupC <- dplyr::select(HVT, C11:C14, Y.TB)
head(GroupC)
# Full Group C regression, then backward elimination
model_C <- lm(Y.TB ~ ., data = GroupC)
step(model_C, direction = "backward")
## Start: AIC=-111.51
## Y.TB ~ C11 + C12 + C13 + C14
##
## Df Sum of Sq RSS AIC
## - C12 1 0.01806 4.4194 -113.30
## - C14 1 0.11325 4.5146 -112.24
## <none> 4.4013 -111.51
## - C11 1 0.18619 4.5875 -111.43
## - C13 1 0.64651 5.0478 -106.65
##
## Step: AIC=-113.3
## Y.TB ~ C11 + C13 + C14
##
## Df Sum of Sq RSS AIC
## - C14 1 0.11124 4.5306 -114.06
## - C11 1 0.17037 4.5897 -113.41
## <none> 4.4194 -113.30
## - C13 1 0.66427 5.0836 -108.30
##
## Step: AIC=-114.06
## Y.TB ~ C11 + C13
##
## Df Sum of Sq RSS AIC
## - C11 1 0.12628 4.6569 -114.68
## <none> 4.5306 -114.06
## - C13 1 0.56680 5.0974 -110.16
##
## Step: AIC=-114.68
## Y.TB ~ C13
##
## Df Sum of Sq RSS AIC
## <none> 4.6569 -114.68
## - C13 1 1.0481 5.7050 -106.53
##
## Call:
## lm(formula = Y.TB ~ C13, data = GroupC)
##
## Coefficients:
## (Intercept) C13
## 2.2818 0.3147
# Only C13 is retained for Group C, so the row mean is C13 itself
Group3 <- dplyr::select(GroupC, C13)
Mean_C <- rowMeans(Group3)

# Group D: items D15 and D16 plus the response Y.TB
GroupD <- dplyr::select(HVT, D15:D16, Y.TB)
head(GroupD)
# Full Group D regression, then backward elimination
model_D <- lm(Y.TB ~ ., data = GroupD)
step(model_D, direction = "backward")
## Start: AIC=-112.34
## Y.TB ~ D15 + D16
##
## Df Sum of Sq RSS AIC
## <none> 4.6895 -112.33
## - D16 1 0.24936 4.9388 -111.75
## - D15 1 0.92811 5.6176 -105.31
##
## Call:
## lm(formula = Y.TB ~ D15 + D16, data = GroupD)
##
## Coefficients:
## (Intercept) D15 D16
## 2.9906 0.3507 -0.1428
# Both Group D items (D15, D16) are retained
Group4 <- dplyr::select(GroupD, D15:D16)
# Row-wise average of the retained items
Mean_D <- rowMeans(Group4)

# Group E: items E17 to E20 plus the response Y.TB
GroupE <- dplyr::select(HVT, E17:E20, Y.TB)
head(GroupE)
# Full Group E regression, then backward elimination
model_E <- lm(Y.TB ~ ., data = GroupE)
step(model_E, direction = "backward")
## Start: AIC=-117.59
## Y.TB ~ E17 + E18 + E19 + E20
##
## Df Sum of Sq RSS AIC
## - E17 1 0.02011 3.9168 -119.34
## <none> 3.8967 -117.59
## - E20 1 0.21918 4.1159 -116.86
## - E18 1 0.53801 4.4347 -113.13
## - E19 1 0.56462 4.4613 -112.83
##
## Step: AIC=-119.34
## Y.TB ~ E18 + E19 + E20
##
## Df Sum of Sq RSS AIC
## <none> 3.9168 -119.34
## - E20 1 0.31859 4.2354 -117.43
## - E19 1 0.56160 4.4784 -114.64
## - E18 1 0.64746 4.5643 -113.69
##
## Call:
## lm(formula = Y.TB ~ E18 + E19 + E20, data = GroupE)
##
## Coefficients:
## (Intercept) E18 E19 E20
## 1.1783 0.2328 0.1779 0.1334
# Items retained by the backward search for Group E: E18 to E20
Group5 <- dplyr::select(GroupE, E18:E20)
# Row-wise average of the retained items
Mean_E <- rowMeans(Group5)

# Group F: items F21 to F23 plus the response Y.TB
GroupF <- dplyr::select(HVT, F21:F23, Y.TB)
head(GroupF)
# Full Group F regression, then backward elimination
model_F <- lm(Y.TB ~ ., data = GroupF)
step(model_F, direction = "backward")
## Start: AIC=-129.83
## Y.TB ~ F21 + F22 + F23
##
## Df Sum of Sq RSS AIC
## - F22 1 0.00237 3.1778 -131.79
## - F21 1 0.03731 3.2128 -131.24
## <none> 3.1755 -129.83
## - F23 1 1.30594 4.4814 -114.60
##
## Step: AIC=-131.79
## Y.TB ~ F21 + F23
##
## Df Sum of Sq RSS AIC
## - F21 1 0.03577 3.2136 -133.23
## <none> 3.1778 -131.79
## - F23 1 1.33063 4.5085 -116.30
##
## Step: AIC=-133.23
## Y.TB ~ F23
##
## Df Sum of Sq RSS AIC
## <none> 3.2136 -133.23
## - F23 1 2.4914 5.7050 -106.53
##
## Call:
## lm(formula = Y.TB ~ F23, data = GroupF)
##
## Coefficients:
## (Intercept) F23
## 2.8931 0.2561
# Only F23 is retained for Group F, so the row mean is F23 itself
Group6 <- dplyr::select(GroupF, F23)
Mean_F <- rowMeans(Group6)

# Group G: items G24 and G25 plus the response Y.TB
GroupG <- dplyr::select(HVT, G24:G25, Y.TB)
head(GroupG)
# Full Group G regression, then backward elimination
model_G <- lm(Y.TB ~ ., data = GroupG)
step(model_G, direction = "backward")
## Start: AIC=-154.08
## Y.TB ~ G24 + G25
##
## Df Sum of Sq RSS AIC
## <none> 2.0348 -154.08
## - G25 1 0.43711 2.4719 -146.35
## - G24 1 1.01596 3.0508 -135.83
##
## Call:
## lm(formula = Y.TB ~ G24 + G25, data = GroupG)
##
## Coefficients:
## (Intercept) G24 G25
## 2.4998 0.2155 0.1390
# Both Group G items (G24, G25) are retained
Group7 <- dplyr::select(GroupG, G24:G25)
# Row-wise average of the retained items
Mean_G <- rowMeans(Group7)

# Group H: items H26 and H27 plus the response Y.TB
GroupH <- dplyr::select(HVT, H26:H27, Y.TB)
head(GroupH)
# Full Group H regression, then backward elimination
model_H <- lm(Y.TB ~ ., data = GroupH)
step(model_H, direction = "backward")
## Start: AIC=-150.96
## Y.TB ~ H26 + H27
##
## Df Sum of Sq RSS AIC
## <none> 2.1657 -150.96
## - H26 1 0.15469 2.3204 -149.51
## - H27 1 0.15469 2.3204 -149.51
##
## Call:
## lm(formula = Y.TB ~ H26 + H27, data = GroupH)
##
## Coefficients:
## (Intercept) H26 H27
## 2.6769 0.2011 0.2011
# Both Group H items (H26, H27) are retained
Group8 <- dplyr::select(GroupH, H26:H27)
# Row-wise average of the retained items
Mean_H <- rowMeans(Group8)

# Combine the eight group means with the response into one table
HVT_Dataset <- data.frame(
  Mean_A, Mean_B, Mean_C, Mean_D, Mean_E, Mean_F, Mean_G, Mean_H,
  Y_mean = HVT$Y.TB
)
head(HVT_Dataset)

# Final model: discard rows containing NA, then regress on all group means
HVT_Data <- na.omit(HVT_Dataset)
model_final <- lm(Y_mean ~ ., data = HVT_Data)
# Backward elimination over the group means; in the recorded trace Mean_B is
# the only term removed
step(model_final, direction = "backward")
## Start: AIC=-211.15
## Y_mean ~ Mean_A + Mean_B + Mean_C + Mean_D + Mean_E + Mean_F +
## Mean_G + Mean_H
##
## Df Sum of Sq RSS AIC
## - Mean_B 1 0.001043 0.51226 -213.05
## <none> 0.51122 -211.15
## - Mean_C 1 0.030175 0.54139 -210.28
## - Mean_D 1 0.031316 0.54254 -210.18
## - Mean_H 1 0.113259 0.62448 -203.14
## - Mean_F 1 0.125825 0.63704 -202.15
## - Mean_E 1 0.158358 0.66958 -199.66
## - Mean_A 1 0.246919 0.75814 -193.45
## - Mean_G 1 0.273650 0.78487 -191.71
##
## Step: AIC=-213.05
## Y_mean ~ Mean_A + Mean_C + Mean_D + Mean_E + Mean_F + Mean_G +
## Mean_H
##
## Df Sum of Sq RSS AIC
## <none> 0.51226 -213.05
## - Mean_C 1 0.031745 0.54401 -212.04
## - Mean_D 1 0.036664 0.54893 -211.59
## - Mean_H 1 0.113619 0.62588 -205.03
## - Mean_F 1 0.136812 0.64907 -203.21
## - Mean_E 1 0.192161 0.70442 -199.12
## - Mean_G 1 0.272937 0.78520 -193.69
## - Mean_A 1 0.307920 0.82018 -191.51
##
## Call:
## lm(formula = Y_mean ~ Mean_A + Mean_C + Mean_D + Mean_E + Mean_F +
## Mean_G + Mean_H, data = HVT_Data)
##
## Coefficients:
## (Intercept) Mean_A Mean_C Mean_D Mean_E
## 0.44652 0.18170 0.06527 0.05127 0.20593
## Mean_F Mean_G Mean_H
## 0.09547 0.15212 0.12180
# Drop Mean_B, the term removed by the backward search in the recorded trace.
# The column is excluded by name rather than by position so the right
# variable is removed even if the column order of HVT_Data changes.
df <- HVT_Data[, setdiff(names(HVT_Data), "Mean_B")]
# Refit the regression on the remaining group means
model_HVT1 <- lm(Y_mean ~ ., data = df)
# LMG relative importance, normalised to sum to 100% (rela = TRUE).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
impo_HVT1 <- calc.relimp(model_HVT1, type = c("lmg"), rela = TRUE)
impo_HVT1
## Response variable: Y_mean
## Total response variance: 0.1164286
## Analysis based on 50 observations
##
## 7 Regressors:
## Mean_A Mean_C Mean_D Mean_E Mean_F Mean_G Mean_H
## Proportion of variance explained by model: 91.02%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## Mean_A 0.10304471
## Mean_C 0.06330955
## Mean_D 0.02316092
## Mean_E 0.12425101
## Mean_F 0.17407391
## Mean_G 0.26549252
## Mean_H 0.24666737
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs 6Xs
## Mean_A 0.2580618 0.22533976 0.2079465 0.19754626 0.1902697 0.18476229
## Mean_C 0.3147448 0.22766191 0.1716722 0.13564648 0.1099771 0.08811272
## Mean_D 0.1394444 0.09293346 0.0684665 0.05626892 0.0507789 0.04928148
## Mean_E 0.5349572 0.42245420 0.3442569 0.29003945 0.2519314 0.22533542
## Mean_F 0.2560526 0.19551317 0.1555685 0.12877098 0.1109824 0.10013574
## Mean_G 0.3537931 0.30364147 0.2596262 0.22188373 0.1907776 0.16717156
## Mean_H 0.4021938 0.34156383 0.2885113 0.24192549 0.1998767 0.16043779
## 7Xs
## Mean_A 0.18170435
## Mean_C 0.06527345
## Mean_D 0.05126938
## Mean_E 0.20592570
## Mean_F 0.09547314
## Mean_G 0.15212149
## Mean_H 0.12179713
# Load the HVT_Final table from GitHub and repeat the importance analysis.
# NOTE(review): how HVT_Final.csv was derived from the data above is not
# shown in this file — confirm before comparing it with impo_HVT1.
df_HVT <- read.csv(
  "https://raw.githubusercontent.com/tuyenhavan/Statistics/Son-Data/HVT_Final.csv",
  header = TRUE
)
# Regress Y_mean on every other column of the loaded table
model_HVT2 <- lm(Y_mean ~ ., data = df_HVT)
# LMG relative importance, normalised to sum to 100% (rela = TRUE).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
impo_HVT2 <- calc.relimp(model_HVT2, type = c("lmg"), rela = TRUE)
impo_HVT2
## Response variable: Y_mean
## Total response variance: 0.1164286
## Analysis based on 50 observations
##
## 7 Regressors:
## Mean_A Mean_C Mean_D Mean_E Mean_F Mean_G Mean_H
## Proportion of variance explained by model: 48.27%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## Mean_A 0.21301087
## Mean_C 0.02077414
## Mean_D 0.06668971
## Mean_E 0.35790977
## Mean_F 0.08091115
## Mean_G 0.23766256
## Mean_H 0.02304179
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs
## Mean_A 0.25806184 0.23704844 0.22108352 0.20906614 0.200099006
## Mean_C 0.07061959 0.04658246 0.02685615 0.01064096 -0.002888626
## Mean_D 0.13944444 0.12044406 0.10442965 0.09088227 0.079393838
## Mean_E 0.53495723 0.50146959 0.46907527 0.43894776 0.411960711
## Mean_F 0.09490334 0.08341120 0.07397738 0.06615532 0.059675545
## Mean_G 0.17658423 0.15898706 0.14329206 0.12933533 0.116822992
## Mean_H 0.06564568 0.04550025 0.02833064 0.01373067 0.001474334
## 6Xs 7Xs
## Mean_A 0.19339378 0.18808807
## Mean_C -0.01437788 -0.02409297
## Mean_D 0.06960882 0.06112419
## Mean_E 0.38854344 0.36862223
## Mean_F 0.05435841 0.04994838
## Mean_G 0.10545854 0.09509132
## Mean_H -0.00861258 -0.01681929
# Bac Son Keo Dai Road project
# Read the semicolon-separated project table straight from GitHub
BS_KD <- read.csv(
  "https://raw.githubusercontent.com/tuyenhavan/Statistics/Son-Data/S%E1%BB%91%20li%E1%BB%87u%20T.S%C6%A1n.csv",
  sep = ";"
)
# Show the first rows as a quick sanity check
head(BS_KD)
Data manipulation
Group A
#- Group A
library(tidyverse)
# Keep the Group A items (A1 to A5) together with the response Y.TB
GroupA_BS_KD <- dplyr::select(BS_KD, A1:A5, Y.TB)
head(GroupA_BS_KD)
# Regress Y.TB on every Group A item
model_BSKD_A <- lm(Y.TB ~ ., data = GroupA_BS_KD)
# Backward elimination; the printed AIC trace shows which items are kept
step(model_BSKD_A, direction = "backward")
## Start: AIC=-136.89
## Y.TB ~ A1 + A2 + A3 + A4 + A5
##
## Df Sum of Sq RSS AIC
## - A4 1 0.001067 2.5465 -138.87
## - A1 1 0.009538 2.5550 -138.70
## - A3 1 0.101962 2.6474 -136.92
## <none> 2.5455 -136.89
## - A5 1 0.107990 2.6534 -136.81
## - A2 1 0.304097 2.8495 -133.24
##
## Step: AIC=-138.86
## Y.TB ~ A1 + A2 + A3 + A5
##
## Df Sum of Sq RSS AIC
## - A1 1 0.00925 2.5558 -140.68
## <none> 2.5465 -138.87
## - A5 1 0.11908 2.6656 -138.58
## - A3 1 0.14688 2.6934 -138.06
## - A2 1 0.38541 2.9319 -133.82
##
## Step: AIC=-140.68
## Y.TB ~ A2 + A3 + A5
##
## Df Sum of Sq RSS AIC
## <none> 2.5558 -140.68
## - A5 1 0.11450 2.6703 -140.49
## - A3 1 0.27527 2.8310 -137.57
## - A2 1 0.52904 3.0848 -133.28
##
## Call:
## lm(formula = Y.TB ~ A2 + A3 + A5, data = GroupA_BS_KD)
##
## Coefficients:
## (Intercept) A2 A3 A5
## 2.15439 0.30987 0.11197 -0.07981
# Items retained by the backward search for Group A: A2, A3 and A5
Group_BSKD_A <- dplyr::select(GroupA_BS_KD, A2, A3, A5)
# Row-wise average of the retained items
Mean_A_BSKD <- rowMeans(Group_BSKD_A)

# Group B: items B6 to B10 plus the response Y.TB
GroupB_BSKD <- dplyr::select(BS_KD, B6:B10, Y.TB)
head(GroupB_BSKD)
# Full Group B regression, then backward elimination
model_BSKD_B <- lm(Y.TB ~ ., data = GroupB_BSKD)
step(model_BSKD_B, direction = "backward")
## Start: AIC=-142.78
## Y.TB ~ B6 + B7 + B8 + B9 + B10
##
## Df Sum of Sq RSS AIC
## - B8 1 0.01502 2.2775 -144.45
## - B9 1 0.03168 2.2941 -144.08
## <none> 2.2624 -142.78
## - B6 1 0.10991 2.3724 -142.41
## - B10 1 0.35633 2.6188 -137.47
## - B7 1 0.45340 2.7159 -135.65
##
## Step: AIC=-144.45
## Y.TB ~ B6 + B7 + B9 + B10
##
## Df Sum of Sq RSS AIC
## - B9 1 0.01945 2.2969 -146.02
## <none> 2.2775 -144.45
## - B6 1 0.15096 2.4284 -143.24
## - B10 1 0.36011 2.6376 -139.11
## - B7 1 0.45124 2.7287 -137.41
##
## Step: AIC=-146.02
## Y.TB ~ B6 + B7 + B10
##
## Df Sum of Sq RSS AIC
## <none> 2.2969 -146.02
## - B6 1 0.13789 2.4348 -145.11
## - B10 1 0.39151 2.6884 -140.15
## - B7 1 0.46017 2.7571 -138.89
##
## Call:
## lm(formula = Y.TB ~ B6 + B7 + B10, data = GroupB_BSKD)
##
## Coefficients:
## (Intercept) B6 B7 B10
## 1.9206 0.1149 0.1692 0.1340
# Items retained by the backward search for Group B: B6, B7 and B10
Group_BSKD_B <- dplyr::select(GroupB_BSKD, B6, B7, B10)
# Row-wise average of the retained items
Mean_BSKD_B <- rowMeans(Group_BSKD_B)

# Group C: items C11 to C14 plus the response Y.TB
GroupC_BSKD <- dplyr::select(BS_KD, C11:C14, Y.TB)
head(GroupC_BSKD)
# Full Group C regression, then backward elimination.
# The model is stored as model_BSKD_C; it was previously assigned to
# model_BSKD_B, which silently overwrote the Group B model.
model_BSKD_C <- lm(Y.TB ~ ., data = GroupC_BSKD)
step(model_BSKD_C, direction = "backward")
## Start: AIC=-154
## Y.TB ~ C11 + C12 + C13 + C14
##
## Df Sum of Sq RSS AIC
## - C14 1 0.01849 1.8997 -155.51
## - C11 1 0.02207 1.9033 -155.42
## <none> 1.8812 -154.00
## - C12 1 0.14422 2.0255 -152.31
## - C13 1 0.58901 2.4703 -142.38
##
## Step: AIC=-155.52
## Y.TB ~ C11 + C12 + C13
##
## Df Sum of Sq RSS AIC
## - C11 1 0.01477 1.9145 -157.13
## <none> 1.8997 -155.51
## - C12 1 0.20333 2.1031 -152.43
## - C13 1 0.77105 2.6708 -140.48
##
## Step: AIC=-157.13
## Y.TB ~ C12 + C13
##
## Df Sum of Sq RSS AIC
## <none> 1.9145 -157.13
## - C12 1 0.2903 2.2048 -152.07
## - C13 1 1.1457 3.0602 -135.68
##
## Call:
## lm(formula = Y.TB ~ C12 + C13, data = GroupC_BSKD)
##
## Coefficients:
## (Intercept) C12 C13
## 1.9257 0.1000 0.3252
# Items retained by the backward search for Group C: C12 and C13
Group_BSKD_C <- dplyr::select(GroupC_BSKD, C12:C13)
# Row-wise average of the retained items
Mean_BSKD_C <- rowMeans(Group_BSKD_C)

# Group D: items D15 and D16 plus the response Y.TB
GroupD_BSKD <- dplyr::select(BS_KD, D15:D16, Y.TB)
head(GroupD_BSKD)
# Full Group D regression, then backward elimination
model_BSKD_D <- lm(Y.TB ~ ., data = GroupD_BSKD)
step(model_BSKD_D, direction = "backward")
## Start: AIC=-135.92
## Y.TB ~ D15 + D16
##
## Df Sum of Sq RSS AIC
## <none> 2.9261 -135.92
## - D15 1 0.19493 3.1210 -134.69
## - D16 1 0.72299 3.6491 -126.88
##
## Call:
## lm(formula = Y.TB ~ D15 + D16, data = GroupD_BSKD)
##
## Coefficients:
## (Intercept) D15 D16
## 2.4406 0.1424 0.1644
# Both Group D items (D15, D16) are retained
Group_BSKD_D <- dplyr::select(GroupD_BSKD, D15:D16)
# Row-wise average of the retained items
Mean_BSKD_D <- rowMeans(Group_BSKD_D)

# Group E: items E17 to E20 plus the response Y.TB
GroupE_BSKD <- dplyr::select(BS_KD, E17:E20, Y.TB)
head(GroupE_BSKD)
# Full Group E regression, then backward elimination
model_BSKD_E <- lm(Y.TB ~ ., data = GroupE_BSKD)
step(model_BSKD_E, direction = "backward")
## Start: AIC=-146.3
## Y.TB ~ E17 + E18 + E19 + E20
##
## Df Sum of Sq RSS AIC
## - E18 1 0.00028 2.1950 -148.29
## - E20 1 0.02264 2.2174 -147.78
## <none> 2.1948 -146.30
## - E17 1 0.28857 2.4833 -142.12
## - E19 1 0.40864 2.6034 -139.76
##
## Step: AIC=-148.29
## Y.TB ~ E17 + E19 + E20
##
## Df Sum of Sq RSS AIC
## - E20 1 0.02401 2.2191 -149.75
## <none> 2.1950 -148.29
## - E17 1 0.31612 2.5112 -143.56
## - E19 1 0.42802 2.6231 -141.38
##
## Step: AIC=-149.75
## Y.TB ~ E17 + E19
##
## Df Sum of Sq RSS AIC
## <none> 2.2191 -149.75
## - E17 1 0.30675 2.5258 -145.27
## - E19 1 0.70685 2.9259 -137.92
##
## Call:
## lm(formula = Y.TB ~ E17 + E19, data = GroupE_BSKD)
##
## Coefficients:
## (Intercept) E17 E19
## 2.1383 0.1424 0.2135
# Items retained by the backward search for Group E: E17 and E19
Group_BSKD_E <- dplyr::select(GroupE_BSKD, E17, E19)
# Row-wise average of the retained items
Mean_BSKD_E <- rowMeans(Group_BSKD_E)

# Group F: items F21 to F23 plus the response Y.TB
GroupF_BSKD <- dplyr::select(BS_KD, F21:F23, Y.TB)
head(GroupF_BSKD)
# Full Group F regression, then backward elimination
model_BSKD_F <- lm(Y.TB ~ ., data = GroupF_BSKD)
step(model_BSKD_F, direction = "backward")
## Start: AIC=-134.18
## Y.TB ~ F21 + F22 + F23
##
## Df Sum of Sq RSS AIC
## - F22 1 0.07997 2.9908 -134.82
## - F23 1 0.08521 2.9960 -134.74
## <none> 2.9108 -134.18
## - F21 1 0.62489 3.5357 -126.46
##
## Step: AIC=-134.82
## Y.TB ~ F21 + F23
##
## Df Sum of Sq RSS AIC
## <none> 2.9908 -134.82
## - F23 1 0.12327 3.1140 -134.81
## - F21 1 0.93568 3.9265 -123.21
##
## Call:
## lm(formula = Y.TB ~ F21 + F23, data = GroupF_BSKD)
##
## Coefficients:
## (Intercept) F21 F23
## 2.35649 0.24183 0.09643
# Items retained by the backward search for Group F: F21 and F23
Group_BSKD_F <- dplyr::select(GroupF_BSKD, F21, F23)
# Row-wise average of the retained items
Mean_BSKD_F <- rowMeans(Group_BSKD_F)

# Group G: items G24 and G25 plus the response Y.TB
GroupG_BSKD <- dplyr::select(BS_KD, G24:G25, Y.TB)
head(GroupG_BSKD)
# Full Group G regression, then backward elimination
model_BSKD_G <- lm(Y.TB ~ ., data = GroupG_BSKD)
step(model_BSKD_G, direction = "backward")
## Start: AIC=-138.15
## Y.TB ~ G24 + G25
##
## Df Sum of Sq RSS AIC
## <none> 2.7981 -138.15
## - G25 1 0.67134 3.4694 -129.40
## - G24 1 0.69847 3.4966 -129.01
##
## Call:
## lm(formula = Y.TB ~ G24 + G25, data = GroupG_BSKD)
##
## Coefficients:
## (Intercept) G24 G25
## 2.0438 0.2293 0.2152
# Both Group G items (G24, G25) are retained
Group_BSKD_G <- dplyr::select(GroupG_BSKD, G24:G25)
# Row-wise average of the retained items
Mean_BSKD_G <- rowMeans(Group_BSKD_G)

# Group H
# Take the Group H items from BS_KD, like every other group in this section.
# They were previously pulled from HVT (copy-paste slip), which is why the
# recorded output below is identical to the HVT Group H run. Re-run this
# chunk and the final BS_KD model to refresh the printed results.
# NOTE(review): assumes BS_KD also has columns H26 and H27 — confirm.
GroupH_BSKD <- dplyr::select(BS_KD, H26:H27, Y.TB)
head(GroupH_BSKD)
# Full Group H regression, then backward elimination
model_BSKD_H <- lm(Y.TB ~ ., data = GroupH_BSKD)
step(model_BSKD_H, direction = "backward")
## Start: AIC=-150.96
## Y.TB ~ H26 + H27
##
## Df Sum of Sq RSS AIC
## <none> 2.1657 -150.96
## - H26 1 0.15469 2.3204 -149.51
## - H27 1 0.15469 2.3204 -149.51
##
## Call:
## lm(formula = Y.TB ~ H26 + H27, data = GroupH_BSKD)
##
## Coefficients:
## (Intercept) H26 H27
## 2.6769 0.2011 0.2011
# Both Group H items (H26, H27) are retained
Group_BSKD_H <- dplyr::select(GroupH_BSKD, H26:H27)
# Row-wise average of the retained items
Mean_BSKD_H <- rowMeans(Group_BSKD_H)

# Combine the eight group means with the response into one table
BSKD_Dataset <- data.frame(
  Mean_A_BSKD, Mean_BSKD_B, Mean_BSKD_C, Mean_BSKD_D,
  Mean_BSKD_E, Mean_BSKD_F, Mean_BSKD_G, Mean_BSKD_H,
  Y_mean = BS_KD$Y.TB
)
head(BSKD_Dataset)

# Final model: discard rows containing NA, then regress on all group means
BSKD_Data <- na.omit(BSKD_Dataset)
model_BSKD <- lm(Y_mean ~ ., data = BSKD_Data)
# Backward elimination over the group means; in the recorded trace the terms
# removed are Mean_BSKD_D, Mean_BSKD_H and Mean_BSKD_F
step(model_BSKD, direction = "backward")
## Start: AIC=-184.17
## Y_mean ~ Mean_A_BSKD + Mean_BSKD_B + Mean_BSKD_C + Mean_BSKD_D +
## Mean_BSKD_E + Mean_BSKD_F + Mean_BSKD_G + Mean_BSKD_H
##
## Df Sum of Sq RSS AIC
## - Mean_BSKD_D 1 0.00007 0.87692 -186.17
## - Mean_BSKD_H 1 0.00653 0.88337 -185.80
## - Mean_BSKD_F 1 0.01361 0.89045 -185.40
## <none> 0.87685 -184.17
## - Mean_BSKD_E 1 0.06213 0.93897 -182.75
## - Mean_BSKD_B 1 0.08736 0.96421 -181.42
## - Mean_BSKD_C 1 0.09195 0.96880 -181.19
## - Mean_A_BSKD 1 0.10045 0.97730 -180.75
## - Mean_BSKD_G 1 0.46827 1.34512 -164.78
##
## Step: AIC=-186.17
## Y_mean ~ Mean_A_BSKD + Mean_BSKD_B + Mean_BSKD_C + Mean_BSKD_E +
## Mean_BSKD_F + Mean_BSKD_G + Mean_BSKD_H
##
## Df Sum of Sq RSS AIC
## - Mean_BSKD_H 1 0.00646 0.88338 -187.80
## - Mean_BSKD_F 1 0.01354 0.89045 -187.40
## <none> 0.87692 -186.17
## - Mean_BSKD_E 1 0.07222 0.94914 -184.21
## - Mean_BSKD_B 1 0.09048 0.96739 -183.26
## - Mean_A_BSKD 1 0.10187 0.97879 -182.67
## - Mean_BSKD_C 1 0.11131 0.98822 -182.19
## - Mean_BSKD_G 1 0.47994 1.35685 -166.34
##
## Step: AIC=-187.8
## Y_mean ~ Mean_A_BSKD + Mean_BSKD_B + Mean_BSKD_C + Mean_BSKD_E +
## Mean_BSKD_F + Mean_BSKD_G
##
## Df Sum of Sq RSS AIC
## - Mean_BSKD_F 1 0.01429 0.89767 -189.00
## <none> 0.88338 -187.80
## - Mean_BSKD_E 1 0.07245 0.95582 -185.86
## - Mean_BSKD_B 1 0.08672 0.97010 -185.12
## - Mean_A_BSKD 1 0.10681 0.99019 -184.09
## - Mean_BSKD_C 1 0.10821 0.99159 -184.02
## - Mean_BSKD_G 1 0.47670 1.36008 -168.22
##
## Step: AIC=-189
## Y_mean ~ Mean_A_BSKD + Mean_BSKD_B + Mean_BSKD_C + Mean_BSKD_E +
## Mean_BSKD_G
##
## Df Sum of Sq RSS AIC
## <none> 0.89767 -189.00
## - Mean_BSKD_B 1 0.07805 0.97572 -186.83
## - Mean_BSKD_E 1 0.10149 0.99916 -185.64
## - Mean_A_BSKD 1 0.11037 1.00804 -185.20
## - Mean_BSKD_C 1 0.13381 1.03148 -184.05
## - Mean_BSKD_G 1 0.57527 1.47294 -166.24
##
## Call:
## lm(formula = Y_mean ~ Mean_A_BSKD + Mean_BSKD_B + Mean_BSKD_C +
## Mean_BSKD_E + Mean_BSKD_G, data = BSKD_Data)
##
## Coefficients:
## (Intercept) Mean_A_BSKD Mean_BSKD_B Mean_BSKD_C Mean_BSKD_E
## 0.5866 0.1162 0.1363 0.1288 0.1093
## Mean_BSKD_G
## 0.3148
# Relative importance (LMG) of each group mean in the full eight-term model,
# normalised to sum to 100% (rela = TRUE).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
impo_BS_KD <- calc.relimp(model_BSKD, type = c("lmg"), rela = TRUE)
impo_BS_KD
## Response variable: Y_mean
## Total response variance: 0.08505943
## Analysis based on 50 observations
##
## 8 Regressors:
## Mean_A_BSKD Mean_BSKD_B Mean_BSKD_C Mean_BSKD_D Mean_BSKD_E Mean_BSKD_F Mean_BSKD_G Mean_BSKD_H
## Proportion of variance explained by model: 78.96%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## Mean_A_BSKD 0.102875571
## Mean_BSKD_B 0.180335396
## Mean_BSKD_C 0.174421738
## Mean_BSKD_D 0.084378741
## Mean_BSKD_E 0.162450067
## Mean_BSKD_F 0.079435694
## Mean_BSKD_G 0.207554619
## Mean_BSKD_H 0.008548173
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs
## Mean_A_BSKD 0.27980252 0.20741121 0.17107987 0.14997908 0.135749348
## Mean_BSKD_B 0.42670734 0.34361192 0.28921459 0.25128623 0.221889812
## Mean_BSKD_C 0.34934247 0.28129410 0.22885124 0.18985262 0.161752875
## Mean_BSKD_D 0.31509615 0.21169929 0.14197288 0.09380995 0.059551758
## Mean_BSKD_E 0.35656198 0.28505147 0.22835144 0.18437481 0.151031689
## Mean_BSKD_F 0.35023529 0.24200195 0.17663364 0.13523209 0.106693912
## Mean_BSKD_G 0.44388769 0.36519224 0.33064813 0.31621752 0.310245317
## Mean_BSKD_H -0.07625229 -0.04085679 -0.02235368 -0.01044984 -0.001484833
## 6Xs 7Xs 8Xs
## Mean_A_BSKD 0.125246804 0.11731010 0.111823050
## Mean_BSKD_B 0.196480781 0.17257925 0.148524021
## Mean_BSKD_C 0.142221777 0.12948053 0.122425280
## Mean_BSKD_D 0.034108022 0.01393686 -0.003534527
## Mean_BSKD_E 0.126455237 0.10911357 0.097856556
## Mean_BSKD_F 0.084891653 0.06653346 0.049625306
## Mean_BSKD_G 0.307787397 0.30693966 0.307223574
## Mean_BSKD_H 0.005912985 0.01235859 0.018216643
# Drop the group means removed by the backward search in the recorded trace:
# Mean_BSKD_D, Mean_BSKD_F and Mean_BSKD_H (columns 4, 6 and 8). They are
# excluded by name rather than by position so the right variables are
# removed even if the column order of BSKD_Data changes.
df_BSKD <- BSKD_Data[
  ,
  setdiff(names(BSKD_Data), c("Mean_BSKD_D", "Mean_BSKD_F", "Mean_BSKD_H"))
]
# Refit the regression on the remaining group means
model_BS1 <- lm(Y_mean ~ ., data = df_BSKD)
# LMG relative importance, normalised to sum to 100% (rela = TRUE).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
impo_BSKD1 <- calc.relimp(model_BS1, type = c("lmg"), rela = TRUE)
impo_BSKD1
## Response variable: Y_mean
## Total response variance: 0.08505943
## Analysis based on 50 observations
##
## 5 Regressors:
## Mean_A_BSKD Mean_BSKD_B Mean_BSKD_C Mean_BSKD_E Mean_BSKD_G
## Proportion of variance explained by model: 78.46%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## Mean_A_BSKD 0.1096482
## Mean_BSKD_B 0.1999413
## Mean_BSKD_C 0.2203991
## Mean_BSKD_E 0.2133738
## Mean_BSKD_G 0.2566376
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs
## Mean_A_BSKD 0.2798025 0.1724267 0.1341424 0.1208441 0.1161921
## Mean_BSKD_B 0.4267073 0.3163279 0.2358173 0.1786830 0.1362954
## Mean_BSKD_C 0.3493425 0.2611068 0.1974159 0.1552470 0.1288155
## Mean_BSKD_E 0.3565620 0.2632846 0.1934919 0.1450570 0.1093374
## Mean_BSKD_G 0.4438877 0.3622056 0.3354663 0.3234411 0.3147626
# Third dataset: D_Viet Bac.csv, stored as QT_DQ.
# NOTE(review): this heading previously repeated "Bac Son Keo Dai Road
# project" from the section above — confirm the correct project name.
QT_DQ <- read.csv(
  "https://raw.githubusercontent.com/tuyenhavan/Statistics/Son-Data/D_Viet%20Bac.csv",
  header = TRUE, # spelled out: T is an ordinary variable and can be reassigned
  sep = ";"
)
# Show the first rows as a quick sanity check
head(QT_DQ)
Data manipulation
Group A
#- Group A
library(tidyverse)
# Keep the Group A items (A1 to A5) together with the response Y.TB
GroupA_QT_DQ <- dplyr::select(QT_DQ, A1:A5, Y.TB)
head(GroupA_QT_DQ)
# Regress Y.TB on every Group A item
model_QTDQ_A <- lm(Y.TB ~ ., data = GroupA_QT_DQ)
# Backward elimination; the printed AIC trace shows which items are kept
step(model_QTDQ_A, direction = "backward")
## Start: AIC=-117.84
## Y.TB ~ A1 + A2 + A3 + A4 + A5
##
## Df Sum of Sq RSS AIC
## - A1 1 0.06911 3.7948 -118.92
## - A2 1 0.07806 3.8038 -118.80
## - A5 1 0.10708 3.8328 -118.42
## <none> 3.7257 -117.84
## - A3 1 0.38321 4.1089 -114.94
## - A4 1 0.54316 4.2689 -113.03
##
## Step: AIC=-118.92
## Y.TB ~ A2 + A3 + A4 + A5
##
## Df Sum of Sq RSS AIC
## - A5 1 0.10644 3.9012 -119.54
## <none> 3.7948 -118.92
## - A3 1 0.40203 4.1968 -115.89
## - A4 1 0.52245 4.3173 -114.47
## - A2 1 1.12566 4.9205 -107.93
##
## Step: AIC=-119.54
## Y.TB ~ A2 + A3 + A4
##
## Df Sum of Sq RSS AIC
## <none> 3.9012 -119.54
## - A3 1 0.29799 4.1992 -117.86
## - A4 1 0.42744 4.3287 -116.34
## - A2 1 1.73574 5.6370 -103.13
##
## Call:
## lm(formula = Y.TB ~ A2 + A3 + A4, data = GroupA_QT_DQ)
##
## Coefficients:
## (Intercept) A2 A3 A4
## 2.4745 0.4156 0.2618 -0.3900
# Items retained by the backward search for Group A: A2, A3 and A4
Group_QTDQ_A <- dplyr::select(GroupA_QT_DQ, A2, A3, A4)
# Row-wise average of the retained items
Mean_A_QTDQ <- rowMeans(Group_QTDQ_A)

# Group B: items B6 to B10 plus the response Y.TB
GroupB_QTDQ <- dplyr::select(QT_DQ, B6:B10, Y.TB)
head(GroupB_QTDQ)
# Full Group B regression, then backward elimination
model_QTDQ_B <- lm(Y.TB ~ ., data = GroupB_QTDQ)
step(model_QTDQ_B, direction = "backward")
## Start: AIC=-122.47
## Y.TB ~ B6 + B7 + B8 + B9 + B10
##
## Df Sum of Sq RSS AIC
## - B8 1 0.00018 3.3963 -124.47
## - B6 1 0.03477 3.4309 -123.96
## - B7 1 0.03886 3.4350 -123.90
## <none> 3.3962 -122.47
## - B9 1 0.22927 3.6254 -121.20
## - B10 1 0.58367 3.9798 -116.54
##
## Step: AIC=-124.47
## Y.TB ~ B6 + B7 + B9 + B10
##
## Df Sum of Sq RSS AIC
## - B6 1 0.03713 3.4335 -125.92
## - B7 1 0.03888 3.4352 -125.90
## <none> 3.3963 -124.47
## - B9 1 0.23547 3.6318 -123.11
## - B10 1 0.58799 3.9843 -118.48
##
## Step: AIC=-125.92
## Y.TB ~ B7 + B9 + B10
##
## Df Sum of Sq RSS AIC
## - B7 1 0.09850 3.5320 -126.51
## <none> 3.4335 -125.92
## - B9 1 0.20073 3.6342 -125.08
## - B10 1 0.83302 4.2665 -117.06
##
## Step: AIC=-126.51
## Y.TB ~ B9 + B10
##
## Df Sum of Sq RSS AIC
## <none> 3.5320 -126.51
## - B9 1 0.67302 4.2050 -119.79
## - B10 1 1.03844 4.5704 -115.62
##
## Call:
## lm(formula = Y.TB ~ B9 + B10, data = GroupB_QTDQ)
##
## Coefficients:
## (Intercept) B9 B10
## 1.7104 0.2021 0.2494
# Keep the Group B items retained by the backward selection above (B9, B10)
# and average them row-wise into one composite score.
Group_QTDQ_B <- GroupB_QTDQ %>%
  dplyr::select(B9, B10)
Mean_QTDQ_B <- rowMeans(Group_QTDQ_B)
# Group C: items C11-C14 together with the response Y.TB.
GroupC_QTDQ <- QT_DQ %>%
  dplyr::select(C11:C14, Y.TB)
head(GroupC_QTDQ)
# Full linear model for Group C, pruned by backward stepwise selection.
# The fit is stored in its own object (model_QTDQ_C) so that the Group B
# model held in model_QTDQ_B is not overwritten.
model_QTDQ_C <- lm(Y.TB ~ ., data = GroupC_QTDQ)
step(model_QTDQ_C, direction = "backward")
## Start: AIC=-122.13
## Y.TB ~ C11 + C12 + C13 + C14
##
## Df Sum of Sq RSS AIC
## - C12 1 0.04449 3.6033 -123.51
## - C11 1 0.05290 3.6117 -123.39
## - C14 1 0.11560 3.6744 -122.53
## <none> 3.5588 -122.13
## - C13 1 0.31632 3.8751 -119.87
##
## Step: AIC=-123.51
## Y.TB ~ C11 + C13 + C14
##
## Df Sum of Sq RSS AIC
## - C11 1 0.12526 3.7285 -123.80
## - C14 1 0.12947 3.7328 -123.74
## <none> 3.6033 -123.51
## - C13 1 0.38545 3.9887 -120.43
##
## Step: AIC=-123.8
## Y.TB ~ C13 + C14
##
## Df Sum of Sq RSS AIC
## <none> 3.7285 -123.80
## - C14 1 0.19553 3.9241 -123.25
## - C13 1 1.30114 5.0297 -110.83
##
## Call:
## lm(formula = Y.TB ~ C13 + C14, data = GroupC_QTDQ)
##
## Coefficients:
## (Intercept) C13 C14
## 2.1804 0.2486 0.1117
# Keep the Group C items retained by the backward selection above (C13, C14)
# and average them row-wise into one composite score.
Group_QTDQ_C <- GroupC_QTDQ %>%
  dplyr::select(C13, C14)
Mean_QTDQ_C <- rowMeans(Group_QTDQ_C)
# Group D: items D15-D16 together with the response Y.TB.
GroupD_QTDQ <- QT_DQ %>%
  dplyr::select(D15:D16, Y.TB)
head(GroupD_QTDQ)
# Full linear model for the group, pruned by backward stepwise selection.
model_QTDQ_D <- lm(Y.TB ~ ., data = GroupD_QTDQ)
step(model_QTDQ_D, direction = "backward")
## Start: AIC=-120.35
## Y.TB ~ D15 + D16
##
## Df Sum of Sq RSS AIC
## - D16 1 0.05808 4.0529 -121.63
## <none> 3.9949 -120.35
## - D15 1 0.77166 4.7665 -113.52
##
## Step: AIC=-121.63
## Y.TB ~ D15
##
## Df Sum of Sq RSS AIC
## <none> 4.0529 -121.63
## - D15 1 2.1871 6.2400 -102.05
##
## Call:
## lm(formula = Y.TB ~ D15, data = GroupD_QTDQ)
##
## Coefficients:
## (Intercept) D15
## 2.1244 0.4255
# Only D15 was retained for Group D; rowMeans() over the single column
# turns it into a plain numeric vector.
Group_QTDQ_D <- GroupD_QTDQ %>%
  dplyr::select(D15)
Mean_QTDQ_D <- rowMeans(Group_QTDQ_D)
# Group E: items E17-E20 together with the response Y.TB.
GroupE_QTDQ <- QT_DQ %>%
  dplyr::select(E17:E20, Y.TB)
head(GroupE_QTDQ)
# Full linear model for the group, pruned by backward stepwise selection.
model_QTDQ_E <- lm(Y.TB ~ ., data = GroupE_QTDQ)
step(model_QTDQ_E, direction = "backward")
## Start: AIC=-108.09
## Y.TB ~ E17 + E18 + E19 + E20
##
## Df Sum of Sq RSS AIC
## - E18 1 0.01333 4.7259 -109.95
## - E17 1 0.04116 4.7537 -109.66
## <none> 4.7126 -108.09
## - E19 1 0.21163 4.9242 -107.89
## - E20 1 0.47560 5.1882 -105.28
##
## Step: AIC=-109.95
## Y.TB ~ E17 + E19 + E20
##
## Df Sum of Sq RSS AIC
## - E17 1 0.05826 4.7842 -111.34
## <none> 4.7259 -109.95
## - E19 1 0.19845 4.9244 -109.89
## - E20 1 0.47274 5.1987 -107.18
##
## Step: AIC=-111.34
## Y.TB ~ E19 + E20
##
## Df Sum of Sq RSS AIC
## - E19 1 0.16202 4.9462 -111.67
## <none> 4.7842 -111.34
## - E20 1 0.41746 5.2016 -109.15
##
## Step: AIC=-111.67
## Y.TB ~ E20
##
## Df Sum of Sq RSS AIC
## <none> 4.9462 -111.67
## - E20 1 1.2938 6.2400 -102.05
##
## Call:
## lm(formula = Y.TB ~ E20, data = GroupE_QTDQ)
##
## Coefficients:
## (Intercept) E20
## 2.5217 0.2483
# Only E20 was retained for Group E; rowMeans() over the single column
# turns it into a plain numeric vector.
Group_QTDQ_E <- GroupE_QTDQ %>%
  dplyr::select(E20)
Mean_QTDQ_E <- rowMeans(Group_QTDQ_E)
# Group F: items F21-F23 together with the response Y.TB.
GroupF_QTDQ <- QT_DQ %>%
  dplyr::select(F21:F23, Y.TB)
head(GroupF_QTDQ)
# Full linear model for the group, pruned by backward stepwise selection.
model_QTDQ_F <- lm(Y.TB ~ ., data = GroupF_QTDQ)
step(model_QTDQ_F, direction = "backward")
## Start: AIC=-127.3
## Y.TB ~ F21 + F22 + F23
##
## Df Sum of Sq RSS AIC
## - F21 1 0.11323 3.4533 -127.64
## <none> 3.3401 -127.30
## - F23 1 0.30638 3.6464 -124.91
## - F22 1 0.61774 3.9578 -120.82
##
## Step: AIC=-127.63
## Y.TB ~ F22 + F23
##
## Df Sum of Sq RSS AIC
## <none> 3.4533 -127.64
## - F23 1 0.70809 4.1614 -120.31
## - F22 1 1.64652 5.0998 -110.14
##
## Call:
## lm(formula = Y.TB ~ F22 + F23, data = GroupF_QTDQ)
##
## Coefficients:
## (Intercept) F22 F23
## 1.9577 0.2638 0.1972
# Keep the Group F items retained by the backward selection above (F22, F23)
# and average them row-wise into one composite score.
Group_QTDQ_F <- GroupF_QTDQ %>%
  dplyr::select(F22, F23)
Mean_QTDQ_F <- rowMeans(Group_QTDQ_F)
# Group G: items G24-G25 together with the response Y.TB.
GroupG_QTDQ <- QT_DQ %>%
  dplyr::select(G24:G25, Y.TB)
head(GroupG_QTDQ)
# Full linear model for the group, pruned by backward stepwise selection.
model_QTDQ_G <- lm(Y.TB ~ ., data = GroupG_QTDQ)
step(model_QTDQ_G, direction = "backward")
## Start: AIC=-153.6
## Y.TB ~ G24 + G25
##
## Df Sum of Sq RSS AIC
## <none> 2.0546 -153.60
## - G24 1 0.65858 2.7131 -141.70
## - G25 1 0.99609 3.0507 -135.83
##
## Call:
## lm(formula = Y.TB ~ G24 + G25, data = GroupG_QTDQ)
##
## Coefficients:
## (Intercept) G24 G25
## 2.1169 0.2017 0.2413
# Both Group G items (G24, G25) were retained by the backward selection
# above; average them row-wise into one composite score.
Group_QTDQ_G <- GroupG_QTDQ %>%
  dplyr::select(G24:G25)
Mean_QTDQ_G <- rowMeans(Group_QTDQ_G)
# Group H: items H26-H27 together with the response Y.TB.
# The items are taken from QT_DQ, like every other group in this section.
# This block previously read them from HVT (a different project's data), so
# the step() output recorded below was produced from HVT and must be
# regenerated by re-running the script.
GroupH_QTDQ <- QT_DQ %>%
  dplyr::select(H26:H27, Y.TB)
head(GroupH_QTDQ)
# Full linear model for the group, pruned by backward stepwise selection.
model_QTDQ_H <- lm(Y.TB ~ ., data = GroupH_QTDQ)
step(model_QTDQ_H, direction = "backward")
## Start: AIC=-150.96
## Y.TB ~ H26 + H27
##
## Df Sum of Sq RSS AIC
## <none> 2.1657 -150.96
## - H26 1 0.15469 2.3204 -149.51
## - H27 1 0.15469 2.3204 -149.51
##
## Call:
## lm(formula = Y.TB ~ H26 + H27, data = GroupH_QTDQ)
##
## Coefficients:
## (Intercept) H26 H27
## 2.6769 0.2011 0.2011
# Average the two Group H items row-wise into one composite score.
Group_QTDQ_H <- GroupH_QTDQ %>%
  dplyr::select(H26:H27)
Mean_QTDQ_H <- rowMeans(Group_QTDQ_H)
# Combine the eight group composites with the response (stored as Y_mean).
QTDQ_Dataset <- data.frame(
  Mean_A_QTDQ,
  Mean_QTDQ_B,
  Mean_QTDQ_C,
  Mean_QTDQ_D,
  Mean_QTDQ_E,
  Mean_QTDQ_F,
  Mean_QTDQ_G,
  Mean_QTDQ_H,
  Y_mean = QT_DQ$Y.TB
)
head(QTDQ_Dataset)
# Final model: drop rows containing NA, regress Y_mean on all composites and
# prune the model by backward stepwise selection. In the output recorded
# below, the selection removes Mean_QTDQ_C and then Mean_QTDQ_H.
QTDQ_Data <- na.omit(QTDQ_Dataset)
model_QTDQ <- lm(Y_mean ~ ., data = QTDQ_Data)
step(model_QTDQ, direction = "backward")
## Start: AIC=-206.83
## Y_mean ~ Mean_A_QTDQ + Mean_QTDQ_B + Mean_QTDQ_C + Mean_QTDQ_D +
## Mean_QTDQ_E + Mean_QTDQ_F + Mean_QTDQ_G + Mean_QTDQ_H
##
## Df Sum of Sq RSS AIC
## - Mean_QTDQ_C 1 0.00373 0.56107 -208.50
## - Mean_QTDQ_H 1 0.01971 0.57705 -207.09
## <none> 0.55734 -206.83
## - Mean_QTDQ_E 1 0.05786 0.61520 -203.89
## - Mean_QTDQ_F 1 0.07153 0.62888 -202.79
## - Mean_A_QTDQ 1 0.07403 0.63137 -202.59
## - Mean_QTDQ_B 1 0.08027 0.63761 -202.10
## - Mean_QTDQ_D 1 0.28404 0.84139 -188.24
## - Mean_QTDQ_G 1 0.81387 1.37122 -163.82
##
## Step: AIC=-208.5
## Y_mean ~ Mean_A_QTDQ + Mean_QTDQ_B + Mean_QTDQ_D + Mean_QTDQ_E +
## Mean_QTDQ_F + Mean_QTDQ_G + Mean_QTDQ_H
##
## Df Sum of Sq RSS AIC
## - Mean_QTDQ_H 1 0.01932 0.58039 -208.80
## <none> 0.56107 -208.50
## - Mean_QTDQ_E 1 0.05964 0.62072 -205.44
## - Mean_QTDQ_F 1 0.07328 0.63435 -204.36
## - Mean_A_QTDQ 1 0.08482 0.64589 -203.46
## - Mean_QTDQ_B 1 0.12504 0.68612 -200.44
## - Mean_QTDQ_D 1 0.28702 0.84809 -189.84
## - Mean_QTDQ_G 1 0.86998 1.43106 -163.68
##
## Step: AIC=-208.8
## Y_mean ~ Mean_A_QTDQ + Mean_QTDQ_B + Mean_QTDQ_D + Mean_QTDQ_E +
## Mean_QTDQ_F + Mean_QTDQ_G
##
## Df Sum of Sq RSS AIC
## <none> 0.58039 -208.80
## - Mean_QTDQ_E 1 0.06553 0.64592 -205.46
## - Mean_QTDQ_F 1 0.07457 0.65497 -204.76
## - Mean_A_QTDQ 1 0.07636 0.65675 -204.62
## - Mean_QTDQ_B 1 0.13066 0.71105 -200.65
## - Mean_QTDQ_D 1 0.29663 0.87702 -190.16
## - Mean_QTDQ_G 1 0.85871 1.43910 -165.40
##
## Call:
## lm(formula = Y_mean ~ Mean_A_QTDQ + Mean_QTDQ_B + Mean_QTDQ_D +
## Mean_QTDQ_E + Mean_QTDQ_F + Mean_QTDQ_G, data = QTDQ_Data)
##
## Coefficients:
## (Intercept) Mean_A_QTDQ Mean_QTDQ_B Mean_QTDQ_D Mean_QTDQ_E
## 0.3451 0.1066 0.1294 0.2045 0.0682
## Mean_QTDQ_F Mean_QTDQ_G
## 0.1159 0.2650
# Relative importance of every regressor in the full model (LMG metric);
# rela = TRUE normalises the shares so that they sum to 100%.
impo_QTDQ <- calc.relimp(model_QTDQ, type = "lmg", rela = TRUE)
impo_QTDQ
## Response variable: Y_mean
## Total response variance: 0.1273469
## Analysis based on 50 observations
##
## 8 Regressors:
## Mean_A_QTDQ Mean_QTDQ_B Mean_QTDQ_C Mean_QTDQ_D Mean_QTDQ_E Mean_QTDQ_F Mean_QTDQ_G Mean_QTDQ_H
## Proportion of variance explained by model: 91.07%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## Mean_A_QTDQ 0.053122739
## Mean_QTDQ_B 0.134249125
## Mean_QTDQ_C 0.105763902
## Mean_QTDQ_D 0.154453210
## Mean_QTDQ_E 0.064446935
## Mean_QTDQ_F 0.162608946
## Mean_QTDQ_G 0.320300292
## Mean_QTDQ_H 0.005054851
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs
## Mean_A_QTDQ 0.27480916 0.20221703 0.1678415 0.15092259 0.14059282
## Mean_QTDQ_B 0.45172283 0.35705553 0.2830300 0.22625450 0.18379404
## Mean_QTDQ_C 0.37412383 0.28082392 0.2069819 0.14891749 0.10374998
## Mean_QTDQ_D 0.42549669 0.34761234 0.2983046 0.26530134 0.24168326
## Mean_QTDQ_E 0.24833174 0.17360601 0.1286089 0.10122888 0.08443230
## Mean_QTDQ_F 0.46845048 0.37747160 0.3139253 0.26534952 0.22415534
## Mean_QTDQ_G 0.44368520 0.39052085 0.3504380 0.32026558 0.29794239
## Mean_QTDQ_H -0.04958867 -0.03956291 -0.0343143 -0.03172652 -0.03062605
## 6Xs 7Xs 8Xs
## Mean_A_QTDQ 0.13156637 0.12128510 0.10813258
## Mean_QTDQ_B 0.15286767 0.13077332 0.11512905
## Mean_QTDQ_C 0.06894059 0.04213296 0.02145706
## Mean_QTDQ_D 0.22396066 0.21046466 0.20056877
## Mean_QTDQ_E 0.07415934 0.06790855 0.06438212
## Mean_QTDQ_F 0.18624819 0.14971692 0.11362678
## Mean_QTDQ_G 0.28184125 0.27061892 0.26353678
## Mean_QTDQ_H -0.03030871 -0.03037805 -0.03049185
# Remove the two composites eliminated by the backward selection
# (Mean_QTDQ_C and Mean_QTDQ_H, i.e. columns 3 and 8 of QTDQ_Data).
# They are dropped by name rather than by position so the result does not
# silently change if the column order of QTDQ_Data is ever altered.
df_QTDQ <- QTDQ_Data[
  ,
  setdiff(names(QTDQ_Data), c("Mean_QTDQ_C", "Mean_QTDQ_H"))
]
# Refit on the reduced data and recompute the LMG relative importance,
# normalised to sum to 100% (rela = TRUE).
model_QTDQ1 <- lm(Y_mean ~ ., data = df_QTDQ)
impo_QTDQ1 <- calc.relimp(model_QTDQ1, type = "lmg", rela = TRUE)
impo_QTDQ1
## Response variable: Y_mean
## Total response variance: 0.1273469
## Analysis based on 50 observations
##
## 6 Regressors:
## Mean_A_QTDQ Mean_QTDQ_B Mean_QTDQ_D Mean_QTDQ_E Mean_QTDQ_F Mean_QTDQ_G
## Proportion of variance explained by model: 90.7%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## Mean_A_QTDQ 0.06241256
## Mean_QTDQ_B 0.16567546
## Mean_QTDQ_D 0.16684549
## Mean_QTDQ_E 0.07572431
## Mean_QTDQ_F 0.17835793
## Mean_QTDQ_G 0.35098424
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs 6Xs
## Mean_A_QTDQ 0.2748092 0.2044009 0.1751161 0.15573402 0.13415564 0.10660185
## Mean_QTDQ_B 0.4517228 0.3496473 0.2680080 0.20520074 0.16050315 0.12944075
## Mean_QTDQ_D 0.4254967 0.3370496 0.2811946 0.24379673 0.21896616 0.20447090
## Mean_QTDQ_E 0.2483317 0.1660253 0.1189328 0.09093746 0.07547558 0.06820128
## Mean_QTDQ_F 0.4684505 0.3627494 0.2876096 0.22674573 0.17073782 0.11586752
## Mean_QTDQ_G 0.4436852 0.3846847 0.3384098 0.30294466 0.27904175 0.26496110
# Refit the model on a separately stored dataset (QTDQ_final.csv).
# NOTE(review): unlike the other inputs, this file is read with the default
# comma separator (no sep = ";") - confirm that matches how it was exported.
df4 <- read.csv(
  "https://raw.githubusercontent.com/tuyenhavan/Statistics/Son-Data/QTDQ_final.csv",
  header = TRUE
)
# Regress Y_mean on all remaining columns and compute the LMG relative
# importance, normalised to sum to 100% (rela = TRUE).
model_QTDQ2 <- lm(Y_mean ~ ., data = df4)
impo_QTDQ2 <- calc.relimp(model_QTDQ2, type = "lmg", rela = TRUE)
impo_QTDQ2
## Response variable: Y_mean
## Total response variance: 0.1273469
## Analysis based on 50 observations
##
## 6 Regressors:
## Mean_A_QTDQ Mean_QTDQ_B Mean_QTDQ_D Mean_QTDQ_E Mean_QTDQ_F Mean_QTDQ_G
## Proportion of variance explained by model: 45.59%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## Mean_A_QTDQ 0.15490422
## Mean_QTDQ_B 0.15257769
## Mean_QTDQ_D 0.11459642
## Mean_QTDQ_E 0.41127868
## Mean_QTDQ_F 0.09047678
## Mean_QTDQ_G 0.07616621
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs
## Mean_A_QTDQ 0.27480916 0.24600396 0.21713050 0.18850831 0.16049168
## Mean_QTDQ_B 0.20799731 0.19186254 0.17682661 0.16284380 0.14982971
## Mean_QTDQ_D 0.11464968 0.10987097 0.10670065 0.10474759 0.10363788
## Mean_QTDQ_E 0.24833174 0.24286656 0.23879157 0.23598393 0.23425084
## Mean_QTDQ_F 0.09365385 0.09585833 0.09815882 0.10062208 0.10326898
## Mean_QTDQ_G 0.10545906 0.09382320 0.08397959 0.07595216 0.06970356
## 6Xs
## Mean_A_QTDQ 0.13348820
## Mean_QTDQ_B 0.13766271
## Mean_QTDQ_D 0.10306181
## Mean_QTDQ_E 0.23336925
## Mean_QTDQ_F 0.10608592
## Mean_QTDQ_G 0.06511803
# Tuc Duyen project: load the semicolon-separated survey data.
TD <- read.csv(
  "https://raw.githubusercontent.com/tuyenhavan/Statistics/Son-Data/Tuc_Duyen.csv",
  sep = ";"
)
# Preview the first rows to check the import.
head(TD)
Data manipulation
Group A
# Group A: items A1-A5 together with the response Y.TB.
library(tidyverse)
GroupA_TD <- TD %>%
  dplyr::select(A1:A5, Y.TB)
head(GroupA_TD)
# Fit the full linear model for this group, then prune it by backward
# stepwise selection; the selected model is auto-printed.
model_TD_A <- lm(Y.TB ~ ., data = GroupA_TD)
step(model_TD_A, direction = "backward")
## Start: AIC=-119.91
## Y.TB ~ A1 + A2 + A3 + A4 + A5
##
##
## Step: AIC=-119.91
## Y.TB ~ A1 + A2 + A3 + A5
##
##
## Step: AIC=-119.91
## Y.TB ~ A1 + A3 + A5
##
## Df Sum of Sq RSS AIC
## - A5 1 0.03103 3.9035 -121.51
## - A3 1 0.11448 3.9869 -120.45
## <none> 3.8724 -119.91
## - A1 1 1.68646 5.5589 -103.83
##
## Step: AIC=-121.51
## Y.TB ~ A1 + A3
##
## Df Sum of Sq RSS AIC
## - A3 1 0.08678 3.9902 -122.408
## <none> 3.9035 -121.508
## - A1 1 2.52922 6.4327 -98.531
##
## Step: AIC=-122.41
## Y.TB ~ A1
##
## Df Sum of Sq RSS AIC
## <none> 3.9902 -122.408
## - A1 1 5.226 9.2162 -82.553
##
## Call:
## lm(formula = Y.TB ~ A1, data = GroupA_TD)
##
## Coefficients:
## (Intercept) A1
## 1.4335 0.6003
# Only A1 was retained for Group A by the backward selection above.
# Mean_A_TD stays a one-column data frame whose column is still named A1.
Mean_A_TD <- GroupA_TD %>%
  dplyr::select(A1)
# Group B: items B6-B10 together with the response Y.TB.
GroupB_TD <- TD %>%
  dplyr::select(B6:B10, Y.TB)
head(GroupB_TD)
# Full linear model for the group, pruned by backward stepwise selection.
model_TD_B <- lm(Y.TB ~ ., data = GroupB_TD)
step(model_TD_B, direction = "backward")
## Start: AIC=-103.78
## Y.TB ~ B6 + B7 + B8 + B9 + B10
##
## Df Sum of Sq RSS AIC
## - B9 1 0.035637 4.9713 -105.42
## - B7 1 0.043742 4.9794 -105.33
## - B8 1 0.096939 5.0326 -104.80
## - B6 1 0.144114 5.0798 -104.34
## - B10 1 0.169626 5.1053 -104.09
## <none> 4.9357 -103.78
##
## Step: AIC=-105.42
## Y.TB ~ B6 + B7 + B8 + B10
##
## Df Sum of Sq RSS AIC
## - B7 1 0.044136 5.0155 -106.97
## - B8 1 0.061751 5.0331 -106.80
## - B10 1 0.168734 5.1401 -105.75
## <none> 4.9713 -105.42
## - B6 1 0.299166 5.2705 -104.50
##
## Step: AIC=-106.97
## Y.TB ~ B6 + B8 + B10
##
## Df Sum of Sq RSS AIC
## - B8 1 0.01895 5.0344 -108.79
## - B10 1 0.12473 5.1402 -107.75
## <none> 5.0155 -106.97
## - B6 1 0.80275 5.8182 -101.55
##
## Step: AIC=-108.79
## Y.TB ~ B6 + B10
##
## Df Sum of Sq RSS AIC
## - B10 1 0.11168 5.1461 -109.69
## <none> 5.0344 -108.79
## - B6 1 0.79964 5.8341 -103.42
##
## Step: AIC=-109.69
## Y.TB ~ B6
##
## Df Sum of Sq RSS AIC
## <none> 5.1461 -109.689
## - B6 1 4.0702 9.2162 -82.553
##
## Call:
## lm(formula = Y.TB ~ B6, data = GroupB_TD)
##
## Coefficients:
## (Intercept) B6
## 1.8617 0.5031
# Only B6 was retained for Group B by the backward selection above.
# Mean_TD_B stays a one-column data frame whose column is still named B6.
Mean_TD_B <- GroupB_TD %>%
  dplyr::select(B6)
# Group C: items C11-C14 together with the response Y.TB.
GroupC_TD <- TD %>%
  dplyr::select(C11:C14, Y.TB)
head(GroupC_TD)
# Full linear model for the group, pruned by backward stepwise selection.
model_TD_C <- lm(Y.TB ~ ., data = GroupC_TD)
step(model_TD_C, direction = "backward")
## Start: AIC=-130.26
## Y.TB ~ C11 + C12 + C13 + C14
##
## Df Sum of Sq RSS AIC
## - C12 1 0.005585 3.0306 -132.16
## - C14 1 0.023627 3.0486 -131.87
## - C11 1 0.026010 3.0510 -131.83
## <none> 3.0250 -130.26
## - C13 1 0.309794 3.3348 -127.38
##
## Step: AIC=-132.16
## Y.TB ~ C11 + C13 + C14
##
## Df Sum of Sq RSS AIC
## - C11 1 0.02268 3.0532 -133.79
## - C14 1 0.03460 3.0652 -133.60
## <none> 3.0306 -132.16
## - C13 1 0.32660 3.3572 -129.05
##
## Step: AIC=-133.79
## Y.TB ~ C13 + C14
##
## Df Sum of Sq RSS AIC
## <none> 3.0532 -133.79
## - C14 1 0.13692 3.1902 -133.60
## - C13 1 0.93804 3.9913 -122.40
##
## Call:
## lm(formula = Y.TB ~ C13 + C14, data = GroupC_TD)
##
## Coefficients:
## (Intercept) C13 C14
## 1.0679 0.5160 0.1685
# Keep the Group C items retained by the backward selection above (C13, C14)
# and average them row-wise into one composite score.
Group_TD_C <- GroupC_TD %>%
  dplyr::select(C13, C14)
Mean_TD_C <- rowMeans(Group_TD_C)
# Group D: items D15-D16 together with the response Y.TB.
GroupD_TD <- TD %>%
  dplyr::select(D15:D16, Y.TB)
head(GroupD_TD)
# Full linear model for the group, pruned by backward stepwise selection.
model_TD_D <- lm(Y.TB ~ ., data = GroupD_TD)
step(model_TD_D, direction = "backward")
## Start: AIC=-132.03
## Y.TB ~ D15 + D16
##
## Df Sum of Sq RSS AIC
## - D16 1 0.12063 3.2835 -132.16
## <none> 3.1629 -132.03
## - D15 1 2.23693 5.3998 -107.28
##
## Step: AIC=-132.16
## Y.TB ~ D15
##
## Df Sum of Sq RSS AIC
## <none> 3.2835 -132.156
## - D15 1 5.9328 9.2162 -82.553
##
## Call:
## lm(formula = Y.TB ~ D15, data = GroupD_TD)
##
## Coefficients:
## (Intercept) D15
## 2.053 0.511
# Only D15 was retained for Group D by the backward selection above.
# Mean_TD_D stays a one-column data frame whose column is still named D15.
Mean_TD_D <- GroupD_TD %>%
  dplyr::select(D15)
# Group E: items E17-E20 together with the response Y.TB.
GroupE_TD <- TD %>%
  dplyr::select(E17:E20, Y.TB)
head(GroupE_TD)
# Full linear model for the group, pruned by backward stepwise selection.
model_TD_E <- lm(Y.TB ~ ., data = GroupE_TD)
step(model_TD_E, direction = "backward")
## Start: AIC=-114.07
## Y.TB ~ E17 + E18 + E19 + E20
##
## Df Sum of Sq RSS AIC
## - E19 1 0.00758 4.1886 -115.98
## - E18 1 0.08016 4.2612 -115.12
## - E20 1 0.10782 4.2888 -114.80
## <none> 4.1810 -114.07
## - E17 1 1.17514 5.3561 -103.69
##
## Step: AIC=-115.98
## Y.TB ~ E17 + E18 + E20
##
## Df Sum of Sq RSS AIC
## - E18 1 0.07718 4.2658 -117.07
## - E20 1 0.12759 4.3162 -116.48
## <none> 4.1886 -115.98
## - E17 1 1.31521 5.5038 -104.33
##
## Step: AIC=-117.07
## Y.TB ~ E17 + E20
##
## Df Sum of Sq RSS AIC
## - E20 1 0.06456 4.3303 -118.32
## <none> 4.2658 -117.07
## - E17 1 1.39736 5.6631 -104.90
##
## Step: AIC=-118.32
## Y.TB ~ E17
##
## Df Sum of Sq RSS AIC
## <none> 4.3303 -118.319
## - E17 1 4.8859 9.2162 -82.553
##
## Call:
## lm(formula = Y.TB ~ E17, data = GroupE_TD)
##
## Coefficients:
## (Intercept) E17
## 1.4517 0.6252
# Only E17 was retained for Group E by the backward selection above.
# Mean_TD_E stays a one-column data frame whose column is still named E17.
Mean_TD_E <- GroupE_TD %>%
  dplyr::select(E17)
# Group F: items F21-F23 together with the response Y.TB.
GroupF_TD <- TD %>%
  dplyr::select(F21:F23, Y.TB)
head(GroupF_TD)
# Full linear model for the group, pruned by backward stepwise selection.
model_TD_F <- lm(Y.TB ~ ., data = GroupF_TD)
step(model_TD_F, direction = "backward")
## Start: AIC=-113.61
## Y.TB ~ F21 + F22 + F23
##
## Df Sum of Sq RSS AIC
## - F22 1 0.00005 4.3924 -115.61
## <none> 4.3924 -113.61
## - F21 1 0.31252 4.7049 -112.17
## - F23 1 1.29280 5.6852 -102.71
##
## Step: AIC=-115.61
## Y.TB ~ F21 + F23
##
## Df Sum of Sq RSS AIC
## <none> 4.3924 -115.61
## - F21 1 0.84959 5.2420 -108.77
## - F23 1 1.31113 5.7036 -104.55
##
## Call:
## lm(formula = Y.TB ~ F21 + F23, data = GroupF_TD)
##
## Coefficients:
## (Intercept) F21 F23
## 1.2040 0.3371 0.3862
# Keep the Group F items retained by the backward selection above (F21, F23)
# and average them row-wise into one composite score.
Group_TD_F <- GroupF_TD %>%
  dplyr::select(F21, F23)
Mean_TD_F <- rowMeans(Group_TD_F)
# Group G: items G24-G25 together with the response Y.TB.
GroupG_TD <- TD %>%
  dplyr::select(G24:G25, Y.TB)
head(GroupG_TD)
# Full linear model for the group, pruned by backward stepwise selection.
model_TD_G <- lm(Y.TB ~ ., data = GroupG_TD)
step(model_TD_G, direction = "backward")
## Start: AIC=-115.46
## Y.TB ~ G24 + G25
##
## Df Sum of Sq RSS AIC
## - G25 1 0.17929 4.5851 -115.46
## <none> 4.4058 -115.45
## - G24 1 1.48406 5.8898 -102.94
##
## Step: AIC=-115.46
## Y.TB ~ G24
##
## Df Sum of Sq RSS AIC
## <none> 4.5851 -115.461
## - G24 1 4.6312 9.2162 -82.553
##
## Call:
## lm(formula = Y.TB ~ G24, data = GroupG_TD)
##
## Coefficients:
## (Intercept) G24
## 2.1645 0.4922
# Only G24 was retained for Group G by the backward selection above.
# Mean_TD_G stays a one-column data frame whose column is still named G24.
Mean_TD_G <- GroupG_TD %>%
  dplyr::select(G24)
# Group H: items H26-H27 together with the response Y.TB.
GroupH_TD <- TD %>%
  dplyr::select(H26:H27, Y.TB)
head(GroupH_TD)
# Full linear model for the group, pruned by backward stepwise selection.
model_TD_H <- lm(Y.TB ~ ., data = GroupH_TD)
step(model_TD_H, direction = "backward")
## Start: AIC=-114.75
## Y.TB ~ H26 + H27
##
##
## Step: AIC=-114.75
## Y.TB ~ H26
##
## Df Sum of Sq RSS AIC
## <none> 4.6506 -114.751
## - H26 1 4.5656 9.2162 -82.553
##
## Call:
## lm(formula = Y.TB ~ H26, data = GroupH_TD)
##
## Coefficients:
## (Intercept) H26
## 2.6436 0.3874
# Only H26 was retained for Group H by the backward selection above.
# Mean_TD_H stays a one-column data frame whose column is still named H26.
Mean_TD_H <- GroupH_TD %>%
  dplyr::select(H26)
# Combine the group predictors with the response (stored as Y_mean).
# The one-column data frames keep their item names (A1, B6, D15, E17, G24,
# H26) inside data.frame(), so they are renamed to Mean_TD_* right after.
TD_Dataset <- data.frame(
  Mean_A_TD,
  Mean_TD_B,
  Mean_TD_C,
  Mean_TD_D,
  Mean_TD_E,
  Mean_TD_F,
  Mean_TD_G,
  Mean_TD_H,
  Y_mean = TD$Y.TB
)
TD_Dataset1 <- TD_Dataset %>%
  dplyr::select(
    Mean_TD_A = A1,
    Mean_TD_B = B6,
    Mean_TD_C,
    Mean_TD_D = D15,
    Mean_TD_E = E17,
    Mean_TD_F,
    Mean_TD_G = G24,
    Mean_TD_H = H26,
    Y_mean
  )
head(TD_Dataset1)
# Final model: drop rows containing NA, regress Y_mean on all predictors and
# prune the model by backward stepwise selection. In the output recorded
# below, the selection removes Mean_TD_C only.
TD_Data <- na.omit(TD_Dataset1)
model_TD <- lm(Y_mean ~ ., data = TD_Data)
step(model_TD, direction = "backward")
## Start: AIC=-225.95
## Y_mean ~ Mean_TD_A + Mean_TD_B + Mean_TD_C + Mean_TD_D + Mean_TD_E +
## Mean_TD_F + Mean_TD_G + Mean_TD_H
##
## Df Sum of Sq RSS AIC
## - Mean_TD_C 1 0.00612 0.38632 -227.16
## <none> 0.38020 -225.95
## - Mean_TD_G 1 0.02282 0.40302 -225.04
## - Mean_TD_D 1 0.03336 0.41356 -223.75
## - Mean_TD_B 1 0.06043 0.44062 -220.58
## - Mean_TD_A 1 0.07309 0.45329 -219.16
## - Mean_TD_H 1 0.13696 0.51716 -212.57
## - Mean_TD_F 1 0.14040 0.52060 -212.24
## - Mean_TD_E 1 0.32618 0.70638 -196.98
##
## Step: AIC=-227.16
## Y_mean ~ Mean_TD_A + Mean_TD_B + Mean_TD_D + Mean_TD_E + Mean_TD_F +
## Mean_TD_G + Mean_TD_H
##
## Df Sum of Sq RSS AIC
## <none> 0.38632 -227.16
## - Mean_TD_G 1 0.02376 0.41007 -226.17
## - Mean_TD_D 1 0.03746 0.42377 -224.53
## - Mean_TD_B 1 0.09819 0.48450 -217.83
## - Mean_TD_H 1 0.13512 0.52143 -214.16
## - Mean_TD_A 1 0.13663 0.52295 -214.01
## - Mean_TD_F 1 0.14232 0.52864 -213.47
## - Mean_TD_E 1 0.38812 0.77443 -194.38
##
## Call:
## lm(formula = Y_mean ~ Mean_TD_A + Mean_TD_B + Mean_TD_D + Mean_TD_E +
## Mean_TD_F + Mean_TD_G + Mean_TD_H, data = TD_Data)
##
## Coefficients:
## (Intercept) Mean_TD_A Mean_TD_B Mean_TD_D Mean_TD_E
## -0.11304 0.18352 0.13964 0.07061 0.24869
## Mean_TD_F Mean_TD_G Mean_TD_H
## 0.20402 0.06481 0.11670
# Relative importance of every regressor in the full model (LMG metric);
# rela = TRUE normalises the shares so that they sum to 100%.
impo_TD <- calc.relimp(model_TD, type = "lmg", rela = TRUE)
impo_TD
## Response variable: Y_mean
## Total response variance: 0.1880867
## Analysis based on 50 observations
##
## 8 Regressors:
## Mean_TD_A Mean_TD_B Mean_TD_C Mean_TD_D Mean_TD_E Mean_TD_F Mean_TD_G Mean_TD_H
## Proportion of variance explained by model: 95.87%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## Mean_TD_A 0.1266422
## Mean_TD_B 0.0975758
## Mean_TD_C 0.1428073
## Mean_TD_D 0.1414714
## Mean_TD_E 0.1453879
## Mean_TD_F 0.1221689
## Mean_TD_G 0.1062973
## Mean_TD_H 0.1176492
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs 6Xs
## Mean_TD_A 0.6003448 0.3996842 0.2960067 0.2394643 0.2060983 0.1847161
## Mean_TD_B 0.5031095 0.2747213 0.1964282 0.1579665 0.1369362 0.1266942
## Mean_TD_C 0.6552154 0.5059033 0.3879776 0.2946510 0.2175202 0.1516617
## Mean_TD_D 0.5110035 0.3682782 0.2734949 0.2068829 0.1574700 0.1188848
## Mean_TD_E 0.6252000 0.4219083 0.3382073 0.2961991 0.2729132 0.2588330
## Mean_TD_F 0.7258065 0.4748092 0.3568042 0.2903734 0.2519661 0.2295412
## Mean_TD_G 0.4921548 0.3114798 0.2206043 0.1664205 0.1291806 0.1017092
## Mean_TD_H 0.3874096 0.2488189 0.1910053 0.1602067 0.1424402 0.1316236
## 7Xs 8Xs
## Mean_TD_A 0.16969810 0.15891382
## Mean_TD_B 0.12300745 0.12232637
## Mean_TD_C 0.09660745 0.05296160
## Mean_TD_D 0.08892368 0.06713274
## Mean_TD_E 0.24865487 0.23890922
## Mean_TD_F 0.21492491 0.20273078
## Mean_TD_G 0.08058631 0.06356281
## Mean_TD_H 0.12407681 0.11756205
# Final model: remove the predictor eliminated by the backward selection
# (Mean_TD_C, i.e. column 3 of TD_Data). It is dropped by name rather than
# by position so the result does not silently change if the column order of
# TD_Data is ever altered.
TD_refit <- TD_Data[, setdiff(names(TD_Data), "Mean_TD_C")]
model_TD1 <- lm(Y_mean ~ ., data = TD_refit)
# LMG relative importance of the remaining regressors, normalised to sum to
# 100% (rela = TRUE).
impo_TD_refit <- calc.relimp(model_TD1, type = "lmg", rela = TRUE)
impo_TD_refit
## Response variable: Y_mean
## Total response variance: 0.1880867
## Analysis based on 50 observations
##
## 7 Regressors:
## Mean_TD_A Mean_TD_B Mean_TD_D Mean_TD_E Mean_TD_F Mean_TD_G Mean_TD_H
## Proportion of variance explained by model: 95.81%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## Mean_TD_A 0.1624554
## Mean_TD_B 0.1278846
## Mean_TD_D 0.1603391
## Mean_TD_E 0.1670193
## Mean_TD_F 0.1368335
## Mean_TD_G 0.1173167
## Mean_TD_H 0.1281513
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs 6Xs
## Mean_TD_A 0.6003448 0.4352097 0.3414507 0.2856278 0.2459157 0.21263520
## Mean_TD_B 0.5031095 0.3225128 0.2547261 0.2128091 0.1826776 0.16006619
## Mean_TD_D 0.5110035 0.3762770 0.2779644 0.2036527 0.1460169 0.10065140
## Mean_TD_E 0.6252000 0.4327898 0.3511939 0.3091512 0.2832601 0.26436449
## Mean_TD_F 0.7258065 0.4823348 0.3606533 0.2902007 0.2490740 0.22418490
## Mean_TD_G 0.4921548 0.3137445 0.2152868 0.1544833 0.1131839 0.08425795
## Mean_TD_H 0.3874096 0.2497038 0.1873404 0.1537643 0.1356577 0.12557784
## 7Xs
## Mean_TD_A 0.18352142
## Mean_TD_B 0.13964474
## Mean_TD_D 0.07061295
## Mean_TD_E 0.24868932
## Mean_TD_F 0.20401991
## Mean_TD_G 0.06480961
## Mean_TD_H 0.11669835
# New dataset: replace Mean_TD_H and Mean_TD_D with seeded random integers
# and refit the model.
# NOTE(review): this substitutes synthetic values for two observed
# predictors - confirm that this is intended for the reported results.
# NOTE(review): ceiling() of a draw from runif(min = 1, max = 5) lies in
# 2..5, so the value 1 cannot occur - confirm that is the intended range.
# The two assignments below also overwrite the global Mean_TD_H and Mean_TD_D
# objects created earlier; H is drawn before D, and that order determines
# the values obtained from the seed.
set.seed(123)
Mean_TD_H <- ceiling(runif(nrow(TD_Data), min = 1, max = 5))
Mean_TD_D <- ceiling(runif(nrow(TD_Data), min = 1, max = 5))
df2 <- TD_refit
df2$Mean_TD_H <- Mean_TD_H
df2$Mean_TD_D <- Mean_TD_D
# Refit on the modified data and compute the LMG relative importance,
# normalised to sum to 100% (rela = TRUE).
model_TD2 <- lm(Y_mean ~ ., data = df2)
impo_TD2 <- calc.relimp(model_TD2, type = "lmg", rela = TRUE)
impo_TD2
## Response variable: Y_mean
## Total response variance: 0.1880867
## Analysis based on 50 observations
##
## 7 Regressors:
## Mean_TD_A Mean_TD_B Mean_TD_D Mean_TD_E Mean_TD_F Mean_TD_G Mean_TD_H
## Proportion of variance explained by model: 93.13%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## Mean_TD_A 0.206076470
## Mean_TD_B 0.154277892
## Mean_TD_D 0.009705237
## Mean_TD_E 0.247715283
## Mean_TD_F 0.184206323
## Mean_TD_G 0.196481550
## Mean_TD_H 0.001537246
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs
## Mean_TD_A 0.600344828 0.49760003 0.410947750 0.340619526 0.287536117
## Mean_TD_B 0.503109453 0.37455308 0.289516526 0.232105862 0.188820087
## Mean_TD_D -0.056890299 -0.03696753 -0.025964555 -0.019180501 -0.013991506
## Mean_TD_E 0.625200000 0.51606236 0.443610821 0.397838656 0.370229246
## Mean_TD_F 0.725806452 0.57536394 0.460463453 0.369844281 0.293755582
## Mean_TD_G 0.492154812 0.39017249 0.319816500 0.271506042 0.238070254
## Mean_TD_H -0.002871779 0.00142721 0.003869779 0.005915035 0.007941248
## 6Xs 7Xs
## Mean_TD_A 0.253345871 0.240181526
## Mean_TD_B 0.148321371 0.101563683
## Mean_TD_D -0.008793949 -0.002025106
## Mean_TD_E 0.354717689 0.348600793
## Mean_TD_F 0.224137564 0.154501031
## Mean_TD_G 0.215481489 0.203588721
## Mean_TD_H 0.009900548 0.011710299
# Phan Dinh Phung project: load the semicolon-separated survey data.
PDP <- read.csv(
  "https://raw.githubusercontent.com/tuyenhavan/Statistics/Son-Data/P_DInh_pHung.csv",
  sep = ";"
)
# Preview the first rows to check the import.
head(PDP)
Data manipulation
Group A
# Group A: items A1-A5 together with the response Y.TB.
library(tidyverse)
library(dplyr)
GroupA_PDP <- PDP %>%
  dplyr::select(A1:A5, Y.TB)
head(GroupA_PDP)
# Fit the full linear model for this group, then prune it by backward
# stepwise selection; the selected model is auto-printed.
model_PDP_A <- lm(Y.TB ~ ., data = GroupA_PDP)
step(model_PDP_A, direction = "backward")
## Start: AIC=-88.25
## Y.TB ~ A1 + A2 + A3 + A4 + A5
##
## Df Sum of Sq RSS AIC
## - A2 1 0.00740 6.7402 -90.196
## - A4 1 0.07933 6.8122 -89.666
## - A3 1 0.10811 6.8409 -89.455
## - A1 1 0.21363 6.9465 -88.690
## <none> 6.7328 -88.251
## - A5 1 0.94620 7.6790 -83.676
##
## Step: AIC=-90.2
## Y.TB ~ A1 + A3 + A4 + A5
##
## Df Sum of Sq RSS AIC
## - A3 1 0.10126 6.8415 -91.451
## - A4 1 0.10141 6.8416 -91.450
## <none> 6.7402 -90.196
## - A1 1 0.52733 7.2676 -88.430
## - A5 1 0.94459 7.6848 -85.639
##
## Step: AIC=-91.45
## Y.TB ~ A1 + A4 + A5
##
## Df Sum of Sq RSS AIC
## <none> 6.8415 -91.451
## - A1 1 0.42866 7.2702 -90.412
## - A5 1 1.26593 8.1074 -84.962
## - A4 1 1.89965 8.7411 -81.199
##
## Call:
## lm(formula = Y.TB ~ A1 + A4 + A5, data = GroupA_PDP)
##
## Coefficients:
## (Intercept) A1 A4 A5
## 3.6848 0.2222 -0.5014 0.3817
Group_PDP_A<- GroupA_PDP %>% dplyr::select(A1,A4,A5) # Only variables A2, A3 and A5 should be selected
Mean_PDP_A<-rowMeans(Group_PDP_A)
# Group B: items B6-B10 together with the response Y.TB
GroupB_PDP <- dplyr::select(PDP, B6:B10, Y.TB)
head(GroupB_PDP)
# Full linear model of Y.TB on every Group B item
model_PDP_B <- lm(Y.TB ~ ., data = GroupB_PDP)
# Backward elimination by AIC; the printed trace shows which items are kept
step(model_PDP_B, direction = "backward")
## Start: AIC=-105.46
## Y.TB ~ B6 + B7 + B8 + B9 + B10
##
## Df Sum of Sq RSS AIC
## - B9 1 0.00006 4.7719 -107.464
## - B6 1 0.00107 4.7729 -107.453
## - B7 1 0.07136 4.8432 -106.722
## <none> 4.7719 -105.464
## - B8 1 0.35498 5.1268 -103.877
## - B10 1 0.90067 5.6725 -98.819
##
## Step: AIC=-107.46
## Y.TB ~ B6 + B7 + B8 + B10
##
## Df Sum of Sq RSS AIC
## - B6 1 0.00113 4.7730 -109.45
## - B7 1 0.07894 4.8509 -108.64
## <none> 4.7719 -107.46
## - B8 1 0.41175 5.1837 -105.33
## - B10 1 0.90407 5.6760 -100.79
##
## Step: AIC=-109.45
## Y.TB ~ B7 + B8 + B10
##
## Df Sum of Sq RSS AIC
## - B7 1 0.07819 4.8512 -110.639
## <none> 4.7730 -109.452
## - B8 1 0.42137 5.1944 -107.222
## - B10 1 2.61190 7.3849 -89.629
##
## Step: AIC=-110.64
## Y.TB ~ B8 + B10
##
## Df Sum of Sq RSS AIC
## <none> 4.8512 -110.64
## - B8 1 0.3434 5.1946 -109.22
## - B10 1 4.1416 8.9929 -81.78
##
## Call:
## lm(formula = Y.TB ~ B8 + B10, data = GroupB_PDP)
##
## Coefficients:
## (Intercept) B8 B10
## 2.5502 -0.1938 0.5441
# Keep the Group B items retained by the backward selection (B8 and B10)
Group_PDP_B <- dplyr::select(GroupB_PDP, B8, B10)
# Row-wise average of the retained items gives the Group B score
Mean_PDP_B <- rowMeans(Group_PDP_B)
# Group C: items C11-C14 together with the response Y.TB.
# The data now come from PDP (the Phan Dinh Phung data loaded for this
# section). The previous version selected from `TD`, which looks like a
# copy-paste slip; the recorded output below was produced with `TD` and must
# be regenerated by re-running the document.
# NOTE(review): assumes PDP has columns C11-C14 like the other groups - confirm.
GroupC_PDP <- PDP %>% dplyr::select(C11:C14, Y.TB)
head(GroupC_PDP)
# Full linear model of Y.TB on every Group C item
model_PDP_C <- lm(Y.TB ~ ., data = GroupC_PDP)
# Backward elimination by AIC; the printed trace shows which items are kept
step(model_PDP_C, direction = "backward")
## Start: AIC=-130.26
## Y.TB ~ C11 + C12 + C13 + C14
##
## Df Sum of Sq RSS AIC
## - C12 1 0.005585 3.0306 -132.16
## - C14 1 0.023627 3.0486 -131.87
## - C11 1 0.026010 3.0510 -131.83
## <none> 3.0250 -130.26
## - C13 1 0.309794 3.3348 -127.38
##
## Step: AIC=-132.16
## Y.TB ~ C11 + C13 + C14
##
## Df Sum of Sq RSS AIC
## - C11 1 0.02268 3.0532 -133.79
## - C14 1 0.03460 3.0652 -133.60
## <none> 3.0306 -132.16
## - C13 1 0.32660 3.3572 -129.05
##
## Step: AIC=-133.79
## Y.TB ~ C13 + C14
##
## Df Sum of Sq RSS AIC
## <none> 3.0532 -133.79
## - C14 1 0.13692 3.1902 -133.60
## - C13 1 0.93804 3.9913 -122.40
##
## Call:
## lm(formula = Y.TB ~ C13 + C14, data = GroupC_PDP)
##
## Coefficients:
## (Intercept) C13 C14
## 1.0679 0.5160 0.1685
# Keep the Group C items named in the recorded step() result (C13 and C14)
Group_PDP_C <- dplyr::select(GroupC_PDP, C13, C14)
# Row-wise average of the retained items gives the Group C score
Mean_PDP_C <- rowMeans(Group_PDP_C)
# Group D: items D15-D16 together with the response Y.TB.
# The data now come from PDP (the Phan Dinh Phung data loaded for this
# section). The previous version selected from `TD`, which looks like a
# copy-paste slip; the recorded output below was produced with `TD` and must
# be regenerated by re-running the document.
# NOTE(review): assumes PDP has columns D15-D16 like the other groups - confirm.
GroupD_PDP <- PDP %>% dplyr::select(D15:D16, Y.TB)
head(GroupD_PDP)
# Full linear model of Y.TB on both Group D items
model_PDP_D <- lm(Y.TB ~ ., data = GroupD_PDP)
# Backward elimination by AIC; the printed trace shows which items are kept
step(model_PDP_D, direction = "backward")
## Start: AIC=-132.03
## Y.TB ~ D15 + D16
##
## Df Sum of Sq RSS AIC
## - D16 1 0.12063 3.2835 -132.16
## <none> 3.1629 -132.03
## - D15 1 2.23693 5.3998 -107.28
##
## Step: AIC=-132.16
## Y.TB ~ D15
##
## Df Sum of Sq RSS AIC
## <none> 3.2835 -132.156
## - D15 1 5.9328 9.2162 -82.553
##
## Call:
## lm(formula = Y.TB ~ D15, data = GroupD_PDP)
##
## Coefficients:
## (Intercept) D15
## 2.053 0.511
# The recorded step() result keeps only D15, so nothing is averaged for
# Group D. Mean_PDP_D remains a one-column data frame whose column is named
# D15; the former `Mean_PDP_D <- Mean_PDP_D` self-assignment did nothing and
# has been removed.
Mean_PDP_D <- dplyr::select(GroupD_PDP, D15)
# Group E: items E17-E20 together with the response Y.TB
GroupE_PDP <- dplyr::select(PDP, E17:E20, Y.TB)
head(GroupE_PDP)
# Full linear model of Y.TB on every Group E item
model_PDP_E <- lm(Y.TB ~ ., data = GroupE_PDP)
# Backward elimination by AIC; the printed trace shows which items are kept
step(model_PDP_E, direction = "backward")
## Start: AIC=-89.5
## Y.TB ~ E17 + E18 + E19 + E20
##
## Df Sum of Sq RSS AIC
## - E18 1 0.016517 6.8507 -91.384
## - E20 1 0.027743 6.8619 -91.302
## - E19 1 0.102009 6.9362 -90.764
## <none> 6.8342 -89.504
## - E17 1 0.286524 7.1207 -89.451
##
## Step: AIC=-91.38
## Y.TB ~ E17 + E19 + E20
##
## Df Sum of Sq RSS AIC
## - E20 1 0.03122 6.8819 -93.156
## - E19 1 0.15943 7.0101 -92.234
## <none> 6.8507 -91.384
## - E17 1 0.34798 7.1987 -90.906
##
## Step: AIC=-93.16
## Y.TB ~ E17 + E19
##
## Df Sum of Sq RSS AIC
## <none> 6.8819 -93.156
## - E19 1 0.39651 7.2784 -92.356
## - E17 1 0.49428 7.3762 -91.688
##
## Call:
## lm(formula = Y.TB ~ E17 + E19, data = GroupE_PDP)
##
## Coefficients:
## (Intercept) E17 E19
## 2.0348 0.2707 0.1909
# Keep the Group E items retained by the backward selection (E17 and E19)
Group_PDP_E <- dplyr::select(GroupE_PDP, E17, E19)
# Row-wise average of the retained items gives the Group E score
Mean_PDP_E <- rowMeans(Group_PDP_E)
# Group F: items F21-F23 together with the response Y.TB
GroupF_PDP <- dplyr::select(PDP, F21:F23, Y.TB)
head(GroupF_PDP)
# Full linear model of Y.TB on every Group F item
model_PDP_F <- lm(Y.TB ~ ., data = GroupF_PDP)
# Backward elimination by AIC; the printed trace shows which items are kept
step(model_PDP_F, direction = "backward")
## Start: AIC=-102.14
## Y.TB ~ F21 + F22 + F23
##
## Df Sum of Sq RSS AIC
## - F21 1 0.00337 5.5278 -104.111
## <none> 5.5245 -102.142
## - F22 1 0.22710 5.7516 -102.128
## - F23 1 1.67741 7.2019 -90.884
##
## Step: AIC=-104.11
## Y.TB ~ F22 + F23
##
## Df Sum of Sq RSS AIC
## <none> 5.5278 -104.111
## - F22 1 0.30015 5.8280 -103.468
## - F23 1 2.40755 7.9354 -88.035
##
## Call:
## lm(formula = Y.TB ~ F22 + F23, data = GroupF_PDP)
##
## Coefficients:
## (Intercept) F22 F23
## 2.0694 0.1328 0.3797
# Keep the Group F items retained by the backward selection (F22 and F23)
Group_PDP_F <- dplyr::select(GroupF_PDP, F22, F23)
# Row-wise average of the retained items gives the Group F score
Mean_PDP_F <- rowMeans(Group_PDP_F)
# Group G: items G24-G25 together with the response Y.TB
GroupG_PDP <- dplyr::select(PDP, G24:G25, Y.TB)
head(GroupG_PDP)
# Full linear model of Y.TB on both Group G items
model_PDP_G <- lm(Y.TB ~ ., data = GroupG_PDP)
# Backward elimination by AIC; the printed trace shows which items are kept
step(model_PDP_G, direction = "backward")
## Start: AIC=-105.24
## Y.TB ~ G24 + G25
##
## Df Sum of Sq RSS AIC
## - G25 1 0.0048 5.4088 -107.200
## <none> 5.4040 -105.244
## - G24 1 1.9752 7.3793 -91.668
##
## Step: AIC=-107.2
## Y.TB ~ G24
##
## Df Sum of Sq RSS AIC
## <none> 5.4088 -107.200
## - G24 1 4.6455 10.0543 -78.201
##
## Call:
## lm(formula = Y.TB ~ G24, data = GroupG_PDP)
##
## Coefficients:
## (Intercept) G24
## 2.4821 0.4109
# The backward selection keeps only G24, so nothing is averaged for Group G.
# Mean_PDP_G remains a one-column data frame whose column is named G24; the
# former `Mean_PDP_G <- Mean_PDP_G` self-assignment did nothing and has been
# removed.
Mean_PDP_G <- dplyr::select(GroupG_PDP, G24)
# Group H: items H26-H27 together with the response Y.TB
GroupH_PDP <- dplyr::select(PDP, H26:H27, Y.TB)
head(GroupH_PDP)
# Full linear model of Y.TB on both Group H items
model_PDP_H <- lm(Y.TB ~ ., data = GroupH_PDP)
# Backward elimination by AIC; the printed trace shows which items are kept
step(model_PDP_H, direction = "backward")
## Start: AIC=-116.67
## Y.TB ~ H26 + H27
##
## Df Sum of Sq RSS AIC
## - H26 1 0.053511 4.3538 -118.05
## <none> 4.3003 -116.67
## - H27 1 0.234650 4.5349 -116.01
##
## Step: AIC=-118.05
## Y.TB ~ H27
##
## Df Sum of Sq RSS AIC
## <none> 4.3538 -118.049
## - H27 1 5.7005 10.0543 -78.201
##
## Call:
## lm(formula = Y.TB ~ H27, data = GroupH_PDP)
##
## Coefficients:
## (Intercept) H27
## 2.5920 0.4124
# The backward selection keeps only H27, so nothing is averaged for Group H.
# Mean_PDP_H remains a one-column data frame whose column is named H27; the
# former `Mean_PDP_H <- Mean_PDP_H` self-assignment did nothing and has been
# removed.
Mean_PDP_H <- dplyr::select(GroupH_PDP, H27)
# Dataset: one score per group plus the response, used for the final model.
# Groups D, G and H each kept a single item and are held as one-column data
# frames. data.frame() names such a component after its inner column (D15,
# G24, H27) even when the argument itself is named, so each one is flattened
# to a plain vector here; that makes the Mean_PDP_* names take effect and
# keeps the column naming consistent across all groups.
# Output recorded before this change still shows the old names D15/G24/H27.
PDP_Dataset <- data.frame(
  Mean_PDP_A,
  Mean_PDP_B,
  Mean_PDP_C,
  Mean_PDP_D = unlist(Mean_PDP_D, use.names = FALSE),
  Mean_PDP_E,
  Mean_PDP_F,
  Mean_PDP_G = unlist(Mean_PDP_G, use.names = FALSE),
  Mean_PDP_H = unlist(Mean_PDP_H, use.names = FALSE),
  Y_mean = PDP$Y.TB
)
Fit the model
# Final model
# Drop rows containing missing values before fitting
PDP_Data <- na.omit(PDP_Dataset)
# Regress the response on all group scores
model_PDP <- lm(Y_mean ~ ., data = PDP_Data)
# Backward elimination by AIC. In the recorded trace the Group D predictor is
# dropped first and the Group C predictor second; the Group B predictor is
# retained (the earlier comment wrongly said B should be removed).
step(model_PDP, direction = "backward")
## Start: AIC=-180.22
## Y_mean ~ Mean_PDP_A + Mean_PDP_B + Mean_PDP_C + D15 + Mean_PDP_E +
## Mean_PDP_F + G24 + H27
##
## Df Sum of Sq RSS AIC
## - D15 1 0.01224 0.96119 -181.58
## - Mean_PDP_C 1 0.03457 0.98352 -180.43
## <none> 0.94895 -180.22
## - Mean_PDP_B 1 0.14655 1.09550 -175.04
## - G24 1 0.19602 1.14497 -172.83
## - Mean_PDP_F 1 0.39067 1.33963 -164.98
## - Mean_PDP_A 1 0.41020 1.35916 -164.26
## - Mean_PDP_E 1 0.72066 1.66961 -153.97
## - H27 1 0.75496 1.70391 -152.96
##
## Step: AIC=-181.58
## Y_mean ~ Mean_PDP_A + Mean_PDP_B + Mean_PDP_C + Mean_PDP_E +
## Mean_PDP_F + G24 + H27
##
## Df Sum of Sq RSS AIC
## - Mean_PDP_C 1 0.02236 0.98355 -182.43
## <none> 0.96119 -181.58
## - Mean_PDP_B 1 0.15908 1.12027 -175.92
## - G24 1 0.18969 1.15087 -174.57
## - Mean_PDP_F 1 0.38529 1.34648 -166.73
## - Mean_PDP_A 1 0.40462 1.36581 -166.01
## - Mean_PDP_E 1 0.71145 1.67264 -155.88
## - H27 1 0.81079 1.77198 -153.00
##
## Step: AIC=-182.43
## Y_mean ~ Mean_PDP_A + Mean_PDP_B + Mean_PDP_E + Mean_PDP_F +
## G24 + H27
##
## Df Sum of Sq RSS AIC
## <none> 0.98355 -182.43
## - G24 1 0.17842 1.16197 -176.09
## - Mean_PDP_B 1 0.17869 1.16224 -176.08
## - Mean_PDP_F 1 0.37009 1.35364 -168.46
## - Mean_PDP_A 1 0.38227 1.36581 -168.01
## - Mean_PDP_E 1 0.75331 1.73686 -156.00
## - H27 1 0.81388 1.79742 -154.28
##
## Call:
## lm(formula = Y_mean ~ Mean_PDP_A + Mean_PDP_B + Mean_PDP_E +
## Mean_PDP_F + G24 + H27, data = PDP_Data)
##
## Coefficients:
## (Intercept) Mean_PDP_A Mean_PDP_B Mean_PDP_E Mean_PDP_F
## -0.3663 0.2065 0.1346 0.2414 0.1934
## G24 H27
## 0.1135 0.2117
# Relative importance of each predictor using the LMG metric; rela = TRUE
# rescales the metrics so that they sum to 100%.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
# NOTE(review): this uses the full model `model_PDP`, not the reduced model
# chosen by step() above - confirm that is intended.
impo_PDP <- calc.relimp(model_PDP, type = "lmg", rela = TRUE)
impo_PDP
## Response variable: Y_mean
## Total response variance: 0.20519
## Analysis based on 50 observations
##
## 8 Regressors:
## Mean_PDP_A Mean_PDP_B Mean_PDP_C D15 Mean_PDP_E Mean_PDP_F G24 H27
## Proportion of variance explained by model: 90.56%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## Mean_PDP_A 0.059589822
## Mean_PDP_B 0.138672800
## Mean_PDP_C 0.003306612
## D15 0.005688214
## Mean_PDP_E 0.165231391
## Mean_PDP_F 0.170780541
## G24 0.183366289
## H27 0.273364332
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs
## Mean_PDP_A 0.20886244 0.19796933 0.19784436 0.20211733 0.20731607
## Mean_PDP_B 0.44396617 0.37964247 0.32374316 0.27528708 0.23281558
## Mean_PDP_C 0.01386970 -0.01294075 -0.02907707 -0.03962952 -0.04713349
## D15 0.07926056 0.05873302 0.04459629 0.03535331 0.02988971
## Mean_PDP_E 0.45064008 0.38648469 0.33839679 0.30323170 0.27801884
## Mean_PDP_F 0.51100000 0.42621373 0.35697856 0.30245871 0.26119641
## G24 0.41085756 0.34470342 0.28825529 0.24071304 0.20098812
## H27 0.41238663 0.36588200 0.32730182 0.29555524 0.26924442
## 6Xs 7Xs 8Xs
## Mean_PDP_A 0.21194198 0.21589121 0.22007635
## Mean_PDP_B 0.19468402 0.15906930 0.12391413
## Mean_PDP_C -0.05274442 -0.05757579 -0.06320357
## D15 0.02717544 0.02678714 0.02906971
## Mean_PDP_E 0.26002039 0.24714827 0.23841839
## Mean_PDP_F 0.23141825 0.21142073 0.20001757
## G24 0.16801993 0.14105907 0.11996232
## H27 0.24684441 0.22667255 0.20686183