set.seed(1001)
x1 = sample(1:4, 100, replace = TRUE)
x2 = sample(1:4, 100, replace = TRUE)
x3 = sample(1:4, 100, replace = TRUE)
y = rbinom(100, x1, .3) + rbinom(100, x2, .9) + rpois(100, x3/5)
linreg = lm(y ~ x1 + x2 + x3)
summary(linreg)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3298 -0.6719 -0.0072 0.5909 4.2488
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.12224 0.44576 0.274 0.78449
## x1 0.30690 0.09331 3.289 0.00141 **
## x2 0.88596 0.09978 8.879 3.81e-14 ***
## x3 0.14536 0.09415 1.544 0.12588
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.028 on 96 degrees of freedom
## Multiple R-squared: 0.4791, Adjusted R-squared: 0.4629
## F-statistic: 29.44 on 3 and 96 DF, p-value: 1.395e-13
#Task: FIND A NESTED MODEL IN LINREG THAT IMPROVES F STATISTIC
#From the above, x1 and x2 are significant variables (p-value < 0.05) but x3 is insignificant (p-value = 0.126); maybe we should remove x3 first
#But first of all, let's look at each variable individually
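#As a reference point for the task, the baseline F statistic of the full model can be
#pulled directly from its summary (a quick sketch; summary.lm() stores it as a named vector):
summary(linreg)$fstatistic  #value = 29.44 on 3 and 96 df, per the summary above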
library(lme4)
## Warning: package 'lme4' was built under R version 3.6.3
## Loading required package: Matrix
library(nlme)
## Warning: package 'nlme' was built under R version 3.6.3
##
## Attaching package: 'nlme'
## The following object is masked from 'package:lme4':
##
## lmList
library(lmerTest)
## Warning: package 'lmerTest' was built under R version 3.6.3
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
### Model with x1 only (insignificant) ###
linreg1 = lm(y ~ x1)
summary(linreg1)
##
## Call:
## lm(formula = y ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5355 -1.1581 -0.1581 0.6532 3.4645
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.9694 0.3541 8.386 3.77e-13 ***
## x1 0.1887 0.1252 1.507 0.135
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.394 on 98 degrees of freedom
## Multiple R-squared: 0.02266, Adjusted R-squared: 0.01269
## F-statistic: 2.273 on 1 and 98 DF, p-value: 0.1349
### Model with x2 only (significant) ###
linreg2 = lm(y ~ x2)
summary(linreg2)
##
## Call:
## lm(formula = y ~ x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0739 -0.9319 -0.0739 0.8195 3.7840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3581 0.2763 4.914 3.57e-06 ***
## x2 0.8579 0.1038 8.266 6.83e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.082 on 98 degrees of freedom
## Multiple R-squared: 0.4108, Adjusted R-squared: 0.4048
## F-statistic: 68.33 on 1 and 98 DF, p-value: 6.829e-13
### Model with x3 only (insignificant) ###
linreg3 = lm(y ~ x3)
summary(linreg3)
##
## Call:
## lm(formula = y ~ x3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7519 -1.1521 -0.1521 0.8479 3.2481
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.9522 0.3506 8.421 3.18e-13 ***
## x3 0.1999 0.1267 1.578 0.118
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.392 on 98 degrees of freedom
## Multiple R-squared: 0.02479, Adjusted R-squared: 0.01484
## F-statistic: 2.491 on 1 and 98 DF, p-value: 0.1177
#From the above we again see that x2 is significant on its own, while x1 and x3 are not
#Since x3 is insignificant both in the full model and on its own, let's delete x3 first and see how the model works out
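#(Sketch) The same nested model could also be built with update(), which makes the
#nesting relationship to linreg explicit; below we simply refit it with lm():
update(linreg, . ~ . - x3)  #equivalent to lm(y ~ x1 + x2), shown for illustration only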
linreg12 = lm(y ~ x1 + x2)
summary(linreg12)
##
## Call:
## lm(formula = y ~ x1 + x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2698 -0.6763 -0.0734 0.6225 4.3194
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.48417 0.38185 1.268 0.20785
## x1 0.29745 0.09378 3.172 0.00203 **
## x2 0.89896 0.10013 8.978 2.16e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.035 on 97 degrees of freedom
## Multiple R-squared: 0.4662, Adjusted R-squared: 0.4552
## F-statistic: 42.36 on 2 and 97 DF, p-value: 5.999e-14
#Now let's compare our new model with the original
AIC(linreg,linreg12)
## df AIC
## linreg 5 295.2633
## linreg12 4 295.7162
anova(linreg,linreg12)
## Analysis of Variance Table
##
## Model 1: y ~ x1 + x2 + x3
## Model 2: y ~ x1 + x2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 96 101.49
## 2 97 104.01 -1 -2.5201 2.3839 0.1259
#Okay, the anova above shows that x3 does not significantly improve the fit (p = 0.126) and the AIC barely changes, so let's go ahead and remove x3 for now
#Next, let's see how x1 and x2 each contribute to our new model:
AIC(linreg12,linreg1)
## df AIC
## linreg12 4 295.7162
## linreg1 3 354.1961
anova(linreg12,linreg1)
## Analysis of Variance Table
##
## Model 1: y ~ x1 + x2
## Model 2: y ~ x1
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 97 104.01
## 2 98 190.42 -1 -86.418 80.596 2.162e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(linreg12,linreg2)
## df AIC
## linreg12 4 295.7162
## linreg2 3 303.5855
anova(linreg12,linreg2)
## Analysis of Variance Table
##
## Model 1: y ~ x1 + x2
## Model 2: y ~ x2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 97 104.01
## 2 98 114.80 -1 -10.788 10.062 0.002028 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#x2 seems to be the stronger predictor. Let's now compare both the x2-only model and the x1 + x2 model we just created against the original full model
AIC(linreg,linreg2)
## df AIC
## linreg 5 295.2633
## linreg2 3 303.5855
anova(linreg,linreg2)
## Analysis of Variance Table
##
## Model 1: y ~ x1 + x2 + x3
## Model 2: y ~ x2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 96 101.49
## 2 98 114.80 -2 -13.309 6.2945 0.0027 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(linreg,linreg12)
## df AIC
## linreg 5 295.2633
## linreg12 4 295.7162
anova(linreg,linreg12)
## Analysis of Variance Table
##
## Model 1: y ~ x1 + x2 + x3
## Model 2: y ~ x1 + x2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 96 101.49
## 2 97 104.01 -1 -2.5201 2.3839 0.1259
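#(Sketch) The same nested comparisons can also be run as one sequential anova,
#going from the smallest model up to the full model:
anova(linreg2, linreg12, linreg)  #tests adding x1 to the x2-only model, then adding x3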
#The AIC and anova actually favor keeping x1, but the task is to improve the F statistic,
#and the x2-only model has by far the largest F statistic (68.33 vs 29.44 for the full model),
#so let's go ahead and remove x1 as well
#Before we make our final decision, let's check the correlations between the variables
cr12 = cor(x1, x2, method = "pearson")
cr12 = round(cr12, digits = 3)
cr12
## [1] -0.129
cr13 = cor(x1, x3, method = "pearson")
cr13 = round(cr13, digits = 3)
cr13
## [1] -0.077
cr23 = cor(x2, x3, method = "pearson")
cr23 = round(cr23, digits = 3)
cr23
## [1] 0.093
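#(Sketch) The same pairwise correlations can also be read off a single correlation matrix:
round(cor(cbind(x1, x2, x3)), 3)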
#Not much correlation was found among our variables.
#Summary:
#Compared with the original model, which contains both significant and insignificant parameters,
#our final model, containing only the significant parameter x2, performed better on the task,
#improving the F statistic from 29.44 to 68.33.
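#(Sketch) As a final check of the task, the F statistics of all candidate models can be
#collected in one pass; per the summaries above they are roughly 29.4 (full),
#42.4 (x1 + x2) and 68.3 (x2 only):
sapply(list(full = linreg, x1_x2 = linreg12, x2_only = linreg2),
       function(m) summary(m)$fstatistic[["value"]])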