set.seed(1001)

# Simulate three integer predictors taking values 1-4, and a response whose
# mean depends on all three predictors
x1 = sample(1:4, 100, replace = TRUE)
x2 = sample(1:4, 100, replace = TRUE)
x3 = sample(1:4, 100, replace = TRUE)
y = rbinom(100, x1, .3) + rbinom(100, x2, .9) + rpois(100, x3/5)
linreg = lm(y ~ x1 + x2 + x3)
summary(linreg)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3298 -0.6719 -0.0072  0.5909  4.2488 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.12224    0.44576   0.274  0.78449    
## x1           0.30690    0.09331   3.289  0.00141 ** 
## x2           0.88596    0.09978   8.879 3.81e-14 ***
## x3           0.14536    0.09415   1.544  0.12588    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.028 on 96 degrees of freedom
## Multiple R-squared:  0.4791, Adjusted R-squared:  0.4629 
## F-statistic: 29.44 on 3 and 96 DF,  p-value: 1.395e-13
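# Quick sanity check (an illustrative aside, not part of the model-selection task):
# by construction E[y | x1, x2, x3] = 0.3*x1 + 0.9*x2 + 0.2*x3, so the fitted
# slopes should sit near the generating values
true_slopes = c(x1 = 0.3, x2 = 0.9, x3 = 0.2)
round(coef(linreg)[-1] - true_slopes, 3)  # differences should be small relative to the standard errors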
#Task: FIND A NESTED MODEL IN LINREG THAT IMPROVES F STATISTIC

#From the above, x1 and x2 are significant variables (p-value < 0.05) but x3 is insignificant, so maybe we should remove x3 first
#But before that, let's look at each predictor individually

library(lme4)
## Warning: package 'lme4' was built under R version 3.6.3
## Loading required package: Matrix
library(nlme)
## Warning: package 'nlme' was built under R version 3.6.3
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:lme4':
## 
##     lmList
library(lmerTest)
## Warning: package 'lmerTest' was built under R version 3.6.3
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
### Model with insignificant parameters ###
linreg1 = lm(y ~ x1)
summary(linreg1)
## 
## Call:
## lm(formula = y ~ x1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5355 -1.1581 -0.1581  0.6532  3.4645 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.9694     0.3541   8.386 3.77e-13 ***
## x1            0.1887     0.1252   1.507    0.135    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.394 on 98 degrees of freedom
## Multiple R-squared:  0.02266,    Adjusted R-squared:  0.01269 
## F-statistic: 2.273 on 1 and 98 DF,  p-value: 0.1349
### Model with significant parameters ###
linreg2 = lm(y ~ x2)
summary(linreg2)
## 
## Call:
## lm(formula = y ~ x2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0739 -0.9319 -0.0739  0.8195  3.7840 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.3581     0.2763   4.914 3.57e-06 ***
## x2            0.8579     0.1038   8.266 6.83e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.082 on 98 degrees of freedom
## Multiple R-squared:  0.4108, Adjusted R-squared:  0.4048 
## F-statistic: 68.33 on 1 and 98 DF,  p-value: 6.829e-13
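# Aside (illustrative): x2 alone already recovers most of the R-squared of the full model
c(full = summary(linreg)$r.squared, x2_only = summary(linreg2)$r.squared)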
### Model with insignificant parameters ###
linreg3 = lm(y ~ x3)
summary(linreg3)
## 
## Call:
## lm(formula = y ~ x3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7519 -1.1521 -0.1521  0.8479  3.2481 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.9522     0.3506   8.421 3.18e-13 ***
## x3            0.1999     0.1267   1.578    0.118    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.392 on 98 degrees of freedom
## Multiple R-squared:  0.02479,    Adjusted R-squared:  0.01484 
## F-statistic: 2.491 on 1 and 98 DF,  p-value: 0.1177
#From the above we again see that x2 is significant while x1 and x3 are insignificant on their own
#Since x3 is insignificant both in the full model and individually, let's first drop x3 and see how the model performs

linreg12 = lm(y ~ x1 + x2)
summary(linreg12)
## 
## Call:
## lm(formula = y ~ x1 + x2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2698 -0.6763 -0.0734  0.6225  4.3194 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.48417    0.38185   1.268  0.20785    
## x1           0.29745    0.09378   3.172  0.00203 ** 
## x2           0.89896    0.10013   8.978 2.16e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.035 on 97 degrees of freedom
## Multiple R-squared:  0.4662, Adjusted R-squared:  0.4552 
## F-statistic: 42.36 on 2 and 97 DF,  p-value: 5.999e-14
#Now let's compare our new model with the original
AIC(linreg,linreg12)
##          df      AIC
## linreg    5 295.2633
## linreg12  4 295.7162
anova(linreg,linreg12)
## Analysis of Variance Table
## 
## Model 1: y ~ x1 + x2 + x3
## Model 2: y ~ x1 + x2
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     96 101.49                           
## 2     97 104.01 -1   -2.5201 2.3839 0.1259
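# Side note (an illustrative check on the fits above): when a single term is dropped,
# the partial F statistic from anova() equals the squared t statistic for that term in
# the larger model, so F = 2.3839 here restates t = 1.544 for x3 in summary(linreg)
t_x3 = summary(linreg)$coefficients["x3", "t value"]
t_x3^2  # should reproduce the F value reported by anova(linreg, linreg12)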
#According to the above, removing x3 is justified for now (the partial F test gives p = 0.1259)
#Next, let's see how x1 and x2 each contribute to the reduced model:

AIC(linreg12,linreg1)
##          df      AIC
## linreg12  4 295.7162
## linreg1   3 354.1961
anova(linreg12,linreg1)
## Analysis of Variance Table
## 
## Model 1: y ~ x1 + x2
## Model 2: y ~ x1
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1     97 104.01                                  
## 2     98 190.42 -1   -86.418 80.596 2.162e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(linreg12,linreg2)
##          df      AIC
## linreg12  4 295.7162
## linreg2   3 303.5855
anova(linreg12,linreg2)
## Analysis of Variance Table
## 
## Model 1: y ~ x1 + x2
## Model 2: y ~ x2
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)   
## 1     97 104.01                                
## 2     98 114.80 -1   -10.788 10.062 0.002028 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#x2 appears to be the stronger predictor; let's compare the model containing only x2 against the original model, alongside the x1 + x2 model we just created
AIC(linreg,linreg2)
##         df      AIC
## linreg   5 295.2633
## linreg2  3 303.5855
anova(linreg,linreg2)
## Analysis of Variance Table
## 
## Model 1: y ~ x1 + x2 + x3
## Model 2: y ~ x2
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)   
## 1     96 101.49                              
## 2     98 114.80 -2   -13.309 6.2945 0.0027 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(linreg,linreg12)
##          df      AIC
## linreg    5 295.2633
## linreg12  4 295.7162
anova(linreg,linreg12)
## Analysis of Variance Table
## 
## Model 1: y ~ x1 + x2 + x3
## Model 2: y ~ x1 + x2
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     96 101.49                           
## 2     97 104.01 -1   -2.5201 2.3839 0.1259
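# To make the F-statistic comparison explicit, a small helper sketch using the models above:
fstats = sapply(list(linreg = linreg, linreg12 = linreg12, linreg2 = linreg2),
                function(m) summary(m)$fstatistic[["value"]])
fstats  # the y ~ x2 model has the largest overall F statistic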
#Judging by the overall F statistics, the model with only x2 is the most efficient for our task, so let's go ahead and remove x1 as well
#Before we make our final decision, let's check the correlations between the predictors

cr12=cor(x1,x2,method="pearson")
cr12 = round(cr12, digits = 3)
cr12
## [1] -0.129
cr13=cor(x1,x3,method="pearson")
cr13 = round(cr13, digits = 3)
cr13
## [1] -0.077
cr23=cor(x2,x3,method="pearson")
cr23 = round(cr23, digits = 3)
cr23
## [1] 0.093
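# Equivalently, all pairwise correlations can be computed in one call (an alternative sketch):
round(cor(cbind(x1, x2, x3)), 3)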
#Not much correlation was found among our variables

#Summary:
#Compared with the original model, which contained both significant and insignificant parameters,
#our final model, containing only the significant parameter x2, performed better in terms of the F statistic.
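# For comparison, an automated backward search could also be tried (a sketch only; note
# that stats::step selects by AIC, and the AIC() output above slightly favors keeping x3,
# so its choice need not match the largest-F criterion used here). The stats:: prefix
# avoids the step() masking by lmerTest noted when the packages were loaded.
stats::step(linreg, direction = "backward", trace = 0)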