x <- rnorm(100, 0, 1)
z <- rnorm(100, 0, 1)
y <- x + z + rnorm(100, 0, 0.5)
data1 <- data.frame(x = x, y = y, z = z)
summary(data1)
## x y z
## Min. :-2.5134 Min. :-3.3466 Min. :-3.039759
## 1st Qu.:-0.7929 1st Qu.:-1.0686 1st Qu.:-0.840445
## Median :-0.1576 Median :-0.1236 Median :-0.091666
## Mean :-0.1785 Mean :-0.1455 Mean : 0.009503
## 3rd Qu.: 0.4703 3rd Qu.: 0.7994 3rd Qu.: 0.712670
## Max. : 1.8910 Max. : 3.3894 Max. : 2.814655
model1 <- lm(data1$y ~ data1$x)
model2 <- lm(data1$y ~ data1$x + data1$z)
anova(model1, model2)
## Analysis of Variance Table
##
## Model 1: data1$y ~ data1$x
## Model 2: data1$y ~ data1$x + data1$z
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 98 144.177
## 2 97 25.053 1 119.12 461.21 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
x[c(10, 12)] <- NA
z[c(50, 55)] <- NA
data2 <- data.frame(x = x, y = y, z = z)
summary(data2)
## x y z
## Min. :-2.5134 Min. :-3.3466 Min. :-3.03976
## 1st Qu.:-0.7553 1st Qu.:-1.0686 1st Qu.:-0.80670
## Median :-0.1576 Median :-0.1236 Median :-0.08828
## Mean :-0.1736 Mean :-0.1455 Mean : 0.02392
## 3rd Qu.: 0.4731 3rd Qu.: 0.7994 3rd Qu.: 0.75660
## Max. : 1.8910 Max. : 3.3894 Max. : 2.81466
## NA's :2 NA's :2
model1na <- lm(data2$y ~ data2$x)
model2na <- lm(data2$y ~ data2$x + data2$z)
anova(model1na, model2na)
## Error in anova.lmlist(object, ...): models were not all fitted to the same size of dataset
summary(model1na)
##
## Call:
## lm(formula = data2$y ~ data2$x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2925 -0.8175 -0.0368 0.6634 3.0458
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.03774 0.12363 0.305 0.761
## data2$x 0.87637 0.12203 7.182 1.47e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.206 on 96 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.3495, Adjusted R-squared: 0.3427
## F-statistic: 51.58 on 1 and 96 DF, p-value: 1.465e-10
summary(model2na)
##
## Call:
## lm(formula = data2$y ~ data2$x + data2$z)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2789 -0.3292 0.0869 0.3387 1.3246
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.02695 0.05280 0.51 0.611
## data2$x 0.97446 0.05196 18.76 <2e-16 ***
## data2$z 0.96974 0.04674 20.75 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5089 on 93 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.8853, Adjusted R-squared: 0.8828
## F-statistic: 358.9 on 2 and 93 DF, p-value: < 2.2e-16
Delete all observations with NA’s using liswise deletion (i.e. you are going to delete all observations with at least one NA in any of the variables that you are going to use).
datanona <- na.omit(data2)
model1nona <- lm(datanona$y ~ datanona$x)
model2nona <- lm(datanona$y ~ datanona$x + datanona$z)
anova(model1nona, model2nona)
## Analysis of Variance Table
##
## Model 1: datanona$y ~ datanona$x
## Model 2: datanona$y ~ datanona$x + datanona$z
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 94 135.533
## 2 93 24.083 1 111.45 430.38 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Everything works!
P.S. If you are going to delete NA’s in a big dataset (like ESS) you should select variables first. Otherwise na.omit() will leave you with an empty dataset because it is highly likely that every respondent in a dataset has at least one NA value.