require(Hmisc)
## Loading required package: Hmisc
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
## Hmisc library by Frank E Harrell Jr
##
## Type library(help='Hmisc'), ?Overview, or ?Hmisc.Overview')
## to see overall documentation.
##
##
## Attaching package: 'Hmisc'
##
## The following object is masked from 'package:survival':
##
## untangle.specials
##
## The following object is masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
require(lattice)
## Loading required package: lattice
mydata <- spss.get("/Users/Sofia/Desktop/anaemia.sav") #import .sav data
## Loading required package: foreign
## Warning: duplicated levels in factors are deprecated
View(mydata)
summary(mydata) #summarize the data
## Warning: duplicated levels in factors are deprecated
## age region educ ceb
## 15-19:139 Central :200 no education:173 0 :134
## 20-24:263 Eastern :210 primary :495 1 :144
## 25-29:193 Northern :108 secondary :129 2-3:220
## 30-34:112 Western :242 2-3: 0
## 35-39: 65 Not de jure resident: 37 4-5:161
## 40-44: 23 4-5: 0
## 45-49: 2 6+ :138
## wt anaema
## Min. : 37.7 not anaemic:493
## 1st Qu.: 52.0 anaemic :304
## Median : 56.8
## Mean : 57.6
## 3rd Qu.: 62.1
## Max. :152.6
##
str(mydata) ##tells the structure of the data
## 'data.frame': 797 obs. of 6 variables:
## $ age : Factor w/ 7 levels "15-19","20-24",..: 1 2 4 5 5 2 4 2 7 2 ...
## ..- attr(*, "label")= Named chr "Age in 5-year groups"
## .. ..- attr(*, "names")= chr "age"
## $ region: Factor w/ 5 levels "Central","Eastern",..: 3 3 3 3 3 3 3 3 3 3 ...
## ..- attr(*, "label")= Named chr "De jure region of residence"
## .. ..- attr(*, "names")= chr "region"
## $ educ : Factor w/ 3 levels "no education",..: 2 2 2 2 3 2 2 1 2 2 ...
## ..- attr(*, "label")= Named chr "Educational attainment"
## .. ..- attr(*, "names")= chr "educ"
## $ ceb : Factor w/ 7 levels "0","1","2-3",..: 2 3 5 7 5 3 7 2 7 3 ...
## ..- attr(*, "label")= Named chr "Total children ever born"
## .. ..- attr(*, "names")= chr "ceb"
## $ wt :Class 'labelled' atomic [1:797] 42.4 55.7 59.2 71 53.8 56.3 66.4 48.8 37.7 55.1 ...
## .. ..- attr(*, "label")= Named chr "Respondent's weight (kilos)"
## .. .. ..- attr(*, "names")= chr "wt"
## $ anaema: Factor w/ 2 levels "not anaemic",..: 2 1 1 1 1 2 1 2 2 1 ...
## ..- attr(*, "label")= Named chr "Anaemia level"
## .. ..- attr(*, "names")= chr "anaema"
sapply(mydata, class)
## $age
## [1] "labelled" "factor"
##
## $region
## [1] "labelled" "factor"
##
## $educ
## [1] "labelled" "factor"
##
## $ceb
## [1] "labelled" "factor"
##
## $wt
## [1] "labelled"
##
## $anaema
## [1] "labelled" "factor"
summary(is.na(mydata))
## age region educ ceb
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:797 FALSE:797 FALSE:797 FALSE:797
## NA's :0 NA's :0 NA's :0 NA's :0
## wt anaema
## Mode :logical Mode :logical
## FALSE:797 FALSE:797
## NA's :0 NA's :0
weight <- as.numeric(mydata$wt)
xtabs(~anaema + educ, data = mydata)
## educ
## anaema no education primary secondary
## not anaemic 95 302 96
## anaemic 78 193 33
xtabs(~anaema + age, data = mydata)
## age
## anaema 15-19 20-24 25-29 30-34 35-39 40-44 45-49
## not anaemic 83 169 117 64 43 16 1
## anaemic 56 94 76 48 22 7 1
xtabs(~anaema + region, data = mydata)
## region
## anaema Central Eastern Northern Western Not de jure resident
## not anaemic 130 117 63 167 16
## anaemic 70 93 45 75 21
xtabs(~anaema + ceb, data = mydata)
## Warning: duplicated levels in factors are deprecated
## ceb
## anaema 0 1 2-3 2-3 4-5 4-5 6+
## not anaemic 81 93 134 0 102 0 83
## anaemic 53 51 86 0 59 0 55
mydata$wt <- as.numeric(mydata$wt)
mylogit <- glm(anaema ~ as.numeric(wt) + factor(educ) + factor(region) + factor(ceb) +
factor(age), family = "binomial", data = mydata)
## Warning: duplicated levels in factors are deprecated
summary(mylogit)
##
## Call:
## glm(formula = anaema ~ as.numeric(wt) + factor(educ) + factor(region) +
## factor(ceb) + factor(age), family = "binomial", data = mydata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.452 -0.993 -0.796 1.274 1.892
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.74649 0.59686 1.25 0.21105
## as.numeric(wt) -0.01266 0.00942 -1.34 0.17899
## factor(educ)primary -0.29834 0.18961 -1.57 0.11562
## factor(educ)secondary -0.98489 0.28527 -3.45 0.00056
## factor(region)Eastern 0.30273 0.20979 1.44 0.14903
## factor(region)Northern 0.07350 0.25802 0.28 0.77574
## factor(region)Western -0.38131 0.21521 -1.77 0.07642
## factor(region)Not de jure resident 0.70770 0.37521 1.89 0.05928
## factor(ceb)1 -0.26528 0.27517 -0.96 0.33502
## factor(ceb)2-3 -0.24200 0.30079 -0.80 0.42108
## factor(ceb)4-5 -0.61067 0.36654 -1.67 0.09571
## factor(ceb)6+ -0.34540 0.43203 -0.80 0.42400
## factor(age)20-24 -0.00945 0.26770 -0.04 0.97183
## factor(age)25-29 0.25480 0.33171 0.77 0.44241
## factor(age)30-34 0.46875 0.40075 1.17 0.24213
## factor(age)35-39 -0.11237 0.47101 -0.24 0.81144
## factor(age)40-44 -0.23305 0.61901 -0.38 0.70656
## factor(age)45-49 0.66134 1.50672 0.44 0.66071
##
## (Intercept)
## as.numeric(wt)
## factor(educ)primary
## factor(educ)secondary ***
## factor(region)Eastern
## factor(region)Northern
## factor(region)Western .
## factor(region)Not de jure resident .
## factor(ceb)1
## factor(ceb)2-3
## factor(ceb)4-5 .
## factor(ceb)6+
## factor(age)20-24
## factor(age)25-29
## factor(age)30-34
## factor(age)35-39
## factor(age)40-44
## factor(age)45-49
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1059.6 on 796 degrees of freedom
## Residual deviance: 1021.7 on 779 degrees of freedom
## AIC: 1058
##
## Number of Fisher Scoring iterations: 4
anova(mylogit, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: anaema
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 796 1060
## as.numeric(wt) 1 5.05 795 1055 0.0247 *
## factor(educ) 2 10.46 793 1044 0.0054 **
## factor(region) 4 14.86 789 1029 0.0050 **
## factor(ceb) 4 2.11 785 1027 0.7159
## factor(age) 6 5.45 779 1022 0.4881
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
1 - pchisq(1029.3, 789)
## [1] 1.497e-08
mylogit1 <- glm(anaema ~ as.numeric(wt) + educ * region, family = "binomial",
data = mydata)
summary(mylogit1)
##
## Call:
## glm(formula = anaema ~ as.numeric(wt) + educ * region, family = "binomial",
## data = mydata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.585 -1.012 -0.775 1.264 1.853
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 0.62129 0.70079 0.89
## as.numeric(wt) -0.01447 0.00946 -1.53
## educprimary -0.19351 0.48676 -0.40
## educsecondary -1.11768 0.55510 -2.01
## regionEastern 0.34459 0.54851 0.63
## regionNorthern -0.11586 0.54794 -0.21
## regionWestern -0.28256 0.51963 -0.54
## regionNot de jure resident 1.08490 0.95080 1.14
## educprimary:regionEastern -0.08290 0.60394 -0.14
## educsecondary:regionEastern -0.22174 0.76860 -0.29
## educprimary:regionNorthern -0.01169 0.64124 -0.02
## educsecondary:regionNorthern 1.91659 0.90852 2.11
## educprimary:regionWestern -0.19298 0.57973 -0.33
## educsecondary:regionWestern 0.53851 0.77171 0.70
## educprimary:regionNot de jure resident -0.56123 1.04552 -0.54
## educsecondary:regionNot de jure resident 0.30260 1.41890 0.21
## Pr(>|z|)
## (Intercept) 0.375
## as.numeric(wt) 0.126
## educprimary 0.691
## educsecondary 0.044 *
## regionEastern 0.530
## regionNorthern 0.833
## regionWestern 0.587
## regionNot de jure resident 0.254
## educprimary:regionEastern 0.891
## educsecondary:regionEastern 0.773
## educprimary:regionNorthern 0.985
## educsecondary:regionNorthern 0.035 *
## educprimary:regionWestern 0.739
## educsecondary:regionWestern 0.485
## educprimary:regionNot de jure resident 0.591
## educsecondary:regionNot de jure resident 0.831
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1059.6 on 796 degrees of freedom
## Residual deviance: 1020.4 on 781 degrees of freedom
## AIC: 1052
##
## Number of Fisher Scoring iterations: 4
# 1.496819e-08, highly insignificant, try interaction
anova(mylogit1, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: anaema
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 796 1060
## as.numeric(wt) 1 5.05 795 1055 0.0247 *
## educ 2 10.46 793 1044 0.0054 **
## region 4 14.86 789 1029 0.0050 **
## educ:region 8 8.85 781 1020 0.3553
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mylogit2 <- glm(anaema ~ as.numeric(wt) + educ + region, family = "binomial",
data = mydata)
summary(mylogit2)
##
## Call:
## glm(formula = anaema ~ as.numeric(wt) + educ + region, family = "binomial",
## data = mydata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.431 -0.997 -0.823 1.273 1.954
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5302 0.5622 0.94 0.3457
## as.numeric(wt) -0.0130 0.0093 -1.40 0.1623
## educprimary -0.2740 0.1843 -1.49 0.1370
## educsecondary -0.8612 0.2675 -3.22 0.0013 **
## regionEastern 0.2920 0.2075 1.41 0.1594
## regionNorthern 0.0772 0.2551 0.30 0.7621
## regionWestern -0.3246 0.2107 -1.54 0.1234
## regionNot de jure resident 0.7534 0.3682 2.05 0.0407 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1059.6 on 796 degrees of freedom
## Residual deviance: 1029.3 on 789 degrees of freedom
## AIC: 1045
##
## Number of Fisher Scoring iterations: 4
anova(mylogit2, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: anaema
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 796 1060
## as.numeric(wt) 1 5.05 795 1055 0.0247 *
## educ 2 10.46 793 1044 0.0054 **
## region 4 14.86 789 1029 0.0050 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
add1(mylogit, ~.^2, test = "Chisq")
## Warning: duplicated levels in factors are deprecated
## Single term additions
##
## Model:
## anaema ~ as.numeric(wt) + factor(educ) + factor(region) + factor(ceb) +
## factor(age)
## Df Deviance AIC LRT Pr(>Chi)
## <none> 1022 1058
## as.numeric(wt):factor(educ) 2 1019 1059 2.74 0.25
## as.numeric(wt):factor(region) 4 1019 1063 2.38 0.67
## as.numeric(wt):factor(ceb) 4 1018 1062 4.12 0.39
## as.numeric(wt):factor(age) 6 1016 1064 5.63 0.47
## factor(educ):factor(region) 8 1012 1064 9.70 0.29
## factor(educ):factor(ceb) 8 1018 1070 4.14 0.84
## factor(educ):factor(age) 10 1009 1065 12.86 0.23
## factor(region):factor(ceb) 16 1010 1078 11.99 0.74
## factor(region):factor(age) 20 1007 1083 14.94 0.78
## factor(ceb):factor(age) 13 1005 1067 16.31 0.23
drop1(mylogit, test = "Chisq")
## Single term deletions
##
## Model:
## anaema ~ as.numeric(wt) + factor(educ) + factor(region) + factor(ceb) +
## factor(age)
## Df Deviance AIC LRT Pr(>Chi)
## <none> 1022 1058
## as.numeric(wt) 1 1024 1058 1.88 0.1704
## factor(educ) 2 1034 1066 12.64 0.0018 **
## factor(region) 4 1038 1066 16.35 0.0026 **
## factor(ceb) 4 1025 1053 3.56 0.4687
## factor(age) 6 1027 1051 5.45 0.4881
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Plots
barchart(mydata$anaema, horizontal = F)
histogram(weight)