Preliminary Analysis of the Data

require(Hmisc)
## Loading required package: Hmisc
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
## Hmisc library by Frank E Harrell Jr
## 
## Type library(help='Hmisc'), ?Overview, or ?Hmisc.Overview')
## to see overall documentation.
## 
## 
## Attaching package: 'Hmisc'
## 
## The following object is masked from 'package:survival':
## 
##     untangle.specials
## 
## The following object is masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
require(lattice)
## Loading required package: lattice
mydata <- spss.get("/Users/Sofia/Desktop/anaemia.sav")  #import .sav data 
## Loading required package: foreign
## Warning: duplicated levels in factors are deprecated
View(mydata)
summary(mydata)  #summarize the data
## Warning: duplicated levels in factors are deprecated
##     age                       region              educ      ceb     
##  15-19:139   Central             :200   no education:173   0  :134  
##  20-24:263   Eastern             :210   primary     :495   1  :144  
##  25-29:193   Northern            :108   secondary   :129   2-3:220  
##  30-34:112   Western             :242                      2-3:  0  
##  35-39: 65   Not de jure resident: 37                      4-5:161  
##  40-44: 23                                                 4-5:  0  
##  45-49:  2                                                 6+ :138  
##        wt                anaema   
##  Min.   : 37.7   not anaemic:493  
##  1st Qu.: 52.0   anaemic    :304  
##  Median : 56.8                    
##  Mean   : 57.6                    
##  3rd Qu.: 62.1                    
##  Max.   :152.6                    
## 
str(mydata)  ##tells the structure of the data
## 'data.frame':    797 obs. of  6 variables:
##  $ age   : Factor w/ 7 levels "15-19","20-24",..: 1 2 4 5 5 2 4 2 7 2 ...
##   ..- attr(*, "label")= Named chr "Age in 5-year groups"
##   .. ..- attr(*, "names")= chr "age"
##  $ region: Factor w/ 5 levels "Central","Eastern",..: 3 3 3 3 3 3 3 3 3 3 ...
##   ..- attr(*, "label")= Named chr "De jure region of residence"
##   .. ..- attr(*, "names")= chr "region"
##  $ educ  : Factor w/ 3 levels "no education",..: 2 2 2 2 3 2 2 1 2 2 ...
##   ..- attr(*, "label")= Named chr "Educational attainment"
##   .. ..- attr(*, "names")= chr "educ"
##  $ ceb   : Factor w/ 7 levels "0","1","2-3",..: 2 3 5 7 5 3 7 2 7 3 ...
##   ..- attr(*, "label")= Named chr "Total children ever born"
##   .. ..- attr(*, "names")= chr "ceb"
##  $ wt    :Class 'labelled'  atomic [1:797] 42.4 55.7 59.2 71 53.8 56.3 66.4 48.8 37.7 55.1 ...
##   .. ..- attr(*, "label")= Named chr "Respondent's weight (kilos)"
##   .. .. ..- attr(*, "names")= chr "wt"
##  $ anaema: Factor w/ 2 levels "not anaemic",..: 2 1 1 1 1 2 1 2 2 1 ...
##   ..- attr(*, "label")= Named chr "Anaemia level"
##   .. ..- attr(*, "names")= chr "anaema"
sapply(mydata, class)
## $age
## [1] "labelled" "factor"  
## 
## $region
## [1] "labelled" "factor"  
## 
## $educ
## [1] "labelled" "factor"  
## 
## $ceb
## [1] "labelled" "factor"  
## 
## $wt
## [1] "labelled"
## 
## $anaema
## [1] "labelled" "factor"
summary(is.na(mydata))
##     age            region           educ            ceb         
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:797       FALSE:797       FALSE:797       FALSE:797      
##  NA's :0         NA's :0         NA's :0         NA's :0        
##      wt            anaema       
##  Mode :logical   Mode :logical  
##  FALSE:797       FALSE:797      
##  NA's :0         NA's :0
weight <- as.numeric(mydata$wt)

xtabs(~anaema + educ, data = mydata)
##              educ
## anaema        no education primary secondary
##   not anaemic           95     302        96
##   anaemic               78     193        33
xtabs(~anaema + age, data = mydata)
##              age
## anaema        15-19 20-24 25-29 30-34 35-39 40-44 45-49
##   not anaemic    83   169   117    64    43    16     1
##   anaemic        56    94    76    48    22     7     1
xtabs(~anaema + region, data = mydata)
##              region
## anaema        Central Eastern Northern Western Not de jure resident
##   not anaemic     130     117       63     167                   16
##   anaemic          70      93       45      75                   21
xtabs(~anaema + ceb, data = mydata)
## Warning: duplicated levels in factors are deprecated
##              ceb
## anaema          0   1 2-3 2-3 4-5 4-5  6+
##   not anaemic  81  93 134   0 102   0  83
##   anaemic      53  51  86   0  59   0  55
mydata$wt <- as.numeric(mydata$wt)
mylogit <- glm(anaema ~ as.numeric(wt) + factor(educ) + factor(region) + factor(ceb) + 
    factor(age), family = "binomial", data = mydata)
## Warning: duplicated levels in factors are deprecated

summary(mylogit)
## 
## Call:
## glm(formula = anaema ~ as.numeric(wt) + factor(educ) + factor(region) + 
##     factor(ceb) + factor(age), family = "binomial", data = mydata)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.452  -0.993  -0.796   1.274   1.892  
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)
## (Intercept)                         0.74649    0.59686    1.25  0.21105
## as.numeric(wt)                     -0.01266    0.00942   -1.34  0.17899
## factor(educ)primary                -0.29834    0.18961   -1.57  0.11562
## factor(educ)secondary              -0.98489    0.28527   -3.45  0.00056
## factor(region)Eastern               0.30273    0.20979    1.44  0.14903
## factor(region)Northern              0.07350    0.25802    0.28  0.77574
## factor(region)Western              -0.38131    0.21521   -1.77  0.07642
## factor(region)Not de jure resident  0.70770    0.37521    1.89  0.05928
## factor(ceb)1                       -0.26528    0.27517   -0.96  0.33502
## factor(ceb)2-3                     -0.24200    0.30079   -0.80  0.42108
## factor(ceb)4-5                     -0.61067    0.36654   -1.67  0.09571
## factor(ceb)6+                      -0.34540    0.43203   -0.80  0.42400
## factor(age)20-24                   -0.00945    0.26770   -0.04  0.97183
## factor(age)25-29                    0.25480    0.33171    0.77  0.44241
## factor(age)30-34                    0.46875    0.40075    1.17  0.24213
## factor(age)35-39                   -0.11237    0.47101   -0.24  0.81144
## factor(age)40-44                   -0.23305    0.61901   -0.38  0.70656
## factor(age)45-49                    0.66134    1.50672    0.44  0.66071
##                                       
## (Intercept)                           
## as.numeric(wt)                        
## factor(educ)primary                   
## factor(educ)secondary              ***
## factor(region)Eastern                 
## factor(region)Northern                
## factor(region)Western              .  
## factor(region)Not de jure resident .  
## factor(ceb)1                          
## factor(ceb)2-3                        
## factor(ceb)4-5                     .  
## factor(ceb)6+                         
## factor(age)20-24                      
## factor(age)25-29                      
## factor(age)30-34                      
## factor(age)35-39                      
## factor(age)40-44                      
## factor(age)45-49                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1059.6  on 796  degrees of freedom
## Residual deviance: 1021.7  on 779  degrees of freedom
## AIC: 1058
## 
## Number of Fisher Scoring iterations: 4
anova(mylogit, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: anaema
## 
## Terms added sequentially (first to last)
## 
## 
##                Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                             796       1060            
## as.numeric(wt)  1     5.05       795       1055   0.0247 * 
## factor(educ)    2    10.46       793       1044   0.0054 **
## factor(region)  4    14.86       789       1029   0.0050 **
## factor(ceb)     4     2.11       785       1027   0.7159   
## factor(age)     6     5.45       779       1022   0.4881   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
1 - pchisq(1029.3, 789)
## [1] 1.497e-08
mylogit1 <- glm(anaema ~ as.numeric(wt) + educ * region, family = "binomial", 
    data = mydata)

summary(mylogit1)
## 
## Call:
## glm(formula = anaema ~ as.numeric(wt) + educ * region, family = "binomial", 
##     data = mydata)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.585  -1.012  -0.775   1.264   1.853  
## 
## Coefficients:
##                                          Estimate Std. Error z value
## (Intercept)                               0.62129    0.70079    0.89
## as.numeric(wt)                           -0.01447    0.00946   -1.53
## educprimary                              -0.19351    0.48676   -0.40
## educsecondary                            -1.11768    0.55510   -2.01
## regionEastern                             0.34459    0.54851    0.63
## regionNorthern                           -0.11586    0.54794   -0.21
## regionWestern                            -0.28256    0.51963   -0.54
## regionNot de jure resident                1.08490    0.95080    1.14
## educprimary:regionEastern                -0.08290    0.60394   -0.14
## educsecondary:regionEastern              -0.22174    0.76860   -0.29
## educprimary:regionNorthern               -0.01169    0.64124   -0.02
## educsecondary:regionNorthern              1.91659    0.90852    2.11
## educprimary:regionWestern                -0.19298    0.57973   -0.33
## educsecondary:regionWestern               0.53851    0.77171    0.70
## educprimary:regionNot de jure resident   -0.56123    1.04552   -0.54
## educsecondary:regionNot de jure resident  0.30260    1.41890    0.21
##                                          Pr(>|z|)  
## (Intercept)                                 0.375  
## as.numeric(wt)                              0.126  
## educprimary                                 0.691  
## educsecondary                               0.044 *
## regionEastern                               0.530  
## regionNorthern                              0.833  
## regionWestern                               0.587  
## regionNot de jure resident                  0.254  
## educprimary:regionEastern                   0.891  
## educsecondary:regionEastern                 0.773  
## educprimary:regionNorthern                  0.985  
## educsecondary:regionNorthern                0.035 *
## educprimary:regionWestern                   0.739  
## educsecondary:regionWestern                 0.485  
## educprimary:regionNot de jure resident      0.591  
## educsecondary:regionNot de jure resident    0.831  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1059.6  on 796  degrees of freedom
## Residual deviance: 1020.4  on 781  degrees of freedom
## AIC: 1052
## 
## Number of Fisher Scoring iterations: 4
# 1.496819e-08, highly insignificant, try interaction
anova(mylogit1, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: anaema
## 
## Terms added sequentially (first to last)
## 
## 
##                Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                             796       1060            
## as.numeric(wt)  1     5.05       795       1055   0.0247 * 
## educ            2    10.46       793       1044   0.0054 **
## region          4    14.86       789       1029   0.0050 **
## educ:region     8     8.85       781       1020   0.3553   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

mylogit2 <- glm(anaema ~ as.numeric(wt) + educ + region, family = "binomial", 
    data = mydata)

summary(mylogit2)
## 
## Call:
## glm(formula = anaema ~ as.numeric(wt) + educ + region, family = "binomial", 
##     data = mydata)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.431  -0.997  -0.823   1.273   1.954  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                  0.5302     0.5622    0.94   0.3457   
## as.numeric(wt)              -0.0130     0.0093   -1.40   0.1623   
## educprimary                 -0.2740     0.1843   -1.49   0.1370   
## educsecondary               -0.8612     0.2675   -3.22   0.0013 **
## regionEastern                0.2920     0.2075    1.41   0.1594   
## regionNorthern               0.0772     0.2551    0.30   0.7621   
## regionWestern               -0.3246     0.2107   -1.54   0.1234   
## regionNot de jure resident   0.7534     0.3682    2.05   0.0407 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1059.6  on 796  degrees of freedom
## Residual deviance: 1029.3  on 789  degrees of freedom
## AIC: 1045
## 
## Number of Fisher Scoring iterations: 4

anova(mylogit2, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: anaema
## 
## Terms added sequentially (first to last)
## 
## 
##                Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                             796       1060            
## as.numeric(wt)  1     5.05       795       1055   0.0247 * 
## educ            2    10.46       793       1044   0.0054 **
## region          4    14.86       789       1029   0.0050 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
add1(mylogit, ~.^2, test = "Chisq")
## Warning: duplicated levels in factors are deprecated
## Single term additions
## 
## Model:
## anaema ~ as.numeric(wt) + factor(educ) + factor(region) + factor(ceb) + 
##     factor(age)
##                               Df Deviance  AIC   LRT Pr(>Chi)
## <none>                               1022 1058               
## as.numeric(wt):factor(educ)    2     1019 1059  2.74     0.25
## as.numeric(wt):factor(region)  4     1019 1063  2.38     0.67
## as.numeric(wt):factor(ceb)     4     1018 1062  4.12     0.39
## as.numeric(wt):factor(age)     6     1016 1064  5.63     0.47
## factor(educ):factor(region)    8     1012 1064  9.70     0.29
## factor(educ):factor(ceb)       8     1018 1070  4.14     0.84
## factor(educ):factor(age)      10     1009 1065 12.86     0.23
## factor(region):factor(ceb)    16     1010 1078 11.99     0.74
## factor(region):factor(age)    20     1007 1083 14.94     0.78
## factor(ceb):factor(age)       13     1005 1067 16.31     0.23
drop1(mylogit, test = "Chisq")
## Single term deletions
## 
## Model:
## anaema ~ as.numeric(wt) + factor(educ) + factor(region) + factor(ceb) + 
##     factor(age)
##                Df Deviance  AIC   LRT Pr(>Chi)   
## <none>                1022 1058                  
## as.numeric(wt)  1     1024 1058  1.88   0.1704   
## factor(educ)    2     1034 1066 12.64   0.0018 **
## factor(region)  4     1038 1066 16.35   0.0026 **
## factor(ceb)     4     1025 1053  3.56   0.4687   
## factor(age)     6     1027 1051  5.45   0.4881   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Plots

barchart(mydata$anaema, horizontal = F)

plot of chunk unnamed-chunk-2

histogram(weight)

plot of chunk unnamed-chunk-2