options(warn=-1)
I don’t like to attach my data, hence you will see me using “data$variable” instead of just “variable”. In case I use the variable as is, I will explain why. The answers might not be correct or adequate, so use this at your own risk, or merely as isnpiration.
#You need to install the "MASS" package to test for differences in proportions. Also, the "epiR" package is useful when calculating RR. They can be installed by using these codes (without the # in front):
#install.packages("MASS")
#install.packages("epiR")
#install.packages("epitools")
#Load the libraries like this:
library(MASS)
library(epiR)
## Loading required package: survival
## Package epiR 0.9-79 is loaded
## Type help(epi.about) for summary information
##
library(epitools)
##
## Attaching package: 'epitools'
## Det følgende objekt er maskeret fra 'package:survival':
##
## ratetable
#I saved the Excel file as a csv file, and imported it with the read.csv2() funtion. It is important to use the (.) character as the decimal, dec ="." with this dataset.
obes <- read.csv2("Data file in Excel.csv", header = T,dec = ".")
#Use the View() function to observe your dataset
View(obes)
obes$Sex <- factor(obes$Sex)
obes$Sm <- factor(obes$Sm)
Explanatory note: If you want to import datasets the same way I do, you need to set the work directory to the same folder as where your dataset file is placed. I have a EDH folder where I save all my R files and datasets, then I go to Session -> Set Working Directory -> To Source File Location (I am using R Studio). This allows me load datasets directly and create nice documents like this one :D.
Girls <- subset(obes, obes$Sex=="0")
Boys <- subset(obes, obes$Sex=="1")
View(Girls)
#Normal Distribution or not (whole dataset)
hist(obes$Bw)
qqnorm(obes$Bw)
qqline(obes$Bw)
#Girls
hist(Girls$Bw)
qqnorm(Girls$Bw)
qqline(Girls$Bw)
#Summary (mean etc.), SD and 95% confidence intervals
with(Girls,summary(Bw, na.rm=T))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 425 2800 3150 3114 3500 5400 443
with(Girls,sd(Bw, na.rm=T))
## [1] 611.7459
with(Girls, confint(lm(Bw[!is.na(Bw)]~1)))
## 2.5 % 97.5 %
## (Intercept) 3094.948 3132.809
#Boys
hist(Boys$Bw)
qqnorm(Boys$Bw)
qqline(Boys$Bw)
#Summary (mean etc.), SD and 95% confidence intervals
with(Boys,summary(Bw, na.rm=T))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 320 2900 3300 3239 3650 5500 502
with(Boys,sd(Bw, na.rm=T))
## [1] 640.9409
with(Boys, confint(lm(Bw[!is.na(Bw)]~1)))
## 2.5 % 97.5 %
## (Intercept) 3219.076 3258.013
#First I will use the cut function, to break the dataset into a variable with Birthweights of under or over 4000 grams.
obes$Heavy <- cut(obes$Bw, breaks = c(-Inf,4000,Inf), labels= c("<4 kg",">4 kg"), right=FALSE)
#Every time a variable is added to the dataset, the subsets need to be redefined. Otherwise the new variable will only be present in the original dataset.
Girls <- subset(obes, obes$Sex=="0")
Boys <- subset(obes, obes$Sex=="1")
#I am a lazy kuhnt, but you could write "subset(obes, obes$Sex=="0")" and "subset(obes, obes$Sex=="1")" for Girls and Boys respectively, every time you need the subsets, and ignore defining the subsets over and over. I would propably add all variables in the beginning of the program and define the subsets once only, but for pedagogical reasons, I will do it step by step for this exercise.
#Now the proportions can be calculated for both sexes. First I define a new vector with the values from the summary() of the variable I just created (Heavy) with the subsets. Then I pull the values out of the new vector, to calculate the proportions.
#Girls
GirlsHeavy <- with(Girls, summary(Heavy))
GirlsHeavy
## <4 kg >4 kg NA's
## 3775 239 443
GirlsHeavy[2]/(GirlsHeavy[1]+GirlsHeavy[2])*100
## >4 kg
## 5.95416
#Boys
BoysHeavy <- with(Boys, summary(Heavy))
BoysHeavy
## <4 kg >4 kg NA's
## 3740 426 502
BoysHeavy[2]/(BoysHeavy[1]+BoysHeavy[2])*100
## >4 kg
## 10.22564
Explanatory note: When using the with() function, you are “telling” R what dataset you are pulling values out of. Therefore, it is not necessary to use obes$Heavy when using the summary() function. Actually, this would make R think you want the summary of the variable from the original dataset, and not from the subset of Girls or Boys which is what we want.
#Test for normality (not sure if necessary)
hist(Girls$Bf)
qqnorm(Girls$Bf)
qqline(Girls$Bf)
hist(Boys$Bf)
qqnorm(Boys$Bf)
qqline(Boys$Bf)
#The same procedure as for Birthweight
obes$ShortBf <- cut(obes$Bf, breaks = c(-Inf,2.000001,Inf), labels= c("<=2 months",">2 months"), right=FALSE)
Girls <- subset(obes, obes$Sex=="0")
Boys <- subset(obes, obes$Sex=="1")
#Girls
GirlsShortBf <- with(Girls, summary(ShortBf))
GirlsShortBf
## <=2 months >2 months NA's
## 1765 1643 1049
GirlsShortBf[1]/(GirlsShortBf[1]+GirlsShortBf[2])*100
## <=2 months
## 51.78991
#Boys
BoysShortBf <- with(Boys, summary(ShortBf))
BoysShortBf
## <=2 months >2 months NA's
## 1856 1707 1105
BoysShortBf[1]/(BoysShortBf[1]+BoysShortBf[2])*100
## <=2 months
## 52.09093
#Test for normality (not sure if necessary)
hist(Girls$Cf)
qqnorm(Girls$Cf)
qqline(Girls$Cf)
hist(Boys$Cf)
qqnorm(Boys$Cf)
qqline(Boys$Cf)
#The same procedure as for Birthweight
obes$CompFeed <- cut(obes$Cf, breaks = c(-Inf,3.000001,Inf), labels= c("<=3 months",">3 months"), right=FALSE)
Girls <- subset(obes, obes$Sex=="0")
Boys <- subset(obes, obes$Sex=="1")
#Girls
GirlsCompFeed <- with(Girls, summary(CompFeed))
GirlsCompFeed
## <=3 months >3 months NA's
## 1212 1521 1724
GirlsCompFeed[1]/(GirlsCompFeed[1]+GirlsCompFeed[2])*100
## <=3 months
## 44.34687
#Boys
BoysCompFeed <- with(Boys, summary(CompFeed))
BoysCompFeed
## <=3 months >3 months NA's
## 1300 1541 1827
BoysCompFeed[1]/(BoysCompFeed[1]+BoysCompFeed[2])*100
## <=3 months
## 45.75854
#Calculating BMI from Ht and Wt Variables and saving as new variable (BMI) in the obes dataset
obes$BMI <- obes$Wt/((obes$Ht/100)^2)
#Now, the same procedure as for Birthweight pretty much
obes$Overweight <- cut(obes$BMI, breaks = c(-Inf,24.99999,Inf), labels= c("Normal","Overweight"), right=FALSE)
#Redefining subsets Girls and Boys for inclusion of newly made BMI and Overweight variable
Girls <- subset(obes, obes$Sex=="0")
Boys <- subset(obes, obes$Sex=="1")
#Test for normality (dont know if necessary)
hist(Girls$BMI)
qqnorm(Girls$BMI)
qqline(Girls$BMI)
hist(Boys$BMI)
qqnorm(Boys$BMI)
qqline(Boys$BMI)
#Proportion of BMI >25 in adulthood
#Girls
GirlsOverweight <- with(Girls, summary(Overweight))
GirlsOverweight
## Normal Overweight NA's
## 1502 817 2138
GirlsOverweight[2]/(GirlsOverweight[1]+GirlsOverweight[2])*100
## Overweight
## 35.2307
#Boys
BoysOverweight <- with(Boys, summary(Overweight))
BoysOverweight
## Normal Overweight NA's
## 949 1038 2681
BoysOverweight[2]/(BoysOverweight[1]+BoysOverweight[2])*100
## Overweight
## 52.23956
#You can do this with the tools we have learned:
t.test(Boys$Bw, Girls$Bw)
##
## Welch Two Sample t-test
##
## data: Boys$Bw and Girls$Bw
## t = 9.0008, df = 8177.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 97.51564 151.81730
## sample estimates:
## mean of x mean of y
## 3238.545 3113.878
#You could also use the lm() function, but you propably have not learned about this one yet. With this function we can actually make a one-way ANOVA, to test for differences and get an estimate of the difference all at once with the following commands:
model <- lm(Bw~Sex, obes)
summary(model)
##
## Call:
## lm(formula = Bw ~ Sex, data = obes)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2918.54 -338.54 61.46 386.12 2286.12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3113.878 9.893 314.754 <2e-16 ***
## Sex1 124.666 13.863 8.993 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 626.8 on 8178 degrees of freedom
## (945 observations deleted due to missingness)
## Multiple R-squared: 0.009792, Adjusted R-squared: 0.009671
## F-statistic: 80.87 on 1 and 8178 DF, p-value: < 2.2e-16
#The confidence intervals for the estimates can be extracted from our model by using this command:
confint(model)
## 2.5 % 97.5 %
## (Intercept) 3094.48531 3133.2710
## Sex1 97.49213 151.8408
Explanatory note: “Intercept” in the output is the missing level of our Sex factor, which is Sex0 = females. To read the output, you have to think of the value of all other levels as the difference from the intercept in Bodyweight (Bw). So what we can conlude from this output, is that the boys were 124.7 grams heavier than the girls at birth, on average. The difference is also highly signficant (P < 0.001).
#Break the dataset into a variable with Birthweights of under or over 2.5 kg.
obes$LowBw <- cut(obes$Bw, breaks = c(-Inf,2499.99999,Inf), labels= c("<2.5 kg",">2.5 kg"), right=FALSE)
#Create a 2x2 table by using the variables Sex and LowBw
table <- table(obes$Sex,obes$LowBw)
table
##
## <2.5 kg >2.5 kg
## 0 498 3516
## 1 442 3724
#The difference in proportions are tested with the prop.test() function (i googled it):
prop.test(table, correct=TRUE) #prop 1 = Girls; prop 2 = Boys.
##
## 2-sample test for equality of proportions with continuity
## correction
##
## data: table
## X-squared = 6.3142, df = 1, p-value = 0.01198
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.003887511 0.032050077
## sample estimates:
## prop 1 prop 2
## 0.1240658 0.1060970
#The test for differences used, is as far as I can tell, the same as the Chi^2 test:
chisq.test(table)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table
## X-squared = 6.3142, df = 1, p-value = 0.01198
We were not sure wheather to use the “same follow-up”" or the “variable follow-up” formula for this, since the babies were concieved at differet time-points, although the follow-up at 42y is the same. But after hours with blood sweat and tears, we had to abort the “variable follow-up” mission, and continue with the “same follow-up” formula.
It is favorable to order the levels of our factors “Overweight” and “Heavy” in a proper manner. Right now a print of the factor Overweight will present its levels as “Normal” and “Overweight”, in the order O- and O+ (Outcome absent and Outcome present) which is “wrong” for proper 2by2 tables. Levels ordered as E+, E- or O+, O- are preferred. The rest of the factors have properly ordered levels and need no further ordering.
obes$Overweight_new <- factor(obes$Overweight, levels = c("Overweight", "Normal"))
obes$Heavy_new <- factor(obes$Heavy, levels = c(">4 kg", "<4 kg"))
#High Birthweight and Overweight
tab1 <-table(obes$Heavy_new,obes$Overweight_new)
#RR by hand -> tab1[1,1]/(tab1[1,1]+tab1[1,2]))/(tab1[2,1]/(tab1[2,1]+tab1[2,2]))
table(obes$Heavy_new,obes$Overweight_new)
##
## Overweight Normal
## >4 kg 184 177
## <4 kg 1631 2226
#Short Breastfeeding and Overweight
tab2 <- table(obes$ShortBf,obes$Overweight_new)
#RR by hand -> (tab2[1,1]/(tab2[1,1]+tab2[1,2]))/(tab2[2,1]/(tab2[2,1]+tab2[2,2]))
table(obes$ShortBf,obes$Overweight_new)
##
## Overweight Normal
## <=2 months 777 963
## >2 months 781 1103
#Early Complementary Feeding and Overweight
tab3 <- table(obes$CompFeed,obes$Overweight_new)
#RR by hand -> (tab3[1,1]/(tab3[1,1]+tab3[1,2]))/(tab3[2,1]/(tab3[2,1]+tab3[2,2]))
table(obes$CompFeed,obes$Overweight_new)
##
## Overweight Normal
## <=3 months 621 758
## >3 months 686 1005
#High Birthweight and Overweight
epi.2by2(tab1)
## Outcome + Outcome - Total Inc risk *
## Exposed + 184 177 361 51.0
## Exposed - 1631 2226 3857 42.3
## Total 1815 2403 4218 43.0
## Odds
## Exposed + 1.040
## Exposed - 0.733
## Total 0.755
##
## Point estimates and 95 % CIs:
## -------------------------------------------------------------------
## Inc risk ratio 1.21 (1.08, 1.34)
## Odds ratio 1.42 (1.14, 1.76)
## Attrib risk * 8.68 (3.30, 14.07)
## Attrib risk in population * 0.74 (-1.42, 2.90)
## Attrib fraction in exposed (%) 17.04 (7.60, 25.50)
## Attrib fraction in population (%) 1.73 (0.64, 2.81)
## -------------------------------------------------------------------
## X2 test statistic: 10.152 p-value: 0.001
## Wald confidence limits
## * Outcomes per 100 population units
#Short Breastfeeding and Overweight
epi.2by2(tab2)
## Outcome + Outcome - Total Inc risk *
## Exposed + 777 963 1740 44.7
## Exposed - 781 1103 1884 41.5
## Total 1558 2066 3624 43.0
## Odds
## Exposed + 0.807
## Exposed - 0.708
## Total 0.754
##
## Point estimates and 95 % CIs:
## -------------------------------------------------------------------
## Inc risk ratio 1.08 (1.00, 1.16)
## Odds ratio 1.14 (1.00, 1.30)
## Attrib risk * 3.20 (-0.02, 6.43)
## Attrib risk in population * 1.54 (-1.21, 4.28)
## Attrib fraction in exposed (%) 7.17 (-0.06, 13.87)
## Attrib fraction in population (%) 3.57 (-0.10, 7.11)
## -------------------------------------------------------------------
## X2 test statistic: 3.781 p-value: 0.052
## Wald confidence limits
## * Outcomes per 100 population units
#Early Complementary Feeding and Overweight
epi.2by2(tab3)
## Outcome + Outcome - Total Inc risk *
## Exposed + 621 758 1379 45.0
## Exposed - 686 1005 1691 40.6
## Total 1307 1763 3070 42.6
## Odds
## Exposed + 0.819
## Exposed - 0.683
## Total 0.741
##
## Point estimates and 95 % CIs:
## -------------------------------------------------------------------
## Inc risk ratio 1.11 (1.02, 1.20)
## Odds ratio 1.20 (1.04, 1.39)
## Attrib risk * 4.46 (0.95, 7.98)
## Attrib risk in population * 2.01 (-0.92, 4.93)
## Attrib fraction in exposed (%) 9.91 (2.21, 17.01)
## Attrib fraction in population (%) 4.71 (0.92, 8.36)
## -------------------------------------------------------------------
## X2 test statistic: 6.194 p-value: 0.013
## Wald confidence limits
## * Outcomes per 100 population units
#High Birthweight and Overweight
epitab(tab1, method = "riskratio", rev = c("both"))
## $tab
##
## Normal p0 Overweight p1 riskratio lower upper
## <4 kg 2226 0.5771325 1631 0.4228675 1.000000 NA NA
## >4 kg 177 0.4903047 184 0.5096953 1.205331 1.082281 1.342371
##
## p.value
## <4 kg NA
## >4 kg 0.001526998
##
## $measure
## [1] "wald"
##
## $conf.level
## [1] 0.95
##
## $pvalue
## [1] "fisher.exact"
#Short Breastfeeding and Overweight
epitab(tab2, method = "riskratio", rev = c("both"))
## $tab
##
## Normal p0 Overweight p1 riskratio lower
## >2 months 1103 0.5854565 781 0.4145435 1.000000 NA
## <=2 months 963 0.5534483 777 0.4465517 1.077213 0.9994383
##
## upper p.value
## >2 months NA NA
## <=2 months 1.16104 0.05559314
##
## $measure
## [1] "wald"
##
## $conf.level
## [1] 0.95
##
## $pvalue
## [1] "fisher.exact"
#Early Complementary Feeding and Overweight
epitab(tab3, method = "riskratio", rev = c("both"))
## $tab
##
## Normal p0 Overweight p1 riskratio lower
## >3 months 1005 0.5943229 686 0.4056771 1.000000 NA
## <=3 months 758 0.5496737 621 0.4503263 1.110061 1.022641
##
## upper p.value
## >3 months NA NA
## <=3 months 1.204954 0.01394715
##
## $measure
## [1] "wald"
##
## $conf.level
## [1] 0.95
##
## $pvalue
## [1] "fisher.exact"
Confounder <- table(obes$ShortBf,obes$Overweight_new, obes$Sm, dnn = c("Breastfeeding", "Weight in Adulthood", "Maternal Single Status"))
print(Confounder)
## , , Maternal Single Status = 0
##
## Weight in Adulthood
## Breastfeeding Overweight Normal
## <=2 months 316 402
## >2 months 250 283
##
## , , Maternal Single Status = 1
##
## Weight in Adulthood
## Breastfeeding Overweight Normal
## <=2 months 448 546
## >2 months 526 810
epi.2by2(Confounder)
## Outcome + Outcome - Total Inc risk *
## Exposed + 764 948 1712 44.6
## Exposed - 776 1093 1869 41.5
## Total 1540 2041 3581 43.0
## Odds
## Exposed + 0.806
## Exposed - 0.710
## Total 0.755
##
##
## Point estimates and 95 % CIs:
## -------------------------------------------------------------------
## Inc risk ratio (crude) 1.07 (1.00, 1.16)
## Inc risk ratio (M-H) 1.06 (0.99, 1.15)
## Inc risk ratio (crude:M-H) 1.01
## Odds ratio (crude) 1.14 (0.99, 1.30)
## Odds ratio (M-H) 1.12 (0.98, 1.28)
## Odds ratio (crude:M-H) 1.02
## Attrib risk (crude) * 3.11 (-0.14, 6.35)
## Attrib risk (M-H) * 2.70 (-0.80, 6.19)
## Attrib risk (crude:M-H) 1.15
## -------------------------------------------------------------------
## Test of homogeneity of IRR: X2 test statistic: 6.527 p-value: 0.011
## Test of homogeneity of OR: X2 test statistic: 6.036 p-value: 0.014
## Wald confidence limits
## M-H: Mantel-Haenszel
## * Outcomes per 100 population units
Single <- subset(obes, obes$Sm == 1, na.rm = TRUE)
NonSingle <- subset(obes, obes$Sm == 0, na.rm = TRUE)
#Single mothers
EM <- table(Single$ShortBf,Single$Overweight_new, dnn = c("Breastfeeding", "Weight in Adulthood"))
print(EM)
## Weight in Adulthood
## Breastfeeding Overweight Normal
## <=2 months 448 546
## >2 months 526 810
#Non-Single mothers
EM_ <- table(NonSingle$ShortBf,NonSingle$Overweight_new, dnn = c("Breastfeeding", "Weight in Adulthood"))
print(EM_)
## Weight in Adulthood
## Breastfeeding Overweight Normal
## <=2 months 316 402
## >2 months 250 283
epi.2by2(EM)
## Outcome + Outcome - Total Inc risk *
## Exposed + 448 546 994 45.1
## Exposed - 526 810 1336 39.4
## Total 974 1356 2330 41.8
## Odds
## Exposed + 0.821
## Exposed - 0.649
## Total 0.718
##
## Point estimates and 95 % CIs:
## -------------------------------------------------------------------
## Inc risk ratio 1.14 (1.04, 1.26)
## Odds ratio 1.26 (1.07, 1.49)
## Attrib risk * 5.70 (1.65, 9.75)
## Attrib risk in population * 2.43 (-0.87, 5.73)
## Attrib fraction in exposed (%) 12.65 (3.88, 20.61)
## Attrib fraction in population (%) 5.82 (1.57, 9.88)
## -------------------------------------------------------------------
## X2 test statistic: 7.609 p-value: 0.006
## Wald confidence limits
## * Outcomes per 100 population units
epi.2by2(EM_)
## Outcome + Outcome - Total Inc risk *
## Exposed + 316 402 718 44.0
## Exposed - 250 283 533 46.9
## Total 566 685 1251 45.2
## Odds
## Exposed + 0.786
## Exposed - 0.883
## Total 0.826
##
## Point estimates and 95 % CIs:
## -------------------------------------------------------------------
## Inc risk ratio 0.94 (0.83, 1.06)
## Odds ratio 0.89 (0.71, 1.11)
## Attrib risk * -2.89 (-8.47, 2.69)
## Attrib risk in population * -1.66 (-6.72, 3.39)
## Attrib fraction in exposed (%) -6.57 (-20.44, 5.70)
## Attrib fraction in population (%) -3.67 (-11.00, 3.18)
## -------------------------------------------------------------------
## X2 test statistic: 1.034 p-value: 0.309
## Wald confidence limits
## * Outcomes per 100 population units
TheGood <- subset(obes, obes$Heavy == ">4 kg")
TheBad <- with(TheGood, subset(TheGood, ShortBf == "<=2 months"))
TheStars <- with(TheBad, subset(TheBad, CompFeed == "<=3 months"))
OverweightStars <- with(TheStars, summary(Overweight_new), na.rm=TRUE)
OverweightStars[1]/(OverweightStars[1]+OverweightStars[2])*100
## Overweight
## 46.9697