#question 2 (a) to (g)
IHDP <- read.csv("HW1_IHDP.csv")
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## section a
### HISTOGRAM FOR TREATMENT GROUP
ggplot(subset(IHDP, treat == 1), aes(x = nnhealth)) +
geom_histogram(binwidth = 5,
fill = "skyblue",
color = "black") +
labs(title = "NNHEALTH for Treatment Group",
x = "nnhealth",
y = "Frequency")

### HISTOGRAM FOR CONTROL GROUP
ggplot(subset(IHDP, treat == 0), aes(x = nnhealth)) +
geom_histogram(binwidth = 5,
fill = "skyblue",
color = "black") +
labs(title = "NNHEALTH for Control Group",
x = "nnhealth",
y = "Frequency")

####interpretation:
## section b
### SIDE BY SIDE BOXPLOT
ggplot(IHDP) +
aes(x = treat, y=nnhealth, group=treat) +
stat_boxplot(geom = "errorbar") +
geom_boxplot()

## section c
###SUMMARY FOR TREATMENT GROUP
IHDP <- read.csv("HW1_IHDP.csv")
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.3.3
treat_IHDP <-IHDP[IHDP$treat == 1,]
Desc(treat_IHDP$nnhealth)
## ──────────────────────────────────────────────────────────────────────────────
## treat_IHDP$nnhealth (integer)
##
## length n NAs unique 0s mean meanCI'
## 377 377 0 75 0 100.66 99.04
## 100.0% 0.0% 0.0% 102.28
##
## .05 .10 .25 median .75 .90 .95
## 72.00 80.60 93.00 103.00 111.00 117.40 121.20
##
## range sd vcoef mad IQR skew kurt
## 121.00 15.97 0.16 13.34 18.00 -1.18 2.70
##
## lowest : 15, 42, 46 (2), 52, 53
## highest: 127 (3), 128, 129, 132, 136
##
## ' 95%-CI (classic)

###SUMMARY FOR CONTROL GROUP
IHDP <- read.csv("HW1_IHDP.csv")
library(DescTools)
control_IHDP <-IHDP[IHDP$treat == 0,]
Desc(control_IHDP$nnhealth)
## ──────────────────────────────────────────────────────────────────────────────
## control_IHDP$nnhealth (integer)
##
## length n NAs unique 0s mean meanCI'
## 608 608 0 83 0 99.58 98.32
## 100.0% 0.0% 0.0% 100.84
##
## .05 .10 .25 median .75 .90 .95
## 72.00 81.00 91.00 101.00 110.00 117.00 121.65
##
## range sd vcoef mad IQR skew kurt
## 120.00 15.82 0.16 13.34 19.00 -0.92 2.43
##
## lowest : 17, 32, 37, 38, 40
## highest: 132 (2), 133 (2), 135, 136 (2), 137 (2)
##
## ' 95%-CI (classic)

##section d
###SUMMARY TABLE FOR TREATMENT AND CONTROL GROUPS
summary_table <- data.frame(
Group = c("Treatment", "Control"),
Mean = c(mean(treat_IHDP$nnhealth), mean(control_IHDP$nnhealth)),
SD = c(sd(treat_IHDP$nnhealth), sd(control_IHDP$nnhealth))
)
print(summary_table)
## Group Mean SD
## 1 Treatment 100.65782 15.97412
## 2 Control 99.58388 15.81825
##section e
treat_summary <- fivenum(treat_IHDP$nnhealth)
control_summary <- fivenum(control_IHDP$nnhealth)
# Create the summary table
five_num_table <- data.frame(
Summary = c("Min", "Q1", "Median", "Q3", "Max"),
Treatment = treat_summary,
Control = control_summary
)
print(five_num_table)
## Summary Treatment Control
## 1 Min 15 17
## 2 Q1 93 91
## 3 Median 103 101
## 4 Q3 111 110
## 5 Max 136 137
##section f
####The distribution of nnhealth across treatment groups is very even, as they show similar means, standard deviations, as well as the minimum, first quartile, median, third quartile, and maximum. This means that both groups have similar populations and have been randomized effectively.
##section g
####The numerical summary under section e (five-number summary) is most appropriate for the nnhealth variable because it better shows the distribution of the neonatal health of the treatment and control group. With the summary in section d, we are only provided the mean and standard deviation, which makes it more difficult to understand the range of the dataset and where most of the dataset lies.
#question 3
IHDP <- read.csv("HW1_IHDP.csv")
IHDP %>%
group_by(treat) %>%
summarise(mean = mean(iqsb_36),
sd = sd(iqsb_36),
n = n())
## # A tibble: 2 × 4
## treat mean sd n
## <int> <dbl> <dbl> <int>
## 1 0 84.1 20.0 608
## 2 1 93.3 19.1 377
#question 4
##section a
IHDP <- read.csv("HW1_IHDP.csv")
library(tidyverse)
library(DescTools)
IHDP <- IHDP %>%
mutate(prenatal = factor(prenatal,
levels = c(0, 1),
labels = c("0. No prenatal",
"1. Prenatal care")))
Freq(IHDP$prenatal)
## level freq perc cumfreq cumperc
## 1 0. No prenatal 45 4.6% 45 4.6%
## 2 1. Prenatal care 940 95.4% 985 100.0%
IHDP <- IHDP %>%
mutate(cig = factor(cig,
levels = c(0, 1),
labels = c("0. No smoke",
"1. Smoked")))
Freq(IHDP$cig)
## level freq perc cumfreq cumperc
## 1 0. No smoke 639 64.9% 639 64.9%
## 2 1. Smoked 346 35.1% 985 100.0%
##section b
PercTable(IHDP$cig,IHDP$prenatal,margins=c(1,2), rfrq="001")
##
## 0. No prenatal 1. Prenatal care Sum
##
## 0. No smoke freq 25 614 639
## p.col 55.6% 65.3% 64.9%
##
## 1. Smoked freq 20 326 346
## p.col 44.4% 34.7% 35.1%
##
## Sum freq 45 940 985
## p.col . . .
##