#question 2 (a) to (g)
IHDP <- read.csv("HW1_IHDP.csv")

library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## section a 

### HISTOGRAM FOR TREATMENT GROUP
ggplot(subset(IHDP, treat == 1), aes(x = nnhealth)) +
  geom_histogram(binwidth = 5,  
                 fill = "skyblue", 
                 color = "black") +
  labs(title = "NNHEALTH for Treatment Group",
       x = "nnhealth",
       y = "Frequency")

### HISTOGRAM FOR CONTROL GROUP
ggplot(subset(IHDP, treat == 0), aes(x = nnhealth)) +
  geom_histogram(binwidth = 5,  
                 fill = "skyblue", 
                 color = "black") +
  labs(title = "NNHEALTH for Control Group",
       x = "nnhealth",
       y = "Frequency")

####interpretation: 

## section b
### SIDE BY SIDE BOXPLOT
ggplot(IHDP) +
  aes(x = treat, y=nnhealth, group=treat) +
  stat_boxplot(geom = "errorbar") +
  geom_boxplot()

## section c
###SUMMARY FOR TREATMENT GROUP
IHDP <- read.csv("HW1_IHDP.csv")

library(DescTools)
## Warning: package 'DescTools' was built under R version 4.3.3
treat_IHDP <-IHDP[IHDP$treat == 1,]

Desc(treat_IHDP$nnhealth)
## ────────────────────────────────────────────────────────────────────────────── 
## treat_IHDP$nnhealth (integer)
## 
##   length       n    NAs  unique      0s    mean  meanCI'
##      377     377      0      75       0  100.66   99.04
##           100.0%   0.0%            0.0%          102.28
##                                                        
##      .05     .10    .25  median     .75     .90     .95
##    72.00   80.60  93.00  103.00  111.00  117.40  121.20
##                                                        
##    range      sd  vcoef     mad     IQR    skew    kurt
##   121.00   15.97   0.16   13.34   18.00   -1.18    2.70
##                                                        
## lowest : 15, 42, 46 (2), 52, 53
## highest: 127 (3), 128, 129, 132, 136
## 
## ' 95%-CI (classic)

###SUMMARY FOR CONTROL GROUP
IHDP <- read.csv("HW1_IHDP.csv")

library(DescTools)

control_IHDP <-IHDP[IHDP$treat == 0,]

Desc(control_IHDP$nnhealth)
## ────────────────────────────────────────────────────────────────────────────── 
## control_IHDP$nnhealth (integer)
## 
##   length       n    NAs  unique      0s    mean  meanCI'
##      608     608      0      83       0   99.58   98.32
##           100.0%   0.0%            0.0%          100.84
##                                                        
##      .05     .10    .25  median     .75     .90     .95
##    72.00   81.00  91.00  101.00  110.00  117.00  121.65
##                                                        
##    range      sd  vcoef     mad     IQR    skew    kurt
##   120.00   15.82   0.16   13.34   19.00   -0.92    2.43
##                                                        
## lowest : 17, 32, 37, 38, 40
## highest: 132 (2), 133 (2), 135, 136 (2), 137 (2)
## 
## ' 95%-CI (classic)

##section d
###SUMMARY TABLE FOR TREATMENT AND CONTROL GROUPS
summary_table <- data.frame(
  Group = c("Treatment", "Control"),
  Mean = c(mean(treat_IHDP$nnhealth), mean(control_IHDP$nnhealth)),
  SD = c(sd(treat_IHDP$nnhealth), sd(control_IHDP$nnhealth))
)
print(summary_table)
##       Group      Mean       SD
## 1 Treatment 100.65782 15.97412
## 2   Control  99.58388 15.81825
##section e
treat_summary <- fivenum(treat_IHDP$nnhealth)
control_summary <- fivenum(control_IHDP$nnhealth)

# Create the summary table
five_num_table <- data.frame(
  Summary = c("Min", "Q1", "Median", "Q3", "Max"),
  Treatment = treat_summary,
  Control = control_summary
)
print(five_num_table)
##   Summary Treatment Control
## 1     Min        15      17
## 2      Q1        93      91
## 3  Median       103     101
## 4      Q3       111     110
## 5     Max       136     137
##section f
####The distribution of nnhealth across treatment groups is very even, as they show similar means, standard deviations, as well as the minimum, first quartile, median, third quartile, and maximum. This means that both groups have similar populations and have been randomized effectively. 

##section g
####The numerical summary under section e (five-number summary) is most appropriate for the nnhealth variable because it better shows the distribution of the neonatal health of the treatment and control group. With the summary in section d, we are only provided the mean and standard deviation, which makes it more difficult to understand the range of the dataset and where most of the dataset lies.


#question 3 

IHDP <- read.csv("HW1_IHDP.csv")

IHDP %>%
  group_by(treat) %>%
  summarise(mean = mean(iqsb_36),
            sd = sd(iqsb_36),
            n = n())
## # A tibble: 2 × 4
##   treat  mean    sd     n
##   <int> <dbl> <dbl> <int>
## 1     0  84.1  20.0   608
## 2     1  93.3  19.1   377
#question 4 
##section a
IHDP <- read.csv("HW1_IHDP.csv")

library(tidyverse)
library(DescTools)

IHDP <- IHDP %>%
  mutate(prenatal = factor(prenatal,
                           levels = c(0, 1),
                           labels = c("0. No prenatal",
                                      "1. Prenatal care")))

Freq(IHDP$prenatal)
##               level  freq   perc  cumfreq  cumperc
## 1    0. No prenatal    45   4.6%       45     4.6%
## 2  1. Prenatal care   940  95.4%      985   100.0%
IHDP <- IHDP %>%
  mutate(cig = factor(cig,
                      levels = c(0, 1),
                      labels = c("0. No smoke",
                                 "1. Smoked")))

Freq(IHDP$cig)
##          level  freq   perc  cumfreq  cumperc
## 1  0. No smoke   639  64.9%      639    64.9%
## 2    1. Smoked   346  35.1%      985   100.0%
##section b
PercTable(IHDP$cig,IHDP$prenatal,margins=c(1,2), rfrq="001")
##                                                              
##                       0. No prenatal   1. Prenatal care   Sum
##                                                              
## 0. No smoke   freq                25                614   639
##               p.col            55.6%              65.3% 64.9%
##                                                              
## 1. Smoked     freq                20                326   346
##               p.col            44.4%              34.7% 35.1%
##                                                              
## Sum           freq                45                940   985
##               p.col                .                  .     .
##