library(tidyverse)
library(ggbeeswarm)
library(broom)
library(mosaic)

Cloud seeding: In an experiment to determine whether seeding clouds with silver iodide increases rainfall, 52 clouds were randomly assigned to be seeded or not. The amount of rain they generated was then measured (in acre-feet). (Data and story taken from https://dasl.datadescription.com/datafile/cloud-seeding)

d <- read_tsv("http://www.stat.umn.edu/~rend0020/data/cloud-seeding.txt")
## Parsed with column specification:
## cols(
##   `Unseeded Clouds` = col_double(),
##   `Seeded Clouds` = col_double()
## )

Caution: this data is not in “tidy” format…

d <- d %>% gather("Trt", "Rain")

is it normally distributed enough??

gf_boxplot(Rain ~ Trt, data=d, outlier.shape=NA) %>%
  gf_point(position=position_beeswarm())

gf_dhistogram(~Rain|Trt, data=d, bins=7) %>% gf_dens() %>% gf_fitdistr()

gf_qq(~Rain|Trt, data=d) %>% gf_qqline()

gf_qq(~Rain, color=~Trt, data=d) %>% gf_qqline()

try on the log scale

will use natural log, could also use log2 or log10

d <- d %>% mutate(logRain=log(Rain))

for boxplot, can do this in the plot or in the data set

gf_boxplot(Rain ~ Trt, data=d, outlier.shape=NA) %>%
  gf_point(position=position_beeswarm()) +
  scale_y_log10()

gf_boxplot(logRain ~ Trt, data=d, outlier.shape=NA) %>%
  gf_point(position=position_beeswarm())

for others, need to be in the data set

gf_dhistogram(~logRain|Trt, data=d, bins=7) %>% gf_dens() %>% gf_fitdistr()

gf_qq(~logRain|Trt, data=d) %>% gf_qqline()

gf_qq(~logRain, color=~Trt, data=d) %>% gf_qqline()

calculating t-test “by hand”

ss <- d %>% group_by(Trt) %>% summarize(m=mean(logRain), s=sd(logRain), n=n()); ss
## # A tibble: 2 x 4
##   Trt                 m     s     n
##   <chr>           <dbl> <dbl> <int>
## 1 Seeded Clouds    5.13  1.60    26
## 2 Unseeded Clouds  3.99  1.64    26
dif <- ss$m[1] - ss$m[2]; dif
## [1] 1.143781
SE <- sqrt(ss$s[1]^2/ss$n[1] + ss$s[2]^2/ss$n[2]); SE
## [1] 0.4495342
z <- dif/SE; z
## [1] 2.544369
df.guess <- min(ss$n) - 1; df.guess
## [1] 25
pval <- pt(z, df.guess, lower.tail=FALSE)*2; pval
## [1] 0.0175028
ME <- qt(1-0.05/2, df.guess)*SE; ME
## [1] 0.9258331
c(dif - ME, dif + ME)
## [1] 0.217948 2.069614

using R

t.test(logRain ~ Trt, data=d)
## 
##  Welch Two Sample t-test
## 
## data:  logRain by Trt
## t = 2.5444, df = 49.966, p-value = 0.01408
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2408498 2.0467125
## sample estimates:
##   mean in group Seeded Clouds mean in group Unseeded Clouds 
##                      5.134187                      3.990406

“tidying” the result and back transforming

out <- t.test(logRain ~ Trt, data=d) %>% tidy()
out %>% select(estimate, conf.low, conf.high, p.value) %>%
  mutate(ratio.est = exp(estimate), ratio.low=exp(conf.low), ratio.high=exp(conf.high))
## # A tibble: 1 x 7
##   estimate conf.low conf.high p.value ratio.est ratio.low ratio.high
##      <dbl>    <dbl>     <dbl>   <dbl>     <dbl>     <dbl>      <dbl>
## 1     1.14    0.241      2.05  0.0141      3.14      1.27       7.74