library(tidyverse)
Source: https://www.kaggle.com/arijit75/survey-data#monthly_deaths.csv
# Read the yearly births and deaths per clinic from the CSV hosted on GitHub.
Semmelweis <- readr::read_csv(
  file = "https://github.com/imsharvanj/Dr.-Semmelweis-and-the-discovery-of-handwashing/raw/master/notebook%20and%20datasets/datasets/yearly_deaths_by_clinic.csv"
)
Parsed with column specification:
cols(
year = [32mcol_double()[39m,
births = [32mcol_double()[39m,
deaths = [32mcol_double()[39m,
clinic = [31mcol_character()[39m
)
# Compact overview of the column types and first values.
Semmelweis %>%
  glimpse()
Observations: 12
Variables: 4
$ year [3m[38;5;246m<dbl>[39m[23m 1841, 1842, 1843, 1844, 1845, 1846, 1841, 1842, 1843, 1844, 1845, 1846
$ births [3m[38;5;246m<dbl>[39m[23m 3036, 3287, 3060, 3157, 3492, 4010, 2442, 2659, 2739, 2956, 3241, 3754
$ deaths [3m[38;5;246m<dbl>[39m[23m 237, 518, 274, 260, 241, 459, 86, 202, 164, 68, 66, 105
$ clinic [3m[38;5;246m<chr>[39m[23m "clinic 1", "clinic 1", "clinic 1", "clinic 1", "clinic 1", "clinic 1", "cl…
# Per-column summary statistics.
Semmelweis %>%
  summary()
year births deaths clinic
Min. :1841 Min. :2442 Min. : 66.0 Length:12
1st Qu.:1842 1st Qu.:2902 1st Qu.:100.2 Class :character
Median :1844 Median :3108 Median :219.5 Mode :character
Mean :1844 Mean :3153 Mean :223.3
3rd Qu.:1845 3rd Qu.:3338 3rd Qu.:263.5
Max. :1846 Max. :4010 Max. :518.0
La primera columna es el año, así que la convierto a texto (chr); también podría ser un factor (fct)
# Store the year as text so it is treated as a label rather than a number.
Semmelweis <- Semmelweis %>%
  mutate(year = as.character(year))
Veo por año y por clínica
# Cross-tabulate year against clinic to count the rows in each combination.
table(Semmelweis[["year"]], Semmelweis[["clinic"]])
clinic 1 clinic 2
1841 1 1
1842 1 1
1843 1 1
1844 1 1
1845 1 1
1846 1 1
Ok, no falta ninguna combinación de año y clínica
Grafico
# Yearly births, one line per clinic.
ggplot(data = Semmelweis, mapping = aes(x = year, y = births, group = clinic)) +
  geom_line()

# Yearly deaths, one line per clinic.
ggplot(data = Semmelweis, mapping = aes(x = year, y = deaths, group = clinic)) +
  geom_line()
Voy a crear una tasa de muertes por cada 100 nacimientos
# Deaths per 100 births for every year/clinic row, appended as a new column.
Semmelweis[["deathsPerBirth"]] <-
  Semmelweis[["deaths"]] / Semmelweis[["births"]] * 100
y la grafico
# Yearly death rate, one coloured line per clinic.
ggplot(
  data = Semmelweis,
  mapping = aes(x = year, y = deathsPerBirth, color = clinic, group = clinic)
) +
  geom_line()
¿Por qué una clínica tenía más muertes que la otra? Semmelweis dedujo correctamente que las parteras, al lavarse las manos, disminuían la tasa de mortalidad. Con esta información, hizo un experimento en la clínica 1
# Read the monthly births and deaths from the CSV hosted on GitHub.
clinic1 <- readr::read_csv(
  file = "https://github.com/imsharvanj/Dr.-Semmelweis-and-the-discovery-of-handwashing/raw/master/notebook%20and%20datasets/datasets/monthly_deaths.csv"
)
Parsed with column specification:
cols(
date = [34mcol_date(format = "")[39m,
births = [32mcol_double()[39m,
deaths = [32mcol_double()[39m
)
# Show the first rows of the monthly data.
clinic1 %>%
  head()

# Deaths per 100 births for every month, appended as a new column.
clinic1[["deathsPerBirth"]] <- clinic1[["deaths"]] / clinic1[["births"]] * 100
Lo veo
# Monthly death rate over time.
ggplot(data = clinic1, mapping = aes(x = date, y = deathsPerBirth)) +
  geom_line()
# Monthly death rate with the 1847-06-01 cut-off marked.
# The cut-off is written as an explicit date instead of a row index, so the
# marker does not depend on row order or on months missing from the data.
clinic1 %>%
  ggplot(aes(x = date, y = deathsPerBirth)) +
  geom_line() +
  geom_vline(
    xintercept = as.numeric(as.Date("1847-06-01")),
    linetype = 4,
    colour = "blue"
  ) +
  # Redraw the observations from the cut-off date onwards in blue. A function
  # given as `data` receives the plot data and returns the rows to draw.
  geom_line(
    data = function(d) d[d$date >= as.Date("1847-06-01"), ],
    colour = "blue"
  )
Otra manera
# Flag the months from 1847-06-01 onwards (TRUE) so the death rate can be
# compared before and after that date; the formula below needs this column.
clinic1 <- clinic1 %>%
  mutate(handwashing_started = date >= as.Date("1847-06-01"))

# Welch two-sample t-test of the monthly death rate between the two periods.
t.test(deathsPerBirth ~ handwashing_started, data = clinic1)
Welch Two Sample t-test
data: deathsPerBirth by handwashing_started
t = 9.6101, df = 92.435, p-value = 1.445e-15
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
6.660662 10.130659
sample estimates:
mean in group FALSE mean in group TRUE
10.504998 2.109338
# Keep the test result so its components can be inspected afterwards; the
# surrounding parentheses make R print the assigned object as well.
(clinicComparison <- t.test(
  deathsPerBirth ~ handwashing_started,
  data = clinic1
))
Welch Two Sample t-test
data: deathsPerBirth by handwashing_started
t = 9.6101, df = 92.435, p-value = 1.445e-15
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
6.660662 10.130659
sample estimates:
mean in group FALSE mean in group TRUE
10.504998 2.109338
# List the components stored in the t-test result.
t.test(deathsPerBirth ~ handwashing_started, data = clinic1) %>%
  names()
[1] "statistic" "parameter" "p.value" "conf.int" "estimate"
[6] "null.value" "alternative" "method" "data.name"
# Print the stored t-test result.
print(clinicComparison)
Welch Two Sample t-test
data: deathsPerBirth by handwashing_started
t = 9.6101, df = 92.435, p-value = 1.445e-15
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
6.660662 10.130659
sample estimates:
mean in group FALSE mean in group TRUE
10.504998 2.109338
# List the components available in the stored result.
clinicComparison %>%
  names()
[1] "statistic" "parameter" "p.value" "conf.int" "estimate"
[6] "null.value" "alternative" "method" "data.name"
# Extract the confidence interval for the difference in means.
clinicComparison[["conf.int"]]
[1] 6.660662 10.130659
attr(,"conf.level")
[1] 0.95