# install.packages("devtools")
# devtools::install_github("ankitrohatgi/digitizeR")
# library('digitizeR')
library("tidyverse")
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.0.0     v purrr   0.2.5
v tibble  1.4.2     v dplyr   0.7.6
v tidyr   0.8.1     v stringr 1.3.0
v readr   1.1.1     v forcats 0.3.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
Figura 2
# Disabled launcher call for the digitizeR app (presumably the tool used to
# digitize the figure's data points -- confirm).
# app <- wpd.launch()
# Read "DisparitiesCaries.csv" into the tibble `df`; readr guesses the column
# types and reports them.
df <- read_csv("DisparitiesCaries.csv")
Parsed with column specification:
cols(
MCD = col_double(),
`Caries prevalence` = col_double(),
Name = col_character()
)
# Print the tibble to inspect the loaded rows.
df
# A tibble: 13 x 3
      MCD `Caries prevalence` Name
    <dbl>               <dbl> <chr>
 1 0.0502                28.4 La Pintana
 2 0.137                 53.9 Lampa
 3 0.210                 26.7 Recoleta
 4 0.305                 40.7 Independencia
 5 0.386                 42.5 San Jose
 6 0.413                 46.2 Isla de Maipo
 7 0.437                 24.6 Maipu
 8 0.482                 42.7 Melipilla
 9 0.533                 35.3 Peñalolen
10 0.604                 40.4 Cerrillos
11 0.770                 39.4 Santiago
12 0.851                 29.4 Providencia
13 0.952                 17.0 Vitacura – Las Condes
# Compact overview of `df`: one line per column with its type and first values.
glimpse(df)
Observations: 13
Variables: 3
$ MCD <dbl> 0.05022321, 0.13743133, 0.21005779, 0.30511758, 0.38561261, 0.41320146, 0.43709088, 0.482359...
$ `Caries prevalence` <dbl> 28.41328, 53.94834, 26.71587, 40.66421, 42.50923, 46.19926, 24.57565, 42.73063, 35.27675, 40...
$ Name <chr> "La Pintana", "Lampa", "Recoleta", "Independencia", "San Jose", "Isla de Maipo", "Maipu", "M...
# Per-column summary statistics (min, quartiles, mean, max for numeric columns).
summary(df)
MCD Caries prevalence Name
Min. :0.05022 Min. :17.05 Length:13
1st Qu.:0.30512 1st Qu.:28.41 Class :character
Median :0.43709 Median :39.41 Mode :character
Mean :0.47169 Mean :35.94
3rd Qu.:0.60450 3rd Qu.:42.51
Max. :0.95217 Max. :53.95
# Plain scatter plot: caries prevalence against MCD, one point per row of `df`.
ggplot(data = df, mapping = aes(x = MCD, y = `Caries prevalence`)) +
  geom_point()
# Scatter plot in which every point is annotated with its `Name` value.
ggplot(
  data = df,
  # `label` is the aesthetic that geom_text() draws
  mapping = aes(x = MCD, y = `Caries prevalence`, label = Name)
) +
  geom_point() +
  # place the label text slightly below and to the right of each point
  geom_text(vjust = 1.5, nudge_x = 0.007, size = 2.7) +
  theme_minimal()
geom_smooth(method = "lm", formula = y ~ x)
# Labelled scatter plot with a smoothed trend layer on top
# (geom_smooth() with its default method and settings).
ggplot(df, aes(MCD, `Caries prevalence`, label = Name)) +
  geom_point() +
  geom_text(vjust = 1.5, nudge_x = 0.007, size = 2.7) +
  geom_smooth() +
  theme_minimal()
Voy a probar con GAM y (y ~ poly(x, 2))
# Plot only the "Vitacura – Las Condes" row, labelled.
# The smoothing layer stays disabled for this plot.
ggplot(
  filter(df, Name == "Vitacura – Las Condes"),
  aes(MCD, `Caries prevalence`, label = Name)
) +
  geom_point() +
  geom_text(vjust = 1.5, nudge_x = 0.007, size = 2.7) +
  theme_minimal()
# Labelled scatter plot with a smoothed trend, leaving out the
# "Vitacura – Las Condes" row.
ggplot(
  filter(df, Name != "Vitacura – Las Condes"),
  aes(MCD, `Caries prevalence`, label = Name)
) +
  geom_point() +
  geom_text(vjust = 1.5, nudge_x = 0.007, size = 2.7) +
  geom_smooth() +
  theme_minimal()
Aparentemente si
elimino Lampa, Maipu y Vitacura
# Labelled scatter plot with a smoothed trend, leaving out Vitacura – Las
# Condes, Lampa and Maipu.
#
# The exclusion uses `%in%`, which tests every Name against the whole set.
# The earlier `Name != c("Vitacura – Las Condes", "Lampa")` recycled the
# two-element vector along the column, so each row was compared with only one
# of the two names; it also raised the warning "longer object length is not a
# multiple of shorter object length".
df %>%
  filter(!Name %in% c("Vitacura – Las Condes", "Lampa", "Maipu")) %>%
  ggplot(aes(x = MCD, y = `Caries prevalence`, label = Name)) +
  geom_point() +
  theme_minimal() +
  geom_text(vjust = 1.5, nudge_x = 0.007, size = 2.7) +
  geom_smooth() # add the smoothed trend layer
# Bar chart of caries prevalence per `Name`, in the default axis order.
ggplot(data = df, mapping = aes(x = Name, y = `Caries prevalence`)) +
  geom_col()
Todos los gráficos de barra tienen un orden
# Horizontal bar chart with the bars ordered by caries prevalence.
ggplot(
  df,
  # fct_reorder() sorts the Name levels by their prevalence value
  aes(x = fct_reorder(Name, `Caries prevalence`), y = `Caries prevalence`)
) +
  geom_col() +
  coord_flip() +
  labs(x = "Comuna", y = "Prevalencia de caries")
# Mean and standard deviation of the `Caries prevalence` column.
promedio_caries <- mean(df[["Caries prevalence"]])
sd_caries <- sd(df[["Caries prevalence"]])
De hecho parece que Lampa, Maipu y Vitacura son outliers
library(broom)
# Fit the linear model `Caries prevalence` ~ MCD on all rows of `df`;
# do() stores the fitted model in the list-column `reg`.
dfReg <- do(df, reg = lm(`Caries prevalence` ~ MCD, data = .))
# ANOVA table of that fit, converted to a plain data frame and printed.
ANOVAreg <- dfReg %>%
  rowwise() %>%
  do(anova(.$reg)) %>%
  as.data.frame()
ANOVAreg
Df Sum Sq Mean Sq F value Pr(>F)
1 1 160.471 160.47102 1.628871 0.2281474
2 11 1083.684 98.51674 NA NA
# Coefficient table (estimate, std.error, statistic, p.value per term) for the
# model stored in column `reg`.
tidy(dfReg,reg)
term estimate std.error statistic p.value
1 (Intercept) 42.30028 5.692609 7.430736 1.308437e-05
2 MCD -13.48198 10.563565 -1.276272 2.281474e-01
# One-row model-level summary (r.squared, sigma, AIC, BIC, ...) for the model
# stored in column `reg`.
glance(dfReg,reg)
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
1 0.1289799 0.04979626 9.92556 1.628871 0.2281474 2 -47.19682 100.3936 102.0885 1083.684 11
¿Qué pasa si elimino Vitacura?
# Refit the linear model `Caries prevalence` ~ MCD after dropping the
# "Vitacura – Las Condes" row; the fit is stored in the list-column
# `reg_sinVitacura`.
dfReg_sinVitacura <- df %>%
  filter(Name != "Vitacura – Las Condes") %>%
  do(reg_sinVitacura = lm(`Caries prevalence` ~ MCD, data = .))
# ANOVA table of the refit. The column is referenced by its full name:
# `.$reg` is not a column of this table and could only resolve through `$`
# partial matching, which breaks as soon as a second `reg*` column exists.
ANOVAreg_sinVitacura <- dfReg_sinVitacura %>%
  rowwise() %>%
  do(anova(.$reg_sinVitacura))
ANOVAreg_sinVitacura <- as.data.frame(ANOVAreg_sinVitacura)
ANOVAreg_sinVitacura
Df Sum Sq Mean Sq F value Pr(>F)
1 1 6.761167 6.761167 0.07947736 0.7837551
2 10 850.703555 85.070356 NA NA
# Coefficient table for the model fitted without Vitacura
# (column `reg_sinVitacura`).
tidy(dfReg_sinVitacura,reg_sinVitacura)
term estimate std.error statistic p.value
1 (Intercept) 38.926372 5.669149 6.8663516 4.370185e-05
2 MCD -3.268847 11.595056 -0.2819173 7.837551e-01
# One-row model-level summary for the model fitted without Vitacura
# (column `reg_sinVitacura`).
glance(dfReg_sinVitacura,reg_sinVitacura)
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
1 0.007885067 -0.09132643 9.223359 0.07947736 0.7837551 2 -42.5942 91.18841 92.64313 850.7036 10