# Load the per-country participation data. Columns are parsed as double unless
# they are explicitly listed as character below; afterwards the column names
# are normalised with janitor::clean_names().
dados_raw <- read_csv(
  file = here::here("data/participation-per-country.csv"),
  col_types = cols(
    site = col_character(),
    country = col_character(),
    geo = col_character(),
    four_regions = col_character(),
    eight_regions = col_character(),
    six_regions = col_character(),
    `World bank income group 2017` = col_character(),
    .default = col_double()
  )
) %>%
  janitor::clean_names()

# Quick structural overview of the loaded table.
glimpse(dados_raw)
Rows: 157
Columns: 21
$ site <chr> "StackOverflow", "StackOverflow", "StackOverflow", "StackOverflow", "StackOverflow", "StackOverflow", "St…
$ country <chr> "Argentina", "Australia", "Austria", "Bangladesh", "Belgium", "Brazil", "Bulgaria", "Canada", "Chile", "C…
$ pdi <dbl> 49, 36, 11, 80, 65, 69, 70, 39, 63, 80, 67, 35, 73, 57, 18, 78, 70, 66, 40, 70, 33, 68, 35, 80, 60, 95, 6…
$ idv <dbl> 46, 90, 55, 20, 75, 38, 30, 80, 23, 20, 13, 15, 33, 58, 74, 8, 25, 19, 60, 20, 63, 71, 67, 15, 35, 6, 25,…
$ mas <dbl> 56, 61, 79, 55, 54, 49, 40, 52, 28, 66, 64, 21, 40, 57, 16, 63, 45, 40, 30, 65, 26, 43, 66, 40, 57, 37, 5…
$ uai <dbl> 86, 51, 70, 60, 94, 76, 85, 48, 86, 30, 80, 86, 80, 74, 23, 67, 80, 94, 60, 55, 59, 86, 65, 65, 112, 101,…
$ usuarios <dbl> 2798, 12313, 2518, 2558, 4275, 10717, 1463, 17591, 1075, 13401, 1224, 431, 1069, 2701, 4029, 278, 1965, 1…
$ responderam_prop <dbl> 0.5357398, 0.6133355, 0.6310564, 0.3928851, 0.6081871, 0.4826911, 0.5680109, 0.6015008, 0.4837209, 0.3557…
$ perguntaram_prop <dbl> 0.5210865, 0.5897832, 0.5933280, 0.4757623, 0.6088889, 0.4658020, 0.5140123, 0.5987721, 0.4846512, 0.3631…
$ editaram_prop <dbl> 0.09256612, 0.14699911, 0.14932486, 0.08053167, 0.14666667, 0.08351218, 0.11483254, 0.13472799, 0.0762790…
$ comentaram_prop <dbl> 0.25339528, 0.33395598, 0.35027800, 0.15989054, 0.32023392, 0.20388168, 0.28639781, 0.32840657, 0.2102325…
$ gni <dbl> NA, 59570, 48160, 840, 44990, 11630, 6870, 50970, 14280, 5680, 6990, 8740, 13290, 18130, 59770, 5200, 300…
$ internet <dbl> 51.0, 79.5, 79.8, 5.0, 78.0, 45.0, 51.0, 83.0, 52.3, 38.3, 40.4, 42.1, 59.6, 73.0, 90.0, 31.4, 95.0, 18.9…
$ epi <dbl> 59.02, NA, 63.21, NA, 61.21, 49.96, NA, NA, 48.75, 50.15, 48.54, 48.53, NA, 57.42, 69.30, 51.05, 42.13, 4…
$ geo <chr> "arg", "aus", "aut", "bgd", "bel", "bra", "bgr", "can", "chl", "chn", "col", "cri", "hrv", "cze", "dnk", …
$ four_regions <chr> "americas", "asia", "europe", "asia", "europe", "americas", "europe", "americas", "americas", "asia", "am…
$ eight_regions <chr> "america_south", "east_asia_pacific", "europe_west", "asia_west", "europe_west", "america_south", "europe…
$ six_regions <chr> "america", "east_asia_pacific", "europe_central_asia", "south_asia", "europe_central_asia", "america", "e…
$ latitude <dbl> -34.00000, -25.00000, 47.33333, 24.00000, 50.75000, -10.00000, 42.66667, 60.10867, -33.45694, 35.00000, 4…
$ longitude <dbl> -64.00000, 135.00000, 13.33333, 90.00000, 4.50000, -55.00000, 25.25000, -113.64258, -70.64827, 105.00000,…
$ world_bank_income_group_2017 <chr> "Upper middle income", "High income", "High income", "Lower middle income", "High income", "Upper middle …
Começando com uma visualização simples das duas variáveis sobre as quais será feita a análise: gni e responderam_prop
# Number of rows available for each site. count() already groups by `site`
# internally, so an explicit group_by() is unnecessary and would only leave
# the result grouped.
dados_raw %>%
  count(site)
# Modelling data: StackOverflow rows with a non-missing GNI. `gni` is moved to
# a log10 scale and `responderam_prop` is converted from a proportion to a
# percentage.
dados1 <- dados_raw %>%
  filter(!is.na(gni), site == "StackOverflow") %>%
  mutate(
    gni = log10(gni),
    responderam_prop = responderam_prop * 100
  )
# Scatter plot of responderam_prop against gni, as stored in dados1.
ggplot(data = dados1, mapping = aes(x = gni, y = responderam_prop)) +
  geom_point()
# Simple linear regression: responderam_prop = b0 + b1 * gni
mod1 <- lm(responderam_prop ~ gni, data = dados1)

# Base R alternative, not used here:
# summary(mod1)

# broom / tidymodels syntax: coefficient table first, then the model-level
# summary statistics.
mod1 %>% tidy()
mod1 %>% glance()
A função ajustada pelo modelo foi responderam_prop = 4.21897 + 11.33284 * gni (com gni em escala log10)
# Rows of dados1 together with the per-observation columns that augment()
# derives from the model (e.g. .fitted and .resid, used in the plots below).
augment(mod1, data = dados1)
# Observed points with the fitted regression line overlaid in red.
augment(mod1, data = dados1) %>%
  ggplot(aes(x = gni)) +
  geom_point(aes(y = responderam_prop), size = .6, alpha = 1) +
  geom_line(aes(y = .fitted), colour = "red")
# Coefficient estimates with their 95% confidence intervals; the p-value
# column is dropped from the table. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
mod1 %>%
  tidy(conf.int = TRUE, conf.level = 0.95) %>%
  select(-p.value)
# Residuals plotted against the predictor, with a blue reference line at zero.
augment(mod1, data = dados1) %>%
  ggplot(mapping = aes(x = gni, y = .resid)) +
  geom_point(size = .5, alpha = .4) +
  geom_hline(yintercept = 0, colour = "blue")
Regressão linear simples foi utilizada para analisar a associação entre gni (renda nacional bruta per capita, em escala log10) e responderam_prop (% das pessoas que responderam alguma vez a uma pergunta no Stack Overflow, único site mantido pelo filtro site == "StackOverflow") em todo o mundo. Um modelo no formato responderam_prop = 4.21897 + 11.33284 * gni explica aproximadamente 46% da variância da variável de resposta (R² = 0.4576176). O aumento de 1 unidade em gni (isto é, uma renda per capita 10 vezes maior no país) produz um acréscimo de 11.33 pontos percentuais nas pessoas que responderam (IC 95% [8.475037, 14.19064]). Portanto, podemos observar que há uma influência considerável, pois uma pequena mudança em gni (na escala log10) causa uma diferença plausível nas respostas das pessoas.