library(tidyverse)
library(tidymodels)
── Attaching packages ──────────────────────────────────────────────────────────────────────────────── tidymodels 0.1.3 ──
✓ broom     0.7.8      ✓ rsample      0.1.0
✓ dials     0.0.10     ✓ tune         0.1.6
✓ infer     1.0.0      ✓ workflows    0.2.3
✓ modeldata 0.1.1      ✓ workflowsets 0.1.0
✓ parsnip   0.1.7      ✓ yardstick    0.0.8
✓ recipes   0.1.16
── Conflicts ─────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
x scales::discard() masks purrr::discard()
x dplyr::filter() masks stats::filter()
x recipes::fixed() masks stringr::fixed()
x dplyr::lag() masks stats::lag()
x yardstick::spec() masks readr::spec()
x recipes::step() masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
# read the per-country participation data and normalize the column names
dados_raw = read_csv(
  here::here("data/participation-per-country.csv"),
  col_types = cols(
    .default = col_double(),
    site = col_character(),
    country = col_character(),
    geo = col_character(),
    four_regions = col_character(),
    eight_regions = col_character(),
    six_regions = col_character(),
    `World bank income group 2017` = col_character()
  )
) %>%
  janitor::clean_names()
glimpse(dados_raw)
Rows: 157
Columns: 21
$ site <chr> "StackOverflow", "StackOverflow", "StackOverflow", "StackOverflow", "StackOverflow", "StackOverflow", "St…
$ country <chr> "Argentina", "Australia", "Austria", "Bangladesh", "Belgium", "Brazil", "Bulgaria", "Canada", "Chile", "C…
$ pdi <dbl> 49, 36, 11, 80, 65, 69, 70, 39, 63, 80, 67, 35, 73, 57, 18, 78, 70, 66, 40, 70, 33, 68, 35, 80, 60, 95, 6…
$ idv <dbl> 46, 90, 55, 20, 75, 38, 30, 80, 23, 20, 13, 15, 33, 58, 74, 8, 25, 19, 60, 20, 63, 71, 67, 15, 35, 6, 25,…
$ mas <dbl> 56, 61, 79, 55, 54, 49, 40, 52, 28, 66, 64, 21, 40, 57, 16, 63, 45, 40, 30, 65, 26, 43, 66, 40, 57, 37, 5…
$ uai <dbl> 86, 51, 70, 60, 94, 76, 85, 48, 86, 30, 80, 86, 80, 74, 23, 67, 80, 94, 60, 55, 59, 86, 65, 65, 112, 101,…
$ usuarios <dbl> 2798, 12313, 2518, 2558, 4275, 10717, 1463, 17591, 1075, 13401, 1224, 431, 1069, 2701, 4029, 278, 1965, 1…
$ responderam_prop <dbl> 0.5357398, 0.6133355, 0.6310564, 0.3928851, 0.6081871, 0.4826911, 0.5680109, 0.6015008, 0.4837209, 0.3557…
$ perguntaram_prop <dbl> 0.5210865, 0.5897832, 0.5933280, 0.4757623, 0.6088889, 0.4658020, 0.5140123, 0.5987721, 0.4846512, 0.3631…
$ editaram_prop <dbl> 0.09256612, 0.14699911, 0.14932486, 0.08053167, 0.14666667, 0.08351218, 0.11483254, 0.13472799, 0.0762790…
$ comentaram_prop <dbl> 0.25339528, 0.33395598, 0.35027800, 0.15989054, 0.32023392, 0.20388168, 0.28639781, 0.32840657, 0.2102325…
$ gni <dbl> NA, 59570, 48160, 840, 44990, 11630, 6870, 50970, 14280, 5680, 6990, 8740, 13290, 18130, 59770, 5200, 300…
$ internet <dbl> 51.0, 79.5, 79.8, 5.0, 78.0, 45.0, 51.0, 83.0, 52.3, 38.3, 40.4, 42.1, 59.6, 73.0, 90.0, 31.4, 95.0, 18.9…
$ epi <dbl> 59.02, NA, 63.21, NA, 61.21, 49.96, NA, NA, 48.75, 50.15, 48.54, 48.53, NA, 57.42, 69.30, 51.05, 42.13, 4…
$ geo <chr> "arg", "aus", "aut", "bgd", "bel", "bra", "bgr", "can", "chl", "chn", "col", "cri", "hrv", "cze", "dnk", …
$ four_regions <chr> "americas", "asia", "europe", "asia", "europe", "americas", "europe", "americas", "americas", "asia", "am…
$ eight_regions <chr> "america_south", "east_asia_pacific", "europe_west", "asia_west", "europe_west", "america_south", "europe…
$ six_regions <chr> "america", "east_asia_pacific", "europe_central_asia", "south_asia", "europe_central_asia", "america", "e…
$ latitude <dbl> -34.00000, -25.00000, 47.33333, 24.00000, 50.75000, -10.00000, 42.66667, 60.10867, -33.45694, 35.00000, 4…
$ longitude <dbl> -64.00000, 135.00000, 13.33333, 90.00000, 4.50000, -55.00000, 25.25000, -113.64258, -70.64827, 105.00000,…
$ world_bank_income_group_2017 <chr> "Upper middle income", "High income", "High income", "Lower middle income", "High income", "Upper middle …
Starting with a simple visualization of the two variables the analysis will focus on: epi and responderam_prop.
# how many observations we have per site
dados_raw %>%
  group_by(site) %>%
  count(site)
# keep only countries with an EPI score and express responderam_prop as a percentage
dados1 = dados_raw %>%
  filter(!is.na(epi)) %>%
  mutate(responderam_prop = responderam_prop * 100)
dados1 %>%
  ggplot(aes(x = epi, y = responderam_prop)) +
  geom_point()
mod1 <- lm(responderam_prop ~ epi, # responderam_prop = b0 + b1*epi
           data = dados1)

# base R syntax, which we will not use:
# summary(mod1)

# broom / tidymodels syntax:
tidy(mod1)
glance(mod1)
The fitted function was responderam_prop = 0.09801 + 0.00512 * epi.
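To make the fitted equation concrete, here is a minimal sketch of using mod1 to predict responderam_prop for a couple of hypothetical EPI values; 50 and 70 are arbitrary and serve only to illustrate the equation above.

# hypothetical EPI values, used only to illustrate the fitted equation
novos_epi <- tibble(epi = c(50, 70))

# point predictions from base R
predict(mod1, newdata = novos_epi)

# the same predictions through the broom interface used in this analysis
augment(mod1, newdata = novos_epi)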
# fitted values and residuals for every observation
mod1 %>%
  augment(dados1)

# observed points together with the fitted line
mod1 %>%
  augment(dados1) %>%
  ggplot(mapping = aes(x = epi)) +
  geom_point(aes(y = responderam_prop), alpha = 0.8, size = .5) +
  geom_line(aes(y = .fitted), colour = "red")
# coefficient estimates with 95% confidence intervals, p-values omitted
mod1 %>%
  tidy(conf.int = TRUE, conf.level = .95) %>%
  select(-p.value)
# residuals against epi; no strong pattern should remain around zero
mod1 %>%
  augment(dados1) %>%
  ggplot(aes(epi, .resid)) +
  geom_point(alpha = .4, size = .5) +
  geom_hline(yintercept = 0, colour = "blue")
Simple linear regression was used to analyze the association between epi (English proficiency of a country's population) and responderam_prop (the percentage of people who have ever answered a question on Stack Overflow or Super User) across the world. A model of the form responderam_prop = 0.098016215 + 0.005123573 * epi explains 7% of the variance of the response variable (R² = 0.07171708). An increase of 1 point in a country's EPI is associated with an increase of 0.005 in responderam_prop (95% CI [0.001539176, 0.00870797]). We can therefore conclude that the association between the two variables is weak.
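To give a rough sense of scale, the sketch below multiplies the estimated slope and its confidence limits by a hypothetical 10-point difference in EPI; the factor of 10 is arbitrary and only illustrates the magnitude implied by the model.

# slope of epi and its 95% CI, taken from the model fitted above
epi_coef <- tidy(mod1, conf.int = TRUE, conf.level = .95) %>%
  filter(term == "epi")

# implied change in responderam_prop for a hypothetical 10-point EPI difference
epi_coef %>%
  mutate(
    delta_estimate  = estimate * 10,
    delta_conf_low  = conf.low * 10,
    delta_conf_high = conf.high * 10
  ) %>%
  select(term, delta_estimate, delta_conf_low, delta_conf_high)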