library(openintro)
library(tidyverse)
library(tidymodels)
theme_set(theme_bw())
# Read the per-country participation data.
# Every column not listed below is parsed as a double. The region and
# income-group columns hold text (e.g. "arg", "americas",
# "Upper middle income"), so they are declared as character: under the numeric
# default they raised parsing failures and were read as NA.
dados_raw <- read_csv(
    here::here("data/participation-per-country.csv"),
    col_types = cols(
        .default = col_double(),
        site = col_character(),
        country = col_character(),
        geo = col_character(),
        four_regions = col_character(),
        eight_regions = col_character(),
        six_regions = col_character(),
        `World bank income group 2017` = col_character(),
        PDI = col_double(),
        EPI = col_double(),
        responderam_prop = col_double()
    )
)
## Warning: 765 parsing failures.
## row                          col expected              actual                                                                             file
##   1 geo                          a double arg                 'C:/Users/Hugo/CDD/vis-cultura-stackoverflow/data/participation-per-country.csv'
##   1 four_regions                 a double americas            'C:/Users/Hugo/CDD/vis-cultura-stackoverflow/data/participation-per-country.csv'
##   1 eight_regions                a double america_south       'C:/Users/Hugo/CDD/vis-cultura-stackoverflow/data/participation-per-country.csv'
##   1 six_regions                  a double america             'C:/Users/Hugo/CDD/vis-cultura-stackoverflow/data/participation-per-country.csv'
##   1 World bank income group 2017 a double Upper middle income 'C:/Users/Hugo/CDD/vis-cultura-stackoverflow/data/participation-per-country.csv'
## ... ............................ ........ ................... ................................................................................
## See problems(...) for more details.
# Structural overview of the parsed data: row/column counts, column types and
# the first few values of each column.
dados_raw %>% glimpse()
## Rows: 157
## Columns: 21
## $ site                           <chr> "StackOverflow", "StackOverflow", "Stac~
## $ country                        <chr> "Argentina", "Australia", "Austria", "B~
## $ PDI                            <dbl> 49, 36, 11, 80, 65, 69, 70, 39, 63, 80,~
## $ IDV                            <dbl> 46, 90, 55, 20, 75, 38, 30, 80, 23, 20,~
## $ MAS                            <dbl> 56, 61, 79, 55, 54, 49, 40, 52, 28, 66,~
## $ UAI                            <dbl> 86, 51, 70, 60, 94, 76, 85, 48, 86, 30,~
## $ usuarios                       <dbl> 2798, 12313, 2518, 2558, 4275, 10717, 1~
## $ responderam_prop               <dbl> 0.5357398, 0.6133355, 0.6310564, 0.3928~
## $ perguntaram_prop               <dbl> 0.5210865, 0.5897832, 0.5933280, 0.4757~
## $ editaram_prop                  <dbl> 0.09256612, 0.14699911, 0.14932486, 0.0~
## $ comentaram_prop                <dbl> 0.25339528, 0.33395598, 0.35027800, 0.1~
## $ GNI                            <dbl> NA, 59570, 48160, 840, 44990, 11630, 68~
## $ Internet                       <dbl> 51.0, 79.5, 79.8, 5.0, 78.0, 45.0, 51.0~
## $ EPI                            <dbl> 59.02, NA, 63.21, NA, 61.21, 49.96, NA,~
## $ geo                            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ four_regions                   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ eight_regions                  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ six_regions                    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ Latitude                       <dbl> -34.00000, -25.00000, 47.33333, 24.0000~
## $ Longitude                      <dbl> -64.00000, 135.00000, 13.33333, 90.0000~
## $ `World bank income group 2017` <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
# One data set per site, keeping only the rows that have an EPI value.
dados_stack <- dados_raw %>%
  filter(site == "StackOverflow", !is.na(EPI))
dados_super <- dados_raw %>%
  filter(site == "SuperUser", !is.na(EPI))

Análise sobre a relação entre o EPI e a taxa de pessoas que responderam alguma pergunta no Stack Overflow e no Super User

Dados do Super User

# Distribution of EPI in the Super User data.
# Only `binwidth` is passed: ggplot2 ignores `bins` whenever `binwidth` is
# set, so supplying both was misleading.
dados_super %>% 
  ggplot(aes(x = EPI)) + 
  geom_histogram(binwidth = 3)

# EPI against the proportion of users who answered a question.
ggplot(dados_super, aes(x = EPI, y = responderam_prop)) + 
  geom_point(alpha = 0.4, size = 1)

# Simple linear regression of the answering proportion on EPI (Super User).
mod <- lm(responderam_prop ~ EPI, data = dados_super)

# Coefficient table: estimate, standard error, test statistic and p-value.
mod %>% tidy()
## # A tibble: 2 x 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)  0.0985   0.0472        2.09 0.0417 
## 2 EPI          0.00272  0.000871      3.12 0.00298
# One-row summary of model-level fit statistics (R-squared, sigma, AIC, ...).
mod %>% glance()
## # A tibble: 1 x 12
##   r.squared adj.r.squared  sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl>  <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.160         0.144 0.0486      9.73 0.00298     1   86.1 -166. -160.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Observed points with the fitted regression line drawn in red.
augment(mod, data = dados_super) %>%
  ggplot(aes(x = EPI)) +
  geom_point(aes(y = responderam_prop), size = 1, alpha = 0.4) +
  geom_line(aes(y = .fitted), colour = "red")

# 95% confidence intervals for the coefficients, with the p-value column
# dropped. `TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
mod %>% 
    tidy(conf.int = TRUE, conf.level = .95) %>% 
    select(-p.value)
## # A tibble: 2 x 6
##   term        estimate std.error statistic conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 (Intercept)  0.0985   0.0472        2.09 0.00383    0.193  
## 2 EPI          0.00272  0.000871      3.12 0.000968   0.00447

Dados do Stack Overflow

# Distribution of EPI in the Stack Overflow data.
# Only `binwidth` is passed: ggplot2 ignores `bins` whenever `binwidth` is
# set, so supplying both was misleading.
dados_stack %>% 
  ggplot(aes(x = EPI)) + 
  geom_histogram(binwidth = 3)

# EPI against the proportion of users who answered a question.
ggplot(dados_stack, aes(x = EPI, y = responderam_prop)) + 
  geom_point(alpha = 0.4, size = 1)

# Simple linear regression of the answering proportion on EPI (Stack Overflow).
mod1 <- lm(responderam_prop ~ EPI, data = dados_stack)

# Coefficient table: estimate, standard error, test statistic and p-value.
mod1 %>% tidy()
## # A tibble: 2 x 5
##   term        estimate std.error statistic      p.value
##   <chr>          <dbl>     <dbl>     <dbl>        <dbl>
## 1 (Intercept)  0.0975    0.0625       1.56 0.125       
## 2 EPI          0.00753   0.00115      6.52 0.0000000310
# One-row summary of model-level fit statistics (R-squared, sigma, AIC, ...).
mod1 %>% glance()
## # A tibble: 1 x 12
##   r.squared adj.r.squared  sigma statistic      p.value    df logLik   AIC   BIC
##       <dbl>         <dbl>  <dbl>     <dbl>        <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.455         0.444 0.0645      42.5 0.0000000310     1   71.1 -136. -130.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Original rows with per-observation model columns appended
# (.fitted, .resid, .hat, .sigma, .cooksd, .std.resid).
augment(mod1, data = dados_stack)
## # A tibble: 53 x 27
##    site          country         PDI   IDV   MAS   UAI usuarios responderam_prop
##    <chr>         <chr>         <dbl> <dbl> <dbl> <dbl>    <dbl>            <dbl>
##  1 StackOverflow Argentina        49    46    56    86     2798            0.536
##  2 StackOverflow Austria          11    55    79    70     2518            0.631
##  3 StackOverflow Belgium          65    75    54    94     4275            0.608
##  4 StackOverflow Brazil           69    38    49    76    10717            0.483
##  5 StackOverflow Chile            63    23    28    86     1075            0.484
##  6 StackOverflow China            80    20    66    30    13401            0.356
##  7 StackOverflow Colombia         67    13    64    80     1224            0.478
##  8 StackOverflow Costa Rica       35    15    21    86      431            0.487
##  9 StackOverflow Czech Republ~    57    58    57    74     2701            0.575
## 10 StackOverflow Denmark          18    74    16    23     4029            0.557
## # ... with 43 more rows, and 19 more variables: perguntaram_prop <dbl>,
## #   editaram_prop <dbl>, comentaram_prop <dbl>, GNI <dbl>, Internet <dbl>,
## #   EPI <dbl>, geo <dbl>, four_regions <dbl>, eight_regions <dbl>,
## #   six_regions <dbl>, Latitude <dbl>, Longitude <dbl>,
## #   World bank income group 2017 <dbl>, .fitted <dbl>, .resid <dbl>,
## #   .hat <dbl>, .sigma <dbl>, .cooksd <dbl>, .std.resid <dbl>
# `mod1` was already fitted earlier with exactly this formula and data, so the
# duplicate `lm()` call is dropped and the existing model is reused.
tidy(mod1)
## # A tibble: 2 x 5
##   term        estimate std.error statistic      p.value
##   <chr>          <dbl>     <dbl>     <dbl>        <dbl>
## 1 (Intercept)  0.0975    0.0625       1.56 0.125       
## 2 EPI          0.00753   0.00115      6.52 0.0000000310
# One-row summary of model-level fit statistics (R-squared, sigma, AIC, ...).
mod1 %>% glance()
## # A tibble: 1 x 12
##   r.squared adj.r.squared  sigma statistic      p.value    df logLik   AIC   BIC
##       <dbl>         <dbl>  <dbl>     <dbl>        <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.455         0.444 0.0645      42.5 0.0000000310     1   71.1 -136. -130.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Observed points with the fitted regression line drawn in red.
augment(mod1, data = dados_stack) %>%
  ggplot(aes(x = EPI)) +
  geom_point(aes(y = responderam_prop), size = 1, alpha = 0.4) +
  geom_line(aes(y = .fitted), colour = "red")

# 95% confidence intervals for the coefficients, with the p-value column
# dropped. `TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
mod1 %>% 
    tidy(conf.int = TRUE, conf.level = .95) %>% 
    select(-p.value)
## # A tibble: 2 x 6
##   term        estimate std.error statistic conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 (Intercept)  0.0975    0.0625       1.56 -0.0280    0.223  
## 2 EPI          0.00753   0.00115      6.52  0.00521   0.00985

Análise sobre a relação entre o GNI e a taxa de pessoas que responderam alguma pergunta no Stack Overflow

# Stack Overflow rows that have a GNI value.
# The filter previously selected "SuperUser", contradicting both the variable
# name and the section heading. Results recorded from that version describe
# Super User and must be regenerated.
dados_stack_GNI <- dados_raw %>%
  filter(site == "StackOverflow", !is.na(GNI))

# Distribution of GNI in the selected data.
# Only `binwidth` is passed: ggplot2 ignores `bins` whenever `binwidth` is
# set, so supplying both was misleading.
dados_stack_GNI %>% 
  ggplot(aes(x = GNI)) + 
  geom_histogram(binwidth = 5000)

# GNI against the proportion of users who answered a question.
ggplot(dados_stack_GNI, aes(x = GNI, y = responderam_prop)) + 
  geom_point(alpha = 0.4, size = 1)

# Simple linear regression of the answering proportion on GNI.
mod2 <- lm(responderam_prop ~ GNI, data = dados_stack_GNI)

# Coefficient table: estimate, standard error, test statistic and p-value.
mod2 %>% tidy()
## # A tibble: 2 x 5
##   term          estimate   std.error statistic  p.value
##   <chr>            <dbl>       <dbl>     <dbl>    <dbl>
## 1 (Intercept) 0.215      0.00746         28.8  1.23e-41
## 2 GNI         0.00000158 0.000000241      6.54 7.19e- 9
# One-row summary of model-level fit statistics (R-squared, sigma, AIC, ...).
mod2 %>% glance()
## # A tibble: 1 x 12
##   r.squared adj.r.squared  sigma statistic      p.value    df logLik   AIC   BIC
##       <dbl>         <dbl>  <dbl>     <dbl>        <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.370         0.361 0.0453      42.8      7.19e-9     1   127. -247. -240.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Observed points with the fitted regression line drawn in red.
augment(mod2, data = dados_stack_GNI) %>%
  ggplot(aes(x = GNI)) +
  geom_point(aes(y = responderam_prop), size = 1, alpha = 0.4) +
  geom_line(aes(y = .fitted), colour = "red")

# 95% confidence intervals for the coefficients, with the p-value column
# dropped. `TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
mod2 %>% 
    tidy(conf.int = TRUE, conf.level = .95) %>% 
    select(-p.value)
## # A tibble: 2 x 6
##   term          estimate   std.error statistic   conf.low  conf.high
##   <chr>            <dbl>       <dbl>     <dbl>      <dbl>      <dbl>
## 1 (Intercept) 0.215      0.00746         28.8  0.200      0.230     
## 2 GNI         0.00000158 0.000000241      6.54 0.00000110 0.00000206