Take the WGI data set we discussed in class and do the tasks below.
wgi <- read.csv("https://raw.githubusercontent.com/allatambov/cluster-analysis/master/clust1/wgi_fh.csv", dec = ",")
wgi <- na.omit(wgi)
wgi$fh_status <- cut(wgi$fh, breaks = c(0, 2.5, 5.5, 7), 
    labels = c("not free", "partly free", "free"))
Using dplyr:
library(dplyr)
wgi %>% group_by(fh_status) %>% tally
## # A tibble: 3 x 2
##   fh_status       n
##   <fct>       <int>
## 1 not free       85
## 2 partly free    77
## 3 free           33
Or:
wgi %>% group_by(fh_status) %>% summarise(n = n())
## # A tibble: 3 x 2
##   fh_status       n
##   <fct>       <int>
## 1 not free       85
## 2 partly free    77
## 3 free           33
wgi %>% group_by(fh_status) %>% summarise(ave_cc = mean(cc))
## # A tibble: 3 x 2
##   fh_status   ave_cc
##   <fct>        <dbl>
## 1 not free     0.649
## 2 partly free -0.488
## 3 free        -0.977
wgi %>% arrange(ps) %>% tail(10) # most stable
##       X           country cnt_code year    va   ps    ge    rq   rl   cc
## 186  36            Canada      CAN 2016  1.38 1.24  1.80  1.74 1.84 1.98
## 187  31 Brunei Darussalam      BRN 2016 -0.95 1.26  1.07  0.59 0.62 0.66
## 188  37       Switzerland      CHE 2016  1.46 1.32  2.03  1.91 1.94 2.05
## 189  93           Iceland      ISL 2016  1.34 1.33  1.41  1.28 1.51 1.99
## 190   2           Andorra      ADO 2016  1.20 1.40  1.86  0.87 1.56 1.23
## 191 194            Tuvalu      TUV 2016  1.09 1.40 -0.93 -0.59 0.46 0.03
## 192 116        Luxembourg      LUX 2016  1.44 1.41  1.69  1.72 1.71 2.08
## 193 112     Liechtenstein      LIE 2016  1.19 1.46  1.70  1.37 1.68 2.05
## 194 148       New Zealand      NZL 2016  1.44 1.49  1.86  2.04 1.93 2.30
## 195 170         Singapore      SGP 2016 -0.28 1.53  2.21  2.18 1.83 2.07
##      fh   fh_status
## 186 1.0    not free
## 187 5.5 partly free
## 188 1.0    not free
## 189 1.0    not free
## 190 1.0    not free
## 191 1.0    not free
## 192 1.0    not free
## 193 1.0    not free
## 194 1.0    not free
## 195 4.0 partly free
wgi %>% arrange(ps) %>% head(10) # least stable
##      X              country cnt_code year    va    ps    ge    rq    rl
## 1  183 Syrian Arab Republic      SYR 2016 -1.96 -2.91 -1.82 -1.67 -2.01
## 2  209          Yemen, Rep.      YEM 2016 -1.65 -2.79 -1.82 -1.48 -1.60
## 3    3          Afghanistan      AFG 2016 -1.09 -2.75 -1.22 -1.33 -1.62
## 4  150             Pakistan      PAK 2016 -0.69 -2.47 -0.64 -0.64 -0.83
## 5   33          South Sudan      SSD 2016 -1.67 -2.42 -2.26 -1.86 -1.69
## 6  168                Sudan      SDN 2016 -1.80 -2.38 -1.41 -1.49 -1.26
## 7  175              Somalia      SOM 2016 -1.83 -2.33 -2.18 -2.27 -2.37
## 8   92                 Iraq      IRQ 2016 -1.01 -2.28 -1.26 -1.13 -1.70
## 9  110                Libya      LBY 2016 -1.37 -2.21 -1.89 -2.27 -1.87
## 10 212     Congo, Dem. Rep.      ZAR 2016 -1.39 -2.20 -1.51 -1.32 -1.61
##       cc  fh   fh_status
## 1  -1.57 7.0        free
## 2  -1.67 6.5        free
## 3  -1.56 6.0        free
## 4  -0.86 4.5 partly free
## 5  -1.58 6.5        free
## 6  -1.61 7.0        free
## 7  -1.69 7.0        free
## 8  -1.40 5.5 partly free
## 9  -1.57 6.0        free
## 10 -1.33 6.0        free
plot(wgi$rq, wgi$rl, 
     main = "Scatterplot: Regulatory Quality and Rule of Law",
     xlab = "Regulatory Quality", 
     ylab = "Rule of Law",
     pch = 16)
cols <- c("red", "blue", "green")
colors <- cols[wgi$fh_status]
plot(wgi$rq, wgi$rl, 
     main = "Scatterplot: Regulatory Quality and Rule of Law",
     xlab = "Regulatory Quality", 
     ylab = "Rule of Law",
     col = colors,
     pch = 16)
plot() function and look at the results.# type names of the variables instead of x and y 
# cex is the text size (by default it is 1)
text(x, y, labels=wgi$cnt_code, cex= 0.7)
plot(wgi$rq, wgi$rl, 
     main = "Scatterplot: Regulatory Quality and Rule of Law",
     xlab = "Regulatory Quality", 
     ylab = "Rule of Law",
     pch = '') # no points
text(wgi$rq, wgi$rl, labels=wgi$cnt_code, cex= 0.7)
cor.test(wgi$rq, wgi$rl) # Pearson's coef
## 
##  Pearson's product-moment correlation
## 
## data:  wgi$rq and wgi$rl
## t = 33.567, df = 193, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9003745 0.9421772
## sample estimates:
##       cor 
## 0.9239895
P-value < 0.05, so at the 5% significance level we reject the null hypothesis about the absence of association between these two indicators. Regulatory Quality and Rule of Law are significantly associated, and this association is positive.
vol <- read.csv("https://vincentarelbundock.github.io/Rdatasets/csv/carData/Cowles.csv")
sex and volunteer. Can you decide using this table whether the participation in volunteering depends on the people’s gender?table(vol$sex, vol$volunteer) # cannot decide
##         
##           no yes
##   female 431 349
##   male   393 248
sex and volunteer are associated using a chi-squared test. Make conclusions.chisq.test(vol$sex, vol$volunteer)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  vol$sex and vol$volunteer
## X-squared = 5.0478, df = 1, p-value = 0.02466
P-value < 0.05, so at the 5% significance level we reject the null hypothesis about the independence of these two variables. People’s participation in volunteering depends on the people’s gender (females are more prone to volunteer).
tab <- table(vol$sex, vol$volunteer)
library(vcd)
mosaicplot(tab, main = "Volunteering")