Take the WGI data set we discussed in class and do the tasks below.
wgi <- read.csv("https://raw.githubusercontent.com/allatambov/cluster-analysis/master/clust1/wgi_fh.csv", dec = ",")
wgi <- na.omit(wgi)
wgi$fh_status <- cut(wgi$fh, breaks = c(0, 2.5, 5.5, 7),
labels = c("not free", "partly free", "free"))
Using dplyr
:
library(dplyr)
wgi %>% group_by(fh_status) %>% tally
## # A tibble: 3 x 2
## fh_status n
## <fct> <int>
## 1 not free 85
## 2 partly free 77
## 3 free 33
Or:
wgi %>% group_by(fh_status) %>% summarise(n = n())
## # A tibble: 3 x 2
## fh_status n
## <fct> <int>
## 1 not free 85
## 2 partly free 77
## 3 free 33
wgi %>% group_by(fh_status) %>% summarise(ave_cc = mean(cc))
## # A tibble: 3 x 2
## fh_status ave_cc
## <fct> <dbl>
## 1 not free 0.649
## 2 partly free -0.488
## 3 free -0.977
wgi %>% arrange(ps) %>% tail(10) # most stable
## X country cnt_code year va ps ge rq rl cc
## 186 36 Canada CAN 2016 1.38 1.24 1.80 1.74 1.84 1.98
## 187 31 Brunei Darussalam BRN 2016 -0.95 1.26 1.07 0.59 0.62 0.66
## 188 37 Switzerland CHE 2016 1.46 1.32 2.03 1.91 1.94 2.05
## 189 93 Iceland ISL 2016 1.34 1.33 1.41 1.28 1.51 1.99
## 190 2 Andorra ADO 2016 1.20 1.40 1.86 0.87 1.56 1.23
## 191 194 Tuvalu TUV 2016 1.09 1.40 -0.93 -0.59 0.46 0.03
## 192 116 Luxembourg LUX 2016 1.44 1.41 1.69 1.72 1.71 2.08
## 193 112 Liechtenstein LIE 2016 1.19 1.46 1.70 1.37 1.68 2.05
## 194 148 New Zealand NZL 2016 1.44 1.49 1.86 2.04 1.93 2.30
## 195 170 Singapore SGP 2016 -0.28 1.53 2.21 2.18 1.83 2.07
## fh fh_status
## 186 1.0 not free
## 187 5.5 partly free
## 188 1.0 not free
## 189 1.0 not free
## 190 1.0 not free
## 191 1.0 not free
## 192 1.0 not free
## 193 1.0 not free
## 194 1.0 not free
## 195 4.0 partly free
wgi %>% arrange(ps) %>% head(10) # least stable
## X country cnt_code year va ps ge rq rl
## 1 183 Syrian Arab Republic SYR 2016 -1.96 -2.91 -1.82 -1.67 -2.01
## 2 209 Yemen, Rep. YEM 2016 -1.65 -2.79 -1.82 -1.48 -1.60
## 3 3 Afghanistan AFG 2016 -1.09 -2.75 -1.22 -1.33 -1.62
## 4 150 Pakistan PAK 2016 -0.69 -2.47 -0.64 -0.64 -0.83
## 5 33 South Sudan SSD 2016 -1.67 -2.42 -2.26 -1.86 -1.69
## 6 168 Sudan SDN 2016 -1.80 -2.38 -1.41 -1.49 -1.26
## 7 175 Somalia SOM 2016 -1.83 -2.33 -2.18 -2.27 -2.37
## 8 92 Iraq IRQ 2016 -1.01 -2.28 -1.26 -1.13 -1.70
## 9 110 Libya LBY 2016 -1.37 -2.21 -1.89 -2.27 -1.87
## 10 212 Congo, Dem. Rep. ZAR 2016 -1.39 -2.20 -1.51 -1.32 -1.61
## cc fh fh_status
## 1 -1.57 7.0 free
## 2 -1.67 6.5 free
## 3 -1.56 6.0 free
## 4 -0.86 4.5 partly free
## 5 -1.58 6.5 free
## 6 -1.61 7.0 free
## 7 -1.69 7.0 free
## 8 -1.40 5.5 partly free
## 9 -1.57 6.0 free
## 10 -1.33 6.0 free
plot(wgi$rq, wgi$rl,
main = "Scatterplot: Regulatory Quality and Rule of Law",
xlab = "Regulatory Quality",
ylab = "Rule of Law",
pch = 16)
cols <- c("red", "blue", "green")
colors <- cols[wgi$fh_status]
plot(wgi$rq, wgi$rl,
main = "Scatterplot: Regulatory Quality and Rule of Law",
xlab = "Regulatory Quality",
ylab = "Rule of Law",
col = colors,
pch = 16)
plot()
function and look at the results.# type names of the variables instead of x and y
# cex is the text size (by default it is 1)
text(x, y, labels=wgi$cnt_code, cex= 0.7)
plot(wgi$rq, wgi$rl,
main = "Scatterplot: Regulatory Quality and Rule of Law",
xlab = "Regulatory Quality",
ylab = "Rule of Law",
pch = '') # no points
text(wgi$rq, wgi$rl, labels=wgi$cnt_code, cex= 0.7)
cor.test(wgi$rq, wgi$rl) # Pearson's coef
##
## Pearson's product-moment correlation
##
## data: wgi$rq and wgi$rl
## t = 33.567, df = 193, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9003745 0.9421772
## sample estimates:
## cor
## 0.9239895
P-value < 0.05, so at the 5% significance level we reject the null hypothesis about the absence of association between these two indicators. Regulatory Quality and Rule of Law are significantly associated, and this association is positive.
vol <- read.csv("https://vincentarelbundock.github.io/Rdatasets/csv/carData/Cowles.csv")
sex
and volunteer
. Can you decide using this table whether the participation in volunteering depends on the people’s gender?table(vol$sex, vol$volunteer) # cannot decide
##
## no yes
## female 431 349
## male 393 248
sex
and volunteer
are associated using a chi-squared test. Make conclusions.chisq.test(vol$sex, vol$volunteer)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: vol$sex and vol$volunteer
## X-squared = 5.0478, df = 1, p-value = 0.02466
P-value < 0.05, so at the 5% significance level we reject the null hypothesis about the independence of these two variables. People’s participation in volunteering depends on the people’s gender (females are more prone to volunteer).
tab <- table(vol$sex, vol$volunteer)
library(vcd)
mosaicplot(tab, main = "Volunteering")