library(tidyverse)
library(GGally)
library(magrittr)
library(moderndive)
library(DT)
options(digits=3, scipen=999)
# https://www.tutorialspoint.com/r/r_mean_median_mode.htm
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
https://www.iea.nl/data-tools/repository/cived#collapse-189
civics <- read_csv("G:/My Drive/homework/ShaiLynn M/civics_pol_1999_g12.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## studentid = col_double(),
## sex = col_double(),
## age = col_double(),
## knowledge = col_double(),
## skills = col_double(),
## economy = col_double()
## )
civics %>% glimpse()
## Rows: 4,041
## Columns: 6
## $ studentid <dbl> 10101, 10102, 10103, 10104, 10105, 10106, 10107, 10108, 1010~
## $ sex <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, ~
## $ age <dbl> 18, 17, 17, 17, 17, 17, 17, 17, 19, 17, 17, 17, 17, 17, 17, ~
## $ knowledge <dbl> 131, 123, 123, 123, 123, 123, 118, 118, 123, 123, 115, 123, ~
## $ skills <dbl> 126, 108, 126, 108, 108, 126, 91, 91, 91, 102, 96, 108, 108,~
## $ economy <dbl> 104, 116, 116, 116, 109, 109, 88, 95, 88, 79, 84, 99, 99, 10~
civics %>%
mutate(across(.cols = c(sex:age), as_factor)) %>%
summary()
## studentid sex age knowledge skills
## Min. : 10101 1:2111 16 : 153 Min. : 89 Min. : 45
## 1st Qu.: 420413 2:1930 17 :3518 1st Qu.:112 1st Qu.: 96
## Median : 830110 18 : 311 Median :118 Median :108
## Mean :1032742 19 : 46 Mean :119 Mean :109
## 3rd Qu.:1210112 20 : 4 3rd Qu.:123 3rd Qu.:126
## Max. :5500133 NA's: 9 Max. :131 Max. :143
## economy
## Min. : 42.0
## 1st Qu.: 88.0
## Median : 95.0
## Mean : 97.1
## 3rd Qu.:109.0
## Max. :142.0
civics %>%
mutate(across(.cols = c(sex:age), as_factor)) %>%
ggpairs()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing missing values (stat_boxplot).
## Warning: Removed 9 rows containing missing values (stat_boxplot).
## Warning: Removed 9 rows containing missing values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
civics %>%
select(knowledge:economy) %>%
pivot_longer(cols = everything(), names_to = "Variable") %>%
group_by(Variable) %>%
summarise(across(.cols = everything(),
list(Mean = mean, Median = median, Mode = getmode),
.names = "{.fn}"))
## # A tibble: 3 x 4
## Variable Mean Median Mode
## <chr> <dbl> <dbl> <dbl>
## 1 economy 97.1 95 95
## 2 knowledge 119. 118 123
## 3 skills 109. 108 126
civics %>%
select(knowledge:economy) %>%
pivot_longer(cols = everything(), names_to = "Variable") %>%
group_by(Variable) %>%
summarise(Range = max(value) - min(value),
SS = sum((value - mean(value))^2),
Sample_STD = sqrt(SS/(NROW(value)-1)),
Sample_Var = Sample_STD^2)
## # A tibble: 3 x 5
## Variable Range SS Sample_STD Sample_Var
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 economy 100 978427. 15.6 242.
## 2 knowledge 42 298039. 8.59 73.8
## 3 skills 98 1458347. 19.0 361.
t.test(civics$knowledge[civics$sex==1], civics$knowledge[civics$sex==2])
##
## Welch Two Sample t-test
##
## data: civics$knowledge[civics$sex == 1] and civics$knowledge[civics$sex == 2]
## t = -7, df = 3977, p-value = 0.00000000000009
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.54 -1.49
## sample estimates:
## mean of x mean of y
## 118 120
http://www.sthda.com/english/wiki/one-way-anova-test-in-r#compute-one-way-anova-test
civics %<>%
mutate(Age_Lvl = if_else(age == 16, 16,
if_else(age == 17, 17, 18)) %>%
factor(levels = c("16", "17", "18")),
.after = age)
with(civics, table(age, Age_Lvl, deparse.level = 2))
## Age_Lvl
## age 16 17 18
## 16 153 0 0
## 17 0 3518 0
## 18 0 0 311
## 19 0 0 46
## 20 0 0 4
civics %>%
aov(knowledge ~ Age_Lvl, data = .) %>%
summary()
## Df Sum Sq Mean Sq F value Pr(>F)
## Age_Lvl 2 7099 3549 49.3 <0.0000000000000002 ***
## Residuals 4029 290282 72
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 9 observations deleted due to missingness
pairwise.t.test(civics$knowledge,
civics$Age_Lvl,
p.adjust.method = "bonferroni")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: civics$knowledge and civics$Age_Lvl
##
## 16 17
## 17 1 -
## 18 0.000000001 < 0.0000000000000002
##
## P value adjustment method: bonferroni
https://www.statisticshowto.com/tukey-test-honest-significant-difference/
civics %>%
aov(knowledge ~ Age_Lvl, data = .) %>%
TukeyHSD()
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = knowledge ~ Age_Lvl, data = .)
##
## $Age_Lvl
## diff lwr upr p adj
## 17-16 -0.524 -2.17 1.12 0.735
## 18-16 -5.137 -7.06 -3.22 0.000
## 18-17 -4.612 -5.71 -3.51 0.000
civics %>%
lm(skills ~ knowledge, data = .) %>%
get_regression_table() %>%
datatable()