Preparation

library(tidyverse)
library(GGally)
library(magrittr)
library(moderndive)
library(DT)

options(digits=3, scipen=999)

# https://www.tutorialspoint.com/r/r_mean_median_mode.htm
getmode <- function(v) {
   uniqv <- unique(v)
   uniqv[which.max(tabulate(match(v, uniqv)))]
}

https://www.iea.nl/data-tools/repository/cived#collapse-189

civics <- read_csv("G:/My Drive/homework/ShaiLynn M/civics_pol_1999_g12.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   studentid = col_double(),
##   sex = col_double(),
##   age = col_double(),
##   knowledge = col_double(),
##   skills = col_double(),
##   economy = col_double()
## )
civics %>% glimpse() 
## Rows: 4,041
## Columns: 6
## $ studentid <dbl> 10101, 10102, 10103, 10104, 10105, 10106, 10107, 10108, 1010~
## $ sex       <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, ~
## $ age       <dbl> 18, 17, 17, 17, 17, 17, 17, 17, 19, 17, 17, 17, 17, 17, 17, ~
## $ knowledge <dbl> 131, 123, 123, 123, 123, 123, 118, 118, 123, 123, 115, 123, ~
## $ skills    <dbl> 126, 108, 126, 108, 108, 126, 91, 91, 91, 102, 96, 108, 108,~
## $ economy   <dbl> 104, 116, 116, 116, 109, 109, 88, 95, 88, 79, 84, 99, 99, 10~
civics %>%
  mutate(across(.cols = c(sex:age), as_factor)) %>%
  summary()
##    studentid       sex        age         knowledge       skills   
##  Min.   :  10101   1:2111   16  : 153   Min.   : 89   Min.   : 45  
##  1st Qu.: 420413   2:1930   17  :3518   1st Qu.:112   1st Qu.: 96  
##  Median : 830110            18  : 311   Median :118   Median :108  
##  Mean   :1032742            19  :  46   Mean   :119   Mean   :109  
##  3rd Qu.:1210112            20  :   4   3rd Qu.:123   3rd Qu.:126  
##  Max.   :5500133            NA's:   9   Max.   :131   Max.   :143  
##     economy     
##  Min.   : 42.0  
##  1st Qu.: 88.0  
##  Median : 95.0  
##  Mean   : 97.1  
##  3rd Qu.:109.0  
##  Max.   :142.0
civics %>%
  mutate(across(.cols = c(sex:age), as_factor)) %>%
  ggpairs()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing missing values (stat_boxplot).

## Warning: Removed 9 rows containing missing values (stat_boxplot).

## Warning: Removed 9 rows containing missing values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

WEEKLY ASSIGNMENT 8

Question 1

civics %>%
  select(knowledge:economy) %>%
  pivot_longer(cols = everything(), names_to = "Variable") %>%
  group_by(Variable) %>%
  summarise(across(.cols = everything(),
                   list(Mean = mean, Median = median, Mode = getmode),
                   .names = "{.fn}"))
## # A tibble: 3 x 4
##   Variable   Mean Median  Mode
##   <chr>     <dbl>  <dbl> <dbl>
## 1 economy    97.1     95    95
## 2 knowledge 119.     118   123
## 3 skills    109.     108   126

Question 2

civics %>%
  select(knowledge:economy) %>%
  pivot_longer(cols = everything(), names_to = "Variable") %>%
  group_by(Variable) %>%
  summarise(Range = max(value) - min(value),
            SS = sum((value - mean(value))^2),
            Sample_STD = sqrt(SS/(NROW(value)-1)),
            Sample_Var = Sample_STD^2)
## # A tibble: 3 x 5
##   Variable  Range       SS Sample_STD Sample_Var
##   <chr>     <dbl>    <dbl>      <dbl>      <dbl>
## 1 economy     100  978427.      15.6       242. 
## 2 knowledge    42  298039.       8.59       73.8
## 3 skills       98 1458347.      19.0       361.

WEEKLY ASSIGNMENT 9

Question 1

t.test(civics$knowledge[civics$sex==1], civics$knowledge[civics$sex==2])
## 
##  Welch Two Sample t-test
## 
## data:  civics$knowledge[civics$sex == 1] and civics$knowledge[civics$sex == 2]
## t = -7, df = 3977, p-value = 0.00000000000009
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.54 -1.49
## sample estimates:
## mean of x mean of y 
##       118       120

Question 2

Single Factor ANOVA

http://www.sthda.com/english/wiki/one-way-anova-test-in-r#compute-one-way-anova-test

civics %<>%
  mutate(Age_Lvl = if_else(age == 16, 16, 
                   if_else(age == 17, 17, 18)) %>%
           factor(levels = c("16", "17", "18")),
         .after = age) 

with(civics, table(age, Age_Lvl, deparse.level = 2))
##     Age_Lvl
## age    16   17   18
##   16  153    0    0
##   17    0 3518    0
##   18    0    0  311
##   19    0    0   46
##   20    0    0    4
civics %>%
  aov(knowledge ~ Age_Lvl, data = .) %>%
  summary()
##               Df Sum Sq Mean Sq F value              Pr(>F)    
## Age_Lvl        2   7099    3549    49.3 <0.0000000000000002 ***
## Residuals   4029 290282      72                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 9 observations deleted due to missingness

Extra Credit

Bonferroni

https://www.statisticshowto.com/probability-and-statistics/statistics-definitions/post-hoc/#PHbonferroni

pairwise.t.test(civics$knowledge,
                civics$Age_Lvl,
                p.adjust.method = "bonferroni")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  civics$knowledge and civics$Age_Lvl 
## 
##    16          17                  
## 17 1           -                   
## 18 0.000000001 < 0.0000000000000002
## 
## P value adjustment method: bonferroni

Tukey

https://www.statisticshowto.com/tukey-test-honest-significant-difference/

civics %>% 
  aov(knowledge ~ Age_Lvl, data = .) %>%
  TukeyHSD()
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = knowledge ~ Age_Lvl, data = .)
## 
## $Age_Lvl
##         diff   lwr   upr p adj
## 17-16 -0.524 -2.17  1.12 0.735
## 18-16 -5.137 -7.06 -3.22 0.000
## 18-17 -4.612 -5.71 -3.51 0.000

Question 3

civics %>%
  lm(skills ~ knowledge, data = .) %>%
  get_regression_table() %>%
  datatable()