library(tidyverse)
library(magrittr)
library(GGally)
library(psych)

# https://www.tutorialspoint.com/r/r_mean_median_mode.htm
getmode <- function(v) {
   uniqv <- unique(v)
   uniqv[which.max(tabulate(match(v, uniqv)))]
}

Module 1

Part (1)

Levels are rank of experience

# Import Data
baseline <- read_csv("G:/My Drive/homework/Lovely J/Baseline+survey.csv")

# Clean up Answer5
bad <- 
  baseline %>%
  select(Answer5) %>%
  distinct() %>%
  filter(Answer5 != "1" &
         Answer5 != "2" &
         Answer5 != "3" &
         Answer5 != "4" &
         Answer5 != "5") %>%
  pull()

good <-
  c("1", "2", NA, "3", "1", "1", "1", "2", "3", "1", "2", "4", "3", "3", "1", 
    "3", "1", "1", "1", "1", "1", "1", "1", "1", "1", "3", "1")

for (i in 1:length(good)){
  baseline$Answer5[baseline$Answer5 == bad[i]] <- good[i]
}

# Drop questions, change answer data types & names
baseline %<>% 
  select(-X1, -starts_with("Question")) %>% 
  mutate(Major = Answer1,
         Age_Yrs = Answer2 %>% as.numeric(),
         Work_Yrs = Answer3 %>% as.numeric(),
         Stats_Yrs = Answer4 %>% as.numeric(),
         R_Lvl = Answer5 %>% as.integer(.),
         Cen_Tend_Lvl = Answer6 %>% as.integer(.),
         Var_Lvl = Answer7 %>% as.integer(.),
         Bivar_Lvl = Answer8 %>% as.integer(.),
         Prob_Lvl = Answer9 %>% as.integer(.),
         .keep = "unused")


# Replace NAs
# https://www.tutorialspoint.com/r/r_mean_median_mode.htm
getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

age_mode <- baseline$Age_Yrs %>% getmode(.)
baseline %<>% mutate(Age_Yrs = replace_na(Age_Yrs, age_mode))
rm(age_mode)

baseline %<>% mutate(Work_Yrs = round(Work_Yrs))
work_mode <- baseline$Work_Yrs %>% getmode(.) 
baseline %<>% mutate(Work_Yrs = replace_na(Work_Yrs, work_mode))
rm(work_mode)

baseline %<>% mutate(Stats_Yrs = round(Stats_Yrs))
stats_mode <- baseline$Stats_Yrs %>% getmode(.)
baseline %<>% mutate(Stats_Yrs = replace_na(Stats_Yrs, stats_mode))
rm(stats_mode)

r_mode <- baseline$R_Lvl %>% getmode(.)
baseline %<>% mutate(R_Lvl = replace_na(R_Lvl, r_mode))
rm(r_mode)

cen_mode <- baseline$Cen_Tend_Lvl %>% getmode(.)
baseline %<>% mutate(Cen_Tend_Lvl = replace_na(Cen_Tend_Lvl, cen_mode))
baseline$Cen_Tend_Lvl[baseline$Cen_Tend_Lvl < 1] <- 1
baseline$Cen_Tend_Lvl[baseline$Cen_Tend_Lvl > 5] <- cen_mode
rm(cen_mode)

var_mode <- baseline$Var_Lvl %>% getmode(.)
baseline %<>% mutate(Var_Lvl = replace_na(Var_Lvl, var_mode))
baseline$Var_Lvl[baseline$Var_Lvl < 1] = 1
baseline$Var_Lvl[baseline$Var_Lvl > 5] = var_mode
rm(var_mode)

bivar_mode <- baseline$Bivar_Lvl %>% getmode(.)
baseline %<>% mutate(Bivar_Lvl = replace_na(Bivar_Lvl, bivar_mode))
baseline$Bivar_Lvl[baseline$Bivar_Lvl < 1] = 1
baseline$Bivar_Lvl[baseline$Bivar_Lvl > 5] = bivar_mode
rm(bivar_mode)

prob_mode <- baseline$Prob_Lvl %>% getmode(.)
baseline %<>% mutate(Prob_Lvl = replace_na(Prob_Lvl, prob_mode))
baseline$Prob_Lvl[baseline$Prob_Lvl < 1] = 1
baseline$Prob_Lvl[baseline$Prob_Lvl > 5] = prob_mode
rm(prob_mode)

rm(getmode)
rm(bad)
rm(good)

# Summaries
baseline %>% glimpse()
## Rows: 163
## Columns: 12
## $ Term         <chr> "2019 Fall", "2019 Fall", "2019 Fall", "2019 Fall", "2019~
## $ Class        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ ID           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17~
## $ Major        <chr> "Computer/IT", "Engineering", "Business", "Mathematics", ~
## $ Age_Yrs      <dbl> 24, 24, 23, 23, 22, 24, 22, 24, 26, 24, 22, 22, 22, 25, 2~
## $ Work_Yrs     <dbl> 2, 1, 0, 0, 0, 2, 0, 1, 4, 0, 0, 0, 0, 2, 1, 1, 4, 2, 0, ~
## $ Stats_Yrs    <dbl> 2, 1, 0, 0, 0, 2, 0, 1, 4, 0, 0, 0, 0, 2, 1, 1, 4, 2, 0, ~
## $ R_Lvl        <int> 2, 1, 1, 2, 2, 3, 2, 4, 1, 1, 1, 1, 1, 1, 3, 1, 2, 3, 1, ~
## $ Cen_Tend_Lvl <dbl> 3, 1, 5, 3, 3, 5, 4, 4, 3, 5, 4, 2, 4, 5, 5, 5, 5, 2, 3, ~
## $ Var_Lvl      <dbl> 3, 1, 4, 3, 2, 4, 4, 4, 2, 4, 4, 2, 3, 4, 5, 5, 2, 2, 2, ~
## $ Bivar_Lvl    <dbl> 1, 1, 3, 3, 1, 2, 2, 3, 2, 3, 4, 1, 2, 1, 4, 5, 1, 1, 2, ~
## $ Prob_Lvl     <dbl> 2, 3, 3, 3, 2, 3, 3, 4, 2, 3, 3, 2, 2, 3, 5, 5, 3, 1, 2, ~
baseline %>% mutate(across(.cols = everything(), as.factor)) %>% summary()
##           Term    Class        ID              Major       Age_Yrs   Work_Yrs
##  2019 Fall  :82   1:26   1      :  6   Engineering:27   24     :46   0:61    
##  2020 Spring:25   2:26   2      :  6   Computer/IT:26   25     :39   1:38    
##  2020 Winter:56   3:30   3      :  6   Finance    :24   26     :33   2:36    
##                   4:31   4      :  6   Other      :22   23     :20   3:15    
##                   5:25   5      :  6   Business   :18   22     :13   4: 8    
##                   6:25   6      :  6   Economics  :11   28     : 5   5: 4    
##                          (Other):127   (Other)    :35   (Other): 7   7: 1    
##  Stats_Yrs R_Lvl   Cen_Tend_Lvl Var_Lvl Bivar_Lvl Prob_Lvl
##  0:58      1:107   1:15         1:14    1:32      1:15    
##  1:43      2: 24   2: 7         2:22    2:38      2:33    
##  2:37      3: 24   3:32         3:33    3:66      3:73    
##  3:13      4:  6   4:45         4:61    4:20      4:33    
##  4: 7      5:  2   5:64         5:33    5: 7      5: 9    
##  5: 5                                                     
## 

Part (2)

table(baseline$Age_Yrs, baseline$Bivar_Lvl, deparse.level = 2) %>% addmargins()
##                 baseline$Bivar_Lvl
## baseline$Age_Yrs   1   2   3   4   5 Sum
##              21    0   0   2   1   0   3
##              22    5   4   2   2   0  13
##              23    2   1  10   3   4  20
##              24    7  12  23   2   2  46
##              25   10  11  11   6   1  39
##              26    6   6  15   6   0  33
##              27    2   1   1   0   0   4
##              28    0   3   2   0   0   5
##              Sum  32  38  66  20   7 163

Module 2

Part (1)

baseline %>%
  mutate(across(.cols = c(Term:Major, ends_with("Lvl")), as_factor)) %>%
  summary()
##           Term    Class        ID              Major       Age_Yrs    
##  2019 Fall  :82   1:26   1      :  6   Engineering:27   Min.   :21.0  
##  2020 Winter:56   2:26   2      :  6   Computer/IT:26   1st Qu.:24.0  
##  2020 Spring:25   3:30   3      :  6   Finance    :24   Median :24.0  
##                   4:31   4      :  6   Other      :22   Mean   :24.5  
##                   5:25   5      :  6   Business   :18   3rd Qu.:26.0  
##                   6:25   6      :  6   Economics  :11   Max.   :28.0  
##                          (Other):127   (Other)    :35                 
##     Work_Yrs       Stats_Yrs     R_Lvl   Cen_Tend_Lvl Var_Lvl Bivar_Lvl
##  Min.   :0.000   Min.   :0.000   1:107   1:15         1:14    1:32     
##  1st Qu.:0.000   1st Qu.:0.000   2: 24   2: 7         2:22    2:38     
##  Median :1.000   Median :1.000   3: 24   3:32         3:33    3:66     
##  Mean   :1.313   Mean   :1.282   4:  6   4:45         4:61    4:20     
##  3rd Qu.:2.000   3rd Qu.:2.000   5:  2   5:64         5:33    5: 7     
##  Max.   :7.000   Max.   :5.000                                         
##                                                                        
##  Prob_Lvl
##  1:15    
##  2:33    
##  3:73    
##  4:33    
##  5: 9    
##          
## 
baseline %>%
  select(Age_Yrs:Prob_Lvl) %>%
  describe(., na.rm = TRUE, skew = FALSE, fast = TRUE)
##              vars   n  mean   sd min max range   se
## Age_Yrs         1 163 24.50 1.46  21  28     7 0.11
## Work_Yrs        2 163  1.31 1.39   0   7     7 0.11
## Stats_Yrs       3 163  1.28 1.31   0   5     5 0.10
## R_Lvl           4 163  1.60 0.95   1   5     4 0.07
## Cen_Tend_Lvl    5 163  3.83 1.25   1   5     4 0.10
## Var_Lvl         6 163  3.47 1.20   1   5     4 0.09
## Bivar_Lvl       7 163  2.58 1.07   1   5     4 0.08
## Prob_Lvl        8 163  2.93 1.00   1   5     4 0.08
# Full Sample
baseline %>%
  select(Age_Yrs:Prob_Lvl) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value") %>%
  group_by(Variable) %>%
  summarize(Mean = mean(Value, na.rm = TRUE), 
            SD = sd(Value, na.rm = TRUE),
            Min = min(Value, na.rm = TRUE),
            Max = max(Value, na.rm = TRUE),
            N = NROW(Value) # ,NAs = sum(is.na(Value)
            )
## # A tibble: 8 x 6
##   Variable      Mean    SD   Min   Max     N
##   <chr>        <dbl> <dbl> <dbl> <dbl> <int>
## 1 Age_Yrs      24.5  1.46     21    28   163
## 2 Bivar_Lvl     2.58 1.07      1     5   163
## 3 Cen_Tend_Lvl  3.83 1.25      1     5   163
## 4 Prob_Lvl      2.93 0.997     1     5   163
## 5 R_Lvl         1.60 0.953     1     5   163
## 6 Stats_Yrs     1.28 1.31      0     5   163
## 7 Var_Lvl       3.47 1.20      1     5   163
## 8 Work_Yrs      1.31 1.39      0     7   163
# Means grouped by Age
baseline %>%
  group_by(Age_Yrs) %>%
  select(Work_Yrs:Prob_Lvl) %>%
  summarize(across(.cols = everything(), mean))
## # A tibble: 8 x 8
##   Age_Yrs Work_Yrs Stats_Yrs R_Lvl Cen_Tend_Lvl Var_Lvl Bivar_Lvl Prob_Lvl
##     <dbl>    <dbl>     <dbl> <dbl>        <dbl>   <dbl>     <dbl>    <dbl>
## 1      21    0         1.33   2.33         4       4         3.33     4   
## 2      22    0.231     0.385  1.31         3.62    3.08      2.08     2.77
## 3      23    0.75      1.65   2.4          3.65    3.4       3.3      3.4 
## 4      24    0.761     0.804  1.41         4.09    3.76      2.57     2.80
## 5      25    1.77      1.54   1.79         3.72    3.41      2.41     2.79
## 6      26    1.76      1.58   1.30         3.91    3.42      2.64     2.91
## 7      27    4.75      3.75   1.5          3.75    3.25      1.75     3   
## 8      28    3         0.6    1            3.2     2.8       2.4      3
# Means grouped by Bivariate Level
baseline %>%
  group_by(Bivar_Lvl) %>%
  select(Age_Yrs:Prob_Lvl) %>%
  summarize(across(.cols = everything(), mean))
## # A tibble: 5 x 8
##   Bivar_Lvl Age_Yrs Work_Yrs Stats_Yrs R_Lvl Cen_Tend_Lvl Var_Lvl Prob_Lvl
##       <dbl>   <dbl>    <dbl>     <dbl> <dbl>        <dbl>   <dbl>    <dbl>
## 1         1    24.5     1.56      1.5   1.28         2.5     2.06     1.78
## 2         2    24.8     1.37      1.37  1.32         3.79    3.21     2.47
## 3         3    24.5     1.12      1.08  1.56         4.27    3.95     3.29
## 4         4    24.4     1.3       0.9   2.05         4.6     4.5      3.9 
## 5         5    23.6     1.71      2.86  3.71         3.86    3.86     4.43
# Means of Age and Bivariate grouped by Term & Class
baseline %>%
  group_by(Term, Class) %>%
  summarize(across(.cols = c(Age_Yrs, Bivar_Lvl), mean))
## # A tibble: 6 x 4
## # Groups:   Term [3]
##   Term        Class Age_Yrs Bivar_Lvl
##   <chr>       <dbl>   <dbl>     <dbl>
## 1 2019 Fall       1    23.9      2.42
## 2 2019 Fall       2    25.1      2.35
## 3 2019 Fall       3    25.3      2.6 
## 4 2020 Spring     6    24.2      2.88
## 5 2020 Winter     4    24.1      2.42
## 6 2020 Winter     5    24.2      2.88

Part (2)

baseline %>%
  ggplot(aes(Age_Yrs, Bivar_Lvl, color = factor(Class), shape = Term)) +
  geom_jitter(height = 0.2, width = 0.2) +
  geom_vline(xintercept = mean(baseline$Age_Yrs), col = "red") +
  geom_hline(yintercept = mean(baseline$Bivar_Lvl), col = "blue") +
  geom_smooth(method = "lm",
              se = FALSE,
              col = "black",
              inherit.aes = FALSE,
              aes(Age_Yrs, Bivar_Lvl)) +
  ggtitle("Red/Blue are Means, Black is OLS")
## `geom_smooth()` using formula 'y ~ x'

baseline %>%
  select(Age_Yrs, Bivar_Lvl) %>%
  mutate(Bivar_Lvl = factor(Bivar_Lvl)) %>%
  ggpairs()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

baseline %>%
  ggplot(aes(factor(Age_Yrs), Bivar_Lvl, color = Bivar_Lvl)) +
  geom_boxplot(varwidth = TRUE) +
  geom_jitter(width = 0.2) 

baseline %>%
  ggplot(aes(Age_Yrs, Bivar_Lvl, color = factor(Bivar_Lvl))) +
  geom_boxplot(varwidth = TRUE) +
  geom_jitter(width = 0.2)