library(tidyverse)
library(magrittr)
library(GGally)
library(psych)
# https://www.tutorialspoint.com/r/r_mean_median_mode.htm
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
Levels are rank of experience
# Import Data
baseline <- read_csv("G:/My Drive/homework/Lovely J/Baseline+survey.csv")
# Clean up Answer5
bad <-
baseline %>%
select(Answer5) %>%
distinct() %>%
filter(Answer5 != "1" &
Answer5 != "2" &
Answer5 != "3" &
Answer5 != "4" &
Answer5 != "5") %>%
pull()
good <-
c("1", "2", NA, "3", "1", "1", "1", "2", "3", "1", "2", "4", "3", "3", "1",
"3", "1", "1", "1", "1", "1", "1", "1", "1", "1", "3", "1")
for (i in 1:length(good)){
baseline$Answer5[baseline$Answer5 == bad[i]] <- good[i]
}
# Drop questions, change answer data types & names
baseline %<>%
select(-X1, -starts_with("Question")) %>%
mutate(Major = Answer1,
Age_Yrs = Answer2 %>% as.numeric(),
Work_Yrs = Answer3 %>% as.numeric(),
Stats_Yrs = Answer4 %>% as.numeric(),
R_Lvl = Answer5 %>% as.integer(.),
Cen_Tend_Lvl = Answer6 %>% as.integer(.),
Var_Lvl = Answer7 %>% as.integer(.),
Bivar_Lvl = Answer8 %>% as.integer(.),
Prob_Lvl = Answer9 %>% as.integer(.),
.keep = "unused")
# Replace NAs
# https://www.tutorialspoint.com/r/r_mean_median_mode.htm
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
age_mode <- baseline$Age_Yrs %>% getmode(.)
baseline %<>% mutate(Age_Yrs = replace_na(Age_Yrs, age_mode))
rm(age_mode)
baseline %<>% mutate(Work_Yrs = round(Work_Yrs))
work_mode <- baseline$Work_Yrs %>% getmode(.)
baseline %<>% mutate(Work_Yrs = replace_na(Work_Yrs, work_mode))
rm(work_mode)
baseline %<>% mutate(Stats_Yrs = round(Stats_Yrs))
stats_mode <- baseline$Stats_Yrs %>% getmode(.)
baseline %<>% mutate(Stats_Yrs = replace_na(Stats_Yrs, stats_mode))
rm(stats_mode)
r_mode <- baseline$R_Lvl %>% getmode(.)
baseline %<>% mutate(R_Lvl = replace_na(R_Lvl, r_mode))
rm(r_mode)
cen_mode <- baseline$Cen_Tend_Lvl %>% getmode(.)
baseline %<>% mutate(Cen_Tend_Lvl = replace_na(Cen_Tend_Lvl, cen_mode))
baseline$Cen_Tend_Lvl[baseline$Cen_Tend_Lvl < 1] <- 1
baseline$Cen_Tend_Lvl[baseline$Cen_Tend_Lvl > 5] <- cen_mode
rm(cen_mode)
var_mode <- baseline$Var_Lvl %>% getmode(.)
baseline %<>% mutate(Var_Lvl = replace_na(Var_Lvl, var_mode))
baseline$Var_Lvl[baseline$Var_Lvl < 1] = 1
baseline$Var_Lvl[baseline$Var_Lvl > 5] = var_mode
rm(var_mode)
bivar_mode <- baseline$Bivar_Lvl %>% getmode(.)
baseline %<>% mutate(Bivar_Lvl = replace_na(Bivar_Lvl, bivar_mode))
baseline$Bivar_Lvl[baseline$Bivar_Lvl < 1] = 1
baseline$Bivar_Lvl[baseline$Bivar_Lvl > 5] = bivar_mode
rm(bivar_mode)
prob_mode <- baseline$Prob_Lvl %>% getmode(.)
baseline %<>% mutate(Prob_Lvl = replace_na(Prob_Lvl, prob_mode))
baseline$Prob_Lvl[baseline$Prob_Lvl < 1] = 1
baseline$Prob_Lvl[baseline$Prob_Lvl > 5] = prob_mode
rm(prob_mode)
rm(getmode)
rm(bad)
rm(good)
# Summaries
baseline %>% glimpse()
## Rows: 163
## Columns: 12
## $ Term <chr> "2019 Fall", "2019 Fall", "2019 Fall", "2019 Fall", "2019~
## $ Class <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ ID <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17~
## $ Major <chr> "Computer/IT", "Engineering", "Business", "Mathematics", ~
## $ Age_Yrs <dbl> 24, 24, 23, 23, 22, 24, 22, 24, 26, 24, 22, 22, 22, 25, 2~
## $ Work_Yrs <dbl> 2, 1, 0, 0, 0, 2, 0, 1, 4, 0, 0, 0, 0, 2, 1, 1, 4, 2, 0, ~
## $ Stats_Yrs <dbl> 2, 1, 0, 0, 0, 2, 0, 1, 4, 0, 0, 0, 0, 2, 1, 1, 4, 2, 0, ~
## $ R_Lvl <int> 2, 1, 1, 2, 2, 3, 2, 4, 1, 1, 1, 1, 1, 1, 3, 1, 2, 3, 1, ~
## $ Cen_Tend_Lvl <dbl> 3, 1, 5, 3, 3, 5, 4, 4, 3, 5, 4, 2, 4, 5, 5, 5, 5, 2, 3, ~
## $ Var_Lvl <dbl> 3, 1, 4, 3, 2, 4, 4, 4, 2, 4, 4, 2, 3, 4, 5, 5, 2, 2, 2, ~
## $ Bivar_Lvl <dbl> 1, 1, 3, 3, 1, 2, 2, 3, 2, 3, 4, 1, 2, 1, 4, 5, 1, 1, 2, ~
## $ Prob_Lvl <dbl> 2, 3, 3, 3, 2, 3, 3, 4, 2, 3, 3, 2, 2, 3, 5, 5, 3, 1, 2, ~
baseline %>% mutate(across(.cols = everything(), as.factor)) %>% summary()
## Term Class ID Major Age_Yrs Work_Yrs
## 2019 Fall :82 1:26 1 : 6 Engineering:27 24 :46 0:61
## 2020 Spring:25 2:26 2 : 6 Computer/IT:26 25 :39 1:38
## 2020 Winter:56 3:30 3 : 6 Finance :24 26 :33 2:36
## 4:31 4 : 6 Other :22 23 :20 3:15
## 5:25 5 : 6 Business :18 22 :13 4: 8
## 6:25 6 : 6 Economics :11 28 : 5 5: 4
## (Other):127 (Other) :35 (Other): 7 7: 1
## Stats_Yrs R_Lvl Cen_Tend_Lvl Var_Lvl Bivar_Lvl Prob_Lvl
## 0:58 1:107 1:15 1:14 1:32 1:15
## 1:43 2: 24 2: 7 2:22 2:38 2:33
## 2:37 3: 24 3:32 3:33 3:66 3:73
## 3:13 4: 6 4:45 4:61 4:20 4:33
## 4: 7 5: 2 5:64 5:33 5: 7 5: 9
## 5: 5
##
table(baseline$Age_Yrs, baseline$Bivar_Lvl, deparse.level = 2) %>% addmargins()
## baseline$Bivar_Lvl
## baseline$Age_Yrs 1 2 3 4 5 Sum
## 21 0 0 2 1 0 3
## 22 5 4 2 2 0 13
## 23 2 1 10 3 4 20
## 24 7 12 23 2 2 46
## 25 10 11 11 6 1 39
## 26 6 6 15 6 0 33
## 27 2 1 1 0 0 4
## 28 0 3 2 0 0 5
## Sum 32 38 66 20 7 163
baseline %>%
mutate(across(.cols = c(Term:Major, ends_with("Lvl")), as_factor)) %>%
summary()
## Term Class ID Major Age_Yrs
## 2019 Fall :82 1:26 1 : 6 Engineering:27 Min. :21.0
## 2020 Winter:56 2:26 2 : 6 Computer/IT:26 1st Qu.:24.0
## 2020 Spring:25 3:30 3 : 6 Finance :24 Median :24.0
## 4:31 4 : 6 Other :22 Mean :24.5
## 5:25 5 : 6 Business :18 3rd Qu.:26.0
## 6:25 6 : 6 Economics :11 Max. :28.0
## (Other):127 (Other) :35
## Work_Yrs Stats_Yrs R_Lvl Cen_Tend_Lvl Var_Lvl Bivar_Lvl
## Min. :0.000 Min. :0.000 1:107 1:15 1:14 1:32
## 1st Qu.:0.000 1st Qu.:0.000 2: 24 2: 7 2:22 2:38
## Median :1.000 Median :1.000 3: 24 3:32 3:33 3:66
## Mean :1.313 Mean :1.282 4: 6 4:45 4:61 4:20
## 3rd Qu.:2.000 3rd Qu.:2.000 5: 2 5:64 5:33 5: 7
## Max. :7.000 Max. :5.000
##
## Prob_Lvl
## 1:15
## 2:33
## 3:73
## 4:33
## 5: 9
##
##
baseline %>%
select(Age_Yrs:Prob_Lvl) %>%
describe(., na.rm = TRUE, skew = FALSE, fast = TRUE)
## vars n mean sd min max range se
## Age_Yrs 1 163 24.50 1.46 21 28 7 0.11
## Work_Yrs 2 163 1.31 1.39 0 7 7 0.11
## Stats_Yrs 3 163 1.28 1.31 0 5 5 0.10
## R_Lvl 4 163 1.60 0.95 1 5 4 0.07
## Cen_Tend_Lvl 5 163 3.83 1.25 1 5 4 0.10
## Var_Lvl 6 163 3.47 1.20 1 5 4 0.09
## Bivar_Lvl 7 163 2.58 1.07 1 5 4 0.08
## Prob_Lvl 8 163 2.93 1.00 1 5 4 0.08
# Full Sample
baseline %>%
select(Age_Yrs:Prob_Lvl) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value") %>%
group_by(Variable) %>%
summarize(Mean = mean(Value, na.rm = TRUE),
SD = sd(Value, na.rm = TRUE),
Min = min(Value, na.rm = TRUE),
Max = max(Value, na.rm = TRUE),
N = NROW(Value) # ,NAs = sum(is.na(Value)
)
## # A tibble: 8 x 6
## Variable Mean SD Min Max N
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Age_Yrs 24.5 1.46 21 28 163
## 2 Bivar_Lvl 2.58 1.07 1 5 163
## 3 Cen_Tend_Lvl 3.83 1.25 1 5 163
## 4 Prob_Lvl 2.93 0.997 1 5 163
## 5 R_Lvl 1.60 0.953 1 5 163
## 6 Stats_Yrs 1.28 1.31 0 5 163
## 7 Var_Lvl 3.47 1.20 1 5 163
## 8 Work_Yrs 1.31 1.39 0 7 163
# Means grouped by Age
baseline %>%
group_by(Age_Yrs) %>%
select(Work_Yrs:Prob_Lvl) %>%
summarize(across(.cols = everything(), mean))
## # A tibble: 8 x 8
## Age_Yrs Work_Yrs Stats_Yrs R_Lvl Cen_Tend_Lvl Var_Lvl Bivar_Lvl Prob_Lvl
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21 0 1.33 2.33 4 4 3.33 4
## 2 22 0.231 0.385 1.31 3.62 3.08 2.08 2.77
## 3 23 0.75 1.65 2.4 3.65 3.4 3.3 3.4
## 4 24 0.761 0.804 1.41 4.09 3.76 2.57 2.80
## 5 25 1.77 1.54 1.79 3.72 3.41 2.41 2.79
## 6 26 1.76 1.58 1.30 3.91 3.42 2.64 2.91
## 7 27 4.75 3.75 1.5 3.75 3.25 1.75 3
## 8 28 3 0.6 1 3.2 2.8 2.4 3
# Means grouped by Bivariate Level
baseline %>%
group_by(Bivar_Lvl) %>%
select(Age_Yrs:Prob_Lvl) %>%
summarize(across(.cols = everything(), mean))
## # A tibble: 5 x 8
## Bivar_Lvl Age_Yrs Work_Yrs Stats_Yrs R_Lvl Cen_Tend_Lvl Var_Lvl Prob_Lvl
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 24.5 1.56 1.5 1.28 2.5 2.06 1.78
## 2 2 24.8 1.37 1.37 1.32 3.79 3.21 2.47
## 3 3 24.5 1.12 1.08 1.56 4.27 3.95 3.29
## 4 4 24.4 1.3 0.9 2.05 4.6 4.5 3.9
## 5 5 23.6 1.71 2.86 3.71 3.86 3.86 4.43
# Means of Age and Bivariate grouped by Term & Class
baseline %>%
group_by(Term, Class) %>%
summarize(across(.cols = c(Age_Yrs, Bivar_Lvl), mean))
## # A tibble: 6 x 4
## # Groups: Term [3]
## Term Class Age_Yrs Bivar_Lvl
## <chr> <dbl> <dbl> <dbl>
## 1 2019 Fall 1 23.9 2.42
## 2 2019 Fall 2 25.1 2.35
## 3 2019 Fall 3 25.3 2.6
## 4 2020 Spring 6 24.2 2.88
## 5 2020 Winter 4 24.1 2.42
## 6 2020 Winter 5 24.2 2.88
baseline %>%
ggplot(aes(Age_Yrs, Bivar_Lvl, color = factor(Class), shape = Term)) +
geom_jitter(height = 0.2, width = 0.2) +
geom_vline(xintercept = mean(baseline$Age_Yrs), col = "red") +
geom_hline(yintercept = mean(baseline$Bivar_Lvl), col = "blue") +
geom_smooth(method = "lm",
se = FALSE,
col = "black",
inherit.aes = FALSE,
aes(Age_Yrs, Bivar_Lvl)) +
ggtitle("Red/Blue are Means, Black is OLS")
## `geom_smooth()` using formula 'y ~ x'
baseline %>%
select(Age_Yrs, Bivar_Lvl) %>%
mutate(Bivar_Lvl = factor(Bivar_Lvl)) %>%
ggpairs()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
baseline %>%
ggplot(aes(factor(Age_Yrs), Bivar_Lvl, color = Bivar_Lvl)) +
geom_boxplot(varwidth = TRUE) +
geom_jitter(width = 0.2)
baseline %>%
ggplot(aes(Age_Yrs, Bivar_Lvl, color = factor(Bivar_Lvl))) +
geom_boxplot(varwidth = TRUE) +
geom_jitter(width = 0.2)