Statistics on the midterm results
Score data in baxplot - by each problem set
#
# midterm <- readxl::read_excel("Inha/5_Lectures/2023/Advanced biostatistics/2023F/Scoreboard/BTE3207_2023F_midterm_results.xlsx") %>%
# tidyr::gather(., problemset, score, `1`:`12`, factor_key=TRUE)
#readxl::read_excel("Inha/5_Lectures/2024/Advanced biostatistics/2024F/BTE3207_2024F_midterm_score.xlsx")
midterm <- readxl::read_excel("Inha/5_Lectures/2024/Advanced biostatistics/2024F/BTE3207_2024F_midterm_score.xlsx") %>%
select(-name,-`87_score`, -`100_score`, -`1st_to_100`) %>%
subset(., .$student_id != 0)
full_score <- readxl::read_excel("Inha/5_Lectures/2024/Advanced biostatistics/2024F/BTE3207_2024F_midterm_score.xlsx") %>%
select(-name,-`87_score`, -`100_score`, -`1st_to_100`) %>%
subset(., .$student_id == 0) %>%
select(-student_id, ) %>%
t %>%
data.frame(full_score = .) %>%
rownames_to_column("problemset")
students_list <- readxl::read_excel("Inha/5_Lectures/2024/Advanced biostatistics/2024F/BTE3207_2024F_students_list.xlsx") %>%
rename(student_id = "학번",
department = "학부(과)",
major = "전공",
year = "학년") %>%
select(c("student_id", "department", "major", "year")) %>%
mutate(department = case_when(department == "생명공학과" ~ "BE",
department == "식품영양학과" ~ "FS",
department == "화학과" ~ "CHEM",
department == "통계학과" ~ "STAT",
department == "해양과학과" ~ "OS",
.default = NA))
midterm_merged <- merge(midterm, students_list, by = "student_id", all = T) %>%
tidyr::gather(., problemset, score, `p1`:`p13`, factor_key=TRUE) %>%
merge(., full_score, by = "problemset") %>%
mutate(score_perc = score/full_score*100, .before = "full_score") %>%
tidyr::gather(., score_type, score, `score`:`score_perc`, factor_key=TRUE)
midterm_total <- midterm_merged %>%
mutate(student_year = substr(student_id, 3,4)) %>%
group_by(student_id, score_type) %>%
summarise(student_year = student_year[1],
year = year[1],
department = department[1],
major = major[1],
total = sum(score)) %>%
mutate(total = case_when(score_type == "score_perc" ~ total/1300 * 100,
.default = total))
cat("\n Summary of raw score")
##
## Summary of raw score
midterm_total %>%
subset(., .$score_type == "score") %>%
.$total %>%
summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.40 40.25 48.45 48.57 58.77 67.70
cat("\n Summary of percent score (over 100)")
##
## Summary of percent score (over 100)
midterm_total %>%
subset(., .$score_type != "score") %>%
.$total %>%
summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.91 47.81 57.34 56.42 67.53 79.77
midterm_total %>%
#subset(., .$score_type == "score") %>%
ggplot(aes(x=total, fill = "white")) +
geom_histogram() +
theme_classic() +
theme(legend.position = "none") +
ylab("Number of students") +
xlab("Raw score | percent scroe") +
facet_wrap(~score_type, ncol = 1) +
MetBrewer::scale_fill_met_d("Lakota")
Boxplot of each problemset
midterm_merged %>%
ggplot(aes(x = problemset, y = score, fill = problemset)) +
geom_boxplot() +
scale_fill_brewer(name = "Problem set",
type = "qual",
palette = 3) +
theme_classic(base_family = "serif",
base_size = 13) +
xlab("Problem set") +
ylab("Score") +
facet_wrap(~score_type, scales = "free")
Score data in histogram - by each problem set
midterm_merged %>%
subset(., .$score_type == "score") %>%
ggplot(aes(x = score, fill = problemset)) +
geom_histogram() +
MetBrewer::scale_fill_met_d(name = "Signac") +
facet_wrap(~problemset, scales = "free") +
#guide_legend(title = "Problem set") +
theme_classic(base_family = "serif",
base_size = 13) +
ylab("Number of students") +
xlab("Range of raw score")
midterm_merged %>%
subset(., .$score_type == "score_perc") %>%
ggplot(aes(x = score, fill = problemset)) +
geom_histogram() +
MetBrewer::scale_fill_met_d(name = "Signac") +
facet_wrap(~problemset, scales = "free") +
#guide_legend(title = "Problem set") +
theme_classic(base_family = "serif",
base_size = 13) +
ylab("Number of students") +
xlab("Range of % score")
Score total - summary statistics
midterm_merged %>%
subset(., .$score_type == "score") %>%
select(-c(full_score, student_id, department, major, year)) %>%
reactable(groupBy = c("problemset"),
columns = list(
score_type = colDef(aggregate = "frequency"),
score = colDef(aggregate = "mean", format = colFormat(digits = 2))
)) %>%
reactablefmtr::add_title("Interactive table of summary statistics raw score by each problemset")
Interactive table of summary statistics raw score by each problemset
midterm_merged %>%
subset(., .$score_type != "score") %>%
select(-c(full_score, student_id, department, major, year)) %>%
reactable(groupBy = c("problemset"),
columns = list(
score_type = colDef(aggregate = "frequency"),
score = colDef(aggregate = "mean", format = colFormat(digits = 2))
)) %>%
reactablefmtr::add_title("Interactive table of summary statistics of % score by each problemset")
Interactive table of summary statistics of % score by each problemset
Score total - boxplot and histogram
midterm_total %>%
ggplot(aes(x = department, y = total, fill = department)) +
geom_boxplot() +
MetBrewer::scale_fill_met_d(name = "Signac") +
theme_classic(base_family = "serif",
base_size = 13) +
xlab("By department") +
theme(axis.text.x = element_blank(),
legend.position = "none") +
facet_wrap(~score_type, scales = "free")+
ylab("Total score")
midterm_total %>%
ggplot(aes(x = student_year, y = total, fill = student_year)) +
geom_boxplot() +
MetBrewer::scale_fill_met_d(name = "Signac") +
theme_classic(base_family = "serif",
base_size = 13) +
theme(axis.text.x = element_blank(),
legend.position = "none") +
guides(fill = guide_legend()) +
xlab("By school admission year") +
facet_wrap(~score_type, scales = "free")+
ylab("Total score")
For fun, a statistical test
Is there a different between junior and senior?
Total score - boxplot by year
midterm_total %>%
subset(., .$score_type == "score") %>%
mutate(year = as.factor(year)) %>%
ggplot(aes(y = total, x = year, fill = year)) +
geom_boxplot() +
#facet_wrap(~year, ncol = 1) +
theme_classic(base_family = "serif",
base_size = 13) +
ylab("Score") +
xlab("Year of study at Inha")
Total score - histogram by year
midterm_total %>%
#subset(., .$score_type == "score") %>%
mutate(year = as.factor(year)) %>%
ggplot(aes(x = total, fill = year)) +
geom_histogram(aes(fill = year), position = position_identity(), alpha = 0.5) +
scale_fill_brewer(name = "Year of study at Inha",
type = "qual",
palette = 6) +
#facet_wrap(~year, ncol = 1) +
theme_classic(base_family = "serif",
base_size = 13) +
theme(legend.position = "top") +
facet_wrap(~score_type, ncol = 1, scales = "free") +
xlab("Score") +
ylab("Number of students")
t-tests
t-test (unpaired, unequal variance)
H0 (null hypothesis): true difference in means is 0, i.e., there is no difference between junior and senior Ha (alternative hypothesis): true difference in means is not equal to 0.
raw_score_data <- midterm_total %>%
subset(., .$score_type == "score") %>%
mutate(year = as.factor(year))
t.test(subset(raw_score_data, raw_score_data$year == "4") %>% .$total,
subset(raw_score_data, raw_score_data$year == "3") %>% .$total,
paired = F, var.equal = F)
##
## Welch Two Sample t-test
##
## data: subset(raw_score_data, raw_score_data$year == "4") %>% .$total and subset(raw_score_data, raw_score_data$year == "3") %>% .$total
## t = 0.54554, df = 23.339, p-value = 0.5906
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -7.181455 12.331455
## sample estimates:
## mean of x mean of y
## 49.675 47.100
perc_score_data <- midterm_total %>%
subset(., .$score_type != "score") %>%
mutate(year = as.factor(year))
t.test(subset(perc_score_data, perc_score_data$year == "4") %>% .$total,
subset(perc_score_data, perc_score_data$year == "3") %>% .$total,
paired = F, var.equal = F)
##
## Welch Two Sample t-test
##
## data: subset(perc_score_data, perc_score_data$year == "4") %>% .$total and subset(perc_score_data, perc_score_data$year == "3") %>% .$total
## t = 0.62849, df = 22.452, p-value = 0.536
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -7.898507 14.778948
## sample estimates:
## mean of x mean of y
## 57.89566 54.45544
p-value is 0.48. Therefore, we accect the null hypothesis (there is no significant difference between mean of junior and senior).
Alternatively,
raw_score_data %>%
lm(total ~ year, data = .) %>%
summary
##
## Call:
## lm(formula = total ~ year, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.275 -7.819 0.500 9.531 18.025
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.675 3.073 16.167 4.41e-15 ***
## year3 -2.575 4.694 -0.549 0.588
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.29 on 26 degrees of freedom
## Multiple R-squared: 0.01144, Adjusted R-squared: -0.02658
## F-statistic: 0.301 on 1 and 26 DF, p-value: 0.5879
perc_score_data %>%
lm(total ~ year, data = .) %>%
summary
##
## Call:
## lm(formula = total ~ year, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.985 -8.946 -0.246 11.328 21.876
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.896 3.531 16.397 3.15e-15 ***
## year3 -3.440 5.394 -0.638 0.529
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.12 on 26 degrees of freedom
## Multiple R-squared: 0.01541, Adjusted R-squared: -0.02246
## F-statistic: 0.4068 on 1 and 26 DF, p-value: 0.5292
LM on each problemset
midterm_merged %>%
subset(., .$score_type=="score_perc") %>%
lm(data = ., score ~ problemset) %>%
summary %>%
.$coefficient %>%
round(3) %>%
format(digits = 3) %>%
data.frame(check.names = F) %>%
mutate(` ` = case_when(`Pr(>|t|)` %>%as.numeric < 0.001 ~ "***",
`Pr(>|t|)` %>%as.numeric < 0.01 ~ "**",
`Pr(>|t|)` %>%as.numeric < 0.05 ~ "*",
.default = NA)) %>%
reactable(sortable = T, showPageSizeOptions = T)
LM on raw score
With categorical variables (department)
raw_score_data %>%
lm(total ~ department, data = .) %>%
summary %>%
.$coefficient %>%
data.frame(check.names = F) %>%
round(3) %>%
format(digits = 3) %>%
mutate(row_temp = c("(Intercept)", "Dept. 2", "Dept. 3", "Dept. 4", "Dept. 5")) %>%
remove_rownames() %>%
column_to_rownames("row_temp") %>%
reactable
LM on percent score
perc_score_data %>%
lm(total ~ department, data = .) %>%
summary %>%
.$coefficient %>%
data.frame(check.names = F) %>%
round(3) %>%
format(digits = 3) %>%
mutate(row_temp = c("(Intercept)", "Dept. 2", "Dept. 3", "Dept. 4", "Dept. 5")) %>%
remove_rownames() %>%
column_to_rownames("row_temp") %>%
reactable
We are going to learn more about the linear model, at the later half of the semester! # Bibliography
## Computing. R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN, Gao C (2024). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.10, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## version 0.4.4, <https://CRAN.R-project.org/package=reactable>.