COD_week9_2_MGK_BTE3207

Minsik Kim

2024-10-31

Statistics on the midterm results

Score data in baxplot - by each problem set

# 
# midterm <- readxl::read_excel("Inha/5_Lectures/2023/Advanced biostatistics/2023F/Scoreboard/BTE3207_2023F_midterm_results.xlsx") %>%
#         tidyr::gather(., problemset, score, `1`:`12`, factor_key=TRUE)

#readxl::read_excel("Inha/5_Lectures/2024/Advanced biostatistics/2024F/BTE3207_2024F_midterm_score.xlsx")

midterm <- readxl::read_excel("Inha/5_Lectures/2024/Advanced biostatistics/2024F/BTE3207_2024F_midterm_score.xlsx") %>%
        select(-name,-`87_score`, -`100_score`, -`1st_to_100`) %>% 
        subset(., .$student_id != 0)
        
full_score <- readxl::read_excel("Inha/5_Lectures/2024/Advanced biostatistics/2024F/BTE3207_2024F_midterm_score.xlsx") %>%
        select(-name,-`87_score`, -`100_score`, -`1st_to_100`) %>% 
        subset(., .$student_id == 0) %>%
        select(-student_id, ) %>%
        t %>%
        data.frame(full_score = .) %>%
        rownames_to_column("problemset")

students_list <- readxl::read_excel("Inha/5_Lectures/2024/Advanced biostatistics/2024F/BTE3207_2024F_students_list.xlsx") %>%
        rename(student_id = "학번",
               department = "학부(과)",
               major = "전공",
               year = "학년") %>% 
        select(c("student_id", "department", "major", "year")) %>%
        mutate(department = case_when(department == "생명공학과" ~ "BE",
                                      department == "식품영양학과" ~ "FS",
                                      department == "화학과" ~ "CHEM",
                                      department == "통계학과" ~ "STAT",
                                      department == "해양과학과" ~ "OS",
                                      .default = NA))



midterm_merged <- merge(midterm, students_list, by = "student_id", all = T) %>%
        tidyr::gather(., problemset, score, `p1`:`p13`, factor_key=TRUE) %>%
        merge(., full_score, by = "problemset") %>%
        mutate(score_perc = score/full_score*100, .before = "full_score") %>%
        tidyr::gather(., score_type, score, `score`:`score_perc`, factor_key=TRUE)


midterm_total <- midterm_merged %>%
        mutate(student_year = substr(student_id, 3,4)) %>% 
        group_by(student_id, score_type) %>%
        summarise(student_year = student_year[1],
                  year = year[1],
                  department = department[1],
                  major = major[1],
                  total = sum(score)) %>%
        mutate(total = case_when(score_type == "score_perc" ~ total/1300 * 100,
                                 .default = total))


cat("\n Summary of raw score")

## 
##  Summary of raw score

midterm_total %>% 
        subset(., .$score_type == "score") %>%
        .$total %>%
        summary

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.40   40.25   48.45   48.57   58.77   67.70

cat("\n Summary of percent score (over 100)")

## 
##  Summary of percent score (over 100)

midterm_total %>% 
        subset(., .$score_type != "score") %>%
        .$total %>%
        summary

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   24.91   47.81   57.34   56.42   67.53   79.77

midterm_total %>% 
        #subset(., .$score_type == "score") %>%
        ggplot(aes(x=total, fill = "white")) +
        geom_histogram() +
        theme_classic() +
        theme(legend.position = "none") +
        ylab("Number of students") +
        xlab("Raw score | percent scroe") +
        facet_wrap(~score_type, ncol = 1) +
        MetBrewer::scale_fill_met_d("Lakota")

Boxplot of each problemset

midterm_merged %>% 
        ggplot(aes(x = problemset, y = score, fill = problemset)) +
                       geom_boxplot() +
        scale_fill_brewer(name = "Problem set",
                          type = "qual", 
                          palette = 3) +
        theme_classic(base_family = "serif",
                      base_size = 13) +
        xlab("Problem set") +
        ylab("Score") +
        facet_wrap(~score_type, scales = "free")

Score data in histogram - by each problem set

midterm_merged %>% 
        subset(., .$score_type == "score") %>%
        ggplot(aes(x = score, fill = problemset)) +
        geom_histogram() +
        MetBrewer::scale_fill_met_d(name = "Signac") +
        facet_wrap(~problemset, scales = "free") +
        #guide_legend(title = "Problem set") +
        theme_classic(base_family = "serif",
                      base_size = 13) +
        ylab("Number of students") +
        xlab("Range of raw score")

midterm_merged %>% 
        subset(., .$score_type == "score_perc") %>%
        ggplot(aes(x = score, fill = problemset)) +
        geom_histogram() +
        MetBrewer::scale_fill_met_d(name = "Signac") +
        facet_wrap(~problemset, scales = "free") +
        #guide_legend(title = "Problem set") +
        theme_classic(base_family = "serif",
                      base_size = 13) +
        ylab("Number of students") +
        xlab("Range of % score")

Score total - summary statistics

midterm_merged %>% 
        subset(., .$score_type == "score") %>% 
        select(-c(full_score, student_id, department, major, year)) %>%
        reactable(groupBy = c("problemset"),
                  columns = list(
                          score_type = colDef(aggregate = "frequency"),
                          score = colDef(aggregate = "mean", format = colFormat(digits = 2))
                  )) %>%
        reactablefmtr::add_title("Interactive table of summary statistics raw score by each problemset")

Interactive table of summary statistics raw score by each problemset

midterm_merged %>% 
        subset(., .$score_type != "score") %>% 
        select(-c(full_score, student_id, department, major, year)) %>%
        reactable(groupBy = c("problemset"),
                  columns = list(
                          score_type = colDef(aggregate = "frequency"),
                          score = colDef(aggregate = "mean", format = colFormat(digits = 2))
                  )) %>%
        reactablefmtr::add_title("Interactive table of summary statistics of % score by each problemset")

Interactive table of summary statistics of % score by each problemset

Score total - boxplot and histogram

midterm_total %>%
        ggplot(aes(x = department, y = total, fill = department)) +
        geom_boxplot() +
        MetBrewer::scale_fill_met_d(name = "Signac") +
        theme_classic(base_family = "serif",
                      base_size = 13) +
        xlab("By department") +
        theme(axis.text.x = element_blank(),
              legend.position = "none") +
        facet_wrap(~score_type, scales = "free")+
        ylab("Total score")

midterm_total %>%
        ggplot(aes(x = student_year, y = total, fill = student_year)) +
        geom_boxplot() +
        MetBrewer::scale_fill_met_d(name = "Signac") +
        theme_classic(base_family = "serif",
                      base_size = 13) +
        theme(axis.text.x = element_blank(),
              legend.position = "none") +
        guides(fill = guide_legend()) +
        xlab("By school admission year") +
        facet_wrap(~score_type, scales = "free")+
        ylab("Total score")

For fun, a statistical test

Is there a different between junior and senior?

Total score - boxplot by year

midterm_total %>%
        subset(., .$score_type == "score") %>% 
        mutate(year = as.factor(year)) %>% 
        ggplot(aes(y = total, x = year, fill = year)) +
        geom_boxplot() +
        #facet_wrap(~year, ncol = 1) +
        theme_classic(base_family = "serif",
                      base_size = 13) +
        ylab("Score") +
        xlab("Year of study at Inha")

Total score - histogram by year

midterm_total %>%
        #subset(., .$score_type == "score") %>% 
        mutate(year = as.factor(year)) %>%
        ggplot(aes(x = total, fill = year)) +
        geom_histogram(aes(fill = year), position = position_identity(), alpha = 0.5) +
        scale_fill_brewer(name = "Year of study at Inha",
                          type = "qual", 
                          palette = 6) +
        #facet_wrap(~year, ncol = 1) +
        theme_classic(base_family = "serif",
                      base_size = 13) +
        theme(legend.position = "top") + 
        facet_wrap(~score_type, ncol = 1, scales = "free") + 
        xlab("Score") +
        ylab("Number of students")

t-tests

t-test (unpaired, unequal variance)

H₀ (null hypothesis): true difference in means is 0, i.e., there is no difference between junior and senior H_a (alternative hypothesis): true difference in means is not equal to 0.

raw_score_data <- midterm_total %>%
        subset(., .$score_type == "score") %>% 
        mutate(year = as.factor(year))

t.test(subset(raw_score_data, raw_score_data$year == "4") %>% .$total,
       subset(raw_score_data, raw_score_data$year == "3") %>% .$total,
       paired = F, var.equal = F)

## 
##  Welch Two Sample t-test
## 
## data:  subset(raw_score_data, raw_score_data$year == "4") %>% .$total and subset(raw_score_data, raw_score_data$year == "3") %>% .$total
## t = 0.54554, df = 23.339, p-value = 0.5906
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -7.181455 12.331455
## sample estimates:
## mean of x mean of y 
##    49.675    47.100

perc_score_data <- midterm_total %>%
        subset(., .$score_type != "score") %>% 
        mutate(year = as.factor(year))

t.test(subset(perc_score_data, perc_score_data$year == "4") %>% .$total,
       subset(perc_score_data, perc_score_data$year == "3") %>% .$total,
       paired = F, var.equal = F)

## 
##  Welch Two Sample t-test
## 
## data:  subset(perc_score_data, perc_score_data$year == "4") %>% .$total and subset(perc_score_data, perc_score_data$year == "3") %>% .$total
## t = 0.62849, df = 22.452, p-value = 0.536
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -7.898507 14.778948
## sample estimates:
## mean of x mean of y 
##  57.89566  54.45544

p-value is 0.48. Therefore, we accect the null hypothesis (there is no significant difference between mean of junior and senior).

Alternatively,

raw_score_data %>%
        lm(total ~ year, data = .) %>% 
        summary

## 
## Call:
## lm(formula = total ~ year, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -30.275  -7.819   0.500   9.531  18.025 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   49.675      3.073  16.167 4.41e-15 ***
## year3         -2.575      4.694  -0.549    0.588    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.29 on 26 degrees of freedom
## Multiple R-squared:  0.01144,    Adjusted R-squared:  -0.02658 
## F-statistic: 0.301 on 1 and 26 DF,  p-value: 0.5879

perc_score_data %>%
        lm(total ~ year, data = .) %>% 
        summary

## 
## Call:
## lm(formula = total ~ year, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.985  -8.946  -0.246  11.328  21.876 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   57.896      3.531  16.397 3.15e-15 ***
## year3         -3.440      5.394  -0.638    0.529    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.12 on 26 degrees of freedom
## Multiple R-squared:  0.01541,    Adjusted R-squared:  -0.02246 
## F-statistic: 0.4068 on 1 and 26 DF,  p-value: 0.5292

LM on each problemset

midterm_merged %>%
        subset(., .$score_type=="score_perc") %>%
        lm(data = ., score ~ problemset) %>%
        summary %>%
        .$coefficient %>%
        round(3) %>%
        format(digits = 3) %>%
        data.frame(check.names = F) %>%
        mutate(` ` = case_when(`Pr(>|t|)` %>%as.numeric < 0.001 ~ "***",
                               `Pr(>|t|)` %>%as.numeric < 0.01 ~ "**",
                               `Pr(>|t|)` %>%as.numeric < 0.05 ~ "*",
                               .default = NA)) %>%
        reactable(sortable = T, showPageSizeOptions = T)

LM on raw score

With categorical variables (department)

raw_score_data %>%
        lm(total ~ department, data = .) %>% 
        summary %>%
        .$coefficient %>%
        data.frame(check.names = F) %>%
        round(3) %>% 
        format(digits = 3) %>%
        mutate(row_temp = c("(Intercept)", "Dept. 2", "Dept. 3", "Dept. 4", "Dept. 5")) %>%
        remove_rownames() %>%
        column_to_rownames("row_temp") %>%
        reactable

LM on percent score

perc_score_data %>%
        lm(total ~ department, data = .) %>% 
        summary %>%
        .$coefficient %>%
        data.frame(check.names = F) %>%
        round(3) %>% 
        format(digits = 3) %>%
        mutate(row_temp = c("(Intercept)", "Dept. 2", "Dept. 3", "Dept. 4", "Dept. 5")) %>%
        remove_rownames() %>%
        column_to_rownames("row_temp") %>%
        reactable

We are going to learn more about the linear model, at the later half of the semester! # Bibliography

## Computing. R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN, Gao C (2024). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.10, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## version 0.4.4, <https://CRAN.R-project.org/package=reactable>.