#Here, I have made up grades for each student in order to run the script in the absensence of having only the analytics data. 

load("analytics_raw.rds")


Last <- analytics_raw %>%
  select(c(`User ID`, `Sortable Name`)) %>%
  separate(`Sortable Name`, into = c("Last", NA)) %>%
  unique()

load("X241_spring19.rds")

X241_spring19_x <- X241_spring19 %>% 
  rename("T1" = `Exam 1`, "T2" = `Exam 2`, "T3" = `Exam 3`, "T4" = `Exam 4`) %>%
  select(c(Name, T1, T2, T3, T4, Final)) %>%
  mutate(Final = as.numeric(Final), 
         T1 = as.numeric(T1),
         T2 = as.numeric(T2),
         T3 = as.numeric(T3),
         T4 = as.numeric(T4)) %>%
  mutate(Final = Final/2) %>%
  drop_na() %>%
  separate(Name, into = c("Last", NA)) %>%
  left_join(Last, by = "Last") %>%
  select(-Last) %>%
  gather(T1, T2, T3, T4, Final, key = "Exam", value = "Grade" ) %>%
  mutate(Exam = Exam %>% fct_relevel("T1", "T2", "T3", "T4", "Final", "avg"))
#Here, I have made up grades for each student in order to run the script in the absensence of having only the analytics data. 


analytics_full <- analytics_raw %>% 
  left_join(X241_spring19_x, by = "User ID")

Visualizing raw grades

ggplot(data = X241_spring19_x) + 
  geom_boxplot(mapping = aes(x = Exam, y = Grade, fill = Exam)) + 
  geom_jitter(mapping = aes(x = Exam, y = Grade, color = Exam), alpha = .3) +
  theme(panel.background = element_blank(), 
        axis.line = element_line(),
        legend.position="none") + 
  coord_cartesian(ylim = c(0:100)) +
  labs(y = "Grade (%)")

It would be cool to then later ID kids as “video watchers” and “non-video watchers” and then split the fill = video_vatching_status se that T1, T2, etc. grades can be seen for those who do and don’t watch videos

Also make an average grade bar

Visualizing Resource Use

Raw Count of Resource Use by Type

test_test <- analytics_raw %>% 
  select(`User ID`, `Sortable Name`, `Title`, `Views`, `Last Access`, `First Access`) %>%
  mutate(Category = 
           if_else(str_detect(Title, "T5 PRACTICE") | 
                     str_detect(Title, "Test 2") | 
                     str_detect(Title, "T4.pdf") | 
                     str_detect(Title, "Test 1") | 
                     str_detect(Title, "Test 3"), "Test/Key", 
                   if_else(str_detect(Title, "#") |
                             str_detect(Title, "^Q") |
                             str_detect(Title, "T.Q"), "Videos", "Other"
                   )
           ),
         `Last Access` = ymd_hms(`Last Access`), 
         `First Access` = ymd_hms(`First Access`), 
         Ex_num = if_else(`Last Access` < ymd_hms("2019-01-31 09:15:00"), "1", if_else(
           `Last Access` < ymd_hms("2019-02-28 09:15:00"), "2", if_else(
             `Last Access` < ymd_hms("2019-03-28 09:15:00"), "3", if_else(
               `Last Access` < ymd_hms("2019-04-18 09:15:00"), "4", "Final"
               )
             )
           )
         )) %>% 
  arrange(`Sortable Name`)

 # for some reason, this is the better way to annotate ggplots with geom_text. Doing so othrwise makes the labels print twice and appear blurry.

####### T1 vid, Test/answer, other----

df_T1_x_vid <- tibble(
  `Sortable Name` = unique(filter(test_test, Ex_num == 1 & Category == "Videos")$`Sortable Name`),
  T1_vid = tabulate(with(filter(test_test, Ex_num == 1 & Category == "Videos"), interaction(`Sortable Name`)))
)

df_T1_x_TK <- tibble(
  `Sortable Name` = unique(filter(test_test, Ex_num == 1 & Category == "Test/Key")$`Sortable Name`),
  T1_TK = tabulate(with(filter(test_test, Ex_num == 1 & Category == "Test/Key"), interaction(`Sortable Name`)))
)

df_T1_x_other <- tibble(
  `Sortable Name` = unique(filter(test_test, Ex_num == 1 & Category == "Other")$`Sortable Name`),
  T1_other = tabulate(with(filter(test_test, Ex_num == 1 & Category == "Other"), interaction(`Sortable Name`)))
)

####### T2 vid, Test/answer, other----

df_T2_x_vid <- tibble(
  `Sortable Name` = unique(filter(test_test, Ex_num == 2 & Category == "Videos")$`Sortable Name`),
  T2_vid = tabulate(with(filter(test_test, Ex_num == 2 & Category == "Videos"), interaction(`Sortable Name`)))
)

df_T2_x_TK <- tibble(
  `Sortable Name` = unique(filter(test_test, Ex_num == 2 & Category == "Test/Key")$`Sortable Name`),
  T2_TK = tabulate(with(filter(test_test, Ex_num == 2 & Category == "Test/Key"), interaction(`Sortable Name`)))
)

df_T2_x_other <- tibble(
  `Sortable Name` = unique(filter(test_test, Ex_num == 2 & Category == "Other")$`Sortable Name`),
  T2_other = tabulate(with(filter(test_test, Ex_num == 2 & Category == "Other"), interaction(`Sortable Name`)))
)

####### T3 vid, Test/answer, other----

df_T3_x_vid <- tibble(
  `Sortable Name` = unique(filter(test_test, Ex_num == 3 & Category == "Videos")$`Sortable Name`),
  T3_vid = tabulate(with(filter(test_test, Ex_num == 3 & Category == "Videos"), interaction(`Sortable Name`)))
)

df_T3_x_TK <- tibble(
  `Sortable Name` = unique(filter(test_test, Ex_num == 3 & Category == "Test/Key")$`Sortable Name`),
  T3_TK = tabulate(with(filter(test_test, Ex_num == 3 & Category == "Test/Key"), interaction(`Sortable Name`)))
)

df_T3_x_other <- tibble(
  `Sortable Name` = unique(filter(test_test, Ex_num == 3 & Category == "Other")$`Sortable Name`),
  T3_other = tabulate(with(filter(test_test, Ex_num == 3 & Category == "Other"), interaction(`Sortable Name`)))
)

####### T4 vid, Test/answer, other----

df_T4_x_vid <- tibble(
  `Sortable Name` = unique(filter(test_test, Ex_num == 4 & Category == "Videos")$`Sortable Name`),
  T4_vid = tabulate(with(filter(test_test, Ex_num == 4 & Category == "Videos"), interaction(`Sortable Name`)))
)

df_T4_x_TK <- tibble(
  `Sortable Name` = unique(filter(test_test, Ex_num == 4 & Category == "Test/Key")$`Sortable Name`),
  T4_TK = tabulate(with(filter(test_test, Ex_num == 4 & Category == "Test/Key"), interaction(`Sortable Name`)))
)

df_T4_x_other <- tibble(
  `Sortable Name` = unique(filter(test_test, Ex_num == 4 & Category == "Other")$`Sortable Name`),
  T4_other = tabulate(with(filter(test_test, Ex_num == 4 & Category == "Other"), interaction(`Sortable Name`)))
)

####### Final vid, Test/answer, other----

df_final_x_vid <- tibble(
  `Sortable Name` = unique(filter(test_test, Ex_num == "Final" & Category == "Videos")$`Sortable Name`),
  final_vid = tabulate(with(filter(test_test, Ex_num == "Final" & Category == "Videos"), interaction(`Sortable Name`)))
)

df_final_x_TK <- tibble(
  `Sortable Name` = unique(filter(test_test, Ex_num == "Final" & Category == "Test/Key")$`Sortable Name`),
  final_TK = tabulate(with(filter(test_test, Ex_num == "Final" & Category == "Test/Key"), interaction(`Sortable Name`)))
)

df_final_x_other <- tibble(
  `Sortable Name` = unique(filter(test_test, Ex_num == "Final" & Category == "Other")$`Sortable Name`),
  final_other = tabulate(with(filter(test_test, Ex_num == "Final" & Category == "Other"), interaction(`Sortable Name`)))
)

######----

usage <-  test_test %>% select(`Sortable Name`) %>% 
  unique() %>% 
  left_join(df_T1_x_vid, by = "Sortable Name") %>% 
  left_join(df_T1_x_TK, by = "Sortable Name") %>% 
  left_join(df_T1_x_other, by = "Sortable Name") %>%
  
  left_join(df_T2_x_vid, by = "Sortable Name") %>% 
  left_join(df_T2_x_other, by = "Sortable Name") %>%
  left_join(df_T2_x_TK, by = "Sortable Name") %>% 
  
  left_join(df_T3_x_vid, by = "Sortable Name") %>% 
  left_join(df_T3_x_other, by = "Sortable Name") %>%
  left_join(df_T3_x_TK, by = "Sortable Name") %>% 
  
  left_join(df_T4_x_vid, by = "Sortable Name") %>% 
  left_join(df_T4_x_other, by = "Sortable Name") %>%
  left_join(df_T4_x_TK, by = "Sortable Name") %>% 
  
  left_join(df_final_x_vid, by = "Sortable Name") %>% 
  left_join(df_final_x_other, by = "Sortable Name") %>%
  left_join(df_final_x_TK, by = "Sortable Name") %>% 
  
  gather(T1_vid, T1_TK, T1_other,
         T2_vid, T2_TK, T2_other,
         T3_vid, T3_TK, T3_other,
         T4_vid, T4_TK, T4_other,
         final_vid, final_TK, final_other,
         key = view_type, value = count) %>% arrange(`Sortable Name`) %>% separate(view_type, into = c("test", "type")) %>%
  mutate(count = replace_na(count, 0)) %>%
  mutate(test = test %>% fct_relevel("T1", "T2", "T3", "T4", "final")) %>%
  mutate(type = type %>% fct_relevel("vid", "TK", "other"))

####----

ggplot(data = usage) + 
   geom_boxplot(mapping = aes(x = test, y = count, fill = type)) + 
   theme_classic() + 
   labs(y = "Views", x = "View Type (by date viewed)")

################################################

Average views vs. time

Similar to boxplots but more sensitive to outliers. However, I feel like it does a better job of visualising trends in use than do the box plots due to a large number of people accumulating 0 clicks

Use vs. time

Here, I want to explore the relationship between how far out you use the videos and your score.

Also plotly would be really nice here. Need to adjust the label output though, right now it is displaying it in raw seconds

graph_2_labs <- tibble(
  x = c(ymd_hms("2019-02-01 09:15:00") + minutes(60*24*6),
        ymd_hms("2019-03-01 09:15:00") + minutes(60*24*6),
        ymd_hms("2019-03-29 09:15:00") + minutes(60*24*6),
        ymd_hms("2019-04-19 09:15:00") + minutes(60*24*6),
        ymd_hms("2019-05-3 09:15:00") + minutes(60*24*4)),
  y = 250,
  label = c("Exam 1", 
            "Exam 2", 
            "Exam 3", 
            "Exam 4", 
            "Final")
  
)

freq_dt <- test_test %>% filter(Category != "Other") %>% select(`Sortable Name`, `Last Access`) %>% unique()

ggplot(freq_dt) + 
  geom_freqpoly(aes(x = `Last Access`), binwidth = 86400, color = "red", size = .8) + # binwidth = on day in seconds
  theme_classic() + 
  geom_vline(xintercept = as.numeric(c(ymd_hms("2019-02-01 09:15:00",
                                    "2019-03-01 09:15:00",
                                    "2019-03-29 09:15:00",
                                    "2019-04-19 09:15:00",
                                    "2019-05-3 09:15:00")))) + #These are the dates of one day after each exam
  geom_text(data = graph_2_labs, aes(x = x, y = y, label = label)) +
  labs(y = "Views", x = "Date")

Correlations

get ready

corr_names <- analytics_raw %>% select(`User ID`, `Sortable Name`) %>% unique()

tot_x_vid <- tibble(
  `Sortable Name` = unique(filter(test_test, Category == "Videos")$`Sortable Name`),
  TOTAL_vid = tabulate(with(filter(test_test, Category == "Videos"), interaction(`Sortable Name`)))
)

tot_x_TK <- tibble(
  `Sortable Name` = unique(filter(test_test, Category == "Test/Key")$`Sortable Name`),
  TOTAL_TK = tabulate(with(filter(test_test, Category == "Test/Key"), interaction(`Sortable Name`)))
)

corr <- X241_spring19 %>% 
  rename("T1" = `Exam 1`, "T2" = `Exam 2`, "T3" = `Exam 3`, "T4" = `Exam 4`) %>%
  select(c(Name, T1, T2, T3, T4, Final, Percent)) %>%
  mutate(Final = as.numeric(Final), 
         T1 = as.numeric(T1),
         T2 = as.numeric(T2),
         T3 = as.numeric(T3),
         T4 = as.numeric(T4),
         Percent = as.numeric(Percent)) %>%
  mutate(Final = Final/2) %>%
  drop_na() %>%
  separate(Name, into = c("Last", NA)) %>%
  left_join(Last, by = "Last") %>%
  left_join(corr_names, by = "User ID") %>%
  select(-c(Last, `User ID`)) %>%
  
  left_join(df_T1_x_vid, by = "Sortable Name") %>% 
  left_join(df_T1_x_TK, by = "Sortable Name") %>% 
  
  left_join(df_T2_x_vid, by = "Sortable Name") %>% 
  left_join(df_T2_x_TK, by = "Sortable Name") %>% 
  
  left_join(df_T3_x_vid, by = "Sortable Name") %>% 
  left_join(df_T3_x_TK, by = "Sortable Name") %>% 
  
  left_join(df_T4_x_vid, by = "Sortable Name") %>% 
  left_join(df_T4_x_TK, by = "Sortable Name") %>% 
  
  left_join(df_final_x_vid, by = "Sortable Name") %>% 
  left_join(df_final_x_TK, by = "Sortable Name") %>% 
  
  left_join(tot_x_vid, by = "Sortable Name") %>% 
  left_join(tot_x_TK, by = "Sortable Name") %>% 
  
  na_replace(0) %>%
  mutate(T1_tot_resource = T1_vid + T1_TK,
         T2_tot_resource = T2_vid + T2_TK,
         T3_tot_resource = T3_vid + T3_TK,
         T4_tot_resource = T4_vid + T4_TK,
         final_tot_resource = final_vid + final_TK,
         TOTAL = TOTAL_vid + TOTAL_TK)


  


### Making lm, extracting values, and assessing for transformation

PLOTS <- function(x, y, dataset) {
  lm <- lm(y ~ x)
  lm


  p_val <- round(summary(lm)$coefficients[2,4], 3)
  r_sq <- round(summary(lm)$r.squared, 3)
  ad_r_sq <- summary(lm)$adj.r.squared
  intercept <- round(summary(lm)$coefficients[1, 1], 3)
  slope <- round(summary(lm)$coefficients[2, 1], 3)
  label_1 <- paste('y = ', slope,' * x + ', intercept, '; R^2 = ', r_sq, '; p = ', p_val, sep='')

  #boxcox(lm)

### Plotting graph and residuals 

  graph_3_labs <- tibble(
    x_1 = (min(x) + (max(x)/2 - min(x)/2)),
    y_1 = 100,
    label_2 = paste('y = ', slope,' * x + ', intercept, '\n', 'R^2 = ', r_sq, '; p = ', p_val, sep='')
  )

  p2 <- ggplot(dataset, aes(x = x, y = y)) +
    geom_jitter() +
    stat_smooth(method = "lm") +
    theme_classic() +
    geom_text(data = graph_3_labs, aes(x = x_1, y = y_1, label = label_2), col = "red") +
    labs(x = "Clicks", y = "Grade") + 
    coord_cartesian(xlim = c(0 : max(x)), ylim = 0 : 100)

  res_plot_labs <- tibble(
    x_2 = (min(x) + (max(x)/2 - min(x)/2)),
    y_2 = 50,
    label_3 = paste("Shapiro-Wilk p: ", round(shapiro.test(summary(lm)$residuals)$p.val, 3), sep = "")
  )

  p3 <- ggplot(dataset, aes(x = x, y = summary(lm)$residuals)) +
    geom_jitter() +
    geom_hline(yintercept = 0) +
    theme_classic() +
    geom_text(data = res_plot_labs, aes(x = x_2, y = y_2, label = label_3), col = "red") +
    labs(x = "Clicks", y = "Residuals") + 
    coord_cartesian(xlim = c(0 : max(x)))

  grid.arrange(p2, p3, nrow = 1)
}

All vs. Overall Grade

PLOTS(corr$TOTAL, corr$Percent, corr)

Video views vs. Overall Grade

PLOTS(corr$TOTAL_vid, corr$Percent, corr)

Test/Key views vs. Overall Grade

PLOTS(corr$TOTAL_TK, corr$Percent, corr)

All T1 views vs. T1 grade

PLOTS(corr$T1_tot_resource, corr$T1, corr)

T1 video views vs. T1 grade

PLOTS(corr$T1_vid, corr$T1, corr)

T1 Test/Key views vs. T1 grade

PLOTS(corr$T1_TK, corr$T1, corr)

All T2 views vs. T2 grade

PLOTS(corr$T2_tot_resource, corr$T2, corr)

T2 video views vs. T2 grade

PLOTS(corr$T2_vid, corr$T2, corr)

T2 Test/Key views vs. T2 grade

PLOTS(corr$T2_TK, corr$T2, corr)

All T3 views vs. T3 grade

PLOTS(corr$T3_tot_resource, corr$T3, corr)

T3 video views vs. T3 grade

PLOTS(corr$T3_vid, corr$T3, corr)

T3 Test/Key views vs. T3 grade

PLOTS(corr$T3_TK, corr$T3, corr)

All T4 views vs. T4 grade

PLOTS(corr$T4_tot_resource, corr$T4, corr)

T4 video views vs. T4 grade

PLOTS(corr$T4_vid, corr$T4, corr)

T4 Test/Key views vs. T4 grade

PLOTS(corr$T4_TK, corr$T4, corr)

All Final views vs. T4 grade

PLOTS(corr$final_tot_resource, corr$Final, corr)

Final video views vs. Final Exam grade

PLOTS(corr$final_vid, corr$Final, corr)

Final Test/Key views vs. Final Exam grade

PLOTS(corr$final_TK, corr$Final, corr)

Extra

This is a plot of Total views vs. course grade for only the IQR

In other words, I chopped off the top scorers and the very bottom of the class to look at the relationship. As you can se, p-value shoots up and the trend disappears

av_bear_corr <- corr %>% filter(Percent > (mean(Percent) - sd(Percent)) & 
                                  Percent < (mean(Percent) + sd(Percent)))

  lm_2 <- lm(Percent ~ TOTAL, data = av_bear_corr)
  
  p_val_2 <- round(summary(lm_2)$coefficients[2,4], 3)
  r_sq_2 <- round(summary(lm_2)$r.squared, 3)
  ad_r_sq_2 <- summary(lm_2)$adj.r.squared
  intercept_2 <- round(summary(lm_2)$coefficients[1, 1], 3)
  slope_2 <- round(summary(lm_2)$coefficients[2, 1], 3)
  
    graph_avg_labs <- tibble(
    x_1 = 30,
    y_1 = 100,
    label_2 = paste('y = ', slope_2,' * x + ', intercept_2, '; R^2 = ', r_sq_2, '; p = ', p_val_2, sep='')
  )

ggplot(av_bear_corr, aes(x = TOTAL, y = Percent)) +
    geom_jitter() +
    stat_smooth(method = "lm") +
    theme_classic() +
    geom_text(data = graph_avg_labs, aes(x = x_1, y = y_1, label = label_2), col = "red") +
    labs(x = "Clicks", y = "Grade")

Binned Analysis

p_vals_var <-c()

n <- 0

for (i in min(corr$TOTAL) + 20 : max(corr$TOTAL) - 20){
  corr_bin <- corr %>% 
  mutate(viewer_type = ifelse(
    TOTAL > i,
    "High",
    "Low"
  ))

  var_p <- var.test(filter(corr_bin, viewer_type == "High")$Percent,
                    filter(corr_bin, viewer_type == "Low")$Percent)$p.val
         
  ifelse(
    var_p > .05,
    t_p <- t.test(filter(corr_bin, viewer_type == "High")$Percent,
                  filter(corr_bin, viewer_type == "Low")$Percent,
                  var.equal = TRUE)$p.val,
    t_p <- t.test(filter(corr_bin, viewer_type == "High")$Percent,
                  filter(corr_bin, viewer_type == "Low")$Percent,
                  var.equal = FALSE)$p.val
  )
  
  n <- n + 1
  
  p_vals_var <- c(p_vals_var, t_p)
  
}

newtest <- tibble(cutoff = c(min(corr$TOTAL) + 20 : max(corr$TOTAL) - 20), p = p_vals_var)

x <- ggplot(newtest) +
  geom_point(aes(x = cutoff, y = p, col = p)) +
  geom_hline(yintercept = 0.05) +
  theme_classic()

ggplotly(x)

corr_bin <- corr %>% 
  mutate(viewer_type = ifelse(
    TOTAL > 19,
    "High",
    "Low"
  ))

t_bin <- round(t.test(filter(corr_bin, viewer_type == "High")$Percent,
                  filter(corr_bin, viewer_type == "Low")$Percent,
                  var.equal = TRUE)$p.val, 3)

gt <- paste("p: ", t_bin, sep = "")

bin_labs <- tibble(
  x = 1.5,
  y = 90,
  label = paste("p: ", t_bin, sep = "")
)

ggplot(corr_bin) +
  geom_boxplot(aes(x = viewer_type, y = Percent, fill = viewer_type)) +
  geom_jitter(aes(x = viewer_type, y = Percent, fill = viewer_type)) +
  geom_text(data = bin_labs, aes(x = x, y = y, label = label), col = "red") + 
  theme_classic()

var.test(filter(corr_bin, viewer_type == "High")$Percent,
         filter(corr_bin, viewer_type == "Low")$Percent)$p.val

## [1] 0.42679

SMLC Analysis

Shane Chambers

6/22/2019