NST

## MANCALA
## load packages
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(RColorBrewer)
## read in data
ratingData <- read.csv("./data/GameRating.csv")
detailsData <- read.csv("./data/GameDetails.csv")
## 1: How does time of day affect game outcome?
## specify which game IDs correspond to ties
tiesGameID <- c()
for(i in 1:(nrow(detailsData)-1)) {
    if(as.numeric(detailsData$Step.No.[i+1])==0) {
        if (as.numeric(detailsData$No.6[i]) == as.numeric(detailsData$No.13[i])) {
           tiesGameID <- c(tiesGameID, detailsData$GameID[i]) 
        }
    }
}
ratingData$Win[ratingData$Game.ID %in% tiesGameID] <- 2
## divide play time into factors
ratingData$Play_Time <- strptime(ratingData$Play_Time, 
                                 format = c("%Y-%m-%d %I:%M:%S%p"))
hours <- hour(ymd_hms(ratingData$Play_Time))
minutes <- minute(ymd_hms(ratingData$Play_Time))
ratingData$dayTime <- paste(hours, minutes, sep=".")
levels <- cut(as.numeric(ratingData$dayTime), breaks = c(0, 4, 8, 12, 16, 20, 24))
## plot losses, wins, and ties per time level
timeGroups <- split(ratingData$Win, levels)
## RAW barplots
par(mfrow=c(2, 3))
names <- c("Wins", "Losses")
namesT <- c("Wins", "Losses", "Ties")
cols <- brewer.pal(n = 3, name = "RdBu")
barplot(table(timeGroups$`(0,4]`), names.arg = names, 
        main="Game Outcomes 00:00-04:00", col=cols, density=80)
barplot(table(timeGroups$`(4,8]`), names.arg = names, 
        main="Game Outcomes 04:00-08:00", col=cols, density=80)
barplot(table(timeGroups$`(8,12]`), names.arg = namesT, 
        main="Game Outcomes 08:00-12:00", col=cols, density=80)
barplot(table(timeGroups$`(12,16]`), names.arg = namesT, 
        main="Game Outcomes 12:00-16:00", col=cols, density=80)
barplot(table(timeGroups$`(16,20]`), names.arg = namesT, 
        main="Game Outcomes 16:00-20:00", col=cols, density=80)
barplot(table(timeGroups$`(20,24]`), names.arg = namesT, 
        main="Game Outcomes 20:00-24:00", col=cols, density=80, ylim=c(0, 20))

## PERCENTAGE pie charts
par(mfrow=c(2, 3))
pie(table(timeGroups$`(0,4]`)/sum(table(timeGroups$`(0,4]`)), labels = names, 
    main="Game Outcomes 00:00-04:00", col=cols, density=80)
pie(table(timeGroups$`(4,8]`)/sum(table(timeGroups$`(4,8]`)), labels = names, 
    main="Game Outcomes 04:00-08:00", col=cols, density=80)
pie(table(timeGroups$`(8,12]`)/sum(table(timeGroups$`(8,12]`)), labels = namesT, 
    main="Game Outcomes 08:00-12:00", col=cols, density=80)
pie(table(timeGroups$`(12,16]`)/sum(table(timeGroups$`(12,16]`)), labels = namesT, 
    main="Game Outcomes 12:00-16:00", col=cols, density=80)
pie(table(timeGroups$`(16,20]`)/sum(table(timeGroups$`(16,20]`)), labels = namesT, 
    main="Game Outcomes 16:00-20:00", col=cols, density=80)
pie(table(timeGroups$`(20,24]`)/sum(table(timeGroups$`(20,24]`)), labels = namesT, 
    main="Game Outcomes 20:00-24:00", col=cols, density=80)

## 2: Average games played per player
freqs <- as.vector(table(ratingData$SID))
cols2 <- brewer.pal(n = 4, name = "RdBu")
barplot(quantile(freqs)[2:5], col = cols2, density = 80, ylim=c(0,25), 
        main="Rounds of Mancala Played per Student", ylab = "Rounds", 
        xlab = "Percentage of Students")
## ratio of wins vs losses for students who played =< 2 rounds
par(mfrow=c(1,2))

pie(table(ratingData$Win[ratingData$SID %in% which(freqs<=2)]), labels=names, 
    main="Game outcomes for students who \n played two rounds or less", col=cols, 
    density = 80)
## ratio of wins vs losses for students who played > 2 rounds
pie(table(ratingData$Win[ratingData$SID %in% which(freqs>2)]), labels=namesT, 
    main="Game outcomes for students who \n played more than two rounds", col=cols, 
    density = 80)

## 3: Does game duration influence engagement?
## correlate average steps per game and games played
meanSteps <- as.vector(sapply(split(ratingData$Steps, ratingData$SID), mean))
gamesPlayed <- as.vector(sapply(split(ratingData$Steps, ratingData$SID), length))
cols3 <- brewer.pal(n = length(meanSteps), name = "RdBu")

## Warning in brewer.pal(n = length(meanSteps), name = "RdBu"): n too large, allowed maximum for palette RdBu is 11
## Returning the palette you asked for with that many colors

par(mfrow=c(1,1), mar=c(6, 6, 4, 4))
plot(gamesPlayed, meanSteps, main="Relationship between game duration and engagement", 
     xlab = "Rounds played", ylab = "Average steps per student", pch=19, col=cols3)
## draw regression line only taking into account students playing >2 rounds
df03 <- data.frame(gamesPlayed, meanSteps)
df03 <- df03[df03$gamesPlayed > 2,]
abline(lm(df03$meanSteps ~ df03$gamesPlayed))

## 4: Does game outcome influence subsequent engagement?
## probability of rematch after win vs after loss
df <- arrange(ratingData, Play_Time)
wins_rm <- 0
losses_rm <- 0
tie_rm <- 0
for(i in 1:(nrow(df)-1)){
    if(df$Win[i] == 0 & df$SID[i] == df$SID[i+1]) wins_rm = wins_rm+1
    if(df$Win[i] == 1 & df$SID[i] == df$SID[i+1]) losses_rm = losses_rm+1
    if(df$Win[i] == 2 & df$SID[i] == df$SID[i+1]) tie_rm = tie_rm+1
    
}
par(mfrow=c(1,2), mar=c(6, 6, 3, 3))
barplot(c(wins_rm, losses_rm, tie_rm), col = cols, density = 80, ylim=c(0, 40), 
        main="Total rematches per \n game outcome type", ylab="Rematches", 
        names.arg = c("Wins", "Losses", "Ties"), xlab = "Game outcome") ## raw
## percentage wise
wins_per <- wins_rm/sum(df$Win == 0)
losses_per <- losses_rm/sum(df$Win == 1)
tie_per <- tie_rm/sum(df$Win==2)
barplot(c(wins_per, losses_per, tie_per), col = cols, density = 80, ylim = c(0, .40),
        main="Probability of rematch \n per game outcome type", xlab = "Game outcome",
        ylab = "Percent of rematches", names.arg = c("Wins", "Losses", "Ties"))

## 5: Which encourages further engagement: an easy, normal, or hard win? 
easy_rm <- 0
normal_rm <- 0
hard_rm <- 0
for (i in 1:c(nrow(df)-1)) {
    if(df$Levels[i] == "Easy" & df$SID[i] == df$SID[i+1]) easy_rm = easy_rm+1
    if(df$Levels[i] == "Normal" & df$SID[i] == df$SID[i+1]) normal_rm = normal_rm+1
    if(df$Levels[i] == "Hard" & df$SID[i] == df$SID[i+1]) hard_rm = hard_rm+1
}
par(mfrow=c(1,2), mar=c(6, 6, 3, 3))
barplot(c(easy_rm, normal_rm, hard_rm), col=cols, density=80, ylim = c(0, 30), 
        main="Total rematches \n per game level", ylab = "Rematches",
        names.arg=c("Easy", "Normal", "Hard"), xlab = "Game level")
## percentage wise
easy_per <- easy_rm/sum(df$Levels=="Easy")
normal_per <- normal_rm/sum(df$Levels=="Normal")
hard_per <- hard_rm/sum(df$Levels=="Hard")
barplot(c(easy_per, normal_per, hard_per), col=cols, density=80, ylim=c(0,.5), 
        main="Probability of rematch \n per game level", ylab="Percent of rematches",
        names.arg=c("Easy", "Normal", "Hard"), xlab = "Game level")

## MCQs
## read in data
## 1: Accuracy in timed vs untimed MCQs across course levels    
## relationship between time limit and response accuracy
mcqData <- read.csv("./data/mcq.csv")
df2 <- as.data.frame(table(mcqData$correct, mcqData$questionType, mcqData$channel))
cs33 <- c()
cs64 <- c()
ge23 <- c()
for (i in seq(1,(nrow(df2)-1), by=2)){
    accuracy <- df2$Freq[i+1]/(df2$Freq[i+1]+df2$Freq[i])
    if (df2$Var3[i] == "cs3334") cs33 <- c(cs33, accuracy)
    if (df2$Var3[i] == "cs6491") cs64 <- c(cs64, accuracy)
    if (df2$Var3[i] == "ge2340") ge23 <- c(ge23, accuracy)
}
par(mfrow=c(1,3), mar=c(6, 6, 3, 3))
cols6 <- brewer.pal(n = 6, name = "RdBu")
xnames <- c("Untimed", "2", "3", "5", "6", "7")
barplot(cs33, col = cols6, density=80, ylim=c(0,1), 
        main="CS3334",
        ylab="Accuracy", names.arg = xnames, xlab="Time Limit (minutes)")
barplot(cs64, col = cols6, density=80, ylim=c(0,1), 
        main="CS6491",
        ylab="Accuracy", names.arg = xnames, xlab="Time Limit (minutes)")
barplot(ge23, col = cols6, density=80, ylim=c(0,1), 
        main="GE2340",
        ylab="Accuracy", names.arg = xnames, xlab="Time Limit (minutes)")

## 2: How does time of day affect response accuracy?
## divide play time into factors
mcqData$answerAt <- strptime(mcqData$answerAt, 
                                 format = c("%Y-%m-%d %H:%M:%S"))
hoursMCQ <- hour(ymd_hms(mcqData$answerAt))
minutesMCQ <- minute(ymd_hms(mcqData$answerAt))
mcqData$dayTime <- paste(hoursMCQ, minutesMCQ, sep=".")
levelsMCQ <- cut(as.numeric(mcqData$dayTime), breaks = c(0, 4, 8, 12, 16, 20, 24))
df3 <- as.data.frame(table(mcqData$correct, levelsMCQ))
perc <- c()
for(i in seq(1,nrow(df3), by=2)){
    accuracy <- df3$Freq[i+1]/(df3$Freq[i+1]+df3$Freq[i])
    perc <- c(perc, accuracy)
}
par(mfrow=c(1,1), mar=c(6, 6, 3, 3))
barplot(perc, col=cols6, density=80, ylim=c(0,.8),
        main="Time of day effect \n on MCQ response accuracy", 
        ylab="Accuracy", names.arg=unique(df3$levelsMCQ), xlab="Time of day (hours)")

## 3: Do students improve over the length of the course in MCQ accuracy?
## for each of the 6 top responders, find accuracy per quarter

top6 <- as.data.frame(tail(sort(table(mcqData$sID)), 6))
df4 <- mcqData[mcqData$sID %in% top6$Var1,]
resps <- split(df4$correct, df4$sID)
par(mfrow=c(2,3))
ids <- sort(as.numeric(as.vector(top6$Var1)))
for(i in 1:length(resps)){
    reps <- rep(1:4, each=length(resps[[i]])/4)
    while (length(reps) != length(resps[[i]])) reps <- c(reps, 4)
    barplot(tapply(resps[[i]], reps, mean), col = cols2, density=80, ylim = c(0, .8), 
            main=c("Student ID: ", ids[i]), ylab="Accuracy", 
            names.arg = c("1st", "2nd", "3rd", "4th"), xlab="Quater")
}

## performance across all student body
repsAll <- rep(1:4, each=nrow(mcqData)/4)
while (length(repsAll) != nrow(mcqData)) repsAll <- c(repsAll, 4)
barplot(tapply(mcqData$correct, repsAll, mean), col = cols2, density=80, ylim = c(0, .8), 
        main="Performance on MCQs across all courses", ylab="Accuracy", 
        names.arg = c("1st", "2nd", "3rd", "4th"), xlab="Quater")
## performance per course 
par(mfrow=c(1,3))

courses <- split(mcqData$correct, mcqData$channel)
courseNames <- sort(unique(mcqData$channel))
for (i in 1:length(courses)){
    repsC <- rep(1:4, each=length(courses[[i]])/4)
    while (length(repsC) != length(courses[[i]])) repsC <- c(repsC, 4)
    barplot(tapply(courses[[i]], repsC, mean), col = cols2, density=80, ylim = c(0, 1), 
            main=c(toupper(courseNames[i])), ylab="Accuracy", 
            names.arg = c("1st", "2nd", "3rd", "4th"), xlab="Quater")
    
}

## Polls
## 1: Does accuracy on one Fermi question correlate with accuracy on other Fermi questions?
## read data
pollData <- read.csv("./data/NemoBotPollDataset.csv")
fermiqs <- unique(pollData$voteText)[(10:15)[-3]] ## find fermi qs
pollData <- pollData[pollData$voteText %in% fermiqs,]
## subset students who answered all fermi qs
students <- split(pollData$sID, pollData$voteText)
inter <- Reduce(intersect, list(students[[1]], students[[2]], 
                                students[[3]], students[[4]], students[[5]]))
pollData <- pollData[pollData$sID %in% inter,]
## subset students who got each q correct
corr <- c("90", "120 million", "300900 miles", "1 x 10^80", "255,168")
accur <- list()
for (i in 1:length(fermiqs)){
    accur[[i]] <- pollData$sID[pollData$voteText==fermiqs[i] & pollData$answer==corr[i]]
}
## find what percentage got each of the other fermi qs correct
percs <- list(0, 0, 0, 0, 0)
for(i in 1:length(fermiqs)){
    seq <- (1:5)[-i]
    for(j in seq){
        percs[[i]] <- append(percs[[i]], (length(intersect(accur[[i]], accur[[j]]))/length(accur[[i]])))
    }
    percs[[i]] <- percs[[i]][2:5]
}
## make plots for each fermi q
par(mfrow=c(2, 3))
for(i in 1:length(fermiqs)){
    barplot(percs[[i]], col=cols2, density=80, ylim=c(0, .9), 
            main=c("Fremi Question", i), ylab="% of students also correctly answering specified Fermi Question", names.arg = (1:5)[-i])
}

NST

Fabiola

9/15/2020