Preprocessing

# Load the class lists and the pre-class quiz exports for both years.
# In the quiz files, blank cells, "NA" and "In Progress" are read as missing.
classlist2013 <- read.csv("classlists_2013W_UBC.csv")
classlist2014 <- read.csv("classlists_2014W_UBC.csv")
quiz2013 <- read.csv(
  "pre_quiz_2013.csv",
  na.strings = c(" ", "", "NA", "In Progress")
)
# Clean every 2013 quiz column: blank out "Needs Grading", strip parentheses,
# then coerce to numeric. Values that are still non-numeric become NA, which
# is the source of the coercion warning recorded below.
quiz2013 <- as.data.frame(lapply(quiz2013, function(x) {
  ungraded_removed <- gsub("Needs Grading", "", x)
  as.numeric(gsub("[()]", "", ungraded_removed))
}))
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
# NOTE(review): the 2014 export is used as read, without the cleaning step
# applied to 2013 -- presumably it has no "Needs Grading" or parenthesised
# cells; confirm against the file.
quiz2014 <- read.csv(
  "pre_quiz_2014.csv",
  na.strings = c(" ", "", "NA", "In Progress")
)

# Join each year's quiz scores to its class list, matching the quiz
# `Username` to the class-list `Student.Number`. With merge()'s defaults only
# rows present in both tables are kept.
# Class-list columns 3, 11 and 15 are picked by position -- presumably
# Student.Number, Percent.Grade and Gender, the fields used later; confirm
# against the CSV header.
merged_2013 <- merge(
  x = quiz2013,
  y = classlist2013[, c(3, 11, 15)],
  by.x = "Username",
  by.y = "Student.Number"
)
merged_2014 <- merge(
  x = quiz2014,
  y = classlist2014[, c(3, 11, 15)],
  by.x = "Username",
  by.y = "Student.Number"
)

# Overall quiz percentage per student: sum columns 2 to 30 of the merged
# table and scale by a fixed denominator. Missing scores are dropped from the
# sum (na.rm = TRUE), so they contribute zero.
# NOTE(review): 145 and 146 are presumably the total available quiz points in
# each year, and columns 2:30 the individual quiz scores -- confirm.
merged_2013[["Percent.Quiz"]] <-
  rowSums(merged_2013[, 2:30], na.rm = TRUE) / 145 * 100
merged_2014[["Percent.Quiz"]] <-
  rowSums(merged_2014[, 2:30], na.rm = TRUE) / 146 * 100

Pre-class quiz distribution vs. percent grade distribution

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.3
library(gridExtra)
# 2013: density-scaled histogram with a kernel density overlay, drawn for the
# quiz percentage and for the course percent grade, stacked in one figure.
# NOTE: `quiz2013` is rebound here from the quiz data frame to a plot object.
quiz2013 <- ggplot(merged_2013, aes(Percent.Quiz)) +
  geom_histogram(aes(y = ..density..), colour = "black", fill = "white") +
  geom_density(fill = "light blue", alpha = 0.2) +
  labs(title = "2013 Pre-class quiz distribution")
# The title previously began with a stray space, unlike the other plot titles.
percentgrade2013 <- ggplot(merged_2013, aes(Percent.Grade)) +
  geom_histogram(aes(y = ..density..), colour = "black", fill = "white") +
  geom_density(fill = "light blue", alpha = 0.2) +
  labs(title = "2013 Percent grade distribution")
grid.arrange(quiz2013, percentgrade2013)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# 2014: density-scaled histogram with a kernel density overlay, drawn for the
# quiz percentage and for the course percent grade, stacked in one figure.
# NOTE: `quiz2014` is rebound here from the quiz data frame to a plot object.
quiz2014 <- ggplot(merged_2014, aes(Percent.Quiz)) +
  geom_histogram(aes(y = ..density..), colour = "black", fill = "white") +
  geom_density(fill = "light blue", alpha = 0.2) +
  labs(title = "2014 Pre-class quiz distribution")
percentgrade2014 <- ggplot(merged_2014, aes(Percent.Grade)) +
  geom_histogram(aes(y = ..density..), colour = "black", fill = "white") +
  geom_density(fill = "light blue", alpha = 0.2) +
  labs(title = "2014 Percent grade distribution")
grid.arrange(quiz2014, percentgrade2014)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Relationship between pre-class quiz performance and percent grade

# 2013 scatter of course grade against quiz percentage, coloured by gender,
# with a least-squares line fitted for each gender group.
ggplot(merged_2013, aes(x = Percent.Quiz, y = Percent.Grade, color = Gender)) +
  geom_point() +
  stat_smooth(method = "lm") +
  labs(
    title = "Relationship between pre-class quiz and percent grade in 2013",
    x = "Percent quiz",
    y = "Percent grade"
  )

# Correlation between quiz percentage and course grade (cor() default method).
cor(x = merged_2013$Percent.Quiz, y = merged_2013$Percent.Grade)
## [1] 0.6993431
# Additive linear model: course grade explained by quiz percentage and gender.
linearRegression2013 <- lm(
  Percent.Grade ~ Percent.Quiz + Gender,
  data = merged_2013
)
summary(linearRegression2013)
## 
## Call:
## lm(formula = Percent.Grade ~ Percent.Quiz + Gender, data = merged_2013)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -80.501  -6.620   1.165   8.091  56.052 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  25.94751    1.24826  20.787  < 2e-16 ***
## Percent.Quiz  0.57844    0.01752  33.010  < 2e-16 ***
## GenderMale    2.84899    0.70031   4.068 5.06e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.5 on 1145 degrees of freedom
## Multiple R-squared:  0.4964, Adjusted R-squared:  0.4955 
## F-statistic: 564.2 on 2 and 1145 DF,  p-value: < 2.2e-16
# 2014 scatter of course grade against quiz percentage, coloured by gender,
# with a least-squares line fitted for each gender group.
ggplot(merged_2014, aes(x = Percent.Quiz, y = Percent.Grade, color = Gender)) +
  geom_point() +
  stat_smooth(method = "lm") +
  labs(
    title = "Relationship between pre-class quiz and percent grade in 2014",
    x = "Percent quiz",
    y = "Percent grade"
  )

# Correlation between quiz percentage and course grade (cor() default method).
cor(x = merged_2014$Percent.Quiz, y = merged_2014$Percent.Grade)
## [1] 0.6494114
# Additive linear model: course grade explained by quiz percentage and gender.
linearRegression2014 <- lm(
  Percent.Grade ~ Percent.Quiz + Gender,
  data = merged_2014
)
summary(linearRegression2014)
## 
## Call:
## lm(formula = Percent.Grade ~ Percent.Quiz + Gender, data = merged_2014)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.342  -7.795   1.310   8.964  37.865 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  29.66026    1.38768  21.374  < 2e-16 ***
## Percent.Quiz  0.62028    0.02134  29.067  < 2e-16 ***
## GenderMale    2.16398    0.81070   2.669  0.00771 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.73 on 1143 degrees of freedom
## Multiple R-squared:  0.4253, Adjusted R-squared:  0.4243 
## F-statistic:   423 on 2 and 1143 DF,  p-value: < 2.2e-16

Comparison of percent grade distribution in 2013 and 2014

# Tag each year's table, then stack the grade, quiz and year columns into one
# long data frame so the two cohorts can be overlaid on a single plot.
merged_2013[["year"]] <- 2013
merged_2014[["year"]] <- 2014
comparison <- data.frame(
  rbind(
    merged_2013[, c("Percent.Grade", "Percent.Quiz", "year")],
    merged_2014[, c("Percent.Grade", "Percent.Quiz", "year")]
  )
)
# Convert year to a factor so it is treated as a discrete grouping variable.
comparison[["year"]] <- factor(comparison[["year"]])
# One density curve of percent grade per year.
ggplot(comparison, aes(x = Percent.Grade)) +
  geom_density(aes(group = year, color = year))

# Percent grade by gender, one boxplot panel per year, shown side by side.
boxplotgrade2013 <- ggplot(merged_2013, aes(x = Gender, y = Percent.Grade)) +
  geom_boxplot() +
  ggtitle("Distribution of\n percent grade in 2013")
boxplotgrade2014 <- ggplot(merged_2014, aes(x = Gender, y = Percent.Grade)) +
  geom_boxplot() +
  ggtitle("Distribution of\n percent grade in 2014")
grid.arrange(boxplotgrade2013, boxplotgrade2014, ncol = 2)

t-test

# Two-sample t-test of mean percent grade, 2013 versus 2014. With t.test()'s
# defaults this is the two-sided Welch test reported in the output below.
t.test(
  x = merged_2013$Percent.Grade,
  y = merged_2014$Percent.Grade
)
## 
##  Welch Two Sample t-test
## 
## data:  merged_2013$Percent.Grade and merged_2014$Percent.Grade
## t = -1.7515, df = 2288.8, p-value = 0.08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.5553106  0.1442338
## sample estimates:
## mean of x mean of y 
##  66.27352  67.47906

Comparison of pre-class quiz performance in 2013 and 2014

# One density curve of quiz percentage per year, from the stacked table.
ggplot(comparison, aes(x = Percent.Quiz)) +
  geom_density(aes(group = year, color = year))

# Quiz percentage by gender, one boxplot panel per year, shown side by side.
boxplot2013 <- ggplot(merged_2013, aes(x = Gender, y = Percent.Quiz)) +
  geom_boxplot() +
  ggtitle("Distribution of\n percent quiz in 2013")
boxplot2014 <- ggplot(merged_2014, aes(x = Gender, y = Percent.Quiz)) +
  geom_boxplot() +
  ggtitle("Distribution of\n percent quiz in 2014")
grid.arrange(boxplot2013, boxplot2014, ncol = 2)

t-test

# Two-sample t-test of mean quiz percentage, 2013 versus 2014. With t.test()'s
# defaults this is the two-sided Welch test reported in the output below.
t.test(
  x = merged_2013$Percent.Quiz,
  y = merged_2014$Percent.Quiz
)
## 
##  Welch Two Sample t-test
## 
## data:  merged_2013$Percent.Quiz and merged_2014$Percent.Quiz
## t = 10.29, df = 2275, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  6.472290 9.519961
## sample estimates:
## mean of x mean of y 
##  67.84007  59.84395