Preprocessing
# Load the class lists and the pre-class quiz exports for both years.
classlist2013 <- read.csv("classlists_2013W_UBC.csv")
classlist2014 <- read.csv("classlists_2014W_UBC.csv")

# Cells matching any of these strings are read in as NA.
missing_markers <- c(" ", "", "NA", "In Progress")
quiz2013 <- read.csv("pre_quiz_2013.csv", na.strings = missing_markers)
quiz2014 <- read.csv("pre_quiz_2014.csv", na.strings = missing_markers)

# Turn one column of the 2013 export into numbers: drop the "Needs Grading"
# label, strip parentheses, then coerce. Anything that still is not numeric
# becomes NA, which is what raises the coercion warning recorded below.
to_score <- function(column) {
  without_label <- gsub("Needs Grading", "", column)
  as.numeric(gsub("[()]", "", without_label))
}
quiz2013 <- as.data.frame(lapply(quiz2013, to_score))
## Warning in FUN(X[[i]], ...): NAs introduced by coercion

# Attach class-list information to each quiz record. The quiz "Username" is
# matched against the class list "Student.Number"; merge() keeps only students
# present in both files.
# NOTE(review): positions 3, 11 and 15 must include Student.Number and
# presumably Gender and Percent.Grade, which the analysis below relies on;
# confirm against the class-list headers.
roster_columns <- c(3, 11, 15)
merged_2013 <- merge(
  x = quiz2013,
  y = classlist2013[, roster_columns],
  by.x = "Username",
  by.y = "Student.Number"
)
merged_2014 <- merge(
  x = quiz2014,
  y = classlist2014[, roster_columns],
  by.x = "Username",
  by.y = "Student.Number"
)

# Quiz percentage: sum columns 2 to 30 of the merged data, with missing
# entries contributing nothing, divided by a per-year total.
# NOTE(review): 145 and 146 are presumably the maximum attainable quiz points
# in 2013 and 2014; confirm.
quiz_columns <- 2:30
max_points_2013 <- 145
max_points_2014 <- 146
merged_2013$Percent.Quiz <-
  rowSums(merged_2013[, quiz_columns], na.rm = TRUE) / max_points_2013 * 100
merged_2014$Percent.Quiz <-
  rowSums(merged_2014[, quiz_columns], na.rm = TRUE) / max_points_2014 * 100
Pre-class quiz distribution vs. percent grade distribution
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.3
library(gridExtra)
# Add a density-scaled histogram, a kernel density overlay and a title to a
# plot.
#
# base_plot: a ggplot object that already maps the variable of interest to x.
# title:     text used as the plot title.
# Returns the ggplot object with the three layers/labels added.
density_histogram <- function(base_plot, title) {
  base_plot +
    # bins = 30 is the value stat_bin() otherwise falls back to; stating it
    # leaves the plot unchanged and silences the "Pick better value" message.
    geom_histogram(
      aes(y = ..density..),
      bins = 30,
      colour = "black",
      fill = "white"
    ) +
    geom_density(fill = "light blue", alpha = 0.2) +
    labs(title = title)
}

# 2013: pre-class quiz percentage stacked above final percent grade.
# The quiz plots get their own names (quizhist*) so that the quiz2013 and
# quiz2014 data frames built during preprocessing are not overwritten.
quizhist2013 <- density_histogram(
  ggplot(merged_2013, aes(Percent.Quiz)),
  "2013 Pre-class quiz distribution"
)
percentgrade2013 <- density_histogram(
  ggplot(merged_2013, aes(Percent.Grade)),
  " 2013 Percent grade distribution"
)
grid.arrange(quizhist2013, percentgrade2013)

# 2014: same pair of plots.
quizhist2014 <- density_histogram(
  ggplot(merged_2014, aes(Percent.Quiz)),
  "2014 Pre-class quiz distribution"
)
percentgrade2014 <- density_histogram(
  ggplot(merged_2014, aes(Percent.Grade)),
  "2014 Percent grade distribution"
)
grid.arrange(quizhist2014, percentgrade2014)


Relationship between pre-class quiz performance and percent grade
# 2013: percent grade against pre-class quiz percentage, coloured by gender,
# with a straight-line fit drawn for each colour group.
ggplot(
  merged_2013,
  aes(x = Percent.Quiz, y = Percent.Grade, colour = Gender)
) +
  geom_point() +
  stat_smooth(method = "lm") +
  ggtitle("Relationship between pre-class quiz and percent grade in 2013") +
  xlab("Percent quiz") +
  ylab("Percent grade")

# Correlation between quiz percentage and percent grade.
with(merged_2013, cor(Percent.Quiz, Percent.Grade))
## [1] 0.6993431
# Linear model of percent grade on quiz percentage plus a gender term.
linearRegression2013 <- lm(
  Percent.Grade ~ Percent.Quiz + Gender,
  data = merged_2013
)
summary(linearRegression2013)
##
## Call:
## lm(formula = Percent.Grade ~ Percent.Quiz + Gender, data = merged_2013)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80.501 -6.620 1.165 8.091 56.052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.94751 1.24826 20.787 < 2e-16 ***
## Percent.Quiz 0.57844 0.01752 33.010 < 2e-16 ***
## GenderMale 2.84899 0.70031 4.068 5.06e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.5 on 1145 degrees of freedom
## Multiple R-squared: 0.4964, Adjusted R-squared: 0.4955
## F-statistic: 564.2 on 2 and 1145 DF, p-value: < 2.2e-16
# 2014: percent grade against pre-class quiz percentage, coloured by gender,
# with a straight-line fit drawn for each colour group.
ggplot(
  merged_2014,
  aes(x = Percent.Quiz, y = Percent.Grade, colour = Gender)
) +
  geom_point() +
  stat_smooth(method = "lm") +
  ggtitle("Relationship between pre-class quiz and percent grade in 2014") +
  xlab("Percent quiz") +
  ylab("Percent grade")

# Correlation between quiz percentage and percent grade.
with(merged_2014, cor(Percent.Quiz, Percent.Grade))
## [1] 0.6494114
# Linear model of percent grade on quiz percentage plus a gender term.
linearRegression2014 <- lm(
  Percent.Grade ~ Percent.Quiz + Gender,
  data = merged_2014
)
summary(linearRegression2014)
##
## Call:
## lm(formula = Percent.Grade ~ Percent.Quiz + Gender, data = merged_2014)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.342 -7.795 1.310 8.964 37.865
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.66026 1.38768 21.374 < 2e-16 ***
## Percent.Quiz 0.62028 0.02134 29.067 < 2e-16 ***
## GenderMale 2.16398 0.81070 2.669 0.00771 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.73 on 1143 degrees of freedom
## Multiple R-squared: 0.4253, Adjusted R-squared: 0.4243
## F-statistic: 423 on 2 and 1143 DF, p-value: < 2.2e-16
Comparison of percent grade distribution in 2013 and 2014
# Tag every row with its year so the two data sets can be stacked.
merged_2013[["year"]] <- 2013
merged_2014[["year"]] <- 2014

# Stack the columns the two years have in common into one long data frame.
shared_columns <- c("Percent.Grade", "Percent.Quiz", "year")
comparison <- rbind(
  merged_2013[, shared_columns],
  merged_2014[, shared_columns]
)
# Year is stored as a factor so it acts as a discrete grouping variable.
comparison[["year"]] <- factor(comparison[["year"]])

# Overlaid density curves of percent grade, one curve per year.
ggplot(comparison, aes(x = Percent.Grade)) +
  geom_density(aes(group = year, colour = year))

# Box plots of percent grade by gender, one panel per year, side by side.
boxplotgrade2013 <- ggplot(merged_2013, aes(x = Gender, y = Percent.Grade)) +
  geom_boxplot() +
  labs(title = "Distribution of\n percent grade in 2013")
boxplotgrade2014 <- ggplot(merged_2014, aes(x = Gender, y = Percent.Grade)) +
  geom_boxplot() +
  labs(title = "Distribution of\n percent grade in 2014")
grid.arrange(boxplotgrade2013, boxplotgrade2014, ncol = 2)

t-test
# Compare mean percent grade between the 2013 and 2014 data sets.
# Called with two samples and default arguments, t.test() runs a two-sided
# Welch test (no equal-variance assumption), as the output header confirms.
# The argument expressions are kept verbatim because t.test() echoes them in
# the "data:" line of its printed result.
t.test(merged_2013$Percent.Grade, merged_2014$Percent.Grade)
##
## Welch Two Sample t-test
##
## data: merged_2013$Percent.Grade and merged_2014$Percent.Grade
## t = -1.7515, df = 2288.8, p-value = 0.08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.5553106 0.1442338
## sample estimates:
## mean of x mean of y
## 66.27352 67.47906