Santosh Anand — Jul 5, 2013, 4:09 AM
library(ggplot2)
library(reshape2)
# Maximum attainable score for each test, listed in column order
# (Quiz 1-5, Midterm, Quiz 6-7).
max.scores <- c(10, 9, 10, 10, 11, 20, 15, 10)
# Total number of tests, derived from the table of maximum scores.
numTests <- length(max.scores)
## setwd('Desktop/workNew/courses/Useful Genetics/fun stuff/')
# The raw export contains "CCTP" markers and stray double-quote (") characters.
# Remove them and save the cleaned copy as the *_formatted file read below.
# header = TRUE (not T): T is an ordinary variable that can be reassigned,
# whereas TRUE is a reserved constant.
grades <- read.csv("AnonGradebookJuly3_formatted.txt", header = TRUE)
# Show the first rows to check that the file was parsed as expected.
head(grades)
Module.1.Graded.Quiz Module.2.Graded.Quiz Module.3.Graded.Quiz
1 9.875 9.000 9.500
2 6.125 0.000 9.333
3 9.875 8.500 8.500
4 8.500 6.266 9.500
5 9.042 7.332 9.833
6 7.625 6.100 9.500
Module.4.Graded.Quiz Module.5.Graded.Quiz Midterm Module.6.Graded.Quiz
1 8.700 11.000 17.67 15
2 8.850 10.780 17.50 15
3 7.933 10.500 16.67 15
4 7.613 9.888 16.67 15
5 6.823 9.335 17.50 15
6 8.557 0.000 16.67 15
Module.7.Graded.Quiz
1 10
2 10
3 10
4 10
5 10
6 10
# Total number of rows in the original data
nrow(grades)
[1] 2730
# Replace the long exported headers with short column names.
my.names <- c(paste0("Quiz.", 1:5), "Midterm", paste0("Quiz.", 6:7))
colnames(grades) <- my.names
# Recode every zero score as NA, so that zeros are counted as missing
# in the summaries and filters that follow.
grades[grades == 0] <- NA
summary(grades)
Quiz.1 Quiz.2 Quiz.3 Quiz.4
Min. : 0.2 Min. : 0.3 Min. : 0.5 Min. : 0.1
1st Qu.: 4.6 1st Qu.: 4.4 1st Qu.: 5.1 1st Qu.: 5.9
Median : 6.3 Median : 5.7 Median : 6.5 Median : 7.1
Mean : 6.1 Mean : 5.6 Mean : 6.5 Mean : 6.7
3rd Qu.: 7.7 3rd Qu.: 6.7 3rd Qu.: 7.9 3rd Qu.: 7.9
Max. :14.9 Max. :11.7 Max. :16.5 Max. :10.0
NA's :579 NA's :1348 NA's :1586 NA's :1704
Quiz.5 Midterm Quiz.6 Quiz.7
Min. : 0.2 Min. : 0.7 Min. : 0.7 Min. : 0.0
1st Qu.: 6.5 1st Qu.:11.8 1st Qu.: 9.7 1st Qu.: 4.9
Median : 8.2 Median :14.7 Median :11.8 Median : 6.9
Mean : 7.8 Mean :13.9 Mean :11.1 Mean : 6.6
3rd Qu.: 9.5 3rd Qu.:16.7 3rd Qu.:13.6 3rd Qu.: 8.4
Max. :11.0 Max. :20.0 Max. :15.0 Max. :10.0
NA's :1822 NA's :1808 NA's :1945 NA's :2015
# Count the non-missing scores in each row, then keep only the rows with
# more than 4 of them (i.e. drop rows with 4 or fewer recorded scores).
numTestsTaken <- rowSums(!is.na(grades))
my.grades <- grades[numTestsTaken > 4, ]
# Number of people who have taken more than 4 tests
nrow(my.grades)
[1] 876
# Per-column summary statistics (including NA counts) for the filtered data
summary(my.grades)
Quiz.1 Quiz.2 Quiz.3 Quiz.4
Min. : 0.54 Min. :0.33 Min. : 0.92 Min. : 0.09
1st Qu.: 5.66 1st Qu.:5.04 1st Qu.: 5.50 1st Qu.: 6.18
Median : 7.06 Median :6.17 Median : 6.92 Median : 7.24
Mean : 6.86 Mean :6.02 Mean : 6.80 Mean : 6.93
3rd Qu.: 8.17 3rd Qu.:7.17 3rd Qu.: 8.33 3rd Qu.: 8.03
Max. :10.00 Max. :9.00 Max. :10.00 Max. :10.00
NA's :78 NA's :63 NA's :41 NA's :29
Quiz.5 Midterm Quiz.6 Quiz.7
Min. : 0.87 Min. : 0.67 Min. : 0.67 Min. : 0.04
1st Qu.: 6.78 1st Qu.:12.25 1st Qu.: 9.89 1st Qu.: 5.11
Median : 8.34 Median :14.88 Median :11.92 Median : 6.98
Mean : 8.02 Mean :14.21 Mean :11.24 Mean : 6.66
3rd Qu.: 9.61 3rd Qu.:16.67 3rd Qu.:13.70 3rd Qu.: 8.40
Max. :11.00 Max. :20.00 Max. :15.00 Max. :10.00
NA's :49 NA's :50 NA's :144 NA's :202
# Add an "id" column derived from the row names, which still carry the row
# numbers of the unfiltered data. This key is needed so that the data can be
# "melted" correctly later.
# NOTE(review): the +1 shifts each id by one relative to its row number
# (row 1 -> id 2); presumably this matches line numbers in the source file
# including its header line -- confirm.
my.grades[["id"]] <- as.numeric(rownames(my.grades)) + 1
head(my.grades)
Quiz.1 Quiz.2 Quiz.3 Quiz.4 Quiz.5 Midterm Quiz.6 Quiz.7 id
1 9.875 9.000 9.500 8.700 11.000 17.67 15 10 2
2 6.125 NA 9.333 8.850 10.780 17.50 15 10 3
3 9.875 8.500 8.500 7.933 10.500 16.67 15 10 4
4 8.500 6.266 9.500 7.613 9.888 16.67 15 10 5
5 9.042 7.332 9.833 6.823 9.335 17.50 15 10 6
6 7.625 6.100 9.500 8.557 NA 16.67 15 10 7
# Reshape to long format: one row per (id, test) pair. The output columns are
# named directly through melt()'s naming arguments.
my.grades.m <- melt(
  my.grades,
  id.vars = "id",
  variable.name = "TestExam",
  value.name = "score"
)
# Normalized grades: rescale every test to a common 0-10 scale so that tests
# with different maximum scores can be compared.
my.Ngrades <- my.grades
# The first numTests columns hold the scores, in the same order as max.scores;
# the trailing "id" column is left untouched.
# seq_len() is used instead of 1:numTests so the loop is skipped (rather than
# iterating over c(1, 0)) if there are no tests.
for (i in seq_len(numTests)) {
  my.Ngrades[[i]] <- (my.Ngrades[[i]] / max.scores[i]) * 10
}
# Long format of the normalized scores: one row per (id, test) pair.
my.Ngrades.m <- melt(
  my.Ngrades,
  id.vars = "id",
  variable.name = "TestExam",
  value.name = "score"
)
######### Start various Plots ###############
############## Boxplot ######################
# Notched boxplot of the raw scores per test, filled by test, with outliers
# drawn as large red points.
p <- qplot(
  TestExam, score,
  data = my.grades.m,
  geom = "boxplot",
  notch = TRUE,
  outlier.colour = "red",
  outlier.size = 6,
  fill = TestExam,
  alpha = I(.5),
  main = "Boxplot (DataFreeze on 3July)",
  xlab = "TestExam",
  ylab = "Grades"
)
# Overlay the individual scores on top of the boxes.
p + geom_point()
Warning: Removed 656 rows containing non-finite values (stat_boxplot).
Warning: Removed 656 rows containing missing values (geom_point).
###############################################
# Density distribution of the current quiz (Quiz 7) only
qplot(
  score,
  data = my.grades.m[my.grades.m$TestExam == "Quiz.7", ],
  geom = "density",
  fill = TestExam,
  alpha = I(.5),
  main = "Distribution of Grades (Quiz 7)",
  xlab = "Grades",
  ylab = "Density"
)
Warning: Removed 202 rows containing non-finite values (stat_density).
###############################################
# Overlaid density curves of the raw grades, one filled curve per test
qplot(
  score,
  data = my.grades.m,
  geom = "density",
  fill = TestExam,
  alpha = I(.5),
  main = "Distribution of Grades",
  xlab = "Grades",
  ylab = "Density"
)
Warning: Removed 78 rows containing non-finite values (stat_density).
Warning: Removed 63 rows containing non-finite values (stat_density).
Warning: Removed 41 rows containing non-finite values (stat_density).
Warning: Removed 29 rows containing non-finite values (stat_density).
Warning: Removed 49 rows containing non-finite values (stat_density).
Warning: Removed 50 rows containing non-finite values (stat_density).
Warning: Removed 144 rows containing non-finite values (stat_density).
Warning: Removed 202 rows containing non-finite values (stat_density).
###############################################
# Overlaid density curves of the normalized grades, one filled curve per test
qplot(
  score,
  data = my.Ngrades.m,
  geom = "density",
  fill = TestExam,
  alpha = I(.5),
  main = "Distribution of Normalized Grades",
  xlab = "Normalized Grades (max = 10)",
  ylab = "Density"
)
Warning: Removed 78 rows containing non-finite values (stat_density).
Warning: Removed 63 rows containing non-finite values (stat_density).
Warning: Removed 41 rows containing non-finite values (stat_density).
Warning: Removed 29 rows containing non-finite values (stat_density).
Warning: Removed 49 rows containing non-finite values (stat_density).
Warning: Removed 50 rows containing non-finite values (stat_density).
Warning: Removed 144 rows containing non-finite values (stat_density).
Warning: Removed 202 rows containing non-finite values (stat_density).
###############################################
# Violin plot of the raw grades: for each test (one facet per test) the
# density is drawn as a ribbon mirrored around zero (-density .. +density),
# and the axes are flipped so that grades run along the vertical axis.
ggplot(my.grades.m, aes(x = score)) +
  stat_density(
    aes(ymax = ..density.., ymin = -..density..),
    fill = "grey50",
    colour = "grey50",
    geom = "ribbon",
    position = "identity"
  ) +
  facet_grid(. ~ TestExam) +
  coord_flip() +
  xlab("Grades") +
  ggtitle("Density Distribution of Grades")
Warning: Removed 78 rows containing non-finite values (stat_density).
Warning: Removed 63 rows containing non-finite values (stat_density).
Warning: Removed 41 rows containing non-finite values (stat_density).
Warning: Removed 29 rows containing non-finite values (stat_density).
Warning: Removed 49 rows containing non-finite values (stat_density).
Warning: Removed 50 rows containing non-finite values (stat_density).
Warning: Removed 144 rows containing non-finite values (stat_density).
Warning: Removed 202 rows containing non-finite values (stat_density).
###############################################
# Violin plot of the normalized grades: for each test (one facet per test)
# the density is drawn as a ribbon mirrored around zero
# (-density .. +density), and the axes are flipped so that grades run along
# the vertical axis.
ggplot(my.Ngrades.m, aes(x = score)) +
  stat_density(
    aes(ymax = ..density.., ymin = -..density..),
    fill = "grey50",
    colour = "grey50",
    geom = "ribbon",
    position = "identity"
  ) +
  facet_grid(. ~ TestExam) +
  coord_flip() +
  xlab("Normalized Grades (max = 10)") +
  ggtitle("Density Distribution of Normalized Grades")
Warning: Removed 78 rows containing non-finite values (stat_density).
Warning: Removed 63 rows containing non-finite values (stat_density).
Warning: Removed 41 rows containing non-finite values (stat_density).
Warning: Removed 29 rows containing non-finite values (stat_density).
Warning: Removed 49 rows containing non-finite values (stat_density).
Warning: Removed 50 rows containing non-finite values (stat_density).
Warning: Removed 144 rows containing non-finite values (stat_density).
Warning: Removed 202 rows containing non-finite values (stat_density).