Load Data
GSD <- readxl::read_excel("Guatemalan Student Test Data(1).xlsx")
Examine Data
# ----------------------- Mon Oct 01 15:23:56 2018 ------------------------#
# Missing Values
GSD[, lapply(GSD, is.numeric) %>% unlist] %>% summary
## Age Grade Q1 Q2
## Min. : 0.000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 8.000 1st Qu.:2.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 9.000 Median :2.000 Median :1.0000 Median :1.0000
## Mean : 8.956 Mean :2.477 Mean :0.5313 Mean :0.5415
## 3rd Qu.: 9.000 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :90.000 Max. :3.000 Max. :1.0000 Max. :1.0000
## NA's :164 NA's :10 NA's :69 NA's :70
## Q3 Q4 Q5 Q6
## Min. :0.0000 Min. : 0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median : 1.0000 Median :0.0000 Median :0.0000
## Mean :0.4325 Mean : 0.5714 Mean :0.4715 Mean :0.4814
## 3rd Qu.:1.0000 3rd Qu.: 1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :12.0000 Max. :1.0000 Max. :1.0000
## NA's :69 NA's :69 NA's :70 NA's :73
## Q7 Q8 Q9 Q10
## Min. :0.0000 Min. : 0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median : 1.0000 Median :0.0000 Median :1.0000
## Mean :0.6876 Mean : 0.6749 Mean :0.2958 Mean :0.5528
## 3rd Qu.:1.0000 3rd Qu.: 1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :11.0000 Max. :1.0000 Max. :1.0000
## NA's :69 NA's :91 NA's :71 NA's :69
## Q11 Q12 Q13 Q14
## Min. :0.0000 Min. : 0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median : 1.0000 Median :1.0000 Median :0.0000
## Mean :0.4372 Mean : 0.6171 Mean :0.5406 Mean :0.4635
## 3rd Qu.:1.0000 3rd Qu.: 1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :11.0000 Max. :1.0000 Max. :1.0000
## NA's :69 NA's :69 NA's :70 NA's :72
## Q15 Q16 Q17 Q18
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :0.0000 Median :0.0000
## Mean :0.4069 Mean :0.5245 Mean :0.3528 Mean :0.3578
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :70 NA's :71 NA's :69 NA's :69
## Q19 Q20 Q21 Q22
## Min. :0.0000 Min. :-1.0000 Min. : 0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.0000
## Median :0.0000 Median : 0.0000 Median : 1.0000 Median :0.0000
## Mean :0.3478 Mean : 0.2841 Mean : 0.5496 Mean :0.3991
## 3rd Qu.:1.0000 3rd Qu.: 1.0000 3rd Qu.: 1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. : 1.0000 Max. :10.0000 Max. :1.0000
## NA's :69 NA's :69 NA's :69 NA's :69
## Q23 Q24
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000
## Mean :0.3388 Mean :0.3248
## 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000
## NA's :71 NA's :72
rNA <- vector(length = nrow(GSD))
for (i in seq_along(1:nrow(GSD))) {
rNA[i] <- sapply(GSD[i, ], is.na) %>% sum
}
GSD <- GSD[{
rNA > 6
} %>% not, ] # Remove the rows where nearly all the data is missing
tags$p("The following rows have large amounts of missing data, is it possible to recover any of these values?")
The following rows have large amounts of missing data, is it possible to recover any of these values?
rNA <- vector(length = nrow(GSD))
for (i in seq_along(1:nrow(GSD))) {
rNA[i] <- sapply(GSD[i, ], is.na) %>% sum
}
GSD[{
rNA > 3
} %>% which, ]
# ----------------------- Mon Oct 01 15:24:08 2018 ------------------------#
# Questions
Qs <- names(GSD) %>% str_detect("Q") # Create a variable for subsetting the Question columns
tags$p("If all other questions use 0 as incorrect and 1 as correct, why do the following questions have values other than 0 or 1?")
If all other questions use 0 as incorrect and 1 as correct, why do the following questions have values other than 0 or 1?
GSD[, Qs] %>% lapply(unique) %>% subset(GSD[, Qs] %>% lapply(unique) %>% lapply(FUN = function(x) !x %in%
c(0, 1, NA)) %>% lapply(any) %>% unlist)
## $Q4
## [1] 0 1 12 11 10
##
## $Q8
## [1] 1 0 NA 11
##
## $Q12
## [1] 1 0 10 11
##
## $Q20
## [1] 1 0 -1
##
## $Q21
## [1] 1 0 10
aQs <- GSD[, Qs] %>% lapply(unique) %>% lapply(FUN = function(x) !x %in% c(0, 1,
NA)) %>% lapply(any) %>% unlist %>% which %>% names # Capture names of questions with aberrant values
tags$em("Note:The following indices can be used to find and correct the aberrant values:")
(aQs_ind <- GSD[, aQs] %>% lapply(FUN = function(x) {
out <- sapply(x, FUN = function(v) !v %in% c(0, 1, NA), simplify = T) %>% unlist %>%
which
}))
## $Q4
## [1] 1620 2157 2417 2746
##
## $Q8
## [1] 1580
##
## $Q12
## [1] 285 2187
##
## $Q20
## [1] 3376
##
## $Q21
## [1] 1984
# ----------------------- Mon Oct 01 15:23:40 2018 ------------------------# Ages
tags$p("The following are reported ages, some of these values are aberrant")
The following are reported ages, some of these values are aberrant
GSD[, "Age", drop = T] %>% unique
## [1] 10 9 8 7 11 NA 12 13 14 15 6 0 18 90
age_ol <- c({
GSD$Age == 0
} %>% which, {
GSD$Age == 90
} %>% which)
tags$p("Should the ages for the entries in the table below be 10,10 and 9? I will be imputing these as such, if we find please let me know.")
Should the ages for the entries in the table below be 10,10 and 9? I will be imputing these as such, if we find please let me know.
GSD[age_ol, "Age"]
GSD[age_ol, "Age"] <- c(10, 10, 9) #Assume most likely typos
# ----------------------- Mon Oct 01 15:23:28 2018 ------------------------# Sex
tags$p("The sex of individuals, notice there is one number.")
The sex of individuals, notice there is one number.
GSD$Sex %>% unique
## [1] "F" "M" NA "1" "m"
GSD$Sex %<>% gsub("m", "M", .) # Change m to M
tags$p("Is the sex of the individual below M or F?")
Is the sex of the individual below M or F?
GSD[{
GSD$Sex == 1
} %>% which, ]
tags$p("I will assume the sex of this individual is M, if info is found suggesting otherwise, please let me know.")
I will assume the sex of this individual is M, if info is found suggesting otherwise, please let me know.
GSD[{
GSD$Sex == 1
} %>% which, "Sex"] <- "M" #Assume Male
chr <- GSD[, lapply(GSD, is.numeric) %>% unlist %>% not] %>% names # Get a vector of all non numeric column names
Data Cleaning Continued 1
# ----------------------- Mon Oct 01 15:22:58 2018 ------------------------#
# School Names
# {GSD$`School Name`=='EORM Caserio Santa Maria El Tablón'} %>% which %>% length
# {GSD$`School Name`=='EORM Caserio Santa Maria El Tablon'} %>% which %>% length
# # Used to determin which was the most used school name for the following
# transformation
GSD$`School Name`[{
GSD$`School Name` == "EORM Caserio Santa Maria El Tablon"
}] <- "EORM Caserio Santa Maria El Tablón" # Combine those two school names since they are the same
tags$p("It appears that all of the schools below are the same school. This indicates the school code is likely a more reliable value for grouping the data by school.")
It appears that all of the schools below are the same school. This indicates the school code is likely a more reliable value for grouping the data by school.
GSD[GSD$`School Name` %>% str_detect("CONCEPCION|Concepción"), ] #It appears that all of the school names below are actually the same school
# ----------------------- Mon Oct 01 15:22:48 2018 ------------------------#
# School Codes
GSD$`School Code` %<>% gsub("(\\d)(\\d{2})(\\d{4})(\\d{2})", "0\\1\\-\\2\\-\\3\\-\\4",
.) # Fix obvious formatting errors
tags$p("I've corrected some of the common formatting errors of the school code, can the remaining values be corrected?")
I've corrected some of the common formatting errors of the school code, can the remaining values be corrected?
GSD[GSD$`School Code` %>% str_detect("\\d{2}\\-\\d{2}\\-\\d{4}\\-\\d{2}") %>% not,
]
tags$p("Are any missing school codes imputable?")
Are any missing school codes imputable?
GSD[GSD$`School Name` %in% {
GSD[GSD$`School Code` %>% is.na(), ][["School Name"]] %>% unique
}, ] %>% arrange(desc(`School Name`)) %>% .[, c(1, 2)] %>% unique
GSD$`School Code`[GSD$`School Code` %>% str_detect("semana")] <- NA
tags$p("It appears there are two school codes which are imputable. ")
It appears there are two school codes which are imputable.
iSchCodes <- GSD[GSD$`School Name` %in% {
GSD[GSD$`School Code` %>% is.na(), ][["School Name"]] %>% unique
}, ] %>% arrange(desc(`School Name`)) %>% .[, c(1, 2)] %>% unique %>% .[!is.na(.$`School Code`),
]
# Impute Codes
GSD$`School Code`[GSD$`School Name` == iSchCodes[1, 1, drop = T]] <- iSchCodes[1,
2, drop = T]
GSD$`School Code`[GSD$`School Name` == iSchCodes[2, 1, drop = T]] <- iSchCodes[2,
2, drop = T]
tags$p("Can any of these remaining school codes be recovered?")
Can any of these remaining school codes be recovered?
GSD[GSD$`School Name` %in% {
GSD[GSD$`School Code` %>% is.na(), ][["School Name"]] %>% unique
}, ] %>% arrange(desc(`School Name`)) %>% .[, c(1, 2)] %>% unique
# ----------------------- Mon Oct 08 08:25:29 2018 ------------------------#
# Teacher names
GSD$Teacher %>% unique
## [1] NA "mgg 2" "hn gonzales castro"
## [4] "jc colo sosof" "paguacal 2 sec A" "paguacal 2 sec B"
## [7] "san jorge 3A" "mgg 3" "brenda Roxana"
## [10] "gregorio cuc" "rosa lopez morales" "paguacal 3 sec A"
## [13] "paguacal 3 sec B" "teresita 3B" "santos tzay chipin"
tags$p("Some teachers are tracking their sections individually by adding data to the teacher column. We can separate the teacher column into teacher & section to make this easier for them to do so if they would like to have that granularity. This also creates consistency in the factors for the teacher column")
Some teachers are tracking their sections individually by adding data to the teacher column. We can separate the teacher column into teacher & section to make this easier for them to do so if they would like to have that granularity. This also creates consistency in the factors for the teacher column
GSD %<>% separate(Teacher, into = c("Teacher", "Section"), sep = "\\s(?=\\d)", remove = T)
GSD$Teacher %<>% as.factor()
Data Cleaning Continued 2
# ----------------------- Mon Oct 08 09:37:07 2018 ------------------------#
# Grades
tags$p("The following columns have aberrant or missing values for grade level. Can these be corrected? These will be imputed to the mode of the Grade column until corrections are supplied.")
The following columns have aberrant or missing values for grade level. Can these be corrected? These will be imputed to the mode of the Grade column until corrections are supplied.
GSD[GSD$Grade == 0 | is.na(GSD$Grade), ]
GSD[GSD$Grade == 0 | is.na(GSD$Grade), "Grade"] <- Mode(GSD$Grade)
# ----------------------- Mon Oct 01 15:22:30 2018 ------------------------# Exam
# Dates
GSD$`Exam date` %>% unique
## [1] "42429" "19/2" "42438" "42416" "16/2" "42431" "42410" "42436"
## [9] "42412" "42433" "26Feb" "26/2" "42417" "17/2" "42419" "42430"
## [17] "1" "42411" "42415" "42424" "42425" "42423" "42422" "42418"
## [25] "42437" "42440" "42439" "42443" "42426" "42432" "42444" "42445"
## [33] "c" "18Feb" "42441" "42442" "42446" "42447" "42448" "42449"
## [41] "42450" "42451" "42452" "42453" "42454" "42455" "42456" "42457"
## [49] "42458" "42459" "42460" "42461" "42462" "42463" "42464" "42465"
## [57] "42466" "42467" "42468" "42469" "42470" "42471" "42472" "42473"
## [65] "42474" "42475" "42476" "42477" "42478" "0" "24/2"
GSD$`Exam date`[GSD$`Exam date` %>% str_detect("\\d{5}")] %<>% as.numeric %>% as.Date(.,
origin = "1899-12-30") #Change dates from Excel format to Lubridate format (even though they will be converted to numeric until column is fixed
GSD$`Exam date`[GSD$`Exam date` %>% str_detect("\\/")] %<>% gsub("(\\d{2})\\/(\\d{1})",
"2016\\-0\\2\\-\\1", .) %>% lubridate::ymd()
tags$p("The following rows have strange values for exam date.")
The following rows have strange values for exam date.
GSD[{
GSD$`Exam date` %>% nchar < 3
}, ]
tags$p("Perhaps the dates can be derived from their contextual appearance in the dataset?")
Perhaps the dates can be derived from their contextual appearance in the dataset?
aDates <- {
GSD$`Exam date` %>% nchar < 3
} %>% which
GSD[sort(c(aDates, aDates - 1, aDates + 1)), ]
tags$p("It looks like it is safe to assume that these dates can be imputed according to the dates that surround them. These dates will be imputed.")
It looks like it is safe to assume that these dates can be imputed according to the dates that surround them. These dates will be imputed.
GSD$`Exam date`[aDates] <- GSD$`Exam date`[aDates + 1]
GSD$`Exam date`[!GSD$`Exam date` %>% str_detect("\\d{5}")] %<>% gsub("(\\d{2})\\w{3}",
"2016\\-02\\-\\1", .) %>% lubridate::ymd()
GSD$`Exam date` %<>% as.numeric %>% lubridate::as_date()
# ----------------------- Mon Oct 08 08:37:22 2018 ------------------------#
# Duplicate Students
tags$p("The following rows could be duplicates:")
The following rows could be duplicates:
identical({
{
GSD %>% duplicated(., margin = 1)
} | {
GSD %>% duplicated(., fromLast = TRUE)
}
} %>% which, GSD %>% {
duplicated(., margin = 1) | duplicated(., fromLast = T)
} %>% which)
## [1] TRUE
GSD[GSD %>% {
duplicated(., margin = 1) | duplicated(., fromLast = T)
} %>% which, ]
tags$p("Their indexes are listed below such that they can be removed if they are indeed duplicates.")
Their indexes are listed below such that they can be removed if they are indeed duplicates.
GSD %>% {
duplicated(., margin = 1) | duplicated(., fromLast = T)
} %>% which
## [1] 317 318 472 473 474 475 478 479 502 503 505 506 508 512
## [15] 515 1017 1018 1021 1023 1026 1642 1644 2415 2416 2798 2800
Analysis
tags$p(paste("Since this is baseline data only, a paired t-test to measure the effect of the intervention is not yet possible. Once longitudinal data is supplied, a paired t-test can be used to measure the effect of the intervention for students who appear in both data sets. These comparisons can be confined to specific factors stratifications as well."))
Since this is baseline data only, a paired t-test to measure the effect of the intervention is not yet possible. Once longitudinal data is supplied, a paired t-test can be used to measure the effect of the intervention for students who appear in both data sets. These comparisons can be confined to specific factors stratifications as well.
# GSD[GSD$`Student code` %in% GSD$`Student code` %>% unique,c('Student code')]
# %>% arrange(`Student code`) %>% duplicated %>% sum
Summary statistics can be made by various groupings.
Impute to Mode
# What is the mode for each question with aberrant or missing values
qModes <- GSD[, aQs] %>% lapply(FUN = function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}) %>% unlist
GSD[, aQs] %<>% names %>% lapply(data = GSD, key = qModes, FUN = function(clm, key,
data) {
print(clm)
out <- sapply(data[, clm], clm = clm, key = key, FUN = function(x, key, clm) {
out <- ifelse({
x == 0 | x == 1
}, x, key[[clm]])
})
})
## [1] "Q4"
## [1] "Q8"
## [1] "Q12"
## [1] "Q20"
## [1] "Q21"
# ----------------------- Mon Oct 01 17:32:49 2018 ------------------------#
# Classes
GSD[, c("School Name", "Teacher", "Section", "School Code", "Sex", "Grade")] %<>%
lapply(as.factor)
Summary Stats
Qs <- Qs %>% which # Change this sorting vector to numeric since the column # is about to change
GSD$Score <- {
GSD %>% select(starts_with("Q")) %>% rowSums(na.rm = T)/{
str_detect(names(GSD), "Q") %>% length
}
}
tags$p("Summary statistics for age and score:")
Summary statistics for age and score:
GSD[, c("Age", "Score")] %>% summary
## Age Score
## Min. : 6.000 Min. :0.0000
## 1st Qu.: 8.000 1st Qu.:0.2353
## Median : 9.000 Median :0.3235
## Mean : 8.937 Mean :0.3281
## 3rd Qu.: 9.000 3rd Qu.:0.4412
## Max. :18.000 Max. :0.7059
## NA's :96
tags$p("Standard deviations for age and score:")
Standard deviations for age and score:
GSD[, c("Age", "Score")] %>% sapply(FUN = function(.) sd(., na.rm = T))
## Age Score
## 1.1476375 0.1552162
tags$p("Sex Ratios")
Sex Ratios
table(GSD$Sex)
##
## F M
## 1594 1708
table(GSD$Sex) %>% prop.table()
##
## F M
## 0.4827377 0.5172623
tags$p("Average scores by school sorted highest to lowest.")
Average scores by school sorted highest to lowest.
GSD %>% group_by(`School Code`) %>% summarize(AvgScore = mean(Score, na.rm = T),
SDScore = sd(Score, na.rm = T), N = n()) %>% arrange(desc(AvgScore))
tags$p("Average scores by age")
Average scores by age
GSD %>% group_by(Age) %>% summarize(AvgScore = mean(Score, na.rm = T), SDScore = sd(Score,
na.rm = T), N = n()) %>% arrange(desc(AvgScore))
# Looks like age 15 is a rough year.
tags$p("Average scores by sex")
Average scores by sex
GSD %>% group_by(Sex) %>% summarize(AvgScore = mean(Score, na.rm = T), SDScore = sd(Score,
na.rm = T), N = n()) %>% arrange(desc(AvgScore))
tags$p("Average score by grade")
Average score by grade
GSD %>% group_by(Grade) %>% summarize(AvgScore = mean(Score, na.rm = T), SDScore = sd(Score,
na.rm = T), N = n()) %>% arrange(desc(AvgScore))
Plots of Variables
# Clean Names
names(GSD) <- names(GSD) %>% make.names()
tags$p("Histograms of relevant categorical variables:")
Histograms of relevant categorical variables:
lapply(c("School.Code", "Teacher", "Age", "Sex", "Grade"), data = GSD, FUN = function(clm,
data) {
print(clm)
data %>% ggplot(data = ., mapping = aes_string(x = clm)) + geom_bar() + xlab(clm) +
ylab("Number") + ggtitle(paste("Histogram of", clm)) + theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5), axis.text.x = element_text(angle = 45))
})
## [1] "School.Code"
## [1] "Teacher"
## [1] "Age"
## [1] "Sex"
## [1] "Grade"
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
hist(GSD$Score, main = "Histogram of Score", xlab = "Score")
Plots of Scores by Grade
(mu <- plyr::ddply(GSD, "Grade", summarise, grp.mean = mean(Score, na.rm = T)))
ggplot(data = GSD, mapping = aes(x = Score, y = ..count..)) + geom_density(binwidth = 0.001,
mapping = aes(group = Grade, fill = Grade), alpha = 0.2) + geom_vline(data = mu,
aes(xintercept = grp.mean, color = Grade), linetype = "dashed") + # geom_vline(data=GSD, aes(xintercept=mean(GSD$Score[GSD$Grade == 2],na.rm=T),
# color='black'),linetype='dashed')+
ggtitle("Score Distributions by Grade") + theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
tags$p("Grade two scores have a right skew of ", psych::skew(GSD$Score[GSD$Grade ==
2]) %>% round(2), " and a small kurtosis of ", psych::kurtosi(GSD$Score[GSD$Grade ==
2]) %>% round(2), ". Grade three scores have a left skew of ", psych::skew(GSD$Score[GSD$Grade ==
3]) %>% round(2), " and notable kurtosis of ", psych::kurtosi(GSD$Score[GSD$Grade ==
3]) %>% round(2), ". The mean and standard deviation of score is fairly close together for each grade: Grade 2 (mu:",
mean(GSD$Score[GSD$Grade == 2]) %>% round(2), HTML("sd:"), sd(GSD$Score[GSD$Grade ==
2]) %>% round(2), ") Grade 3 (mu:", mean(GSD$Score[GSD$Grade == 3]) %>% round(2),
HTML("sd:"), sd(GSD$Score[GSD$Grade == 3]) %>% round(2), ").")
Grade two scores have a right skew of 0.33 and a small kurtosis of -0.07 . Grade three scores have a left skew of -0.13 and notable kurtosis of -0.43 . The mean and standard deviation of score is fairly close together for each grade: Grade 2 (mu: 0.29 sd: 0.14 ) Grade 3 (mu: 0.37 sd: 0.16 ).
Plots by School Codestags$p("Average scores by school code, sorted highest to lowest:")
Average scores by school code, sorted highest to lowest:
(mu <- plyr::ddply(GSD, "School.Code", summarise, grp.mean = mean(Score, na.rm = T),
grp.sd = sd(Score, na.rm = T)) %>% arrange(desc(grp.mean)))
tags$p("Graphs of score distribution for each school code:")
Graphs of score distribution for each school code:
test <- lapply(mu$School.Code %>% as.character %>% .[!is.na(.)], GSD = GSD, function(sc,
GSD) {
dt <- GSD[GSD$School.Code %>% as.character == sc, ]
ggplot(dt, mapping = aes(x = Score, y = ..count..)) + geom_density(alpha = 0.2) +
geom_vline(data = mu, aes(xintercept = mean(dt[, "Score", drop = T], na.rm = T)),
linetype = "dashed") + ggtitle(paste("Scores of ", sc), subtitle = paste(" n=",
nrow(dt), "mu=", mean(dt[, "Score", drop = T], na.rm = T) %>% round(2), "sd=",
sd(dt[, "Score", drop = T], na.rm = T) %>% round(2))) + scale_x_continuous(breaks = seq(0,
1, 0.2), minor_breaks = seq(0, 1, 0.1), limits = c(0, 1)) + theme(plot.title = element_text(hjust = 0.5,
size = ggplot2::unit(8, "mm")), plot.subtitle = element_text(hjust = 0.5),
legend.position = "none")
})
library(gridExtra)
invisible(lapply(sapply(1:9, function(x) seq(ifelse(x == 1, 1, {
({
x - 1
} * 9) + 1
}), x * 9, 1)) %>% as.data.frame(), plots = test, function(rs, plots) {
try(do.call("grid.arrange", c(plots[rs], ncol = 3)))
}))
Research Questions
tags$p("Is score normally distributed?")
Is score normally distributed?
shapiro.test(GSD$Score)
##
## Shapiro-Wilk normality test
##
## data: GSD$Score
## W = 0.98388, p-value < 0.00000000000000022
tags$p("It's highly unlikely.")
It's highly unlikely.
tags$br()
# tags$p('Is there improvement over time?')
GSD_date <- GSD %>% group_by(School.Code, Exam.date) %>% summarise(Avg = mean(Score,
na.rm = T))
SC_dup <- GSD_date[GSD_date$School.Code %>% duplicated, ] %>% filter(!is.na(School.Code)) %>%
.$School.Code
GSD_dup <- GSD_date[GSD_date$School.Code %in% SC_dup, ]
GSD_dup$Test.Num <- as.factor(rep(c(1:2), nrow(GSD_dup)/2))
# GSD_dup %>% ggplot(data = .,mapping=aes(x = School.Code,y=Avg,fill =
# Test.Num))+ geom_bar(stat='identity',position='dodge') tags$p('It appears that
# in 3 out of 5 cases the scores from the second test were an improvement over
# the previous.')
sch <- GSD_dup$School.Code[1] # For debugging
tags$p("Are the exam scores taken on the sequential dates per school significantly different (statistically speaking)?")
Are the exam scores taken on the sequential dates per school significantly different (statistically speaking)?
lapply(GSD_dup$School.Code[seq(1, nrow(GSD_dup), by = 2)], data = GSD[GSD$School.Code %in%
GSD_dup$School.Code, ], FUN = function(sch, data) {
tests <- data[data$School.Code == sch, ]$Exam.date %>% unique
out <- t.test(data[data$School.Code == sch & data$Exam.date == tests[1], "Score",
drop = T], data[data$School.Code == sch & data$Exam.date == tests[2], "Score",
drop = T])
out$data.name <- paste("T.Test Comparison of School:", sch, "between exams occurring on:",
tests[1], "and", tests[2])
return(out)
})
## [[1]]
##
## Welch Two Sample t-test
##
## data: T.Test Comparison of School: 07-01-0427-43 between exams occurring on: 2016-03-07 and 2016-03-03
## t = -3.7292, df = 232.92, p-value = 0.0002413
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.10424954 -0.03217416
## sample estimates:
## mean of x mean of y
## 0.4041667 0.4723785
##
##
## [[2]]
##
## Welch Two Sample t-test
##
## data: T.Test Comparison of School: 07-01-0445-43 between exams occurring on: 2016-02-24 and 2016-02-25
## t = -2.6193, df = 69.69, p-value = 0.0108
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.10481260 -0.01419193
## sample estimates:
## mean of x mean of y
## 0.2907240 0.3502262
##
##
## [[3]]
##
## Welch Two Sample t-test
##
## data: T.Test Comparison of School: 07-01-0450-43 between exams occurring on: 2016-02-23 and 2016-02-22
## t = -1.7618, df = 145.02, p-value = 0.0802
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.086309039 0.004955029
## sample estimates:
## mean of x mean of y
## 0.2322515 0.2729285
##
##
## [[4]]
##
## Welch Two Sample t-test
##
## data: T.Test Comparison of School: 07-013-2577-43 between exams occurring on: 2016-02-11 and 2016-02-12
## t = -4.6531, df = 19, p-value = 0.0001734
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.22386844 -0.08495509
## sample estimates:
## mean of x mean of y
## 0.2058824 0.3602941
##
##
## [[5]]
##
## Welch Two Sample t-test
##
## data: T.Test Comparison of School: 07-19-0015-43 between exams occurring on: 2016-03-10 and 2016-02-19
## t = 6.7059, df = 41.147, p-value = 0.00000004223
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1603249 0.2984852
## sample estimates:
## mean of x mean of y
## 0.3478261 0.1184211
tags$p("It appears that the scores are significantly different between class sections in all but the third school. These were likely classes held on different days for different grades, or for different sections.")
It appears that the scores are significantly different between class sections in all but the third school. These were likely classes held on different days for different grades, or for different sections.
tags$p("Is the variance of scores in each grade level equal?")
Is the variance of scores in each grade level equal?
(vtest <- var.test(GSD$Score[GSD$Grade == 2], GSD$Score[GSD$Grade == 3]))
##
## F test to compare two variances
##
## data: GSD$Score[GSD$Grade == 2] and GSD$Score[GSD$Grade == 3]
## F = 0.74305, num df = 1778, denom df = 1610, p-value =
## 0.000000001009
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.6753678 0.8173255
## sample estimates:
## ratio of variances
## 0.7430488
tags$p("p<.001 indicates that the variances are not equal.")
p<.001 indicates that the variances are not equal.
tags$p("Are grade levels aggregate scores significantly different given variance is not equal?")
Are grade levels aggregate scores significantly different given variance is not equal?
(tout <- t.test(GSD$Score[GSD$Grade == 2], GSD$Score[GSD$Grade == 3]))
##
## Welch Two Sample t-test
##
## data: GSD$Score[GSD$Grade == 2] and GSD$Score[GSD$Grade == 3]
## t = -16.915, df = 3195.5, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.09742693 -0.07718608
## sample estimates:
## mean of x mean of y
## 0.2866118 0.3739183
tags$p("The grade levels are significantly different at the 95% confidence level",
tout %>% apa_t)
The grade levels are significantly different at the 95% confidence level t (3195.51) = -16.91 , CI[ -0.1,-0.08 ], p<.001