Loading and Cleaning Data

Load Data

GSD <- readxl::read_excel("Guatemalan Student Test Data(1).xlsx")

Examine Data

# ----------------------- Mon Oct 01 15:23:56 2018 ------------------------#
# Missing Values

GSD[, lapply(GSD, is.numeric) %>% unlist] %>% summary
##       Age             Grade             Q1               Q2        
##  Min.   : 0.000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 8.000   1st Qu.:2.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 9.000   Median :2.000   Median :1.0000   Median :1.0000  
##  Mean   : 8.956   Mean   :2.477   Mean   :0.5313   Mean   :0.5415  
##  3rd Qu.: 9.000   3rd Qu.:3.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :90.000   Max.   :3.000   Max.   :1.0000   Max.   :1.0000  
##  NA's   :164      NA's   :10      NA's   :69       NA's   :70      
##        Q3               Q4                Q5               Q6        
##  Min.   :0.0000   Min.   : 0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median : 1.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.4325   Mean   : 0.5714   Mean   :0.4715   Mean   :0.4814  
##  3rd Qu.:1.0000   3rd Qu.: 1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :12.0000   Max.   :1.0000   Max.   :1.0000  
##  NA's   :69       NA's   :69        NA's   :70       NA's   :73      
##        Q7               Q8                Q9              Q10        
##  Min.   :0.0000   Min.   : 0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median : 1.0000   Median :0.0000   Median :1.0000  
##  Mean   :0.6876   Mean   : 0.6749   Mean   :0.2958   Mean   :0.5528  
##  3rd Qu.:1.0000   3rd Qu.: 1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :11.0000   Max.   :1.0000   Max.   :1.0000  
##  NA's   :69       NA's   :91        NA's   :71       NA's   :69      
##       Q11              Q12               Q13              Q14        
##  Min.   :0.0000   Min.   : 0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median : 1.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.4372   Mean   : 0.6171   Mean   :0.5406   Mean   :0.4635  
##  3rd Qu.:1.0000   3rd Qu.: 1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :11.0000   Max.   :1.0000   Max.   :1.0000  
##  NA's   :69       NA's   :69        NA's   :70       NA's   :72      
##       Q15              Q16              Q17              Q18        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :1.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.4069   Mean   :0.5245   Mean   :0.3528   Mean   :0.3578  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  NA's   :70       NA's   :71       NA's   :69       NA's   :69      
##       Q19              Q20               Q21               Q22        
##  Min.   :0.0000   Min.   :-1.0000   Min.   : 0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median : 0.0000   Median : 1.0000   Median :0.0000  
##  Mean   :0.3478   Mean   : 0.2841   Mean   : 0.5496   Mean   :0.3991  
##  3rd Qu.:1.0000   3rd Qu.: 1.0000   3rd Qu.: 1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   : 1.0000   Max.   :10.0000   Max.   :1.0000  
##  NA's   :69       NA's   :69        NA's   :69        NA's   :69      
##       Q23              Q24        
##  Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000  
##  Mean   :0.3388   Mean   :0.3248  
##  3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000  
##  NA's   :71       NA's   :72
rNA <- vector(length = nrow(GSD))
for (i in seq_along(1:nrow(GSD))) {
    rNA[i] <- sapply(GSD[i, ], is.na) %>% sum
}
GSD <- GSD[{
    rNA > 6
} %>% not, ]  # Remove the rows where nearly all the data is missing
tags$p("The following rows have large amounts of missing data, is it possible to recover any of these values?")

The following rows have large amounts of missing data, is it possible to recover any of these values?

rNA <- vector(length = nrow(GSD))
for (i in seq_along(1:nrow(GSD))) {
    rNA[i] <- sapply(GSD[i, ], is.na) %>% sum
}
GSD[{
    rNA > 3
} %>% which, ]
# ----------------------- Mon Oct 01 15:24:08 2018 ------------------------#
# Questions

Qs <- names(GSD) %>% str_detect("Q")  # Create a variable for subsetting the Question columns
tags$p("If all other questions use 0 as incorrect and 1 as correct, why do the following questions have values other than 0 or 1?")

If all other questions use 0 as incorrect and 1 as correct, why do the following questions have values other than 0 or 1?

GSD[, Qs] %>% lapply(unique) %>% subset(GSD[, Qs] %>% lapply(unique) %>% lapply(FUN = function(x) !x %in% 
    c(0, 1, NA)) %>% lapply(any) %>% unlist)
## $Q4
## [1]  0  1 12 11 10
## 
## $Q8
## [1]  1  0 NA 11
## 
## $Q12
## [1]  1  0 10 11
## 
## $Q20
## [1]  1  0 -1
## 
## $Q21
## [1]  1  0 10
aQs <- GSD[, Qs] %>% lapply(unique) %>% lapply(FUN = function(x) !x %in% c(0, 1, 
    NA)) %>% lapply(any) %>% unlist %>% which %>% names  # Capture names of questions with aberrant values
tags$em("Note:The following indices can be used to find and correct the aberrant values:")
Note:The following indices can be used to find and correct the aberrant values:
(aQs_ind <- GSD[, aQs] %>% lapply(FUN = function(x) {
    out <- sapply(x, FUN = function(v) !v %in% c(0, 1, NA), simplify = T) %>% unlist %>% 
        which
}))
## $Q4
## [1] 1620 2157 2417 2746
## 
## $Q8
## [1] 1580
## 
## $Q12
## [1]  285 2187
## 
## $Q20
## [1] 3376
## 
## $Q21
## [1] 1984
# ----------------------- Mon Oct 01 15:23:40 2018 ------------------------# Ages

tags$p("The following are reported ages, some of these values are aberrant")

The following are reported ages, some of these values are aberrant

GSD[, "Age", drop = T] %>% unique
##  [1] 10  9  8  7 11 NA 12 13 14 15  6  0 18 90
age_ol <- c({
    GSD$Age == 0
} %>% which, {
    GSD$Age == 90
} %>% which)
tags$p("Should the ages for the entries in the table below be 10,10 and 9? I will be imputing these as such, if we find  please let me know.")

Should the ages for the entries in the table below be 10,10 and 9? I will be imputing these as such, if we find please let me know.

GSD[age_ol, "Age"]
GSD[age_ol, "Age"] <- c(10, 10, 9)  #Assume most likely typos
# ----------------------- Mon Oct 01 15:23:28 2018 ------------------------# Sex

tags$p("The sex of individuals, notice there is one number.")

The sex of individuals, notice there is one number.

GSD$Sex %>% unique
## [1] "F" "M" NA  "1" "m"
GSD$Sex %<>% gsub("m", "M", .)  # Change m to M
tags$p("Is the sex of the individual below M or F?")

Is the sex of the individual below M or F?

GSD[{
    GSD$Sex == 1
} %>% which, ]
tags$p("I will assume the sex of this individual is M, if info is found suggesting otherwise, please let me know.")

I will assume the sex of this individual is M, if info is found suggesting otherwise, please let me know.

GSD[{
    GSD$Sex == 1
} %>% which, "Sex"] <- "M"  #Assume Male
chr <- GSD[, lapply(GSD, is.numeric) %>% unlist %>% not] %>% names  # Get a vector of all non numeric column names

Data Cleaning Continued 1

# ----------------------- Mon Oct 01 15:22:58 2018 ------------------------#
# School Names

# {GSD$`School Name`=='EORM Caserio Santa Maria El Tablón'} %>% which %>% length
# {GSD$`School Name`=='EORM Caserio Santa Maria El Tablon'} %>% which %>% length
# # Used to determin which was the most used school name for the following
# transformation
GSD$`School Name`[{
    GSD$`School Name` == "EORM Caserio Santa Maria El Tablon"
}] <- "EORM Caserio Santa Maria El Tablón"  # Combine those two school names since they are the same
tags$p("It appears that all of the schools below are the same school. This indicates the school code is likely a more reliable value for grouping the data by school.")

It appears that all of the schools below are the same school. This indicates the school code is likely a more reliable value for grouping the data by school.

GSD[GSD$`School Name` %>% str_detect("CONCEPCION|Concepción"), ]  #It appears that all of the school names below are actually the same school
# ----------------------- Mon Oct 01 15:22:48 2018 ------------------------#
# School Codes

GSD$`School Code` %<>% gsub("(\\d)(\\d{2})(\\d{4})(\\d{2})", "0\\1\\-\\2\\-\\3\\-\\4", 
    .)  # Fix obvious formatting errors
tags$p("I've corrected some of the common formatting errors of the school code, can the remaining values be corrected?")

I've corrected some of the common formatting errors of the school code, can the remaining values be corrected?

GSD[GSD$`School Code` %>% str_detect("\\d{2}\\-\\d{2}\\-\\d{4}\\-\\d{2}") %>% not, 
    ]
tags$p("Are any missing school codes imputable?")

Are any missing school codes imputable?

GSD[GSD$`School Name` %in% {
    GSD[GSD$`School Code` %>% is.na(), ][["School Name"]] %>% unique
}, ] %>% arrange(desc(`School Name`)) %>% .[, c(1, 2)] %>% unique
GSD$`School Code`[GSD$`School Code` %>% str_detect("semana")] <- NA
tags$p("It appears there are two school codes which are imputable. ")

It appears there are two school codes which are imputable.

iSchCodes <- GSD[GSD$`School Name` %in% {
    GSD[GSD$`School Code` %>% is.na(), ][["School Name"]] %>% unique
}, ] %>% arrange(desc(`School Name`)) %>% .[, c(1, 2)] %>% unique %>% .[!is.na(.$`School Code`), 
    ]
# Impute Codes
GSD$`School Code`[GSD$`School Name` == iSchCodes[1, 1, drop = T]] <- iSchCodes[1, 
    2, drop = T]
GSD$`School Code`[GSD$`School Name` == iSchCodes[2, 1, drop = T]] <- iSchCodes[2, 
    2, drop = T]
tags$p("Can any of these remaining school codes be recovered?")

Can any of these remaining school codes be recovered?

GSD[GSD$`School Name` %in% {
    GSD[GSD$`School Code` %>% is.na(), ][["School Name"]] %>% unique
}, ] %>% arrange(desc(`School Name`)) %>% .[, c(1, 2)] %>% unique
# ----------------------- Mon Oct 08 08:25:29 2018 ------------------------#
# Teacher names
GSD$Teacher %>% unique
##  [1] NA                   "mgg 2"              "hn gonzales castro"
##  [4] "jc colo sosof"      "paguacal 2 sec A"   "paguacal 2 sec B"  
##  [7] "san jorge 3A"       "mgg 3"              "brenda Roxana"     
## [10] "gregorio cuc"       "rosa lopez morales" "paguacal 3 sec A"  
## [13] "paguacal 3 sec B"   "teresita 3B"        "santos tzay chipin"
tags$p("Some teachers are tracking their sections individually by adding data to the teacher column. We can separate the teacher column into teacher & section to make this easier for them to do so if they would like to have that granularity. This also creates consistency in the factors for the teacher column")

Some teachers are tracking their sections individually by adding data to the teacher column. We can separate the teacher column into teacher & section to make this easier for them to do so if they would like to have that granularity. This also creates consistency in the factors for the teacher column

GSD %<>% separate(Teacher, into = c("Teacher", "Section"), sep = "\\s(?=\\d)", remove = T)
GSD$Teacher %<>% as.factor()

Data Cleaning Continued 2

# ----------------------- Mon Oct 08 09:37:07 2018 ------------------------#
# Grades
tags$p("The following columns have aberrant or missing values for grade level. Can these be corrected? These will be imputed to the mode of the Grade column until corrections are supplied.")

The following columns have aberrant or missing values for grade level. Can these be corrected? These will be imputed to the mode of the Grade column until corrections are supplied.

GSD[GSD$Grade == 0 | is.na(GSD$Grade), ]
GSD[GSD$Grade == 0 | is.na(GSD$Grade), "Grade"] <- Mode(GSD$Grade)
# ----------------------- Mon Oct 01 15:22:30 2018 ------------------------# Exam
# Dates

GSD$`Exam date` %>% unique
##  [1] "42429" "19/2"  "42438" "42416" "16/2"  "42431" "42410" "42436"
##  [9] "42412" "42433" "26Feb" "26/2"  "42417" "17/2"  "42419" "42430"
## [17] "1"     "42411" "42415" "42424" "42425" "42423" "42422" "42418"
## [25] "42437" "42440" "42439" "42443" "42426" "42432" "42444" "42445"
## [33] "c"     "18Feb" "42441" "42442" "42446" "42447" "42448" "42449"
## [41] "42450" "42451" "42452" "42453" "42454" "42455" "42456" "42457"
## [49] "42458" "42459" "42460" "42461" "42462" "42463" "42464" "42465"
## [57] "42466" "42467" "42468" "42469" "42470" "42471" "42472" "42473"
## [65] "42474" "42475" "42476" "42477" "42478" "0"     "24/2"
GSD$`Exam date`[GSD$`Exam date` %>% str_detect("\\d{5}")] %<>% as.numeric %>% as.Date(., 
    origin = "1899-12-30")  #Change dates from Excel format to Lubridate format (even though they will be converted to numeric until column is fixed
GSD$`Exam date`[GSD$`Exam date` %>% str_detect("\\/")] %<>% gsub("(\\d{2})\\/(\\d{1})", 
    "2016\\-0\\2\\-\\1", .) %>% lubridate::ymd()
tags$p("The following rows have strange values for exam date.")

The following rows have strange values for exam date.

GSD[{
    GSD$`Exam date` %>% nchar < 3
}, ]
tags$p("Perhaps the dates can be derived from their contextual appearance in the dataset?")

Perhaps the dates can be derived from their contextual appearance in the dataset?

aDates <- {
    GSD$`Exam date` %>% nchar < 3
} %>% which
GSD[sort(c(aDates, aDates - 1, aDates + 1)), ]
tags$p("It looks like it is safe to assume that these dates can be imputed according to the dates that surround them. These dates will be imputed.")

It looks like it is safe to assume that these dates can be imputed according to the dates that surround them. These dates will be imputed.

GSD$`Exam date`[aDates] <- GSD$`Exam date`[aDates + 1]
GSD$`Exam date`[!GSD$`Exam date` %>% str_detect("\\d{5}")] %<>% gsub("(\\d{2})\\w{3}", 
    "2016\\-02\\-\\1", .) %>% lubridate::ymd()
GSD$`Exam date` %<>% as.numeric %>% lubridate::as_date()
# ----------------------- Mon Oct 08 08:37:22 2018 ------------------------#
# Duplicate Students
tags$p("The following rows could be duplicates:")

The following rows could be duplicates:

identical({
    {
        GSD %>% duplicated(., margin = 1)
    } | {
        GSD %>% duplicated(., fromLast = TRUE)
    }
} %>% which, GSD %>% {
    duplicated(., margin = 1) | duplicated(., fromLast = T)
} %>% which)
## [1] TRUE
GSD[GSD %>% {
    duplicated(., margin = 1) | duplicated(., fromLast = T)
} %>% which, ]
tags$p("Their indexes are listed below such that they can be removed if they are indeed duplicates.")

Their indexes are listed below such that they can be removed if they are indeed duplicates.

GSD %>% {
    duplicated(., margin = 1) | duplicated(., fromLast = T)
} %>% which
##  [1]  317  318  472  473  474  475  478  479  502  503  505  506  508  512
## [15]  515 1017 1018 1021 1023 1026 1642 1644 2415 2416 2798 2800

Analysis

Analysis

tags$p(paste("Since this is baseline data only, a paired t-test to measure the effect of the intervention is not yet possible. Once longitudinal data is supplied, a paired t-test can be used to measure the effect of the intervention for students who appear in both data sets. These comparisons can be confined to specific factors stratifications as well."))

Since this is baseline data only, a paired t-test to measure the effect of the intervention is not yet possible. Once longitudinal data is supplied, a paired t-test can be used to measure the effect of the intervention for students who appear in both data sets. These comparisons can be confined to specific factors stratifications as well.

# GSD[GSD$`Student code` %in% GSD$`Student code` %>% unique,c('Student code')]
# %>% arrange(`Student code`) %>% duplicated %>% sum

Summary statistics can be made by various groupings.

Impute to Mode

# What is the mode for each question with aberrant or missing values
qModes <- GSD[, aQs] %>% lapply(FUN = function(v) {
    uniqv <- unique(v)
    uniqv[which.max(tabulate(match(v, uniqv)))]
}) %>% unlist
GSD[, aQs] %<>% names %>% lapply(data = GSD, key = qModes, FUN = function(clm, key, 
    data) {
    print(clm)
    out <- sapply(data[, clm], clm = clm, key = key, FUN = function(x, key, clm) {
        out <- ifelse({
            x == 0 | x == 1
        }, x, key[[clm]])
    })
})
## [1] "Q4"
## [1] "Q8"
## [1] "Q12"
## [1] "Q20"
## [1] "Q21"
# ----------------------- Mon Oct 01 17:32:49 2018 ------------------------#
# Classes
GSD[, c("School Name", "Teacher", "Section", "School Code", "Sex", "Grade")] %<>% 
    lapply(as.factor)

Summary Stats

Qs <- Qs %>% which  # Change this sorting vector to numeric since the column # is about to change
GSD$Score <- {
    GSD %>% select(starts_with("Q")) %>% rowSums(na.rm = T)/{
        str_detect(names(GSD), "Q") %>% length
    }
}
tags$p("Summary statistics for age and score:")

Summary statistics for age and score:

GSD[, c("Age", "Score")] %>% summary
##       Age             Score       
##  Min.   : 6.000   Min.   :0.0000  
##  1st Qu.: 8.000   1st Qu.:0.2353  
##  Median : 9.000   Median :0.3235  
##  Mean   : 8.937   Mean   :0.3281  
##  3rd Qu.: 9.000   3rd Qu.:0.4412  
##  Max.   :18.000   Max.   :0.7059  
##  NA's   :96
tags$p("Standard deviations for age and score:")

Standard deviations for age and score:

GSD[, c("Age", "Score")] %>% sapply(FUN = function(.) sd(., na.rm = T))
##       Age     Score 
## 1.1476375 0.1552162
tags$p("Sex Ratios")

Sex Ratios

table(GSD$Sex)
## 
##    F    M 
## 1594 1708
table(GSD$Sex) %>% prop.table()
## 
##         F         M 
## 0.4827377 0.5172623
tags$p("Average scores by school sorted highest to lowest.")

Average scores by school sorted highest to lowest.

GSD %>% group_by(`School Code`) %>% summarize(AvgScore = mean(Score, na.rm = T), 
    SDScore = sd(Score, na.rm = T), N = n()) %>% arrange(desc(AvgScore))
tags$p("Average scores by age")

Average scores by age

GSD %>% group_by(Age) %>% summarize(AvgScore = mean(Score, na.rm = T), SDScore = sd(Score, 
    na.rm = T), N = n()) %>% arrange(desc(AvgScore))
# Looks like age 15 is a rough year.
tags$p("Average scores by sex")

Average scores by sex

GSD %>% group_by(Sex) %>% summarize(AvgScore = mean(Score, na.rm = T), SDScore = sd(Score, 
    na.rm = T), N = n()) %>% arrange(desc(AvgScore))
tags$p("Average score by grade")

Average score by grade

GSD %>% group_by(Grade) %>% summarize(AvgScore = mean(Score, na.rm = T), SDScore = sd(Score, 
    na.rm = T), N = n()) %>% arrange(desc(AvgScore))

Plots

Plots of Variables

# Clean Names
names(GSD) <- names(GSD) %>% make.names()
tags$p("Histograms of relevant categorical variables:")

Histograms of relevant categorical variables:

lapply(c("School.Code", "Teacher", "Age", "Sex", "Grade"), data = GSD, FUN = function(clm, 
    data) {
    print(clm)
    data %>% ggplot(data = ., mapping = aes_string(x = clm)) + geom_bar() + xlab(clm) + 
        ylab("Number") + ggtitle(paste("Histogram of", clm)) + theme(plot.title = element_text(hjust = 0.5), 
        plot.subtitle = element_text(hjust = 0.5), axis.text.x = element_text(angle = 45))
})
## [1] "School.Code"
## [1] "Teacher"
## [1] "Age"
## [1] "Sex"
## [1] "Grade"
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

hist(GSD$Score, main = "Histogram of Score", xlab = "Score")

Plots of Scores by Grade

(mu <- plyr::ddply(GSD, "Grade", summarise, grp.mean = mean(Score, na.rm = T)))
ggplot(data = GSD, mapping = aes(x = Score, y = ..count..)) + geom_density(binwidth = 0.001, 
    mapping = aes(group = Grade, fill = Grade), alpha = 0.2) + geom_vline(data = mu, 
    aes(xintercept = grp.mean, color = Grade), linetype = "dashed") + # geom_vline(data=GSD, aes(xintercept=mean(GSD$Score[GSD$Grade == 2],na.rm=T),
# color='black'),linetype='dashed')+
ggtitle("Score Distributions by Grade") + theme(plot.title = element_text(hjust = 0.5), 
    plot.subtitle = element_text(hjust = 0.5))

tags$p("Grade two scores have a right skew of ", psych::skew(GSD$Score[GSD$Grade == 
    2]) %>% round(2), " and a small kurtosis of ", psych::kurtosi(GSD$Score[GSD$Grade == 
    2]) %>% round(2), ". Grade three scores have a left skew of ", psych::skew(GSD$Score[GSD$Grade == 
    3]) %>% round(2), " and notable kurtosis of ", psych::kurtosi(GSD$Score[GSD$Grade == 
    3]) %>% round(2), ". The mean and standard deviation of score is fairly close together for each grade: Grade 2 (mu:", 
    mean(GSD$Score[GSD$Grade == 2]) %>% round(2), HTML("sd:"), sd(GSD$Score[GSD$Grade == 
        2]) %>% round(2), ") Grade 3 (mu:", mean(GSD$Score[GSD$Grade == 3]) %>% round(2), 
    HTML("sd:"), sd(GSD$Score[GSD$Grade == 3]) %>% round(2), ").")

Grade two scores have a right skew of 0.33 and a small kurtosis of -0.07 . Grade three scores have a left skew of -0.13 and notable kurtosis of -0.43 . The mean and standard deviation of score is fairly close together for each grade: Grade 2 (mu: 0.29 sd: 0.14 ) Grade 3 (mu: 0.37 sd: 0.16 ).

Plots by School Codes

tags$p("Average scores by school code, sorted highest to lowest:")

Average scores by school code, sorted highest to lowest:

(mu <- plyr::ddply(GSD, "School.Code", summarise, grp.mean = mean(Score, na.rm = T), 
    grp.sd = sd(Score, na.rm = T)) %>% arrange(desc(grp.mean)))
tags$p("Graphs of score distribution for each school code:")

Graphs of score distribution for each school code:

test <- lapply(mu$School.Code %>% as.character %>% .[!is.na(.)], GSD = GSD, function(sc, 
    GSD) {
    dt <- GSD[GSD$School.Code %>% as.character == sc, ]
    ggplot(dt, mapping = aes(x = Score, y = ..count..)) + geom_density(alpha = 0.2) + 
        geom_vline(data = mu, aes(xintercept = mean(dt[, "Score", drop = T], na.rm = T)), 
            linetype = "dashed") + ggtitle(paste("Scores of ", sc), subtitle = paste(" n=", 
        nrow(dt), "mu=", mean(dt[, "Score", drop = T], na.rm = T) %>% round(2), "sd=", 
        sd(dt[, "Score", drop = T], na.rm = T) %>% round(2))) + scale_x_continuous(breaks = seq(0, 
        1, 0.2), minor_breaks = seq(0, 1, 0.1), limits = c(0, 1)) + theme(plot.title = element_text(hjust = 0.5, 
        size = ggplot2::unit(8, "mm")), plot.subtitle = element_text(hjust = 0.5), 
        legend.position = "none")
})
library(gridExtra)
invisible(lapply(sapply(1:9, function(x) seq(ifelse(x == 1, 1, {
    ({
        x - 1
    } * 9) + 1
}), x * 9, 1)) %>% as.data.frame(), plots = test, function(rs, plots) {
    try(do.call("grid.arrange", c(plots[rs], ncol = 3)))
}))

Research Questions

Research Questions

tags$p("Is score normally distributed?")

Is score normally distributed?

shapiro.test(GSD$Score)
## 
##  Shapiro-Wilk normality test
## 
## data:  GSD$Score
## W = 0.98388, p-value < 0.00000000000000022
tags$p("It's highly unlikely.")

It's highly unlikely.

tags$br()

# tags$p('Is there improvement over time?')
GSD_date <- GSD %>% group_by(School.Code, Exam.date) %>% summarise(Avg = mean(Score, 
    na.rm = T))
SC_dup <- GSD_date[GSD_date$School.Code %>% duplicated, ] %>% filter(!is.na(School.Code)) %>% 
    .$School.Code
GSD_dup <- GSD_date[GSD_date$School.Code %in% SC_dup, ]
GSD_dup$Test.Num <- as.factor(rep(c(1:2), nrow(GSD_dup)/2))
# GSD_dup %>% ggplot(data = .,mapping=aes(x = School.Code,y=Avg,fill =
# Test.Num))+ geom_bar(stat='identity',position='dodge') tags$p('It appears that
# in 3 out of 5 cases the scores from the second test were an improvement over
# the previous.')
sch <- GSD_dup$School.Code[1]  # For debugging
tags$p("Are the exam scores taken on the sequential dates per school significantly different (statistically speaking)?")

Are the exam scores taken on the sequential dates per school significantly different (statistically speaking)?

lapply(GSD_dup$School.Code[seq(1, nrow(GSD_dup), by = 2)], data = GSD[GSD$School.Code %in% 
    GSD_dup$School.Code, ], FUN = function(sch, data) {
    tests <- data[data$School.Code == sch, ]$Exam.date %>% unique
    out <- t.test(data[data$School.Code == sch & data$Exam.date == tests[1], "Score", 
        drop = T], data[data$School.Code == sch & data$Exam.date == tests[2], "Score", 
        drop = T])
    out$data.name <- paste("T.Test Comparison of School:", sch, "between exams occurring on:", 
        tests[1], "and", tests[2])
    return(out)
})
## [[1]]
## 
##  Welch Two Sample t-test
## 
## data:  T.Test Comparison of School: 07-01-0427-43 between exams occurring on: 2016-03-07 and 2016-03-03
## t = -3.7292, df = 232.92, p-value = 0.0002413
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.10424954 -0.03217416
## sample estimates:
## mean of x mean of y 
## 0.4041667 0.4723785 
## 
## 
## [[2]]
## 
##  Welch Two Sample t-test
## 
## data:  T.Test Comparison of School: 07-01-0445-43 between exams occurring on: 2016-02-24 and 2016-02-25
## t = -2.6193, df = 69.69, p-value = 0.0108
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.10481260 -0.01419193
## sample estimates:
## mean of x mean of y 
## 0.2907240 0.3502262 
## 
## 
## [[3]]
## 
##  Welch Two Sample t-test
## 
## data:  T.Test Comparison of School: 07-01-0450-43 between exams occurring on: 2016-02-23 and 2016-02-22
## t = -1.7618, df = 145.02, p-value = 0.0802
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.086309039  0.004955029
## sample estimates:
## mean of x mean of y 
## 0.2322515 0.2729285 
## 
## 
## [[4]]
## 
##  Welch Two Sample t-test
## 
## data:  T.Test Comparison of School: 07-013-2577-43 between exams occurring on: 2016-02-11 and 2016-02-12
## t = -4.6531, df = 19, p-value = 0.0001734
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.22386844 -0.08495509
## sample estimates:
## mean of x mean of y 
## 0.2058824 0.3602941 
## 
## 
## [[5]]
## 
##  Welch Two Sample t-test
## 
## data:  T.Test Comparison of School: 07-19-0015-43 between exams occurring on: 2016-03-10 and 2016-02-19
## t = 6.7059, df = 41.147, p-value = 0.00000004223
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1603249 0.2984852
## sample estimates:
## mean of x mean of y 
## 0.3478261 0.1184211
tags$p("It appears that the scores are significantly different between class sections in all but the third school. These were likely classes held on different days for different grades, or for different sections.")

It appears that the scores are significantly different between class sections in all but the third school. These were likely classes held on different days for different grades, or for different sections.

tags$p("Is the variance of scores in each grade level equal?")

Is the variance of scores in each grade level equal?

(vtest <- var.test(GSD$Score[GSD$Grade == 2], GSD$Score[GSD$Grade == 3]))
## 
##  F test to compare two variances
## 
## data:  GSD$Score[GSD$Grade == 2] and GSD$Score[GSD$Grade == 3]
## F = 0.74305, num df = 1778, denom df = 1610, p-value =
## 0.000000001009
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.6753678 0.8173255
## sample estimates:
## ratio of variances 
##          0.7430488
tags$p("p<.001 indicates that the variances are not equal.")

p<.001 indicates that the variances are not equal.

tags$p("Are grade levels aggregate scores significantly different given variance is not equal?")

Are grade levels aggregate scores significantly different given variance is not equal?

(tout <- t.test(GSD$Score[GSD$Grade == 2], GSD$Score[GSD$Grade == 3]))
## 
##  Welch Two Sample t-test
## 
## data:  GSD$Score[GSD$Grade == 2] and GSD$Score[GSD$Grade == 3]
## t = -16.915, df = 3195.5, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.09742693 -0.07718608
## sample estimates:
## mean of x mean of y 
## 0.2866118 0.3739183
tags$p("The grade levels are significantly different at the 95% confidence level", 
    tout %>% apa_t)

The grade levels are significantly different at the 95% confidence level t (3195.51) = -16.91 , CI[ -0.1,-0.08 ], p<.001