SPARK in Staten Island

Evaluated on behalf of Borough Hall and New Dorp High School by Fil Babalievsky and Atishay Sehgal

Introduction

This is a replication file for Babalievsky and Sehgal (2018), an evaluation of Project SPARK.

Preliminaries

First we load the prerequisite packages and set our working directory to the path with the data (not a local file--we cannot store the data in a public repo).

# Point both the R session and knitr at the directory that holds the data.
# NOTE(review): these paths are machine-specific; adjust them to wherever
# your copy of the data lives.
setwd("~/Dropbox/New SI Stuff/SPARK Eval")
knitr::opts_knit$set(root.dir = "~/Dropbox/New SI Stuff/SPARK Eval")
# One-time package installation (uncomment as needed).
#install.packages("tidyverse")
#install.packages("stargazer")
#install.packages("RCurl")
#install.packages("dplyr")
#install.packages("readxl")
#install.packages("anonymizer")
library(tidyverse)

## ── Attaching packages ────────────────────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.0.0.9000     ✔ purrr   0.2.5     
## ✔ tibble  1.4.2          ✔ dplyr   0.7.5     
## ✔ tidyr   0.8.1          ✔ stringr 1.3.1     
## ✔ readr   1.1.1          ✔ forcats 0.3.0

## ── Conflicts ───────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(dplyr)
library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(RCurl)

## Loading required package: bitops

## 
## Attaching package: 'RCurl'

## The following object is masked from 'package:tidyr':
## 
##     complete

library(readxl)
library(anonymizer)

Next we import a function for clustered standard errors.

# Import the helper that lets summary() accept a cluster argument
# (used below as summary(model, cluster = ...)).
# thanks https://economictheoryblog.com/2016/12/13/clustered-standard-errors-in-r/
# SECURITY NOTE(review): this downloads R code and evaluates it in the global
# environment. Certificate verification is left enabled so the script cannot
# be swapped out in transit; ideally pin the URL to a specific commit or keep
# a reviewed local copy and source() that instead.
url_robust <- "https://raw.githubusercontent.com/IsidoreBeautrelet/economictheoryblog/master/robust_summary.R"
eval(parse(text = getURL(url_robust, ssl.verifypeer = TRUE)),
     envir = .GlobalEnv)

The Data

First we import data for the ninth graders in the Health Sciences SLC in New Dorp High who are enrolled in SPARK. Note that there are 57 students in SPARK and 37 students in New Dorp's 9th grade class not in SPARK. The seventh tab of the Excel file holds data for SPARK students in their first marking periods in English, math, and science. There are 171 observations, 3 classes times 57 students.

# First-marking-period marks for SPARK students (sheet 7 of the workbook).
# The sheet is read with col_names = FALSE, so the three columns are named
# by hand.
hs_spark_1<-read_excel("SPARK 2017 HN copy.xlsx", sheet = 7, col_names = FALSE)
names(hs_spark_1)<-c("StudentID", "Course", "Mark1")

# Row-count check: one row per student-course pair.
nrow(hs_spark_1)

## [1] 171

#dummy for Spark participation
hs_spark_1$Spark<-1
#dummy for New Dorp High School students
hs_spark_1$HS<-1

Next we import data for ninth graders who are in the Health Sciences SLC in New Dorp High School but not in SPARK. There are 111 observations, 3 classes times 37 students.

# First-marking-period marks for non-SPARK students (sheet 8 of the same
# workbook), given the same column layout as hs_spark_1 so the two tables
# can be stacked.
hs_nonspark_1<-read_excel("SPARK 2017 HN copy.xlsx", sheet = 8, col_names = FALSE)
names(hs_nonspark_1)<-c("StudentID", "Course", "Mark1")
#dummy for SPARK non-participation
hs_nonspark_1$Spark<-0
#dummy for New Dorp High School Students
hs_nonspark_1$HS<-1
# Row-count check: one row per student-course pair.
nrow(hs_nonspark_1)

## [1] 111

Here we combine the results from the first marking period with those of the second marking period.

# Stack the SPARK and non-SPARK first-marking-period rows.
first<-rbind(hs_spark_1, hs_nonspark_1)



# Second-marking-period marks; inspect the column names before renaming.
all_mp_2<-read_excel("SPARK MP2 Grades copy.xlsx", sheet = 1)
names(all_mp_2)

## [1] "StudentID" "Course"    "Mark"

# Rename Mark to Mark2 so it sits alongside Mark1 after the join.
names(all_mp_2)<-c("StudentID", "Course", "Mark2")
# Left join: every MP1 row is kept; rows with no MP2 match get NA for Mark2.
total<-left_join(first, all_mp_2, by=c("StudentID", "Course"))

# MP1-to-MP2 change. Entries that cannot be read as numbers become NA,
# which is what triggers the coercion warning.
total$diff<-as.numeric(as.character(total$Mark2))-as.numeric(as.character(total$Mark1))

## Warning: NAs introduced by coercion

# Keep the rows flagged HS == 1 and count them by SPARK status.
# These are student-course rows, not students.
total_hs<-total[total$HS==1,]

nrow(total_hs[total_hs$Spark==0,])

## [1] 111

nrow(total_hs[total_hs$Spark==1,])

## [1] 171

Next we look at the data from marking periods 3 and 4 and try to merge them with the data from periods 1 and 2.

# Marking periods 3 and 4 (plus a final grade). Only StudentID, Course, MP3,
# MP4 and Final are given meaningful names; the remaining columns are
# labelled Blank1..Blank6.
three_and_four_again<-read_excel("SPARK 2017 Term 2 MP3 and MP4 copy.xlsx", sheet = 1)
names(three_and_four_again)<-c("StudentID", "Blank1", "Blank2",  "Course", "Blank3", "Blank4", "Blank5", "Blank6", "MP3", "MP4", "Final")

# Rebuild total_hs from total, then rewrite the course code as
# characters 1-4 + "2" + characters 6-8, i.e. the fifth character is
# replaced by "2" (the "2" is supplied through paste's sep argument).
# NOTE(review): presumably the fifth character encodes the term, so this maps
# the term-1 codes onto the codes used in the MP3/MP4 file -- confirm.
# firsfour and lastthree are helper columns; they stay in total_hs and are
# therefore carried into the joined table.
total_hs<-total[total$HS==1,]
total_hs$firsfour<-substr(total_hs$Course, 1, 4)
total_hs$lastthree<-substr(total_hs$Course, 6, 8)
total_hs$Course<-paste(total_hs$firsfour, total_hs$lastthree, sep="2")
nrow(total_hs)

## [1] 282

# Attach MP3/MP4 marks to the MP1/MP2 rows, matching on student and the
# recoded course. The row counts printed below are the same as before the
# join (282 = 111 + 171), so the join did not duplicate any rows.
Final<-left_join(total_hs, three_and_four_again, by=c("StudentID", "Course"))
nrow(Final)

## [1] 282

nrow(Final[Final$Spark==0,])

## [1] 111

nrow(Final[Final$Spark==1,])

## [1] 171

# Drop the column named "Final" that came from the MP3/MP4 sheet.
Final$Final <- NULL

Next we run our regressions. All our observations are at the student-subject level. The first three regressions have the following format:

Δ=α+β×SPARK

Here Δ is the difference in grades for a student-subject between marking period 1 and a subsequent marking period, α is a constant, and β is the coefficient on a dummy variable taking value 1 if and only if the student is in SPARK. The next three regressions have the format:

Δ=α+∑_i(β_i×SPARK×Class_i)+∑_i(γ_i×Class_i)

Here, γ is the coefficient on a specific subject--Math, English, or Science. A high value for, say, γ_m, the coefficient on math, tells us that student performance declined less in math than in the "default" subject (here chosen as science). A high value for β_m would therefore imply that students in SPARK saw even smaller decreases in math performance than non-SPARK students.

# Outcome variables: change in mark relative to marking period 1.
# Marks that cannot be read as numbers become NA, hence the coercion warnings.
# diff1 = MP2 - MP1
Final$diff1<-as.numeric(as.character(Final$Mark2))-as.numeric(as.character(Final$Mark1))

## Warning: NAs introduced by coercion

# diff2 = MP3 - MP1
Final$diff2<-as.numeric(as.character(Final$MP3))-as.numeric(as.character(Final$Mark1))

## Warning: NAs introduced by coercion

# diff3 = MP4 - MP1
Final$diff3<-as.numeric(as.character(Final$MP4))-as.numeric(as.character(Final$Mark1))

## Warning: NAs introduced by coercion

summary(Final$diff1)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -29.000  -9.000  -5.000  -4.905   0.000  23.000       9

# Baseline model: MP1-to-MP2 change regressed on the SPARK dummy.
lm1<-lm(diff1~Spark,data=Final)
summary(lm1)

## 
## Call:
## lm(formula = diff1 ~ Spark, data = Final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.0059  -4.0059   0.3846   3.9941  26.9941 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.3846     0.7333  -8.707 3.13e-16 ***
## Spark         2.3905     0.9320   2.565   0.0109 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.478 on 271 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.0237, Adjusted R-squared:  0.0201 
## F-statistic: 6.579 on 1 and 271 DF,  p-value: 0.01086

# Same model, requesting standard errors clustered by student.
# NOTE(review): the recorded output below shows NA for every standard error,
# t value and p-value (and an F-statistic of NaN on 0 DF), so the clustered
# errors were not actually computed. Investigate before relying on clustered
# inference; the later clustered summaries in this file show the same pattern.
summary(lm1, cluster=c("StudentID"))

## 
## Call:
## lm(formula = diff1 ~ Spark, data = Final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.0059  -4.0059   0.3846   3.9941  26.9941 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -6.385         NA      NA       NA
## Spark          2.391         NA      NA       NA
## 
## Residual standard error: 7.478 on 271 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.0237, Adjusted R-squared:  0.0201 
## F-statistic:   NaN on 1 and 0 DF,  p-value: NA

# Same specification for the MP1-to-MP3 change.
lm2<-lm(diff2~Spark,data=Final)
summary(lm2)

## 
## Call:
## lm(formula = diff2 ~ Spark, data = Final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.713  -6.165   0.835   7.287  24.835 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8.8350     0.9733  -9.077   <2e-16 ***
## Spark         0.5475     1.2479   0.439    0.661    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.878 on 261 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:  0.0007368,  Adjusted R-squared:  -0.003092 
## F-statistic: 0.1925 on 1 and 261 DF,  p-value: 0.6612

# Same specification for the MP1-to-MP4 change.
lm3<-lm(diff3~Spark,data=Final)
summary(lm3)

## 
## Call:
## lm(formula = diff3 ~ Spark, data = Final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.413  -4.192   0.029   4.587  24.587 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.0291     0.7667  -7.864 9.86e-14 ***
## Spark         1.4416     0.9829   1.467    0.144    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.781 on 261 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:  0.008174,   Adjusted R-squared:  0.004374 
## F-statistic: 2.151 on 1 and 261 DF,  p-value: 0.1437

# The first character of the course code is used as the subject key
# ("M", "S" and "E" are the values tested below).
Final$coursechar <- as.character(Final$Course)
Final$Type <- substr(Final$coursechar, 1, 1)

# SPARK-by-subject indicators: 1 only where both conditions are known to be
# TRUE, 0 everywhere else (a missing comparison counts as 0).
Final$Spark_math <- as.numeric((Final$Spark == 1 & Final$Type == "M") %in% TRUE)
Final$Spark_sci <- as.numeric((Final$Spark == 1 & Final$Type == "S") %in% TRUE)
Final$Spark_eng <- as.numeric((Final$Spark == 1 & Final$Type == "E") %in% TRUE)

# Plain subject indicators, built the same way.
Final$math <- as.numeric(Final$Type %in% "M")
Final$sci <- as.numeric(Final$Type %in% "S")
Final$eng <- as.numeric(Final$Type %in% "E")


# Subject-level model for the MP1-to-MP2 change: a separate SPARK coefficient
# for each subject plus math and English dummies. No sci dummy is included,
# so science is the baseline subject.
lm1_subj<-lm(diff1~Spark_math+Spark_sci+Spark_eng+math+eng,data=Final)
summary(lm1_subj)

## 
## Call:
## lm(formula = diff1 ~ Spark_math + Spark_sci + Spark_eng + math + 
##     eng, data = Final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.6429  -3.8214   0.1786   3.7059  25.4737 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8.0857     1.2577  -6.429  5.9e-10 ***
## Spark_math    3.2322     1.6124   2.005   0.0460 *  
## Spark_sci     2.9071     1.6033   1.813   0.0709 .  
## Spark_eng     0.9857     1.6033   0.615   0.5392    
## math          2.3798     1.7917   1.328   0.1852    
## eng           2.7429     1.7787   1.542   0.1242    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.441 on 267 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.04771,    Adjusted R-squared:  0.02987 
## F-statistic: 2.675 on 5 and 267 DF,  p-value: 0.02224

# NOTE(review): the recorded output below has NA for every standard error,
# t value and p-value, so the student-clustered errors were not computed.
summary(lm1_subj, cluster=c("StudentID"))

## 
## Call:
## lm(formula = diff1 ~ Spark_math + Spark_sci + Spark_eng + math + 
##     eng, data = Final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.6429  -3.8214   0.1786   3.7059  25.4737 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -8.0857         NA      NA       NA
## Spark_math    3.2322         NA      NA       NA
## Spark_sci     2.9071         NA      NA       NA
## Spark_eng     0.9857         NA      NA       NA
## math          2.3798         NA      NA       NA
## eng           2.7429         NA      NA       NA
## 
## Residual standard error: 7.441 on 267 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.04771,    Adjusted R-squared:  0.02987 
## F-statistic:   NaN on 5 and 0 DF,  p-value: NA

# Subject-level model for the MP1-to-MP3 change (science is the baseline).
lm2_subj<-lm(diff2~Spark_math+Spark_sci+Spark_eng+math+eng,data=Final)
summary(lm2_subj)

## 
## Call:
## lm(formula = diff2 ~ Spark_math + Spark_sci + Spark_eng + math + 
##     eng, data = Final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.396  -5.396   0.333   5.252  23.604 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -3.3714     1.5492  -2.176 0.030452 *  
## Spark_math    0.2778     2.0251   0.137 0.891008    
## Spark_sci     0.2394     1.9963   0.120 0.904656    
## Spark_eng     1.3962     1.9963   0.699 0.484924    
## math         -8.9619     2.2239  -4.030 7.36e-05 ***
## eng          -7.6286     2.1909  -3.482 0.000585 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.165 on 257 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:  0.153,  Adjusted R-squared:  0.1365 
## F-statistic: 9.282 on 5 and 257 DF,  p-value: 3.854e-08

# NOTE(review): the recorded output below has NA for every standard error,
# t value and p-value, so the student-clustered errors were not computed.
summary(lm2_subj, cluster=c("StudentID"))

## 
## Call:
## lm(formula = diff2 ~ Spark_math + Spark_sci + Spark_eng + math + 
##     eng, data = Final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.396  -5.396   0.333   5.252  23.604 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -3.3714         NA      NA       NA
## Spark_math    0.2778         NA      NA       NA
## Spark_sci     0.2394         NA      NA       NA
## Spark_eng     1.3962         NA      NA       NA
## math         -8.9619         NA      NA       NA
## eng          -7.6286         NA      NA       NA
## 
## Residual standard error: 9.165 on 257 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:  0.153,  Adjusted R-squared:  0.1365 
## F-statistic:   NaN on 5 and 0 DF,  p-value: NA

# Subject-level model for the MP1-to-MP4 change (science is the baseline).
lm3_subj<-lm(diff3~Spark_math+Spark_sci+Spark_eng+math+eng,data=Final)
summary(lm3_subj)

## 
## Call:
## lm(formula = diff3 ~ Spark_math + Spark_sci + Spark_eng + math + 
##     eng, data = Final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.113  -4.560  -0.463   4.368  25.887 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -3.42857    1.30296  -2.631  0.00902 **
## Spark_math   2.85690    1.70323   1.677  0.09469 . 
## Spark_sci    0.08895    1.67894   0.053  0.95779   
## Spark_eng    1.45606    1.67894   0.867  0.38661   
## math        -3.96537    1.87038  -2.120  0.03496 * 
## eng         -3.91429    1.84267  -2.124  0.03460 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.708 on 257 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:  0.04149,    Adjusted R-squared:  0.02284 
## F-statistic: 2.225 on 5 and 257 DF,  p-value: 0.05233

# NOTE(review): the recorded output below has NA for every standard error,
# t value and p-value, so the student-clustered errors were not computed.
summary(lm3_subj, cluster=c("StudentID"))

## 
## Call:
## lm(formula = diff3 ~ Spark_math + Spark_sci + Spark_eng + math + 
##     eng, data = Final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.113  -4.560  -0.463   4.368  25.887 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.42857         NA      NA       NA
## Spark_math   2.85690         NA      NA       NA
## Spark_sci    0.08895         NA      NA       NA
## Spark_eng    1.45606         NA      NA       NA
## math        -3.96537         NA      NA       NA
## eng         -3.91429         NA      NA       NA
## 
## Residual standard error: 7.708 on 257 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:  0.04149,    Adjusted R-squared:  0.02284 
## F-statistic:   NaN on 5 and 0 DF,  p-value: NA

We next run three regressions, again with subject dummies, but this time we compare performance between:

Marking period 2 and 3
Marking period 3 and 4
Marking period 2 and 4

# Change from MP2 to MP3, using the same subject-level specification.
Final$diff_intermediate1<-as.numeric(as.character(Final$MP3))-as.numeric(as.character(Final$Mark2))
lm_intermediate1<-lm(diff_intermediate1~Spark_math+Spark_sci+Spark_eng+math+eng,data=Final)
summary(lm_intermediate1)

## 
## Call:
## lm(formula = diff_intermediate1 ~ Spark_math + Spark_sci + Spark_eng + 
##     math + eng, data = Final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30.9259  -4.8056   0.5833   5.8286  22.0741 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.806      1.472   3.264  0.00124 ** 
## Spark_math    -3.783      1.917  -1.973  0.04949 *  
## Spark_sci     -2.880      1.901  -1.515  0.13092    
## Spark_eng     -0.287      1.901  -0.151  0.88006    
## math         -10.634      2.097  -5.072 7.45e-07 ***
## eng          -10.389      2.082  -4.990 1.10e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.833 on 263 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.2472, Adjusted R-squared:  0.2329 
## F-statistic: 17.27 on 5 and 263 DF,  p-value: 8.895e-15

# Change from MP3 to MP4, using the same subject-level specification.
Final$diff_intermediate2<-as.numeric(as.character(Final$MP4))-as.numeric(as.character(Final$MP3))
lm_intermediate2<-lm(diff_intermediate2~Spark_math+Spark_sci+Spark_eng+math+eng,data=Final)
summary(lm_intermediate2)

## 
## Call:
## lm(formula = diff_intermediate2 ~ Spark_math + Spark_sci + Spark_eng + 
##     math + eng, data = Final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.5185  -3.5556   0.0278   2.4815  20.2037 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.02778    0.74732  -0.037 0.970378    
## Spark_math   2.86138    0.97301   2.941 0.003566 ** 
## Spark_sci   -0.17593    0.96478  -0.182 0.855450    
## Spark_eng    0.37037    0.96478   0.384 0.701369    
## math         4.68492    1.06439   4.402 1.56e-05 ***
## eng          3.58333    1.05686   3.391 0.000805 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.484 on 263 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.2852, Adjusted R-squared:  0.2717 
## F-statistic: 20.99 on 5 and 263 DF,  p-value: < 2.2e-16

# Change from MP2 to MP4, using the same subject-level specification.
Final$diff_ends<-as.numeric(as.character(Final$MP4))-as.numeric(as.character(Final$Mark2))
lm_ends<-lm(diff_ends~Spark_math+Spark_sci+Spark_eng+math+eng,data=Final)
summary(lm_ends)

## 
## Call:
## lm(formula = diff_ends ~ Spark_math + Spark_sci + Spark_eng + 
##     math + eng, data = Final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.9074  -3.7222   0.9444   4.2222  17.0926 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.77778    1.16935   4.086 5.84e-05 ***
## Spark_math  -0.92116    1.52251  -0.605 0.545681    
## Spark_sci   -3.05556    1.50963  -2.024 0.043976 *  
## Spark_eng    0.08333    1.50963   0.055 0.956020    
## math        -5.94921    1.66548  -3.572 0.000421 ***
## eng         -6.80556    1.65371  -4.115 5.18e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.016 on 263 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.1101, Adjusted R-squared:  0.09319 
## F-statistic: 6.509 on 5 and 263 DF,  p-value: 1.001e-05

# Quick plots of the raw marks in each marking period, one point per row.
# The marks are stored as text (the dropout table later in this file prints
# them as <chr>), so plot() has to coerce them; entries that are not numbers
# become NA, which is the source of the warning for Mark1.
plot(Final$Mark1)

## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion

plot(Final$Mark2)

plot(Final$MP3)

plot(Final$MP4)

Next, we note that a small number of kids are in a more advanced math course. The first letter of the course identifier indicates the broad subject, whereas the second indicates the specific course. A second letter "E" indicates algebra 1, whereas a second letter "G" indicates a more advanced geometry course. We add dummies for this higher level math course and for its interaction with SPARK. Nothing important changes.

The number of students in a different science level was tiny, so we do not bother cutting up the sample along that dimension.

All students were at the same level of English.

# The first two characters of the course code give subject plus course level;
# "MG" is the value used for the advanced math course.
Final$Type2 <- substr(Final$coursechar, 1, 2)

# Indicator for the advanced math course and its interaction with SPARK
# (1 only where the condition is known to be TRUE, 0 otherwise).
Final$adv_math <- as.numeric(Final$Type2 %in% "MG")
Final$Spark_adv_math <- as.numeric((Final$Spark == 1 & Final$Type2 == "MG") %in% TRUE)

# MP1-to-MP4 subject-level model with the two advanced-math terms added.
lm3_subj_adv <- lm(
  diff3 ~ Spark_math + Spark_sci + Spark_eng + math + eng +
    adv_math + Spark_adv_math,
  data = Final
)
summary(lm3_subj_adv)

## 
## Call:
## lm(formula = diff3 ~ Spark_math + Spark_sci + Spark_eng + math + 
##     eng + adv_math + Spark_adv_math, data = Final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.113  -4.524  -0.113   4.343  25.887 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    -3.42857    1.30788  -2.621  0.00928 **
## Spark_math      2.73505    1.78113   1.536  0.12588   
## Spark_sci       0.08895    1.68528   0.053  0.95795   
## Spark_eng       1.45606    1.68528   0.864  0.38841   
## math           -3.91518    1.89248  -2.069  0.03957 * 
## eng            -3.91429    1.84962  -2.116  0.03529 * 
## adv_math       -1.65625    7.85751  -0.211  0.83322   
## Spark_adv_math  2.13995    8.39795   0.255  0.79907   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.738 on 255 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:  0.04175,    Adjusted R-squared:  0.01545 
## F-statistic: 1.587 on 7 and 255 DF,  p-value: 0.1394

Next we look at the kids who left our sample after Marking Period 2. Note that the SPARK students who dropped out of the sample had high MP1 grades, and an MP1 to MP2 drop that was not unusually large. We do not know why they left the sample. The overall number of kids leaving the sample was not large.

# Rows with no MP4 mark are treated as having left the sample.
dropouts <- Final[which(is.na(Final$MP4)),]
# Replace the student IDs with anonymized IDs (crc32 algorithm) so the table
# printed below does not show the real IDs.
dropouts$anonID<-anonymize(dropouts$StudentID, .n_chars = 5, .algo = "crc32")
# Columns to display: the anonymized ID, the four marks and the SPARK dummy.
myvars<-c("anonID", "Mark1", "Mark2", "MP3", "MP4", "Spark")

dropouts_anon<-dropouts[myvars]
dropouts_anon

## # A tibble: 13 x 6
##    anonID   Mark1 Mark2 MP3   MP4   Spark
##    <chr>    <chr> <chr> <chr> <chr> <dbl>
##  1 b4d27052 84    80    <NA>  <NA>      1
##  2 b4d27052 80    70    <NA>  <NA>      1
##  3 b4d27052 85    83    <NA>  <NA>      1
##  4 281c8b1c 89    80    <NA>  <NA>      1
##  5 281c8b1c 85    87    <NA>  <NA>      1
##  6 281c8b1c 88    85    <NA>  <NA>      1
##  7 8e773d69 99    94    <NA>  <NA>      1
##  8 8e773d69 98    97    <NA>  <NA>      1
##  9 8e773d69 93    88    <NA>  <NA>      1
## 10 2a98d8da 65    65    <NA>  <NA>      0
## 11 95e6d8a7 99    <NA>  <NA>  <NA>      0
## 12 95e6d8a7 99    <NA>  <NA>  <NA>      0
## 13 95e6d8a7 97    <NA>  <NA>  <NA>      0

# MP1-to-MP2 change for each dropout row; rows without an MP2 mark give NA.
as.numeric(dropouts$Mark2)-as.numeric(dropouts$Mark1)

##  [1]  -4 -10  -2  -9   2  -3  -5  -1  -5   0  NA  NA  NA

# Distribution of that change among the dropouts.
summary(as.numeric(dropouts$Mark2)-as.numeric(dropouts$Mark1))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -10.00   -5.00   -3.50   -3.70   -1.25    2.00       3

# The same MP1-to-MP2 change over the full sample, for comparison with the
# dropouts.
summary(as.numeric(Final$Mark2)-as.numeric(Final$Mark1))

## Warning in summary(as.numeric(Final$Mark2) - as.numeric(Final$Mark1)): NAs
## introduced by coercion

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -29.000  -9.000  -5.000  -4.905   0.000  23.000       9

# Mean MP1-to-MP2 change among the dropouts. Rows without an MP2 mark have an
# NA diff, so they are excluded here; without na.rm = TRUE the mean is NA.
mean(dropouts$diff, na.rm = TRUE)

## [1] -3.7

# Save the merged analysis table to the working directory (write.csv also
# writes the row names as a first column by default).
# NOTE(review): Final still contains the raw StudentID column, so this file
# should be kept out of any public repository.
write.csv(Final, file = "Final_Spark.csv")