output: html_document

language: R

The Data project is completed in R markdown file. The output of the document is html.

The assumptions made and steps included are defined in the project along with the codes.

Subgroup Passing Rates

Installing required libraries

library(plyr)
## Warning: package 'plyr' was built under R version 3.2.5
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.5
library(reshape)
## Warning: package 'reshape' was built under R version 3.2.5
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following objects are masked from 'package:plyr':
## 
##     rename, round_any
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:reshape':
## 
##     rename
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.2.5
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:reshape':
## 
##     colsplit, melt, recast

Data Collection

#set working directory
setwd("C:/Users/Gurpreet/Documents/DoE")


#reading the data from three files
data1 <- read.table("schools.txt", header = TRUE, sep="\t")
data2 <-read.table("scores.txt", header= TRUE, sep="\t")
data3 <- read.table("demographics.txt", header = TRUE, sep="\t")


#structure of three datasets
head(data1)
##   StudentID School
## 1    257213      A
## 2    255735      B
## 3    254427      B
## 4    257689      B
## 5    253216      B
## 6    255711      B
head(data2)
##   StudentID subject score
## 1    259392     Ela  36.0
## 2    257193     Ela  67.0
## 3    256465     soc  93.5
## 4    259953    Math  31.0
## 5    253458     SOC  96.5
## 6    255874    MATH  71.0
head(data3)
##   StudentID Special_Ed ELL
## 1    252904          Z   U
## 2    250075          U   O
## 3    253820          5   U
## 4    255913          A   F
## 5    253777          Y   O
## 6    252712          4   I

The column subject in data2 has 9 levels: ela,ELA,Ela, Math, Math, MATH, soc, Soc,SOC. Assuming that these 9 different levels corresponds to three unique levels ELA, MATH, SOC. We will convert these 9 levels into 3 unique levels.

levels(data2$subject)[levels(data2$subject)=="ela"] <- "ELA"
levels(data2$subject)[levels(data2$subject)=="Ela"] <- "ELA"

levels(data2$subject)[levels(data2$subject)=="math"] <- "MATH"
levels(data2$subject)[levels(data2$subject)=="Math"] <- "MATH"

levels(data2$subject)[levels(data2$subject)=="soc"]  <- "SOC"
levels(data2$subject)[levels(data2$subject)=="Soc"]  <- "SOC"


levels(data2$subject)  # 3 unique values for column subject are left
## [1] "ELA"  "MATH" "SOC"

Merge the data sets into a single r dataframe

df <-Reduce(function(x, y) merge(x, y, all=TRUE), list(data1, data2, data3))
df <-na.omit(df)  # omitting the NA values

head(df)
##    StudentID School subject score Special_Ed ELL
## 1     250001      B     SOC  89.0          K   D
## 3     250003      A     ELA  30.5          K   I
## 7     250008      A    MATH  30.0          K   U
## 8     250008      A     SOC  32.5          K   U
## 9     250009      B     SOC  68.5          Y   I
## 10    250009      B     ELA  83.0          Y   I
Mapping the values

One of the requirement of the task is to group ELL and Special_Ed column to 0 and 1. That is the student is ELL if his/her flag is “A”, “E”, “I”, “O”, “U”. We map these values to 1. Values other than these are mapped to 0.

df$ELL = mapvalues(df$ELL, from = c("A", "E", "I", "O", "U"), to = c(1, 1, 1, 1,1))
df$ELL = mapvalues(df$ELL, from = c("B", "D", "F"), to = c(0,0,0))

Similarly special education recieients have a flag “Y” which should be converted to 1 and all other values be converted to 0.

df$Special_Ed = mapvalues(df$Special_Ed, from = c("Y"), to = c(1))
df$Special_Ed = mapvalues(df$Special_Ed, from = c(4, 5, "A", "K", "U", "Z"), to = c(0, 0, 0, 0,0, 0))

head(df)
##    StudentID School subject score Special_Ed ELL
## 1     250001      B     SOC  89.0          0   0
## 3     250003      A     ELA  30.5          0   1
## 7     250008      A    MATH  30.0          0   1
## 8     250008      A     SOC  32.5          0   1
## 9     250009      B     SOC  68.5          1   1
## 10    250009      B     ELA  83.0          1   1

Analysis

In order the subgroup the passing rate we will generate subsets of dataframe df accordingly.

1. Pass rate for School A

school_A <- subset(df, df$School== "A")

pass_A <- subset(school_A, df$score >= 65)

nrow(pass_A)
## [1] 2801
#pass_rate `for` school A 
(nrow(pass_A)/nrow(school_A))*100
## [1] 90.26748

2. Pass rate for Math for School A

sch_A_Math <- subset(pass_A, df$subject=="MATH")
nrow(sch_A_Math)
## [1] 1976
#pass Math school A
(nrow(sch_A_Math)/nrow(pass_A))*100
## [1] 70.54623

3. Pass rate for ELA for School A

sch_A_ELA <- subset(pass_A, df$subject=="ELA")
nrow(sch_A_ELA)
## [1] 1982
#pass ELA school A
(nrow(sch_A_ELA)/nrow(pass_A))*100
## [1] 70.76044

4. Pass rate for ELL students for School A

school_A_ELL <- subset(school_A, school_A$ELL == 1)
head(school_A_ELL)
##    StudentID School subject score Special_Ed ELL
## 3     250003      A     ELA  30.5          0   1
## 7     250008      A    MATH  30.0          0   1
## 8     250008      A     SOC  32.5          0   1
## 15    250014      A     ELA  29.5          1   1
## 23    250020      A     SOC  69.5          0   1
## 24    250020      A     SOC  60.5          0   1
nrow(school_A_ELL)
## [1] 1862
school_A_ELL_pass <- subset(school_A_ELL, school_A_ELL$score >= 65)


#pass rate for ELL students for school A

(nrow(school_A_ELL_pass)/nrow(school_A_ELL))*100
## [1] 46.50913

5. Pass rate for Special Ed reciepients for School A

school_A_SpEd <- subset(school_A, school_A$Special_Ed == 1)
head(school_A_SpEd)
##     StudentID School subject score Special_Ed ELL
## 14     250012      A     ELA  88.5          1   0
## 15     250014      A     ELA  29.5          1   1
## 127    250101      A     SOC  38.0          1   0
## 159    250125      A     ELA  25.5          1   1
## 190    250150      A     ELA  85.0          1   0
## 211    250166      A    MATH  63.0          1   0
nrow(school_A_SpEd)
## [1] 455
school_A_SpEd_pass <- subset(school_A_SpEd, school_A_SpEd$score >= 65)


#pass rate for Special Ed students for school A

(nrow(school_A_SpEd_pass)/nrow(school_A_SpEd))*100
## [1] 42.41758

6. Pass rate for ELL and Special Ed reciepients for School A

school_A_ELL_SpEd <- subset(school_A, school_A$ELL == 1 & school_A$Special_Ed == 1)

pass_schol_A_ELL_SpEd <- subset(school_A_ELL_SpEd, school_A_ELL_SpEd$score >= 65)

(nrow(pass_schol_A_ELL_SpEd)/nrow(school_A_ELL_SpEd))*100
## [1] 42.85714

7. Pass rate for School B

school_B <- subset(df, df$School== "B")
head(school_B)
##    StudentID School subject   score Special_Ed ELL
## 1     250001      B     SOC 89.0000          0   0
## 9     250009      B     SOC 68.5000          1   1
## 10    250009      B     ELA 83.0000          1   1
## 11    250010      B     SOC 71.0000          0   0
## 12    250010      B    MATH 65.7657          0   0
## 17    250016      B    MATH 45.5000          0   0
pass_B <- subset(school_B, df$score >= 65)
#pass_rate `for` school B
(nrow(pass_B)/nrow(school_B))*100
## [1] 93.67893

8. Pass rate for Math for School B

sch_B_Math <- subset(pass_B, df$subject=="MATH")
nrow(sch_B_Math)
## [1] 1976
#pass Math school B
(nrow(sch_B_Math)/nrow(pass_B))*100
## [1] 70.54623

9. Pass rate for ELA for School B

sch_B_ELA <- subset(pass_B, df$subject=="ELA")
nrow(sch_B_ELA)
## [1] 1982
#pass ELA school B
(nrow(sch_B_ELA)/nrow(pass_B))*100
## [1] 70.76044

10. Pass rate for ELL students for School B

school_B_ELL <- subset(school_B, school_B$ELL == 1)
head(school_B_ELL)
##    StudentID School subject score Special_Ed ELL
## 9     250009      B     SOC  68.5          1   1
## 10    250009      B     ELA  83.0          1   1
## 25    250021      B     SOC  74.0          0   1
## 63    250050      B     ELA  25.5          0   1
## 64    250050      B    MATH  99.0          0   1
## 65    250051      B     SOC  53.5          0   1
nrow(school_B_ELL)
## [1] 1828
school_B_ELL_pass <- subset(school_B_ELL, school_B_ELL$score >= 65)


#pass rate for ELL students for school B

(nrow(school_B_ELL_pass)/nrow(school_B_ELL))*100
## [1] 45.51422

11. Pass rate for Special Ed reciepients for School B

school_B_SpEd <- subset(school_B, school_B$Special_Ed == 1)
head(school_B_SpEd)
##     StudentID School subject score Special_Ed ELL
## 9      250009      B     SOC  68.5          1   1
## 10     250009      B     ELA  83.0          1   1
## 70     250056      B    MATH  37.5          1   1
## 84     250067      B     ELA  59.5          1    
## 85     250067      B     ELA  79.0          1    
## 125    250099      B     SOC  74.5          1   0
nrow(school_B_SpEd)
## [1] 450
school_B_SpEd_pass <- subset(school_B_SpEd, school_B_SpEd$score >= 65)


#pass rate for Special Ed students for school B

(nrow(school_B_SpEd_pass)/nrow(school_B_SpEd))*100
## [1] 48

12. Pass rate for ELL and Special Ed reciepients for School B

school_B_ELL_SpEd <- subset(school_B, school_B$ELL == 1 & school_B$Special_Ed == 1)

pass_schol_B_ELL_SpEd <- subset(school_B_ELL_SpEd, school_B_ELL_SpEd$score >= 65)

(nrow(pass_schol_B_ELL_SpEd)/nrow(school_B_ELL_SpEd))*100
## [1] 48.31461

13. Pass rate for ELL students for school A and B

df_ELL <- subset(df, df$ELL == 1)
df_ELL_pass <- subset(df_ELL, df_ELL$score >= 65)

(nrow(df_ELL_pass)/nrow(df_ELL))*100
## [1] 46.01626

14. Pass rate for Special Ed students for school A and B

df_SpEd <- subset(df, df$Special_Ed == 1)
df_SpEd_pass <- subset(df_SpEd, df_SpEd$score >= 65)

(nrow(df_SpEd_pass)/nrow(df_SpEd))*100
## [1] 45.19337

15. Pass rate for ELL andSpecial Ed students for school A and B

df_ELL_SpEd <- subset(df, df$Special_Ed == 1 & df$ELL ==1)
df_ELL_SpEd_pass <- subset(df_ELL_SpEd, df_ELL_SpEd$score >= 65)

(nrow(df_ELL_SpEd_pass)/nrow(df_ELL_SpEd))*100
## [1] 45.55556

16. Pass rate for all students for school A and B

df_pass <- subset(df, df$score >= 65)

(nrow(df_pass)/nrow(df))*100
## [1] 45.97079

Part B

Creating a flat student level dataset

The dataframe df will be broken down into two groups, one with score 65 and more, second with score less than 65. Additon of another column (score_cat)for group one with values 1 and values 0 for the second group. That is student having apsiing score will have score_cat value 1 while 0 for the rest.

sc_more <-subset(df,df$score >= 65)
sc_less <- subset(df, df$score <65)


sc_less <- transform(sc_less, score_cat = 0)
sc_more <- transform(sc_more, score_cat =1)

#merge two dataframes 
df <-Reduce(function(x, y) merge(x, y, all=TRUE), list(sc_more, sc_less))
df <-na.omit(df)
df <- df[c("StudentID","School", "subject", "score","score_cat","Special_Ed", "ELL")]
#removing the duplicates
df <- deduped.data<- unique(df[, 1:7])

convert the long data into swide format to satisfy three conditions

Requirements

B1) any student who is ELL should have ELL = 1, otherwise ELL = 0

B2) any student who is receiving Special Ed should have Special_Ed = 1, otherwise Special_Ed = 0

B3) Any student who has passed a particular exam should have a 1 for that exam. Any student who has failed a particular exam should have a 0 for that exam. If the student did not take an exam, the value should be left empty

#converting data into wide format using dcast by taking subject as timevar (i.e transposing subject column into rows with score_cat values accordingly)
df2<-dcast(df, StudentID+School+score+ELL+Special_Ed~subject, value.var = 'score_cat')
head(df2)
##   StudentID School score ELL Special_Ed ELA MATH SOC
## 1    250001      B  89.0   0          0  NA   NA   1
## 2    250003      A  30.5   1          0   0   NA  NA
## 3    250008      A  30.0   1          0  NA    0  NA
## 4    250008      A  32.5   1          0  NA   NA   0
## 5    250009      B  68.5   1          1  NA   NA   1
## 6    250009      B  83.0   1          1   1   NA  NA
df2<-sapply(df2,as.character)
df2[is.na(df2)]<- " "

df2<-df2[,-c(3)]
head(as.data.frame(df2))
##   StudentID School ELL Special_Ed ELA MATH SOC
## 1    250001      B   0          0            1
## 2    250003      A   1          0   0         
## 3    250008      A   1          0        0    
## 4    250008      A   1          0            0
## 5    250009      B   1          1            1
## 6    250009      B   1          1   1
#convert r dataframe to txt file and save in the folder providing the path
write.table(df2,"C:/Users/Gurpreet/Documents/DoE/mydata.txt", sep="\t")