output: html_documentlanguage: RThe Data project is completed in R markdown file. The output of the document is html.
The assumptions made and steps included are defined in the project along with the codes.
library(plyr)
## Warning: package 'plyr' was built under R version 3.2.5
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.5
library(reshape)
## Warning: package 'reshape' was built under R version 3.2.5
##
## Attaching package: 'reshape'
## The following object is masked from 'package:tidyr':
##
## expand
## The following objects are masked from 'package:plyr':
##
## rename, round_any
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:reshape':
##
## rename
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.2.5
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:reshape':
##
## colsplit, melt, recast
#set working directory
setwd("C:/Users/Gurpreet/Documents/DoE")
#reading the data from three files
data1 <- read.table("schools.txt", header = TRUE, sep="\t")
data2 <-read.table("scores.txt", header= TRUE, sep="\t")
data3 <- read.table("demographics.txt", header = TRUE, sep="\t")
#structure of three datasets
head(data1)
## StudentID School
## 1 257213 A
## 2 255735 B
## 3 254427 B
## 4 257689 B
## 5 253216 B
## 6 255711 B
head(data2)
## StudentID subject score
## 1 259392 Ela 36.0
## 2 257193 Ela 67.0
## 3 256465 soc 93.5
## 4 259953 Math 31.0
## 5 253458 SOC 96.5
## 6 255874 MATH 71.0
head(data3)
## StudentID Special_Ed ELL
## 1 252904 Z U
## 2 250075 U O
## 3 253820 5 U
## 4 255913 A F
## 5 253777 Y O
## 6 252712 4 I
The column subject in data2 has 9 levels: ela,ELA,Ela, Math, Math, MATH, soc, Soc,SOC. Assuming that these 9 different levels corresponds to three unique levels ELA, MATH, SOC. We will convert these 9 levels into 3 unique levels.
levels(data2$subject)[levels(data2$subject)=="ela"] <- "ELA"
levels(data2$subject)[levels(data2$subject)=="Ela"] <- "ELA"
levels(data2$subject)[levels(data2$subject)=="math"] <- "MATH"
levels(data2$subject)[levels(data2$subject)=="Math"] <- "MATH"
levels(data2$subject)[levels(data2$subject)=="soc"] <- "SOC"
levels(data2$subject)[levels(data2$subject)=="Soc"] <- "SOC"
levels(data2$subject) # 3 unique values for column subject are left
## [1] "ELA" "MATH" "SOC"
df <-Reduce(function(x, y) merge(x, y, all=TRUE), list(data1, data2, data3))
df <-na.omit(df) # omitting the NA values
head(df)
## StudentID School subject score Special_Ed ELL
## 1 250001 B SOC 89.0 K D
## 3 250003 A ELA 30.5 K I
## 7 250008 A MATH 30.0 K U
## 8 250008 A SOC 32.5 K U
## 9 250009 B SOC 68.5 Y I
## 10 250009 B ELA 83.0 Y I
One of the requirement of the task is to group ELL and Special_Ed column to 0 and 1. That is the student is ELL if his/her flag is “A”, “E”, “I”, “O”, “U”. We map these values to 1. Values other than these are mapped to 0.
df$ELL = mapvalues(df$ELL, from = c("A", "E", "I", "O", "U"), to = c(1, 1, 1, 1,1))
df$ELL = mapvalues(df$ELL, from = c("B", "D", "F"), to = c(0,0,0))
Similarly special education recieients have a flag “Y” which should be converted to 1 and all other values be converted to 0.
df$Special_Ed = mapvalues(df$Special_Ed, from = c("Y"), to = c(1))
df$Special_Ed = mapvalues(df$Special_Ed, from = c(4, 5, "A", "K", "U", "Z"), to = c(0, 0, 0, 0,0, 0))
head(df)
## StudentID School subject score Special_Ed ELL
## 1 250001 B SOC 89.0 0 0
## 3 250003 A ELA 30.5 0 1
## 7 250008 A MATH 30.0 0 1
## 8 250008 A SOC 32.5 0 1
## 9 250009 B SOC 68.5 1 1
## 10 250009 B ELA 83.0 1 1
In order the subgroup the passing rate we will generate subsets of dataframe df accordingly.
school_A <- subset(df, df$School== "A")
pass_A <- subset(school_A, df$score >= 65)
nrow(pass_A)
## [1] 2801
#pass_rate `for` school A
(nrow(pass_A)/nrow(school_A))*100
## [1] 90.26748
sch_A_Math <- subset(pass_A, df$subject=="MATH")
nrow(sch_A_Math)
## [1] 1976
#pass Math school A
(nrow(sch_A_Math)/nrow(pass_A))*100
## [1] 70.54623
sch_A_ELA <- subset(pass_A, df$subject=="ELA")
nrow(sch_A_ELA)
## [1] 1982
#pass ELA school A
(nrow(sch_A_ELA)/nrow(pass_A))*100
## [1] 70.76044
school_A_ELL <- subset(school_A, school_A$ELL == 1)
head(school_A_ELL)
## StudentID School subject score Special_Ed ELL
## 3 250003 A ELA 30.5 0 1
## 7 250008 A MATH 30.0 0 1
## 8 250008 A SOC 32.5 0 1
## 15 250014 A ELA 29.5 1 1
## 23 250020 A SOC 69.5 0 1
## 24 250020 A SOC 60.5 0 1
nrow(school_A_ELL)
## [1] 1862
school_A_ELL_pass <- subset(school_A_ELL, school_A_ELL$score >= 65)
#pass rate for ELL students for school A
(nrow(school_A_ELL_pass)/nrow(school_A_ELL))*100
## [1] 46.50913
school_A_SpEd <- subset(school_A, school_A$Special_Ed == 1)
head(school_A_SpEd)
## StudentID School subject score Special_Ed ELL
## 14 250012 A ELA 88.5 1 0
## 15 250014 A ELA 29.5 1 1
## 127 250101 A SOC 38.0 1 0
## 159 250125 A ELA 25.5 1 1
## 190 250150 A ELA 85.0 1 0
## 211 250166 A MATH 63.0 1 0
nrow(school_A_SpEd)
## [1] 455
school_A_SpEd_pass <- subset(school_A_SpEd, school_A_SpEd$score >= 65)
#pass rate for Special Ed students for school A
(nrow(school_A_SpEd_pass)/nrow(school_A_SpEd))*100
## [1] 42.41758
school_A_ELL_SpEd <- subset(school_A, school_A$ELL == 1 & school_A$Special_Ed == 1)
pass_schol_A_ELL_SpEd <- subset(school_A_ELL_SpEd, school_A_ELL_SpEd$score >= 65)
(nrow(pass_schol_A_ELL_SpEd)/nrow(school_A_ELL_SpEd))*100
## [1] 42.85714
school_B <- subset(df, df$School== "B")
head(school_B)
## StudentID School subject score Special_Ed ELL
## 1 250001 B SOC 89.0000 0 0
## 9 250009 B SOC 68.5000 1 1
## 10 250009 B ELA 83.0000 1 1
## 11 250010 B SOC 71.0000 0 0
## 12 250010 B MATH 65.7657 0 0
## 17 250016 B MATH 45.5000 0 0
pass_B <- subset(school_B, df$score >= 65)
#pass_rate `for` school B
(nrow(pass_B)/nrow(school_B))*100
## [1] 93.67893
sch_B_Math <- subset(pass_B, df$subject=="MATH")
nrow(sch_B_Math)
## [1] 1976
#pass Math school B
(nrow(sch_B_Math)/nrow(pass_B))*100
## [1] 70.54623
sch_B_ELA <- subset(pass_B, df$subject=="ELA")
nrow(sch_B_ELA)
## [1] 1982
#pass ELA school B
(nrow(sch_B_ELA)/nrow(pass_B))*100
## [1] 70.76044
school_B_ELL <- subset(school_B, school_B$ELL == 1)
head(school_B_ELL)
## StudentID School subject score Special_Ed ELL
## 9 250009 B SOC 68.5 1 1
## 10 250009 B ELA 83.0 1 1
## 25 250021 B SOC 74.0 0 1
## 63 250050 B ELA 25.5 0 1
## 64 250050 B MATH 99.0 0 1
## 65 250051 B SOC 53.5 0 1
nrow(school_B_ELL)
## [1] 1828
school_B_ELL_pass <- subset(school_B_ELL, school_B_ELL$score >= 65)
#pass rate for ELL students for school B
(nrow(school_B_ELL_pass)/nrow(school_B_ELL))*100
## [1] 45.51422
school_B_SpEd <- subset(school_B, school_B$Special_Ed == 1)
head(school_B_SpEd)
## StudentID School subject score Special_Ed ELL
## 9 250009 B SOC 68.5 1 1
## 10 250009 B ELA 83.0 1 1
## 70 250056 B MATH 37.5 1 1
## 84 250067 B ELA 59.5 1
## 85 250067 B ELA 79.0 1
## 125 250099 B SOC 74.5 1 0
nrow(school_B_SpEd)
## [1] 450
school_B_SpEd_pass <- subset(school_B_SpEd, school_B_SpEd$score >= 65)
#pass rate for Special Ed students for school B
(nrow(school_B_SpEd_pass)/nrow(school_B_SpEd))*100
## [1] 48
school_B_ELL_SpEd <- subset(school_B, school_B$ELL == 1 & school_B$Special_Ed == 1)
pass_schol_B_ELL_SpEd <- subset(school_B_ELL_SpEd, school_B_ELL_SpEd$score >= 65)
(nrow(pass_schol_B_ELL_SpEd)/nrow(school_B_ELL_SpEd))*100
## [1] 48.31461
df_ELL <- subset(df, df$ELL == 1)
df_ELL_pass <- subset(df_ELL, df_ELL$score >= 65)
(nrow(df_ELL_pass)/nrow(df_ELL))*100
## [1] 46.01626
df_SpEd <- subset(df, df$Special_Ed == 1)
df_SpEd_pass <- subset(df_SpEd, df_SpEd$score >= 65)
(nrow(df_SpEd_pass)/nrow(df_SpEd))*100
## [1] 45.19337
df_ELL_SpEd <- subset(df, df$Special_Ed == 1 & df$ELL ==1)
df_ELL_SpEd_pass <- subset(df_ELL_SpEd, df_ELL_SpEd$score >= 65)
(nrow(df_ELL_SpEd_pass)/nrow(df_ELL_SpEd))*100
## [1] 45.55556
df_pass <- subset(df, df$score >= 65)
(nrow(df_pass)/nrow(df))*100
## [1] 45.97079
The dataframe df will be broken down into two groups, one with score 65 and more, second with score less than 65. Additon of another column (score_cat)for group one with values 1 and values 0 for the second group. That is student having apsiing score will have score_cat value 1 while 0 for the rest.
sc_more <-subset(df,df$score >= 65)
sc_less <- subset(df, df$score <65)
sc_less <- transform(sc_less, score_cat = 0)
sc_more <- transform(sc_more, score_cat =1)
#merge two dataframes
df <-Reduce(function(x, y) merge(x, y, all=TRUE), list(sc_more, sc_less))
df <-na.omit(df)
df <- df[c("StudentID","School", "subject", "score","score_cat","Special_Ed", "ELL")]
#removing the duplicates
df <- deduped.data<- unique(df[, 1:7])
B1) any student who is ELL should have ELL = 1, otherwise ELL = 0
B2) any student who is receiving Special Ed should have Special_Ed = 1, otherwise Special_Ed = 0
B3) Any student who has passed a particular exam should have a 1 for that exam. Any student who has failed a particular exam should have a 0 for that exam. If the student did not take an exam, the value should be left empty
#converting data into wide format using dcast by taking subject as timevar (i.e transposing subject column into rows with score_cat values accordingly)
df2<-dcast(df, StudentID+School+score+ELL+Special_Ed~subject, value.var = 'score_cat')
head(df2)
## StudentID School score ELL Special_Ed ELA MATH SOC
## 1 250001 B 89.0 0 0 NA NA 1
## 2 250003 A 30.5 1 0 0 NA NA
## 3 250008 A 30.0 1 0 NA 0 NA
## 4 250008 A 32.5 1 0 NA NA 0
## 5 250009 B 68.5 1 1 NA NA 1
## 6 250009 B 83.0 1 1 1 NA NA
df2<-sapply(df2,as.character)
df2[is.na(df2)]<- " "
df2<-df2[,-c(3)]
head(as.data.frame(df2))
## StudentID School ELL Special_Ed ELA MATH SOC
## 1 250001 B 0 0 1
## 2 250003 A 1 0 0
## 3 250008 A 1 0 0
## 4 250008 A 1 0 0
## 5 250009 B 1 1 1
## 6 250009 B 1 1 1
#convert r dataframe to txt file and save in the folder providing the path
write.table(df2,"C:/Users/Gurpreet/Documents/DoE/mydata.txt", sep="\t")