# Title: Week 6 assignment IS607
# NYC 3rd-to-8th-grade math test results, 2013-2014
#1. Choose and load R dataset
# Read the NYC math results from a machine-specific absolute path.
# stringsAsFactors = TRUE is spelled out because read.csv() stopped converting
# text columns to factors by default in R 4.0.0; the str() and summary()
# output recorded below shows Borough and Category as factors.
math <- read.csv(
  "/Users/seoungyoonlim/Documents/cuny/IS607/week6/MathResults20132014.csv",
  stringsAsFactors = TRUE
)
# Show the column types and the first few values of each column.
str(math)
## 'data.frame': 240 obs. of 6 variables:
## $ Borough : Factor w/ 5 levels "BRONX","BROOKLYN",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Grade : int 3 3 3 3 3 3 3 3 4 4 ...
## $ Year : int 2013 2013 2013 2013 2014 2014 2014 2014 2013 2013 ...
## $ Category : Factor w/ 4 levels "Asian","Black",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ Number.Tested: int 586 4474 9961 707 618 4419 10061 721 624 4326 ...
## $ Mean : int 313 284 285 304 315 286 288 309 316 283 ...
# Grade and Year are stored as integers in the CSV file, but they label groups
# rather than measure quantities, so both columns are converted to factors
# before any summaries are produced.
math[c("Grade", "Year")] <- lapply(math[c("Grade", "Year")], as.factor)
#2. Generate summary-level descriptive statistics
summary(math)
## Borough Grade Year Category Number.Tested
## BRONX :48 3:40 2013:120 Asian :60 Min. : 280
## BROOKLYN :48 4:40 2014:120 Black :60 1st Qu.: 1082
## MANHATTAN :48 5:40 Hispanic:60 Median : 2884
## QUEENS :48 6:40 White :60 Mean : 3389
## STATEN ISLAND:48 7:40 3rd Qu.: 4498
## 8:40 Max. :10061
## Mean
## Min. :275.0
## 1st Qu.:288.0
## Median :300.5
## Mean :305.0
## 3rd Qu.:322.0
## Max. :344.0
#3. Determine the frequency of a single categorical variable
# Count the rows in each Category level.
table(math[["Category"]])
##
## Asian Black Hispanic White
## 60 60 60 60
#4. Determine the frequency of two categorical variables
# Cross-tabulate Category (rows) against Borough (columns).
table(math[["Category"]], math[["Borough"]])
##
## BRONX BROOKLYN MANHATTAN QUEENS STATEN ISLAND
## Asian 12 12 12 12 12
## Black 12 12 12 12 12
## Hispanic 12 12 12 12 12
## White 12 12 12 12 12
# Cross-tabulate Year (rows) against Borough (columns).
table(math[["Year"]], math[["Borough"]])
##
## BRONX BROOKLYN MANHATTAN QUEENS STATEN ISLAND
## 2013 24 24 24 24 24
## 2014 24 24 24 24 24
#5. Create a graph for a single numeric variable
# Overall distribution of the mean math score across all rows.
hist(math$Mean)

# Box plots of the mean math score by ethnic group, one plot per year.
# Each category subset is filtered using the Category column of the yearly
# data frame itself. Row positions computed on the full `math` table do not
# line up with the rows of `math_2013` / `math_2014`, so indexing a yearly
# subset with them would pick the wrong rows and introduce all-NA rows.
# which() is kept so that rows with a missing Year or Category are dropped.
math_2013 <- math[which(math$Year == 2013), ]
A13 <- math_2013[which(math_2013$Category == "Asian"), ]
B13 <- math_2013[which(math_2013$Category == "Black"), ]
W13 <- math_2013[which(math_2013$Category == "White"), ]
H13 <- math_2013[which(math_2013$Category == "Hispanic"), ]
boxplot(
  A13$Mean, B13$Mean, W13$Mean, H13$Mean,
  main = "2013 Math test",
  names = c("Asian", "Black", "White", "Hispanic")
)

math_2014 <- math[which(math$Year == 2014), ]
A14 <- math_2014[which(math_2014$Category == "Asian"), ]
B14 <- math_2014[which(math_2014$Category == "Black"), ]
W14 <- math_2014[which(math_2014$Category == "White"), ]
H14 <- math_2014[which(math_2014$Category == "Hispanic"), ]
boxplot(
  A14$Mean, B14$Mean, W14$Mean, H14$Mean,
  main = "2014 Math test",
  names = c("Asian", "Black", "White", "Hispanic")
)
# Split the results by borough; only the Staten Island subset is plotted here.
# The values must match the Borough levels exactly: the Brooklyn level is
# spelled "BROOKLYN" (a misspelling matches no rows and yields an empty subset).
bx <- math[which(math$Borough == "BRONX"), ]
mh <- math[which(math$Borough == "MANHATTAN"), ]
qn <- math[which(math$Borough == "QUEENS"), ]
bn <- math[which(math$Borough == "BROOKLYN"), ]
st <- math[which(math$Borough == "STATEN ISLAND"), ]
# Distribution of the mean math score for Staten Island.
hist(st$Mean, main = "histogram of Staten island students Math result")
#ggplot
library(ggplot2)
# Staten Island histogram of the mean math score drawn with ggplot2, using
# bins 10 points wide. ggplot() + geom_histogram() is used instead of qplot(),
# which has been deprecated since ggplot2 3.4.0.
ggplot(st, aes(x = Mean)) +
  geom_histogram(binwidth = 10) +
  ggtitle("histogram of Staten island students Math result")
#6. Create a graph for two numeric variables
# Scatter plots of the number of students tested against the mean score,
# first with base graphics and then with ggplot2. The number of students
# tested and the test score do not show a correlation.
plot(math$Number.Tested, math$Mean)
# ggplot() + geom_point() is used instead of qplot(), which has been
# deprecated since ggplot2 3.4.0.
ggplot(math, aes(x = Number.Tested, y = Mean)) +
  geom_point()