library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
ts <- read.csv("https://raw.githubusercontent.com/AlainKuiete/DATA607/master/3-8_ELA_AND_MATH_NYC_SUMMARY_2019.csv", na.strings = "-")
head(ts)
## SCHOOL.YEAR.END.DATE BEDS.CODE NAME
## 1 06/30/2019 3.101e+11 NEW YORK CITY GEOGRAPHIC DISTRICT # 1
## 2 06/30/2019 3.101e+11 NEW YORK CITY GEOGRAPHIC DISTRICT # 1
## 3 06/30/2019 3.101e+11 NEW YORK CITY GEOGRAPHIC DISTRICT # 1
## 4 06/30/2019 3.101e+11 NEW YORK CITY GEOGRAPHIC DISTRICT # 1
## 5 06/30/2019 3.101e+11 NEW YORK CITY GEOGRAPHIC DISTRICT # 1
## 6 06/30/2019 3.101e+11 NEW YORK CITY GEOGRAPHIC DISTRICT # 1
## SUBJECT STUDENT.SUBGROUP TOTAL.TESTED LEVEL.1.COUNT LEVEL.1.PCT
## 1 Grade 3 ELA All Students 675 54 8%
## 2 Grade 3 Math All Students 686 129 19%
## 3 Grade 4 ELA All Students 678 104 15%
## 4 Grade 4 Math All Students 678 170 25%
## 5 Grade 5 ELA All Students 694 208 30%
## 6 Grade 5 Math All Students 690 217 31%
## LEVEL.2.COUNT LEVEL.2.PCT LEVEL.3.COUNT LEVEL.3.PCT LEVEL.4.COUNT
## 1 207 31% 288 43% 126
## 2 139 20% 176 26% 242
## 3 171 25% 217 32% 186
## 4 149 22% 122 18% 237
## 5 184 27% 142 20% 160
## 6 123 18% 115 17% 235
## LEVEL.4.PCT LEVEL.2.4.PCT LEVEL.3.4.PCT MEAN.SCALE.SCORE
## 1 19% 92% 61% 607
## 2 35% 81% 61% 606
## 3 27% 85% 59% 605
## 4 35% 75% 53% 604
## 5 23% 70% 44% 604
## 6 34% 69% 51% 603
str(ts)
## 'data.frame': 7777 obs. of 17 variables:
## $ SCHOOL.YEAR.END.DATE: Factor w/ 1 level "06/30/2019": 1 1 1 1 1 1 1 1 1 1 ...
## $ BEDS.CODE : num 3.1e+11 3.1e+11 3.1e+11 3.1e+11 3.1e+11 ...
## $ NAME : Factor w/ 1145 levels "30TH AVENUE SCHOOL (THE) (G & T CITYWIDE)",..: 357 357 357 357 357 357 357 357 357 357 ...
## $ SUBJECT : Factor w/ 12 levels "Grade 3 ELA",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ STUDENT.SUBGROUP : Factor w/ 1 level "All Students": 1 1 1 1 1 1 1 1 1 1 ...
## $ TOTAL.TESTED : int 675 686 678 678 694 690 661 662 720 710 ...
## $ LEVEL.1.COUNT : int 54 129 104 170 208 217 166 184 163 196 ...
## $ LEVEL.1.PCT : Factor w/ 97 levels "0%","1%","10%",..: 79 12 8 19 25 26 19 22 17 22 ...
## $ LEVEL.2.COUNT : int 207 139 171 149 184 123 116 113 173 148 ...
## $ LEVEL.2.PCT : Factor w/ 72 levels "0%","1%","10%",..: 26 14 19 16 21 11 11 10 18 15 ...
## $ LEVEL.3.COUNT : int 288 176 217 122 142 115 101 132 163 131 ...
## $ LEVEL.3.PCT : Factor w/ 83 levels "0%","1%","10%",..: 39 20 27 11 14 10 8 14 17 11 ...
## $ LEVEL.4.COUNT : int 126 242 186 237 160 235 278 233 221 235 ...
## $ LEVEL.4.PCT : Factor w/ 98 levels "0%","1%","10%",..: 12 30 21 30 17 29 38 30 26 28 ...
## $ LEVEL.2.4.PCT : Factor w/ 97 levels "10%","100%","11%",..: 90 78 82 71 66 64 71 68 73 68 ...
## $ LEVEL.3.4.PCT : Factor w/ 101 levels "0%","1%","10%",..: 60 60 57 51 41 49 55 53 51 50 ...
## $ MEAN.SCALE.SCORE : int 607 606 605 604 604 603 604 605 608 605 ...
ts <- ts %>% separate(LEVEL.1.PCT, c("LEVEL.1.PCT", NA)) %>%
separate(LEVEL.2.PCT, c("LEVEL.2.PCT", NA)) %>%
separate(LEVEL.3.PCT, c("LEVEL.3.PCT", NA)) %>%
separate(LEVEL.4.PCT, c("LEVEL.4.PCT", NA)) %>%
separate(LEVEL.2.4.PCT, c("LEVEL.2.4.PCT", NA)) %>%
separate(LEVEL.3.4.PCT, c("LEVEL.3.4.PCT", NA))
str(ts)
## 'data.frame': 7777 obs. of 17 variables:
## $ SCHOOL.YEAR.END.DATE: Factor w/ 1 level "06/30/2019": 1 1 1 1 1 1 1 1 1 1 ...
## $ BEDS.CODE : num 3.1e+11 3.1e+11 3.1e+11 3.1e+11 3.1e+11 ...
## $ NAME : Factor w/ 1145 levels "30TH AVENUE SCHOOL (THE) (G & T CITYWIDE)",..: 357 357 357 357 357 357 357 357 357 357 ...
## $ SUBJECT : Factor w/ 12 levels "Grade 3 ELA",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ STUDENT.SUBGROUP : Factor w/ 1 level "All Students": 1 1 1 1 1 1 1 1 1 1 ...
## $ TOTAL.TESTED : int 675 686 678 678 694 690 661 662 720 710 ...
## $ LEVEL.1.COUNT : int 54 129 104 170 208 217 166 184 163 196 ...
## $ LEVEL.1.PCT : chr "8" "19" "15" "25" ...
## $ LEVEL.2.COUNT : int 207 139 171 149 184 123 116 113 173 148 ...
## $ LEVEL.2.PCT : chr "31" "20" "25" "22" ...
## $ LEVEL.3.COUNT : int 288 176 217 122 142 115 101 132 163 131 ...
## $ LEVEL.3.PCT : chr "43" "26" "32" "18" ...
## $ LEVEL.4.COUNT : int 126 242 186 237 160 235 278 233 221 235 ...
## $ LEVEL.4.PCT : chr "19" "35" "27" "35" ...
## $ LEVEL.2.4.PCT : chr "92" "81" "85" "75" ...
## $ LEVEL.3.4.PCT : chr "61" "61" "59" "53" ...
## $ MEAN.SCALE.SCORE : int 607 606 605 604 604 603 604 605 608 605 ...
ts <- ts %>%
select(SCHOOL.YEAR.END.DATE:MEAN.SCALE.SCORE) %>%
mutate_if(is.character,as.numeric)
str(ts)
## 'data.frame': 7777 obs. of 17 variables:
## $ SCHOOL.YEAR.END.DATE: Factor w/ 1 level "06/30/2019": 1 1 1 1 1 1 1 1 1 1 ...
## $ BEDS.CODE : num 3.1e+11 3.1e+11 3.1e+11 3.1e+11 3.1e+11 ...
## $ NAME : Factor w/ 1145 levels "30TH AVENUE SCHOOL (THE) (G & T CITYWIDE)",..: 357 357 357 357 357 357 357 357 357 357 ...
## $ SUBJECT : Factor w/ 12 levels "Grade 3 ELA",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ STUDENT.SUBGROUP : Factor w/ 1 level "All Students": 1 1 1 1 1 1 1 1 1 1 ...
## $ TOTAL.TESTED : int 675 686 678 678 694 690 661 662 720 710 ...
## $ LEVEL.1.COUNT : int 54 129 104 170 208 217 166 184 163 196 ...
## $ LEVEL.1.PCT : num 8 19 15 25 30 31 25 28 23 28 ...
## $ LEVEL.2.COUNT : int 207 139 171 149 184 123 116 113 173 148 ...
## $ LEVEL.2.PCT : num 31 20 25 22 27 18 18 17 24 21 ...
## $ LEVEL.3.COUNT : int 288 176 217 122 142 115 101 132 163 131 ...
## $ LEVEL.3.PCT : num 43 26 32 18 20 17 15 20 23 18 ...
## $ LEVEL.4.COUNT : int 126 242 186 237 160 235 278 233 221 235 ...
## $ LEVEL.4.PCT : num 19 35 27 35 23 34 42 35 31 33 ...
## $ LEVEL.2.4.PCT : num 92 81 85 75 70 69 75 72 77 72 ...
## $ LEVEL.3.4.PCT : num 61 61 59 53 44 51 57 55 53 52 ...
## $ MEAN.SCALE.SCORE : int 607 606 605 604 604 603 604 605 608 605 ...
tbl_df(ts)
## # A tibble: 7,777 x 17
## SCHOOL.YEAR.END~ BEDS.CODE NAME SUBJECT STUDENT.SUBGROUP TOTAL.TESTED
## <fct> <dbl> <fct> <fct> <fct> <int>
## 1 06/30/2019 3.10e11 NEW ~ Grade ~ All Students 675
## 2 06/30/2019 3.10e11 NEW ~ Grade ~ All Students 686
## 3 06/30/2019 3.10e11 NEW ~ Grade ~ All Students 678
## 4 06/30/2019 3.10e11 NEW ~ Grade ~ All Students 678
## 5 06/30/2019 3.10e11 NEW ~ Grade ~ All Students 694
## 6 06/30/2019 3.10e11 NEW ~ Grade ~ All Students 690
## 7 06/30/2019 3.10e11 NEW ~ Grade ~ All Students 661
## 8 06/30/2019 3.10e11 NEW ~ Grade ~ All Students 662
## 9 06/30/2019 3.10e11 NEW ~ Grade ~ All Students 720
## 10 06/30/2019 3.10e11 NEW ~ Grade ~ All Students 710
## # ... with 7,767 more rows, and 11 more variables: LEVEL.1.COUNT <int>,
## # LEVEL.1.PCT <dbl>, LEVEL.2.COUNT <int>, LEVEL.2.PCT <dbl>,
## # LEVEL.3.COUNT <int>, LEVEL.3.PCT <dbl>, LEVEL.4.COUNT <int>,
## # LEVEL.4.PCT <dbl>, LEVEL.2.4.PCT <dbl>, LEVEL.3.4.PCT <dbl>,
## # MEAN.SCALE.SCORE <int>
ts %>% group_by(SUBJECT) %>% summarise(count = n(),
PCT.mean = mean(LEVEL.1.PCT,na.rm = TRUE))
## # A tibble: 12 x 3
## SUBJECT count PCT.mean
## <fct> <int> <dbl>
## 1 Grade 3 ELA 811 15.2
## 2 Grade 3 Math 811 22.8
## 3 Grade 4 ELA 801 20.5
## 4 Grade 4 Math 801 25.2
## 5 Grade 5 ELA 795 35.4
## 6 Grade 5 Math 795 33.6
## 7 Grade 6 ELA 512 33.4
## 8 Grade 6 Math 512 36.6
## 9 Grade 7 ELA 503 30.4
## 10 Grade 7 Math 502 35.1
## 11 Grade 8 ELA 502 15.6
## 12 Grade 8 Math 432 42.0
ts %>% filter(SUBJECT == "Grade 3 Math") %>%
select(SUBJECT, NAME,BEDS.CODE , TOTAL.TESTED)%>%
arrange( SUBJECT, NAME, BEDS.CODE)%>%
head()
## SUBJECT NAME
## 1 Grade 3 Math 30TH AVENUE SCHOOL (THE) (G & T CITYWIDE)
## 2 Grade 3 Math 47 AMERICAN SIGN LANGUAGE AND ENGLISH LOWER SCHOOL (THE)
## 3 Grade 3 Math ACADEMY FOR EXCELLENCE THROUGH THE ARTS (THE)
## 4 Grade 3 Math ACADEMY OF ARTS AND LETTERS
## 5 Grade 3 Math ACADEMY OF TALENTED SCHOLARS (THE)
## 6 Grade 3 Math ACE ACADEMY FOR SCHOLARS AT THE GERALDINE FERRARO CAMPUS
## BEDS.CODE TOTAL.TESTED
## 1 3.430e+11 60
## 2 3.102e+11 9
## 3 3.428e+11 33
## 4 3.313e+11 32
## 5 3.320e+11 55
## 6 3.424e+11 94
#####Histogram for Leve 1
ts %>% ggplot(aes(x=LEVEL.1.PCT))+geom_histogram(aes(fill= SUBJECT), color = "Black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 24 rows containing non-finite values (stat_bin).
#####Histogram for Leve 2
ts %>% ggplot(aes(x=LEVEL.2.PCT))+geom_histogram(aes(fill= SUBJECT), color = "Black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 24 rows containing non-finite values (stat_bin).
#####Histogram for Leve 3
ts %>% ggplot(aes(x=LEVEL.3.PCT))+geom_histogram(aes(fill= SUBJECT), color = "Black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 24 rows containing non-finite values (stat_bin).
#####Histogram for Leve 4
ts %>% ggplot(aes(x=LEVEL.4.PCT))+geom_histogram(aes(fill= SUBJECT), color = "Black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 24 rows containing non-finite values (stat_bin).