Project 2A

TestScore Departement of Education

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

Importing dataset

ts <- read.csv("https://raw.githubusercontent.com/AlainKuiete/DATA607/master/3-8_ELA_AND_MATH_NYC_SUMMARY_2019.csv", na.strings = "-")
head(ts)

##   SCHOOL.YEAR.END.DATE BEDS.CODE                                  NAME
## 1           06/30/2019 3.101e+11 NEW YORK CITY GEOGRAPHIC DISTRICT # 1
## 2           06/30/2019 3.101e+11 NEW YORK CITY GEOGRAPHIC DISTRICT # 1
## 3           06/30/2019 3.101e+11 NEW YORK CITY GEOGRAPHIC DISTRICT # 1
## 4           06/30/2019 3.101e+11 NEW YORK CITY GEOGRAPHIC DISTRICT # 1
## 5           06/30/2019 3.101e+11 NEW YORK CITY GEOGRAPHIC DISTRICT # 1
## 6           06/30/2019 3.101e+11 NEW YORK CITY GEOGRAPHIC DISTRICT # 1
##        SUBJECT STUDENT.SUBGROUP TOTAL.TESTED LEVEL.1.COUNT LEVEL.1.PCT
## 1  Grade 3 ELA     All Students          675            54          8%
## 2 Grade 3 Math     All Students          686           129         19%
## 3  Grade 4 ELA     All Students          678           104         15%
## 4 Grade 4 Math     All Students          678           170         25%
## 5  Grade 5 ELA     All Students          694           208         30%
## 6 Grade 5 Math     All Students          690           217         31%
##   LEVEL.2.COUNT LEVEL.2.PCT LEVEL.3.COUNT LEVEL.3.PCT LEVEL.4.COUNT
## 1           207         31%           288         43%           126
## 2           139         20%           176         26%           242
## 3           171         25%           217         32%           186
## 4           149         22%           122         18%           237
## 5           184         27%           142         20%           160
## 6           123         18%           115         17%           235
##   LEVEL.4.PCT LEVEL.2.4.PCT LEVEL.3.4.PCT MEAN.SCALE.SCORE
## 1         19%           92%           61%              607
## 2         35%           81%           61%              606
## 3         27%           85%           59%              605
## 4         35%           75%           53%              604
## 5         23%           70%           44%              604
## 6         34%           69%           51%              603

str(ts)

## 'data.frame':    7777 obs. of  17 variables:
##  $ SCHOOL.YEAR.END.DATE: Factor w/ 1 level "06/30/2019": 1 1 1 1 1 1 1 1 1 1 ...
##  $ BEDS.CODE           : num  3.1e+11 3.1e+11 3.1e+11 3.1e+11 3.1e+11 ...
##  $ NAME                : Factor w/ 1145 levels "30TH AVENUE SCHOOL (THE) (G & T CITYWIDE)",..: 357 357 357 357 357 357 357 357 357 357 ...
##  $ SUBJECT             : Factor w/ 12 levels "Grade 3 ELA",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ STUDENT.SUBGROUP    : Factor w/ 1 level "All Students": 1 1 1 1 1 1 1 1 1 1 ...
##  $ TOTAL.TESTED        : int  675 686 678 678 694 690 661 662 720 710 ...
##  $ LEVEL.1.COUNT       : int  54 129 104 170 208 217 166 184 163 196 ...
##  $ LEVEL.1.PCT         : Factor w/ 97 levels "0%","1%","10%",..: 79 12 8 19 25 26 19 22 17 22 ...
##  $ LEVEL.2.COUNT       : int  207 139 171 149 184 123 116 113 173 148 ...
##  $ LEVEL.2.PCT         : Factor w/ 72 levels "0%","1%","10%",..: 26 14 19 16 21 11 11 10 18 15 ...
##  $ LEVEL.3.COUNT       : int  288 176 217 122 142 115 101 132 163 131 ...
##  $ LEVEL.3.PCT         : Factor w/ 83 levels "0%","1%","10%",..: 39 20 27 11 14 10 8 14 17 11 ...
##  $ LEVEL.4.COUNT       : int  126 242 186 237 160 235 278 233 221 235 ...
##  $ LEVEL.4.PCT         : Factor w/ 98 levels "0%","1%","10%",..: 12 30 21 30 17 29 38 30 26 28 ...
##  $ LEVEL.2.4.PCT       : Factor w/ 97 levels "10%","100%","11%",..: 90 78 82 71 66 64 71 68 73 68 ...
##  $ LEVEL.3.4.PCT       : Factor w/ 101 levels "0%","1%","10%",..: 60 60 57 51 41 49 55 53 51 50 ...
##  $ MEAN.SCALE.SCORE    : int  607 606 605 604 604 603 604 605 608 605 ...

Removing the % sign

ts <- ts %>% separate(LEVEL.1.PCT,  c("LEVEL.1.PCT", NA)) %>%
                separate(LEVEL.2.PCT,  c("LEVEL.2.PCT", NA)) %>%
                separate(LEVEL.3.PCT,  c("LEVEL.3.PCT", NA)) %>%
                separate(LEVEL.4.PCT,  c("LEVEL.4.PCT", NA)) %>%
                separate(LEVEL.2.4.PCT,  c("LEVEL.2.4.PCT", NA)) %>%
                separate(LEVEL.3.4.PCT,  c("LEVEL.3.4.PCT", NA))
 str(ts)

## 'data.frame':    7777 obs. of  17 variables:
##  $ SCHOOL.YEAR.END.DATE: Factor w/ 1 level "06/30/2019": 1 1 1 1 1 1 1 1 1 1 ...
##  $ BEDS.CODE           : num  3.1e+11 3.1e+11 3.1e+11 3.1e+11 3.1e+11 ...
##  $ NAME                : Factor w/ 1145 levels "30TH AVENUE SCHOOL (THE) (G & T CITYWIDE)",..: 357 357 357 357 357 357 357 357 357 357 ...
##  $ SUBJECT             : Factor w/ 12 levels "Grade 3 ELA",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ STUDENT.SUBGROUP    : Factor w/ 1 level "All Students": 1 1 1 1 1 1 1 1 1 1 ...
##  $ TOTAL.TESTED        : int  675 686 678 678 694 690 661 662 720 710 ...
##  $ LEVEL.1.COUNT       : int  54 129 104 170 208 217 166 184 163 196 ...
##  $ LEVEL.1.PCT         : chr  "8" "19" "15" "25" ...
##  $ LEVEL.2.COUNT       : int  207 139 171 149 184 123 116 113 173 148 ...
##  $ LEVEL.2.PCT         : chr  "31" "20" "25" "22" ...
##  $ LEVEL.3.COUNT       : int  288 176 217 122 142 115 101 132 163 131 ...
##  $ LEVEL.3.PCT         : chr  "43" "26" "32" "18" ...
##  $ LEVEL.4.COUNT       : int  126 242 186 237 160 235 278 233 221 235 ...
##  $ LEVEL.4.PCT         : chr  "19" "35" "27" "35" ...
##  $ LEVEL.2.4.PCT       : chr  "92" "81" "85" "75" ...
##  $ LEVEL.3.4.PCT       : chr  "61" "61" "59" "53" ...
##  $ MEAN.SCALE.SCORE    : int  607 606 605 604 604 603 604 605 608 605 ...

this removes the column if all your character columns need converted to numeric

ts <- ts %>%
  select(SCHOOL.YEAR.END.DATE:MEAN.SCALE.SCORE) %>%
  mutate_if(is.character,as.numeric)
str(ts)

## 'data.frame':    7777 obs. of  17 variables:
##  $ SCHOOL.YEAR.END.DATE: Factor w/ 1 level "06/30/2019": 1 1 1 1 1 1 1 1 1 1 ...
##  $ BEDS.CODE           : num  3.1e+11 3.1e+11 3.1e+11 3.1e+11 3.1e+11 ...
##  $ NAME                : Factor w/ 1145 levels "30TH AVENUE SCHOOL (THE) (G & T CITYWIDE)",..: 357 357 357 357 357 357 357 357 357 357 ...
##  $ SUBJECT             : Factor w/ 12 levels "Grade 3 ELA",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ STUDENT.SUBGROUP    : Factor w/ 1 level "All Students": 1 1 1 1 1 1 1 1 1 1 ...
##  $ TOTAL.TESTED        : int  675 686 678 678 694 690 661 662 720 710 ...
##  $ LEVEL.1.COUNT       : int  54 129 104 170 208 217 166 184 163 196 ...
##  $ LEVEL.1.PCT         : num  8 19 15 25 30 31 25 28 23 28 ...
##  $ LEVEL.2.COUNT       : int  207 139 171 149 184 123 116 113 173 148 ...
##  $ LEVEL.2.PCT         : num  31 20 25 22 27 18 18 17 24 21 ...
##  $ LEVEL.3.COUNT       : int  288 176 217 122 142 115 101 132 163 131 ...
##  $ LEVEL.3.PCT         : num  43 26 32 18 20 17 15 20 23 18 ...
##  $ LEVEL.4.COUNT       : int  126 242 186 237 160 235 278 233 221 235 ...
##  $ LEVEL.4.PCT         : num  19 35 27 35 23 34 42 35 31 33 ...
##  $ LEVEL.2.4.PCT       : num  92 81 85 75 70 69 75 72 77 72 ...
##  $ LEVEL.3.4.PCT       : num  61 61 59 53 44 51 57 55 53 52 ...
##  $ MEAN.SCALE.SCORE    : int  607 606 605 604 604 603 604 605 608 605 ...

tbl_df(ts)

## # A tibble: 7,777 x 17
##    SCHOOL.YEAR.END~ BEDS.CODE NAME  SUBJECT STUDENT.SUBGROUP TOTAL.TESTED
##    <fct>                <dbl> <fct> <fct>   <fct>                   <int>
##  1 06/30/2019         3.10e11 NEW ~ Grade ~ All Students              675
##  2 06/30/2019         3.10e11 NEW ~ Grade ~ All Students              686
##  3 06/30/2019         3.10e11 NEW ~ Grade ~ All Students              678
##  4 06/30/2019         3.10e11 NEW ~ Grade ~ All Students              678
##  5 06/30/2019         3.10e11 NEW ~ Grade ~ All Students              694
##  6 06/30/2019         3.10e11 NEW ~ Grade ~ All Students              690
##  7 06/30/2019         3.10e11 NEW ~ Grade ~ All Students              661
##  8 06/30/2019         3.10e11 NEW ~ Grade ~ All Students              662
##  9 06/30/2019         3.10e11 NEW ~ Grade ~ All Students              720
## 10 06/30/2019         3.10e11 NEW ~ Grade ~ All Students              710
## # ... with 7,767 more rows, and 11 more variables: LEVEL.1.COUNT <int>,
## #   LEVEL.1.PCT <dbl>, LEVEL.2.COUNT <int>, LEVEL.2.PCT <dbl>,
## #   LEVEL.3.COUNT <int>, LEVEL.3.PCT <dbl>, LEVEL.4.COUNT <int>,
## #   LEVEL.4.PCT <dbl>, LEVEL.2.4.PCT <dbl>, LEVEL.3.4.PCT <dbl>,
## #   MEAN.SCALE.SCORE <int>

Summarizing

ts %>% group_by(SUBJECT) %>% summarise(count = n(),
        PCT.mean =  mean(LEVEL.1.PCT,na.rm = TRUE))

## # A tibble: 12 x 3
##    SUBJECT      count PCT.mean
##    <fct>        <int>    <dbl>
##  1 Grade 3 ELA    811     15.2
##  2 Grade 3 Math   811     22.8
##  3 Grade 4 ELA    801     20.5
##  4 Grade 4 Math   801     25.2
##  5 Grade 5 ELA    795     35.4
##  6 Grade 5 Math   795     33.6
##  7 Grade 6 ELA    512     33.4
##  8 Grade 6 Math   512     36.6
##  9 Grade 7 ELA    503     30.4
## 10 Grade 7 Math   502     35.1
## 11 Grade 8 ELA    502     15.6
## 12 Grade 8 Math   432     42.0

Filter and Select

ts %>% filter(SUBJECT == "Grade 3 Math") %>% 
  select(SUBJECT, NAME,BEDS.CODE , TOTAL.TESTED)%>%
  arrange( SUBJECT, NAME, BEDS.CODE)%>%
  head()

##        SUBJECT                                                     NAME
## 1 Grade 3 Math                30TH AVENUE SCHOOL (THE) (G & T CITYWIDE)
## 2 Grade 3 Math 47 AMERICAN SIGN LANGUAGE AND ENGLISH LOWER SCHOOL (THE)
## 3 Grade 3 Math            ACADEMY FOR EXCELLENCE THROUGH THE ARTS (THE)
## 4 Grade 3 Math                              ACADEMY OF ARTS AND LETTERS
## 5 Grade 3 Math                       ACADEMY OF TALENTED SCHOLARS (THE)
## 6 Grade 3 Math ACE ACADEMY FOR SCHOLARS AT THE GERALDINE FERRARO CAMPUS
##   BEDS.CODE TOTAL.TESTED
## 1 3.430e+11           60
## 2 3.102e+11            9
## 3 3.428e+11           33
## 4 3.313e+11           32
## 5 3.320e+11           55
## 6 3.424e+11           94

#####Histogram for Leve 1

ts %>% ggplot(aes(x=LEVEL.1.PCT))+geom_histogram(aes(fill= SUBJECT), color = "Black")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 24 rows containing non-finite values (stat_bin).

#####Histogram for Leve 2

ts %>% ggplot(aes(x=LEVEL.2.PCT))+geom_histogram(aes(fill= SUBJECT), color = "Black")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 24 rows containing non-finite values (stat_bin).

#####Histogram for Leve 3

ts %>% ggplot(aes(x=LEVEL.3.PCT))+geom_histogram(aes(fill= SUBJECT), color = "Black")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 24 rows containing non-finite values (stat_bin).

#####Histogram for Leve 4

ts %>% ggplot(aes(x=LEVEL.4.PCT))+geom_histogram(aes(fill= SUBJECT), color = "Black")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 24 rows containing non-finite values (stat_bin).

Project 2A

Alain T Kuiete

10/6/2019

TestScore Departement of Education

Importing dataset

Removing the % sign

this removes the column if all your character columns need converted to numeric

Summarizing

Filter and Select