Homework 2 Instructions

Here is a small dataset for you to work with.

Each of 5 schools (A, B, C, D and E) is implementing the same math course this semester, with 35 lessons. There are 30 sections total. The semester is about 3/4 of the way through.

For each section, we record the number of students who are:

• very ahead (more than 5 lessons ahead)

• middling (5 lessons ahead to 0 lessons ahead)

• behind (1 to 5 lessons behind)

• more behind (6 to 10 lessons behind)

• very behind (more than 10 lessons behind)

• completed (finished with the course)

What’s the story (or stories) in this data? Find it, and tell it visually and, above all, truthfully.

Load Packages

library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ dplyr   1.0.6
## ✓ tibble  3.1.1     ✓ stringr 1.4.0
## ✓ tidyr   1.1.3     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(dplyr)

Import school data from CSV

schoolInfoDF <- read.csv("data-storyteller.csv")
schoolInfoDF
##    School Section Very.Ahead..5 Middling..0 Behind..1.5 More.Behind..6.10
## 1       A       1             0           5          54                 3
## 2       A       2             0           8          40                10
## 3       A       3             0           9          35                12
## 4       A       4             0          14          44                 5
## 5       A       5             0           9          42                 2
## 6       A       6             0           7          29                 3
## 7       A       7             0          19          22                 5
## 8       A       8             0           3          37                11
## 9       A       9             0           6          29                 8
## 10      A      10             0          13          40                 5
## 11      A      11             0           8          32                 4
## 12      A      12             0           2          16                 2
## 13      A      13             0          10          30                 3
## 14      B       1             0           4          22                 0
## 15      B       2             0           5           7                 2
## 16      B       3             0           6          31                 1
## 17      B       4             0           4           7                 0
## 18      B       5             0           8          14                 4
## 19      B       6             0           8          11                 1
## 20      B       7             0           9          21                 0
## 21      B       8             0          10          23                 2
## 22      B       9             0          10          21                 0
## 23      B      10             0           3           8                 1
## 24      B      11             0           7          19                 2
## 25      B      12             0          10          17                 1
## 26      C       1             0           2          15                 2
## 27      C       2             0           7          20                 1
## 28      C       3             0           2           4                 1
## 29      D       1             0           3           8                 2
## 30      E       1             0          11          56                 7
##    Very.Behind..11 Completed
## 1                9        10
## 2               16         6
## 3               13        11
## 4               12        10
## 5               24         8
## 6               10         9
## 7               14        19
## 8               18         5
## 9               12        10
## 10               5        20
## 11              10        15
## 12               3        14
## 13               8         5
## 14               6         7
## 15               1         3
## 16               1         8
## 17               0         7
## 18               0        14
## 19               2        18
## 20               2        13
## 21               5         6
## 22               3         5
## 23               1        15
## 24               1        10
## 25               0        19
## 26               4        13
## 27               7         1
## 28               1         5
## 29               6         3
## 30              15        27

High level review of data

str(schoolInfoDF)
## 'data.frame':    30 obs. of  8 variables:
##  $ School           : chr  "A" "A" "A" "A" ...
##  $ Section          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Very.Ahead..5    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Middling..0      : int  5 8 9 14 9 7 19 3 6 13 ...
##  $ Behind..1.5      : int  54 40 35 44 42 29 22 37 29 40 ...
##  $ More.Behind..6.10: int  3 10 12 5 2 3 5 11 8 5 ...
##  $ Very.Behind..11  : int  9 16 13 12 24 10 14 18 12 5 ...
##  $ Completed        : int  10 6 11 10 8 9 19 5 10 20 ...
summary(schoolInfoDF)
##     School             Section      Very.Ahead..5  Middling..0   
##  Length:30          Min.   : 1.00   Min.   :0     Min.   : 2.00  
##  Class :character   1st Qu.: 2.25   1st Qu.:0     1st Qu.: 4.25  
##  Mode  :character   Median : 5.50   Median :0     Median : 7.50  
##                     Mean   : 5.90   Mean   :0     Mean   : 7.40  
##                     3rd Qu.: 9.00   3rd Qu.:0     3rd Qu.: 9.75  
##                     Max.   :13.00   Max.   :0     Max.   :19.00  
##   Behind..1.5    More.Behind..6.10 Very.Behind..11    Completed    
##  Min.   : 4.00   Min.   : 0.000    Min.   : 0.000   Min.   : 1.00  
##  1st Qu.:15.25   1st Qu.: 1.000    1st Qu.: 1.250   1st Qu.: 6.00  
##  Median :22.00   Median : 2.000    Median : 5.500   Median :10.00  
##  Mean   :25.13   Mean   : 3.333    Mean   : 6.967   Mean   :10.53  
##  3rd Qu.:34.25   3rd Qu.: 4.750    3rd Qu.:11.500   3rd Qu.:14.00  
##  Max.   :56.00   Max.   :12.000    Max.   :24.000   Max.   :27.00

Change school & section into factors

schoolInfoDF$School <- factor(schoolInfoDF$School)

Update column names

colnames(schoolInfoDF) <- c("School", "Section", "Very Ahead", "Middling", "Behind", "More Behind", "Very Behind"
                    , "Complete")
str(schoolInfoDF)
## 'data.frame':    30 obs. of  8 variables:
##  $ School     : Factor w/ 5 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Section    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Very Ahead : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Middling   : int  5 8 9 14 9 7 19 3 6 13 ...
##  $ Behind     : int  54 40 35 44 42 29 22 37 29 40 ...
##  $ More Behind: int  3 10 12 5 2 3 5 11 8 5 ...
##  $ Very Behind: int  9 16 13 12 24 10 14 18 12 5 ...
##  $ Complete   : int  10 6 11 10 8 9 19 5 10 20 ...

Ensure there are no “NAs”

numNAs <- (which((is.na.data.frame(schoolInfoDF))=='TRUE'))
length(numNAs)
## [1] 0

Ensure there are no duplicate entries

numDup <- which(duplicated(schoolInfoDF))
length(numDup)
## [1] 0

See number of sections by school

table(schoolInfoDF$School)
## 
##  A  B  C  D  E 
## 13 12  3  1  1
table(schoolInfoDF$School)[which.max(table(schoolInfoDF$School))]
##  A 
## 13

View number of students in each section by school

studentsBySchool <- aggregate(cbind(schoolInfoDF$`Very Ahead`, schoolInfoDF$Middling, schoolInfoDF$Behind, schoolInfoDF$`More Behind`,schoolInfoDF$`Very Behind`, schoolInfoDF$Complete), list(schoolInfoDF$School), sum)

Rename Columns

colnames(studentsBySchool) <- c("School","Very Ahead", "Middling", "Behind", "More Behind", "Very Behind"
                                , "Complete" )

Remove “very ahead” because no students fall into this category

studentsBySchool <- studentsBySchool[,c(1,3,4,5,6,7)]

Add total # students attribute

studentsBySchool$Total <- rowSums(studentsBySchool[,-1])
studentsBySchool
##   School Middling Behind More Behind Very Behind Complete Total
## 1      A      113    450          73         154      142   932
## 2      B       84    201          14          22      125   446
## 3      C       11     39           4          12       19    85
## 4      D        3      8           2           6        3    22
## 5      E       11     56           7          15       27   116

Want to use percentages to accurately represent spread of students across levels without skewing with varying class sizes

normalize_Students <- studentsBySchool
normalize_Students$Middling <- round((normalize_Students$Middling/normalize_Students$Total),digits=2)
normalize_Students$Behind <- round((normalize_Students$Behind/normalize_Students$Total),digits=2)
normalize_Students$`More Behind` <- round((normalize_Students$`More Behind`/normalize_Students$Total),digits=2)
normalize_Students$`Very Behind` <- round((normalize_Students$`Very Behind`/normalize_Students$Total),digits=2)
normalize_Students$Complete <- round((normalize_Students$Complete/normalize_Students$Total),digits=2)
normalize_Students
##   School Middling Behind More Behind Very Behind Complete Total
## 1      A     0.12   0.48        0.08        0.17     0.15   932
## 2      B     0.19   0.45        0.03        0.05     0.28   446
## 3      C     0.13   0.46        0.05        0.14     0.22    85
## 4      D     0.14   0.36        0.09        0.27     0.14    22
## 5      E     0.09   0.48        0.06        0.13     0.23   116

Remove total students again

normalize_Students <- normalize_Students[,-7]
studentsBySchool <- studentsBySchool[,-7]

Re-Analyze

normalize_Students
##   School Middling Behind More Behind Very Behind Complete
## 1      A     0.12   0.48        0.08        0.17     0.15
## 2      B     0.19   0.45        0.03        0.05     0.28
## 3      C     0.13   0.46        0.05        0.14     0.22
## 4      D     0.14   0.36        0.09        0.27     0.14
## 5      E     0.09   0.48        0.06        0.13     0.23
str(normalize_Students)
## 'data.frame':    5 obs. of  6 variables:
##  $ School     : Factor w/ 5 levels "A","B","C","D",..: 1 2 3 4 5
##  $ Middling   : num  0.12 0.19 0.13 0.14 0.09
##  $ Behind     : num  0.48 0.45 0.46 0.36 0.48
##  $ More Behind: num  0.08 0.03 0.05 0.09 0.06
##  $ Very Behind: num  0.17 0.05 0.14 0.27 0.13
##  $ Complete   : num  0.15 0.28 0.22 0.14 0.23
summary(normalize_Students)
##  School    Middling         Behind       More Behind     Very Behind   
##  A:1    Min.   :0.090   Min.   :0.360   Min.   :0.030   Min.   :0.050  
##  B:1    1st Qu.:0.120   1st Qu.:0.450   1st Qu.:0.050   1st Qu.:0.130  
##  C:1    Median :0.130   Median :0.460   Median :0.060   Median :0.140  
##  D:1    Mean   :0.134   Mean   :0.446   Mean   :0.062   Mean   :0.152  
##  E:1    3rd Qu.:0.140   3rd Qu.:0.480   3rd Qu.:0.080   3rd Qu.:0.170  
##         Max.   :0.190   Max.   :0.480   Max.   :0.090   Max.   :0.270  
##     Complete    
##  Min.   :0.140  
##  1st Qu.:0.150  
##  Median :0.220  
##  Mean   :0.204  
##  3rd Qu.:0.230  
##  Max.   :0.280

Melt down data to graph

require(tidyr)
df.long <- gather(normalize_Students, variable,value, -School)

GGPLOT all schools on one graph

df.graph <- ggplot(data=df.long, aes(x = School, y=value, fill = variable)) + 
  geom_col(position=position_dodge())
df.graph

Breakout data frames for each school, can remove “School” attribute

schoolA <- data.frame(studentsBySchool[1,-1])
schoolB <- data.frame(studentsBySchool[2,-1])
schoolC <- data.frame(studentsBySchool[3,-1])
schoolD <- data.frame(studentsBySchool[4,-1])
schoolE <- data.frame(studentsBySchool[5,-1])

schoolA
##   Middling Behind More.Behind Very.Behind Complete
## 1      113    450          73         154      142
schoolB
##   Middling Behind More.Behind Very.Behind Complete
## 2       84    201          14          22      125
schoolC
##   Middling Behind More.Behind Very.Behind Complete
## 3       11     39           4          12       19
schoolD
##   Middling Behind More.Behind Very.Behind Complete
## 4        3      8           2           6        3
schoolE
##   Middling Behind More.Behind Very.Behind Complete
## 5       11     56           7          15       27

Plot individual school A info

df.longA <- gather(schoolA, variable, value)
df.graphA <- ggplot(data=df.longA, aes(x = variable, y=value, fill = variable)) + 
  geom_col(position=position_dodge())
df.graphA

Plot individual school B info

df.longB <- gather(schoolB, variable, value)
df.graphB <- ggplot(data=df.longB, aes(x = variable, y=value, fill = variable)) + 
  geom_col(position=position_dodge())
df.graphB

Plot individual school C info

df.longC <- gather(schoolC, variable, value)
df.graphC <- ggplot(data=df.longC, aes(x = variable, y=value, fill = variable)) + 
  geom_col(position=position_dodge())
df.graphC

Plot individual school D info

df.longD <- gather(schoolD, variable, value)
df.graphD <- ggplot(data=df.longD, aes(x = variable, y=value, fill = variable)) + 
  geom_col(position=position_dodge())
df.graphD

Plot individual school E info

df.longE <- gather(schoolD, variable, value)
df.graphE <- ggplot(data=df.longE, aes(x = variable, y=value, fill = variable)) + 
  geom_col(position=position_dodge())
df.graphE