Homework 2 Instructions
Here is a small dataset for you to work with.
Each of 5 schools (A, B, C, D and E) is implementing the same math course this semester, with 35 lessons. There are 30 sections total. The semester is about 3/4 of the way through.
For each section, we record the number of students who are:
• very ahead (more than 5 lessons ahead)
• middling (5 lessons ahead to 0 lessons ahead)
• behind (1 to 5 lessons behind)
• more behind (6 to 10 lessons behind)
• very behind (more than 10 lessons behind)
• completed (finished with the course)
What’s the story (or stories) in this data? Find it, and tell it visually and, above all, truthfully.
Load Packages
library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ dplyr 1.0.6
## ✓ tibble 3.1.1 ✓ stringr 1.4.0
## ✓ tidyr 1.1.3 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(dplyr)
Import school data from CSV
schoolInfoDF <- read.csv("data-storyteller.csv")
schoolInfoDF
## School Section Very.Ahead..5 Middling..0 Behind..1.5 More.Behind..6.10
## 1 A 1 0 5 54 3
## 2 A 2 0 8 40 10
## 3 A 3 0 9 35 12
## 4 A 4 0 14 44 5
## 5 A 5 0 9 42 2
## 6 A 6 0 7 29 3
## 7 A 7 0 19 22 5
## 8 A 8 0 3 37 11
## 9 A 9 0 6 29 8
## 10 A 10 0 13 40 5
## 11 A 11 0 8 32 4
## 12 A 12 0 2 16 2
## 13 A 13 0 10 30 3
## 14 B 1 0 4 22 0
## 15 B 2 0 5 7 2
## 16 B 3 0 6 31 1
## 17 B 4 0 4 7 0
## 18 B 5 0 8 14 4
## 19 B 6 0 8 11 1
## 20 B 7 0 9 21 0
## 21 B 8 0 10 23 2
## 22 B 9 0 10 21 0
## 23 B 10 0 3 8 1
## 24 B 11 0 7 19 2
## 25 B 12 0 10 17 1
## 26 C 1 0 2 15 2
## 27 C 2 0 7 20 1
## 28 C 3 0 2 4 1
## 29 D 1 0 3 8 2
## 30 E 1 0 11 56 7
## Very.Behind..11 Completed
## 1 9 10
## 2 16 6
## 3 13 11
## 4 12 10
## 5 24 8
## 6 10 9
## 7 14 19
## 8 18 5
## 9 12 10
## 10 5 20
## 11 10 15
## 12 3 14
## 13 8 5
## 14 6 7
## 15 1 3
## 16 1 8
## 17 0 7
## 18 0 14
## 19 2 18
## 20 2 13
## 21 5 6
## 22 3 5
## 23 1 15
## 24 1 10
## 25 0 19
## 26 4 13
## 27 7 1
## 28 1 5
## 29 6 3
## 30 15 27
High level review of data
str(schoolInfoDF)
## 'data.frame': 30 obs. of 8 variables:
## $ School : chr "A" "A" "A" "A" ...
## $ Section : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Very.Ahead..5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Middling..0 : int 5 8 9 14 9 7 19 3 6 13 ...
## $ Behind..1.5 : int 54 40 35 44 42 29 22 37 29 40 ...
## $ More.Behind..6.10: int 3 10 12 5 2 3 5 11 8 5 ...
## $ Very.Behind..11 : int 9 16 13 12 24 10 14 18 12 5 ...
## $ Completed : int 10 6 11 10 8 9 19 5 10 20 ...
summary(schoolInfoDF)
## School Section Very.Ahead..5 Middling..0
## Length:30 Min. : 1.00 Min. :0 Min. : 2.00
## Class :character 1st Qu.: 2.25 1st Qu.:0 1st Qu.: 4.25
## Mode :character Median : 5.50 Median :0 Median : 7.50
## Mean : 5.90 Mean :0 Mean : 7.40
## 3rd Qu.: 9.00 3rd Qu.:0 3rd Qu.: 9.75
## Max. :13.00 Max. :0 Max. :19.00
## Behind..1.5 More.Behind..6.10 Very.Behind..11 Completed
## Min. : 4.00 Min. : 0.000 Min. : 0.000 Min. : 1.00
## 1st Qu.:15.25 1st Qu.: 1.000 1st Qu.: 1.250 1st Qu.: 6.00
## Median :22.00 Median : 2.000 Median : 5.500 Median :10.00
## Mean :25.13 Mean : 3.333 Mean : 6.967 Mean :10.53
## 3rd Qu.:34.25 3rd Qu.: 4.750 3rd Qu.:11.500 3rd Qu.:14.00
## Max. :56.00 Max. :12.000 Max. :24.000 Max. :27.00
Change school & section into factors
schoolInfoDF$School <- factor(schoolInfoDF$School)
Update column names
colnames(schoolInfoDF) <- c("School", "Section", "Very Ahead", "Middling", "Behind", "More Behind", "Very Behind"
, "Complete")
str(schoolInfoDF)
## 'data.frame': 30 obs. of 8 variables:
## $ School : Factor w/ 5 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Section : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Very Ahead : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Middling : int 5 8 9 14 9 7 19 3 6 13 ...
## $ Behind : int 54 40 35 44 42 29 22 37 29 40 ...
## $ More Behind: int 3 10 12 5 2 3 5 11 8 5 ...
## $ Very Behind: int 9 16 13 12 24 10 14 18 12 5 ...
## $ Complete : int 10 6 11 10 8 9 19 5 10 20 ...
Ensure there are no “NAs”
numNAs <- (which((is.na.data.frame(schoolInfoDF))=='TRUE'))
length(numNAs)
## [1] 0
Ensure there are no duplicate entries
numDup <- which(duplicated(schoolInfoDF))
length(numDup)
## [1] 0
See number of sections by school
table(schoolInfoDF$School)
##
## A B C D E
## 13 12 3 1 1
table(schoolInfoDF$School)[which.max(table(schoolInfoDF$School))]
## A
## 13
View number of students in each section by school
studentsBySchool <- aggregate(cbind(schoolInfoDF$`Very Ahead`, schoolInfoDF$Middling, schoolInfoDF$Behind, schoolInfoDF$`More Behind`,schoolInfoDF$`Very Behind`, schoolInfoDF$Complete), list(schoolInfoDF$School), sum)
Rename Columns
colnames(studentsBySchool) <- c("School","Very Ahead", "Middling", "Behind", "More Behind", "Very Behind"
, "Complete" )
Remove “very ahead” because no students fall into this category
studentsBySchool <- studentsBySchool[,c(1,3,4,5,6,7)]
Add total # students attribute
studentsBySchool$Total <- rowSums(studentsBySchool[,-1])
studentsBySchool
## School Middling Behind More Behind Very Behind Complete Total
## 1 A 113 450 73 154 142 932
## 2 B 84 201 14 22 125 446
## 3 C 11 39 4 12 19 85
## 4 D 3 8 2 6 3 22
## 5 E 11 56 7 15 27 116
Want to use percentages to accurately represent spread of students across levels without skewing with varying class sizes
normalize_Students <- studentsBySchool
normalize_Students$Middling <- round((normalize_Students$Middling/normalize_Students$Total),digits=2)
normalize_Students$Behind <- round((normalize_Students$Behind/normalize_Students$Total),digits=2)
normalize_Students$`More Behind` <- round((normalize_Students$`More Behind`/normalize_Students$Total),digits=2)
normalize_Students$`Very Behind` <- round((normalize_Students$`Very Behind`/normalize_Students$Total),digits=2)
normalize_Students$Complete <- round((normalize_Students$Complete/normalize_Students$Total),digits=2)
normalize_Students
## School Middling Behind More Behind Very Behind Complete Total
## 1 A 0.12 0.48 0.08 0.17 0.15 932
## 2 B 0.19 0.45 0.03 0.05 0.28 446
## 3 C 0.13 0.46 0.05 0.14 0.22 85
## 4 D 0.14 0.36 0.09 0.27 0.14 22
## 5 E 0.09 0.48 0.06 0.13 0.23 116
Remove total students again
normalize_Students <- normalize_Students[,-7]
studentsBySchool <- studentsBySchool[,-7]
Re-Analyze
normalize_Students
## School Middling Behind More Behind Very Behind Complete
## 1 A 0.12 0.48 0.08 0.17 0.15
## 2 B 0.19 0.45 0.03 0.05 0.28
## 3 C 0.13 0.46 0.05 0.14 0.22
## 4 D 0.14 0.36 0.09 0.27 0.14
## 5 E 0.09 0.48 0.06 0.13 0.23
str(normalize_Students)
## 'data.frame': 5 obs. of 6 variables:
## $ School : Factor w/ 5 levels "A","B","C","D",..: 1 2 3 4 5
## $ Middling : num 0.12 0.19 0.13 0.14 0.09
## $ Behind : num 0.48 0.45 0.46 0.36 0.48
## $ More Behind: num 0.08 0.03 0.05 0.09 0.06
## $ Very Behind: num 0.17 0.05 0.14 0.27 0.13
## $ Complete : num 0.15 0.28 0.22 0.14 0.23
summary(normalize_Students)
## School Middling Behind More Behind Very Behind
## A:1 Min. :0.090 Min. :0.360 Min. :0.030 Min. :0.050
## B:1 1st Qu.:0.120 1st Qu.:0.450 1st Qu.:0.050 1st Qu.:0.130
## C:1 Median :0.130 Median :0.460 Median :0.060 Median :0.140
## D:1 Mean :0.134 Mean :0.446 Mean :0.062 Mean :0.152
## E:1 3rd Qu.:0.140 3rd Qu.:0.480 3rd Qu.:0.080 3rd Qu.:0.170
## Max. :0.190 Max. :0.480 Max. :0.090 Max. :0.270
## Complete
## Min. :0.140
## 1st Qu.:0.150
## Median :0.220
## Mean :0.204
## 3rd Qu.:0.230
## Max. :0.280
Melt down data to graph
require(tidyr)
df.long <- gather(normalize_Students, variable,value, -School)
GGPLOT all schools on one graph
df.graph <- ggplot(data=df.long, aes(x = School, y=value, fill = variable)) +
geom_col(position=position_dodge())
df.graph

Breakout data frames for each school, can remove “School” attribute
schoolA <- data.frame(studentsBySchool[1,-1])
schoolB <- data.frame(studentsBySchool[2,-1])
schoolC <- data.frame(studentsBySchool[3,-1])
schoolD <- data.frame(studentsBySchool[4,-1])
schoolE <- data.frame(studentsBySchool[5,-1])
schoolA
## Middling Behind More.Behind Very.Behind Complete
## 1 113 450 73 154 142
schoolB
## Middling Behind More.Behind Very.Behind Complete
## 2 84 201 14 22 125
schoolC
## Middling Behind More.Behind Very.Behind Complete
## 3 11 39 4 12 19
schoolD
## Middling Behind More.Behind Very.Behind Complete
## 4 3 8 2 6 3
schoolE
## Middling Behind More.Behind Very.Behind Complete
## 5 11 56 7 15 27
Plot individual school A info
df.longA <- gather(schoolA, variable, value)
df.graphA <- ggplot(data=df.longA, aes(x = variable, y=value, fill = variable)) +
geom_col(position=position_dodge())
df.graphA

Plot individual school B info
df.longB <- gather(schoolB, variable, value)
df.graphB <- ggplot(data=df.longB, aes(x = variable, y=value, fill = variable)) +
geom_col(position=position_dodge())
df.graphB

Plot individual school C info
df.longC <- gather(schoolC, variable, value)
df.graphC <- ggplot(data=df.longC, aes(x = variable, y=value, fill = variable)) +
geom_col(position=position_dodge())
df.graphC

Plot individual school D info
df.longD <- gather(schoolD, variable, value)
df.graphD <- ggplot(data=df.longD, aes(x = variable, y=value, fill = variable)) +
geom_col(position=position_dodge())
df.graphD

Plot individual school E info
df.longE <- gather(schoolD, variable, value)
df.graphE <- ggplot(data=df.longE, aes(x = variable, y=value, fill = variable)) +
geom_col(position=position_dodge())
df.graphE
