Question
How did the number of teachers, number of computers, district average income, and other factors affect the reading and math test scores of 5th grade students in California school districts in 1998 and 1999?
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
filename <- "https://raw.githubusercontent.com/djunga/msdsbridgefinal/main/CASchools.csv"
df<-read.csv(file=filename)
head(df, n=5L)
## X district school county grades students teachers
## 1 1 75119 Sunol Glen Unified Alameda KK-08 195 10.90
## 2 2 61499 Manzanita Elementary Butte KK-08 240 11.15
## 3 3 61549 Thermalito Union Elementary Butte KK-08 1550 82.90
## 4 4 61457 Golden Feather Union Elementary Butte KK-08 243 14.00
## 5 5 61523 Palermo Union Elementary Butte KK-08 1335 71.50
## calworks lunch computer expenditure income english read math
## 1 0.5102 2.0408 67 6384.911 22.690001 0.000000 691.6 690.0
## 2 15.4167 47.9167 101 5099.381 9.824000 4.583333 660.5 661.9
## 3 55.0323 76.3226 169 5501.955 8.978000 30.000002 636.3 650.9
## 4 36.4754 77.0492 85 7101.831 8.978000 0.000000 651.9 643.5
## 5 33.1086 78.4270 171 5235.988 9.080333 13.857677 641.8 639.9
Data Exploration
Summary of percent of ESL students (English as a 2nd language)
summary(df$english) # Median percent of English learners
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.941 8.778 15.768 22.970 85.540
Summary of district average income (in USD 1000)
summary(df$income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.335 10.639 13.728 15.317 17.629 55.328
Categorical vs numerical variables
str(df) # Note which variables are categorical vs numerical
## 'data.frame': 420 obs. of 15 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ district : int 75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
## $ school : chr "Sunol Glen Unified" "Manzanita Elementary" "Thermalito Union Elementary" "Golden Feather Union Elementary" ...
## $ county : chr "Alameda" "Butte" "Butte" "Butte" ...
## $ grades : chr "KK-08" "KK-08" "KK-08" "KK-08" ...
## $ students : int 195 240 1550 243 1335 137 195 888 379 2247 ...
## $ teachers : num 10.9 11.1 82.9 14 71.5 ...
## $ calworks : num 0.51 15.42 55.03 36.48 33.11 ...
## $ lunch : num 2.04 47.92 76.32 77.05 78.43 ...
## $ computer : int 67 101 169 85 171 25 28 66 35 0 ...
## $ expenditure: num 6385 5099 5502 7102 5236 ...
## $ income : num 22.69 9.82 8.98 8.98 9.08 ...
## $ english : num 0 4.58 30 0 13.86 ...
## $ read : num 692 660 636 652 642 ...
## $ math : num 690 662 651 644 640 ...
Identify the school with the lowest district average income, and the highest.
df$school[df$income==min(df$income)] # lowest
## [1] "Allensworth Elementary"
df$school[df$income==max(df$income)] # highest
## [1] "Reed Union Elementary"
Identify the school with the lowest percentage of English-language learners, and the highest.
df$school[df$english==min(df$english)] # lowest (there are many with 0 English learners)
## [1] "Sunol Glen Unified" "Golden Feather Union Elementary"
## [3] "Montague Elementary" "McCloud Union Elementary"
## [5] "Bangor Union Elementary" "Cuddeback Union Elementary"
## [7] "Blue Lake Union Elementary" "Peninsula Union Elementary"
## [9] "Dunsmuir Elementary" "Black Butte Union Elementary"
## [11] "Summerville Elementary" "Ophir Elementary"
## [13] "Montgomery Elementary" "Shasta Union Elementary"
## [15] "Monte Rio Union Elementary" "Rohnerville Elementary"
## [17] "Mountain Union Elementary" "South Fork Union Elementary"
## [19] "Junction Elementary" "Kernville Union Elementary"
## [21] "Chicago Park Elementary" "Trinidad Union Elementary"
## [23] "Three Rivers Union Elementary" "Bella Vista Elementary"
## [25] "Colfax Elementary" "Bass Lake Joint Elementary"
## [27] "Cutten Elementary" "Foresthill Union Elementary"
## [29] "Freshwater Elementary" "Springville Union Elementary"
## [31] "Fieldbrook Elementary" "Gold Oak Union Elementary"
## [33] "Hydesville Elementary" "Placer Hills Union Elementary"
## [35] "Ready Springs Union Elementary" "Coarsegold Union Elementary"
## [37] "Richmond Elementary" "Alta-Dutch Flat Union Elementary"
## [39] "Latrobe Elementary" "Douglas City Elementary"
## [41] "Newcastle Elementary" "Weaverville Elementary"
## [43] "North Cow Creek Elementary" "Jacoby Creek Elementary"
## [45] "Fort Ross Elementary" "Knights Ferry Elementary"
## [47] "Mountain Elementary" "Grant Elementary"
## [49] "Happy Valley Elementary"
df$school[df$english==max(df$english)] # highest
## [1] "Lost Hills Union Elementary"
Many schools have 0 ESL students.
Data Wrangling
Add a column named “tsratio”: The ratio of teacher to student.
ratio <- df$students/df$teachers
ratio <- round(ratio, 0)
a <- df[,1:7]
b <- df[,8:15]
df1 <- data.frame(a,tsratio=ratio,b)
df1$tsratio <- ratio
head(df1, n=5L)
## X district school county grades students teachers
## 1 1 75119 Sunol Glen Unified Alameda KK-08 195 10.90
## 2 2 61499 Manzanita Elementary Butte KK-08 240 11.15
## 3 3 61549 Thermalito Union Elementary Butte KK-08 1550 82.90
## 4 4 61457 Golden Feather Union Elementary Butte KK-08 243 14.00
## 5 5 61523 Palermo Union Elementary Butte KK-08 1335 71.50
## tsratio calworks lunch computer expenditure income english read math
## 1 18 0.5102 2.0408 67 6384.911 22.690001 0.000000 691.6 690.0
## 2 22 15.4167 47.9167 101 5099.381 9.824000 4.583333 660.5 661.9
## 3 19 55.0323 76.3226 169 5501.955 8.978000 30.000002 636.3 650.9
## 4 17 36.4754 77.0492 85 7101.831 8.978000 0.000000 651.9 643.5
## 5 19 33.1086 78.4270 171 5235.988 9.080333 13.857677 641.8 639.9
df2<-df1[c(8,9,10,11,12,13,14,15,16)] # contains only the columns with numeric data.
Visualizations
Scatterplots
qplot(tsratio,
read,
data=df1,
color=math,
main="Student:Teacher Ratio vs Math & Reading Scores",
xlab="Student:Teacher Ratio",
ylab="Reading Score"
)

qplot(english,
read,
data=df1,
colour=math,
main="% ESL Students vs Math & Reading Scores",
xlab="% ESL Students",
ylab="Reading Score"
) +
scale_color_gradient(low = "#ffd000", high = "#ff0000")

Boxplots
Reading scores
ggplot(df1, aes(x=read)) + geom_boxplot( color="red", fill="orange", alpha=0.2, outlier.color="black") +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())

Math scores
ggplot(df1, aes(x=math)) + geom_boxplot(color="dodgerblue4", fill="deepskyblue", alpha=0.2, outlier.color="black") +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())

Histogram
ggplot( data=df1, aes(x=lunch)) +
geom_histogram( binwidth=10, fill="#69b3a2", color="forestgreen", alpha=0.3) +
ggtitle("% Reduced Lunch Count") +
theme(
plot.title = element_text(size=15)
)

Correlation Plot
corrplot(corr = cor(df2), tl.col="black", tl.cex=0.6, order='AOE')

Conclusion
Based on the correlation plot and scatterplots, the student-teacher ratio appears to be negatively correlated with both reading and math scores. It makes sense that, the more students in a class, the less individual attention a teacher can give. As a result, the students have a more difficult time addressing the academic areas in which they need improvement. Other factors that also had a significant negative correlation with the test scores are percentage of students on reduced lunch, and percentage of ESL students. However, there is somewhat weaker correlation between ESL students and math scores compared to reading scores.