Dataset source: https://vincentarelbundock.github.io/Rdatasets/doc/AER/CASchools.html

Question

How did the number of teachers, number of computers, district average income, and other factors affect the reading and math test scores of 5th grade students in California school districts in 1998 and 1999?

library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
filename <- "https://raw.githubusercontent.com/djunga/msdsbridgefinal/main/CASchools.csv"
df<-read.csv(file=filename)
head(df, n=5L)
##   X district                          school  county grades students teachers
## 1 1    75119              Sunol Glen Unified Alameda  KK-08      195    10.90
## 2 2    61499            Manzanita Elementary   Butte  KK-08      240    11.15
## 3 3    61549     Thermalito Union Elementary   Butte  KK-08     1550    82.90
## 4 4    61457 Golden Feather Union Elementary   Butte  KK-08      243    14.00
## 5 5    61523        Palermo Union Elementary   Butte  KK-08     1335    71.50
##   calworks   lunch computer expenditure    income   english  read  math
## 1   0.5102  2.0408       67    6384.911 22.690001  0.000000 691.6 690.0
## 2  15.4167 47.9167      101    5099.381  9.824000  4.583333 660.5 661.9
## 3  55.0323 76.3226      169    5501.955  8.978000 30.000002 636.3 650.9
## 4  36.4754 77.0492       85    7101.831  8.978000  0.000000 651.9 643.5
## 5  33.1086 78.4270      171    5235.988  9.080333 13.857677 641.8 639.9

Data Exploration

Summary of percent of ESL students (English as a 2nd language)

summary(df$english) # Median percent of English learners
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.941   8.778  15.768  22.970  85.540

Summary of district average income (in USD 1000)

summary(df$income)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.335  10.639  13.728  15.317  17.629  55.328

Categorical vs numerical variables

str(df) # Note which variables are categorical vs numerical
## 'data.frame':    420 obs. of  15 variables:
##  $ X          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ district   : int  75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
##  $ school     : chr  "Sunol Glen Unified" "Manzanita Elementary" "Thermalito Union Elementary" "Golden Feather Union Elementary" ...
##  $ county     : chr  "Alameda" "Butte" "Butte" "Butte" ...
##  $ grades     : chr  "KK-08" "KK-08" "KK-08" "KK-08" ...
##  $ students   : int  195 240 1550 243 1335 137 195 888 379 2247 ...
##  $ teachers   : num  10.9 11.1 82.9 14 71.5 ...
##  $ calworks   : num  0.51 15.42 55.03 36.48 33.11 ...
##  $ lunch      : num  2.04 47.92 76.32 77.05 78.43 ...
##  $ computer   : int  67 101 169 85 171 25 28 66 35 0 ...
##  $ expenditure: num  6385 5099 5502 7102 5236 ...
##  $ income     : num  22.69 9.82 8.98 8.98 9.08 ...
##  $ english    : num  0 4.58 30 0 13.86 ...
##  $ read       : num  692 660 636 652 642 ...
##  $ math       : num  690 662 651 644 640 ...

Identify the school with the lowest district average income, and the highest.

df$school[df$income==min(df$income)] # lowest
## [1] "Allensworth Elementary"
df$school[df$income==max(df$income)] # highest
## [1] "Reed Union Elementary"

Identify the school with the lowest percentage of English-language learners, and the highest.

df$school[df$english==min(df$english)] # lowest (there are many with 0 English learners)
##  [1] "Sunol Glen Unified"               "Golden Feather Union Elementary" 
##  [3] "Montague Elementary"              "McCloud Union Elementary"        
##  [5] "Bangor Union Elementary"          "Cuddeback Union Elementary"      
##  [7] "Blue Lake Union Elementary"       "Peninsula Union Elementary"      
##  [9] "Dunsmuir Elementary"              "Black Butte Union Elementary"    
## [11] "Summerville Elementary"           "Ophir Elementary"                
## [13] "Montgomery Elementary"            "Shasta Union Elementary"         
## [15] "Monte Rio Union Elementary"       "Rohnerville Elementary"          
## [17] "Mountain Union Elementary"        "South Fork Union Elementary"     
## [19] "Junction Elementary"              "Kernville Union Elementary"      
## [21] "Chicago Park Elementary"          "Trinidad Union Elementary"       
## [23] "Three Rivers Union Elementary"    "Bella Vista Elementary"          
## [25] "Colfax Elementary"                "Bass Lake Joint Elementary"      
## [27] "Cutten Elementary"                "Foresthill Union Elementary"     
## [29] "Freshwater Elementary"            "Springville Union Elementary"    
## [31] "Fieldbrook Elementary"            "Gold Oak Union Elementary"       
## [33] "Hydesville Elementary"            "Placer Hills Union Elementary"   
## [35] "Ready Springs Union Elementary"   "Coarsegold Union Elementary"     
## [37] "Richmond Elementary"              "Alta-Dutch Flat Union Elementary"
## [39] "Latrobe Elementary"               "Douglas City Elementary"         
## [41] "Newcastle Elementary"             "Weaverville Elementary"          
## [43] "North Cow Creek Elementary"       "Jacoby Creek Elementary"         
## [45] "Fort Ross Elementary"             "Knights Ferry Elementary"        
## [47] "Mountain Elementary"              "Grant Elementary"                
## [49] "Happy Valley Elementary"
df$school[df$english==max(df$english)] # highest
## [1] "Lost Hills Union Elementary"

Many schools have 0 ESL students.

Data Wrangling

Add a column named “tsratio”: The ratio of teacher to student.

ratio <- df$students/df$teachers
ratio <- round(ratio, 0)
a <- df[,1:7]
b <- df[,8:15]
df1 <- data.frame(a,tsratio=ratio,b)
df1$tsratio <- ratio
head(df1, n=5L)
##   X district                          school  county grades students teachers
## 1 1    75119              Sunol Glen Unified Alameda  KK-08      195    10.90
## 2 2    61499            Manzanita Elementary   Butte  KK-08      240    11.15
## 3 3    61549     Thermalito Union Elementary   Butte  KK-08     1550    82.90
## 4 4    61457 Golden Feather Union Elementary   Butte  KK-08      243    14.00
## 5 5    61523        Palermo Union Elementary   Butte  KK-08     1335    71.50
##   tsratio calworks   lunch computer expenditure    income   english  read  math
## 1      18   0.5102  2.0408       67    6384.911 22.690001  0.000000 691.6 690.0
## 2      22  15.4167 47.9167      101    5099.381  9.824000  4.583333 660.5 661.9
## 3      19  55.0323 76.3226      169    5501.955  8.978000 30.000002 636.3 650.9
## 4      17  36.4754 77.0492       85    7101.831  8.978000  0.000000 651.9 643.5
## 5      19  33.1086 78.4270      171    5235.988  9.080333 13.857677 641.8 639.9
df2<-df1[c(8,9,10,11,12,13,14,15,16)]   # contains only the columns with numeric data.

Visualizations

Scatterplots

qplot(tsratio, 
      read, 
      data=df1, 
      color=math, 
      main="Student:Teacher Ratio vs Math & Reading Scores",
      xlab="Student:Teacher Ratio",
      ylab="Reading Score"
      )

qplot(english,
      read, 
      data=df1,
      colour=math,
      main="% ESL Students vs Math & Reading Scores",
      xlab="% ESL Students",
      ylab="Reading Score"
      ) + 
      scale_color_gradient(low = "#ffd000", high = "#ff0000")

Boxplots

Reading scores

ggplot(df1, aes(x=read)) + geom_boxplot( color="red", fill="orange", alpha=0.2, outlier.color="black") +
  theme(axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())

Math scores

ggplot(df1, aes(x=math)) + geom_boxplot(color="dodgerblue4", fill="deepskyblue", alpha=0.2, outlier.color="black") +
  theme(axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())

Histogram

ggplot( data=df1, aes(x=lunch)) +
  geom_histogram( binwidth=10, fill="#69b3a2", color="forestgreen", alpha=0.3) +
  ggtitle("% Reduced Lunch Count") +
  theme(
    plot.title = element_text(size=15)
  )

Correlation Plot

corrplot(corr = cor(df2), tl.col="black", tl.cex=0.6, order='AOE')

Conclusion

Based on the correlation plot and scatterplots, the student-teacher ratio appears to be negatively correlated with both reading and math scores. It makes sense that, the more students in a class, the less individual attention a teacher can give. As a result, the students have a more difficult time addressing the academic areas in which they need improvement. Other factors that also had a significant negative correlation with the test scores are percentage of students on reduced lunch, and percentage of ESL students. However, there is somewhat weaker correlation between ESL students and math scores compared to reading scores.