# Load the CASchools data set (comma-separated, with a header row) from GitHub
url <- "https://github.com/chinedu2301/CUNY-SPS-Bridge/raw/main/CASchools.csv"
data <- read.table(
  file = url,
  sep = ",",
  header = TRUE
)
# Preview the first rows of the raw data
head(data)
## X district school county grades students teachers
## 1 1 75119 Sunol Glen Unified Alameda KK-08 195 10.90
## 2 2 61499 Manzanita Elementary Butte KK-08 240 11.15
## 3 3 61549 Thermalito Union Elementary Butte KK-08 1550 82.90
## 4 4 61457 Golden Feather Union Elementary Butte KK-08 243 14.00
## 5 5 61523 Palermo Union Elementary Butte KK-08 1335 71.50
## 6 6 62042 Burrel Union Elementary Fresno KK-08 137 6.40
## calworks lunch computer expenditure income english read math
## 1 0.5102 2.0408 67 6384.911 22.690001 0.000000 691.6 690.0
## 2 15.4167 47.9167 101 5099.381 9.824000 4.583333 660.5 661.9
## 3 55.0323 76.3226 169 5501.955 8.978000 30.000002 636.3 650.9
## 4 36.4754 77.0492 85 7101.831 8.978000 0.000000 651.9 643.5
## 5 33.1086 78.4270 171 5235.988 9.080333 13.857677 641.8 639.9
## 6 12.3188 86.9565 25 5580.147 10.415000 12.408759 605.7 605.4
# Per-column summary: quartiles/mean for numeric columns, length/class for
# character columns
summary(data)
## X district school county
## Min. : 1.0 Min. :61382 Length:420 Length:420
## 1st Qu.:105.8 1st Qu.:64308 Class :character Class :character
## Median :210.5 Median :67761 Mode :character Mode :character
## Mean :210.5 Mean :67473
## 3rd Qu.:315.2 3rd Qu.:70419
## Max. :420.0 Max. :75440
## grades students teachers calworks
## Length:420 Min. : 81.0 Min. : 4.85 Min. : 0.000
## Class :character 1st Qu.: 379.0 1st Qu.: 19.66 1st Qu.: 4.395
## Mode :character Median : 950.5 Median : 48.56 Median :10.520
## Mean : 2628.8 Mean : 129.07 Mean :13.246
## 3rd Qu.: 3008.0 3rd Qu.: 146.35 3rd Qu.:18.981
## Max. :27176.0 Max. :1429.00 Max. :78.994
## lunch computer expenditure income
## Min. : 0.00 Min. : 0.0 Min. :3926 Min. : 5.335
## 1st Qu.: 23.28 1st Qu.: 46.0 1st Qu.:4906 1st Qu.:10.639
## Median : 41.75 Median : 117.5 Median :5215 Median :13.728
## Mean : 44.71 Mean : 303.4 Mean :5312 Mean :15.317
## 3rd Qu.: 66.86 3rd Qu.: 375.2 3rd Qu.:5601 3rd Qu.:17.629
## Max. :100.00 Max. :3324.0 Max. :7712 Max. :55.328
## english read math
## Min. : 0.000 Min. :604.5 Min. :605.4
## 1st Qu.: 1.941 1st Qu.:640.4 1st Qu.:639.4
## Median : 8.778 Median :655.8 Median :652.5
## Mean :15.768 Mean :655.0 Mean :653.3
## 3rd Qu.:22.970 3rd Qu.:668.7 3rd Qu.:665.9
## Max. :85.540 Max. :704.0 Max. :709.5
# Inspect the structure of the data frame: dimensions, column names and types
str(data)
## 'data.frame': 420 obs. of 15 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ district : int 75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
## $ school : chr "Sunol Glen Unified" "Manzanita Elementary" "Thermalito Union Elementary" "Golden Feather Union Elementary" ...
## $ county : chr "Alameda" "Butte" "Butte" "Butte" ...
## $ grades : chr "KK-08" "KK-08" "KK-08" "KK-08" ...
## $ students : int 195 240 1550 243 1335 137 195 888 379 2247 ...
## $ teachers : num 10.9 11.1 82.9 14 71.5 ...
## $ calworks : num 0.51 15.42 55.03 36.48 33.11 ...
## $ lunch : num 2.04 47.92 76.32 77.05 78.43 ...
## $ computer : int 67 101 169 85 171 25 28 66 35 0 ...
## $ expenditure: num 6385 5099 5502 7102 5236 ...
## $ income : num 22.69 9.82 8.98 8.98 9.08 ...
## $ english : num 0 4.58 30 0 13.86 ...
## $ read : num 692 660 636 652 642 ...
## $ math : num 690 662 651 644 640 ...
# Load the libraries used for data wrangling and plotting
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.4 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggthemes)
# Columns kept for the analysis
var <- c("school", "grades", "income", "teachers", "students", "read", "math")

# Sort from highest to lowest score (math first, then read, then english as
# tie-breakers), keep only the analysis columns, and add the teacher-to-student
# ratio.
# desc() takes a single vector, so every sort key needs its own desc() call;
# passing several columns to one desc() does not sort by all of them (and is
# rejected by newer dplyr releases).
# The sort runs before select() because `english` is not among the kept columns.
data <- data %>%
  as_tibble() %>%
  arrange(desc(math), desc(read), desc(english)) %>%
  select(all_of(var)) %>%
  mutate(teacher_stu = teachers / students)
head(data)
## # A tibble: 6 x 8
## school grades income teachers students read math teacher_stu
## <chr> <chr> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 Los Altos Elementary KK-08 41.7 208. 3724 704 710. 0.0560
## 2 Las Lomitas Elementary KK-08 28.7 59.7 984 701. 708. 0.0607
## 3 Cold Spring Elementary KK-06 43.2 12.3 220 693. 704. 0.0560
## 4 Saratoga Union Elemen~ KK-08 40.4 124. 2341 699. 702. 0.0530
## 5 Hillsborough City Ele~ KK-08 35.8 87.1 1318 695. 701. 0.0661
## 6 Portola Valley Elemen~ KK-08 50.7 44.6 687 698. 700. 0.0649
# Check whether any cell in the data is missing (NA)
any(is.na(data))
## [1] FALSE
# Scatter plot of income (y) against math score (x), faceted into rows by
# `grades`, with a smoothed trend line drawn without its confidence band
pl_Income_Math <- ggplot(data, aes(x = math, y = income)) +
  geom_point() +
  stat_smooth(se = FALSE) +
  labs(
    title = "Income vs Math Score",
    x = "Math Score",
    y = "Income"
  ) +
  facet_grid(rows = "grades") +
  theme_economist()
print(pl_Income_Math)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Scatter plot of income (y) against reading score (x), faceted into rows by
# `grades`, with a smoothed trend line drawn without its confidence band
pl_Income_Read <- ggplot(data, aes(x = read, y = income)) +
  geom_point() +
  stat_smooth(se = FALSE) +
  labs(
    title = "Income vs Reading Score",
    x = "Reading Score",
    y = "Income"
  ) +
  facet_grid(rows = "grades") +
  theme_economist()
print(pl_Income_Read)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Scatter plot of math score (y) against the teacher-to-student ratio (x),
# with a smoothed trend line drawn without its confidence band
pl_Math_TSR <- ggplot(data, aes(x = teacher_stu, y = math)) +
  geom_point() +
  stat_smooth(se = FALSE) +
  labs(
    title = "Math Score vs Teacher-Students Ratio",
    x = "Teachers-Students Ratio",
    y = "Math Score"
  ) +
  theme_economist()
print(pl_Math_TSR)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Test the correlation between math score and income (cor.test defaults)
cor_inc_Math <- cor.test(x = data$math, y = data$income)
print("Correlation between Income and Math Scores")
## [1] "Correlation between Income and Math Scores"
print(cor_inc_Math)
##
## Pearson's product-moment correlation
##
## data: data$math and data$income
## t = 20.006, df = 418, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6470117 0.7452127
## sample estimates:
## cor
## 0.6993981
# Test the correlation between reading score and income (cor.test defaults).
# The result is assigned once; the original repeated the assignment target.
cor_inc_Read <- cor.test(data$read, data$income)
print("Correlation between Income and Reading Scores")
## [1] "Correlation between Income and Reading Scores"
print(cor_inc_Read)
##
## Pearson's product-moment correlation
##
## data: data$read and data$income
## t = 19.918, df = 418, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6452147 0.7438378
## sample estimates:
## cor
## 0.6978189