Description: https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/doc/wooldridge/meap93.html
CSV: https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/wooldridge/meap93.csv
The objective of this analysis is to explore the relationship between:
expend - school expenditure per student
school SES profile is going to be created based on lnchprg (proxy for SES)
Source: http://facultyprofiles.ucr.edu/gsoe_dept/faculty/Robert_Ream/Ream_Ryan_Racial%20Inequality_2013.pdf
# Load the MEAP93 data into a data frame.
# The data directory is named once and joined to the file name with
# file.path(), so the file is read without calling setwd() and the
# session's working directory is left untouched.
meap_data_dir <- "C:/Users/stina/Documents/R programming Bridge Workshop/Week 3 Assignment"
data.frame_meap_all <- read.csv(file.path(meap_data_dir, "Meap93Data.csv"))
Subset: Take a look at lnchprg, expend, math10, and sci11
# Keep four columns of the raw data, selected by position (2, 5, 10, 11),
# and give them descriptive names in that same order.
data.frame_meap <- data.frame_meap_all[, c(2, 5, 10, 11)]
names(data.frame_meap) <- c(
  "lunch_program_rate",
  "expend_per_student",
  "math_pass_rate",
  "science_pass_rate"
)
NOTE: The result below tells us that there are no NA values
# Data-quality checks; each "##" line records the output of the call above it.
# is.na() returns a logical for every cell (TRUE where the value is NA) and
# any() reports whether at least one of those logicals is TRUE.
any(is.na(data.frame_meap))
## [1] FALSE
# Look for zero or negative values in each column.
any(data.frame_meap[["lunch_program_rate"]] <= 0)
## [1] FALSE
any(data.frame_meap[["expend_per_student"]] <= 0)
## [1] FALSE
any(data.frame_meap[["math_pass_rate"]] <= 0)
## [1] FALSE
any(data.frame_meap[["science_pass_rate"]] <= 0)
## [1] FALSE
# Confirm that each column is stored as a numeric type.
is.numeric(data.frame_meap[["lunch_program_rate"]])
## [1] TRUE
is.numeric(data.frame_meap[["expend_per_student"]])
## [1] TRUE
is.numeric(data.frame_meap[["math_pass_rate"]])
## [1] TRUE
is.numeric(data.frame_meap[["science_pass_rate"]])
## [1] TRUE
This combines both the math_pass_rate and science_pass_rate.
# Overall MEAP score: the mean of the math and science pass rates.
data.frame_meap$meap_score <- with(
  data.frame_meap,
  (math_pass_rate + science_pass_rate) / 2
)
The “lunch_program_rate” serves as a proxy for socioeconomic status (SES) profile of the school.
An average lunch program participation corresponds to an average-SES profile school. A high lunch program participation corresponds to a low-SES profile school. A low lunch program participation corresponds to a high-SES profile school.
The average value of lunch_program_rate across the sample is around 25.
The column “school_SES_profile” will be a factor with three values: “high_SES”, “average_SES”, and “low_SES”.
# Classify each school by its lunch program participation rate:
#   below 18      -> "high_SES"
#   18 through 30 -> "average_SES"
#   above 30      -> "low_SES"
# Every comparison uses the full column name lunch_program_rate; the earlier
# `$lunch_program` spelling only resolved through partial matching of `$`.
# The labels are built in one vectorised expression, so the new column
# always has one entry per row.
data.frame_meap$school_SES_profile <- with(
  data.frame_meap,
  ifelse(
    lunch_program_rate < 18,
    "high_SES",
    ifelse(lunch_program_rate > 30, "low_SES", "average_SES")
  )
)
class(data.frame_meap$school_SES_profile)
## [1] "character"
# Convert the labels to a factor so they are treated as categories.
data.frame_meap$school_SES_profile <- as.factor(data.frame_meap$school_SES_profile)
class(data.frame_meap$school_SES_profile)
## [1] "factor"
# Bin expenditure per student into five labelled levels with cut():
#   <= 3500        -> "Expend Level 1"
#   (3500, 4500]   -> "Expend Level 2"
#   (4500, 5500]   -> "Expend Level 3"
#   (5500, 6500]   -> "Expend Level 4"
#   > 6500         -> "Expend Level 5"
# right = TRUE makes each interval closed on its upper bound, matching the
# `>` / `<=` comparisons this replaces. The result is stored as character
# first so the conversion to a factor below stays an explicit step.
data.frame_meap$expend_level <- as.character(cut(
  data.frame_meap$expend_per_student,
  breaks = c(-Inf, 3500, 4500, 5500, 6500, Inf),
  labels = paste("Expend Level", 1:5),
  right = TRUE
))
class(data.frame_meap$expend_level)
## [1] "character"
# Convert the labels to a factor so they are treated as categories.
data.frame_meap$expend_level <- as.factor(data.frame_meap$expend_level)
class(data.frame_meap$expend_level)
## [1] "factor"
# Print summary statistics for every column of the prepared data frame.
summary(data.frame_meap)
## lunch_program_rate expend_per_student math_pass_rate science_pass_rate
## Min. : 1.40 Min. :3332 Min. : 1.90 Min. : 7.20
## 1st Qu.:14.62 1st Qu.:3821 1st Qu.:16.62 1st Qu.:41.30
## Median :23.85 Median :4145 Median :23.40 Median :49.10
## Mean :25.20 Mean :4377 Mean :24.11 Mean :49.18
## 3rd Qu.:33.83 3rd Qu.:4659 3rd Qu.:30.05 3rd Qu.:57.15
## Max. :79.50 Max. :7419 Max. :66.70 Max. :85.70
## meap_score school_SES_profile expend_level
## Min. : 4.95 average_SES:135 Expend Level 1: 10
## 1st Qu.:30.75 high_SES :136 Expend Level 2:273
## Median :36.25 low_SES :137 Expend Level 3: 83
## Mean :36.64 Expend Level 4: 27
## 3rd Qu.:42.31 Expend Level 5: 15
## Max. :65.80
# summary by grouping with school_SES_profile
library(purrr)
# Split the rows by school_SES_profile and summarise each group separately.
map(
  split(data.frame_meap, data.frame_meap$school_SES_profile),
  summary
)
## $average_SES
## lunch_program_rate expend_per_student math_pass_rate science_pass_rate
## Min. :18.00 Min. :3425 Min. : 4.30 Min. :22.10
## 1st Qu.:20.15 1st Qu.:3770 1st Qu.:16.70 1st Qu.:41.15
## Median :23.80 Median :4003 Median :22.50 Median :47.60
## Mean :23.78 Mean :4209 Mean :23.77 Mean :48.79
## 3rd Qu.:27.30 3rd Qu.:4381 3rd Qu.:27.75 3rd Qu.:56.20
## Max. :29.90 Max. :7419 Max. :60.60 Max. :80.00
## meap_score school_SES_profile expend_level
## Min. :17.65 average_SES:135 Expend Level 1: 5
## 1st Qu.:30.48 high_SES : 0 Expend Level 2:100
## Median :35.30 low_SES : 0 Expend Level 3: 23
## Mean :36.28 Expend Level 4: 4
## 3rd Qu.:41.38 Expend Level 5: 3
## Max. :62.05
##
## $high_SES
## lunch_program_rate expend_per_student math_pass_rate science_pass_rate
## Min. : 1.400 Min. :3332 Min. : 2.40 Min. :30.10
## 1st Qu.: 8.475 1st Qu.:4075 1st Qu.:21.45 1st Qu.:48.25
## Median :11.050 Median :4355 Median :26.80 Median :53.75
## Mean :11.071 Mean :4634 Mean :28.27 Mean :54.14
## 3rd Qu.:14.475 3rd Qu.:5030 3rd Qu.:33.23 3rd Qu.:60.60
## Max. :17.800 Max. :7034 Max. :62.40 Max. :74.60
## meap_score school_SES_profile expend_level
## Min. :24.55 average_SES: 0 Expend Level 1: 2
## 1st Qu.:35.89 high_SES :136 Expend Level 2:78
## Median :40.60 low_SES : 0 Expend Level 3:34
## Mean :41.21 Expend Level 4:13
## 3rd Qu.:45.26 Expend Level 5: 9
## Max. :64.55
##
## $low_SES
## lunch_program_rate expend_per_student math_pass_rate science_pass_rate
## Min. :30.40 Min. :3396 Min. : 1.90 Min. : 7.20
## 1st Qu.:33.80 1st Qu.:3751 1st Qu.:14.00 1st Qu.:36.40
## Median :38.60 Median :4042 Median :18.80 Median :44.60
## Mean :40.63 Mean :4286 Mean :20.31 Mean :44.65
## 3rd Qu.:45.10 3rd Qu.:4594 3rd Qu.:26.10 3rd Qu.:53.60
## Max. :79.50 Max. :6641 Max. :66.70 Max. :85.70
## meap_score school_SES_profile expend_level
## Min. : 4.95 average_SES: 0 Expend Level 1: 3
## 1st Qu.:26.65 high_SES : 0 Expend Level 2:95
## Median :32.30 low_SES :137 Expend Level 3:26
## Mean :32.48 Expend Level 4:10
## 3rd Qu.:38.20 Expend Level 5: 3
## Max. :65.80
# Histograms of the three main variables, each requesting 50 breaks.
# Distribution of lunch program participation.
x <- data.frame_meap$lunch_program_rate
hist(x, breaks = 50, col = "blue", xlab = "Lunch Program",
     main = "Percent of Students in Lunch Program")
# Distribution of the combined MEAP score
# (title spelling corrected from "Perfomance" to "Performance").
x <- data.frame_meap$meap_score
hist(x, breaks = 50, col = "blue", xlab = "MEAP Score",
     main = "School Academic Performance")
# Distribution of expenditure per student.
x <- data.frame_meap$expend_per_student
hist(x, breaks = 50, col = "blue", xlab = "Expenditure Per Student",
     main = "Expenditure Per Student")
library(ggplot2)
# Histogram of expenditure per student in bins of width 1000, with each bar
# filled by school SES profile and outlined in gray.
ggplot(
  data.frame_meap,
  aes(x = expend_per_student, fill = school_SES_profile)
) +
  geom_histogram(binwidth = 1000, color = "Gray") +
  labs(x = "Expenditure Per Student", y = "Frequency")
# Scatter plot of MEAP score against lunch program participation.
with(
  data.frame_meap,
  plot(
    lunch_program_rate, meap_score,
    main = "School Academic Performance (MEAP Score)",
    xlab = "Percent of Student in Lunch Program",
    ylab = "MEAP Score",
    pch = 19
  )
)
# Overlay a LOWESS (Locally Weighted Scatterplot Smoothing) curve in blue.
with(
  data.frame_meap,
  lines(lowess(lunch_program_rate, meap_score), col = "blue")
)
library(ggplot2)
# MEAP score against lunch program participation, with points coloured by
# expenditure level.
ggplot(
  data.frame_meap,
  aes(x = lunch_program_rate, y = meap_score, colour = expend_level)
) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    x = "Percent of Student in Lunch Program",
    y = "Meap Score",
    color = "Expenditure Levels"
  )
# Order the SES profile levels from low to high so the boxes appear in
# that order on the plot.
ses_level_order <- c("low_SES", "average_SES", "high_SES")
data.frame_meap$school_SES_profile <- factor(
  data.frame_meap$school_SES_profile,
  levels = ses_level_order
)
# Box plot of MEAP score for each SES profile.
boxplot(
  meap_score ~ school_SES_profile,
  data = data.frame_meap,
  main = "School Academic Performance (MEAP Score)",
  xlab = "School SES Profile",
  ylab = "MEAP Score"
)
# Scatter plot of MEAP score against expenditure per student.
with(
  data.frame_meap,
  plot(
    expend_per_student, meap_score,
    main = "School Academic Performance (MEAP Score)",
    xlab = "Expenditure per Student",
    ylab = "MEAP Score",
    pch = 19
  )
)
# Overlay a LOWESS (Locally Weighted Scatterplot Smoothing) curve in blue.
with(
  data.frame_meap,
  lines(lowess(expend_per_student, meap_score), col = "blue")
)
# Order the SES profile levels from low to high so the boxes appear in
# that order on the plot.
ses_level_order <- c("low_SES", "average_SES", "high_SES")
data.frame_meap$school_SES_profile <- factor(
  data.frame_meap$school_SES_profile,
  levels = ses_level_order
)
# Box plot of expenditure per student for each SES profile.
boxplot(
  expend_per_student ~ school_SES_profile,
  data = data.frame_meap,
  main = "Expenditure Per Student",
  xlab = "School SES Profile",
  ylab = "Expenditure Per Student"
)
library(ggplot2)
# MEAP score against expenditure per student, with points coloured by
# school SES profile.
ggplot(
  data.frame_meap,
  aes(x = expend_per_student, y = meap_score, colour = school_SES_profile)
) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    x = "Expenditure Per Student",
    y = "Meap Score",
    color = "School SES Profile"
  )