This is a project I’m currently working on. The idea of my project is to find how proficiency in math and in reading affect high school dropout rates amongst high school students in the state of Iowa in 2015.
! Important: Rpubs doesn’t support external links. To access links below: Right click and open in new tab. Thanks!
The original datasets, which I will refer to as DS1 and DS2, are avaiable at data.iowa.gov. A preview of each dataset is available here:
DataSet1 DataSet2
The objective of this writeup is to demonstrate proficiency in
The writeup has two parts. Both listed below. I assume that you have previewed both datasets, DS1 and DS2, using the links above.
Part 1: First create and then plot the dataset location_ds, which includes the variables:
Part 2: Create the dataset analysis_ds, which includes the variables:
suppressMessages(library(dplyr))
library(tidyr)
library(stringr)
library(rbokeh)
# Reads in datasets DS1, DS2
DS1 <- read.csv(file = "~/code/projects/data_files/iowa_project/proficiency.csv")
DS2 <- read.csv(file = "~/code/projects/data_files/iowa_project/dropout.csv")
# Takes a look at the first 5 rows of DS1
head(DS1, 5)
## School.Year Topic Grade District District.Name Proficient Total
## 1 2003 Reading 11 1080 Central 40 59
## 2 2004 Reading 11 977 Cardinal 31 44
## 3 2013 Reading 7 3195 Jefferson-Scranton 62 76
## 4 2005 Reading 11 6462 Tri-County 19 25
## 5 2013 Math 8 1917 Boyer Valley 19 29
## X..Proficient Proficient.Category
## 1 67.8 60.1 - 70%
## 2 70.5 70.1 - 80%
## 3 81.6 80.1 - 90%
## 4 76.0 70.1 - 80%
## 5 65.5 60.1 - 70%
## District.Office.Location
## 1 400 1st Street NW\nElkader, Iowa 52043\n(42.8562016460005, -91.40768488699968)
## 2 4045 Ashland Rd\nEldon, Iowa 52554\n(40.96044807700048, -92.2374239729997)
## 3 204 W Madison Ave\nJefferson, Iowa 50129\n(42.01231774900049, -94.37712027299966)
## 4 3003 Highway 22\nThornburg, Iowa 50255\n(41.45220459400048, -92.33400292899967)
## 5 1102 Iowa Ave\nDunlap, Iowa 51529\n(41.854313410000486, -95.5948701319997)
# Explores dimensions of DS1
# 95,590 rows, 10 columns
dim(DS1)
## [1] 59590 10
# Simplifies names of columns
names(DS1) <- c("school_year", "topic", "grade", "district",
"district_name", "proficient", "total", "percent_proficient",
"proficient_category", "location")
# Keeps only columns of interest. Saves new dataset under the name "location_ds"
# Uses the pipe method in the package dplyr. Similar to the pipe command in UNIX
location_ds <- DS1 %>%
select(district, district_name, grade, total, location)
# Takes a look at the first 5 rows of DS1
head(location_ds, 5)
## district district_name grade total
## 1 1080 Central 11 59
## 2 977 Cardinal 11 44
## 3 3195 Jefferson-Scranton 7 76
## 4 6462 Tri-County 11 25
## 5 1917 Boyer Valley 8 29
## location
## 1 400 1st Street NW\nElkader, Iowa 52043\n(42.8562016460005, -91.40768488699968)
## 2 4045 Ashland Rd\nEldon, Iowa 52554\n(40.96044807700048, -92.2374239729997)
## 3 204 W Madison Ave\nJefferson, Iowa 50129\n(42.01231774900049, -94.37712027299966)
## 4 3003 Highway 22\nThornburg, Iowa 50255\n(41.45220459400048, -92.33400292899967)
## 5 1102 Iowa Ave\nDunlap, Iowa 51529\n(41.854313410000486, -95.5948701319997)
# I'm interested only in high school students. Grades available in DS1
# 11 7 8 6 3 5 4 10
unique(location_ds$grade)
## [1] 11 7 8 6 3 5 4 10
# Gets us a count of rows per grade
# 3 4 5 6 7 8 10 11
# 7744 9912 7668 7508 7339 9428 628 9363
table(location_ds$grade)
##
## 3 4 5 6 7 8 10 11
## 7744 9912 7668 7508 7339 9428 628 9363
# While I'm interested in dropout rates amongst high school students. There arent' enough observations with grade == 10. So we keep only the observations with grade == 11
location_ds <- location_ds %>% filter(grade %in% c(11))
# Since we'll later use location_ds to plot the districts on an interactive map, we'll extract the latitude and longitude values from the variable "location"
# Removes all characters before the left parenthesis in the variable location
location_ds$location <- gsub(".*\\(", "", location_ds$location)
# Removes the right parenthesis
location_ds$location <- str_sub(location_ds$location, 1, -2)
# We create new variables "lat" and "lon" to later store the new values
location_ds$lat <- 0
location_ds$lon <- 0
# Then we assign the substring before "," as latitude, and the substring after "," as the longitude for each row in the dataset "location_ds"
for (i in 1:nrow(location_ds)){
location_ds$lat[i] <- str_split_fixed(location_ds$location[i], ", ", n = 2)[1]
location_ds$lon[i] <- str_split_fixed(location_ds$location[i], ", ", n = 2)[2]
}
# Converts lat and lon from string format to numeric format
location_ds$lat <- as.numeric(location_ds$lat)
location_ds$lon <- as.numeric(location_ds$lon)
# Rounds each newly created variable to six digits after the decimal point
location_ds$lat <- round(location_ds$lat, 6)
location_ds$lon <- round(location_ds$lon, 7)
# Drops the "location" and "grade" variables
location_ds <- location_ds %>% select(-location, -grade)
# Finds the number of unique rows
length(unique(location_ds$district))
## [1] 359
# Updates the dataset location_ds
# Keeps only the unique rows
location_ds <- location_ds %>% distinct(district)
# Makes sure we only have 359 rows
nrow(location_ds)
## [1] 359
# Eventually I want to plot each of the districts on a map. And I want the size of the student body to be reflected on the map. I will use the variable "total" to create a categorical variable "size". I'll then use "size" to determine the size of the dot representing each district on the map
# Gets quantiles
quantiles <- quantile(location_ds$total, c(.25, .5, .75, .95), na.rm = TRUE)
# 25% 50% 75% 95%
# 33.0 52.0 94.0 304.3
# Creates variable "size"
location_ds$size <- 0
# Because some rows have NA's as entries for total, we'll convert those to zeros. Districts with <= 36 students in the 11th grade get the smallest size. Districts with >36 and <= 54 get the next size up. And we continue in that manner.
# The variable size has 5 levels
for (i in 1:nrow(location_ds)){
if (is.na(location_ds$total[i]) == TRUE){
location_ds$total[i] = 0
}
location_ds$size[i] <- 1*(location_ds$total[i] >= 0 && location_ds$total[i] <= quantiles[1]) +
2*(location_ds$total[i] > quantiles[1] && location_ds$total[i] <= quantiles[2]) +
3*(location_ds$total[i] > quantiles[2] && location_ds$total[i] <= quantiles[3]) +
4*(location_ds$total[i] > quantiles[3] && location_ds$total[i] <= quantiles[4]) +
5*(location_ds$total[i] > quantiles[4])
}
# Let's take a look at the result:
head(location_ds)
## district district_name total lat lon size
## 1 1080 Central 59 42.85620 -91.40768 3
## 2 977 Cardinal 44 40.96045 -92.23742 2
## 3 6462 Tri-County 25 41.45220 -92.33400 1
## 4 1989 Edgewood-Colesburg 51 42.64446 -91.40485 2
## 5 6219 Storm Lake 140 42.64208 -95.20082 4
## 6 6165 Stanton 24 40.98418 -95.10040 1
# Because in the future I will use this new dataset "location_ds", I create a .csv file with that name.
# Creates a new .csv file
write.csv(x = location_ds, file = "~/code/projects/data_files/iowa_project/location_ds.csv")
# Plots all districts on interactive map
neon <- '[{"stylers":[{"saturation":100},{"gamma":0.6}]}]'
gmap(lat = 41.8505, lng = -93.6091, zoom = 7,
width = 900, height = 650, title = "School Districts",
api_key = "AIzaSyADOlm_F8Ui0FIvTh5nJoyLNTtnMAN4HSI",
map_type = "roadmap", map_style = neon) %>%
ly_points(lon, lat, data = location_ds, alpha = 0.5,
size = 3*size, hover = c(district_name), color="blue")
# Keeps only columns of interest from DS1
# Saves new dataset under the name "analysis_ds"
analysis_ds <- DS1 %>%
select(school_year, topic, district, grade, proficient, total) %>%
filter(grade == 11, school_year == 2015)
# There 338 unique school districts in the 2015 dataset
length(unique(analysis_ds$district))
## [1] 338
# Arranges dataset by district number
analysis_ds <- analysis_ds %>% arrange(district)
# Although I'm commenting out the code below because the output is too big, there are four
# observations per district number when there should only be two, one for each topic (math and reading)
#
# CODE:
#table(analysis_ds$district)
# We're going to delete the repeats
# the function nth.delete removes the nth row of a dataset
nth.delete <- function(dataframe, n){
dataframe[-(seq(n,to=nrow(dataframe),by=n)),]
}
# First we remove every 4th row, then every 3rd row
analysis_ds <- nth.delete(analysis_ds, 4)
analysis_ds <- nth.delete(analysis_ds, 3)
# let's take a peek. It works!
head(analysis_ds)
## school_year topic district grade proficient total
## 1 2015 Reading 9 11 27 33
## 2 2015 Math 9 11 29 33
## 5 2015 Reading 18 11 18 25
## 6 2015 Math 18 11 22 25
## 9 2015 Reading 27 11 95 105
## 10 2015 Math 27 11 97 105
# There are exactly 50 observations with a value of 0 as total and as proficient
# This means that there are 25 district that either misreported/didn't take the tests,
# or simply doesn't have any 11th graders
# Either way, we'll filter them out
analysis_ds <- analysis_ds %>% filter(total != 0, proficient != 0)
# There are some discrepancies in the total column for some districts
# By some I mean between 10 and 20 schools are off by 1 or 2 students under the
# variable total.
# I believe this is caused by absences.
# So we create a new dataset "district_total" where we have one row per district
# And this row has the max total
district_total <- analysis_ds %>%
select(district, total) %>%
group_by(district) %>%
summarise(total = max(total))
# We dump the variable "total" in "analysis_df"
analysis_ds <- analysis_ds %>% select(-total)
# We also dump district 1728, which only has one proficiency test
# Alternatively, we could have assign it the median value of all other districts
analysis_ds <- analysis_ds %>% filter(district != 1278)
# We then join both datasets with a full join a.k.a. a "union" in set theory
analysis_ds <- full_join(analysis_ds, district_total, by = NULL)
## Joining by: "district"
# let's take a look
head(analysis_ds)
## school_year topic district grade proficient total
## 1 2015 Reading 9 11 27 33
## 2 2015 Math 9 11 29 33
## 3 2015 Reading 18 11 18 25
## 4 2015 Math 18 11 22 25
## 5 2015 Reading 27 11 95 105
## 6 2015 Math 27 11 97 105
# Ideally, we want, one row per district. And we want both, the math and reading scores on the same line
# We achieve that with the method spread
analysis_ds <- spread(analysis_ds, topic, proficient)
# let's take a look
head(analysis_ds)
## school_year district grade total Math Reading NA
## 1 2015 9 11 33 29 27 NA
## 2 2015 18 11 25 22 18 NA
## 3 2015 27 11 105 97 95 NA
## 4 2015 63 11 30 25 28 NA
## 5 2015 81 11 85 73 71 NA
## 6 2015 99 11 48 38 39 NA
# drops last column (full of NA's)
analysis_ds <- analysis_ds[,-7]
# We now proceed to create the proficiency scores in terms of percentages
analysis_ds <- analysis_ds %>%
mutate(math_proficient_percent = Math/total*100, reading_proficient_percent=Reading/total*100)
analysis_ds <- analysis_ds %>% select(district, total, math_proficient_percent, reading_proficient_percent)
analysis_ds <- analysis_ds %>% mutate(math_proficient_percent = round(math_proficient_percent,2), reading_proficient_percent=round(reading_proficient_percent,2))
# Let's take a peek at DS2. Lots of columns. We only want two.
head(DS2)
## District.Number District.Name County AEA Overall.Dropouts
## 1 9 AGWSR 42 7 2
## 2 18 Adair-Casey 39 11 0
## 3 5013 Oskaloosa 62 15 25
## 4 225 Ames 85 11 20
## 5 63 Akron Westfield 75 12 0
## 6 72 Albert City-Truesdale 11 5 ****
## Overall.Enrollments Overall.Rate African.American.Dropouts
## 1 240 0.83 ****
## 2 128 0 ****
## 3 1073 2.33 0
## 4 1889 1.06 4
## 5 235 0 ****
## 6 **** **** ****
## African.American.Enrollments African.American.Rate Hispanic.Dropouts
## 1 **** NA 0
## 2 **** NA ****
## 3 10 0.00 0
## 4 145 2.76 2
## 5 **** NA 0
## 6 **** NA ****
## Hispanic.Enrollments Hispanic.Rate American.Indian.Dropouts
## 1 27 0.0 ****
## 2 **** NA ****
## 3 29 0.0 ****
## 4 133 1.5 ****
## 5 17 0.0 ****
## 6 **** NA ****
## American.Indian.Enrollments American.Indian.Rate Asian.Dropouts
## 1 **** NA ****
## 2 **** NA ****
## 3 **** NA 0
## 4 **** NA 0
## 5 **** NA ****
## 6 **** NA ****
## Asian.Enrollments Asian.Rate Hawaiian.Pacific.Islander.Dropouts
## 1 **** NA ****
## 2 **** NA ****
## 3 23 0 ****
## 4 157 0 ****
## 5 **** NA ****
## 6 **** NA ****
## Hawaiian.Pacific.Islander.Enrollments Hawaiian.Pacific.Islander.Rate
## 1 **** NA
## 2 **** NA
## 3 **** NA
## 4 **** NA
## 5 **** NA
## 6 **** NA
## White.Dropouts White.Enrollments White.Rate Two.or.More.Races.Dropouts
## 1 0 206 0 ****
## 2 0 126 0 ****
## 3 24 985 2.44 1
## 4 12 1372 0.87 2
## 5 0 203 0 ****
## 6 **** **** **** ****
## Two.or.More.Races.Enrollments Two.or.More.Races.Rate Female.Dropouts
## 1 **** NA 2
## 2 **** NA 0
## 3 20 5.00 13
## 4 76 2.63 6
## 5 **** NA 0
## 6 **** NA ****
## Female.Enrollments Female.Rate Male.Dropouts Male.Enrollments Male.Rate
## 1 116 1.72 0 124 0
## 2 67 0 0 61 0
## 3 507 2.56 12 566 2.12
## 4 929 0.65 14 960 1.46
## 5 105 0 0 130 0
## 6 **** **** **** **** ****
## Grade.7.12.Status
## 1 High School
## 2 High School
## 3 High School
## 4 High School
## 5 High School
## 6 No 7-12
# Filters out districts without high schools
DS2 <- DS2 %>% filter(Grade.7.12.Status == "High School")
# keeps districts and dropout rates
DS2 <- DS2 %>% select(District.Number, Overall.Rate)
# renames variables
names(DS2) <- c("district", "dropout_rate")
DS2$dropout_rate <- as.numeric(as.character(DS2$dropout_rate))
# joins the reduced DS2 dataset with the analysis_ds
# inner join is the equivalent to an intersection in set notation
analysis_ds <- inner_join(analysis_ds, DS2, by = NULL)
## Joining by: "district"
# Final result
head(analysis_ds)
## district total math_proficient_percent reading_proficient_percent
## 1 9 33 87.88 81.82
## 2 18 25 88.00 72.00
## 3 27 105 92.38 90.48
## 4 63 30 83.33 93.33
## 5 81 85 85.88 83.53
## 6 99 48 79.17 81.25
## dropout_rate
## 1 0.83
## 2 0.00
## 3 0.56
## 4 0.00
## 5 0.19
## 6 0.99