This is a project I’m currently working on. The idea of my project is to find how proficiency in math and in reading affect high school dropout rates amongst high school students in the state of Iowa in 2015.

! Important: Rpubs doesn’t support external links. To access links below: Right click and open in new tab. Thanks!

The original datasets, which I will refer to as DS1 and DS2, are avaiable at data.iowa.gov. A preview of each dataset is available here:
DataSet1 DataSet2

The objective of this writeup is to demonstrate proficiency in

  1. reading in files (csv in this case)
  2. manipulating and preparing datasets for analysis and graphics
  3. visualizing data

The writeup has two parts. Both listed below. I assume that you have previewed both datasets, DS1 and DS2, using the links above.

Part 1: First create and then plot the dataset location_ds, which includes the variables:

Part 2: Create the dataset analysis_ds, which includes the variables:

Importing the required packages

suppressMessages(library(dplyr)) 
library(tidyr)
library(stringr)
library(rbokeh)

Part 1: Preparing and plotting the new dataset location_ds from DS1

# Reads in datasets DS1, DS2
DS1 <- read.csv(file = "~/code/projects/data_files/iowa_project/proficiency.csv")
DS2 <- read.csv(file = "~/code/projects/data_files/iowa_project/dropout.csv")
Exploration of DS1
# Takes a look at the first 5 rows of DS1
head(DS1, 5)
##   School.Year   Topic Grade District      District.Name Proficient Total
## 1        2003 Reading    11     1080            Central         40    59
## 2        2004 Reading    11      977           Cardinal         31    44
## 3        2013 Reading     7     3195 Jefferson-Scranton         62    76
## 4        2005 Reading    11     6462         Tri-County         19    25
## 5        2013    Math     8     1917       Boyer Valley         19    29
##   X..Proficient Proficient.Category
## 1          67.8          60.1 - 70%
## 2          70.5          70.1 - 80%
## 3          81.6          80.1 - 90%
## 4          76.0          70.1 - 80%
## 5          65.5          60.1 - 70%
##                                                            District.Office.Location
## 1    400 1st Street NW\nElkader, Iowa 52043\n(42.8562016460005, -91.40768488699968)
## 2        4045 Ashland Rd\nEldon, Iowa 52554\n(40.96044807700048, -92.2374239729997)
## 3 204 W Madison Ave\nJefferson, Iowa 50129\n(42.01231774900049, -94.37712027299966)
## 4   3003 Highway 22\nThornburg, Iowa 50255\n(41.45220459400048, -92.33400292899967)
## 5        1102 Iowa Ave\nDunlap, Iowa 51529\n(41.854313410000486, -95.5948701319997)
# Explores dimensions of DS1
# 95,590 rows, 10 columns
dim(DS1) 
## [1] 59590    10
# Simplifies names of columns
names(DS1) <- c("school_year", "topic", "grade", "district", 
  "district_name", "proficient", "total", "percent_proficient",
  "proficient_category", "location")
Creation of dataset location_ds
# Keeps only columns of interest. Saves new dataset under the name "location_ds"
# Uses the pipe method in the package dplyr. Similar to the pipe command in UNIX
location_ds <- DS1 %>% 
                select(district, district_name, grade, total, location)

# Takes a look at the first 5 rows of DS1
head(location_ds, 5)
##   district      district_name grade total
## 1     1080            Central    11    59
## 2      977           Cardinal    11    44
## 3     3195 Jefferson-Scranton     7    76
## 4     6462         Tri-County    11    25
## 5     1917       Boyer Valley     8    29
##                                                                            location
## 1    400 1st Street NW\nElkader, Iowa 52043\n(42.8562016460005, -91.40768488699968)
## 2        4045 Ashland Rd\nEldon, Iowa 52554\n(40.96044807700048, -92.2374239729997)
## 3 204 W Madison Ave\nJefferson, Iowa 50129\n(42.01231774900049, -94.37712027299966)
## 4   3003 Highway 22\nThornburg, Iowa 50255\n(41.45220459400048, -92.33400292899967)
## 5        1102 Iowa Ave\nDunlap, Iowa 51529\n(41.854313410000486, -95.5948701319997)
# I'm interested only in high school students. Grades available in DS1
# 11  7  8  6  3  5  4 10
unique(location_ds$grade)
## [1] 11  7  8  6  3  5  4 10
# Gets us a count of rows per grade
#    3    4    5    6    7    8   10   11 
# 7744 9912 7668 7508 7339 9428  628 9363 
table(location_ds$grade)
## 
##    3    4    5    6    7    8   10   11 
## 7744 9912 7668 7508 7339 9428  628 9363
# While I'm interested in dropout rates amongst high school students. There arent' enough observations with grade == 10. So we keep only the observations with grade == 11
location_ds <- location_ds %>% filter(grade %in% c(11))
Extracting latitude and longitude from the variable location
# Since we'll later use location_ds to plot the districts on an interactive map, we'll extract the latitude and longitude values from the variable "location"

# Removes all characters before the left parenthesis in the variable location
location_ds$location <- gsub(".*\\(", "", location_ds$location)

# Removes the right parenthesis
location_ds$location <- str_sub(location_ds$location, 1, -2)

# We create new variables "lat" and "lon" to later store the new values
location_ds$lat <- 0
location_ds$lon <- 0

# Then we assign the substring before "," as latitude, and the substring after "," as the longitude for each row in the dataset "location_ds"
for (i in 1:nrow(location_ds)){
  location_ds$lat[i] <- str_split_fixed(location_ds$location[i], ", ", n = 2)[1]
  location_ds$lon[i] <- str_split_fixed(location_ds$location[i], ", ", n = 2)[2]
}

# Converts lat and lon from string format to numeric format
location_ds$lat <- as.numeric(location_ds$lat)
location_ds$lon <- as.numeric(location_ds$lon)

# Rounds each newly created variable to six digits after the decimal point
location_ds$lat <- round(location_ds$lat, 6)
location_ds$lon <- round(location_ds$lon, 7)

# Drops the "location" and "grade" variables
location_ds <-  location_ds %>% select(-location, -grade)
Deleting duplicate rows
# Finds the number of unique rows
length(unique(location_ds$district))
## [1] 359
# Updates the dataset location_ds 
# Keeps only the unique rows
location_ds <- location_ds %>% distinct(district)

# Makes sure we only have 359 rows
nrow(location_ds)
## [1] 359
Creating a categorical variable size from the numerica variable total
# Eventually I want to plot each of the districts on a map. And I want the size of the student body to be reflected on the map. I will use the variable "total" to create a categorical variable "size". I'll then use "size" to determine the size of the dot representing each district on the map

# Gets quantiles
quantiles <- quantile(location_ds$total, c(.25, .5, .75, .95), na.rm = TRUE)
#  25%   50%   75%   95% 
# 33.0  52.0  94.0 304.3 

# Creates variable "size"
location_ds$size <- 0

# Because some rows have NA's as entries for total, we'll convert those to zeros. Districts with <= 36 students in the 11th grade get the smallest size. Districts with >36 and <= 54 get the next size up. And we continue in that manner. 
# The variable size has 5 levels
for (i in 1:nrow(location_ds)){
  if (is.na(location_ds$total[i]) == TRUE){
    location_ds$total[i] = 0
  }
  location_ds$size[i] <- 1*(location_ds$total[i] >= 0 && location_ds$total[i] <= quantiles[1]) +
    2*(location_ds$total[i] > quantiles[1] && location_ds$total[i] <= quantiles[2]) +
    3*(location_ds$total[i] > quantiles[2] && location_ds$total[i] <= quantiles[3]) +
    4*(location_ds$total[i] > quantiles[3] && location_ds$total[i] <= quantiles[4]) +
    5*(location_ds$total[i] > quantiles[4])
}
Resulting dataset
# Let's take a look at the result:
head(location_ds)
##   district      district_name total      lat       lon size
## 1     1080            Central    59 42.85620 -91.40768    3
## 2      977           Cardinal    44 40.96045 -92.23742    2
## 3     6462         Tri-County    25 41.45220 -92.33400    1
## 4     1989 Edgewood-Colesburg    51 42.64446 -91.40485    2
## 5     6219         Storm Lake   140 42.64208 -95.20082    4
## 6     6165            Stanton    24 40.98418 -95.10040    1
Creating a new .csv file with the new dataset
# Because in the future I will use this new dataset "location_ds", I create a .csv file with that name. 
# Creates a new .csv file
write.csv(x = location_ds, file = "~/code/projects/data_files/iowa_project/location_ds.csv")
Plotting the data points on an interactive map
# Plots all districts on interactive map
neon <- '[{"stylers":[{"saturation":100},{"gamma":0.6}]}]'

gmap(lat = 41.8505, lng = -93.6091, zoom = 7, 
     width = 900, height = 650, title = "School Districts",
     api_key = "AIzaSyADOlm_F8Ui0FIvTh5nJoyLNTtnMAN4HSI",
     map_type = "roadmap", map_style = neon) %>%
  ly_points(lon, lat, data = location_ds, alpha = 0.5,
            size = 3*size, hover = c(district_name), color="blue")

Part 2: Preparing the new dataset analysis_ds from DS1

Selecting the columns we need from DS1
# Keeps only columns of interest from DS1
# Saves new dataset under the name "analysis_ds"
analysis_ds <- DS1 %>% 
  select(school_year, topic, district, grade, proficient, total) %>%
  filter(grade == 11, school_year == 2015)


# There 338 unique school districts in the 2015 dataset
length(unique(analysis_ds$district)) 
## [1] 338
# Arranges dataset by district number
analysis_ds <- analysis_ds %>% arrange(district)

# Although I'm commenting out the code below because the output is too big, there are four
# observations per district number when there should only be two, one for each topic (math and reading)
#
# CODE:
#table(analysis_ds$district)
Deleting redundant rows
# We're going to delete the repeats

# the function nth.delete removes the nth row of a dataset
nth.delete <- function(dataframe, n){
  dataframe[-(seq(n,to=nrow(dataframe),by=n)),]
}

# First we remove every 4th row, then every 3rd row 
analysis_ds <- nth.delete(analysis_ds, 4)
analysis_ds <- nth.delete(analysis_ds, 3)

# let's take a peek. It works!
head(analysis_ds)
##    school_year   topic district grade proficient total
## 1         2015 Reading        9    11         27    33
## 2         2015    Math        9    11         29    33
## 5         2015 Reading       18    11         18    25
## 6         2015    Math       18    11         22    25
## 9         2015 Reading       27    11         95   105
## 10        2015    Math       27    11         97   105
Removing rows without proficient scores
# There are exactly 50 observations with a value of 0 as total and as proficient
# This means that there are 25 district that either misreported/didn't take the tests,
# or simply doesn't have any 11th graders
# Either way, we'll filter them out
analysis_ds <- analysis_ds %>% filter(total != 0, proficient != 0)
Fixing discrepancy under variable total
# There are some discrepancies in the total column for some districts
# By some I mean between 10 and 20 schools are off by 1 or 2 students under the
# variable total.
# I believe this is caused by absences.

# So we create a new dataset "district_total" where we have one row per district
# And this row has the max total
district_total <- analysis_ds %>% 
  select(district, total) %>%
  group_by(district) %>%
  summarise(total = max(total))

# We dump the variable "total" in "analysis_df"
analysis_ds <- analysis_ds %>% select(-total) 

# We also dump district 1728, which only has one proficiency test
# Alternatively, we could have assign it the median value of all other districts
analysis_ds <- analysis_ds %>% filter(district != 1278)

# We then join both datasets with a full join a.k.a. a "union" in set theory
analysis_ds <- full_join(analysis_ds, district_total, by = NULL)
## Joining by: "district"
# let's take a look
head(analysis_ds)
##   school_year   topic district grade proficient total
## 1        2015 Reading        9    11         27    33
## 2        2015    Math        9    11         29    33
## 3        2015 Reading       18    11         18    25
## 4        2015    Math       18    11         22    25
## 5        2015 Reading       27    11         95   105
## 6        2015    Math       27    11         97   105
# Ideally, we want, one row per district. And we want both, the math and reading scores on the same line
# We achieve that with the method spread
analysis_ds <- spread(analysis_ds, topic, proficient)

# let's take a look
head(analysis_ds)
##   school_year district grade total Math Reading NA
## 1        2015        9    11    33   29      27 NA
## 2        2015       18    11    25   22      18 NA
## 3        2015       27    11   105   97      95 NA
## 4        2015       63    11    30   25      28 NA
## 5        2015       81    11    85   73      71 NA
## 6        2015       99    11    48   38      39 NA
# drops last column (full of NA's)
analysis_ds <- analysis_ds[,-7]
Converting proficiency scores to percentages
# We now proceed to create the proficiency scores in terms of percentages
analysis_ds <- analysis_ds %>% 
  mutate(math_proficient_percent = Math/total*100, reading_proficient_percent=Reading/total*100)

analysis_ds <- analysis_ds %>% select(district, total, math_proficient_percent, reading_proficient_percent)

analysis_ds <- analysis_ds %>% mutate(math_proficient_percent = round(math_proficient_percent,2), reading_proficient_percent=round(reading_proficient_percent,2))
Select the columns we need from DS2
# Let's take a peek at DS2. Lots of columns. We only want two.
head(DS2)
##   District.Number         District.Name County AEA Overall.Dropouts
## 1               9                 AGWSR     42   7                2
## 2              18           Adair-Casey     39  11                0
## 3            5013             Oskaloosa     62  15               25
## 4             225                  Ames     85  11               20
## 5              63       Akron Westfield     75  12                0
## 6              72 Albert City-Truesdale     11   5             ****
##   Overall.Enrollments Overall.Rate African.American.Dropouts
## 1                 240         0.83                      ****
## 2                 128            0                      ****
## 3                1073         2.33                         0
## 4                1889         1.06                         4
## 5                 235            0                      ****
## 6                ****         ****                      ****
##   African.American.Enrollments African.American.Rate Hispanic.Dropouts
## 1                         ****                    NA                 0
## 2                         ****                    NA              ****
## 3                           10                  0.00                 0
## 4                          145                  2.76                 2
## 5                         ****                    NA                 0
## 6                         ****                    NA              ****
##   Hispanic.Enrollments Hispanic.Rate American.Indian.Dropouts
## 1                   27           0.0                     ****
## 2                 ****            NA                     ****
## 3                   29           0.0                     ****
## 4                  133           1.5                     ****
## 5                   17           0.0                     ****
## 6                 ****            NA                     ****
##   American.Indian.Enrollments American.Indian.Rate Asian.Dropouts
## 1                        ****                   NA           ****
## 2                        ****                   NA           ****
## 3                        ****                   NA              0
## 4                        ****                   NA              0
## 5                        ****                   NA           ****
## 6                        ****                   NA           ****
##   Asian.Enrollments Asian.Rate Hawaiian.Pacific.Islander.Dropouts
## 1              ****         NA                               ****
## 2              ****         NA                               ****
## 3                23          0                               ****
## 4               157          0                               ****
## 5              ****         NA                               ****
## 6              ****         NA                               ****
##   Hawaiian.Pacific.Islander.Enrollments Hawaiian.Pacific.Islander.Rate
## 1                                  ****                             NA
## 2                                  ****                             NA
## 3                                  ****                             NA
## 4                                  ****                             NA
## 5                                  ****                             NA
## 6                                  ****                             NA
##   White.Dropouts White.Enrollments White.Rate Two.or.More.Races.Dropouts
## 1              0               206          0                       ****
## 2              0               126          0                       ****
## 3             24               985       2.44                          1
## 4             12              1372       0.87                          2
## 5              0               203          0                       ****
## 6           ****              ****       ****                       ****
##   Two.or.More.Races.Enrollments Two.or.More.Races.Rate Female.Dropouts
## 1                          ****                     NA               2
## 2                          ****                     NA               0
## 3                            20                   5.00              13
## 4                            76                   2.63               6
## 5                          ****                     NA               0
## 6                          ****                     NA            ****
##   Female.Enrollments Female.Rate Male.Dropouts Male.Enrollments Male.Rate
## 1                116        1.72             0              124         0
## 2                 67           0             0               61         0
## 3                507        2.56            12              566      2.12
## 4                929        0.65            14              960      1.46
## 5                105           0             0              130         0
## 6               ****        ****          ****             ****      ****
##   Grade.7.12.Status
## 1       High School
## 2       High School
## 3       High School
## 4       High School
## 5       High School
## 6           No 7-12
# Filters out districts without high schools
DS2 <- DS2 %>% filter(Grade.7.12.Status == "High School")

# keeps districts and dropout rates
DS2 <- DS2 %>% select(District.Number, Overall.Rate)
                  
# renames variables                    
names(DS2) <- c("district", "dropout_rate")  

DS2$dropout_rate <- as.numeric(as.character(DS2$dropout_rate))
Resulting dataset by joining both datasets
# joins the reduced DS2 dataset with the analysis_ds
# inner join is the equivalent to an intersection in set notation
analysis_ds <- inner_join(analysis_ds, DS2, by = NULL)
## Joining by: "district"
# Final result
head(analysis_ds)
##   district total math_proficient_percent reading_proficient_percent
## 1        9    33                   87.88                      81.82
## 2       18    25                   88.00                      72.00
## 3       27   105                   92.38                      90.48
## 4       63    30                   83.33                      93.33
## 5       81    85                   85.88                      83.53
## 6       99    48                   79.17                      81.25
##   dropout_rate
## 1         0.83
## 2         0.00
## 3         0.56
## 4         0.00
## 5         0.19
## 6         0.99