#libraries
library(dplyr)
library(rjson)
library(janitor)
library(sqldf)
library(stringr)

Research question

Can education and wages be used as predictors for home values?

There are many factors theorized to influence a home’s value and for this project we’ll attempt to answer if two popular topics, education and wages, can be used as predictors for a third popular topic, home values.

Features
Education: categories of high school dropout, high school diploma, some college, four year degree
Wages: average annual wage
Geography: county, state, and census regions

Predict
Home Values: average home value for each county

About the Data:

Data is collected from several sources listed below:

Wages: Quarterly Census of Employment and Wages
The Quarterly Census of Employment and Wages (QCEW) program publishes a quarterly count of employment and wages reported by employers covering more than 95 percent of U.S. jobs, available at the county, MSA, state and national levels by industry.

Education: Local Education Agency
Revenues and expenditures are audited after the close of the fiscal year and are then submitted to NCES by each state education agency.

Home Values: Zillow
Zillow Home Value Index (ZHVI) is a smoothed, seasonally adjusted measure of the typical home value and market changes across a given region and housing type. Zillow publishes top-tier ZHVI (typical value for homes within the 65th to 95th percentile range for a given region) and bottom-tier ZHVI (typical value for homes that fall within the 5th to 35th percentile range for a given region).

State Information:
A json file of state abbreviation, state full name, and census region each state is in.

Data Cleaning and Transformations:

In this section we’ll load, clean, and transform the state, education, wage, and housing data. For each load the features of interest will be selected, rows filtered as needed, and the results written to clean_education, clean_wages, and clean_homes csv files.

Finally, a fourth csv called “master” is created that is a merge of all three files using state and county as the key fields. This file will serve as the input file for the Visualization and Analysis notebooks.

States

#load state data
json_data <- fromJSON(file = 'https://raw.githubusercontent.com/krpopkin/DATA607-Class-Repository/master/Final_Project/Data/state_mapping.json')
state_mapping <- as.data.frame(json_data)
state_mapping <- state_mapping %>% `colnames<-`(c('state_short', 'state_long', 'region'))
head(state_mapping, 1)
##   state_short state_long region
## 1          AL    Alabama  South

Education data

# load education data
education <- read.csv('https://raw.githubusercontent.com/krpopkin/DATA607-Class-Repository/master/Final_Project/Data/Education(Graduation_Rates).csv')

cat('Dimension of raw education data', dim(education),"\n")
## Dimension of raw education data 3283 47
#identify columns of interest and rename
education <- clean_names(education)
education <- select(education, 
                    state, 
                    county = area_name,
                    dropout = less_than_a_high_school_diploma_2014_18,
                    hs_diploma = high_school_diploma_only_2014_18,
                    some_college = some_college_or_associate_s_degree_2014_18,
                    four_year_degree = bachelor_s_degree_or_higher_2014_18,
                    dropout_percent = percent_of_adults_with_less_than_a_high_school_diploma_2014_18,
                    hs_diploma_percent = percent_of_adults_with_a_high_school_diploma_only_2014_18,
                    some_college_percent = percent_of_adults_completing_some_college_or_associate_s_degree_2014_18,
                    four_year_degree_percent = percent_of_adults_with_a_bachelor_s_degree_or_higher_2014_18)

cat('Dimension of selected education data', dim(education), "\n")
## Dimension of selected education data 3283 10
#remove rows that are missing education data
education <- subset(education, dropout != '')

#add the full state name and census region to the education dataframe
education <- sqldf('SELECT * 
                    FROM education
                    JOIN state_mapping ON education.state = state_mapping.state_short')

#Region is characters, so add a Region number column as a categorical variable
education$region_num = as.factor(ifelse(education$region == 'Northeast',1,ifelse(education$region == 'South',2,ifelse(education$region == 'Midwest',3,ifelse(education$region == 'West',4,5)))))

cat('Dimension of education data after cleaning and filtering', dim(education))
## Dimension of education data after cleaning and filtering 3272 14
head(education,1)
##   state  county dropout hs_diploma some_college four_year_degree
## 1    AL Alabama 470,043  1,020,172      987,148          822,595
##   dropout_percent hs_diploma_percent some_college_percent
## 1            14.2               30.9                 29.9
##   four_year_degree_percent state_short state_long region region_num
## 1                     24.9          AL    Alabama  South          2
path_out = 'C:\\Users\\user\\Documents\\00_Applications_DataScience\\CUNY\\DATA606\\Data606_Final_Project_Repository\\01Data\\'
write.csv(education, paste(path_out, 'clean_education.csv'), row.names = FALSE)

Wages data

# load wages data
wages <- read.csv('https://raw.githubusercontent.com/krpopkin/DATA607-Class-Repository/master/Final_Project/Data/allhlcn18.csv')

cat('Dimension of raw wage data:', dim(wages), '\n')
## Dimension of raw wage data: 62889 20
#filter to columns and rows of interest
wages <- wages %>%
  select(type = Area.Type,
         state = St.Name, 
         county = Area, 
         ownership = Ownership,
         resident_count = Annual.Average.Establishment.Count,
         avg_annual_pay = Annual.Average.Pay) %>%
  subset(type == 'County' & ownership == 'Total Covered') %>%
  select(-type)

#Remove the state name from the data in the county column,ie "Bibb County, Alabama" becomes "Bibb County"
wages$position <- (regexpr(pattern =',', wages$county)) - 1
wages$county <- str_sub(wages$county, end = wages$position)

#Convert wages from fctr to double
wages$avg_annual_pay = as.numeric(gsub("\\,", "", wages$avg_annual_pay))
wages$avg_annual_pay = as.numeric(as.character(wages$avg_annual_pay))

#Convert resident_count from fctr to double
wages$resident_count = as.numeric(gsub("\\,", "", wages$resident_count))
wages$resident_count = as.numeric(as.character(wages$resident_count))

cat('Dimension of wages data after filtering', dim(wages))
## Dimension of wages data after filtering 3191 6
head(wages,1)
##      state         county     ownership resident_count avg_annual_pay position
## 36 Alabama Autauga County Total Covered            871       38407.01       14
write.csv(wages, paste(path_out, 'clean_wages.csv'), row.names = FALSE)

Home Values data

#load home values data
homes <- read.csv('https://raw.githubusercontent.com/krpopkin/DATA607-Class-Repository/master/Final_Project/Data/County_Zhvi_AllHomes(Home_Values).csv')

homes <- clean_names(homes)

cat('Dimension of homes data:', dim(homes), '\n')
## Dimension of homes data: 2840 293
homes <-  homes %>%
  select(county = region_name, state = state, metro = metro,
         jan = x2018_01, feb = x2018_02, mar = x2018_03, apr = x2018_04, may = x2018_05, jun = x2018_06,
         jul = x2018_07, aug = x2018_08, sep = x2018_09, oct = x2018_10, nov = x2018_11, dec = x2018_12)

homes$avg_home_value = (homes$jan + homes$feb + homes$mar + homes$apr + homes$may + homes$jun + homes$jul + homes$aug + homes$sep + homes$oct + homes$nov + homes$dec)/12

homes <- homes %>%
  select(-c(jan, feb, mar, apr, may, jun, jul, aug, sep, oct, nov, dec))

cat('Dimension of selected homes data:', dim(homes))
## Dimension of selected homes data: 2840 4
head(homes,1)
##               county state                          metro avg_home_value
## 1 Los Angeles County    CA Los Angeles-Long Beach-Anaheim       623588.4
write.csv(homes, paste(path_out, 'clean_homes.csv'), row.names = FALSE)

Join files
Now we’ll join the education, wages, and home values dataframes using state and county as the keys. This will give us a master data file to use for the rest of the project.

#Join education and wages using sql
master <- sqldf('SELECT * 
                    FROM education
                    INNER JOIN wages ON (education.county = wages.county AND education.state_long =
                    wages.state)')

master <- clean_names(master)
cat('Master data size after joining education and wages is', dim(master), '\n')
## Master data size after joining education and wages is 3087 20
#Join master with homes using merge
master <- merge(master, homes)

#Clean the master by removing unneeded columns
master <- master %>%
  select(-c(state_short, state_long, state_2, county_2, ownership, position))

cat('Master data with education, wages, and homes data is', dim(master))
## Master data with education, wages, and homes data is 2745 16
head(master,1)
##   state                       county dropout hs_diploma some_college
## 1    AK Fairbanks North Star Borough   3,381     13,785       24,520
##   four_year_degree dropout_percent hs_diploma_percent some_college_percent
## 1           20,152             5.5               22.3                 39.7
##   four_year_degree_percent region region_num resident_count avg_annual_pay
## 1                     32.6   West          4           2275       53107.01
##       metro avg_home_value
## 1 Fairbanks       244178.1
write.csv(master, paste(path_out, 'master.csv'), row.names = FALSE)

This is the end of Data Cleaning and Transformation. To continue, click here for Visualizations.