#libraries
library(dplyr)
library(rjson)
library(janitor)
library(sqldf)
library(stringr)
Can education and wages be used as predictors for home values?
There are many factors theorized to influence a home’s value and for this project we’ll attempt to answer if two popular topics, education and wages, can be used as predictors for a third popular topic, home values.
Features
Education: categories of high school dropout, high school diploma, some college, four year degree
Wages: average annual wage
Geography: county, state, and census regions
Predict
Home Values: average home value for each county
Data is collected from several sources listed below:
Wages: Quarterly Census of Employment and Wages
The Quarterly Census of Employment and Wages (QCEW) program publishes a quarterly count of employment and wages reported by employers covering more than 95 percent of U.S. jobs, available at the county, MSA, state and national levels by industry.
Education: Local Education Agency
Revenues and expenditures are audited after the close of the fiscal year and are then submitted to NCES by each state education agency.
Home Values: Zillow
Zillow Home Value Index (ZHVI) is a smoothed, seasonally adjusted measure of the typical home value and market changes across a given region and housing type. Zillow publishes top-tier ZHVI (typical value for homes within the 65th to 95th percentile range for a given region) and bottom-tier ZHVI (typical value for homes that fall within the 5th to 35th percentile range for a given region).
State Information:
A json file of state abbreviation, state full name, and census region each state is in.
In this section we’ll load, clean, and transform the state, education, wage, and housing data. For each load the features of interest will be selected, rows filtered as needed, and the results written to clean_education, clean_wages, and clean_homes csv files.
Finally, a fourth csv called “master” is created that is a merge of all three files using state and county as the key fields. This file will serve as the input file for the Visualization and Analysis notebooks.
States
#load state data
json_data <- fromJSON(file = 'https://raw.githubusercontent.com/krpopkin/DATA607-Class-Repository/master/Final_Project/Data/state_mapping.json')
state_mapping <- as.data.frame(json_data)
state_mapping <- state_mapping %>% `colnames<-`(c('state_short', 'state_long', 'region'))
head(state_mapping, 1)
## state_short state_long region
## 1 AL Alabama South
Education data
# load education data
education <- read.csv('https://raw.githubusercontent.com/krpopkin/DATA607-Class-Repository/master/Final_Project/Data/Education(Graduation_Rates).csv')
cat('Dimension of raw education data', dim(education),"\n")
## Dimension of raw education data 3283 47
#identify columns of interest and rename
education <- clean_names(education)
education <- select(education,
state,
county = area_name,
dropout = less_than_a_high_school_diploma_2014_18,
hs_diploma = high_school_diploma_only_2014_18,
some_college = some_college_or_associate_s_degree_2014_18,
four_year_degree = bachelor_s_degree_or_higher_2014_18,
dropout_percent = percent_of_adults_with_less_than_a_high_school_diploma_2014_18,
hs_diploma_percent = percent_of_adults_with_a_high_school_diploma_only_2014_18,
some_college_percent = percent_of_adults_completing_some_college_or_associate_s_degree_2014_18,
four_year_degree_percent = percent_of_adults_with_a_bachelor_s_degree_or_higher_2014_18)
cat('Dimension of selected education data', dim(education), "\n")
## Dimension of selected education data 3283 10
#remove rows that are missing education data
education <- subset(education, dropout != '')
#add the full state name and census region to the education dataframe
education <- sqldf('SELECT *
FROM education
JOIN state_mapping ON education.state = state_mapping.state_short')
#Region is characters, so add a Region number column as a categorical variable
education$region_num = as.factor(ifelse(education$region == 'Northeast',1,ifelse(education$region == 'South',2,ifelse(education$region == 'Midwest',3,ifelse(education$region == 'West',4,5)))))
cat('Dimension of education data after cleaning and filtering', dim(education))
## Dimension of education data after cleaning and filtering 3272 14
head(education,1)
## state county dropout hs_diploma some_college four_year_degree
## 1 AL Alabama 470,043 1,020,172 987,148 822,595
## dropout_percent hs_diploma_percent some_college_percent
## 1 14.2 30.9 29.9
## four_year_degree_percent state_short state_long region region_num
## 1 24.9 AL Alabama South 2
path_out = 'C:\\Users\\user\\Documents\\00_Applications_DataScience\\CUNY\\DATA606\\Data606_Final_Project_Repository\\01Data\\'
write.csv(education, paste(path_out, 'clean_education.csv'), row.names = FALSE)
Wages data
# load wages data
wages <- read.csv('https://raw.githubusercontent.com/krpopkin/DATA607-Class-Repository/master/Final_Project/Data/allhlcn18.csv')
cat('Dimension of raw wage data:', dim(wages), '\n')
## Dimension of raw wage data: 62889 20
#filter to columns and rows of interest
wages <- wages %>%
select(type = Area.Type,
state = St.Name,
county = Area,
ownership = Ownership,
resident_count = Annual.Average.Establishment.Count,
avg_annual_pay = Annual.Average.Pay) %>%
subset(type == 'County' & ownership == 'Total Covered') %>%
select(-type)
#Remove the state name from the data in the county column,ie "Bibb County, Alabama" becomes "Bibb County"
wages$position <- (regexpr(pattern =',', wages$county)) - 1
wages$county <- str_sub(wages$county, end = wages$position)
#Convert wages from fctr to double
wages$avg_annual_pay = as.numeric(gsub("\\,", "", wages$avg_annual_pay))
wages$avg_annual_pay = as.numeric(as.character(wages$avg_annual_pay))
#Convert resident_count from fctr to double
wages$resident_count = as.numeric(gsub("\\,", "", wages$resident_count))
wages$resident_count = as.numeric(as.character(wages$resident_count))
cat('Dimension of wages data after filtering', dim(wages))
## Dimension of wages data after filtering 3191 6
head(wages,1)
## state county ownership resident_count avg_annual_pay position
## 36 Alabama Autauga County Total Covered 871 38407.01 14
write.csv(wages, paste(path_out, 'clean_wages.csv'), row.names = FALSE)
Home Values data
#load home values data
homes <- read.csv('https://raw.githubusercontent.com/krpopkin/DATA607-Class-Repository/master/Final_Project/Data/County_Zhvi_AllHomes(Home_Values).csv')
homes <- clean_names(homes)
cat('Dimension of homes data:', dim(homes), '\n')
## Dimension of homes data: 2840 293
homes <- homes %>%
select(county = region_name, state = state, metro = metro,
jan = x2018_01, feb = x2018_02, mar = x2018_03, apr = x2018_04, may = x2018_05, jun = x2018_06,
jul = x2018_07, aug = x2018_08, sep = x2018_09, oct = x2018_10, nov = x2018_11, dec = x2018_12)
homes$avg_home_value = (homes$jan + homes$feb + homes$mar + homes$apr + homes$may + homes$jun + homes$jul + homes$aug + homes$sep + homes$oct + homes$nov + homes$dec)/12
homes <- homes %>%
select(-c(jan, feb, mar, apr, may, jun, jul, aug, sep, oct, nov, dec))
cat('Dimension of selected homes data:', dim(homes))
## Dimension of selected homes data: 2840 4
head(homes,1)
## county state metro avg_home_value
## 1 Los Angeles County CA Los Angeles-Long Beach-Anaheim 623588.4
write.csv(homes, paste(path_out, 'clean_homes.csv'), row.names = FALSE)
Join files
Now we’ll join the education, wages, and home values dataframes using state and county as the keys. This will give us a master data file to use for the rest of the project.
#Join education and wages using sql
master <- sqldf('SELECT *
FROM education
INNER JOIN wages ON (education.county = wages.county AND education.state_long =
wages.state)')
master <- clean_names(master)
cat('Master data size after joining education and wages is', dim(master), '\n')
## Master data size after joining education and wages is 3087 20
#Join master with homes using merge
master <- merge(master, homes)
#Clean the master by removing unneeded columns
master <- master %>%
select(-c(state_short, state_long, state_2, county_2, ownership, position))
cat('Master data with education, wages, and homes data is', dim(master))
## Master data with education, wages, and homes data is 2745 16
head(master,1)
## state county dropout hs_diploma some_college
## 1 AK Fairbanks North Star Borough 3,381 13,785 24,520
## four_year_degree dropout_percent hs_diploma_percent some_college_percent
## 1 20,152 5.5 22.3 39.7
## four_year_degree_percent region region_num resident_count avg_annual_pay
## 1 32.6 West 4 2275 53107.01
## metro avg_home_value
## 1 Fairbanks 244178.1
write.csv(master, paste(path_out, 'master.csv'), row.names = FALSE)
This is the end of Data Cleaning and Transformation. To continue, click here for Visualizations.