#libraries
library(dplyr)
library(janitor)
library(sqldf)
library(stringr)
library(psych)
library(ggplot2)
In this section we’ll load education, wage, and housing data. For each load the features of interest will be selected, rows filtered as needed, and the results written to clean_education, clean_wages, and clean_homes csv files. A fourth csv called “master” is also created that is a merge of all three files using state and county as the key fields.
Education data
# load education data
education <- read.csv('https://raw.githubusercontent.com/krpopkin/Data606_Final_Project_Repository/master/01Data/Education(Graduation_Rates).csv')
cat('Dimension of raw education data', dim(education),"\n")
## Dimension of raw education data 3283 47
#identify columns of interest and rename
education <- clean_names(education)
education <- select(education,
state,
county = area_name,
dropout = less_than_a_high_school_diploma_2014_18,
hs_diploma = high_school_diploma_only_2014_18,
some_college = some_college_or_associate_s_degree_2014_18,
four_year_degree = bachelor_s_degree_or_higher_2014_18,
dropout_percent = percent_of_adults_with_less_than_a_high_school_diploma_2014_18,
hs_diploma_percent = percent_of_adults_with_a_high_school_diploma_only_2014_18,
some_college_percent = percent_of_adults_completing_some_college_or_associate_s_degree_2014_18,
four_year_degree_percent = percent_of_adults_with_a_bachelor_s_degree_or_higher_2014_18)
cat('Dimension of selected education data', dim(education), "\n")
## Dimension of selected education data 3283 10
#remove rows that are missing education data
education <- subset(education, dropout != '')
#add the full state name to the education dataframe
state_mapping <- read.csv('https://raw.githubusercontent.com/krpopkin/Data606_Final_Project_Repository/master/01Data/state_mapping.csv')
education <- sqldf('SELECT *
FROM education
JOIN state_mapping ON education.state = state_mapping.state_short')
cat('Dimension of education data after cleaning and filtering', dim(education))
## Dimension of education data after cleaning and filtering 3272 12
head(education,1)
## state county dropout hs_diploma some_college four_year_degree
## 1 AL Alabama 470,043 1,020,172 987,148 822,595
## dropout_percent hs_diploma_percent some_college_percent
## 1 14.2 30.9 29.9
## four_year_degree_percent state_short state_long
## 1 24.9 AL Alabama
path_out = 'C:\\Users\\user\\Documents\\00_Applications_DataScience\\CUNY\\DATA606\\Data606_Final_Project_Repository\\01Data\\'
write.csv(education, paste(path_out, 'clean_education.csv'), row.names = FALSE)
Wages data
# load wages data
wages <- read.csv('https://raw.githubusercontent.com/krpopkin/Data606_Final_Project_Repository/master/01Data/allhlcn18.csv')
cat('Dimension of raw wage data:', dim(wages), '\n')
## Dimension of raw wage data: 62889 20
#filter to columns and rows of interest
wages <- wages %>%
select(type = Area.Type,
state = St.Name,
county = Area,
ownership = Ownership,
resident_count = Annual.Average.Establishment.Count,
avg_annual_pay = Annual.Average.Pay) %>%
subset(type == 'County' & ownership == 'Total Covered') %>%
select(-type)
#Remove the state name from the data in the county column,ie "Bibb County, Alabama" becomes "Bibb County"
wages$position <- (regexpr(pattern =',', wages$county)) - 1
wages$county <- str_sub(wages$county, end = wages$position)
#Convert from fctr to double
wages$avg_annual_pay = as.numeric(gsub("\\,", "", wages$avg_annual_pay))
wages$avg_annual_pay = as.numeric(as.character(wages$avg_annual_pay))
cat('Dimension of wages data after filtering', dim(wages))
## Dimension of wages data after filtering 3191 6
head(wages,1)
## state county ownership resident_count avg_annual_pay position
## 36 Alabama Autauga County Total Covered 871 38407 14
write.csv(wages, paste(path_out, 'clean_wages.csv'), row.names = FALSE)
Home Values data
#load home values data
homes <- read.csv('https://raw.githubusercontent.com/krpopkin/Data606_Final_Project_Repository/master/01Data/County_Zhvi_AllHomes(Home_Values).csv')
homes <- clean_names(homes)
cat('Dimension of homes data:', dim(homes), '\n')
## Dimension of homes data: 2840 293
homes <- homes %>%
select(county = region_name, state = state, metro = metro,
jan = x2018_01, feb = x2018_02, mar = x2018_03, apr = x2018_04, may = x2018_05, jun = x2018_06,
jul = x2018_07, aug = x2018_08, sep = x2018_09, oct = x2018_10, nov = x2018_11, dec = x2018_12)
homes$avg_home_value = (homes$jan + homes$feb + homes$mar + homes$apr + homes$may + homes$jun + homes$jul + homes$aug + homes$sep + homes$oct + homes$nov + homes$dec)/12
homes <- homes %>%
select(-c(jan, feb, mar, apr, may, jun, jul, aug, sep, oct, nov, dec))
cat('Dimension of selected homes data:', dim(homes))
## Dimension of selected homes data: 2840 4
head(homes,1)
## county state metro avg_home_value
## 1 Los Angeles County CA Los Angeles-Long Beach-Anaheim 623588.4
write.csv(homes, paste(path_out, 'clean_homes.csv'), row.names = FALSE)
Join files
Now we’ll join the education, wages, and home values dataframes using state and county as the keys. This will give us a master data file to use when we start the project.
#Join education and wages using sql
master <- sqldf('SELECT *
FROM education
INNER JOIN wages ON (education.county = wages.county AND education.state_long =
wages.state)')
master <- clean_names(master)
cat('Master data size after joining education and wages is', dim(master), '\n')
## Master data size after joining education and wages is 3087 18
#Join master with homes using merge
master <- merge(master, homes)
#Clean the master by removing unneeded columns
master <- master %>%
select(-c(state_short, state_long, state_2, county_2, ownership, position))
cat('Master data with education, wages, and homes data is', dim(master))
## Master data with education, wages, and homes data is 2745 14
head(master,1)
## state county dropout hs_diploma some_college
## 1 AK Fairbanks North Star Borough 3,381 13,785 24,520
## four_year_degree dropout_percent hs_diploma_percent some_college_percent
## 1 20,152 5.5 22.3 39.7
## four_year_degree_percent resident_count avg_annual_pay metro
## 1 32.6 2,275 53107 Fairbanks
## avg_home_value
## 1 244178.1
write.csv(master, paste(path_out, 'master.csv'), row.names = FALSE)
Can education and wages be used as predictors for home values?
There are many factors theorized to influence a home’s value and for this project we’ll attempt to answer if two popular topics, education and wages, can be used as predictors for a third popular topic, home values.
Cases are counties in the United States for this project and there are 2,745 that we’ll use to answer the research question.
Data is collected from several sources listed below:
Wages: Quarterly Census of Employment and Wages
The Quarterly Census of Employment and Wages (QCEW) program publishes a quarterly count of employment and wages reported by employers covering more than 95 percent of U.S. jobs, available at the county, MSA, state and national levels by industry.
Education: Local Education Agency
Revenues and expenditures are audited after the close of the fiscal year and are then submitted to NCES by each state education agency.
Home Values: Zillow
Zillow Home Value Index (ZHVI) is a smoothed, seasonally adjusted measure of the typical home value and market changes across a given region and housing type. Zillow publishes top-tier ZHVI (typical value for homes within the 65th to 95th percentile range for a given region) and bottom-tier ZHVI (typical value for homes that fall within the 5th to 35th percentile range for a given region).
This is an observational study.
The response variable is home values and is quantitative.
There are two independent variables; wages which is quantitative and education level which is qualitative.
Description of the master data file <>
describe(master)
## vars n mean sd median trimmed
## state* 1 2745 NaN NA NA NaN
## county* 2 2745 NaN NA NA NaN
## dropout* 3 2745 1319.57 796.70 1323.0 1314.32
## hs_diploma* 4 2745 1516.98 852.64 1504.0 1514.10
## some_college* 5 2745 1456.79 842.45 1455.0 1450.73
## four_year_degree* 6 2745 1383.88 842.60 1370.0 1378.21
## dropout_percent 7 2745 13.38 6.04 12.2 12.86
## hs_diploma_percent 8 2745 34.36 7.29 34.6 34.62
## some_college_percent 9 2745 30.56 5.01 30.5 30.54
## four_year_degree_percent 10 2745 21.70 9.54 19.2 20.40
## resident_count* 11 2745 3281.06 1803.78 3556.0 3334.82
## avg_annual_pay 12 2745 41527.10 9278.22 39763.0 40322.15
## metro* 13 2745 263.23 282.75 156.0 230.55
## avg_home_value 14 2745 160108.92 111972.30 130851.3 141978.32
## mad min max range skew kurtosis
## state* NA Inf -Inf -Inf NA NA
## county* NA Inf -Inf -Inf NA NA
## dropout* 1022.99 2.00 2714.0 2712.0 0.03 -1.21
## hs_diploma* 1065.99 2.00 3021.0 3019.0 0.02 -1.17
## some_college* 1065.99 2.00 2970.0 2968.0 0.04 -1.17
## four_year_degree* 1110.47 3.00 2830.0 2827.0 0.04 -1.25
## dropout_percent 5.78 1.40 48.5 47.1 0.90 1.05
## hs_diploma_percent 7.12 8.10 55.6 47.5 -0.32 -0.03
## some_college_percent 5.04 11.40 48.0 36.6 0.02 -0.03
## four_year_degree_percent 7.41 6.90 74.6 67.7 1.39 2.22
## resident_count* 2271.34 3.00 6126.0 6123.0 -0.22 -1.11
## avg_annual_pay 6241.75 0.00 134664.0 134664.0 2.61 14.38
## metro* 229.80 1.00 860.0 859.0 0.63 -1.06
## avg_home_value 62020.49 29574.67 1527482.2 1497907.5 4.18 31.44
## se
## state* NA
## county* NA
## dropout* 15.21
## hs_diploma* 16.27
## some_college* 16.08
## four_year_degree* 16.08
## dropout_percent 0.12
## hs_diploma_percent 0.14
## some_college_percent 0.10
## four_year_degree_percent 0.18
## resident_count* 34.43
## avg_annual_pay 177.09
## metro* 5.40
## avg_home_value 2137.17
Education <>
More analysis of education will be done in the project. Below are histograms of high school graduation rates and four year college graduation rates.
#Percent with high school diploma
ggplot(master, aes(x=hs_diploma_percent)) + geom_histogram()
#Percent with four year degrees
ggplot(master, aes(x=four_year_degree_percent)) + geom_histogram()
Avg annual pay <>
Average annual pay is skewed right. This makes sense as all US states are in this dataset.
ggplot(master, aes(x=avg_annual_pay)) + geom_histogram()
Home values plot <>
Similar to wages, home values in some areas are very high, giving the distribution a right skew
ggplot(master, aes(x=avg_home_value)) + geom_histogram()