#libraries
library(dplyr)
library(janitor)
library(sqldf)
library(stringr)
library(psych)
library(ggplot2)

Data Preparation

In this section we’ll load education, wage, and housing data. For each load the features of interest will be selected, rows filtered as needed, and the results written to clean_education, clean_wages, and clean_homes csv files. A fourth csv called “master” is also created that is a merge of all three files using state and county as the key fields.

Education data

# load education data
education <- read.csv('https://raw.githubusercontent.com/krpopkin/Data606_Final_Project_Repository/master/01Data/Education(Graduation_Rates).csv')

cat('Dimension of raw education data', dim(education),"\n")
## Dimension of raw education data 3283 47
#identify columns of interest and rename
education <- clean_names(education)
education <- select(education, 
                    state, 
                    county = area_name,
                    dropout = less_than_a_high_school_diploma_2014_18,
                    hs_diploma = high_school_diploma_only_2014_18,
                    some_college = some_college_or_associate_s_degree_2014_18,
                    four_year_degree = bachelor_s_degree_or_higher_2014_18,
                    dropout_percent = percent_of_adults_with_less_than_a_high_school_diploma_2014_18,
                    hs_diploma_percent = percent_of_adults_with_a_high_school_diploma_only_2014_18,
                    some_college_percent = percent_of_adults_completing_some_college_or_associate_s_degree_2014_18,
                    four_year_degree_percent = percent_of_adults_with_a_bachelor_s_degree_or_higher_2014_18)

cat('Dimension of selected education data', dim(education), "\n")
## Dimension of selected education data 3283 10
#remove rows that are missing education data
education <- subset(education, dropout != '')

#add the full state name to the education dataframe
state_mapping <- read.csv('https://raw.githubusercontent.com/krpopkin/Data606_Final_Project_Repository/master/01Data/state_mapping.csv')

education <- sqldf('SELECT * 
                    FROM education
                    JOIN state_mapping ON education.state = state_mapping.state_short')

cat('Dimension of education data after cleaning and filtering', dim(education))
## Dimension of education data after cleaning and filtering 3272 12
head(education,1)
##   state  county dropout hs_diploma some_college four_year_degree
## 1    AL Alabama 470,043  1,020,172      987,148          822,595
##   dropout_percent hs_diploma_percent some_college_percent
## 1            14.2               30.9                 29.9
##   four_year_degree_percent state_short state_long
## 1                     24.9          AL    Alabama
path_out = 'C:\\Users\\user\\Documents\\00_Applications_DataScience\\CUNY\\DATA606\\Data606_Final_Project_Repository\\01Data\\'
write.csv(education, paste(path_out, 'clean_education.csv'), row.names = FALSE)

Wages data

# load wages data
wages <- read.csv('https://raw.githubusercontent.com/krpopkin/Data606_Final_Project_Repository/master/01Data/allhlcn18.csv')

cat('Dimension of raw wage data:', dim(wages), '\n')
## Dimension of raw wage data: 62889 20
#filter to columns and rows of interest
wages <- wages %>%
  select(type = Area.Type,
         state = St.Name, 
         county = Area, 
         ownership = Ownership,
         resident_count = Annual.Average.Establishment.Count,
         avg_annual_pay = Annual.Average.Pay) %>%
  subset(type == 'County' & ownership == 'Total Covered') %>%
  select(-type)

#Remove the state name from the data in the county column,ie "Bibb County, Alabama" becomes "Bibb County"
wages$position <- (regexpr(pattern =',', wages$county)) - 1
wages$county <- str_sub(wages$county, end = wages$position)

#Convert from fctr to double
wages$avg_annual_pay = as.numeric(gsub("\\,", "", wages$avg_annual_pay))
wages$avg_annual_pay = as.numeric(as.character(wages$avg_annual_pay))

cat('Dimension of wages data after filtering', dim(wages))
## Dimension of wages data after filtering 3191 6
head(wages,1)
##      state         county     ownership resident_count avg_annual_pay position
## 36 Alabama Autauga County Total Covered            871          38407       14
write.csv(wages, paste(path_out, 'clean_wages.csv'), row.names = FALSE)

Home Values data

#load home values data
homes <- read.csv('https://raw.githubusercontent.com/krpopkin/Data606_Final_Project_Repository/master/01Data/County_Zhvi_AllHomes(Home_Values).csv')

homes <- clean_names(homes)

cat('Dimension of homes data:', dim(homes), '\n')
## Dimension of homes data: 2840 293
homes <-  homes %>%
  select(county = region_name, state = state, metro = metro,
         jan = x2018_01, feb = x2018_02, mar = x2018_03, apr = x2018_04, may = x2018_05, jun = x2018_06,
         jul = x2018_07, aug = x2018_08, sep = x2018_09, oct = x2018_10, nov = x2018_11, dec = x2018_12)

homes$avg_home_value = (homes$jan + homes$feb + homes$mar + homes$apr + homes$may + homes$jun + homes$jul + homes$aug + homes$sep + homes$oct + homes$nov + homes$dec)/12

homes <- homes %>%
  select(-c(jan, feb, mar, apr, may, jun, jul, aug, sep, oct, nov, dec))

cat('Dimension of selected homes data:', dim(homes))
## Dimension of selected homes data: 2840 4
head(homes,1)
##               county state                          metro avg_home_value
## 1 Los Angeles County    CA Los Angeles-Long Beach-Anaheim       623588.4
write.csv(homes, paste(path_out, 'clean_homes.csv'), row.names = FALSE)

Join files
Now we’ll join the education, wages, and home values dataframes using state and county as the keys. This will give us a master data file to use when we start the project.

#Join education and wages using sql
master <- sqldf('SELECT * 
                    FROM education
                    INNER JOIN wages ON (education.county = wages.county AND education.state_long =
                    wages.state)')

master <- clean_names(master)
cat('Master data size after joining education and wages is', dim(master), '\n')
## Master data size after joining education and wages is 3087 18
#Join master with homes using merge
master <- merge(master, homes)

#Clean the master by removing unneeded columns
master <- master %>%
  select(-c(state_short, state_long, state_2, county_2, ownership, position))

cat('Master data with education, wages, and homes data is', dim(master))
## Master data with education, wages, and homes data is 2745 14
head(master,1)
##   state                       county dropout hs_diploma some_college
## 1    AK Fairbanks North Star Borough   3,381     13,785       24,520
##   four_year_degree dropout_percent hs_diploma_percent some_college_percent
## 1           20,152             5.5               22.3                 39.7
##   four_year_degree_percent resident_count avg_annual_pay     metro
## 1                     32.6          2,275          53107 Fairbanks
##   avg_home_value
## 1       244178.1
write.csv(master, paste(path_out, 'master.csv'), row.names = FALSE)

Research question

Can education and wages be used as predictors for home values?

There are many factors theorized to influence a home’s value and for this project we’ll attempt to answer if two popular topics, education and wages, can be used as predictors for a third popular topic, home values.

Cases

Cases are counties in the United States for this project and there are 2,745 that we’ll use to answer the research question.

Data source and method of collection:

Data is collected from several sources listed below:

Wages: Quarterly Census of Employment and Wages
The Quarterly Census of Employment and Wages (QCEW) program publishes a quarterly count of employment and wages reported by employers covering more than 95 percent of U.S. jobs, available at the county, MSA, state and national levels by industry.

Education: Local Education Agency
Revenues and expenditures are audited after the close of the fiscal year and are then submitted to NCES by each state education agency.

Home Values: Zillow
Zillow Home Value Index (ZHVI) is a smoothed, seasonally adjusted measure of the typical home value and market changes across a given region and housing type. Zillow publishes top-tier ZHVI (typical value for homes within the 65th to 95th percentile range for a given region) and bottom-tier ZHVI (typical value for homes that fall within the 5th to 35th percentile range for a given region).

Type of study

This is an observational study.

Dependent Variable

The response variable is home values and is quantitative.

Independent Variable

There are two independent variables; wages which is quantitative and education level which is qualitative.

Relevant summary statistics

Description of the master data file <>

describe(master)
##                          vars    n      mean        sd   median   trimmed
## state*                      1 2745       NaN        NA       NA       NaN
## county*                     2 2745       NaN        NA       NA       NaN
## dropout*                    3 2745   1319.57    796.70   1323.0   1314.32
## hs_diploma*                 4 2745   1516.98    852.64   1504.0   1514.10
## some_college*               5 2745   1456.79    842.45   1455.0   1450.73
## four_year_degree*           6 2745   1383.88    842.60   1370.0   1378.21
## dropout_percent             7 2745     13.38      6.04     12.2     12.86
## hs_diploma_percent          8 2745     34.36      7.29     34.6     34.62
## some_college_percent        9 2745     30.56      5.01     30.5     30.54
## four_year_degree_percent   10 2745     21.70      9.54     19.2     20.40
## resident_count*            11 2745   3281.06   1803.78   3556.0   3334.82
## avg_annual_pay             12 2745  41527.10   9278.22  39763.0  40322.15
## metro*                     13 2745    263.23    282.75    156.0    230.55
## avg_home_value             14 2745 160108.92 111972.30 130851.3 141978.32
##                               mad      min       max     range  skew kurtosis
## state*                         NA      Inf      -Inf      -Inf    NA       NA
## county*                        NA      Inf      -Inf      -Inf    NA       NA
## dropout*                  1022.99     2.00    2714.0    2712.0  0.03    -1.21
## hs_diploma*               1065.99     2.00    3021.0    3019.0  0.02    -1.17
## some_college*             1065.99     2.00    2970.0    2968.0  0.04    -1.17
## four_year_degree*         1110.47     3.00    2830.0    2827.0  0.04    -1.25
## dropout_percent              5.78     1.40      48.5      47.1  0.90     1.05
## hs_diploma_percent           7.12     8.10      55.6      47.5 -0.32    -0.03
## some_college_percent         5.04    11.40      48.0      36.6  0.02    -0.03
## four_year_degree_percent     7.41     6.90      74.6      67.7  1.39     2.22
## resident_count*           2271.34     3.00    6126.0    6123.0 -0.22    -1.11
## avg_annual_pay            6241.75     0.00  134664.0  134664.0  2.61    14.38
## metro*                     229.80     1.00     860.0     859.0  0.63    -1.06
## avg_home_value           62020.49 29574.67 1527482.2 1497907.5  4.18    31.44
##                               se
## state*                        NA
## county*                       NA
## dropout*                   15.21
## hs_diploma*                16.27
## some_college*              16.08
## four_year_degree*          16.08
## dropout_percent             0.12
## hs_diploma_percent          0.14
## some_college_percent        0.10
## four_year_degree_percent    0.18
## resident_count*            34.43
## avg_annual_pay            177.09
## metro*                      5.40
## avg_home_value           2137.17

Education <>
More analysis of education will be done in the project. Below are histograms of high school graduation rates and four year college graduation rates.

#Percent with high school diploma
ggplot(master, aes(x=hs_diploma_percent)) + geom_histogram()

#Percent with four year degrees
ggplot(master, aes(x=four_year_degree_percent)) + geom_histogram()

Avg annual pay <>
Average annual pay is skewed right. This makes sense as all US states are in this dataset.

ggplot(master, aes(x=avg_annual_pay)) + geom_histogram()

Home values plot <>
Similar to wages, home values in some areas are very high, giving the distribution a right skew

ggplot(master, aes(x=avg_home_value)) + geom_histogram()