Data 606 Final Project Proposal

#libraries
library(dplyr)
library(janitor)
library(sqldf)
library(stringr)
library(psych)
library(ggplot2)

Data Preparation

In this section we’ll load education, wage, and housing data. For each load the features of interest will be selected, rows filtered as needed, and the results written to clean_education, clean_wages, and clean_homes csv files. A fourth csv called “master” is also created that is a merge of all three files using state and county as the key fields.

Education data

# load education data
education <- read.csv('https://raw.githubusercontent.com/krpopkin/Data606_Final_Project_Repository/master/01Data/Education(Graduation_Rates).csv')

cat('Dimension of raw education data', dim(education),"\n")

## Dimension of raw education data 3283 47

#identify columns of interest and rename
education <- clean_names(education)
education <- select(education, 
                    state, 
                    county = area_name,
                    dropout = less_than_a_high_school_diploma_2014_18,
                    hs_diploma = high_school_diploma_only_2014_18,
                    some_college = some_college_or_associate_s_degree_2014_18,
                    four_year_degree = bachelor_s_degree_or_higher_2014_18,
                    dropout_percent = percent_of_adults_with_less_than_a_high_school_diploma_2014_18,
                    hs_diploma_percent = percent_of_adults_with_a_high_school_diploma_only_2014_18,
                    some_college_percent = percent_of_adults_completing_some_college_or_associate_s_degree_2014_18,
                    four_year_degree_percent = percent_of_adults_with_a_bachelor_s_degree_or_higher_2014_18)

cat('Dimension of selected education data', dim(education), "\n")

## Dimension of selected education data 3283 10

#remove rows that are missing education data
education <- subset(education, dropout != '')

#add the full state name to the education dataframe
state_mapping <- read.csv('https://raw.githubusercontent.com/krpopkin/Data606_Final_Project_Repository/master/01Data/state_mapping.csv')

education <- sqldf('SELECT * 
                    FROM education
                    JOIN state_mapping ON education.state = state_mapping.state_short')

cat('Dimension of education data after cleaning and filtering', dim(education))

## Dimension of education data after cleaning and filtering 3272 12

head(education,1)

##   state  county dropout hs_diploma some_college four_year_degree
## 1    AL Alabama 470,043  1,020,172      987,148          822,595
##   dropout_percent hs_diploma_percent some_college_percent
## 1            14.2               30.9                 29.9
##   four_year_degree_percent state_short state_long
## 1                     24.9          AL    Alabama

path_out = 'C:\\Users\\user\\Documents\\00_Applications_DataScience\\CUNY\\DATA606\\Data606_Final_Project_Repository\\01Data\\'
write.csv(education, paste(path_out, 'clean_education.csv'), row.names = FALSE)

Wages data

# load wages data
wages <- read.csv('https://raw.githubusercontent.com/krpopkin/Data606_Final_Project_Repository/master/01Data/allhlcn18.csv')

cat('Dimension of raw wage data:', dim(wages), '\n')

## Dimension of raw wage data: 62889 20

#filter to columns and rows of interest
wages <- wages %>%
  select(type = Area.Type,
         state = St.Name, 
         county = Area, 
         ownership = Ownership,
         resident_count = Annual.Average.Establishment.Count,
         avg_annual_pay = Annual.Average.Pay) %>%
  subset(type == 'County' & ownership == 'Total Covered') %>%
  select(-type)

#Remove the state name from the data in the county column,ie "Bibb County, Alabama" becomes "Bibb County"
wages$position <- (regexpr(pattern =',', wages$county)) - 1
wages$county <- str_sub(wages$county, end = wages$position)

#Convert from fctr to double
wages$avg_annual_pay = as.numeric(gsub("\\,", "", wages$avg_annual_pay))
wages$avg_annual_pay = as.numeric(as.character(wages$avg_annual_pay))

cat('Dimension of wages data after filtering', dim(wages))

## Dimension of wages data after filtering 3191 6

head(wages,1)

##      state         county     ownership resident_count avg_annual_pay position
## 36 Alabama Autauga County Total Covered            871          38407       14

write.csv(wages, paste(path_out, 'clean_wages.csv'), row.names = FALSE)

Home Values data

#load home values data
homes <- read.csv('https://raw.githubusercontent.com/krpopkin/Data606_Final_Project_Repository/master/01Data/County_Zhvi_AllHomes(Home_Values).csv')

homes <- clean_names(homes)

cat('Dimension of homes data:', dim(homes), '\n')

## Dimension of homes data: 2840 293

homes <-  homes %>%
  select(county = region_name, state = state, metro = metro,
         jan = x2018_01, feb = x2018_02, mar = x2018_03, apr = x2018_04, may = x2018_05, jun = x2018_06,
         jul = x2018_07, aug = x2018_08, sep = x2018_09, oct = x2018_10, nov = x2018_11, dec = x2018_12)

homes$avg_home_value = (homes$jan + homes$feb + homes$mar + homes$apr + homes$may + homes$jun + homes$jul + homes$aug + homes$sep + homes$oct + homes$nov + homes$dec)/12

homes <- homes %>%
  select(-c(jan, feb, mar, apr, may, jun, jul, aug, sep, oct, nov, dec))

cat('Dimension of selected homes data:', dim(homes))

## Dimension of selected homes data: 2840 4

head(homes,1)

##               county state                          metro avg_home_value
## 1 Los Angeles County    CA Los Angeles-Long Beach-Anaheim       623588.4

write.csv(homes, paste(path_out, 'clean_homes.csv'), row.names = FALSE)

Join files
Now we’ll join the education, wages, and home values dataframes using state and county as the keys. This will give us a master data file to use when we start the project.

#Join education and wages using sql
master <- sqldf('SELECT * 
                    FROM education
                    INNER JOIN wages ON (education.county = wages.county AND education.state_long =
                    wages.state)')

master <- clean_names(master)
cat('Master data size after joining education and wages is', dim(master), '\n')

## Master data size after joining education and wages is 3087 18

#Join master with homes using merge
master <- merge(master, homes)

#Clean the master by removing unneeded columns
master <- master %>%
  select(-c(state_short, state_long, state_2, county_2, ownership, position))

cat('Master data with education, wages, and homes data is', dim(master))

## Master data with education, wages, and homes data is 2745 14

head(master,1)

##   state                       county dropout hs_diploma some_college
## 1    AK Fairbanks North Star Borough   3,381     13,785       24,520
##   four_year_degree dropout_percent hs_diploma_percent some_college_percent
## 1           20,152             5.5               22.3                 39.7
##   four_year_degree_percent resident_count avg_annual_pay     metro
## 1                     32.6          2,275          53107 Fairbanks
##   avg_home_value
## 1       244178.1

write.csv(master, paste(path_out, 'master.csv'), row.names = FALSE)

Research question

Can education and wages be used as predictors for home values?

There are many factors theorized to influence a home’s value and for this project we’ll attempt to answer if two popular topics, education and wages, can be used as predictors for a third popular topic, home values.

Cases

Cases are counties in the United States for this project and there are 2,745 that we’ll use to answer the research question.

Data source and method of collection:

Data is collected from several sources listed below:

Wages: Quarterly Census of Employment and Wages
The Quarterly Census of Employment and Wages (QCEW) program publishes a quarterly count of employment and wages reported by employers covering more than 95 percent of U.S. jobs, available at the county, MSA, state and national levels by industry.

Education: Local Education Agency
Revenues and expenditures are audited after the close of the fiscal year and are then submitted to NCES by each state education agency.

Home Values: Zillow
Zillow Home Value Index (ZHVI) is a smoothed, seasonally adjusted measure of the typical home value and market changes across a given region and housing type. Zillow publishes top-tier ZHVI (typical value for homes within the 65th to 95th percentile range for a given region) and bottom-tier ZHVI (typical value for homes that fall within the 5th to 35th percentile range for a given region).

Type of study

This is an observational study.

Dependent Variable

The response variable is home values and is quantitative.

Independent Variable

There are two independent variables; wages which is quantitative and education level which is qualitative.

Relevant summary statistics

Description of the master data file <>

describe(master)

## vars n mean sd median trimmed ## state* 1 2745 NaN NA NA NaN ## county* 2 2745 NaN NA NA NaN ## dropout* 3 2745 1319.57 796.70 1323.0 1314.32 ## hs_diploma* 4 2745 1516.98 852.64 1504.0 1514.10 ## some_college* 5 2745 1456.79 842.45 1455.0 1450.73 ## four_year_degree* 6 2745 1383.88 842.60 1370.0 1378.21 ## dropout_percent 7 2745 13.38 6.04 12.2 12.86 ## hs_diploma_percent 8 2745 34.36 7.29 34.6 34.62 ## some_college_percent 9 2745 30.56 5.01 30.5 30.54 ## four_year_degree_percent 10 2745 21.70 9.54 19.2 20.40 ## resident_count* 11 2745 3281.06 1803.78 3556.0 3334.82 ## avg_annual_pay 12 2745 41527.10 9278.22 39763.0 40322.15 ## metro* 13 2745 263.23 282.75 156.0 230.55 ## avg_home_value 14 2745 160108.92 111972.30 130851.3 141978.32 ## mad min max range skew kurtosis ## state* NA Inf -Inf -Inf NA NA ## county* NA Inf -Inf -Inf NA NA ## dropout* 1022.99 2.00 2714.0 2712.0 0.03 -1.21 ## hs_diploma* 1065.99 2.00 3021.0 3019.0 0.02 -1.17 ## some_college* 1065.99 2.00 2970.0 2968.0 0.04 -1.17 ## four_year_degree* 1110.47 3.00 2830.0 2827.0 0.04 -1.25 ## dropout_percent 5.78 1.40 48.5 47.1 0.90 1.05 ## hs_diploma_percent 7.12 8.10 55.6 47.5 -0.32 -0.03 ## some_college_percent 5.04 11.40 48.0 36.6 0.02 -0.03 ## four_year_degree_percent 7.41 6.90 74.6 67.7 1.39 2.22 ## resident_count* 2271.34 3.00 6126.0 6123.0 -0.22 -1.11 ## avg_annual_pay 6241.75 0.00 134664.0 134664.0 2.61 14.38 ## metro* 229.80 1.00 860.0 859.0 0.63 -1.06 ## avg_home_value 62020.49 29574.67 1527482.2 1497907.5 4.18 31.44 ## se ## state* NA ## county* NA ## dropout* 15.21 ## hs_diploma* 16.27 ## some_college* 16.08 ## four_year_degree* 16.08 ## dropout_percent 0.12 ## hs_diploma_percent 0.14 ## some_college_percent 0.10 ## four_year_degree_percent 0.18 ## resident_count* 34.43 ## avg_annual_pay 177.09 ## metro* 5.40 ## avg_home_value 2137.17

Education <>
More analysis of education will be done in the project. Below are histograms of high school graduation rates and four year college graduation rates.

#Percent with high school diploma ggplot(master, aes(x=hs_diploma_percent)) + geom_histogram()

#Percent with four year degrees ggplot(master, aes(x=four_year_degree_percent)) + geom_histogram()

Avg annual pay <>
Average annual pay is skewed right. This makes sense as all US states are in this dataset.

ggplot(master, aes(x=avg_annual_pay)) + geom_histogram()

Home values plot <>
Similar to wages, home values in some areas are very high, giving the distribution a right skew

ggplot(master, aes(x=avg_home_value)) + geom_histogram()