library(tidyverse)
## ── Attaching packages ──── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 2.0.1 ✔ dplyr 0.8.0.1
## ✔ tidyr 0.8.1 ✔ stringr 1.4.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ─────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(openintro)
## Please visit openintro.org for free statistics materials
##
## Attaching package: 'openintro'
## The following object is masked from 'package:ggplot2':
##
## diamonds
## The following objects are masked from 'package:datasets':
##
## cars, trees
Create Train and Test. Use California, Oregon and Washington for training. Use Michigan and Indiana for testing.
Train = filter(countyComplete,state %in% c("Washington","Oregon","California"))
mod1 = lm(median_household_income~density + bachelors,data = Train)
summary(mod1)
##
## Call:
## lm(formula = median_household_income ~ density + bachelors, data = Train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32706 -5329 -509 4846 20061
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.099e+04 2.099e+03 14.763 < 2e-16 ***
## density 5.674e-01 5.170e-01 1.098 0.274
## bachelors 8.033e+02 8.694e+01 9.239 6.17e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8465 on 130 degrees of freedom
## Multiple R-squared: 0.4652, Adjusted R-squared: 0.457
## F-statistic: 56.54 on 2 and 130 DF, p-value: < 2.2e-16
Test = filter(countyComplete,state %in% c("Michigan","Indiana","Ohio"))
pred = predict(mod1,Test)
head(pred)
## 1 2 3 4 5 6
## 40844.73 52259.40 52541.69 42004.47 40831.06 61587.55
Review = cbind(Test,pred) %>%
select(state, name, median_household_income, pred) %>%
mutate(res = median_household_income - pred)
sd(Review$res)
## [1] 6553.46
Review %>% ggplot(aes(median_household_income,pred)) + geom_point()
summary(Train$median_household_income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33403 41818 46430 49979 56439 89268
summary(Test$median_household_income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 31205 40040 44280 45527 49236 87908
sstot = sum((Review$median_household_income - mean(Review$median_household))^2)
ssres = sum(Review$res^2)
rsq = 1 - (ssres/sstot)
rsq
## [1] 0.3611946
We need to base our train and test split on randomization.
# Randomize the dataframe
# Determine the number of rows in the dataframe
nrows = nrow(countyComplete)
nrows
## [1] 3143
# Get a vector with the row numbers of the dataframe scrambled
# Use a seed to enable replication
set.seed(999)
rows <- sample(nrows)
head(rows)
## [1] 1223 1832 298 2678 2470 375
str(rows)
## int [1:3143] 1223 1832 298 2678 2470 375 1903 254 1226 1942 ...
# Scramble the dataframe itself
countyComplete = countyComplete[rows,]
# Look at the head of the scrambled dataframe
head(countyComplete)
## state name FIPS pop2010 pop2000 age_under_5
## 1223 Massachusetts Franklin County 25011 71372 71535 4.8
## 1832 New York Broome County 36007 200600 200536 5.2
## 298 Colorado Rio Grande County 8105 11982 12413 6.6
## 2678 Texas McLennan County 48309 234906 213517 7.1
## 2470 Tennessee Houston County 47083 8426 8088 5.6
## 375 Florida St. Johns County 12109 190039 123135 5.3
## age_under_18 age_over_65 female white black native asian pac_isl
## 1223 19.7 15.2 51.2 94.2 1.1 0.3 1.3 NA
## 1832 20.2 16.4 51.0 88.0 4.8 0.2 3.5 NA
## 298 25.2 16.2 50.3 78.3 0.4 1.8 0.4 0
## 2678 25.4 12.5 51.4 69.8 14.8 0.6 1.4 0
## 2470 23.6 17.8 50.8 95.1 2.3 0.3 0.3 0
## 375 23.1 15.7 51.4 89.3 5.6 0.3 2.1 NA
## two_plus_races hispanic white_not_hispanic no_move_in_one_plus_year
## 1223 2.1 3.2 92.4 87.1
## 1832 2.5 3.4 86.3 86.6
## 298 3.2 42.4 55.1 88.1
## 2678 2.5 23.6 58.9 81.1
## 2470 1.6 1.5 94.1 88.8
## 375 1.8 5.2 85.3 81.9
## foreign_born foreign_spoken_at_home hs_grad bachelors veterans
## 1223 4.1 5.9 91.1 32.5 6672
## 1832 6.1 9.0 88.3 25.1 16633
## 298 5.2 26.9 83.9 19.2 963
## 2678 8.2 18.2 80.3 20.6 17934
## 2470 0.4 1.0 79.6 7.5 828
## 375 6.3 8.0 92.8 38.7 19062
## mean_work_travel housing_units home_ownership housing_multi_unit
## 1223 24.6 33758 70.0 27.9
## 1832 18.2 90563 66.6 31.8
## 298 19.0 6630 78.9 8.9
## 2678 18.5 95124 60.1 24.3
## 2470 27.8 4188 73.6 5.8
## 375 25.8 89830 77.1 20.0
## median_val_owner_occupied households persons_per_household
## 1223 222000 30447 2.32
## 1832 99500 80806 2.36
## 298 129300 3630 3.16
## 2678 101000 82998 2.68
## 2470 87900 3392 2.40
## 375 294100 70324 2.53
## per_capita_income median_household_income poverty
## 1223 27544 52002 11.3
## 1832 24314 44457 15.5
## 298 17199 39871 17.1
## 2678 20652 40672 20.5
## 2470 17791 33738 19.4
## 375 36027 62663 9.1
## private_nonfarm_establishments private_nonfarm_employment
## 1223 1623 20769
## 1832 4287 74702
## 298 377 2730
## 2678 4955 94548
## 2470 100 925
## 375 4958 44374
## percent_change_private_nonfarm_employment
## 1223 -16.1
## 1832 -10.0
## 298 -0.2
## 2678 4.3
## 2470 -15.2
## 375 17.1
## nonemployment_establishments firms black_owned_firms
## 1223 6025 7193 NA
## 1832 9407 14023 1.9
## 298 1011 1791 NA
## 2678 13208 16888 5.1
## 2470 559 810 NA
## 375 15342 19871 2.3
## native_owned_firms asian_owned_firms pac_isl_owned_firms
## 1223 NA NA NA
## 1832 1.0 NA NA
## 298 NA NA NA
## 2678 0.7 1.6 NA
## 2470 NA NA NA
## 375 0.5 2.7 NA
## hispanic_owned_firms women_owned_firms manufacturer_shipments_2007
## 1223 NA 31.2 1164378
## 1832 NA 29.0 2525374
## 298 16.1 24.6 0
## 2678 8.9 28.4 5888916
## 2470 NA 17.8 0
## 375 4.9 26.1 533263
## mercent_whole_sales_2007 sales sales_per_capita
## 1223 460103 691013 9628
## 1832 1958804 2573019 13196
## 298 218177 110165 9480
## 2678 4606220 2942576 12892
## 2470 0 33434 4152
## 375 871130 2039710 11601
## accommodation_food_service building_permits fed_spending area
## 1223 80305 58 543819 699.32
## 1832 326420 60 1723923 705.77
## 298 14191 21 102744 911.96
## 2678 375262 602 1871880 1037.10
## 2470 2725 3 88123 200.29
## 375 507634 1268 1296975 600.66
## density
## 1223 102.1
## 1832 284.2
## 298 13.1
## 2678 226.5
## 2470 42.1
## 375 316.4
# Get a splitting point, about 80% of the dataframe
split <- round(nrow(countyComplete) * .8)
# Split into train and test dataframes
train = countyComplete[1:split,]
test = countyComplete[(split+1):nrows,]
# Check the results
nrow(train)
## [1] 2514
nrow(test)
## [1] 629
nrow(train) + nrow(test)
## [1] 3143