Train Test

library(tidyverse)

## ── Attaching packages ──── tidyverse 1.2.1 ──

## ✔ ggplot2 3.0.0       ✔ purrr   0.2.5  
## ✔ tibble  2.0.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.1       ✔ stringr 1.4.0  
## ✔ readr   1.1.1       ✔ forcats 0.3.0

## ── Conflicts ─────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(openintro)

## Please visit openintro.org for free statistics materials

## 
## Attaching package: 'openintro'

## The following object is masked from 'package:ggplot2':
## 
##     diamonds

## The following objects are masked from 'package:datasets':
## 
##     cars, trees

Create Train and Test. Use California, Oregon and Washington for training. Use Michigan and Indiana for testing.

Train = filter(countyComplete,state %in% c("Washington","Oregon","California"))
mod1 = lm(median_household_income~density + bachelors,data = Train)
summary(mod1)

## 
## Call:
## lm(formula = median_household_income ~ density + bachelors, data = Train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -32706  -5329   -509   4846  20061 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.099e+04  2.099e+03  14.763  < 2e-16 ***
## density     5.674e-01  5.170e-01   1.098    0.274    
## bachelors   8.033e+02  8.694e+01   9.239 6.17e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8465 on 130 degrees of freedom
## Multiple R-squared:  0.4652, Adjusted R-squared:  0.457 
## F-statistic: 56.54 on 2 and 130 DF,  p-value: < 2.2e-16

Test = filter(countyComplete,state %in% c("Michigan","Indiana","Ohio"))
pred = predict(mod1,Test)
head(pred)

##        1        2        3        4        5        6 
## 40844.73 52259.40 52541.69 42004.47 40831.06 61587.55

Review = cbind(Test,pred) %>% 
  select(state, name, median_household_income, pred) %>% 
  mutate(res = median_household_income - pred)

sd(Review$res)

## [1] 6553.46

Review %>% ggplot(aes(median_household_income,pred)) + geom_point()

summary(Train$median_household_income)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   33403   41818   46430   49979   56439   89268

summary(Test$median_household_income)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   31205   40040   44280   45527   49236   87908

sstot = sum((Review$median_household_income - mean(Review$median_household))^2)
ssres = sum(Review$res^2)
rsq = 1 - (ssres/sstot)
rsq

## [1] 0.3611946

We need to base our train and test split on randomization.

Code for Randomized Train and Test

# Randomize the dataframe


# Determine the number of rows in the dataframe
nrows = nrow(countyComplete)
nrows

## [1] 3143

# Get a vector with the row numbers of the dataframe scrambled

# Use a seed to enable replication
set.seed(999)

rows <- sample(nrows)
head(rows)

## [1] 1223 1832  298 2678 2470  375

str(rows)

##  int [1:3143] 1223 1832 298 2678 2470 375 1903 254 1226 1942 ...

# Scramble the dataframe itself
countyComplete = countyComplete[rows,]

# Look at the head of the scrambled dataframe
head(countyComplete)

##              state              name  FIPS pop2010 pop2000 age_under_5
## 1223 Massachusetts   Franklin County 25011   71372   71535         4.8
## 1832      New York     Broome County 36007  200600  200536         5.2
## 298       Colorado Rio Grande County  8105   11982   12413         6.6
## 2678         Texas   McLennan County 48309  234906  213517         7.1
## 2470     Tennessee    Houston County 47083    8426    8088         5.6
## 375        Florida  St. Johns County 12109  190039  123135         5.3
##      age_under_18 age_over_65 female white black native asian pac_isl
## 1223         19.7        15.2   51.2  94.2   1.1    0.3   1.3      NA
## 1832         20.2        16.4   51.0  88.0   4.8    0.2   3.5      NA
## 298          25.2        16.2   50.3  78.3   0.4    1.8   0.4       0
## 2678         25.4        12.5   51.4  69.8  14.8    0.6   1.4       0
## 2470         23.6        17.8   50.8  95.1   2.3    0.3   0.3       0
## 375          23.1        15.7   51.4  89.3   5.6    0.3   2.1      NA
##      two_plus_races hispanic white_not_hispanic no_move_in_one_plus_year
## 1223            2.1      3.2               92.4                     87.1
## 1832            2.5      3.4               86.3                     86.6
## 298             3.2     42.4               55.1                     88.1
## 2678            2.5     23.6               58.9                     81.1
## 2470            1.6      1.5               94.1                     88.8
## 375             1.8      5.2               85.3                     81.9
##      foreign_born foreign_spoken_at_home hs_grad bachelors veterans
## 1223          4.1                    5.9    91.1      32.5     6672
## 1832          6.1                    9.0    88.3      25.1    16633
## 298           5.2                   26.9    83.9      19.2      963
## 2678          8.2                   18.2    80.3      20.6    17934
## 2470          0.4                    1.0    79.6       7.5      828
## 375           6.3                    8.0    92.8      38.7    19062
##      mean_work_travel housing_units home_ownership housing_multi_unit
## 1223             24.6         33758           70.0               27.9
## 1832             18.2         90563           66.6               31.8
## 298              19.0          6630           78.9                8.9
## 2678             18.5         95124           60.1               24.3
## 2470             27.8          4188           73.6                5.8
## 375              25.8         89830           77.1               20.0
##      median_val_owner_occupied households persons_per_household
## 1223                    222000      30447                  2.32
## 1832                     99500      80806                  2.36
## 298                     129300       3630                  3.16
## 2678                    101000      82998                  2.68
## 2470                     87900       3392                  2.40
## 375                     294100      70324                  2.53
##      per_capita_income median_household_income poverty
## 1223             27544                   52002    11.3
## 1832             24314                   44457    15.5
## 298              17199                   39871    17.1
## 2678             20652                   40672    20.5
## 2470             17791                   33738    19.4
## 375              36027                   62663     9.1
##      private_nonfarm_establishments private_nonfarm_employment
## 1223                           1623                      20769
## 1832                           4287                      74702
## 298                             377                       2730
## 2678                           4955                      94548
## 2470                            100                        925
## 375                            4958                      44374
##      percent_change_private_nonfarm_employment
## 1223                                     -16.1
## 1832                                     -10.0
## 298                                       -0.2
## 2678                                       4.3
## 2470                                     -15.2
## 375                                       17.1
##      nonemployment_establishments firms black_owned_firms
## 1223                         6025  7193                NA
## 1832                         9407 14023               1.9
## 298                          1011  1791                NA
## 2678                        13208 16888               5.1
## 2470                          559   810                NA
## 375                         15342 19871               2.3
##      native_owned_firms asian_owned_firms pac_isl_owned_firms
## 1223                 NA                NA                  NA
## 1832                1.0                NA                  NA
## 298                  NA                NA                  NA
## 2678                0.7               1.6                  NA
## 2470                 NA                NA                  NA
## 375                 0.5               2.7                  NA
##      hispanic_owned_firms women_owned_firms manufacturer_shipments_2007
## 1223                   NA              31.2                     1164378
## 1832                   NA              29.0                     2525374
## 298                  16.1              24.6                           0
## 2678                  8.9              28.4                     5888916
## 2470                   NA              17.8                           0
## 375                   4.9              26.1                      533263
##      mercent_whole_sales_2007   sales sales_per_capita
## 1223                   460103  691013             9628
## 1832                  1958804 2573019            13196
## 298                    218177  110165             9480
## 2678                  4606220 2942576            12892
## 2470                        0   33434             4152
## 375                    871130 2039710            11601
##      accommodation_food_service building_permits fed_spending    area
## 1223                      80305               58       543819  699.32
## 1832                     326420               60      1723923  705.77
## 298                       14191               21       102744  911.96
## 2678                     375262              602      1871880 1037.10
## 2470                       2725                3        88123  200.29
## 375                      507634             1268      1296975  600.66
##      density
## 1223   102.1
## 1832   284.2
## 298     13.1
## 2678   226.5
## 2470    42.1
## 375    316.4

# Get a splitting point, about 80% of the dataframe
split <- round(nrow(countyComplete) * .8)

# Split into train and test dataframes
train = countyComplete[1:split,]
test = countyComplete[(split+1):nrows,]

# Check the results
nrow(train)

## [1] 2514

nrow(test)

## [1] 629

nrow(train) + nrow(test)

## [1] 3143

Train Test

Harold Nelson

3/27/2019

Code for Randomized Train and Test