library(tidyverse)
## ── Attaching packages ── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
Let’s create a simple dataframe using data.frame().
# Two parallel character vectors: one name and one gender code per person.
Name <- c("Tom", "Dick", "Harry", "Mary", "Sally", "Susan")
Gender <- rep(c("m", "f"), each = 3)
# Bind them column-wise; the columns are named after the variables.
People <- data.frame(Name, Gender)
# Display the resulting data frame.
People
## Name Gender
## 1 Tom m
## 2 Dick m
## 3 Harry m
## 4 Mary f
## 5 Sally f
## 6 Susan f
# Show the compact structure of People: its dimensions and the type of each column.
str(People)
## 'data.frame': 6 obs. of 2 variables:
## $ Name : Factor w/ 6 levels "Dick","Harry",..: 6 1 2 3 4 5
## $ Gender: Factor w/ 2 levels "f","m": 2 2 2 1 1 1
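Note that data.frame() created both People$Name and People$Gender as factors. (This is the default behavior in R versions before 4.0.0; from R 4.0.0 onward, stringsAsFactors defaults to FALSE.) How could we prevent this?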
# Rebuild the data frame, this time telling data.frame() to keep the strings
# as character vectors instead of converting them to factors.
People2 <- data.frame(
  Name,
  Gender,
  stringsAsFactors = FALSE
)
# Confirm that both columns are now of type chr.
str(People2)
## 'data.frame': 6 obs. of 2 variables:
## $ Name : chr "Tom" "Dick" "Harry" "Mary" ...
## $ Gender: chr "m" "m" "m" "f" ...
How can we turn People2$Gender into a factor? I’ll create the factor version as a separate variable.
# Keep the original character column and add a factor copy alongside it.
People2[["GenderF"]] <- factor(People2[["Gender"]])
People2
## Name Gender GenderF
## 1 Tom m m
## 2 Dick m m
## 3 Harry m m
## 4 Mary f f
## 5 Sally f f
## 6 Susan f f
# Compare the column types: Gender is chr, GenderF is a factor.
str(People2)
## 'data.frame': 6 obs. of 3 variables:
## $ Name : chr "Tom" "Dick" "Harry" "Mary" ...
## $ Gender : chr "m" "m" "m" "f" ...
## $ GenderF: Factor w/ 2 levels "f","m": 2 2 2 1 1 1
Gender and GenderF look the same, but they do differ in some respects. There’s some ambiguity with GenderF: it is really stored as integer codes in the dataframe, together with a separate table of levels that maps each code to its character string. Watch what happens when we create numeric versions of these two variables using as.numeric().
# Coerce the character column to numeric; the strings are not numbers,
# so this yields NA values and a coercion warning.
GenderNum <- as.numeric(People2[["Gender"]])
## Warning: NAs introduced by coercion
# Coerce the factor column to numeric; this returns the factor's
# underlying integer codes rather than anything derived from the labels.
GenderFNum <- as.numeric(People2[["GenderF"]])
The first conversion produced a warning. Let’s look at our vectors. First check GenderNum.
# Print the result of coercing the character column.
GenderNum
## [1] NA NA NA NA NA NA
Now look at GenderFNum.
# Print the result of coercing the factor column.
GenderFNum
## [1] 2 2 2 1 1 1
Can we treat GenderF as a numeric variable?
# Attempt arithmetic directly on the factor. Ops.factor rejects '+' with a
# warning and the result is all NA.
People2$GenderF + 1
## Warning in Ops.factor(People2$GenderF, 1): '+' not meaningful for factors
## [1] NA NA NA NA NA NA
No, we can’t do this.
What about using different character strings to represent our factor? We can select the values as we create the factor with the labels argument of the factor function.
# Rebuild the factor with display labels. No levels are given, so the labels
# are paired with the sorted unique values: "f" -> "Female", "m" -> "Male".
People2$GenderF <- factor(
  People2$Gender,
  labels = c("Female", "Male")
)
People2
## Name Gender GenderF
## 1 Tom m Male
## 2 Dick m Male
## 3 Harry m Male
## 4 Mary f Female
## 5 Sally f Female
## 6 Susan f Female
Note that the sorted values of the Gender vector are used to set the numeric values in the factor. We can override this using the levels argument of the factor function.
# Rebuild the factor with an explicit level order, so that "m" becomes
# code 1 and "f" becomes code 2 instead of the default sorted order.
People2$GenderF <- factor(
  x = People2$Gender,
  levels = c("m", "f")
)
People2
## Name Gender GenderF
## 1 Tom m m
## 2 Dick m m
## 3 Harry m m
## 4 Mary f f
## 5 Sally f f
## 6 Susan f f
# Inspect the factor itself to see its level order and integer codes.
str(People2$GenderF)
## Factor w/ 2 levels "m","f": 1 1 1 2 2 2
It looks the same, but str() reveals that the numeric value 1 now means “m”, not “f”.
This is important if we use the levels() function to assign new string values after the factor has been created, because the new values are matched to the existing levels by position. Suppose we forget that we changed the order and think in terms of the natural sorted order of values.
# Deliberate mistake for illustration: the current level order is "m", "f",
# so assigning by position relabels "m" as "Woman" and "f" as "Man".
levels(People2$GenderF) <- c("Woman", "Man")
# Print the data frame to see the mislabelled rows.
People2
## Name Gender GenderF
## 1 Tom m Woman
## 2 Dick m Woman
## 3 Harry m Woman
## 4 Mary f Man
## 5 Sally f Man
## 6 Susan f Man
Here’s an example with the countyComplete dataset from the openintro package.
library(openintro)
## Please visit openintro.org for free statistics materials
##
## Attaching package: 'openintro'
## The following object is masked from 'package:ggplot2':
##
## diamonds
## The following objects are masked from 'package:datasets':
##
## cars, trees
# Show the structure of countyComplete: its dimensions and the type of every column.
str(countyComplete)
## 'data.frame': 3143 obs. of 53 variables:
## $ state : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 1877 levels "Abbeville County",..: 83 90 101 151 166 227 237 250 298 320 ...
## $ FIPS : num 1001 1003 1005 1007 1009 ...
## $ pop2010 : num 54571 182265 27457 22915 57322 ...
## $ pop2000 : num 43671 140415 29038 20826 51024 ...
## $ age_under_5 : num 6.6 6.1 6.2 6 6.3 6.8 6.5 6.1 5.7 5.3 ...
## $ age_under_18 : num 26.8 23 21.9 22.7 24.6 22.3 24.1 22.9 22.5 21.4 ...
## $ age_over_65 : num 12 16.8 14.2 12.7 14.7 13.5 16.7 14.3 16.7 17.9 ...
## $ female : num 51.3 51.1 46.9 46.3 50.5 45.8 53 51.8 52.2 50.4 ...
## $ white : num 78.5 85.7 48 75.8 92.6 23 54.4 74.9 58.8 92.7 ...
## $ black : num 17.7 9.4 46.9 22 1.3 70.2 43.4 20.6 38.7 4.6 ...
## $ native : num 0.4 0.7 0.4 0.3 0.5 0.2 0.3 0.5 0.2 0.5 ...
## $ asian : num 0.9 0.7 0.4 0.1 0.2 0.2 0.8 0.7 0.5 0.2 ...
## $ pac_isl : num NA NA NA NA NA NA 0 0.1 0 0 ...
## $ two_plus_races : num 1.6 1.5 0.9 0.9 1.2 0.8 0.8 1.7 1.1 1.5 ...
## $ hispanic : num 2.4 4.4 5.1 1.8 8.1 7.1 0.9 3.3 1.6 1.2 ...
## $ white_not_hispanic : num 77.2 83.5 46.8 75 88.9 21.9 54.1 73.6 58.1 92.1 ...
## $ no_move_in_one_plus_year : num 86.3 83 83 90.5 87.2 88.5 92.8 82.9 86.2 88.1 ...
## $ foreign_born : num 2 3.6 2.8 0.7 4.7 1.1 1.1 2.5 0.9 0.5 ...
## $ foreign_spoken_at_home : num 3.7 5.5 4.7 1.5 7.2 3.8 1.6 4.5 1.6 1.4 ...
## $ hs_grad : num 85.3 87.6 71.9 74.5 74.7 74.7 74.8 78.5 71.8 73.4 ...
## $ bachelors : num 21.7 26.8 13.5 10 12.5 12 11 16.1 10.8 10.5 ...
## $ veterans : num 5817 20396 2327 1883 4072 ...
## $ mean_work_travel : num 25.1 25.8 23.8 28.3 33.2 28.1 25.1 22.1 23.6 26.2 ...
## $ housing_units : num 22135 104061 11829 8981 23887 ...
## $ home_ownership : num 77.5 76.7 68 82.9 82 76.9 69 70.7 71.4 77.5 ...
## $ housing_multi_unit : num 7.2 22.6 11.1 6.6 3.7 9.9 13.7 14.3 8.7 4.3 ...
## $ median_val_owner_occupied : num 133900 177200 88200 81200 113700 ...
## $ households : num 19718 69476 9795 7441 20605 ...
## $ persons_per_household : num 2.7 2.5 2.52 3.02 2.73 2.85 2.58 2.46 2.51 2.22 ...
## $ per_capita_income : num 24568 26469 15875 19918 21070 ...
## $ median_household_income : num 53255 50147 33219 41770 45549 ...
## $ poverty : num 10.6 12.2 25 12.6 13.4 25.3 25 19.5 20.3 17.6 ...
## $ private_nonfarm_establishments : num 877 4812 522 318 749 ...
## $ private_nonfarm_employment : num 10628 52233 7990 2927 6968 ...
## $ percent_change_private_nonfarm_employment: num 16.6 17.4 -27 -14 -11.4 -18.5 2.1 -5.6 -45.8 5.4 ...
## $ nonemployment_establishments : num 2971 14175 1527 1192 3501 ...
## $ firms : num 4067 19035 1667 1385 4458 ...
## $ black_owned_firms : num 15.2 2.7 NA 14.9 NA NA NA 7.2 NA NA ...
## $ native_owned_firms : num NA 0.4 NA NA NA NA NA NA NA NA ...
## $ asian_owned_firms : num 1.3 1 NA NA NA NA 3.3 1.6 NA NA ...
## $ pac_isl_owned_firms : num NA NA NA NA NA NA NA NA NA NA ...
## $ hispanic_owned_firms : num 0.7 1.3 NA NA NA NA NA 0.5 NA NA ...
## $ women_owned_firms : num 31.7 27.3 27 NA 23.2 38.8 NA 24.7 29.3 14.5 ...
## $ manufacturer_shipments_2007 : num NA 1410273 NA 0 341544 ...
## $ mercent_whole_sales_2007 : num NA NA NA NA NA ...
## $ sales : num 598175 2966489 188337 124707 319700 ...
## $ sales_per_capita : num 12003 17166 6334 5804 5622 ...
## $ accommodation_food_service : num 88157 436955 NA 10757 20941 ...
## $ building_permits : num 191 696 10 8 18 1 3 107 10 6 ...
## $ fed_spending : num 331142 1119082 240308 163201 294114 ...
## $ area : num 594 1590 885 623 645 ...
## $ density : num 91.8 114.6 31 36.8 88.9 ...
We see that the variable state is a factor.
Let’s get a count of counties in each state using the table() function.
# Count the rows (counties) for each level of the state factor.
table(countyComplete$state)
##
## Alabama Alaska Arizona
## 67 29 15
## Arkansas California Colorado
## 75 58 64
## Connecticut Delaware District of Columbia
## 8 3 1
## Florida Georgia Hawaii
## 67 159 5
## Idaho Illinois Indiana
## 44 102 92
## Iowa Kansas Kentucky
## 99 105 120
## Louisiana Maine Maryland
## 64 16 24
## Massachusetts Michigan Minnesota
## 14 83 87
## Mississippi Missouri Montana
## 82 115 56
## Nebraska Nevada New Hampshire
## 93 17 10
## New Jersey New Mexico New York
## 21 33 62
## North Carolina North Dakota Ohio
## 100 53 88
## Oklahoma Oregon Pennsylvania
## 77 36 67
## Rhode Island South Carolina South Dakota
## 5 46 66
## Tennessee Texas Utah
## 95 254 29
## Vermont Virginia Washington
## 14 134 39
## West Virginia Wisconsin Wyoming
## 55 72 23
Let’s create a smaller dataframe, pnw.
# Keep only the counties in Idaho, Washington and Oregon. filter() is
# namespace-qualified because dplyr::filter() masks stats::filter().
pnw <- countyComplete %>%
  dplyr::filter(state %in% c("Idaho", "Washington", "Oregon"))
# Inspect the subset; note the levels reported for the state factor.
str(pnw)
## 'data.frame': 119 obs. of 53 variables:
## $ state : Factor w/ 51 levels "Alabama","Alaska",..: 13 13 13 13 13 13 13 13 13 13 ...
## $ name : Factor w/ 1877 levels "Abbeville County",..: 4 6 98 120 135 156 160 168 173 174 ...
## $ FIPS : num 16001 16003 16005 16007 16009 ...
## $ pop2010 : num 392365 3976 82839 5986 9285 ...
## $ pop2000 : num 300904 3476 75565 6411 9171 ...
## $ age_under_5 : num 7.2 4.8 8.4 6.8 6.5 9.7 6.6 4.4 5.2 9.6 ...
## $ age_under_18 : num 26.4 19.2 27.4 27.3 23.7 33 24.3 21.5 21.7 31.5 ...
## $ age_over_65 : num 10.5 20.8 11.2 18.4 18.3 11.4 11.6 15.9 17.1 10.9 ...
## $ female : num 49.9 48.7 50.1 50.4 49 49.8 49.1 47.8 49.6 50.1 ...
## $ white : num 90.3 96.1 89.8 96.3 86.6 80.6 84.9 95.4 96 90.6 ...
## $ black : num 1.1 0.1 0.8 0.1 0.3 0.2 0.2 0.2 0.1 0.6 ...
## $ native : num 0.7 1 3.2 0.5 8.7 6.5 0.6 0.8 0.8 0.8 ...
## $ asian : num 2.4 0.4 1.3 0.4 0.3 0.6 0.9 0.4 0.5 0.8 ...
## $ pac_isl : num 0.2 0.1 0.2 0 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ two_plus_races : num 2.8 1.7 2.7 1.1 3.6 2.1 1.5 2.3 2.1 2.1 ...
## $ hispanic : num 7.1 2.4 6.7 3.6 2.5 17.2 20 3.5 2.2 11.4 ...
## $ white_not_hispanic : num 86.5 94.8 86.4 94.7 85.3 74.9 78 93.2 94.4 85.3 ...
## $ no_move_in_one_plus_year : num 79.2 85.5 79.4 85.4 86.9 85.5 83.2 84.6 87.3 82 ...
## $ foreign_born : num 5.8 1 2.9 0.3 1.2 6.4 15 2.3 2.8 5.1 ...
## $ foreign_spoken_at_home : num 9 1.5 6.4 3.3 3.1 15.5 18.5 3.5 3.3 9.9 ...
## $ hs_grad : num 92.9 88.4 90.5 90.4 87.2 84.6 92 87.2 91 90.6 ...
## $ bachelors : num 35 21.3 27.3 16.7 12.1 15.9 43.2 23.9 22.5 26.2 ...
## $ veterans : num 32352 459 6047 550 988 ...
## $ mean_work_travel : num 19.3 18.3 17.7 20.9 17.1 20.2 17.8 36.5 23.9 20.1 ...
## $ housing_units : num 159471 2636 33191 3914 4629 ...
## $ home_ownership : num 69.6 80.4 71.2 80.9 74.2 79.9 68.3 76.8 74.5 74.1 ...
## $ housing_multi_unit : num 18 3.1 19.5 7.5 5.8 9.6 29.9 1.3 10.1 17.4 ...
## $ median_val_owner_occupied : num 214500 205100 135500 135900 123000 ...
## $ households : num 145584 1700 29860 2538 3840 ...
## $ persons_per_household : num 2.55 2.31 2.65 2.37 2.39 3.08 2.29 2.3 2.21 2.81 ...
## $ per_capita_income : num 27915 22730 21275 19284 18312 ...
## $ median_household_income : num 55835 36004 44848 43374 37500 ...
## $ poverty : num 10.2 12.4 14 13.9 15.2 14.7 9.3 16.3 14.3 11 ...
## $ private_nonfarm_establishments : num 12394 143 2040 130 246 ...
## $ private_nonfarm_employment : num 166239 480 26890 1003 2190 ...
## $ percent_change_private_nonfarm_employment: num 8.5 -14.9 11.5 8.3 -14.6 10.2 4.4 -2.9 13.9 11.8 ...
## $ nonemployment_establishments : num 29533 359 4715 420 504 ...
## $ firms : num 42344 492 6616 645 596 ...
## $ black_owned_firms : num 0.4 NA NA NA NA NA NA NA NA NA ...
## $ native_owned_firms : num 1 NA 0.5 NA NA 2.5 NA NA NA 0.8 ...
## $ asian_owned_firms : num 1.3 NA 0.4 NA NA 1 NA NA 0.7 0.9 ...
## $ pac_isl_owned_firms : num NA NA NA NA NA NA NA NA NA NA ...
## $ hispanic_owned_firms : num 2.1 NA 1.5 NA NA 3.6 2.4 NA NA 1.9 ...
## $ women_owned_firms : num 25.4 18.1 25 NA 26.2 20 23.7 27.2 20.6 21.8 ...
## $ manufacturer_shipments_2007 : num 4942388 0 NA 0 NA ...
## $ mercent_whole_sales_2007 : num 6006918 NA 386587 23383 6085 ...
## $ sales : num 5855102 19965 1099246 58739 93862 ...
## $ sales_per_capita : num 15720 5627 13706 10031 10111 ...
## $ accommodation_food_service : num 795953 NA 117808 NA NA ...
## $ building_permits : num 1285 13 140 27 19 ...
## $ fed_spending : num 3122360 33748 580516 47263 103879 ...
## $ area : num 1053 1363 1112 975 777 ...
## $ density : num 372.8 2.9 74.5 6.1 12 ...
Note that the state factor in the smaller dataframe has all of the original levels. Look at what happens when we try to get a table of the state variable from pnw.
# Tabulate state in the subset; every factor level is listed, including
# those with a count of zero.
table(pnw$state)
##
## Alabama Alaska Arizona
## 0 0 0
## Arkansas California Colorado
## 0 0 0
## Connecticut Delaware District of Columbia
## 0 0 0
## Florida Georgia Hawaii
## 0 0 0
## Idaho Illinois Indiana
## 44 0 0
## Iowa Kansas Kentucky
## 0 0 0
## Louisiana Maine Maryland
## 0 0 0
## Massachusetts Michigan Minnesota
## 0 0 0
## Mississippi Missouri Montana
## 0 0 0
## Nebraska Nevada New Hampshire
## 0 0 0
## New Jersey New Mexico New York
## 0 0 0
## North Carolina North Dakota Ohio
## 0 0 0
## Oklahoma Oregon Pennsylvania
## 0 36 0
## Rhode Island South Carolina South Dakota
## 0 0 0
## Tennessee Texas Utah
## 0 0 0
## Vermont Virginia Washington
## 0 0 39
## West Virginia Wisconsin Wyoming
## 0 0 0
# Convert state from a factor to plain character strings. The unused factor
# levels are discarded by the conversion, so the table now lists only the
# states actually present in pnw.
pnw[["state"]] <- as.character(pnw[["state"]])
table(pnw[["state"]])
##
## Idaho Oregon Washington
## 44 36 39