#CREATING LABELS for race categories:
breaks_race <- c(-Inf, -7, -8 ,1 ,2 ,3 ,4 ,5 ,6 ,97)
data4$R_RACE <-cut(data4$R_RACE, breaks_race, labels = c("refused","Don’t know","white","Black or African","Asian", "American Indian","Native Hawaiian or other Pacific Islander", "Multiple responses selected", "Some other race"))
table(data4$R_RACE)
##
## refused
## 264
## Don’t know
## 1188
## white
## 214237
## Black or African
## 19426
## Asian
## 12064
## American Indian
## 1721
## Native Hawaiian or other Pacific Islander
## 636
## Multiple responses selected
## 8490
## Some other race
## 6208
table(data4$HHFAMINC)
##
## -9 -8 -7 1 2 3 4 5 6 7 8 9 10
## 35 1338 6602 10396 10030 19037 21785 29222 45560 36777 30026 17244 17438
## 11
## 18744
breaks_inc <- c(-Inf,5,8,Inf)
unique(data4$HHFAMINC)
## [1] 7 8 10 3 5 11 9 4 6 1 -7 2 -8 -9
data4$HHFAMINC <-cut(data4$HHFAMINC, c(-Inf,5,8,Inf), labels = c("Low", "Medium", "High"))
table(data4$HHFAMINC)
##
## Low Medium High
## 98445 112363 53426
table(data2$HHFAMINC)
##
## -9 -8 -7 1 2 3 4 5 6 7 8 9 10
## 42 1067 7000 6124 7401 16130 20097 28647 46141 38069 30778 17678 17580
## 11
## 19361
breaks_inc <- c(-Inf,5,8,Inf)
unique(data2$HHFAMINC)
## [1] 7 8 10 3 5 11 9 4 6 1 -7 2 -8 -9
data2$HHFAMINC <-cut(data2$HHFAMINC, c(-Inf,5,8,Inf), labels = c("Low", "Medium", "High"))
table(data2$HHFAMINC)
##
## Low Medium High
## 86508 114988 54619
#lapply(data4$HTPPOPDN,as.numeric)
unique(data4$HTPPOPDN)
## [1] 1500 300 17000 7000 50 30000 3000 750 -9
table(data4$HTPPOPDN)
##
## -9 50 300 750 1500 3000 7000 17000 30000
## 229 39256 46171 25204 35116 49362 54101 11313 3482
break_popden <- c(-Inf, -9, 50, 300, 750, 1500, 3000, 7000, 17000, 30000)
data4$HTPPOPDN <-cut(data4$HTPPOPDN, break_popden, labels = c( "-9", "50", "300", "750", "1500", "3000", "7000", "10,000-24,999", "25,000-999,999"))
table(data4$HTPPOPDN)
##
## -9 50 300 750 1500
## 229 39256 46171 25204 35116
## 3000 7000 10,000-24,999 25,000-999,999
## 49362 54101 11313 3482
data4$WRK_HOME <-as.integer(as.character(data4$WRK_HOME))
break_wfh <- c(-Inf, -1, -7, -8, -9, 1, 2)
labels_wfh <- c("I don't know", "I prefer not to answer", "Appropriate skip", "Not ascertained", "Yes", "No")
data4$WRK_HOME <-cut(data4$WRK_HOME, break_wfh, labels_wfh)
table(data4$WRK_HOME)
##
## I don't know I prefer not to answer Appropriate skip
## 4778 10 8
## Not ascertained Yes No
## 135943 16834 106661
data4$R_AGE <-as.integer(as.character(data4$R_AGE))
break_age <- c(-Inf, -8, -7, 12, 20, 45, 60 )
labels_age <- c("I don't know", "I prefer not to answer", "Children","Teenage", "Young", "Old")
data4$R_AGE <-cut(data4$R_AGE, break_age, labels_age)
table(data4$R_AGE)
##
## I don't know I prefer not to answer Children
## 44 452 19977
## Teenage Young Old
## 18751 67450 62686
Q1.1. What is the average household vehicle count? (Household Data set) In order to calculate the average we would use the mean() function.
# lapply(data4$HHVEHCNT,as.numeric)
break_veh_count <- c(-Inf,1,2,3,Inf)
data4$HHVEHCNT <- cut(data4$HHVEHCNT, break_veh_count,
labels = c("zero", "one", "two", "more than two"))
table(data2$HHVEHCNT)
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 41534 99872 60801 30248 12705 5658 2471 1184 657 390 187 408
veh_count_mean <- mean(data2$HHVEHCNT) #using $ sign to get to a specific column of data set
print( veh_count_mean)
## [1] 2.677239
data4 %>%
group_by(HHVEHCNT, HTPPOPDN) %>%
summarise(count = n()) %>%
mutate(p = count/sum(count)*100)
## `summarise()` has grouped output by 'HHVEHCNT'. You can override using the
## `.groups` argument.
## # A tibble: 36 × 4
## # Groups: HHVEHCNT [4]
## HHVEHCNT HTPPOPDN count p
## <fct> <fct> <int> <dbl>
## 1 zero -9 68 0.101
## 2 zero 50 7105 10.5
## 3 zero 300 9049 13.4
## 4 zero 750 5537 8.21
## 5 zero 1500 8413 12.5
## 6 zero 3000 13155 19.5
## 7 zero 7000 16515 24.5
## 8 zero 10,000-24,999 4890 7.25
## 9 zero 25,000-999,999 2671 3.96
## 10 one -9 95 0.0852
## # … with 26 more rows
#lapply(data4$BIKESHARE,as.numeric)
data2$HHVEHCNT <-as.integer(as.character(data2$HHVEHCNT))
breaks_veh_count <- c(0,1,2,3, Inf)
data2$HHVEHCNT <- cut(data2$HHVEHCNT, breaks_veh_count,
labels = c("zero", "one", "two", "more than two"))
table(data2$HHVEHCNT)
##
## zero one two more than two
## 41534 99872 60801 53908
data2 %>%
group_by(HHVEHCNT,HHFAMINC) %>%
summarise(count = n()) %>%
mutate(p = count/sum(count)*100)
## `summarise()` has grouped output by 'HHVEHCNT'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 4
## # Groups: HHVEHCNT [4]
## HHVEHCNT HHFAMINC count p
## <fct> <fct> <int> <dbl>
## 1 zero Low 27031 65.1
## 2 zero Medium 12211 29.4
## 3 zero High 2292 5.52
## 4 one Low 32282 32.3
## 5 one Medium 46850 46.9
## 6 one High 20740 20.8
## 7 two Low 15885 26.1
## 8 two Medium 29400 48.4
## 9 two High 15516 25.5
## 10 more than two Low 11310 21.0
## 11 more than two Medium 26527 49.2
## 12 more than two High 16071 29.8
replace(data2$vehage, data2$vehage<0,0)
## numeric(0)
avg_veh_age <- mean(data2$VEHAGE)
print(avg_veh_age)
## [1] 10.35699
’ 7. What is the average distance to work? (Hint: Use the person data file. First sort the data so that you don’t include the records that are coded with missing values (negative numbers or zero).
vmt_miles <- replace(data3$VMT_MILE, data3$VMT_MILE<0,0)
avg_vmt_miles <- mean(vmt_miles)
print(avg_vmt_miles)
## [1] 6.30844
data3$VMT_MILE <-as.integer(as.character(data3$VMT_MILE))
typeof(data3$VMT_MILE)
## [1] "integer"
break_mile <- c(-Inf, 0, Inf)
data3$VMT_MILE <- cut(data3$VMT_MILE, break_mile, labels = c("Appropriate skip", "0 - 5441.489"))
table(data3$VMT_MILE)
##
## Appropriate skip 0 - 5441.489
## 393527 530045
data3$R_SEX <-as.integer(as.character(data3$R_SEX))
typeof(data3$R_SEX)
## [1] "integer"
break_sex <- c(-Inf,-7, -8, 1 , 2)
data3$R_SEX <- cut(data3$R_SEX, break_sex, labels = c("I prefer not to answer", "I don't know", "Male", "Female"))
table(data3$R_SEX)
##
## I prefer not to answer I don't know Male
## 92 619 432478
## Female
## 490383
table(data3$VMT_MILE)
##
## Appropriate skip 0 - 5441.489
## 393527 530045
data3 %>%
group_by(VMT_MILE,R_SEX) %>%
summarise(count = n()) %>%
mutate(p = count/sum(count)*100)
## `summarise()` has grouped output by 'VMT_MILE'. You can override using the
## `.groups` argument.
## # A tibble: 8 × 4
## # Groups: VMT_MILE [2]
## VMT_MILE R_SEX count p
## <fct> <fct> <int> <dbl>
## 1 Appropriate skip I prefer not to answer 43 0.0109
## 2 Appropriate skip I don't know 361 0.0917
## 3 Appropriate skip Male 164151 41.7
## 4 Appropriate skip Female 228972 58.2
## 5 0 - 5441.489 I prefer not to answer 49 0.00924
## 6 0 - 5441.489 I don't know 258 0.0487
## 7 0 - 5441.489 Male 268327 50.6
## 8 0 - 5441.489 Female 261411 49.3
# table_sex_mile <- aggregate(VMT_MILE ~ R_SEX_IMP , data = data3, mean)
# table_sex_mile
data4 %>%
# filter(WRK_HOME > 0) %>%
group_by(WRK_HOME) %>%
summarise(count = n()) %>%
mutate(p = count/sum(count)*100)
## # A tibble: 6 × 3
## WRK_HOME count p
## <fct> <int> <dbl>
## 1 I don't know 4778 1.81
## 2 I prefer not to answer 10 0.00378
## 3 Appropriate skip 8 0.00303
## 4 Not ascertained 135943 51.4
## 5 Yes 16834 6.37
## 6 No 106661 40.4
Individual who work from home vs metropolitan status:
data4$MSACAT <-as.integer(as.character(data4$MSACAT))
typeof(data4$MSACAT)
## [1] "integer"
break_msa <- c(-Inf, 1, 2, 3 , 4)
data4$MSACAT <- cut(data4$MSACAT, break_msa, labels = c("MSA of 1 million or more, with rail", "MSA of 1 million or more, and not in 1", "MSA less than 1 million", "Not in MSA"))
table(data4$MSACAT)
##
## MSA of 1 million or more, with rail MSA of 1 million or more, and not in 1
## 41069 75159
## MSA less than 1 million Not in MSA
## 107604 40402
data4 %>%
filter(WRK_HOME== "Yes" |WRK_HOME== "No") %>%
group_by(WRK_HOME, MSACAT) %>%
summarise(count = n()) %>%
mutate(p = count/sum(count)*100)
## `summarise()` has grouped output by 'WRK_HOME'. You can override using the
## `.groups` argument.
## # A tibble: 8 × 4
## # Groups: WRK_HOME [2]
## WRK_HOME MSACAT count p
## <fct> <fct> <int> <dbl>
## 1 Yes MSA of 1 million or more, with rail 3165 18.8
## 2 Yes MSA of 1 million or more, and not in 1 5425 32.2
## 3 Yes MSA less than 1 million 5930 35.2
## 4 Yes Not in MSA 2314 13.7
## 5 No MSA of 1 million or more, with rail 17612 16.5
## 6 No MSA of 1 million or more, and not in 1 31832 29.8
## 7 No MSA less than 1 million 43028 40.3
## 8 No Not in MSA 14189 13.3
Individual who work from home vs age:
data4 %>%
filter(WRK_HOME == "Yes" | WRK_HOME== "No") %>%
group_by(WRK_HOME, R_AGE) %>%
summarise(count = n()) %>%
mutate(p = (count/sum(count))*100)
## `summarise()` has grouped output by 'WRK_HOME'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 4
## # Groups: WRK_HOME [2]
## WRK_HOME R_AGE count p
## <fct> <fct> <int> <dbl>
## 1 Yes I don't know 3 0.0178
## 2 Yes I prefer not to answer 23 0.137
## 3 Yes Teenage 166 0.986
## 4 Yes Young 5270 31.3
## 5 Yes Old 6189 36.8
## 6 Yes <NA> 5183 30.8
## 7 No I don't know 12 0.0113
## 8 No I prefer not to answer 146 0.137
## 9 No Teenage 3895 3.65
## 10 No Young 46741 43.8
## 11 No Old 38325 35.9
## 12 No <NA> 17542 16.4
Individual who work from home vs sex:
data4$R_SEX <-as.integer(as.character(data4$R_SEX))
typeof(data4$R_SEX)
## [1] "integer"
break_sex <- c(-7, -8, 1 , 2, Inf)
data4$R_SEX <- cut(data4$R_SEX, break_sex, labels = c("I prefer not to answer", "I don't know", "Male", "Female"))
table(data4$R_SEX)
##
## I prefer not to answer I don't know Male
## 209 124687 139304
## Female
## 0
data4 %>%
filter(WRK_HOME == "Yes" | WRK_HOME== "No", R_SEX=="Male"| R_SEX== "Female") %>%
group_by(WRK_HOME, R_SEX) %>%
summarise(count = n()) %>%
mutate(p = (count/sum(count))*100)
## `summarise()` has grouped output by 'WRK_HOME'. You can override using the
## `.groups` argument.
## # A tibble: 2 × 4
## # Groups: WRK_HOME [2]
## WRK_HOME R_SEX count p
## <fct> <fct> <int> <dbl>
## 1 Yes Male 8241 100
## 2 No Male 52248 100
data4$BIKESHARE <-as.integer(as.character(data4$BIKESHARE))
breaks_bikeshare <- c(-Inf, -8, -7, -1, Inf)
data4$BIKESHARE <-cut(data4$BIKESHARE, breaks_bikeshare, labels = c("Idk", "No ans", "App skip", "Yes"))
table(data4$BIKESHARE)
##
## Idk No ans App skip Yes
## 48 26 235734 28426
len <-length(data4$BIKESHARE)
data4 %>%
group_by(BIKESHARE) %>%
summarize(count=n()) %>%
mutate(p = count/len*100)
## # A tibble: 4 × 3
## BIKESHARE count p
## <fct> <int> <dbl>
## 1 Idk 48 0.0182
## 2 No ans 26 0.00984
## 3 App skip 235734 89.2
## 4 Yes 28426 10.8
data4 %>%
group_by(BIKESHARE, HHFAMINC) %>%
summarise(count = n()) %>%
mutate(p = count/sum(count)*100)
## `summarise()` has grouped output by 'BIKESHARE'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 4
## # Groups: BIKESHARE [4]
## BIKESHARE HHFAMINC count p
## <fct> <fct> <int> <dbl>
## 1 Idk Low 27 56.2
## 2 Idk Medium 18 37.5
## 3 Idk High 3 6.25
## 4 No ans Low 13 50
## 5 No ans Medium 12 46.2
## 6 No ans High 1 3.85
## 7 App skip Low 90064 38.2
## 8 App skip Medium 99981 42.4
## 9 App skip High 45689 19.4
## 10 Yes Low 8341 29.3
## 11 Yes Medium 12352 43.5
## 12 Yes High 7733 27.2
data3$TRPTRANS <-as.integer(as.character(data3$TRPTRANS))
breaks_mode <- c(-Inf, -7, -8, -9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 97)
data3$TRPTRANS <-cut(data3$TRPTRANS, breaks_mode, labels = c("I prefer not to answer", "I don't know", "Not ascertained", "Walk", "Bicycle", "Car", "SUV" , "Van", "Pickup truck", "Golf cart / Segway", "Motorcycle / Moped", " RV (motor home, ATV, snowmobile)", "School bus", " Public or commuter bus", "Paratransit / Dial-a-ride", " Private / Charter / Tour / Shuttle bus", "City-to-city bus (Greyhound, Megabus)", "Amtrak / Commuter rail", "Subway / elevated / light rail / street car", "Taxi / limo (including Uber / Lyft)", "Rental car (Including Zipcar / Car2Go)", "Airplane", "Boat / ferry / water taxi", "Something Else"))
table(data3$TRPTRANS)
##
## I prefer not to answer
## 1
## I don't know
## 13
## Not ascertained
## 2
## Walk
## 81288
## Bicycle
## 8034
## Car
## 396931
## SUV
## 229466
## Van
## 60463
## Pickup truck
## 108303
## Golf cart / Segway
## 826
## Motorcycle / Moped
## 2088
## \tRV (motor home, ATV, snowmobile)
## 814
## School bus
## 11313
## \tPublic or commuter bus
## 6616
## Paratransit / Dial-a-ride
## 624
## \tPrivate / Charter / Tour / Shuttle bus
## 1581
## City-to-city bus (Greyhound, Megabus)
## 120
## Amtrak / Commuter rail
## 1148
## Subway / elevated / light rail / street car
## 3326
## Taxi / limo (including Uber / Lyft)
## 2813
## Rental car (Including Zipcar / Car2Go)
## 2006
## Airplane
## 1823
## Boat / ferry / water taxi
## 458
## Something Else
## 3515
data3 %>%
group_by(VMT_MILE, TRPTRANS) %>%
summarise(count = n()) %>%
mutate(p = count/sum(count)*100)
## `summarise()` has grouped output by 'VMT_MILE'. You can override using the
## `.groups` argument.
## # A tibble: 31 × 4
## # Groups: VMT_MILE [2]
## VMT_MILE TRPTRANS count p
## <fct> <fct> <int> <dbl>
## 1 Appropriate skip I prefer not to answer 1 0.000254
## 2 Appropriate skip I don't know 13 0.00330
## 3 Appropriate skip Not ascertained 2 0.000508
## 4 Appropriate skip Walk 81288 20.7
## 5 Appropriate skip Bicycle 8034 2.04
## 6 Appropriate skip Car 129344 32.9
## 7 Appropriate skip SUV 80641 20.5
## 8 Appropriate skip Van 27411 6.97
## 9 Appropriate skip Pickup truck 31103 7.90
## 10 Appropriate skip Golf cart / Segway 826 0.210
## # … with 21 more rows
data3$MSACAT <-as.integer(as.character(data3$MSACAT))
typeof(data3$MSACAT)
## [1] "integer"
break_msa <- c(-Inf, 1, 2, 3 , 4)
data3$MSACAT <- cut(data3$MSACAT, break_msa, labels = c("MSA of 1 million or more, with rail", "MSA of 1 million or more, and not in 1", "MSA less than 1 million", "Not in MSA"))
table(data3$MSACAT)
##
## MSA of 1 million or more, with rail MSA of 1 million or more, and not in 1
## 143299 264980
## MSA less than 1 million Not in MSA
## 379588 135705
data3$VMT_MILE <-as.integer(as.character(data3$VMT_MILE))
## Warning: NAs introduced by coercion
typeof(data3$VMT_MILE)
## [1] "integer"
break_mile <- c(-Inf, 0, Inf)
data3$VMT_MILE <- cut(data3$VMT_MILE, break_mile, labels = c("Appropriate skip", "0 - 5441.489"))
table(data3$VMT_MILE)
##
## Appropriate skip 0 - 5441.489
## 0 0
data3 %>%
group_by(VMT_MILE, MSACAT) %>%
summarise(count = n()) %>%
mutate(p = count/sum(count)*100)
## `summarise()` has grouped output by 'VMT_MILE'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 4
## # Groups: VMT_MILE [1]
## VMT_MILE MSACAT count p
## <fct> <fct> <int> <dbl>
## 1 <NA> MSA of 1 million or more, with rail 143299 15.5
## 2 <NA> MSA of 1 million or more, and not in 1 264980 28.7
## 3 <NA> MSA less than 1 million 379588 41.1
## 4 <NA> Not in MSA 135705 14.7
data3 %>%
group_by(TRPTRANS, MSACAT) %>%
summarise(count = n()) %>%
mutate(p = count/sum(count)*100)
## `summarise()` has grouped output by 'TRPTRANS'. You can override using the
## `.groups` argument.
## # A tibble: 89 × 4
## # Groups: TRPTRANS [24]
## TRPTRANS MSACAT count p
## <fct> <fct> <int> <dbl>
## 1 I prefer not to answer MSA of 1 million or more, and not in 1 1 100
## 2 I don't know MSA of 1 million or more, and not in 1 2 15.4
## 3 I don't know MSA less than 1 million 2 15.4
## 4 I don't know Not in MSA 9 69.2
## 5 Not ascertained MSA less than 1 million 2 100
## 6 Walk MSA of 1 million or more, with rail 19884 24.5
## 7 Walk MSA of 1 million or more, and not in 1 22182 27.3
## 8 Walk MSA less than 1 million 29340 36.1
## 9 Walk Not in MSA 9882 12.2
## 10 Bicycle MSA of 1 million or more, with rail 1473 18.3
## # … with 79 more rows