#CREATING LABELS for race categories:
breaks_race <- c(-Inf, -7, -8 ,1 ,2 ,3 ,4 ,5 ,6 ,97)
data4$R_RACE <-cut(data4$R_RACE, breaks_race, labels = c("refused","Don’t know","white","Black or African","Asian", "American Indian","Native Hawaiian or other Pacific Islander", "Multiple responses selected", "Some other race"))
table(data4$R_RACE)
## 
##                                   refused 
##                                       264 
##                                Don’t know 
##                                      1188 
##                                     white 
##                                    214237 
##                          Black or African 
##                                     19426 
##                                     Asian 
##                                     12064 
##                           American Indian 
##                                      1721 
## Native Hawaiian or other Pacific Islander 
##                                       636 
##               Multiple responses selected 
##                                      8490 
##                           Some other race 
##                                      6208
table(data4$HHFAMINC)
## 
##    -9    -8    -7     1     2     3     4     5     6     7     8     9    10 
##    35  1338  6602 10396 10030 19037 21785 29222 45560 36777 30026 17244 17438 
##    11 
## 18744
breaks_inc <- c(-Inf,5,8,Inf)
unique(data4$HHFAMINC)
##  [1]  7  8 10  3  5 11  9  4  6  1 -7  2 -8 -9
data4$HHFAMINC <-cut(data4$HHFAMINC, c(-Inf,5,8,Inf), labels = c("Low", "Medium", "High"))
table(data4$HHFAMINC)
## 
##    Low Medium   High 
##  98445 112363  53426
table(data2$HHFAMINC)
## 
##    -9    -8    -7     1     2     3     4     5     6     7     8     9    10 
##    42  1067  7000  6124  7401 16130 20097 28647 46141 38069 30778 17678 17580 
##    11 
## 19361
breaks_inc <- c(-Inf,5,8,Inf)
unique(data2$HHFAMINC)
##  [1]  7  8 10  3  5 11  9  4  6  1 -7  2 -8 -9
data2$HHFAMINC <-cut(data2$HHFAMINC, c(-Inf,5,8,Inf), labels = c("Low", "Medium", "High"))
table(data2$HHFAMINC)
## 
##    Low Medium   High 
##  86508 114988  54619
#lapply(data4$HTPPOPDN,as.numeric)
unique(data4$HTPPOPDN)
## [1]  1500   300 17000  7000    50 30000  3000   750    -9
table(data4$HTPPOPDN)
## 
##    -9    50   300   750  1500  3000  7000 17000 30000 
##   229 39256 46171 25204 35116 49362 54101 11313  3482
break_popden <- c(-Inf, -9, 50, 300, 750, 1500, 3000, 7000, 17000, 30000)
data4$HTPPOPDN <-cut(data4$HTPPOPDN, break_popden, labels = c( "-9", "50", "300", "750", "1500", "3000", "7000", "10,000-24,999", "25,000-999,999"))
table(data4$HTPPOPDN)
## 
##             -9             50            300            750           1500 
##            229          39256          46171          25204          35116 
##           3000           7000  10,000-24,999 25,000-999,999 
##          49362          54101          11313           3482
data4$WRK_HOME <-as.integer(as.character(data4$WRK_HOME))
break_wfh <- c(-Inf, -1, -7, -8, -9, 1, 2)
labels_wfh <- c("I don't know", "I prefer not to answer", "Appropriate skip", "Not ascertained", "Yes", "No")
data4$WRK_HOME <-cut(data4$WRK_HOME, break_wfh, labels_wfh)
table(data4$WRK_HOME)
## 
##           I don't know I prefer not to answer       Appropriate skip 
##                   4778                     10                      8 
##        Not ascertained                    Yes                     No 
##                 135943                  16834                 106661
data4$R_AGE <-as.integer(as.character(data4$R_AGE))
break_age <- c(-Inf, -8, -7, 12, 20, 45, 60 )
labels_age <- c("I don't know", "I prefer not to answer", "Children","Teenage", "Young", "Old")
data4$R_AGE <-cut(data4$R_AGE, break_age, labels_age)
table(data4$R_AGE)
## 
##           I don't know I prefer not to answer               Children 
##                     44                    452                  19977 
##                Teenage                  Young                    Old 
##                  18751                  67450                  62686

Q1.1. What is the average household vehicle count? (Household Data set) In order to calculate the average we would use the mean() function.

# lapply(data4$HHVEHCNT,as.numeric)
break_veh_count <- c(-Inf,1,2,3,Inf)
data4$HHVEHCNT <- cut(data4$HHVEHCNT, break_veh_count,
                  labels = c("zero", "one", "two", "more than two"))
table(data2$HHVEHCNT)
## 
##     1     2     3     4     5     6     7     8     9    10    11    12 
## 41534 99872 60801 30248 12705  5658  2471  1184   657   390   187   408
veh_count_mean <- mean(data2$HHVEHCNT)  #using $ sign to get to a specific column of data set
print( veh_count_mean)
## [1] 2.677239
  1. Does average vehicle count change based on population density (cross tab htppopdn and hhvehcnt. Please sort the population density categories accordingly.)
data4 %>% 
  group_by(HHVEHCNT, HTPPOPDN) %>% 
  summarise(count = n()) %>% 
  mutate(p = count/sum(count)*100)
## `summarise()` has grouped output by 'HHVEHCNT'. You can override using the
## `.groups` argument.
## # A tibble: 36 × 4
## # Groups:   HHVEHCNT [4]
##    HHVEHCNT HTPPOPDN       count       p
##    <fct>    <fct>          <int>   <dbl>
##  1 zero     -9                68  0.101 
##  2 zero     50              7105 10.5   
##  3 zero     300             9049 13.4   
##  4 zero     750             5537  8.21  
##  5 zero     1500            8413 12.5   
##  6 zero     3000           13155 19.5   
##  7 zero     7000           16515 24.5   
##  8 zero     10,000-24,999   4890  7.25  
##  9 zero     25,000-999,999  2671  3.96  
## 10 one      -9                95  0.0852
## # … with 26 more rows
  1. Create a variable that groups household vehicle count into four categories: 0 vehicles, 1 vehicle, 2 vehicles, and 3 or more vehicles.
#lapply(data4$BIKESHARE,as.numeric)
data2$HHVEHCNT <-as.integer(as.character(data2$HHVEHCNT))
breaks_veh_count <- c(0,1,2,3, Inf)
data2$HHVEHCNT <- cut(data2$HHVEHCNT, breaks_veh_count,
                  labels = c("zero", "one", "two", "more than two"))
table(data2$HHVEHCNT)
## 
##          zero           one           two more than two 
##         41534         99872         60801         53908
  1. How does the new vehicle count grouping variable vary across different household income groups? Use the new income group’s variable we created in class (low/medium/high).
data2 %>% 
  group_by(HHVEHCNT,HHFAMINC) %>% 
  summarise(count = n()) %>% 
  mutate(p = count/sum(count)*100)
## `summarise()` has grouped output by 'HHVEHCNT'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 4
## # Groups:   HHVEHCNT [4]
##    HHVEHCNT      HHFAMINC count     p
##    <fct>         <fct>    <int> <dbl>
##  1 zero          Low      27031 65.1 
##  2 zero          Medium   12211 29.4 
##  3 zero          High      2292  5.52
##  4 one           Low      32282 32.3 
##  5 one           Medium   46850 46.9 
##  6 one           High     20740 20.8 
##  7 two           Low      15885 26.1 
##  8 two           Medium   29400 48.4 
##  9 two           High     15516 25.5 
## 10 more than two Low      11310 21.0 
## 11 more than two Medium   26527 49.2 
## 12 more than two High     16071 29.8
  1. What is the average vehicle age for vehicles? (Hint: Use the vehicle data file. Be careful about the missing values (-9).
replace(data2$vehage, data2$vehage<0,0)
## numeric(0)
avg_veh_age <- mean(data2$VEHAGE)
print(avg_veh_age)
## [1] 10.35699

’ 7. What is the average distance to work? (Hint: Use the person data file. First sort the data so that you don’t include the records that are coded with missing values (negative numbers or zero).

vmt_miles <- replace(data3$VMT_MILE, data3$VMT_MILE<0,0)
avg_vmt_miles <- mean(vmt_miles)
print(avg_vmt_miles)
## [1] 6.30844
  1. Does average distance to work vary across genders?
data3$VMT_MILE <-as.integer(as.character(data3$VMT_MILE)) 
typeof(data3$VMT_MILE)
## [1] "integer"
break_mile <- c(-Inf, 0,  Inf)
data3$VMT_MILE <- cut(data3$VMT_MILE, break_mile, labels = c("Appropriate skip", "0 - 5441.489"))
table(data3$VMT_MILE)
## 
## Appropriate skip     0 - 5441.489 
##           393527           530045
data3$R_SEX <-as.integer(as.character(data3$R_SEX)) 
typeof(data3$R_SEX)
## [1] "integer"
break_sex <- c(-Inf,-7, -8, 1 , 2)
data3$R_SEX <- cut(data3$R_SEX, break_sex, labels = c("I prefer not to answer", "I don't know", "Male", "Female"))
table(data3$R_SEX)
## 
## I prefer not to answer           I don't know                   Male 
##                     92                    619                 432478 
##                 Female 
##                 490383
table(data3$VMT_MILE)
## 
## Appropriate skip     0 - 5441.489 
##           393527           530045
data3 %>% 
  group_by(VMT_MILE,R_SEX) %>% 
  summarise(count = n()) %>% 
  mutate(p = count/sum(count)*100)
## `summarise()` has grouped output by 'VMT_MILE'. You can override using the
## `.groups` argument.
## # A tibble: 8 × 4
## # Groups:   VMT_MILE [2]
##   VMT_MILE         R_SEX                   count        p
##   <fct>            <fct>                   <int>    <dbl>
## 1 Appropriate skip I prefer not to answer     43  0.0109 
## 2 Appropriate skip I don't know              361  0.0917 
## 3 Appropriate skip Male                   164151 41.7    
## 4 Appropriate skip Female                 228972 58.2    
## 5 0 - 5441.489     I prefer not to answer     49  0.00924
## 6 0 - 5441.489     I don't know              258  0.0487 
## 7 0 - 5441.489     Male                   268327 50.6    
## 8 0 - 5441.489     Female                 261411 49.3
# table_sex_mile <- aggregate(VMT_MILE ~ R_SEX_IMP , data = data3, mean)
# table_sex_mile
data4 %>% 
  # filter(WRK_HOME > 0) %>% 
  group_by(WRK_HOME) %>% 
  summarise(count = n()) %>% 
  mutate(p = count/sum(count)*100)
## # A tibble: 6 × 3
##   WRK_HOME                count        p
##   <fct>                   <int>    <dbl>
## 1 I don't know             4778  1.81   
## 2 I prefer not to answer     10  0.00378
## 3 Appropriate skip            8  0.00303
## 4 Not ascertained        135943 51.4    
## 5 Yes                     16834  6.37   
## 6 No                     106661 40.4

Individual who work from home vs metropolitan status:

data4$MSACAT <-as.integer(as.character(data4$MSACAT)) 
typeof(data4$MSACAT)
## [1] "integer"
break_msa <- c(-Inf, 1, 2, 3 , 4)
data4$MSACAT <- cut(data4$MSACAT, break_msa, labels = c("MSA of 1 million or more, with rail", "MSA of 1 million or more, and not in 1", "MSA less than 1 million", "Not in MSA"))
table(data4$MSACAT)
## 
##    MSA of 1 million or more, with rail MSA of 1 million or more, and not in 1 
##                                  41069                                  75159 
##                MSA less than 1 million                             Not in MSA 
##                                 107604                                  40402
data4 %>% 
  filter(WRK_HOME== "Yes" |WRK_HOME==  "No") %>% 
  group_by(WRK_HOME, MSACAT) %>% 
  summarise(count = n()) %>% 
  mutate(p = count/sum(count)*100)
## `summarise()` has grouped output by 'WRK_HOME'. You can override using the
## `.groups` argument.
## # A tibble: 8 × 4
## # Groups:   WRK_HOME [2]
##   WRK_HOME MSACAT                                 count     p
##   <fct>    <fct>                                  <int> <dbl>
## 1 Yes      MSA of 1 million or more, with rail     3165  18.8
## 2 Yes      MSA of 1 million or more, and not in 1  5425  32.2
## 3 Yes      MSA less than 1 million                 5930  35.2
## 4 Yes      Not in MSA                              2314  13.7
## 5 No       MSA of 1 million or more, with rail    17612  16.5
## 6 No       MSA of 1 million or more, and not in 1 31832  29.8
## 7 No       MSA less than 1 million                43028  40.3
## 8 No       Not in MSA                             14189  13.3

Individual who work from home vs age:

data4 %>% 
  filter(WRK_HOME == "Yes" | WRK_HOME== "No") %>% 
  group_by(WRK_HOME, R_AGE) %>% 
  summarise(count = n()) %>% 
  mutate(p = (count/sum(count))*100)
## `summarise()` has grouped output by 'WRK_HOME'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 4
## # Groups:   WRK_HOME [2]
##    WRK_HOME R_AGE                  count       p
##    <fct>    <fct>                  <int>   <dbl>
##  1 Yes      I don't know               3  0.0178
##  2 Yes      I prefer not to answer    23  0.137 
##  3 Yes      Teenage                  166  0.986 
##  4 Yes      Young                   5270 31.3   
##  5 Yes      Old                     6189 36.8   
##  6 Yes      <NA>                    5183 30.8   
##  7 No       I don't know              12  0.0113
##  8 No       I prefer not to answer   146  0.137 
##  9 No       Teenage                 3895  3.65  
## 10 No       Young                  46741 43.8   
## 11 No       Old                    38325 35.9   
## 12 No       <NA>                   17542 16.4

Individual who work from home vs sex:

data4$R_SEX <-as.integer(as.character(data4$R_SEX)) 
typeof(data4$R_SEX)
## [1] "integer"
break_sex <- c(-7, -8, 1 , 2, Inf)
data4$R_SEX <- cut(data4$R_SEX, break_sex, labels = c("I prefer not to answer", "I don't know", "Male", "Female"))
table(data4$R_SEX)
## 
## I prefer not to answer           I don't know                   Male 
##                    209                 124687                 139304 
##                 Female 
##                      0
data4 %>% 
  filter(WRK_HOME == "Yes" | WRK_HOME== "No", R_SEX=="Male"|  R_SEX== "Female") %>% 
  group_by(WRK_HOME, R_SEX) %>% 
  summarise(count = n()) %>% 
  mutate(p = (count/sum(count))*100)
## `summarise()` has grouped output by 'WRK_HOME'. You can override using the
## `.groups` argument.
## # A tibble: 2 × 4
## # Groups:   WRK_HOME [2]
##   WRK_HOME R_SEX count     p
##   <fct>    <fct> <int> <dbl>
## 1 Yes      Male   8241   100
## 2 No       Male  52248   100
data4$BIKESHARE <-as.integer(as.character(data4$BIKESHARE))
breaks_bikeshare <- c(-Inf, -8, -7, -1, Inf)
data4$BIKESHARE <-cut(data4$BIKESHARE, breaks_bikeshare, labels = c("Idk", "No ans", "App skip", "Yes"))
table(data4$BIKESHARE)
## 
##      Idk   No ans App skip      Yes 
##       48       26   235734    28426
len <-length(data4$BIKESHARE)

data4 %>%
  group_by(BIKESHARE) %>% 
  summarize(count=n()) %>%
  mutate(p = count/len*100)
## # A tibble: 4 × 3
##   BIKESHARE  count        p
##   <fct>      <int>    <dbl>
## 1 Idk           48  0.0182 
## 2 No ans        26  0.00984
## 3 App skip  235734 89.2    
## 4 Yes        28426 10.8
data4 %>% 
  group_by(BIKESHARE, HHFAMINC) %>% 
  summarise(count = n()) %>% 
  mutate(p = count/sum(count)*100)
## `summarise()` has grouped output by 'BIKESHARE'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 4
## # Groups:   BIKESHARE [4]
##    BIKESHARE HHFAMINC count     p
##    <fct>     <fct>    <int> <dbl>
##  1 Idk       Low         27 56.2 
##  2 Idk       Medium      18 37.5 
##  3 Idk       High         3  6.25
##  4 No ans    Low         13 50   
##  5 No ans    Medium      12 46.2 
##  6 No ans    High         1  3.85
##  7 App skip  Low      90064 38.2 
##  8 App skip  Medium   99981 42.4 
##  9 App skip  High     45689 19.4 
## 10 Yes       Low       8341 29.3 
## 11 Yes       Medium   12352 43.5 
## 12 Yes       High      7733 27.2
data3$TRPTRANS <-as.integer(as.character(data3$TRPTRANS))
breaks_mode <- c(-Inf, -7, -8, -9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 97)
data3$TRPTRANS <-cut(data3$TRPTRANS, breaks_mode, labels = c("I prefer not to answer", "I don't know", "Not ascertained", "Walk", "Bicycle", "Car", "SUV" , "Van", "Pickup truck", "Golf cart / Segway", "Motorcycle / Moped", "    RV (motor home, ATV, snowmobile)", "School bus", "  Public or commuter bus", "Paratransit / Dial-a-ride", " Private / Charter / Tour / Shuttle bus", "City-to-city bus (Greyhound, Megabus)", "Amtrak / Commuter rail", "Subway / elevated / light rail / street car", "Taxi / limo (including Uber / Lyft)", "Rental car (Including Zipcar / Car2Go)", "Airplane", "Boat / ferry / water taxi",    "Something Else"))
table(data3$TRPTRANS)
## 
##                      I prefer not to answer 
##                                           1 
##                                I don't know 
##                                          13 
##                             Not ascertained 
##                                           2 
##                                        Walk 
##                                       81288 
##                                     Bicycle 
##                                        8034 
##                                         Car 
##                                      396931 
##                                         SUV 
##                                      229466 
##                                         Van 
##                                       60463 
##                                Pickup truck 
##                                      108303 
##                          Golf cart / Segway 
##                                         826 
##                          Motorcycle / Moped 
##                                        2088 
##          \tRV (motor home, ATV, snowmobile) 
##                                         814 
##                                  School bus 
##                                       11313 
##                    \tPublic or commuter bus 
##                                        6616 
##                   Paratransit / Dial-a-ride 
##                                         624 
##    \tPrivate / Charter / Tour / Shuttle bus 
##                                        1581 
##       City-to-city bus (Greyhound, Megabus) 
##                                         120 
##                      Amtrak / Commuter rail 
##                                        1148 
## Subway / elevated / light rail / street car 
##                                        3326 
##         Taxi / limo (including Uber / Lyft) 
##                                        2813 
##      Rental car (Including Zipcar / Car2Go) 
##                                        2006 
##                                    Airplane 
##                                        1823 
##                   Boat / ferry / water taxi 
##                                         458 
##                              Something Else 
##                                        3515
data3 %>% 
  group_by(VMT_MILE, TRPTRANS) %>% 
  summarise(count = n()) %>% 
  mutate(p = count/sum(count)*100)
## `summarise()` has grouped output by 'VMT_MILE'. You can override using the
## `.groups` argument.
## # A tibble: 31 × 4
## # Groups:   VMT_MILE [2]
##    VMT_MILE         TRPTRANS                count         p
##    <fct>            <fct>                   <int>     <dbl>
##  1 Appropriate skip I prefer not to answer      1  0.000254
##  2 Appropriate skip I don't know               13  0.00330 
##  3 Appropriate skip Not ascertained             2  0.000508
##  4 Appropriate skip Walk                    81288 20.7     
##  5 Appropriate skip Bicycle                  8034  2.04    
##  6 Appropriate skip Car                    129344 32.9     
##  7 Appropriate skip SUV                     80641 20.5     
##  8 Appropriate skip Van                     27411  6.97    
##  9 Appropriate skip Pickup truck            31103  7.90    
## 10 Appropriate skip Golf cart / Segway        826  0.210   
## # … with 21 more rows
data3$MSACAT <-as.integer(as.character(data3$MSACAT)) 
typeof(data3$MSACAT)
## [1] "integer"
break_msa <- c(-Inf, 1, 2, 3 , 4)
data3$MSACAT <- cut(data3$MSACAT, break_msa, labels = c("MSA of 1 million or more, with rail", "MSA of 1 million or more, and not in 1", "MSA less than 1 million", "Not in MSA"))
table(data3$MSACAT)
## 
##    MSA of 1 million or more, with rail MSA of 1 million or more, and not in 1 
##                                 143299                                 264980 
##                MSA less than 1 million                             Not in MSA 
##                                 379588                                 135705
data3$VMT_MILE <-as.integer(as.character(data3$VMT_MILE)) 
## Warning: NAs introduced by coercion
typeof(data3$VMT_MILE)
## [1] "integer"
break_mile <- c(-Inf, 0,  Inf)
data3$VMT_MILE <- cut(data3$VMT_MILE, break_mile, labels = c("Appropriate skip", "0 - 5441.489"))
table(data3$VMT_MILE)
## 
## Appropriate skip     0 - 5441.489 
##                0                0
data3 %>% 
  group_by(VMT_MILE, MSACAT) %>% 
  summarise(count = n()) %>% 
  mutate(p = count/sum(count)*100)
## `summarise()` has grouped output by 'VMT_MILE'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 4
## # Groups:   VMT_MILE [1]
##   VMT_MILE MSACAT                                  count     p
##   <fct>    <fct>                                   <int> <dbl>
## 1 <NA>     MSA of 1 million or more, with rail    143299  15.5
## 2 <NA>     MSA of 1 million or more, and not in 1 264980  28.7
## 3 <NA>     MSA less than 1 million                379588  41.1
## 4 <NA>     Not in MSA                             135705  14.7
data3 %>% 
  group_by(TRPTRANS, MSACAT) %>% 
  summarise(count = n()) %>% 
  mutate(p = count/sum(count)*100)
## `summarise()` has grouped output by 'TRPTRANS'. You can override using the
## `.groups` argument.
## # A tibble: 89 × 4
## # Groups:   TRPTRANS [24]
##    TRPTRANS               MSACAT                                 count     p
##    <fct>                  <fct>                                  <int> <dbl>
##  1 I prefer not to answer MSA of 1 million or more, and not in 1     1 100  
##  2 I don't know           MSA of 1 million or more, and not in 1     2  15.4
##  3 I don't know           MSA less than 1 million                    2  15.4
##  4 I don't know           Not in MSA                                 9  69.2
##  5 Not ascertained        MSA less than 1 million                    2 100  
##  6 Walk                   MSA of 1 million or more, with rail    19884  24.5
##  7 Walk                   MSA of 1 million or more, and not in 1 22182  27.3
##  8 Walk                   MSA less than 1 million                29340  36.1
##  9 Walk                   Not in MSA                              9882  12.2
## 10 Bicycle                MSA of 1 million or more, with rail     1473  18.3
## # … with 79 more rows