HW3_wrng

install package

options(warn = -1)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

setwd("C:/Users/nawal/OneDrive/Desktop/Aditya Course/Data Wrangling/Week 4/Week 4/homework3")

##Load data using tidyverse

df<- read_csv("acs_2015_county_data_revised.csv")

## Rows: 3142 Columns: 35

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (2): state, county
## dbl (33): census_id, total_pop, men, women, hispanic, white, black, native, ...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

head(df)

## # A tibble: 6 x 35
##   census_id state county total_pop   men women hispanic white black native asian
##       <dbl> <chr> <chr>      <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>  <dbl> <dbl>
## 1      1001 Alab~ Autau~     55221 26745 28476      2.6  75.8  18.5    0.4   1  
## 2      1003 Alab~ Baldw~    195121 95314 99807      4.5  83.1   9.5    0.6   0.7
## 3      1005 Alab~ Barbo~     26932 14497 12435      4.6  46.2  46.7    0.2   0.4
## 4      1007 Alab~ Bibb       22604 12073 10531      2.2  74.5  21.4    0.4   0.1
## 5      1009 Alab~ Blount     57710 28512 29198      8.6  87.9   1.5    0.3   0.1
## 6      1011 Alab~ Bullo~     10678  5660  5018      4.4  22.2  70.7    1.2   0.2
## # ... with 24 more variables: pacific <dbl>, citizen <dbl>, income <dbl>,
## #   income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## #   professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## #   production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## #   other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## #   private_work <dbl>, public_work <dbl>, self_employed <dbl>,
## #   family_work <dbl>, unemployment <dbl>

Check the structure of the data

str(df)

## spec_tbl_df [3,142 x 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ census_id     : num [1:3142] 1001 1003 1005 1007 1009 ...
##  $ state         : chr [1:3142] "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ county        : chr [1:3142] "Autauga" "Baldwin" "Barbour" "Bibb" ...
##  $ total_pop     : num [1:3142] 55221 195121 26932 22604 57710 ...
##  $ men           : num [1:3142] 26745 95314 14497 12073 28512 ...
##  $ women         : num [1:3142] 28476 99807 12435 10531 29198 ...
##  $ hispanic      : num [1:3142] 2.6 4.5 4.6 2.2 8.6 4.4 1.2 3.5 0.4 1.5 ...
##  $ white         : num [1:3142] 75.8 83.1 46.2 74.5 87.9 22.2 53.3 73 57.3 91.7 ...
##  $ black         : num [1:3142] 18.5 9.5 46.7 21.4 1.5 70.7 43.8 20.3 40.3 4.8 ...
##  $ native        : num [1:3142] 0.4 0.6 0.2 0.4 0.3 1.2 0.1 0.2 0.2 0.6 ...
##  $ asian         : num [1:3142] 1 0.7 0.4 0.1 0.1 0.2 0.4 0.9 0.8 0.3 ...
##  $ pacific       : num [1:3142] 0 0 0 0 0 0 0 0 0 0 ...
##  $ citizen       : num [1:3142] 40725 147695 20714 17495 42345 ...
##  $ income        : num [1:3142] 51281 50254 32964 38678 45813 ...
##  $ income_per_cap: num [1:3142] 24974 27317 16824 18431 20532 ...
##  $ poverty       : num [1:3142] 12.9 13.4 26.7 16.8 16.7 24.6 25.4 20.5 21.6 19.2 ...
##  $ child_poverty : num [1:3142] 18.6 19.2 45.3 27.9 27.2 38.4 39.2 31.6 37.2 30.1 ...
##  $ professional  : num [1:3142] 33.2 33.1 26.8 21.5 28.5 18.8 27.5 27.3 23.3 29.3 ...
##  $ service       : num [1:3142] 17 17.7 16.1 17.9 14.1 15 16.6 17.7 14.5 16 ...
##  $ office        : num [1:3142] 24.2 27.1 23.1 17.8 23.9 19.7 21.9 24.2 26.3 19.5 ...
##  $ construction  : num [1:3142] 8.6 10.8 10.8 19 13.5 20.1 10.3 10.5 11.5 13.7 ...
##  $ production    : num [1:3142] 17.1 11.2 23.1 23.7 19.9 26.4 23.7 20.4 24.4 21.5 ...
##  $ drive         : num [1:3142] 87.5 84.7 83.8 83.2 84.9 74.9 84.5 85.3 85.1 83.9 ...
##  $ carpool       : num [1:3142] 8.8 8.8 10.9 13.5 11.2 14.9 12.4 9.4 11.9 12.1 ...
##  $ transit       : num [1:3142] 0.1 0.1 0.4 0.5 0.4 0.7 0 0.2 0.2 0.2 ...
##  $ walk          : num [1:3142] 0.5 1 1.8 0.6 0.9 5 0.8 1.2 0.3 0.6 ...
##  $ other_transp  : num [1:3142] 1.3 1.4 1.5 1.5 0.4 1.7 0.6 1.2 0.4 0.7 ...
##  $ work_at_home  : num [1:3142] 1.8 3.9 1.6 0.7 2.3 2.8 1.7 2.7 2.1 2.5 ...
##  $ mean_commute  : num [1:3142] 26.5 26.4 24.1 28.8 34.9 27.5 24.6 24.1 25.1 27.4 ...
##  $ employed      : num [1:3142] 23986 85953 8597 8294 22189 ...
##  $ private_work  : num [1:3142] 73.6 81.5 71.8 76.8 82 79.5 77.4 74.1 85.1 73.1 ...
##  $ public_work   : num [1:3142] 20.9 12.3 20.8 16.1 13.5 15.1 16.2 20.8 12.1 18.5 ...
##  $ self_employed : num [1:3142] 5.5 5.8 7.3 6.7 4.2 5.4 6.2 5 2.8 7.9 ...
##  $ family_work   : num [1:3142] 0 0.4 0.1 0.4 0.4 0 0.2 0.1 0 0.5 ...
##  $ unemployment  : num [1:3142] 7.6 7.5 17.6 8.3 7.7 18 10.9 12.3 8.9 7.9 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   census_id = col_double(),
##   ..   state = col_character(),
##   ..   county = col_character(),
##   ..   total_pop = col_double(),
##   ..   men = col_double(),
##   ..   women = col_double(),
##   ..   hispanic = col_double(),
##   ..   white = col_double(),
##   ..   black = col_double(),
##   ..   native = col_double(),
##   ..   asian = col_double(),
##   ..   pacific = col_double(),
##   ..   citizen = col_double(),
##   ..   income = col_double(),
##   ..   income_per_cap = col_double(),
##   ..   poverty = col_double(),
##   ..   child_poverty = col_double(),
##   ..   professional = col_double(),
##   ..   service = col_double(),
##   ..   office = col_double(),
##   ..   construction = col_double(),
##   ..   production = col_double(),
##   ..   drive = col_double(),
##   ..   carpool = col_double(),
##   ..   transit = col_double(),
##   ..   walk = col_double(),
##   ..   other_transp = col_double(),
##   ..   work_at_home = col_double(),
##   ..   mean_commute = col_double(),
##   ..   employed = col_double(),
##   ..   private_work = col_double(),
##   ..   public_work = col_double(),
##   ..   self_employed = col_double(),
##   ..   family_work = col_double(),
##   ..   unemployment = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

No data don’t require any structure change as the columns are in the same format as desired

glimpse(df)

## Rows: 3,142
## Columns: 35
## $ census_id      <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1~
## $ state          <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", ~
## $ county         <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul~
## $ total_pop      <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664~
## $ men            <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1~
## $ women          <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, ~
## $ hispanic       <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, ~
## $ white          <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9~
## $ black          <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8~
## $ native         <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, ~
## $ asian          <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, ~
## $ pacific        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ~
## $ citizen        <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,~
## $ income         <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,~
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,~
## $ poverty        <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1~
## $ child_poverty  <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3~
## $ professional   <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2~
## $ service        <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1~
## $ office         <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1~
## $ construction   <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13~
## $ production     <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2~
## $ drive          <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8~
## $ carpool        <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1~
## $ transit        <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, ~
## $ walk           <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, ~
## $ other_transp   <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, ~
## $ work_at_home   <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, ~
## $ mean_commute   <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2~
## $ employed       <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136~
## $ private_work   <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7~
## $ public_work    <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1~
## $ self_employed  <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, ~
## $ family_work    <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, ~
## $ unemployment   <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9~

To check the missing values present in the data

colSums(is.na(df))

##      census_id          state         county      total_pop            men 
##              0              0              0              0              0 
##          women       hispanic          white          black         native 
##              0              0              0              0              0 
##          asian        pacific        citizen         income income_per_cap 
##              0              0              0              1              0 
##        poverty  child_poverty   professional        service         office 
##              0              1              0              0              0 
##   construction     production          drive        carpool        transit 
##              0              0              0              0              0 
##           walk   other_transp   work_at_home   mean_commute       employed 
##              0              0              0              0              0 
##   private_work    public_work  self_employed    family_work   unemployment 
##              0              0              0              0              0

There is only 2 missing value one in income column & other in child_poverty , we will omit this records since using any sort of imputation would not be able to explain the correct value of this 2 observations

df<- na.omit(df)

To summarize data we will use summary function

summary(df)

##    census_id        state              county            total_pop       
##  Min.   : 1001   Length:3140        Length:3140        Min.   :     267  
##  1st Qu.:18179   Class :character   Class :character   1st Qu.:   11036  
##  Median :29176   Mode  :character   Mode  :character   Median :   25793  
##  Mean   :30383                                         Mean   :  100801  
##  3rd Qu.:45080                                         3rd Qu.:   67620  
##  Max.   :56045                                         Max.   :10038388  
##       men              women            hispanic          white      
##  Min.   :    136   Min.   :    131   Min.   : 0.000   Min.   : 0.90  
##  1st Qu.:   5551   1st Qu.:   5488   1st Qu.: 1.900   1st Qu.:65.67  
##  Median :  12838   Median :  12916   Median : 3.700   Median :84.65  
##  Mean   :  49597   Mean   :  51204   Mean   : 8.819   Mean   :77.31  
##  3rd Qu.:  33328   3rd Qu.:  34123   3rd Qu.: 9.000   3rd Qu.:93.33  
##  Max.   :4945351   Max.   :5093037   Max.   :98.700   Max.   :99.80  
##      black            native           asian           pacific        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.00000  
##  1st Qu.: 0.600   1st Qu.: 0.100   1st Qu.: 0.200   1st Qu.: 0.00000  
##  Median : 2.100   Median : 0.300   Median : 0.500   Median : 0.00000  
##  Mean   : 8.885   Mean   : 1.763   Mean   : 1.253   Mean   : 0.07357  
##  3rd Qu.:10.200   3rd Qu.: 0.600   3rd Qu.: 1.200   3rd Qu.: 0.00000  
##  Max.   :85.900   Max.   :92.100   Max.   :41.600   Max.   :11.10000  
##     citizen            income       income_per_cap     poverty    
##  Min.   :    199   Min.   : 19328   Min.   : 8292   Min.   : 1.4  
##  1st Qu.:   8276   1st Qu.: 38826   1st Qu.:20470   1st Qu.:12.0  
##  Median :  19454   Median : 45095   Median :23575   Median :16.0  
##  Mean   :  70849   Mean   : 46824   Mean   :24331   Mean   :16.7  
##  3rd Qu.:  50795   3rd Qu.: 52248   3rd Qu.:27138   3rd Qu.:20.3  
##  Max.   :6046749   Max.   :123453   Max.   :65600   Max.   :53.3  
##  child_poverty    professional      service          office     
##  Min.   : 0.00   Min.   :13.50   Min.   : 5.00   Min.   : 4.10  
##  1st Qu.:16.10   1st Qu.:26.70   1st Qu.:15.90   1st Qu.:20.20  
##  Median :22.50   Median :30.00   Median :18.00   Median :22.40  
##  Mean   :23.29   Mean   :31.05   Mean   :18.25   Mean   :22.13  
##  3rd Qu.:29.50   3rd Qu.:34.42   3rd Qu.:20.20   3rd Qu.:24.30  
##  Max.   :72.30   Max.   :74.00   Max.   :36.60   Max.   :35.40  
##   construction     production        drive         carpool     
##  Min.   : 1.70   Min.   : 0.00   Min.   : 5.2   Min.   : 0.00  
##  1st Qu.: 9.80   1st Qu.:11.50   1st Qu.:76.6   1st Qu.: 8.50  
##  Median :12.20   Median :15.40   Median :80.6   Median : 9.90  
##  Mean   :12.75   Mean   :15.82   Mean   :79.1   Mean   :10.33  
##  3rd Qu.:15.00   3rd Qu.:19.40   3rd Qu.:83.6   3rd Qu.:11.90  
##  Max.   :40.30   Max.   :55.60   Max.   :94.6   Max.   :29.90  
##     transit             walk         other_transp    work_at_home   
##  Min.   : 0.0000   Min.   : 0.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 0.1000   1st Qu.: 1.400   1st Qu.: 0.90   1st Qu.: 2.800  
##  Median : 0.4000   Median : 2.400   Median : 1.30   Median : 4.000  
##  Mean   : 0.9681   Mean   : 3.294   Mean   : 1.61   Mean   : 4.694  
##  3rd Qu.: 0.8000   3rd Qu.: 4.000   3rd Qu.: 1.90   3rd Qu.: 5.700  
##  Max.   :61.7000   Max.   :71.200   Max.   :39.10   Max.   :37.200  
##   mean_commute      employed        private_work    public_work   
##  Min.   : 4.90   Min.   :    166   Min.   :29.50   Min.   : 5.80  
##  1st Qu.:19.30   1st Qu.:   4532   1st Qu.:70.90   1st Qu.:13.07  
##  Median :22.90   Median :  10657   Median :75.85   Median :16.10  
##  Mean   :23.15   Mean   :  46416   Mean   :74.45   Mean   :17.33  
##  3rd Qu.:26.60   3rd Qu.:  29272   3rd Qu.:79.80   3rd Qu.:20.10  
##  Max.   :44.00   Max.   :4635465   Max.   :88.30   Max.   :66.20  
##  self_employed     family_work      unemployment   
##  Min.   : 0.000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.: 5.400   1st Qu.:0.1000   1st Qu.: 5.500  
##  Median : 6.900   Median :0.2000   Median : 7.500  
##  Mean   : 7.922   Mean   :0.2917   Mean   : 7.815  
##  3rd Qu.: 9.400   3rd Qu.:0.3000   3rd Qu.: 9.700  
##  Max.   :36.600   Max.   :9.8000   Max.   :29.400

In the variable transit & walk have very skewed mean and median we will further histogram to understand the reason behind it.

hist(df$transit)

hist(df$walk)

After plotting the histogram and looking at the records for highest walk and transit the number seems to be fine

df$flag<- ifelse(df$women> df$men,1,0)

county_women_gt_men <- df%>% filter(flag ==1) %>% summarise(count = length(unique(county)))

print(county_women_gt_men)

## # A tibble: 1 x 1
##   count
##   <int>
## 1  1248

1248

county_unemp<- df%>% filter(unemployment < 10) %>% summarise(count = length(unique(county)))

print(county_unemp)

## # A tibble: 1 x 1
##   count
##   <int>
## 1  1482

1482

highest_mean_commute <- df%>% select(census_id,county,state,mean_commute)%>% arrange(desc(mean_commute))%>% top_n(10)

## Selecting by mean_commute

print(highest_mean_commute)

## # A tibble: 10 x 4
##    census_id county       state         mean_commute
##        <dbl> <chr>        <chr>                <dbl>
##  1     42103 Pike         Pennsylvania          44  
##  2     36005 Bronx        New York              43  
##  3     24017 Charles      Maryland              42.8
##  4     51187 Warren       Virginia              42.7
##  5     36081 Queens       New York              42.6
##  6     36085 Richmond     New York              42.6
##  7     51193 Westmoreland Virginia              42.5
##  8      8093 Park         Colorado              42.4
##  9     36047 Kings        New York              41.7
## 10     54015 Clay         West Virginia         41.4

per_women <- df  %>% mutate(women_prop = women/total_pop) %>% select(census_id,county,state,women_prop) %>% arrange(women_prop) %>% top_n(-10)

## Selecting by women_prop

print(per_women)

## # A tibble: 10 x 4
##    census_id county                 state        women_prop
##        <dbl> <chr>                  <chr>             <dbl>
##  1     42053 Forest                 Pennsylvania      0.268
##  2      8011 Bent                   Colorado          0.314
##  3     51183 Sussex                 Virginia          0.315
##  4     13309 Wheeler                Georgia           0.321
##  5      6035 Lassen                 California        0.332
##  6     48095 Concho                 Texas             0.333
##  7     13053 Chattahoochee          Georgia           0.334
##  8      2013 Aleutians East Borough Alaska            0.335
##  9     22125 West Feliciana         Louisiana         0.336
## 10     32027 Pershing               Nevada            0.337

df$race_variable <- df$hispanic +df$white +df$black + df$native +df$asian + df$pacific

lowest_per_race <- df%>% group_by(county) %>% arrange(race_variable) %>% select(county) %>% head(n=10)

print(lowest_per_race)

## # A tibble: 10 x 1
## # Groups:   county [10]
##    county                  
##    <chr>                   
##  1 Hawaii                  
##  2 Maui                    
##  3 Mayes                   
##  4 Honolulu                
##  5 Pontotoc                
##  6 Grundy                  
##  7 Yakutat City and Borough
##  8 Johnston                
##  9 Kauai                   
## 10 Alfalfa

lowest_per_state<- df %>% group_by(state) %>% summarise(mean_race= mean(race_variable)) %>% top_n(-1)

## Selecting by mean_race

print(lowest_per_state)

## # A tibble: 1 x 2
##   state  mean_race
##   <chr>      <dbl>
## 1 Hawaii      80.3

sum_gt_100 <- df%>% filter(race_variable > 100) %>% select(county)

print(sum_gt_100)

## # A tibble: 11 x 1
##    county   
##    <chr>    
##  1 Claiborne
##  2 Gosper   
##  3 Hooker   
##  4 Nance    
##  5 Bailey   
##  6 Duval    
##  7 Edwards  
##  8 Kenedy   
##  9 Kent     
## 10 Presidio 
## 11 Beaver

sum_eq_100<- df %>% filter(race_variable ==100) %>% group_by(state) %>% summarise(total = length(unique(state)))

print(sum_eq_100)

## # A tibble: 13 x 2
##    state          total
##    <chr>          <int>
##  1 Alabama            1
##  2 Georgia            1
##  3 Kansas             1
##  4 Kentucky           1
##  5 Mississippi        1
##  6 Montana            1
##  7 Nebraska           1
##  8 New Mexico         1
##  9 North Carolina     1
## 10 North Dakota       1
## 11 South Dakota       1
## 12 Texas              1
## 13 West Virginia      1

df$carpool_rank <- min_rank(-df$carpool)

highest_rank_carp<- df %>% filter(carpool_rank %in% c(1:10)) %>% select(census_id,county,state,carpool,carpool_rank) %>% arrange(carpool_rank)

print(highest_rank_carp)

## # A tibble: 10 x 5
##    census_id county   state    carpool carpool_rank
##        <dbl> <chr>    <chr>      <dbl>        <int>
##  1     13061 Clay     Georgia     29.9            1
##  2     18087 LaGrange Indiana     27              2
##  3     13165 Jenkins  Georgia     25.3            3
##  4      5133 Sevier   Arkansas    24.4            4
##  5     20175 Seward   Kansas      23.4            5
##  6     48079 Cochran  Texas       22.8            6
##  7     48247 Jim Hogg Texas       22.6            7
##  8     48393 Roberts  Texas       22.4            8
##  9     39075 Holmes   Ohio        21.8            9
## 10     21197 Powell   Kentucky    21.6           10

lowes_rank_carp <-  df %>% select(census_id,county,state,carpool,carpool_rank) %>% arrange(desc(carpool_rank)) %>% top_n(10)

## Selecting by carpool_rank

print(lowes_rank_carp)

## # A tibble: 11 x 5
##    census_id county      state        carpool carpool_rank
##        <dbl> <chr>       <chr>          <dbl>        <int>
##  1     48261 Kenedy      Texas            0           3139
##  2     48269 King        Texas            0           3139
##  3     48235 Irion       Texas            0.9         3138
##  4     31183 Wheeler     Nebraska         1.3         3137
##  5     36061 New York    New York         1.9         3136
##  6     13309 Wheeler     Georgia          2.3         3134
##  7     38029 Emmons      North Dakota     2.3         3134
##  8     30019 Daniels     Montana          2.6         3132
##  9     31057 Dundy       Nebraska         2.6         3132
## 10     46069 Hyde        South Dakota     2.8         3130
## 11     51720 Norton city Virginia         2.8         3130

state_best <- df%>% group_by(state)%>% summarise(carpool_avg = mean(carpool)) %>% arrange(desc(carpool_avg)) %>% top_n(1)

## Selecting by carpool_avg

print(state_best)

## # A tibble: 1 x 2
##   state  carpool_avg
##   <chr>        <dbl>
## 1 Hawaii        12.8

state_best_5 <- df%>% group_by(state)%>% summarise(carpool_avg = mean(carpool)) %>% arrange(desc(carpool_avg)) %>% top_n(5)

## Selecting by carpool_avg

print(state_best_5)

## # A tibble: 5 x 2
##   state    carpool_avg
##   <chr>          <dbl>
## 1 Hawaii          12.8
## 2 Alaska          12.1
## 3 Arkansas        11.9
## 4 Utah            11.9
## 5 Texas           11.8

HW3_wrng

Aditya Nawal

11/11/2021

install package

Check the structure of the data

No data don’t require any structure change as the columns are in the same format as desired

To check the missing values present in the data

There is only 2 missing value one in income column & other in child_poverty , we will omit this records since using any sort of imputation would not be able to explain the correct value of this 2 observations

To summarize data we will use summary function

In the variable transit & walk have very skewed mean and median we will further histogram to understand the reason behind it.

After plotting the histogram and looking at the records for highest walk and transit the number seems to be fine

1248

1482