# Turn off warning output for the rest of the session (warn = -1).
options(warn = -1)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Read the ACS county table from the author's local Downloads folder.
acs_csv_path <- "C:\\Users\\abhishek yadav\\Downloads\\acs data.csv"
data <- read_csv(acs_csv_path)
## Rows: 3142 Columns: 35
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (2): state, county
## dbl (33): census_id, total_pop, men, women, hispanic, white, black, native, ...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first five rows of the table.
head(data, n = 5)
## # A tibble: 5 x 35
##   census_id state county total_pop   men women hispanic white black native asian
##       <dbl> <chr> <chr>      <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>  <dbl> <dbl>
## 1      1001 Alab~ Autau~     55221 26745 28476      2.6  75.8  18.5    0.4   1  
## 2      1003 Alab~ Baldw~    195121 95314 99807      4.5  83.1   9.5    0.6   0.7
## 3      1005 Alab~ Barbo~     26932 14497 12435      4.6  46.2  46.7    0.2   0.4
## 4      1007 Alab~ Bibb       22604 12073 10531      2.2  74.5  21.4    0.4   0.1
## 5      1009 Alab~ Blount     57710 28512 29198      8.6  87.9   1.5    0.3   0.1
## # ... with 24 more variables: pacific <dbl>, citizen <dbl>, income <dbl>,
## #   income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## #   professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## #   production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## #   other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## #   private_work <dbl>, public_work <dbl>, self_employed <dbl>,
## #   family_work <dbl>, unemployment <dbl>
# Keep only the rows that have no missing values in any column.
data <- data %>%
  na.omit()
  1. Based on HW3, Q5, use appropriate visualizations or summaries to report your results. Or provide a plot for comparison. Do any insights appear as a result of these?
# Count how many county names have more women than men (flag = 1) versus the
# rest (flag = 0).
# NOTE(review): rows are grouped by county name only, so same-named counties in
# different states are pooled before the comparison -- confirm this is
# intended, or group by state and county instead.
county_gender_data <- data %>%
  group_by(county) %>%
  summarize(number_women = sum(women), number_men = sum(men)) %>%
  mutate(flag = ifelse(number_women > number_men, 1, 0)) %>%
  group_by(flag) %>%
  summarize(n_county = n())

# Fill colour for the bars.
color <- "green"

# One bar per flag value, labelled with its count. Columns are reached through
# the data argument, so nothing is attached to the search path. The flag is
# mapped as a factor so the x axis shows only the two categories 0 and 1.
ggplot(county_gender_data, aes(x = factor(flag), y = n_county)) +
  geom_col(color = "blue", fill = color) +
  geom_text(aes(label = n_county), vjust = -0.2) +
  labs(x = "flag")

To visualize the number of counties with more women than men, we created a flag: flag = 1 when the number of women is greater than the number of men, and 0 otherwise. We plotted a bar chart of the county counts for each flag value, and we observe that there are 1167 counties with more women than men.

  1. Based on HW3, Q7, visualize the commute time of the top 10 counties. Try to use a bar chart, or you can try other types of plots.
# Mean commute per county name, keeping the ten largest, longest first.
# top_n() is called without a weighting column, so it ranks by the last
# column of the summary (commute_time).
top10commute <- data %>%
  group_by(county) %>%
  summarise(commute_time = mean(mean_commute)) %>%
  top_n(10) %>%
  arrange(desc(commute_time))
## Selecting by commute_time
# Fill colour for the bars.
color <- "green"

# Bar chart of the ten longest mean commutes. The bar order is set inside
# aes() with reorder(), because extra arguments given to ggplot() itself
# (such as `levels` or `color`) are ignored and would leave the bars in
# alphabetical order. Labels are rounded to one decimal to stay readable.
ggplot(
  top10commute,
  aes(x = reorder(county, -commute_time), y = commute_time)
) +
  geom_col(color = "blue", fill = color) +
  geom_text(aes(label = round(commute_time, 1)), vjust = -0.2) +
  labs(x = "county")

The bar chart works well here because it clearly shows the 10 counties with the highest commute times.

  1. Based on HW3, Q8, investigate the relationship between the unemployment rate and the percentage of women. Is there any difference among the top 10 counties with the highest unemployment rate?
# Add the share of women in each row's total population, as a percentage.
data <- data %>%
  mutate(percentage_of_women = women * 100.0 / total_pop)

# Scatter of percentage of women against unemployment, one marker per row.
ggplot(data, mapping = aes(unemployment, percentage_of_women)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = 0.5)

# Ten county names with the highest mean unemployment, highest first.
# summarise() yields exactly one row per county name (a grouped mutate() keeps
# every input row, so a repeated name could occupy several of the ten slots),
# and .groups = "drop" returns an ungrouped tibble.
county_unemp <- data %>%
  group_by(county) %>%
  summarise(avg_unemp = mean(unemployment), .groups = "drop") %>%
  arrange(desc(avg_unemp)) %>%
  head(10)
county_unemp
## # A tibble: 10 x 2
## # Groups:   county [10]
##    county                   avg_unemp
##    <chr>                        <dbl>
##  1 Corson                        29.4
##  2 Oglala Lakota                 28.7
##  3 Kusilvak Census Area          28.6
##  4 Ziebach                       27.4
##  5 Crowley                       27  
##  6 Conecuh                       22.6
##  7 Allendale                     22.6
##  8 Sharkey                       22.1
##  9 Northwest Arctic Borough      21.9
## 10 Coahoma                       20.3
# Same scatter as above, restricted to the counties listed in county_unemp.
# The names are read from that table rather than retyped, so the plot cannot
# drift out of sync with it.
data %>%
  filter(county %in% county_unemp$county) %>%
  ggplot(aes(x = unemployment, y = percentage_of_women)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = 0.5)

The percentage of women is around 50% when unemployment is between 0% and 15%. There is no clear difference in this relationship for the counties with the highest unemployment.

  1. Based on HW3, Q10, do some brainstorm and come up with ideas to investigate the carpool variable. You can think about what insights you want to know from that variable. For example:
  1. Is the carpooling quite different across counties, why? Is it related to other variables?
  2. Is the unemployment rate associated with the carpooling value?
  3. Your questions here!
# Carpool for all counties: one marker per row, county name on the x axis.
ggplot(data, mapping = aes(county, carpool)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = 0.5)

Carpool is approximately the same (around 10) for all the counties.

# Carpool in the 10 rows with the highest unemployment.
# avg_unemp is the mean unemployment over all rows sharing a county name;
# carpool is the value of the individual row. ungroup() is called after the
# grouped mutate() so the result does not carry the county grouping along.
county_carpool <- data %>%
  group_by(county) %>%
  mutate(avg_unemp = mean(unemployment)) %>%
  ungroup() %>%
  select(county, avg_unemp, carpool) %>%
  arrange(desc(avg_unemp)) %>%
  head(10)
county_carpool
## # A tibble: 10 x 3
## # Groups:   county [10]
##    county                   avg_unemp carpool
##    <chr>                        <dbl>   <dbl>
##  1 Corson                        29.4     5.9
##  2 Oglala Lakota                 28.7    18.5
##  3 Kusilvak Census Area          28.6     3.3
##  4 Ziebach                       27.4     5.7
##  5 Crowley                       27      10.4
##  6 Conecuh                       22.6     4.3
##  7 Allendale                     22.6    11.3
##  8 Sharkey                       22.1    13.6
##  9 Northwest Arctic Borough      21.9    10.4
## 10 Coahoma                       20.3    15
# Rows of data for the counties listed in county_carpool. The names are read
# from that table once instead of being retyped for each plot.
top_unemp_counties <- data %>%
  filter(county %in% county_carpool$county)

# Carpool for each of those counties.
ggplot(top_unemp_counties, aes(x = county, y = carpool)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = 0.5)

# Unemployment against carpool for the same counties.
ggplot(top_unemp_counties, aes(x = carpool, y = unemployment)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = 0.5)

This graph clearly indicates an inverse relationship between carpool and unemployment for the top 10 counties with the highest unemployment.

# Table of the 10 rows with the highest carpool values.
# ungroup() is called after the grouped mutate() so the result does not carry
# the county grouping along.
# NOTE(review): avg_unemp is averaged over every row sharing the county name,
# while carpool belongs to a single row -- confirm that pairing is intended.
top_carpool_county <- data %>%
  group_by(county) %>%
  mutate(avg_unemp = mean(unemployment)) %>%
  ungroup() %>%
  select(county, avg_unemp, carpool) %>%
  arrange(desc(carpool)) %>%
  head(10)
top_carpool_county
## # A tibble: 10 x 3
## # Groups:   county [10]
##    county   avg_unemp carpool
##    <chr>        <dbl>   <dbl>
##  1 Clay          8.87    29.9
##  2 LaGrange      5.5     27  
##  3 Jenkins      10.3     25.3
##  4 Sevier        7.53    24.4
##  5 Seward        6.25    23.4
##  6 Cochran       8.5     22.8
##  7 Jim Hogg     16.5     22.6
##  8 Roberts       4.75    22.4
##  9 Holmes       12.9     21.8
## 10 Powell        5.25    21.6
# The ten rows of data with the highest carpool values. They are selected by
# value rather than by county name, because filtering on names would also pull
# in any same-named counties that are not among the top ten.
top_carpool_rows <- data %>%
  arrange(desc(carpool)) %>%
  head(10)

# Carpool for each of the ten counties.
ggplot(top_carpool_rows, aes(x = county, y = carpool)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = 0.5)

# Carpool against unemployment for the same ten counties.
ggplot(top_carpool_rows, aes(x = unemployment, y = carpool)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = 0.5)

The above graph shows an inverse relationship between carpool and unemployment.