# Silence every warning for the rest of the session.
# NOTE(review): this also hides warnings raised while reading the CSV;
# consider removing it once the analysis is stable.
options(warn = -1)

# tidyverse attaches readr, dplyr and ggplot2, all of which are used below.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Read the ACS CSV into the tibble `data`.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file location exists.
acs_csv_path <- "C:\\Users\\abhishek yadav\\Downloads\\acs data.csv"
data <- read_csv(acs_csv_path)
## Rows: 3142 Columns: 35
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): state, county
## dbl (33): census_id, total_pop, men, women, hispanic, white, black, native, ...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first five rows to sanity-check the import.
data %>% head(n = 5)
## # A tibble: 5 x 35
## census_id state county total_pop men women hispanic white black native asian
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alab~ Autau~ 55221 26745 28476 2.6 75.8 18.5 0.4 1
## 2 1003 Alab~ Baldw~ 195121 95314 99807 4.5 83.1 9.5 0.6 0.7
## 3 1005 Alab~ Barbo~ 26932 14497 12435 4.6 46.2 46.7 0.2 0.4
## 4 1007 Alab~ Bibb 22604 12073 10531 2.2 74.5 21.4 0.4 0.1
## 5 1009 Alab~ Blount 57710 28512 29198 8.6 87.9 1.5 0.3 0.1
## # ... with 24 more variables: pacific <dbl>, citizen <dbl>, income <dbl>,
## # income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## # other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## # private_work <dbl>, public_work <dbl>, self_employed <dbl>,
## # family_work <dbl>, unemployment <dbl>
# Drop every row that has a missing value in any column.
data <- na.omit(data)

# Count counties by whether women outnumber men (flag = 1) or not (flag = 0).
# Counties are keyed by state + county: county names repeat across states, so
# grouping on the name alone pools different counties into a single row and
# undercounts them.
# NOTE(review): the count quoted in the narrative was produced with name-only
# grouping; re-render the document to refresh it.
county_gender_data <- data %>%
  group_by(state, county) %>%
  summarize(
    number_women = sum(women),
    number_men = sum(men),
    .groups = "drop"
  ) %>%
  mutate(flag = ifelse(number_women > number_men, 1, 0)) %>%
  group_by(flag) %>%
  summarize(n_county = length(flag))

# attach() is deliberately not used: the plot reads its columns from the data
# argument, and attaching would only add `flag`/`n_county` to the search path.
ggplot(county_gender_data, aes(x = flag, y = n_county)) +
  geom_bar(
    stat = "identity",
    position = "dodge",
    color = "blue",
    fill = "green"
  ) +
  geom_text(aes(label = n_county), vjust = -0.2)
To visualize the number of counties that have more women than men, we created a flag: flag = 1 when the number of women is greater than the number of men, and 0 otherwise. We plotted a bar chart of the county counts for each flag value, and we observe that there are 1167 counties with more women than men.
# Ten counties with the longest mean commute, longest first.
# - Counties are keyed by state + county so that same-named counties in
#   different states are not averaged together.
# - arrange() + head() replaces top_n(10), which picked its ranking column
#   implicitly and could return more than ten rows on ties.
# - county_label is a factor whose levels follow the sorted row order, so the
#   bars are drawn from longest to shortest commute. (The `levels =` argument
#   previously passed to ggplot() was ignored, leaving bars alphabetical.)
top10commute <- data %>%
  group_by(state, county) %>%
  summarize(commute_time = mean(mean_commute), .groups = "drop") %>%
  arrange(desc(commute_time)) %>%
  head(10) %>%
  mutate(
    county_label = paste(county, state, sep = ", "),
    county_label = factor(county_label, levels = county_label)
  )

ggplot(top10commute, aes(x = county_label, y = commute_time)) +
  geom_bar(
    stat = "identity",
    position = "dodge",
    color = "blue",
    fill = "green"
  ) +
  # Round the label so averaged values do not print long decimals.
  geom_text(aes(label = round(commute_time, 1)), vjust = -0.2)
A bar chart serves best here because it clearly shows the 10 counties with the highest commute times.
# Add women as a percentage of each row's total population.
data <- mutate(data, percentage_of_women = women * 100.0 / total_pop)

# Scatter of that percentage against unemployment, one point per row.
ggplot(
  data = data,
  mapping = aes(x = unemployment, y = percentage_of_women)
) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = 0.5)
# Ten counties with the highest unemployment, highest first.
# summarize() (rather than a grouped mutate()) yields exactly one row per
# county and an ungrouped result; keying on state + county keeps same-named
# counties in different states from being averaged together.
county_unemp <- data %>%
  group_by(state, county) %>%
  summarize(avg_unemp = mean(unemployment), .groups = "drop") %>%
  arrange(desc(avg_unemp)) %>%
  head(10)
county_unemp
## # A tibble: 10 x 2
## # Groups: county [10]
## county avg_unemp
## <chr> <dbl>
## 1 Corson 29.4
## 2 Oglala Lakota 28.7
## 3 Kusilvak Census Area 28.6
## 4 Ziebach 27.4
## 5 Crowley 27
## 6 Conecuh 22.6
## 7 Allendale 22.6
## 8 Sharkey 22.1
## 9 Northwest Arctic Borough 21.9
## 10 Coahoma 20.3
# Percentage of women vs. unemployment for the ten rows of `data` with the
# highest unemployment. The subset is derived from the data instead of a
# hard-coded list of county names, which matched on name only and would go
# stale if the input changed.
# NOTE(review): assumes each row of `data` is one county — confirm.
top_unemp_rows <- data %>%
  arrange(desc(unemployment)) %>%
  head(10)

ggplot(top_unemp_rows, aes(x = unemployment, y = percentage_of_women)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = 0.5)
The percentage of women is around 50% when unemployment is between 0% and 15%. There is no clear difference in this relationship for the counties with the highest unemployment.
# Carpool value for every row of `data`, plotted against the county name.
ggplot(data, mapping = aes(x = county, y = carpool)) +
  geom_point(
    color = "blue",
    size = 2,
    shape = 17,
    alpha = 0.5
  )
Carpool is approximately the same (around 10) for all counties.
# Carpool in the ten counties with the highest unemployment.
# summarize() on state + county gives one ungrouped row per county, so
# avg_unemp and carpool always describe the same county; a grouped mutate()
# on the county name alone paired a name-pooled average with row-level values.
county_carpool <- data %>%
  group_by(state, county) %>%
  summarize(
    avg_unemp = mean(unemployment),
    carpool = mean(carpool),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_unemp)) %>%
  head(10)
county_carpool
## # A tibble: 10 x 3
## # Groups: county [10]
## county avg_unemp carpool
## <chr> <dbl> <dbl>
## 1 Corson 29.4 5.9
## 2 Oglala Lakota 28.7 18.5
## 3 Kusilvak Census Area 28.6 3.3
## 4 Ziebach 27.4 5.7
## 5 Crowley 27 10.4
## 6 Conecuh 22.6 4.3
## 7 Allendale 22.6 11.3
## 8 Sharkey 22.1 13.6
## 9 Northwest Arctic Borough 21.9 10.4
## 10 Coahoma 20.3 15
# Rows of `data` with the ten highest unemployment values, derived once and
# shared by both plots instead of repeating a hard-coded county-name list.
# NOTE(review): assumes each row of `data` is one county — confirm.
high_unemp_rows <- data %>%
  arrange(desc(unemployment)) %>%
  head(10)

# Carpool by county for those rows.
ggplot(high_unemp_rows, aes(x = county, y = carpool)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = 0.5)

# Unemployment against carpool for the same rows.
ggplot(high_unemp_rows, aes(x = carpool, y = unemployment)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = 0.5)
This graph clearly indicates an inverse relationship between carpool and unemployment for the 10 counties with the highest unemployment.
# Ten counties with the highest carpool value, highest first.
# Keying on state + county and using summarize() keeps avg_unemp and carpool
# on the same county; grouping on the name alone averaged unemployment over
# every same-named county while carpool stayed row-level.
top_carpool_county <- data %>%
  group_by(state, county) %>%
  summarize(
    avg_unemp = mean(unemployment),
    carpool = mean(carpool),
    .groups = "drop"
  ) %>%
  arrange(desc(carpool)) %>%
  head(10)
top_carpool_county
## # A tibble: 10 x 3
## # Groups: county [10]
## county avg_unemp carpool
## <chr> <dbl> <dbl>
## 1 Clay 8.87 29.9
## 2 LaGrange 5.5 27
## 3 Jenkins 10.3 25.3
## 4 Sevier 7.53 24.4
## 5 Seward 6.25 23.4
## 6 Cochran 8.5 22.8
## 7 Jim Hogg 16.5 22.6
## 8 Roberts 4.75 22.4
## 9 Holmes 12.9 21.8
## 10 Powell 5.25 21.6
# Rows of `data` with the ten highest carpool values. Filtering on a
# hard-coded list of county names selected every county sharing one of those
# names in any state, not just the ten top-carpool counties.
# NOTE(review): assumes each row of `data` is one county — confirm.
top_carpool_rows <- data %>%
  arrange(desc(carpool)) %>%
  head(10)

# Carpool by county for those rows.
ggplot(top_carpool_rows, aes(x = county, y = carpool)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = 0.5)

# Carpool against unemployment for the same rows.
ggplot(top_carpool_rows, aes(x = unemployment, y = carpool)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = 0.5)
The above graph shows an inverse relationship between carpool and unemployment.