pacman::p_load(knitr, tidyverse, janitor,readxl,kableExtra,ggmap,maps,mapdata,RColorBrewer,colorRamps,usmap,socviz, stringr,ggthemes, plotly,imputeTS)
We are using the Unemployment and Median Household income for the United States, States, and Counties, 2000-20 data set by USDA, Economic Research Service.
# Load the unemployment workbook from the working directory, then preview
# the first rows to confirm the import looks sensible.
raw_data <- "./Unemployment.xlsx" %>%
  read_excel()
raw_data %>%
  head()
## # A tibble: 6 × 92
## FIPS_Code State Area_name Rural_urban_con… Urban_influence… Metro_2013
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 00000 US United States NA NA NA
## 2 01000 AL Alabama NA NA NA
## 3 01001 AL Autauga County, AL 2 2 1
## 4 01003 AL Baldwin County, AL 3 2 1
## 5 01005 AL Barbour County, AL 6 6 0
## 6 01007 AL Bibb County, AL 1 1 1
## # … with 86 more variables: Civilian_labor_force_2000 <dbl>,
## # Employed_2000 <dbl>, Unemployed_2000 <dbl>, Unemployment_rate_2000 <dbl>,
## # Civilian_labor_force_2001 <dbl>, Employed_2001 <dbl>,
## # Unemployed_2001 <dbl>, Unemployment_rate_2001 <dbl>,
## # Civilian_labor_force_2002 <dbl>, Employed_2002 <dbl>,
## # Unemployed_2002 <dbl>, Unemployment_rate_2002 <dbl>,
## # Civilian_labor_force_2003 <dbl>, Employed_2003 <dbl>, …
First of all, we need to have an overview of the data set Unemployment as a basis of further analysis.
# Print per-column summary statistics (quartiles, mean and NA counts for the
# numeric columns; length/class/mode for the character columns).
raw_data %>%
  summary()
## FIPS_Code State Area_name
## Length:3275 Length:3275 Length:3275
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Rural_urban_continuum_code_2013 Urban_influence_code_2013 Metro_2013
## Min. :1.000 Min. : 1.00 Min. :0.000
## 1st Qu.:2.000 1st Qu.: 2.00 1st Qu.:0.000
## Median :6.000 Median : 5.00 Median :0.000
## Mean :4.938 Mean : 5.19 Mean :0.383
## 3rd Qu.:7.000 3rd Qu.: 8.00 3rd Qu.:1.000
## Max. :9.000 Max. :12.00 Max. :1.000
## NA's :56 NA's :56 NA's :53
## Civilian_labor_force_2000 Employed_2000 Unemployed_2000
## Min. : 49 Min. : 45 Min. : 4
## 1st Qu.: 5205 1st Qu.: 4953 1st Qu.: 235
## Median : 11875 Median : 11373 Median : 555
## Mean : 131617 Mean : 126311 Mean : 5306
## 3rd Qu.: 31523 3rd Qu.: 30000 3rd Qu.: 1377
## Max. :142601576 Max. :136904853 Max. :5696723
## NA's :5 NA's :5 NA's :5
## Unemployment_rate_2000 Civilian_labor_force_2001 Employed_2001
## Min. : 1.300 Min. : 48 Min. : 45
## 1st Qu.: 3.200 1st Qu.: 5274 1st Qu.: 4935
## Median : 4.100 Median : 12006 Median : 11348
## Mean : 4.541 Mean : 132693 Mean : 126358
## 3rd Qu.: 5.300 3rd Qu.: 31425 3rd Qu.: 30104
## Max. :17.300 Max. :143786537 Max. :136977996
## NA's :5 NA's :5 NA's :5
## Unemployed_2001 Unemployment_rate_2001 Civilian_labor_force_2002
## Min. : 3 Min. : 1.600 Min. : 48
## 1st Qu.: 273 1st Qu.: 3.800 1st Qu.: 5229
## Median : 651 Median : 4.800 Median : 12190
## Mean : 6335 Mean : 5.233 Mean : 133688
## 3rd Qu.: 1633 3rd Qu.: 6.100 3rd Qu.: 31868
## Max. :6808541 Max. :18.600 Max. :144839298
## NA's :5 NA's :5 NA's :5
## Employed_2002 Unemployed_2002 Unemployment_rate_2002
## Min. : 45 Min. : 3 Min. : 1.600
## 1st Qu.: 4917 1st Qu.: 310 1st Qu.: 4.500
## Median : 11364 Median : 755 Median : 5.600
## Mean : 125898 Mean : 7790 Mean : 5.981
## 3rd Qu.: 29910 3rd Qu.: 1909 3rd Qu.: 6.900
## Max. :136455783 Max. :8383515 Max. :19.900
## NA's :5 NA's :5 NA's :5
## Civilian_labor_force_2003 Employed_2003 Unemployed_2003
## Min. : 50 Min. : 45 Min. : 5
## 1st Qu.: 5274 1st Qu.: 4921 1st Qu.: 333
## Median : 12308 Median : 11416 Median : 790
## Mean : 134456 Mean : 126362 Mean : 8095
## 3rd Qu.: 32208 3rd Qu.: 30193 3rd Qu.: 2022
## Max. :145660094 Max. :136944522 Max. :8715572
## NA's :5 NA's :5 NA's :5
## Unemployment_rate_2003 Civilian_labor_force_2004 Employed_2004
## Min. : 1.900 Min. : 58 Min. : 54
## 1st Qu.: 4.700 1st Qu.: 5240 1st Qu.: 4911
## Median : 5.800 Median : 12288 Median : 11488
## Mean : 6.226 Mean : 135434 Mean : 127905
## 3rd Qu.: 7.300 3rd Qu.: 32314 3rd Qu.: 30505
## Max. :20.200 Max. :146724795 Max. :138613904
## NA's :5 NA's :5 NA's :5
## Unemployed_2004 Unemployment_rate_2004 Civilian_labor_force_2005
## Min. : 4 Min. : 1.600 Min. : 39
## 1st Qu.: 311 1st Qu.: 4.500 1st Qu.: 5264
## Median : 744 Median : 5.500 Median : 12354
## Mean : 7528 Mean : 5.879 Mean : 137285
## 3rd Qu.: 1926 3rd Qu.: 6.700 3rd Qu.: 32506
## Max. :8110891 Max. :20.200 Max. :148597241
## NA's :5 NA's :5 NA's :12
## Employed_2005 Unemployed_2005 Unemployment_rate_2005
## Min. : 35 Min. : 4 Min. : 2.000
## 1st Qu.: 4939 1st Qu.: 298 1st Qu.: 4.200
## Median : 11515 Median : 726 Median : 5.200
## Mean : 130220 Mean : 7066 Mean : 5.672
## 3rd Qu.: 30808 3rd Qu.: 1866 3rd Qu.: 6.500
## Max. :141000912 Max. :7596329 Max. :21.000
## NA's :12 NA's :12 NA's :12
## Civilian_labor_force_2006 Employed_2006 Unemployed_2006
## Min. : 38 Min. : 34 Min. : 4
## 1st Qu.: 5302 1st Qu.: 5020 1st Qu.: 268
## Median : 12443 Median : 11708 Median : 672
## Mean : 139280 Mean : 132779 Mean : 6501
## 3rd Qu.: 32894 3rd Qu.: 31500 3rd Qu.: 1738
## Max. :150707773 Max. :143729350 Max. :6978423
## NA's :12 NA's :12 NA's :12
## Unemployment_rate_2006 Civilian_labor_force_2007 Employed_2007
## Min. : 1.600 Min. : 41 Min. : 38
## 1st Qu.: 3.800 1st Qu.: 5279 1st Qu.: 5015
## Median : 4.800 Median : 12374 Median : 11714
## Mean : 5.152 Mean : 140473 Mean : 133924
## 3rd Qu.: 5.900 3rd Qu.: 33119 3rd Qu.: 31559
## Max. :20.700 Max. :152191050 Max. :145156133
## NA's :12 NA's :5 NA's :5
## Unemployed_2007 Unemployment_rate_2007 Civilian_labor_force_2008
## Min. : 3 Min. : 1.400 Min. : 43
## 1st Qu.: 265 1st Qu.: 3.700 1st Qu.: 5346
## Median : 656 Median : 4.700 Median : 12456
## Mean : 6549 Mean : 5.106 Mean : 141888
## 3rd Qu.: 1740 3rd Qu.: 5.900 3rd Qu.: 33397
## Max. :7034917 Max. :20.200 Max. :153761037
## NA's :5 NA's :5 NA's :5
## Employed_2008 Unemployed_2008 Unemployment_rate_2008
## Min. : 40 Min. : 3 Min. : 1.300
## 1st Qu.: 5003 1st Qu.: 311 1st Qu.: 4.400
## Median : 11676 Median : 795 Median : 5.700
## Mean : 133626 Mean : 8263 Mean : 6.033
## 3rd Qu.: 31504 3rd Qu.: 2074 3rd Qu.: 7.200
## Max. :144860349 Max. :8900688 Max. :22.600
## NA's :5 NA's :5 NA's :5
## Civilian_labor_force_2009 Employed_2009 Unemployed_2009
## Min. : 43 Min. : 39 Min. : 4
## 1st Qu.: 5343 1st Qu.: 4863 1st Qu.: 491
## Median : 12499 Median : 11245 Median : 1246
## Mean : 141920 Mean : 128743 Mean : 13178
## 3rd Qu.: 33354 3rd Qu.: 30528 3rd Qu.: 3226
## Max. :153825454 Max. :139594699 Max. :14230755
## NA's :5 NA's :5 NA's :5
## Unemployment_rate_2009 Civilian_labor_force_2010 Employed_2010
## Min. : 2.000 Min. : 71 Min. : 67
## 1st Qu.: 6.900 1st Qu.: 5244 1st Qu.: 4735
## Median : 8.900 Median : 12117 Median : 10875
## Mean : 9.286 Mean : 142207 Mean : 128455
## 3rd Qu.:11.200 3rd Qu.: 33518 3rd Qu.: 30170
## Max. :28.300 Max. :154254521 Max. :139393814
## NA's :5 NA's :3 NA's :3
## Unemployed_2010 Unemployment_rate_2010 Civilian_labor_force_2011
## Min. : 4 Min. : 2.000 Min. : 66
## 1st Qu.: 489 1st Qu.: 7.300 1st Qu.: 5211
## Median : 1258 Median : 9.300 Median : 12002
## Mean : 13752 Mean : 9.568 Mean : 142421
## 3rd Qu.: 3296 3rd Qu.:11.425 3rd Qu.: 33426
## Max. :14860707 Max. :29.400 Max. :154520678
## NA's :3 NA's :3 NA's :3
## Employed_2011 Unemployed_2011 Unemployment_rate_2011
## Min. : 62 Min. : 4 Min. : 1.40
## 1st Qu.: 4720 1st Qu.: 455 1st Qu.: 6.70
## Median : 10856 Median : 1160 Median : 8.60
## Mean : 129620 Mean : 12801 Mean : 8.91
## 3rd Qu.: 30580 3rd Qu.: 3021 3rd Qu.:10.60
## Max. :140688861 Max. :13831817 Max. :29.30
## NA's :3 NA's :3 NA's :3
## Civilian_labor_force_2012 Employed_2012 Unemployed_2012
## Min. : 67 Min. : 63 Min. : 4
## 1st Qu.: 5176 1st Qu.: 4744 1st Qu.: 398
## Median : 11982 Median : 10954 Median : 1040
## Mean : 142880 Mean : 131303 Mean : 11577
## 3rd Qu.: 33219 3rd Qu.: 30568 3rd Qu.: 2687
## Max. :155038121 Max. :142527201 Max. :12510920
## NA's :3 NA's :3 NA's :3
## Unemployment_rate_2012 Civilian_labor_force_2013 Employed_2013
## Min. : 1.100 Min. : 75 Min. : 71
## 1st Qu.: 5.900 1st Qu.: 5131 1st Qu.: 4688
## Median : 7.700 Median : 11896 Median : 10832
## Mean : 8.029 Mean : 143160 Mean : 132553
## 3rd Qu.: 9.600 3rd Qu.: 32937 3rd Qu.: 30548
## Max. :27.700 Max. :155362278 Max. :143905037
## NA's :3 NA's :3 NA's :3
## Unemployed_2013 Unemployment_rate_2013 Civilian_labor_force_2014
## Min. : 4 Min. : 1.20 Min. : 78
## 1st Qu.: 375 1st Qu.: 5.50 1st Qu.: 5054
## Median : 950 Median : 7.20 Median : 11708
## Mean : 10607 Mean : 7.58 Mean : 143665
## 3rd Qu.: 2481 3rd Qu.: 9.00 3rd Qu.: 32640
## Max. :11457241 Max. :27.40 Max. :155936159
## NA's :3 NA's :3 NA's :3
## Employed_2014 Unemployed_2014 Unemployment_rate_2014
## Min. : 74 Min. : 4 Min. : 1.200
## 1st Qu.: 4693 1st Qu.: 322 1st Qu.: 4.700
## Median : 10852 Median : 818 Median : 6.100
## Mean : 134751 Mean : 8914 Mean : 6.474
## 3rd Qu.: 30771 3rd Qu.: 2127 3rd Qu.: 7.600
## Max. :146318952 Max. :9617207 Max. :26.400
## NA's :3 NA's :3 NA's :3
## Civilian_labor_force_2015 Employed_2015 Unemployed_2015
## Min. : 77 Min. : 73 Min. : 4
## 1st Qu.: 5009 1st Qu.: 4716 1st Qu.: 284
## Median : 11616 Median : 10914 Median : 706
## Mean : 144487 Mean : 136808 Mean : 7679
## 3rd Qu.: 32744 3rd Qu.: 30849 3rd Qu.: 1877
## Max. :156840649 Max. :148554918 Max. :8285731
## NA's :3 NA's :3 NA's :3
## Unemployment_rate_2015 Civilian_labor_force_2016 Employed_2016
## Min. : 1.800 Min. : 86 Min. : 82
## 1st Qu.: 4.200 1st Qu.: 5049 1st Qu.: 4718
## Median : 5.300 Median : 11705 Median : 11038
## Mean : 5.729 Mean : 146167 Mean : 139003
## 3rd Qu.: 6.600 3rd Qu.: 33158 3rd Qu.: 31338
## Max. :24.600 Max. :158674951 Max. :150949349
## NA's :3 NA's :3 NA's :3
## Unemployed_2016 Unemployment_rate_2016 Civilian_labor_force_2017
## Min. : 4 Min. : 1.600 Min. : 100
## 1st Qu.: 270 1st Qu.: 4.000 1st Qu.: 5031
## Median : 656 Median : 5.000 Median : 11726
## Mean : 7164 Mean : 5.427 Mean : 147566
## 3rd Qu.: 1764 3rd Qu.: 6.200 3rd Qu.: 33202
## Max. :7725602 Max. :24.200 Max. :160214378
## NA's :3 NA's :3 NA's :3
## Employed_2017 Unemployed_2017 Unemployment_rate_2017
## Min. : 95 Min. : 5 Min. : 1.500
## 1st Qu.: 4739 1st Qu.: 231 1st Qu.: 3.500
## Median : 11160 Median : 570 Median : 4.400
## Mean : 141096 Mean : 6470 Mean : 4.792
## 3rd Qu.: 31726 3rd Qu.: 1554 3rd Qu.: 5.400
## Max. :153237150 Max. :6977228 Max. :20.600
## NA's :3 NA's :3 NA's :3
## Civilian_labor_force_2018 Employed_2018 Unemployed_2018
## Min. : 211 Min. : 205 Min. : 4
## 1st Qu.: 5030 1st Qu.: 4778 1st Qu.: 208
## Median : 11773 Median : 11232 Median : 510
## Mean : 148682 Mean : 142855 Mean : 5827
## 3rd Qu.: 33332 3rd Qu.: 31959 3rd Qu.: 1373
## Max. :161441134 Max. :155152550 Max. :6288584
## NA's :3 NA's :3 NA's :3
## Unemployment_rate_2018 Civilian_labor_force_2019 Employed_2019
## Min. : 1.200 Min. : 228 Min. : 216
## 1st Qu.: 3.100 1st Qu.: 4993 1st Qu.: 4776
## Median : 3.900 Median : 11856 Median : 11336
## Mean : 4.272 Mean : 150245 Mean : 144702
## 3rd Qu.: 4.900 3rd Qu.: 33511 3rd Qu.: 32190
## Max. :18.800 Max. :163140305 Max. :157154185
## NA's :3 NA's :3 NA's :3
## Unemployed_2019 Unemployment_rate_2019 Civilian_labor_force_2020
## Min. : 4 Min. : 0.800 Min. : 184
## 1st Qu.: 199 1st Qu.: 3.000 1st Qu.: 4915
## Median : 490 Median : 3.700 Median : 11933
## Mean : 5544 Mean : 4.088 Mean : 150903
## 3rd Qu.: 1311 3rd Qu.: 4.600 3rd Qu.: 34360
## Max. :5986120 Max. :20.900 Max. :160611064
## NA's :3 NA's :3 NA's :82
## Employed_2020 Unemployed_2020 Unemployment_rate_2020
## Min. : 174 Min. : 4 Min. : 1.700
## 1st Qu.: 4585 1st Qu.: 301 1st Qu.: 5.200
## Median : 11147 Median : 795 Median : 6.500
## Mean : 138751 Mean : 12152 Mean : 6.754
## 3rd Qu.: 31790 3rd Qu.: 2398 3rd Qu.: 8.000
## Max. :147677360 Max. :12933704 Max. :22.500
## NA's :82 NA's :82 NA's :82
## Median_Household_Income_2019 Med_HH_Income_Percent_of_State_Total_2019
## Min. : 24732 Min. : 39.92
## 1st Qu.: 46309 1st Qu.: 76.52
## Median : 53505 Median : 87.13
## Mean : 55875 Mean : 89.63
## 3rd Qu.: 62327 3rd Qu.:100.00
## Max. :151806 Max. :234.52
## NA's :82 NA's :83
The data set Unemployment, as summarized above, shows the unemployment and median household income of the whole United States, states, and counties from 2000 to 2020. There are 3275 rows and 92 columns in total. Each state has a summarized observation of the whole state and multiple observations of different areas in this state from 2000 to 2020. For example, Alabama has an observation of the state in general and multiple observations of specific areas inside Alabama, such as the Autauga County and Baldwin County.
In this data set, each year has four variables to describe the employment condition, which spans from 2000 to 2020 and takes 84 columns in total. These four variables are respectively: Civilian_labor_force_year (civilian labor force annual average for that year), Employed_year (number employed annual average for that year), Unemployed_year (number unemployed annual average for that year), and Unemployment_rate_year (the unemployment rate for that year). These four variables are all in the double class. Besides, there are 3 variables in the character class, FIPS_Code (state-county FIPS code), State (state abbreviation), and Area_name (state or county name).
There are two variables named Rural_urban_continuum_code_2013 and Urban_influence_code_2013, which are classification schemes that distinguish metropolitan (metro) counties by the population size of their metro area, and non-metropolitan (non-metro) counties by the degree of urbanization and adjacency to a metro area or areas. These two variables are recorded as double, but they should be categorical as each number represents a different category. Metro_2013 shows whether the area is metro or non-metro, where 1 represents metro and 0 non-metro. This variable should be a factor but is stored in this data set as a double. The variable Median_Household_Income_2019 shows the median of household income in 2019 and is stored as a double. The variable Med_HH_Income_Percent_of_State_Total_2019 shows the county household median income as a percent of the State total median household income in 2019 and is also stored as a double. The missing values are stored as NA. Below is a summary of each variable in the data set Unemployment.
This is the data dictionary for this data set.
# Load the data dictionary workbook from the working directory.
data_dictionary <- read_excel("./Data Dictionary.xlsx")
# Tell knitr to render missing cells as blanks rather than "NA".
options(knitr.kable.NA = "")
# Render the dictionary as a styled kableExtra table.
kable_styling(kbl(data_dictionary))
| Variable | Description |
|---|---|
| FIPS_Code | State-county FIPS code |
| State | State abbreviation |
| Area_name | State or county name |
| Rural_urban_continuum_code_2013 | Rural-urban Continuum Code, 2013 |
| Urban_influence_code_2013 | Urban Influence Code, 2013 |
| Metro_2013 | Metro nonmetro dummy 0=Nonmetro 1=Metro (Based on 2013 OMB Metropolitan Area delineation) |
| Civilian_labor_force | Civilian labor force annual average |
| Employed | Number employed annual average |
| Unemployed | Number unemployed annual average |
| Unemployment_rate | Unemployment rate |
| Median_Household_Income_2019 | Estimate of median household Income, 2019 |
| Med_HH_Income_Percent_of_State_Total_2019 | County household median income as a percent of the State total median household income, 2019 |
| Data sources: | Rural classifications: USDA, Economic Research Service. |
| Labor force variables: U.S. Department of Labor, Bureau of Labor Statistics, Local Area Unemployment Statistics (LAUS). | |
| Median household Income: U.S. Department of Commerce, Bureau of the Census, Small Area Income and Poverty Estimates (SAIPE) Program. | |
The research questions we intended to answer with the data set are how the unemployment rate changes over time in the United States and how (2020) unemployment status varies in different regions in the U.S. To answer the second question, we analyze the unemployment rate in 2020 across counties using an informational map. Then, to see how the levels of development influence the unemployment rate, we create a boxplot to visualize the distribution of unemployment rates in metro and non-metro areas. To answer the first question, we make an interactive plot displaying the trend of each state’s unemployment rates from 2000 to 2020.
To answer the questions of our research, we need to perform the following cleaning tasks:
- Use the stringr package to remove the states after county names in the variable Area_name.
- Recode metro_2013 into “metro” and “non-metro” and factorize them.
- Find the NA values and replace them with the column mean.

# clean variable names by converting them to lower case
# Standardise the column names with clean_names(), then fill missing numeric
# values with the mean of their column.
clean_data1 <- raw_data %>%
  clean_names() %>%
  # Vectorised mean imputation over the numeric columns. This yields the same
  # values as imputeTS::na.mean() did, without relying on that function's
  # deprecated dot-style name (its replacement is na_mean()); character
  # columns are never touched.
  # NOTE(review): this also mean-imputes the categorical code columns
  # (rural_urban_continuum_code_2013, urban_influence_code_2013, metro_2013)
  # and the 2020 columns, which have the most missing values. Confirm that
  # this is intended before interpreting those fields.
  mutate(across(
    where(is.numeric),
    ~ replace_na(.x, mean(.x, na.rm = TRUE))
  ))
clean_data1
## # A tibble: 3,275 × 92
## fips_code state area_name rural_urban_continu… urban_influence_… metro_2013
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 00000 US United Sta… 4.94 5.19 0.383
## 2 01000 AL Alabama 4.94 5.19 0.383
## 3 01001 AL Autauga Co… 2 2 1
## 4 01003 AL Baldwin Co… 3 2 1
## 5 01005 AL Barbour Co… 6 6 0
## 6 01007 AL Bibb Count… 1 1 1
## 7 01009 AL Blount Cou… 1 1 1
## 8 01011 AL Bullock Co… 6 6 0
## 9 01013 AL Butler Cou… 6 6 0
## 10 01015 AL Calhoun Co… 3 2 1
## # … with 3,265 more rows, and 86 more variables:
## # civilian_labor_force_2000 <dbl>, employed_2000 <dbl>,
## # unemployed_2000 <dbl>, unemployment_rate_2000 <dbl>,
## # civilian_labor_force_2001 <dbl>, employed_2001 <dbl>,
## # unemployed_2001 <dbl>, unemployment_rate_2001 <dbl>,
## # civilian_labor_force_2002 <dbl>, employed_2002 <dbl>,
## # unemployed_2002 <dbl>, unemployment_rate_2002 <dbl>, …
# Tidy the area names and expose the FIPS code under the join key name.
clean_data2 <- clean_data1 %>%
  # Drop everything from the last comma onwards, e.g.
  # "Autauga County, AL" -> "Autauga County". Names without a comma are left
  # unchanged. The piped column is used (not clean_data1$area_name) so the
  # step stays aligned with whatever rows reach this point in the pipeline.
  mutate(area_name = str_replace(area_name, "(.*),.*", "\\1")) %>%
  # Rename fips_code to id, the key used when left-joining with the map data.
  rename(id = fips_code)
clean_data2
## # A tibble: 3,275 × 92
## id state area_name rural_urban_continu… urban_influence_c… metro_2013
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 00000 US United States 4.94 5.19 0.383
## 2 01000 AL Alabama 4.94 5.19 0.383
## 3 01001 AL Autauga County 2 2 1
## 4 01003 AL Baldwin County 3 2 1
## 5 01005 AL Barbour County 6 6 0
## 6 01007 AL Bibb County 1 1 1
## 7 01009 AL Blount County 1 1 1
## 8 01011 AL Bullock County 6 6 0
## 9 01013 AL Butler County 6 6 0
## 10 01015 AL Calhoun County 3 2 1
## # … with 3,265 more rows, and 86 more variables:
## # civilian_labor_force_2000 <dbl>, employed_2000 <dbl>,
## # unemployed_2000 <dbl>, unemployment_rate_2000 <dbl>,
## # civilian_labor_force_2001 <dbl>, employed_2001 <dbl>,
## # unemployed_2001 <dbl>, unemployment_rate_2001 <dbl>,
## # civilian_labor_force_2002 <dbl>, employed_2002 <dbl>,
## # unemployed_2002 <dbl>, unemployment_rate_2002 <dbl>, …
# Recode metro_2013 from its numeric 1/0 coding into a labelled factor
# (1 -> "Metro", 0 -> "Non-Metro"). Any other value, including the
# mean-imputed 0.383 on rows that had no code, becomes NA.
clean_data3 <- clean_data2
clean_data3$metro_2013 <- factor(
  clean_data3$metro_2013,
  levels = c(1, 0),
  labels = c("Metro", "Non-Metro")
)
clean_data3
## # A tibble: 3,275 × 92
## id state area_name rural_urban_continu… urban_influence_c… metro_2013
## <chr> <chr> <chr> <dbl> <dbl> <fct>
## 1 00000 US United States 4.94 5.19 <NA>
## 2 01000 AL Alabama 4.94 5.19 <NA>
## 3 01001 AL Autauga County 2 2 Metro
## 4 01003 AL Baldwin County 3 2 Metro
## 5 01005 AL Barbour County 6 6 Non-Metro
## 6 01007 AL Bibb County 1 1 Metro
## 7 01009 AL Blount County 1 1 Metro
## 8 01011 AL Bullock County 6 6 Non-Metro
## 9 01013 AL Butler County 6 6 Non-Metro
## 10 01015 AL Calhoun County 3 2 Metro
## # … with 3,265 more rows, and 86 more variables:
## # civilian_labor_force_2000 <dbl>, employed_2000 <dbl>,
## # unemployed_2000 <dbl>, unemployment_rate_2000 <dbl>,
## # civilian_labor_force_2001 <dbl>, employed_2001 <dbl>,
## # unemployed_2001 <dbl>, unemployment_rate_2001 <dbl>,
## # civilian_labor_force_2002 <dbl>, employed_2002 <dbl>,
## # unemployed_2002 <dbl>, unemployment_rate_2002 <dbl>, …
# Keep the aggregate rows only, i.e. those whose id ends in "000". Besides the
# state-level rows this also matches id "00000" (United States). Then retain
# the area name plus the columns whose names start with "unemployment".
clean_data4 <- select(
  filter(clean_data3, str_detect(id, "000$")),
  area_name,
  starts_with("unemployment")
)
clean_data4
## # A tibble: 53 × 22
## area_name unemployment_ra… unemployment_ra… unemployment_ra… unemployment_ra…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 United S… 3.99 4.74 5.79 5.98
## 2 Alabama 4.6 5.2 5.9 6
## 3 Alaska 6.3 6.3 7.2 7.7
## 4 Arizona 4 4.7 6.2 5.8
## 5 Arkansas 4.2 4.8 5.3 5.8
## 6 Californ… 4.9 5.5 6.8 6.9
## 7 Colorado 2.7 3.8 5.7 6.1
## 8 Connecti… 2.1 2.9 4.4 5.5
## 9 Delaware 3.6 3.6 4.3 4.5
## 10 District… 5.7 6.5 7 7.3
## # … with 43 more rows, and 17 more variables: unemployment_rate_2004 <dbl>,
## # unemployment_rate_2005 <dbl>, unemployment_rate_2006 <dbl>,
## # unemployment_rate_2007 <dbl>, unemployment_rate_2008 <dbl>,
## # unemployment_rate_2009 <dbl>, unemployment_rate_2010 <dbl>,
## # unemployment_rate_2011 <dbl>, unemployment_rate_2012 <dbl>,
## # unemployment_rate_2013 <dbl>, unemployment_rate_2014 <dbl>,
## # unemployment_rate_2015 <dbl>, unemployment_rate_2016 <dbl>, …
# Reshape from one column per year to one row per area and year: the
# "unemployment_rate_" prefix is stripped from the column names to leave the
# four-digit year, which is then stored as a number.
clean_data5 <- clean_data4 %>%
  pivot_longer(
    cols = starts_with("unemployment_rate_"),
    names_to = "year",
    names_prefix = "unemployment_rate_",
    values_to = "unemployment_rate"
  ) %>%
  mutate(year = as.numeric(year))
clean_data5
## # A tibble: 1,113 × 3
## area_name year unemployment_rate
## <chr> <dbl> <dbl>
## 1 United States 2000 3.99
## 2 United States 2001 4.74
## 3 United States 2002 5.79
## 4 United States 2003 5.98
## 5 United States 2004 5.53
## 6 United States 2005 5.11
## 7 United States 2006 4.63
## 8 United States 2007 4.62
## 9 United States 2008 5.79
## 10 United States 2009 9.25
## # … with 1,103 more rows
We first made an interactive plot displaying the trend of unemployment rate of US states from 2000 to 2020.
# Interactive line chart of unemployment rate by year, one line per area.
# The data are keyed by area_name so that highlight() can act on a single
# area's line.
highlight_key(clean_data5, ~area_name) %>%
  plot_ly(
    x = ~year,
    y = ~unemployment_rate,
    # one trace colour group per area
    color = ~area_name,
    # the palette is a single hex colour, shared by all lines
    colors = "#1f77b4"
  ) %>%
  # thin lines keep the many overlapping series readable
  add_lines(size = I(0.3)) %>%
  layout(
    title = list(
      text = "Trend of Unemployment Rate of US states from 2000 to 2020",
      y = 0.98
    ),
    xaxis = list(title = "Year"),
    yaxis = list(title = "Unemployment Rate (%)"),
    showlegend = FALSE
  ) %>%
  highlight()
From the above graph, we can see that the unemployment rates of all the states in the United States follow the same trend: the unemployment rate fluctuated at a relatively low level from 2000 to 2008. During the financial crisis from 2008 to 2009, the unemployment rate experienced a sharp increase in all the states. Then, it followed a decreasing trend until the COVID-19 pandemic impacted the economy in 2020, when the unemployment rate increased sharply again. However, strangely, during the pandemic, the unemployment rate of Puerto Rico did not follow the same trend as other states: instead of increasing, it decreased. This may be an artifact of our cleaning step rather than a real effect: the 2020 columns contain 82 missing values, which we replaced with the column mean, so any area whose 2020 rate was missing is plotted at the overall average instead of its true value.
We then draw a map of the US’s unemployment rate in 2020 to get an overview of the US’s most recent unemployment status. To make this map more detailed, we add the county data to our original dataset using left_join.
# County-level choropleth of the 2020 unemployment rate.
# Load the county polygon data (county_map).
data("county_map")
# Attach the unemployment columns to the polygon rows by county id; polygons
# without a matching id keep NA in the unemployment columns.
nation_map <- county_map %>%
  left_join(clean_data2, by = "id")
unemployment_map <- ggplot(
  nation_map,
  aes(x = long, y = lat, group = group, fill = unemployment_rate_2020)
) +
  # thin black county borders
  geom_polygon(color = "black", size = 0.06) +
  coord_equal() +
  labs(
    title = "2020 Unemployment Rates across the United States",
    fill = "Unemployment Rates"
  ) +
  theme_map() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(
      size = 15, family = "Georgia", vjust = -2, hjust = 0.5
    )
  ) +
  # continuous viridis scale (option "A") with its direction reversed
  scale_fill_viridis_c(option = "A", direction = -1)
unemployment_map
According to the map, we found that the unemployment rates in the middle of the US, around 0-5%, are lower than those of other counties. These low unemployment rate counties are in states like Nebraska, Kansas, South Dakota, and Iowa. In Nebraska, for example, this low unemployment rate may be due to a larger percentage of the rural population, abundant opportunities, and the close links between jobs and agriculture and food production. Counties in the west and the east mostly have an unemployment rate between 5-10%. A small number of counties have an unemployment rate between 10-15%. Only a few counties have an unemployment rate between 20 and 25%.
The next graph we draw is a boxplot showing the US unemployment rate for Metropolitan (Metro) and Non-metropolitan (Non-Metro) areas in 2020.
# Inspect the distribution of the 2020 unemployment rate (min, quartiles,
# mean, max) as the basis for the outlier check.
summary(clean_data3[["unemployment_rate_2020"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.700 5.300 6.600 6.754 8.000 22.500
# Outlier fences for the 2020 unemployment rate, following the 1.5 * IQR rule.
# The quartiles are computed from the data rather than copied by hand from the
# printed summary, so the fences stay correct if the data change. With the
# summary shown above (1st Qu. 5.3, 3rd Qu. 8.0) they come out at about 1.25
# and 12.05.
rate_quartiles <- quantile(
  clean_data3$unemployment_rate_2020,
  probs = c(0.25, 0.75),
  na.rm = TRUE,
  names = FALSE
)
# Stored under its own name so that stats::IQR() is not masked.
rate_iqr <- rate_quartiles[2] - rate_quartiles[1]
# lower fence: values below this are outliers
lwr <- rate_quartiles[1] - 1.5 * rate_iqr
# upper fence: values above this are outliers
upper <- rate_quartiles[2] + 1.5 * rate_iqr
# remove outliers
unemploy_2020 <- clean_data3 %>%
  # drop rows without a Metro/Non-Metro label (their metro_2013 is NA)
  filter(!is.na(metro_2013)) %>%
  # keep only rates inside both fences; with the current data nothing falls
  # below the lower fence (the minimum is 1.7), so only the upper fence bites
  filter(
    unemployment_rate_2020 >= lwr,
    unemployment_rate_2020 <= upper
  )
# Boxplot comparing the 2020 unemployment rate of Metro and Non-Metro areas.
unemployment_box <- unemploy_2020 %>%
  # guard against rows without a Metro/Non-Metro label
  filter(!is.na(metro_2013)) %>%
  ggplot(aes(x = metro_2013, y = unemployment_rate_2020)) +
  geom_boxplot() +
  labs(
    title = "Unemployment Rate of Metro and Non-Metro Counties in US in 2020",
    x = "Level of Development (Metro/ Non-Metro)",
    y = "Unemployment Rate in 2020"
  ) +
  theme(
    plot.title = element_text(
      size = 15, family = "Georgia", vjust = -2, hjust = 0.5
    ),
    # the y-axis tick labels and tick marks are hidden
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
unemployment_box
# Mean 2020 unemployment rate for each metro_2013 group, computed on the
# outlier-filtered data.
mean.unemp <- unemploy_2020 %>%
  filter(!is.na(metro_2013)) %>%
  group_by(metro_2013) %>%
  summarise(mean = mean(unemployment_rate_2020))
mean.unemp
## # A tibble: 2 × 2
## metro_2013 mean
## <fct> <dbl>
## 1 Metro 6.92
## 2 Non-Metro 6.38
After removing the outliers, we can see that the distributions of unemployment rates in metro and non-metro areas are very similar. The unemployment rates in metro areas range from around 3% to around 12%, with a mean of 6.918%. Meanwhile, the unemployment rates in non-metro areas range from 1.7% to around 12%, with a mean of 6.379%. Therefore, we can conclude that in 2020, metro areas on average had a slightly higher unemployment rate than non-metro areas.
We discovered that the unemployment rate in the U.S. fluctuated from 2000 to 2008, sharply increased in 2008 and 2009, kept decreasing from 2009 to 2019, and eventually sharply increased again in 2020 during the pandemic. In general, most states enter recession (a sharp increase in unemployment) together, counties in the middle of the US have a lower unemployment rate, and non-metro counties had a slightly lower unemployment rate than metro counties in 2020.