Set up

pacman::p_load(knitr, tidyverse, janitor,readxl,kableExtra,ggmap,maps,mapdata,RColorBrewer,colorRamps,usmap,socviz, stringr,ggthemes, plotly,imputeTS)

Import Dataset

We are using the Unemployment and Median Household income for the United States, States, and Counties, 2000-20 data set by USDA, Economic Research Service.

# load the unemployment workbook from the working directory
raw_data <- read_excel(path = "./Unemployment.xlsx")
# preview the first rows of the imported data
head(raw_data)
## # A tibble: 6 × 92
##   FIPS_Code State Area_name          Rural_urban_con… Urban_influence… Metro_2013
##   <chr>     <chr> <chr>                         <dbl>            <dbl>      <dbl>
## 1 00000     US    United States                    NA               NA         NA
## 2 01000     AL    Alabama                          NA               NA         NA
## 3 01001     AL    Autauga County, AL                2                2          1
## 4 01003     AL    Baldwin County, AL                3                2          1
## 5 01005     AL    Barbour County, AL                6                6          0
## 6 01007     AL    Bibb County, AL                   1                1          1
## # … with 86 more variables: Civilian_labor_force_2000 <dbl>,
## #   Employed_2000 <dbl>, Unemployed_2000 <dbl>, Unemployment_rate_2000 <dbl>,
## #   Civilian_labor_force_2001 <dbl>, Employed_2001 <dbl>,
## #   Unemployed_2001 <dbl>, Unemployment_rate_2001 <dbl>,
## #   Civilian_labor_force_2002 <dbl>, Employed_2002 <dbl>,
## #   Unemployed_2002 <dbl>, Unemployment_rate_2002 <dbl>,
## #   Civilian_labor_force_2003 <dbl>, Employed_2003 <dbl>, …

Exploration of Dataset

First of all, we need to have an overview of the data set Unemployment as a basis of further analysis.

# print summary statistics (quartiles, mean, NA counts) for every column
summary(raw_data)
##   FIPS_Code            State            Area_name        
##  Length:3275        Length:3275        Length:3275       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  Rural_urban_continuum_code_2013 Urban_influence_code_2013   Metro_2013   
##  Min.   :1.000                   Min.   : 1.00             Min.   :0.000  
##  1st Qu.:2.000                   1st Qu.: 2.00             1st Qu.:0.000  
##  Median :6.000                   Median : 5.00             Median :0.000  
##  Mean   :4.938                   Mean   : 5.19             Mean   :0.383  
##  3rd Qu.:7.000                   3rd Qu.: 8.00             3rd Qu.:1.000  
##  Max.   :9.000                   Max.   :12.00             Max.   :1.000  
##  NA's   :56                      NA's   :56                NA's   :53     
##  Civilian_labor_force_2000 Employed_2000       Unemployed_2000  
##  Min.   :       49         Min.   :       45   Min.   :      4  
##  1st Qu.:     5205         1st Qu.:     4953   1st Qu.:    235  
##  Median :    11875         Median :    11373   Median :    555  
##  Mean   :   131617         Mean   :   126311   Mean   :   5306  
##  3rd Qu.:    31523         3rd Qu.:    30000   3rd Qu.:   1377  
##  Max.   :142601576         Max.   :136904853   Max.   :5696723  
##  NA's   :5                 NA's   :5           NA's   :5        
##  Unemployment_rate_2000 Civilian_labor_force_2001 Employed_2001      
##  Min.   : 1.300         Min.   :       48         Min.   :       45  
##  1st Qu.: 3.200         1st Qu.:     5274         1st Qu.:     4935  
##  Median : 4.100         Median :    12006         Median :    11348  
##  Mean   : 4.541         Mean   :   132693         Mean   :   126358  
##  3rd Qu.: 5.300         3rd Qu.:    31425         3rd Qu.:    30104  
##  Max.   :17.300         Max.   :143786537         Max.   :136977996  
##  NA's   :5              NA's   :5                 NA's   :5          
##  Unemployed_2001   Unemployment_rate_2001 Civilian_labor_force_2002
##  Min.   :      3   Min.   : 1.600         Min.   :       48        
##  1st Qu.:    273   1st Qu.: 3.800         1st Qu.:     5229        
##  Median :    651   Median : 4.800         Median :    12190        
##  Mean   :   6335   Mean   : 5.233         Mean   :   133688        
##  3rd Qu.:   1633   3rd Qu.: 6.100         3rd Qu.:    31868        
##  Max.   :6808541   Max.   :18.600         Max.   :144839298        
##  NA's   :5         NA's   :5              NA's   :5                
##  Employed_2002       Unemployed_2002   Unemployment_rate_2002
##  Min.   :       45   Min.   :      3   Min.   : 1.600        
##  1st Qu.:     4917   1st Qu.:    310   1st Qu.: 4.500        
##  Median :    11364   Median :    755   Median : 5.600        
##  Mean   :   125898   Mean   :   7790   Mean   : 5.981        
##  3rd Qu.:    29910   3rd Qu.:   1909   3rd Qu.: 6.900        
##  Max.   :136455783   Max.   :8383515   Max.   :19.900        
##  NA's   :5           NA's   :5         NA's   :5             
##  Civilian_labor_force_2003 Employed_2003       Unemployed_2003  
##  Min.   :       50         Min.   :       45   Min.   :      5  
##  1st Qu.:     5274         1st Qu.:     4921   1st Qu.:    333  
##  Median :    12308         Median :    11416   Median :    790  
##  Mean   :   134456         Mean   :   126362   Mean   :   8095  
##  3rd Qu.:    32208         3rd Qu.:    30193   3rd Qu.:   2022  
##  Max.   :145660094         Max.   :136944522   Max.   :8715572  
##  NA's   :5                 NA's   :5           NA's   :5        
##  Unemployment_rate_2003 Civilian_labor_force_2004 Employed_2004      
##  Min.   : 1.900         Min.   :       58         Min.   :       54  
##  1st Qu.: 4.700         1st Qu.:     5240         1st Qu.:     4911  
##  Median : 5.800         Median :    12288         Median :    11488  
##  Mean   : 6.226         Mean   :   135434         Mean   :   127905  
##  3rd Qu.: 7.300         3rd Qu.:    32314         3rd Qu.:    30505  
##  Max.   :20.200         Max.   :146724795         Max.   :138613904  
##  NA's   :5              NA's   :5                 NA's   :5          
##  Unemployed_2004   Unemployment_rate_2004 Civilian_labor_force_2005
##  Min.   :      4   Min.   : 1.600         Min.   :       39        
##  1st Qu.:    311   1st Qu.: 4.500         1st Qu.:     5264        
##  Median :    744   Median : 5.500         Median :    12354        
##  Mean   :   7528   Mean   : 5.879         Mean   :   137285        
##  3rd Qu.:   1926   3rd Qu.: 6.700         3rd Qu.:    32506        
##  Max.   :8110891   Max.   :20.200         Max.   :148597241        
##  NA's   :5         NA's   :5              NA's   :12               
##  Employed_2005       Unemployed_2005   Unemployment_rate_2005
##  Min.   :       35   Min.   :      4   Min.   : 2.000        
##  1st Qu.:     4939   1st Qu.:    298   1st Qu.: 4.200        
##  Median :    11515   Median :    726   Median : 5.200        
##  Mean   :   130220   Mean   :   7066   Mean   : 5.672        
##  3rd Qu.:    30808   3rd Qu.:   1866   3rd Qu.: 6.500        
##  Max.   :141000912   Max.   :7596329   Max.   :21.000        
##  NA's   :12          NA's   :12        NA's   :12            
##  Civilian_labor_force_2006 Employed_2006       Unemployed_2006  
##  Min.   :       38         Min.   :       34   Min.   :      4  
##  1st Qu.:     5302         1st Qu.:     5020   1st Qu.:    268  
##  Median :    12443         Median :    11708   Median :    672  
##  Mean   :   139280         Mean   :   132779   Mean   :   6501  
##  3rd Qu.:    32894         3rd Qu.:    31500   3rd Qu.:   1738  
##  Max.   :150707773         Max.   :143729350   Max.   :6978423  
##  NA's   :12                NA's   :12          NA's   :12       
##  Unemployment_rate_2006 Civilian_labor_force_2007 Employed_2007      
##  Min.   : 1.600         Min.   :       41         Min.   :       38  
##  1st Qu.: 3.800         1st Qu.:     5279         1st Qu.:     5015  
##  Median : 4.800         Median :    12374         Median :    11714  
##  Mean   : 5.152         Mean   :   140473         Mean   :   133924  
##  3rd Qu.: 5.900         3rd Qu.:    33119         3rd Qu.:    31559  
##  Max.   :20.700         Max.   :152191050         Max.   :145156133  
##  NA's   :12             NA's   :5                 NA's   :5          
##  Unemployed_2007   Unemployment_rate_2007 Civilian_labor_force_2008
##  Min.   :      3   Min.   : 1.400         Min.   :       43        
##  1st Qu.:    265   1st Qu.: 3.700         1st Qu.:     5346        
##  Median :    656   Median : 4.700         Median :    12456        
##  Mean   :   6549   Mean   : 5.106         Mean   :   141888        
##  3rd Qu.:   1740   3rd Qu.: 5.900         3rd Qu.:    33397        
##  Max.   :7034917   Max.   :20.200         Max.   :153761037        
##  NA's   :5         NA's   :5              NA's   :5                
##  Employed_2008       Unemployed_2008   Unemployment_rate_2008
##  Min.   :       40   Min.   :      3   Min.   : 1.300        
##  1st Qu.:     5003   1st Qu.:    311   1st Qu.: 4.400        
##  Median :    11676   Median :    795   Median : 5.700        
##  Mean   :   133626   Mean   :   8263   Mean   : 6.033        
##  3rd Qu.:    31504   3rd Qu.:   2074   3rd Qu.: 7.200        
##  Max.   :144860349   Max.   :8900688   Max.   :22.600        
##  NA's   :5           NA's   :5         NA's   :5             
##  Civilian_labor_force_2009 Employed_2009       Unemployed_2009   
##  Min.   :       43         Min.   :       39   Min.   :       4  
##  1st Qu.:     5343         1st Qu.:     4863   1st Qu.:     491  
##  Median :    12499         Median :    11245   Median :    1246  
##  Mean   :   141920         Mean   :   128743   Mean   :   13178  
##  3rd Qu.:    33354         3rd Qu.:    30528   3rd Qu.:    3226  
##  Max.   :153825454         Max.   :139594699   Max.   :14230755  
##  NA's   :5                 NA's   :5           NA's   :5         
##  Unemployment_rate_2009 Civilian_labor_force_2010 Employed_2010      
##  Min.   : 2.000         Min.   :       71         Min.   :       67  
##  1st Qu.: 6.900         1st Qu.:     5244         1st Qu.:     4735  
##  Median : 8.900         Median :    12117         Median :    10875  
##  Mean   : 9.286         Mean   :   142207         Mean   :   128455  
##  3rd Qu.:11.200         3rd Qu.:    33518         3rd Qu.:    30170  
##  Max.   :28.300         Max.   :154254521         Max.   :139393814  
##  NA's   :5              NA's   :3                 NA's   :3          
##  Unemployed_2010    Unemployment_rate_2010 Civilian_labor_force_2011
##  Min.   :       4   Min.   : 2.000         Min.   :       66        
##  1st Qu.:     489   1st Qu.: 7.300         1st Qu.:     5211        
##  Median :    1258   Median : 9.300         Median :    12002        
##  Mean   :   13752   Mean   : 9.568         Mean   :   142421        
##  3rd Qu.:    3296   3rd Qu.:11.425         3rd Qu.:    33426        
##  Max.   :14860707   Max.   :29.400         Max.   :154520678        
##  NA's   :3          NA's   :3              NA's   :3                
##  Employed_2011       Unemployed_2011    Unemployment_rate_2011
##  Min.   :       62   Min.   :       4   Min.   : 1.40         
##  1st Qu.:     4720   1st Qu.:     455   1st Qu.: 6.70         
##  Median :    10856   Median :    1160   Median : 8.60         
##  Mean   :   129620   Mean   :   12801   Mean   : 8.91         
##  3rd Qu.:    30580   3rd Qu.:    3021   3rd Qu.:10.60         
##  Max.   :140688861   Max.   :13831817   Max.   :29.30         
##  NA's   :3           NA's   :3          NA's   :3             
##  Civilian_labor_force_2012 Employed_2012       Unemployed_2012   
##  Min.   :       67         Min.   :       63   Min.   :       4  
##  1st Qu.:     5176         1st Qu.:     4744   1st Qu.:     398  
##  Median :    11982         Median :    10954   Median :    1040  
##  Mean   :   142880         Mean   :   131303   Mean   :   11577  
##  3rd Qu.:    33219         3rd Qu.:    30568   3rd Qu.:    2687  
##  Max.   :155038121         Max.   :142527201   Max.   :12510920  
##  NA's   :3                 NA's   :3           NA's   :3         
##  Unemployment_rate_2012 Civilian_labor_force_2013 Employed_2013      
##  Min.   : 1.100         Min.   :       75         Min.   :       71  
##  1st Qu.: 5.900         1st Qu.:     5131         1st Qu.:     4688  
##  Median : 7.700         Median :    11896         Median :    10832  
##  Mean   : 8.029         Mean   :   143160         Mean   :   132553  
##  3rd Qu.: 9.600         3rd Qu.:    32937         3rd Qu.:    30548  
##  Max.   :27.700         Max.   :155362278         Max.   :143905037  
##  NA's   :3              NA's   :3                 NA's   :3          
##  Unemployed_2013    Unemployment_rate_2013 Civilian_labor_force_2014
##  Min.   :       4   Min.   : 1.20          Min.   :       78        
##  1st Qu.:     375   1st Qu.: 5.50          1st Qu.:     5054        
##  Median :     950   Median : 7.20          Median :    11708        
##  Mean   :   10607   Mean   : 7.58          Mean   :   143665        
##  3rd Qu.:    2481   3rd Qu.: 9.00          3rd Qu.:    32640        
##  Max.   :11457241   Max.   :27.40          Max.   :155936159        
##  NA's   :3          NA's   :3              NA's   :3                
##  Employed_2014       Unemployed_2014   Unemployment_rate_2014
##  Min.   :       74   Min.   :      4   Min.   : 1.200        
##  1st Qu.:     4693   1st Qu.:    322   1st Qu.: 4.700        
##  Median :    10852   Median :    818   Median : 6.100        
##  Mean   :   134751   Mean   :   8914   Mean   : 6.474        
##  3rd Qu.:    30771   3rd Qu.:   2127   3rd Qu.: 7.600        
##  Max.   :146318952   Max.   :9617207   Max.   :26.400        
##  NA's   :3           NA's   :3         NA's   :3             
##  Civilian_labor_force_2015 Employed_2015       Unemployed_2015  
##  Min.   :       77         Min.   :       73   Min.   :      4  
##  1st Qu.:     5009         1st Qu.:     4716   1st Qu.:    284  
##  Median :    11616         Median :    10914   Median :    706  
##  Mean   :   144487         Mean   :   136808   Mean   :   7679  
##  3rd Qu.:    32744         3rd Qu.:    30849   3rd Qu.:   1877  
##  Max.   :156840649         Max.   :148554918   Max.   :8285731  
##  NA's   :3                 NA's   :3           NA's   :3        
##  Unemployment_rate_2015 Civilian_labor_force_2016 Employed_2016      
##  Min.   : 1.800         Min.   :       86         Min.   :       82  
##  1st Qu.: 4.200         1st Qu.:     5049         1st Qu.:     4718  
##  Median : 5.300         Median :    11705         Median :    11038  
##  Mean   : 5.729         Mean   :   146167         Mean   :   139003  
##  3rd Qu.: 6.600         3rd Qu.:    33158         3rd Qu.:    31338  
##  Max.   :24.600         Max.   :158674951         Max.   :150949349  
##  NA's   :3              NA's   :3                 NA's   :3          
##  Unemployed_2016   Unemployment_rate_2016 Civilian_labor_force_2017
##  Min.   :      4   Min.   : 1.600         Min.   :      100        
##  1st Qu.:    270   1st Qu.: 4.000         1st Qu.:     5031        
##  Median :    656   Median : 5.000         Median :    11726        
##  Mean   :   7164   Mean   : 5.427         Mean   :   147566        
##  3rd Qu.:   1764   3rd Qu.: 6.200         3rd Qu.:    33202        
##  Max.   :7725602   Max.   :24.200         Max.   :160214378        
##  NA's   :3         NA's   :3              NA's   :3                
##  Employed_2017       Unemployed_2017   Unemployment_rate_2017
##  Min.   :       95   Min.   :      5   Min.   : 1.500        
##  1st Qu.:     4739   1st Qu.:    231   1st Qu.: 3.500        
##  Median :    11160   Median :    570   Median : 4.400        
##  Mean   :   141096   Mean   :   6470   Mean   : 4.792        
##  3rd Qu.:    31726   3rd Qu.:   1554   3rd Qu.: 5.400        
##  Max.   :153237150   Max.   :6977228   Max.   :20.600        
##  NA's   :3           NA's   :3         NA's   :3             
##  Civilian_labor_force_2018 Employed_2018       Unemployed_2018  
##  Min.   :      211         Min.   :      205   Min.   :      4  
##  1st Qu.:     5030         1st Qu.:     4778   1st Qu.:    208  
##  Median :    11773         Median :    11232   Median :    510  
##  Mean   :   148682         Mean   :   142855   Mean   :   5827  
##  3rd Qu.:    33332         3rd Qu.:    31959   3rd Qu.:   1373  
##  Max.   :161441134         Max.   :155152550   Max.   :6288584  
##  NA's   :3                 NA's   :3           NA's   :3        
##  Unemployment_rate_2018 Civilian_labor_force_2019 Employed_2019      
##  Min.   : 1.200         Min.   :      228         Min.   :      216  
##  1st Qu.: 3.100         1st Qu.:     4993         1st Qu.:     4776  
##  Median : 3.900         Median :    11856         Median :    11336  
##  Mean   : 4.272         Mean   :   150245         Mean   :   144702  
##  3rd Qu.: 4.900         3rd Qu.:    33511         3rd Qu.:    32190  
##  Max.   :18.800         Max.   :163140305         Max.   :157154185  
##  NA's   :3              NA's   :3                 NA's   :3          
##  Unemployed_2019   Unemployment_rate_2019 Civilian_labor_force_2020
##  Min.   :      4   Min.   : 0.800         Min.   :      184        
##  1st Qu.:    199   1st Qu.: 3.000         1st Qu.:     4915        
##  Median :    490   Median : 3.700         Median :    11933        
##  Mean   :   5544   Mean   : 4.088         Mean   :   150903        
##  3rd Qu.:   1311   3rd Qu.: 4.600         3rd Qu.:    34360        
##  Max.   :5986120   Max.   :20.900         Max.   :160611064        
##  NA's   :3         NA's   :3              NA's   :82               
##  Employed_2020       Unemployed_2020    Unemployment_rate_2020
##  Min.   :      174   Min.   :       4   Min.   : 1.700        
##  1st Qu.:     4585   1st Qu.:     301   1st Qu.: 5.200        
##  Median :    11147   Median :     795   Median : 6.500        
##  Mean   :   138751   Mean   :   12152   Mean   : 6.754        
##  3rd Qu.:    31790   3rd Qu.:    2398   3rd Qu.: 8.000        
##  Max.   :147677360   Max.   :12933704   Max.   :22.500        
##  NA's   :82          NA's   :82         NA's   :82            
##  Median_Household_Income_2019 Med_HH_Income_Percent_of_State_Total_2019
##  Min.   : 24732               Min.   : 39.92                           
##  1st Qu.: 46309               1st Qu.: 76.52                           
##  Median : 53505               Median : 87.13                           
##  Mean   : 55875               Mean   : 89.63                           
##  3rd Qu.: 62327               3rd Qu.:100.00                           
##  Max.   :151806               Max.   :234.52                           
##  NA's   :82                   NA's   :83

The data set Unemployment, as summarized above, shows the unemployment and median household income of the whole United States, its states, and its counties from 2000 to 2020. There are 3275 rows and 92 columns in total. Each state has one summary observation for the whole state and multiple observations for the individual areas within it, covering 2000 to 2020. For example, Alabama has one observation for the state as a whole and multiple observations for specific areas inside Alabama, such as Autauga County and Baldwin County.

In this data set, each year has four variables that describe the employment condition; together they span 2000 to 2020 and take up 84 columns in total. These four variables are: Civilian_labor_force_year (civilian labor force annual average for that year), Employed_year (number employed annual average for that year), Unemployed_year (number unemployed annual average for that year), and Unemployment_rate_year (the unemployment rate for that year). These four variables are all in the double class. Besides, there are 3 variables in the character class: FIPS_Code (state-county FIPS code), State (state abbreviation), and Area_name (state or county name).

There are two variables named Rural_urban_continuum_code_2013 and Urban_influence_code_2013, which are classification schemes that distinguish metropolitan (metro) counties by the population size of their metro area, and non-metropolitan (non-metro) counties by the degree of urbanization and adjacency to a metro area or areas. These two variables are recorded as double, but they should be categorical as each number represents a different category. Metro_2013 shows whether the area is metro or non-metro, where 1 represents metro and 0 non-metro. This variable should be a factor but is stored in this data set as a double. The variable Median_Household_Income_2019 shows the median of household income in 2019 and is stored as a double. The variable Med_HH_Income_Percent_of_State_Total_2019 shows the county household median income as a percent of the State total median household income in 2019 and is also stored as a double. The missing values are stored as NA. Below is a summary of each variable in the data set Unemployment.

Data Dictionary

This is the data dictionary for this data set.

# load the data dictionary workbook from the working directory
data_dictionary <- read_excel(path = "./Data Dictionary.xlsx")
# render NA cells as empty strings in kable output
options(knitr.kable.NA = "")
# print the data dictionary as a styled kable table
kable_styling(kbl(data_dictionary))
Variable Description
FIPS_Code State-county FIPS code
State State abbreviation
Area_name State or county name
Rural_urban_continuum_code_2013 Rural-urban Continuum Code, 2013
Urban_influence_code_2013 Urban Influence Code, 2013
Metro_2013 Metro nonmetro dummy 0=Nonmetro 1=Metro (Based on 2013 OMB Metropolitan Area delineation)
Civilian_labor_force Civilian labor force annual average
Employed Number employed annual average
Unemployed Number unemployed annual average
Unemployment_rate Unemployment rate
Median_Household_Income_2019 Estimate of median household Income, 2019
Med_HH_Income_Percent_of_State_Total_2019 County household median income as a percent of the State total median household income, 2019
Data sources: Rural classifications: USDA, Economic Research Service.
Labor force variables: U.S. Department of Labor, Bureau of Labor Statistics, Local Area Unemployment Statistics (LAUS).
Median household Income: U.S. Department of Commerce, Bureau of the Census, Small Area Income and Poverty Estimates (SAIPE) Program.

Research Questions

The research questions we intend to answer with the data set are (1) how the unemployment rate changes over time in the United States and (2) how the 2020 unemployment status varies across different regions of the U.S. To answer the first question, we make an interactive plot displaying the trend of each state’s unemployment rate from 2000 to 2020. To answer the second question, we analyze the unemployment rate in 2020 across counties using an informational map. Then, to see how the level of development influences the unemployment rate, we create a boxplot to visualize the distribution of unemployment rates in metro and non-metro areas.

Cleaning Methods

To answer the questions of our research, we need to perform the following cleaning tasks:

  1. Use a regular expression to remove the state abbreviations that follow the county names in the variable Area_name.
  2. Clean the variable names: we need to standardize all variable names by converting them into snake-case.
  3. We need to reshape the table: the format of the table is not tidy. We need to reshape it and, for example, make “year” a variable.
  4. We need to recode the 1 and 0 in metro_2013 into “Metro” and “Non-Metro” and convert the variable to a factor.
  5. We need to identify the NA values and replace them with the column mean.

Data Cleaning

# standardise the variable names to snake_case
clean_data1 <- raw_data %>%
  clean_names()
# replace the NA values in each numeric column with that column's mean;
# na_mean() is the current imputeTS name for the deprecated na.mean()
# NOTE(review): this also fills the categorical code columns
# (rural_urban_continuum_code_2013, urban_influence_code_2013, metro_2013)
# with non-integer means -- confirm that this is intended
clean_data1 <- na_mean(clean_data1)
clean_data1
## # A tibble: 3,275 × 92
##    fips_code state area_name   rural_urban_continu… urban_influence_… metro_2013
##    <chr>     <chr> <chr>                      <dbl>             <dbl>      <dbl>
##  1 00000     US    United Sta…                 4.94              5.19      0.383
##  2 01000     AL    Alabama                     4.94              5.19      0.383
##  3 01001     AL    Autauga Co…                 2                 2         1    
##  4 01003     AL    Baldwin Co…                 3                 2         1    
##  5 01005     AL    Barbour Co…                 6                 6         0    
##  6 01007     AL    Bibb Count…                 1                 1         1    
##  7 01009     AL    Blount Cou…                 1                 1         1    
##  8 01011     AL    Bullock Co…                 6                 6         0    
##  9 01013     AL    Butler Cou…                 6                 6         0    
## 10 01015     AL    Calhoun Co…                 3                 2         1    
## # … with 3,265 more rows, and 86 more variables:
## #   civilian_labor_force_2000 <dbl>, employed_2000 <dbl>,
## #   unemployed_2000 <dbl>, unemployment_rate_2000 <dbl>,
## #   civilian_labor_force_2001 <dbl>, employed_2001 <dbl>,
## #   unemployed_2001 <dbl>, unemployment_rate_2001 <dbl>,
## #   civilian_labor_force_2002 <dbl>, employed_2002 <dbl>,
## #   unemployed_2002 <dbl>, unemployment_rate_2002 <dbl>, …
# rename the FIPS column to `id` (the key used when joining with the map
# data) and strip the ", <state>" suffix that follows each county name
clean_data2 <- clean_data1 %>%
  rename(id = fips_code) %>%
  mutate(area_name = gsub("(.*),.*", "\\1", area_name))
clean_data2
## # A tibble: 3,275 × 92
##    id    state area_name      rural_urban_continu… urban_influence_c… metro_2013
##    <chr> <chr> <chr>                         <dbl>              <dbl>      <dbl>
##  1 00000 US    United States                  4.94               5.19      0.383
##  2 01000 AL    Alabama                        4.94               5.19      0.383
##  3 01001 AL    Autauga County                 2                  2         1    
##  4 01003 AL    Baldwin County                 3                  2         1    
##  5 01005 AL    Barbour County                 6                  6         0    
##  6 01007 AL    Bibb County                    1                  1         1    
##  7 01009 AL    Blount County                  1                  1         1    
##  8 01011 AL    Bullock County                 6                  6         0    
##  9 01013 AL    Butler County                  6                  6         0    
## 10 01015 AL    Calhoun County                 3                  2         1    
## # … with 3,265 more rows, and 86 more variables:
## #   civilian_labor_force_2000 <dbl>, employed_2000 <dbl>,
## #   unemployed_2000 <dbl>, unemployment_rate_2000 <dbl>,
## #   civilian_labor_force_2001 <dbl>, employed_2001 <dbl>,
## #   unemployed_2001 <dbl>, unemployment_rate_2001 <dbl>,
## #   civilian_labor_force_2002 <dbl>, employed_2002 <dbl>,
## #   unemployed_2002 <dbl>, unemployment_rate_2002 <dbl>, …
# recode the numeric metro_2013 flag (1 / 0) as a labelled factor; values
# other than 1 or 0 become NA
clean_data3 <- mutate(
  clean_data2,
  metro_2013 = factor(
    metro_2013,
    levels = c(1, 0),
    labels = c("Metro", "Non-Metro")
  )
)
clean_data3
## # A tibble: 3,275 × 92
##    id    state area_name      rural_urban_continu… urban_influence_c… metro_2013
##    <chr> <chr> <chr>                         <dbl>              <dbl> <fct>     
##  1 00000 US    United States                  4.94               5.19 <NA>      
##  2 01000 AL    Alabama                        4.94               5.19 <NA>      
##  3 01001 AL    Autauga County                 2                  2    Metro     
##  4 01003 AL    Baldwin County                 3                  2    Metro     
##  5 01005 AL    Barbour County                 6                  6    Non-Metro 
##  6 01007 AL    Bibb County                    1                  1    Metro     
##  7 01009 AL    Blount County                  1                  1    Metro     
##  8 01011 AL    Bullock County                 6                  6    Non-Metro 
##  9 01013 AL    Butler County                  6                  6    Non-Metro 
## 10 01015 AL    Calhoun County                 3                  2    Metro     
## # … with 3,265 more rows, and 86 more variables:
## #   civilian_labor_force_2000 <dbl>, employed_2000 <dbl>,
## #   unemployed_2000 <dbl>, unemployment_rate_2000 <dbl>,
## #   civilian_labor_force_2001 <dbl>, employed_2001 <dbl>,
## #   unemployed_2001 <dbl>, unemployment_rate_2001 <dbl>,
## #   civilian_labor_force_2002 <dbl>, employed_2002 <dbl>,
## #   unemployed_2002 <dbl>, unemployment_rate_2002 <dbl>, …
# keep only the rows whose id ends in "000" (the national and state-level
# totals), then retain the area name and the yearly unemployment-rate columns
clean_data4 <- clean_data3 %>%
  filter(grepl("000$", id)) %>%
  select(area_name, starts_with("unemployment"))
clean_data4
## # A tibble: 53 × 22
##    area_name unemployment_ra… unemployment_ra… unemployment_ra… unemployment_ra…
##    <chr>                <dbl>            <dbl>            <dbl>            <dbl>
##  1 United S…             3.99             4.74             5.79             5.98
##  2 Alabama               4.6              5.2              5.9              6   
##  3 Alaska                6.3              6.3              7.2              7.7 
##  4 Arizona               4                4.7              6.2              5.8 
##  5 Arkansas              4.2              4.8              5.3              5.8 
##  6 Californ…             4.9              5.5              6.8              6.9 
##  7 Colorado              2.7              3.8              5.7              6.1 
##  8 Connecti…             2.1              2.9              4.4              5.5 
##  9 Delaware              3.6              3.6              4.3              4.5 
## 10 District…             5.7              6.5              7                7.3 
## # … with 43 more rows, and 17 more variables: unemployment_rate_2004 <dbl>,
## #   unemployment_rate_2005 <dbl>, unemployment_rate_2006 <dbl>,
## #   unemployment_rate_2007 <dbl>, unemployment_rate_2008 <dbl>,
## #   unemployment_rate_2009 <dbl>, unemployment_rate_2010 <dbl>,
## #   unemployment_rate_2011 <dbl>, unemployment_rate_2012 <dbl>,
## #   unemployment_rate_2013 <dbl>, unemployment_rate_2014 <dbl>,
## #   unemployment_rate_2015 <dbl>, unemployment_rate_2016 <dbl>, …
# reshape to long format: one row per area and year, with the year taken
# from the last four characters of the original column name
clean_data5 <- clean_data4 %>%
  pivot_longer(
    cols = unemployment_rate_2000:unemployment_rate_2020,
    names_to = "year",
    values_to = "unemployment_rate"
  ) %>%
  mutate(year = as.numeric(str_sub(year, start = -4, end = -1)))
clean_data5
## # A tibble: 1,113 × 3
##    area_name      year unemployment_rate
##    <chr>         <dbl>             <dbl>
##  1 United States  2000              3.99
##  2 United States  2001              4.74
##  3 United States  2002              5.79
##  4 United States  2003              5.98
##  5 United States  2004              5.53
##  6 United States  2005              5.11
##  7 United States  2006              4.63
##  8 United States  2007              4.62
##  9 United States  2008              5.79
## 10 United States  2009              9.25
## # … with 1,103 more rows

Data Visualization

Trend of Unemployment Rate of US states from 2000 to 2020

We first made an interactive plot displaying the trend of unemployment rate of US states from 2000 to 2020.

# interactive line chart of each area's unemployment rate over time
clean_data5 %>%
  # key the data by area so that highlight() can emphasise one line at a time
  highlight_key(~area_name) %>%
  plot_ly(
    x = ~year,              # year on the x-axis
    y = ~unemployment_rate, # unemployment rate on the y-axis
    color = ~area_name,     # one line per area
    colors = "#1f77b4"      # draw every line in the same colour (hex code)
  ) %>%
  # thin lines keep the many overlapping series readable
  add_lines(size = I(0.3)) %>%
  layout(
    showlegend = FALSE,
    title = list(
      text = "Trend of Unemployment Rate of US states from 2000 to 2020",
      y = 0.98
    ),
    xaxis = list(title = "Year"),
    yaxis = list(title = "Unemployment Rate (%)")
  ) %>%
  highlight()

From the above graph, we can see that the unemployment rates of all the states in the United States follow the same trend: the unemployment rate fluctuated at a relatively low level from 2000 to 2008. During the financial crisis from 2008 to 2009, the unemployment rate increased sharply in all the states. It then followed a decreasing trend until the COVID-19 pandemic hit the economy in 2020, when the unemployment rate increased sharply again. Strangely, however, the unemployment rate of Puerto Rico did not follow the same trend as the states during the pandemic: instead of increasing, it decreased.

Map of Unemployment Rate across US counties in 2020

We then draw a map of the US’s unemployment rate in 2020 to get an overview of the US’s most recent unemployment status. To make this map more detailed, we add the county data to our original dataset using left_join.

# county-level map of the 2020 unemployment rate across the United States
data("county_map")
# attach the unemployment data to the county polygons through the shared `id`
nation_map <- county_map %>%
  left_join(clean_data2, by = "id")
unemployment_map <- ggplot(
  nation_map,
  aes(x = long, y = lat, group = group, fill = unemployment_rate_2020)
) +
  # thin black county borders; the fill uses a continuous colour scale
  geom_polygon(color = "black", size = 0.06) +
  coord_equal() +
  scale_fill_viridis_c(option = "A", direction = -1) +
  labs(
    title = "2020 Unemployment Rates across the United States",
    fill = "Unemployment Rates"
  ) +
  theme_map() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(
      size = 15, family = "Georgia", vjust = -2, hjust = 0.5
    )
  )
unemployment_map

According to the map, the unemployment rates of counties in the middle of the US, at around 0-5%, are lower than those of other counties. These low-unemployment counties are in states like Nebraska, Kansas, South Dakota, and Iowa. In Nebraska, for example, this low unemployment rate may be due to a larger percentage of rural population, abundant opportunities, and the close links between jobs and agriculture and food production. Counties in the west and the east mostly have an unemployment rate between 5% and 10%. A small number of counties have an unemployment rate between 10% and 15%. Only a few counties have an unemployment rate between 20% and 25%.

Distribution of Unemployment rate in metro and non-metro areas in 2020

The next graph we draw is a boxplot showing the US unemployment rate for Metropolitan (Metro) and Non-metropolitan (Non-Metro) areas in 2020.

# Check for outliers in the 2020 unemployment rate using the 1.5 * IQR rule
summary(clean_data3$unemployment_rate_2020)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.700   5.300   6.600   6.754   8.000  22.500
# Derive the quartiles from the data rather than retyping them from the printed
# summary, so the fences stay correct if the underlying data change.
rate_quartiles <- quantile(
  clean_data3$unemployment_rate_2020,
  probs = c(0.25, 0.75),
  na.rm = TRUE,
  names = FALSE
)
# Named iqr_2020 (not IQR) so that stats::IQR() is not shadowed
iqr_2020 <- rate_quartiles[2] - rate_quartiles[1]
# Lower fence: 1.25 for the summary printed above; values below it are outliers
lwr <- rate_quartiles[1] - 1.5 * iqr_2020
# Upper fence: 12.05 for the summary printed above; values above it are outliers
upper <- rate_quartiles[2] + 1.5 * iqr_2020

# Remove outliers
unemploy_2020 <- clean_data3 %>%
  # drop rows that have no metro / non-metro classification
  filter(!is.na(metro_2013)) %>%
  # keep only rates inside both fences; with the summary above the minimum (1.7)
  # is already above the lower fence, so only the upper fence removes rows
  filter(unemployment_rate_2020 >= lwr, unemployment_rate_2020 <= upper)

# Boxplot of the outlier-trimmed 2020 rate for each metro / non-metro group.
# unemploy_2020 already excludes missing metro_2013, so no second NA filter
# is needed here.
unemployment_box <- unemploy_2020 %>%
  ggplot(aes(x = metro_2013, y = unemployment_rate_2020)) +
  labs(
    x = "Level of Development (Metro/ Non-Metro)",
    y = "Unemployment Rate in 2020"
  ) +
  ggtitle("Unemployment Rate of Metro and Non-Metro Counties in US in 2020") +
  theme(
    plot.title = element_text(
      size = 15, family = "Georgia", vjust = -2, hjust = 0.5
    )
  ) +
  # NOTE(review): this hides the y-axis tick labels, so the rate values cannot
  # be read off the plot -- confirm this is intended
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  geom_boxplot()

unemployment_box

Calculate Mean

# Average 2020 unemployment rate within each metro / non-metro group
mean.unemp <- unemploy_2020 %>%
  filter(!is.na(metro_2013)) %>%
  group_by(metro_2013) %>%
  summarise(mean = mean(unemployment_rate_2020))
mean.unemp
## # A tibble: 2 × 2
##   metro_2013  mean
##   <fct>      <dbl>
## 1 Metro       6.92
## 2 Non-Metro   6.38

After removing the outliers, we can see the distributions of unemployment rates in metro and non-metro areas are very similar. The unemployment rates in metro areas range from around 3% to around 12%, with a mean of 6.918%. Meanwhile, the unemployment rates in non-metro areas range from 1.7% to around 12%, with a mean of 6.379%. Therefore, we can conclude that in 2020, metro areas on average had a slightly higher unemployment rate than non-metro areas.

Conclusion

We discovered that the unemployment rate in the U.S. fluctuated from 2000 to 2008, increased sharply in 2008 and 2009, kept decreasing from 2009 to 2019, and eventually increased sharply again in 2020 during the pandemic. In general, most states enter recessions (sharp increases in unemployment) together, counties in the middle of the US have lower unemployment rates, and non-metro counties had a slightly lower unemployment rate than metro counties in 2020.