library(readr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(outliers)
library(ggplot2)
# Read data sets
states <- read_csv("states.csv")
## Rows: 51 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): State, Abbrev, Code
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
uic <- read_csv("UIC_codes.csv")
## Rows: 3221 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): FIPS, State, County_Name, Description, City/Suburb/Town/Rural
## dbl (1): UIC_2013
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
unemployment <- read_csv("unemployment.csv")
## Rows: 3275 Columns: 93
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): FIPS_Code, State, Area_name, City/Suburb/Town/Rural
## dbl (25): Rural_urban_continuum_code_2013, Urban_influence_code_2013, Metro_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect data sets
head(states)
head(uic)
head(unemployment)
# Eliminate unnecessary variables
states_mod <- states %>%
select(-Abbrev) %>% relocate(Code, State) %>% rename("State_Code" = "Code")
uic_mod <- uic %>%
select(-UIC_2013) %>% rename("State_Code" = "State", "Loc_Pop_Category" = "Description", "County_Type" = "City/Suburb/Town/Rural")
unemployment_mod <- unemployment %>%
select(FIPS_Code, contains("rate"))
head(states_mod)
head(uic_mod)
head(unemployment_mod)
# Merge data sets
loc_pop <- uic_mod %>%
left_join(states_mod, by = "State_Code") %>% relocate(State, .after = State_Code) %>% rename("FIPS_Code" = "FIPS",
"County" = "County_Name")
df_pre <- unemployment_mod %>%
left_join(loc_pop, by = "FIPS_Code") %>% relocate((State_Code:County_Type), .after = FIPS_Code)
head(loc_pop)
head(df_pre)
tail(df_pre)
df_pre_mod <- df_pre[1:3219, ]
This report attempts to analyse trends of unemployment across different locales in the USA between 2000 and 2020. It utilises three data sets including states, urban influence code (UIC) and unemployment. Data sets uic and unemployment are sourced from https://www.kaggle.com/valbauman/student-engagement-online-learning-supplement, while the source of states data set is https://worldpopulationreview.com/states/state-abbreviations.
The “states” data set includes three variables of “State” (state name), “Abbrev” (abbreviation of state name), and “Code” (a two letter code for state name). The variable “Abbrev” is deleted as it does not add much value in the context of this report.
The “uic” data set is made up of Federal Information Processing Standards (FIPS), State (state code), County_name, Population_2010, UIC_2013, Description (classification of county based on location and population), and City/Suburb/Town/Rural (county category type) as variables. From these variables, State and UIC_2013 are deleted.
The “unemployment” is a very wide data set which contains 93 variables. FIPS, State, Area_name, Median_household_income_2019, some employment statistics and unemployment rates form these variables. From this data set, majority of the variables are eliminated leaving behind only FIPS_Code and unemployment rates from 2000 through 2020.
After removing unnecessary variables from the data sets we have “states_mod”, “uic_mod”, and “unemployment_mod”. In order to establish a draft of final data frame for the report, two sets of merging commands are executed. “uic_mod” is left joined with “state_mod” to create “loc_pop”. Then “unemployment_mod” is left joined with “loc_pop” to achieve “df_pre” which is the data frame to be used for further data pre-processing and analysis in the report.
Applying head() and tail() to “df_pre” indicate that except for the bottom of the data set, the rest of the observations are consistent. The issue with the last observations of the data set is that they are subtotals rather than actual observations on locales inside the USA. As a result, these subtotals are then sliced off the data set by indexing.
# Check attributes of the data set
str(df_pre_mod)
## tibble [3,219 × 28] (S3: tbl_df/tbl/data.frame)
## $ FIPS_Code : chr [1:3219] "01007" "01009" "01021" "01073" ...
## $ State_Code : chr [1:3219] "AL" "AL" "AL" "AL" ...
## $ State : chr [1:3219] "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ County : chr [1:3219] "Bibb County" "Blount County" "Chilton County" "Jefferson County" ...
## $ Population_2010 : num [1:3219] 22915 57322 43643 658466 83593 ...
## $ Loc_Pop_Category : chr [1:3219] "Large-in a metro area with at least 1 million residents or more" "Large-in a metro area with at least 1 million residents or more" "Large-in a metro area with at least 1 million residents or more" "Large-in a metro area with at least 1 million residents or more" ...
## $ County_Type : chr [1:3219] "City" "City" "City" "City" ...
## $ Unemployment_rate_2000: num [1:3219] 5.4 3.5 4.3 4 3.8 2.8 5.9 3.2 4.6 4.1 ...
## $ Unemployment_rate_2001: num [1:3219] 6.8 3.7 4.8 4.4 4.2 3 6.1 4.2 5.4 5.1 ...
## $ Unemployment_rate_2002: num [1:3219] 7 5.4 5.1 5.2 5.1 3.5 6.4 5.7 7.4 5.8 ...
## $ Unemployment_rate_2003: num [1:3219] 5.9 4.6 5 5.5 5.2 3.5 6.4 5.2 7 6.6 ...
## $ Unemployment_rate_2004: num [1:3219] 5.4 4.1 4.6 5.2 4.9 3.3 6 4.4 6 6.8 ...
## $ Unemployment_rate_2005: num [1:3219] 4.4 3.6 3.9 4.4 3.9 3.1 4.4 4 5.6 6.6 ...
## $ Unemployment_rate_2006: num [1:3219] 4.2 3.2 3.6 4 3.5 2.7 4.5 3.7 5.1 6.9 ...
## $ Unemployment_rate_2007: num [1:3219] 4.2 3.2 3.5 3.9 3.5 2.7 4.4 3.2 4.5 6.6 ...
## $ Unemployment_rate_2008: num [1:3219] 6 4.8 5.2 5.6 5.2 3.9 5.9 5.1 7 7.9 ...
## $ Unemployment_rate_2009: num [1:3219] 12.2 9.2 9.8 9.9 10 7.1 10.7 8.9 12 10.7 ...
## $ Unemployment_rate_2010: num [1:3219] 11.2 9.7 10.1 10.2 10 7 12.5 9.5 10.6 10.1 ...
## $ Unemployment_rate_2011: num [1:3219] 10.4 8.6 9.1 9.2 8.7 6.2 11.2 8.5 9.7 10.4 ...
## $ Unemployment_rate_2012: num [1:3219] 8.8 7.1 7.4 7.6 7 5.2 9.4 7.3 8.5 9.2 ...
## $ Unemployment_rate_2013: num [1:3219] 8 6.4 6.7 6.8 6.2 4.7 8.6 6.6 8.1 8.2 ...
## $ Unemployment_rate_2014: num [1:3219] 7.2 6.1 6.2 6.3 5.7 4.5 7.9 5.8 7 7.2 ...
## $ Unemployment_rate_2015: num [1:3219] 6.7 5.4 5.7 5.8 5.2 4.2 7.3 5.1 6.3 6.1 ...
## $ Unemployment_rate_2016: num [1:3219] 6.5 5.4 5.5 5.7 5.2 4.3 7.5 4.7 5.6 4.5 ...
## $ Unemployment_rate_2017: num [1:3219] 4.5 4.2 4.2 4.4 4.1 3.3 5.2 4.2 5.1 4.4 ...
## $ Unemployment_rate_2018: num [1:3219] 4 3.5 3.6 3.7 3.5 2.8 4.2 4.1 5 4.3 ...
## $ Unemployment_rate_2019: num [1:3219] 3.1 2.7 2.7 2.9 2.7 2.2 3.3 4.2 5 4.2 ...
## $ Unemployment_rate_2020: num [1:3219] 6.6 4.1 5 6.2 4.9 3.9 5.8 7.4 7.5 8.3 ...
names(df_pre_mod)
## [1] "FIPS_Code" "State_Code" "State"
## [4] "County" "Population_2010" "Loc_Pop_Category"
## [7] "County_Type" "Unemployment_rate_2000" "Unemployment_rate_2001"
## [10] "Unemployment_rate_2002" "Unemployment_rate_2003" "Unemployment_rate_2004"
## [13] "Unemployment_rate_2005" "Unemployment_rate_2006" "Unemployment_rate_2007"
## [16] "Unemployment_rate_2008" "Unemployment_rate_2009" "Unemployment_rate_2010"
## [19] "Unemployment_rate_2011" "Unemployment_rate_2012" "Unemployment_rate_2013"
## [22] "Unemployment_rate_2014" "Unemployment_rate_2015" "Unemployment_rate_2016"
## [25] "Unemployment_rate_2017" "Unemployment_rate_2018" "Unemployment_rate_2019"
## [28] "Unemployment_rate_2020"
dim(df_pre_mod)
## [1] 3219 28
sapply(df_pre_mod, class)
## FIPS_Code State_Code State
## "character" "character" "character"
## County Population_2010 Loc_Pop_Category
## "character" "numeric" "character"
## County_Type Unemployment_rate_2000 Unemployment_rate_2001
## "character" "numeric" "numeric"
## Unemployment_rate_2002 Unemployment_rate_2003 Unemployment_rate_2004
## "numeric" "numeric" "numeric"
## Unemployment_rate_2005 Unemployment_rate_2006 Unemployment_rate_2007
## "numeric" "numeric" "numeric"
## Unemployment_rate_2008 Unemployment_rate_2009 Unemployment_rate_2010
## "numeric" "numeric" "numeric"
## Unemployment_rate_2011 Unemployment_rate_2012 Unemployment_rate_2013
## "numeric" "numeric" "numeric"
## Unemployment_rate_2014 Unemployment_rate_2015 Unemployment_rate_2016
## "numeric" "numeric" "numeric"
## Unemployment_rate_2017 Unemployment_rate_2018 Unemployment_rate_2019
## "numeric" "numeric" "numeric"
## Unemployment_rate_2020
## "numeric"
head(df_pre_mod)
tail(df_pre_mod)
# Convert data types (character->factor, character->ordered factor, numeric->factor)
df_pre_mod %>% distinct(Loc_Pop_Category)
df_pre_mod %>% is.na() %>% colSums()
## FIPS_Code State_Code State
## 0 2 80
## County Population_2010 Loc_Pop_Category
## 2 2 2
## County_Type Unemployment_rate_2000 Unemployment_rate_2001
## 2 5 5
## Unemployment_rate_2002 Unemployment_rate_2003 Unemployment_rate_2004
## 5 5 5
## Unemployment_rate_2005 Unemployment_rate_2006 Unemployment_rate_2007
## 12 12 5
## Unemployment_rate_2008 Unemployment_rate_2009 Unemployment_rate_2010
## 5 5 0
## Unemployment_rate_2011 Unemployment_rate_2012 Unemployment_rate_2013
## 0 0 0
## Unemployment_rate_2014 Unemployment_rate_2015 Unemployment_rate_2016
## 0 0 0
## Unemployment_rate_2017 Unemployment_rate_2018 Unemployment_rate_2019
## 0 0 0
## Unemployment_rate_2020
## 78
df_pre_mod %>% filter(is.na(Loc_Pop_Category)) %>% select(FIPS_Code)
df_pre_mod2 <- df_pre_mod %>% filter(!is.na(State_Code))
df_pre_mod2 %>% is.na() %>% colSums()
## FIPS_Code State_Code State
## 0 0 78
## County Population_2010 Loc_Pop_Category
## 0 0 0
## County_Type Unemployment_rate_2000 Unemployment_rate_2001
## 0 5 5
## Unemployment_rate_2002 Unemployment_rate_2003 Unemployment_rate_2004
## 5 5 5
## Unemployment_rate_2005 Unemployment_rate_2006 Unemployment_rate_2007
## 12 12 5
## Unemployment_rate_2008 Unemployment_rate_2009 Unemployment_rate_2010
## 5 5 0
## Unemployment_rate_2011 Unemployment_rate_2012 Unemployment_rate_2013
## 0 0 0
## Unemployment_rate_2014 Unemployment_rate_2015 Unemployment_rate_2016
## 0 0 0
## Unemployment_rate_2017 Unemployment_rate_2018 Unemployment_rate_2019
## 0 0 0
## Unemployment_rate_2020
## 78
df_pre_mod2 %>% distinct(Loc_Pop_Category)
df_pre_mod2$Loc_Pop_Category <- factor(df_pre_mod2$Loc_Pop_Category, levels = c("Large-in a metro area with at least 1 million residents or more", "Small-in a metro area with fewer than 1 million residents", "Micropolitan adjacent to a large metro are" , "Noncore adjacent to a large metro area", "Micropolitan adjacent to a small metro area", "Noncore adjacent to a small metro with town of at least 2,500 residents", "Noncore adjacent to a small metro and does not contain a town of at least 2,500 residents", "Micropolitan not adjacent to a metro area", "Noncore adjacent to micro area and contains a town of 2,500-19,999 residents", "Noncore adjacent to micro area and does not contain a town of at least 2,500 residents", "Noncore not adjacent to a metro/micro area and contains a town of 2,500 or more residents", "Noncore not adjacent to a metro/micro area and does not contain a town of at least 2,500 residents"), ordered = FALSE)
df_pre_mod2$County_Type <- factor(df_pre_mod2$County_Type, levels = c("Town", "Rural", "Suburb", "City"),
ordered = TRUE)
df_pre_mod2$FIPS_Code <- factor(as.numeric(df_pre_mod2$FIPS_Code))
class(df_pre_mod2$Loc_Pop_Category)
## [1] "factor"
class(df_pre_mod2$County_Type)
## [1] "ordered" "factor"
class(df_pre_mod2$FIPS_Code)
## [1] "factor"
Explanation of the actions taken:
# Investigate data as per tidy data principles
head(df_pre_mod2)
colnames(df_pre_mod2) <- str_replace_all(colnames(df_pre_mod2), pattern = "Unemployment_rate", replacement = "Unmp%")
df_pre_long <- df_pre_mod2 %>% pivot_longer(cols = ("Unmp%_2000":"Unmp%_2020"), names_to = "x_factor", values_to = "Unmp%") %>%
relocate(("x_factor":"Unmp%"), .after = County_Type)
head(df_pre_long)
df_pre_long2 <- df_pre_long %>% separate(x_factor, into = c("text", "Year"), sep = "%_") %>%
select(-text)
head(df_pre_long2)
dim(df_pre_long2)
## [1] 67557 9
Explanation of actions taken:
# Create a new variable from the existing variables
df_pre_long2 %<>% mutate(Location = str_c(County, State, sep = ", ")) %>% relocate(Location, .after = County)
head(df_pre_long2)
# Convert data type character to date
df_pre_long2$Year <- as.Date(df_pre_long2$Year, format = "%Y")
class(df_pre_long2$Year)
## [1] "Date"
Explanation of actions taken:
# Scan for NA, NaN, Inf, and Null
sapply(df_pre_long2, is.infinite) %>% colSums()
## FIPS_Code State_Code State County
## 0 0 0 0
## Location Population_2010 Loc_Pop_Category County_Type
## 0 0 0 0
## Year Unmp%
## 0 0
sapply(df_pre_long2, is.nan) %>% colSums()
## FIPS_Code State_Code State County
## 0 0 0 0
## Location Population_2010 Loc_Pop_Category County_Type
## 0 0 0 0
## Year Unmp%
## 0 0
sapply(df_pre_long2, is.na) %>% colSums()
## FIPS_Code State_Code State County
## 0 0 1638 0
## Location Population_2010 Loc_Pop_Category County_Type
## 1638 0 2772 0
## Year Unmp%
## 0 142
sapply(df_pre_long2, is.null)
## FIPS_Code State_Code State County
## FALSE FALSE FALSE FALSE
## Location Population_2010 Loc_Pop_Category County_Type
## FALSE FALSE FALSE FALSE
## Year Unmp%
## FALSE FALSE
# Impute missing values
df_pre_scan <- df_pre_long2
df_pre_scan$Loc_Pop_Category <- impute(df_pre_scan$Loc_Pop_Category, fun = mode)
df_pre_scan$`Unmp%` <- impute(df_pre_scan$`Unmp%`, fun = mean)
sapply(df_pre_scan, is.na) %>% colSums()
## FIPS_Code State_Code State County
## 0 0 1638 0
## Location Population_2010 Loc_Pop_Category County_Type
## 1638 0 0 0
## Year Unmp%
## 0 0
Explanation of the methodology and actions:
# Check distribution of numeric variables
hist(df_pre_scan$Population_2010, main = "Histogram of Population", xlab = "Population_2010")
hist(df_pre_scan$`Unmp%`, main = "Histogram of Unemployment rate", xlab = "Unmp%")
# Investigate effectiveness of right-skewness reduction methods
hist(log(df_pre_scan$Population_2010), main = "Histogram of Natural Logarithm of Population", xlab = "Population_2010")
hist(log10(df_pre_scan$Population_2010), main = "Histogram of Base 10 Logarithm of Population", xlab = "Population_2010")
hist((df_pre_scan$Population_2010) ^ -1, main = "Histogram of Reciprocal Population", xlab = "Population_2010")
hist((df_pre_scan$Population_2010) ^ (1/2), main = "Histogram of Second Root of Population", xlab = "Population_2010")
hist((df_pre_scan$Population_2010) ^ (1/3), main = "Histogram of Third Root of Population", xlab = "Population_2010")
hist(log(df_pre_scan$`Unmp%`), main = "Histogram of Natural Logarithm of Unemployment rate", xlab = "Unmp%")
hist(log10(df_pre_scan$`Unmp%`), main = "Histogram of Base 10 Logarithm of Unemployment rate", xlab = "Unmp")
hist((df_pre_scan$Population_2010) ^ -1, main = "Histogram of Reciprocal Unemployment rate", xlab = "Unmp")
hist((df_pre_scan$Population_2010) ^ (1/2), main = "Histogram of Second Root of Unemployment rate", xlab = "Unmp")
hist((df_pre_scan$Population_2010) ^ (1/3), main = "Histogram of Third Root of Unemployment rate", xlab = "Unmp")
# Transform data
df_pre_scan2 <- df_pre_scan
df_pre_scan2$Population_2010 %<>% log()
df_pre_scan2$`Unmp%` %<>% log()
# Detect univariate outliers of the numeric variables
z.scores_population <- df_pre_scan2$Population_2010 %>% scores(type = "z")
summary(z.scores_population)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.04906 -0.64422 -0.07124 0.00000 0.56936 4.02087
length(which(abs(z.scores_population) > 3))
## [1] 231
length(which(abs(z.scores_population) > 3)) / length(df_pre_scan2$Population_2010) * 100
## [1] 0.3419335
z.scores_unemployment <- df_pre_scan2$`Unmp%` %>% scores(type = "z")
summary(z.scores_unemployment)
##
## 142 values imputed to 0.2261075
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.42641 -0.72080 -0.05465 0.00000 0.67870 3.74646
length(which(abs(z.scores_unemployment) > 3))
## [1] 159
length(which(abs(z.scores_unemployment) > 3)) / length(df_pre_scan2$`Unmp%`) * 100
## [1] 0.2353568
# Impute outliers with mean
df_pre_scan2$Population_2010[which(abs(z.scores_population) > 3)] <- mean(df_pre_scan2$Population_2010)
df_pre_scan2$`Unmp%`[which(abs(z.scores_unemployment) > 3)] <- mean(df_pre_scan2$`Unmp%`)
Explanation of actions taken:
Bauman, V 2021, USA Unemployment & Education Level, Kaggle, viewed 5 October 2021, https://www.kaggle.com/valbauman/student-engagement-online-learning-supplement.
Dolgun, A 2020, ‘Data Wrangling’, lecture notes, MATH2405, RMIT University, viewed 15 September 2021, https://rmit.instructure.com/courses/82538/pages/4-dot-4-1-data-transformation-i?module_item_id=3259779.
Wickham, H & Grolemund, G 2017, R for Data Science Import, Tidy, Transform, Visualize, and ModelData, O’Reilly Media, Inc., Sebastopol, CA.
World Population Review, n.d., viewed 5 October 2021, https://worldpopulationreview.com/states/state-abbreviations.