library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library()
getwd()
## [1] "C:/Users/ktcol/OneDrive/Documents"
setwd("C:/Users/ktcol/Downloads")
df <- read.csv("Jobs.csv")
df
The format the data was packaged in, although efficient, could not be directly acted upon.
df <- df %>% pivot_wider(names_from = Attribute, values_from = Value) #pivots df from long format to wide format
#https://www.statology.org/long-vs-wide-data/
Here, I am cutting out the variables that do not have any influence on other variables and/or were not being examined. This also begins the exploratory analysis portion of this project, where I examine what variables may be influencing others.
df <- df %>% select(State, PctEmpAgriculture, PctEmpConstruction, PctEmpFIRE, PctEmpInformation, PctEmpManufacturing, PctEmpMining, PctEmpGovt, PctEmpServices, PctEmpTrans, PctEmpTrade, UnempRate2017, UnempRate2018, UnempRate2019, UnempRate2020, UnempRate2021, NumCivLaborforce2017, NumCivLaborforce2018, NumCivLaborforce2019, NumCivLaborForce2020, NumCivLaborforce2021, NumEmployed2017, NumEmployed2018,NumEmployed2019, NumEmployed2020,
NumEmployed2021, NumUnemployed2017, NumUnemployed2018, NumUnemployed2019, NumUnemployed2020, NumUnemployed2021)
The decision is made to examine the relationship between participation in two sectors, agricultural and government, and their relationship to unemployment rate.
df_summarized <- df %>%
mutate(
Mean_UnempRate = rowMeans(select(., starts_with("UnempRate")), na.rm = TRUE),
Mean_NumCivLaborforce = rowMeans(select(., starts_with("NumCivLaborforce")), na.rm = TRUE),
Mean_NumEmployed = rowMeans(select(., starts_with("NumEmployed")), na.rm = TRUE),
Mean_NumUnemployed = rowMeans(select(., starts_with("NumUnemployed")), na.rm = TRUE)
)
df_summarized <- df_summarized %>% select(State, PctEmpAgriculture, PctEmpConstruction, PctEmpFIRE, PctEmpInformation, PctEmpManufacturing, PctEmpMining, PctEmpGovt, PctEmpServices, PctEmpTrans, PctEmpTrade, Mean_UnempRate, Mean_NumCivLaborforce, Mean_NumEmployed, Mean_NumUnemployed)
df_summarized_by_state <- df_summarized %>%
group_by(State) %>%
summarize(
Mean_UnempRate = mean(Mean_UnempRate, na.rm = TRUE),
Mean_PctEmpAgriculture = mean(PctEmpAgriculture, na.rm = TRUE),
Mean_PctEmpGovt = mean(PctEmpGovt, na.rm = TRUE)
)
df_summarized_by_state
df_correlationplot_explored <- df_summarized %>% select(State, PctEmpAgriculture, Mean_UnempRate, PctEmpGovt)
#General Correlation Matrix with Wide Variety of Variables
numeric_cols <- df_summarized %>% select(-State)
cor_matrix <- cor(numeric_cols, use = "complete.obs")
#https://thomasleeper.com/Rcourse/Tutorials/NAhandling.html#:~:text=With%20use%3D%22complete.,calculation%20of%20each%20pairwise%20correlation.
print(cor_matrix)
## PctEmpAgriculture PctEmpConstruction PctEmpFIRE
## PctEmpAgriculture 1.000000000 -0.007885875 -0.26953085
## PctEmpConstruction -0.007885875 1.000000000 -0.04067253
## PctEmpFIRE -0.269530853 -0.040672532 1.00000000
## PctEmpInformation -0.094396744 -0.069111383 0.24616653
## PctEmpManufacturing -0.236187127 -0.143621223 -0.15968547
## PctEmpMining 0.063316346 0.057587475 -0.17262745
## PctEmpGovt -0.009336788 -0.044708076 -0.17306862
## PctEmpServices -0.483113051 -0.186807608 0.25449859
## PctEmpTrans 0.022254310 0.004236233 -0.13499553
## PctEmpTrade -0.211114465 -0.059585455 0.08092970
## Mean_UnempRate -0.256652852 -0.043109957 -0.20523854
## Mean_NumCivLaborforce -0.030006968 -0.015945554 0.05209973
## Mean_NumEmployed -0.030012940 -0.015922794 0.05217023
## Mean_NumUnemployed -0.029875984 -0.016359981 0.05074991
## PctEmpInformation PctEmpManufacturing PctEmpMining
## PctEmpAgriculture -0.094396744 -0.23618713 0.06331635
## PctEmpConstruction -0.069111383 -0.14362122 0.05758747
## PctEmpFIRE 0.246166535 -0.15968547 -0.17262745
## PctEmpInformation 1.000000000 -0.16330596 -0.11846541
## PctEmpManufacturing -0.163305956 1.00000000 -0.26609967
## PctEmpMining -0.118465406 -0.26609967 1.00000000
## PctEmpGovt -0.055701879 -0.34723717 0.05089192
## PctEmpServices 0.187446246 -0.38919812 -0.20505541
## PctEmpTrans -0.088717232 -0.04254633 0.18477153
## PctEmpTrade -0.008334168 0.01270191 -0.15167069
## Mean_UnempRate -0.098587793 -0.02779264 0.01193586
## Mean_NumCivLaborforce 0.039609368 -0.01516066 -0.01470871
## Mean_NumEmployed 0.039573240 -0.01511853 -0.01470966
## Mean_NumUnemployed 0.040257910 -0.01593709 -0.01468152
## PctEmpGovt PctEmpServices PctEmpTrans PctEmpTrade
## PctEmpAgriculture -0.009336788 -0.48311305 0.022254310 -0.2111144649
## PctEmpConstruction -0.044708076 -0.18680761 0.004236233 -0.0595854547
## PctEmpFIRE -0.173068616 0.25449859 -0.134995530 0.0809296980
## PctEmpInformation -0.055701879 0.18744625 -0.088717232 -0.0083341681
## PctEmpManufacturing -0.347237170 -0.38919812 -0.042546328 0.0127019071
## PctEmpMining 0.050891924 -0.20505541 0.184771526 -0.1516706938
## PctEmpGovt 1.000000000 0.01410130 0.016798878 -0.1784407989
## PctEmpServices 0.014101300 1.00000000 -0.317878578 -0.0724503396
## PctEmpTrans 0.016798878 -0.31787858 1.000000000 -0.0660692864
## PctEmpTrade -0.178440799 -0.07245034 -0.066069286 1.0000000000
## Mean_UnempRate 0.304522494 0.17355352 0.004587568 0.0435852070
## Mean_NumCivLaborforce -0.015831510 0.04200475 -0.003598186 0.0003704084
## Mean_NumEmployed -0.015848110 0.04196836 -0.003656477 0.0003752474
## Mean_NumUnemployed -0.015511391 0.04265666 -0.002507890 0.0002798501
## Mean_UnempRate Mean_NumCivLaborforce Mean_NumEmployed
## PctEmpAgriculture -0.256652852 -0.0300069678 -0.0300129400
## PctEmpConstruction -0.043109957 -0.0159455538 -0.0159227940
## PctEmpFIRE -0.205238541 0.0520997254 0.0521702304
## PctEmpInformation -0.098587793 0.0396093680 0.0395732401
## PctEmpManufacturing -0.027792636 -0.0151606593 -0.0151185312
## PctEmpMining 0.011935858 -0.0147087110 -0.0147096555
## PctEmpGovt 0.304522494 -0.0158315101 -0.0158481098
## PctEmpServices 0.173553524 0.0420047545 0.0419683627
## PctEmpTrans 0.004587568 -0.0035981860 -0.0036564766
## PctEmpTrade 0.043585207 0.0003704084 0.0003752474
## Mean_UnempRate 1.000000000 0.0039521860 0.0036953135
## Mean_NumCivLaborforce 0.003952186 1.0000000000 0.9999982284
## Mean_NumEmployed 0.003695313 0.9999982284 1.0000000000
## Mean_NumUnemployed 0.008743974 0.9993826630 0.9993147614
## Mean_NumUnemployed
## PctEmpAgriculture -0.0298759836
## PctEmpConstruction -0.0163599808
## PctEmpFIRE 0.0507499101
## PctEmpInformation 0.0402579096
## PctEmpManufacturing -0.0159370913
## PctEmpMining -0.0146815167
## PctEmpGovt -0.0155113910
## PctEmpServices 0.0426566638
## PctEmpTrans -0.0025078904
## PctEmpTrade 0.0002798501
## Mean_UnempRate 0.0087439738
## Mean_NumCivLaborforce 0.9993826630
## Mean_NumEmployed 0.9993147614
## Mean_NumUnemployed 1.0000000000
corrplot(cor_matrix, method = "circle")
#Repeated with variables that seemed to have a relationship
numeric_cols <- df_correlationplot_explored %>% select(-State)
cor_matrix <- cor(numeric_cols, use = "complete.obs")
print(cor_matrix)
## PctEmpAgriculture Mean_UnempRate PctEmpGovt
## PctEmpAgriculture 1.000000000 -0.2566529 -0.009336788
## Mean_UnempRate -0.256652852 1.0000000 0.304522494
## PctEmpGovt -0.009336788 0.3045225 1.000000000
corrplot(cor_matrix, method = "circle")
ggplot(df_summarized_by_state, aes(x = Mean_UnempRate, y = Mean_PctEmpGovt)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Average Unemployment Rate vs % Employed in Government", x = "Average Unemployment Rate", y = "% Employed in Government") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#summarized across states to clean up visuals
ggplot(df_summarized_by_state, aes(x = Mean_UnempRate, y = Mean_PctEmpAgriculture)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Average Unemployment Rate vs % Employed in Agriculture", x = "Average Unemployment Rate", y = "% Employment in Agriculture") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
ggplot(df_correlationplot_explored, aes(x = Mean_UnempRate)) +
geom_histogram(binwidth = 0.5, fill = "maroon1", color = "black") +
labs(title = "Distribution of Unemployment Rates", x = "Average Unemployment Rate", y = "Count") +
theme_minimal()
## Warning: Removed 4 rows containing non-finite values (`stat_bin()`).
ggplot(df_summarized_by_state, aes(x = Mean_PctEmpGovt)) +
geom_histogram(binwidth = 0.5, fill = "lightblue", color = "black") +
labs(title = "Density Plot of % Employed in Government", x = "% Employed in Government", y = "Density") +
theme_minimal()
# correlation tests
test_unemp_agriculture <- cor.test(df_summarized_by_state$Mean_UnempRate, df_summarized_by_state$Mean_PctEmpAgriculture, use = "complete.obs")
test_unemp_govt <- cor.test(df_summarized_by_state$Mean_UnempRate, df_summarized_by_state$Mean_PctEmpGovt, use = "complete.obs")
print(test_unemp_agriculture)
##
## Pearson's product-moment correlation
##
## data: df_summarized_by_state$Mean_UnempRate and df_summarized_by_state$Mean_PctEmpAgriculture
## t = -3.2813, df = 51, p-value = 0.001868
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.6180591 -0.1659478
## sample estimates:
## cor
## -0.4175141
print(test_unemp_govt)
##
## Pearson's product-moment correlation
##
## data: df_summarized_by_state$Mean_UnempRate and df_summarized_by_state$Mean_PctEmpGovt
## t = 4.0356, df = 51, p-value = 0.0001829
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2556863 0.6728043
## sample estimates:
## cor
## 0.4919793
#machine learning practice
df_clean <- df_correlationplot_explored %>%
select(PctEmpAgriculture, PctEmpGovt, Mean_UnempRate) %>%
drop_na()
str(df_clean)
## tibble [3,273 × 3] (S3: tbl_df/tbl/data.frame)
## $ PctEmpAgriculture: num [1:3273] 1.2 1 0.58 1.53 5.48 ...
## $ PctEmpGovt : num [1:3273] 4.7 5.44 9.13 5.13 7.14 ...
## $ Mean_UnempRate : num [1:3273] 5.1 4.3 3.74 3.98 5.72 4.5 3.48 4.7 5.82 5.06 ...
summary(df_clean)
## PctEmpAgriculture PctEmpGovt Mean_UnempRate
## Min. : 0.000 Min. : 0.000 Min. : 1.760
## 1st Qu.: 1.063 1st Qu.: 3.563 1st Qu.: 3.780
## Median : 2.527 Median : 4.839 Median : 4.600
## Mean : 4.744 Mean : 5.707 Mean : 4.956
## 3rd Qu.: 5.819 3rd Qu.: 6.970 3rd Qu.: 5.700
## Max. :52.209 Max. :31.126 Max. :19.920
set.seed(8675309)
split <- 0.75
rows <- nrow(df_clean)
train.entries <- sample(rows, rows*split)
model.train <- df_clean[train.entries, ]
model.valid <- df_clean[-train.entries, ]
lm_model <- lm(Mean_UnempRate ~ PctEmpAgriculture + PctEmpGovt, data = model.train)
summary(lm_model)
##
## Call:
## lm(formula = Mean_UnempRate ~ PctEmpAgriculture + PctEmpGovt,
## data = model.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3343 -1.0360 -0.2714 0.6916 14.5191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.377534 0.073743 59.36 <2e-16 ***
## PctEmpAgriculture -0.083558 0.005627 -14.85 <2e-16 ***
## PctEmpGovt 0.163899 0.010439 15.70 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.682 on 2451 degrees of freedom
## Multiple R-squared: 0.1607, Adjusted R-squared: 0.16
## F-statistic: 234.6 on 2 and 2451 DF, p-value: < 2.2e-16
model.valid <- model.valid %>%
mutate(
predicted_unemp = predict(lm_model, newdata = model.valid),
residual = Mean_UnempRate - predicted_unemp,
square_error = residual^2
)
mse <- mean(model.valid$square_error, na.rm = TRUE)
rmse <- sqrt(mse)
print(paste("MSE: ", mse))
## [1] "MSE: 3.51066424488354"
print(paste("RMSE: ", rmse))
## [1] "RMSE: 1.87367666497812"
model.train <- model.train %>%
mutate(
yhat = predict(lm_model, newdata = model.train),
residual = Mean_UnempRate - yhat
)
ggplot(model.valid) +
geom_point(aes(x = predicted_unemp, y = residual)) +
geom_hline(aes(yintercept = 0), linetype = "dashed", color = "red") +
xlab("Predicted Unemployment Rate") +
ylab("Residual") +
theme_minimal() +
labs(title = "Residual Plot for Validation Model")
ggplot(model.valid) +
geom_histogram(aes(x = residual), bins = 50, color = "black", fill = "olivedrab") +
xlab("Residual (Unemployment Rate)") +
theme_minimal() +
labs(title = "Distribution of Model Residuals")
#source for colors used: https://www.nceas.ucsb.edu/sites/default/files/2020-04/colorPaletteCheatsheet.pdf
sd_train <- sd(model.train$residual, na.rm = TRUE)
sd_valid <- sd(model.valid$residual, na.rm = TRUE)
print(paste("SD Train: ", sd_train))
## [1] "SD Train: 1.68128567196878"
print(paste("SD Valid: ", sd_valid))
## [1] "SD Valid: 1.86829334999983"