R Notebook

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(forecast)

## Warning: package 'forecast' was built under R version 4.3.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.3.3

## corrplot 0.92 loaded

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library()

getwd()

## [1] "C:/Users/ktcol/OneDrive/Documents"

setwd("C:/Users/ktcol/Downloads")
df <- read.csv("Jobs.csv")
df

The format the data was packaged in, although efficient, could not be directly acted upon.

df <- df %>% pivot_wider(names_from = Attribute, values_from = Value) #pivots df from long format to wide format
#https://www.statology.org/long-vs-wide-data/

Here, I am cutting out the variables that do not have any influence on other variables and/or were not being examined. This also begins the exploratory analysis portion of this project, where I examine what variables may be influencing others.

df <- df %>% select(State, PctEmpAgriculture, PctEmpConstruction, PctEmpFIRE, PctEmpInformation, PctEmpManufacturing, PctEmpMining, PctEmpGovt, PctEmpServices, PctEmpTrans, PctEmpTrade, UnempRate2017, UnempRate2018, UnempRate2019, UnempRate2020, UnempRate2021, NumCivLaborforce2017, NumCivLaborforce2018, NumCivLaborforce2019, NumCivLaborForce2020, NumCivLaborforce2021, NumEmployed2017, NumEmployed2018,NumEmployed2019, NumEmployed2020,
NumEmployed2021, NumUnemployed2017, NumUnemployed2018, NumUnemployed2019, NumUnemployed2020, NumUnemployed2021)

The decision is made to examine the relationship between participation in two sectors, agricultural and government, and their relationship to unemployment rate.

df_summarized <- df %>%
  mutate(
    Mean_UnempRate = rowMeans(select(., starts_with("UnempRate")), na.rm = TRUE),
    Mean_NumCivLaborforce = rowMeans(select(., starts_with("NumCivLaborforce")), na.rm = TRUE),
    Mean_NumEmployed = rowMeans(select(., starts_with("NumEmployed")), na.rm = TRUE),
    Mean_NumUnemployed = rowMeans(select(., starts_with("NumUnemployed")), na.rm = TRUE)
  )
df_summarized <- df_summarized %>% select(State, PctEmpAgriculture, PctEmpConstruction, PctEmpFIRE, PctEmpInformation, PctEmpManufacturing, PctEmpMining, PctEmpGovt, PctEmpServices, PctEmpTrans, PctEmpTrade, Mean_UnempRate, Mean_NumCivLaborforce, Mean_NumEmployed, Mean_NumUnemployed)

df_summarized_by_state <- df_summarized %>%
  group_by(State) %>%
  summarize(
    Mean_UnempRate = mean(Mean_UnempRate, na.rm = TRUE),
    Mean_PctEmpAgriculture = mean(PctEmpAgriculture, na.rm = TRUE),
    Mean_PctEmpGovt = mean(PctEmpGovt, na.rm = TRUE)
  )

df_summarized_by_state

df_correlationplot_explored <- df_summarized %>% select(State, PctEmpAgriculture, Mean_UnempRate, PctEmpGovt)

#General Correlation Matrix with Wide Variety of Variables
numeric_cols <- df_summarized %>% select(-State) 
cor_matrix <- cor(numeric_cols, use = "complete.obs")
#https://thomasleeper.com/Rcourse/Tutorials/NAhandling.html#:~:text=With%20use%3D%22complete.,calculation%20of%20each%20pairwise%20correlation.

print(cor_matrix)

##                       PctEmpAgriculture PctEmpConstruction  PctEmpFIRE
## PctEmpAgriculture           1.000000000       -0.007885875 -0.26953085
## PctEmpConstruction         -0.007885875        1.000000000 -0.04067253
## PctEmpFIRE                 -0.269530853       -0.040672532  1.00000000
## PctEmpInformation          -0.094396744       -0.069111383  0.24616653
## PctEmpManufacturing        -0.236187127       -0.143621223 -0.15968547
## PctEmpMining                0.063316346        0.057587475 -0.17262745
## PctEmpGovt                 -0.009336788       -0.044708076 -0.17306862
## PctEmpServices             -0.483113051       -0.186807608  0.25449859
## PctEmpTrans                 0.022254310        0.004236233 -0.13499553
## PctEmpTrade                -0.211114465       -0.059585455  0.08092970
## Mean_UnempRate             -0.256652852       -0.043109957 -0.20523854
## Mean_NumCivLaborforce      -0.030006968       -0.015945554  0.05209973
## Mean_NumEmployed           -0.030012940       -0.015922794  0.05217023
## Mean_NumUnemployed         -0.029875984       -0.016359981  0.05074991
##                       PctEmpInformation PctEmpManufacturing PctEmpMining
## PctEmpAgriculture          -0.094396744         -0.23618713   0.06331635
## PctEmpConstruction         -0.069111383         -0.14362122   0.05758747
## PctEmpFIRE                  0.246166535         -0.15968547  -0.17262745
## PctEmpInformation           1.000000000         -0.16330596  -0.11846541
## PctEmpManufacturing        -0.163305956          1.00000000  -0.26609967
## PctEmpMining               -0.118465406         -0.26609967   1.00000000
## PctEmpGovt                 -0.055701879         -0.34723717   0.05089192
## PctEmpServices              0.187446246         -0.38919812  -0.20505541
## PctEmpTrans                -0.088717232         -0.04254633   0.18477153
## PctEmpTrade                -0.008334168          0.01270191  -0.15167069
## Mean_UnempRate             -0.098587793         -0.02779264   0.01193586
## Mean_NumCivLaborforce       0.039609368         -0.01516066  -0.01470871
## Mean_NumEmployed            0.039573240         -0.01511853  -0.01470966
## Mean_NumUnemployed          0.040257910         -0.01593709  -0.01468152
##                         PctEmpGovt PctEmpServices  PctEmpTrans   PctEmpTrade
## PctEmpAgriculture     -0.009336788    -0.48311305  0.022254310 -0.2111144649
## PctEmpConstruction    -0.044708076    -0.18680761  0.004236233 -0.0595854547
## PctEmpFIRE            -0.173068616     0.25449859 -0.134995530  0.0809296980
## PctEmpInformation     -0.055701879     0.18744625 -0.088717232 -0.0083341681
## PctEmpManufacturing   -0.347237170    -0.38919812 -0.042546328  0.0127019071
## PctEmpMining           0.050891924    -0.20505541  0.184771526 -0.1516706938
## PctEmpGovt             1.000000000     0.01410130  0.016798878 -0.1784407989
## PctEmpServices         0.014101300     1.00000000 -0.317878578 -0.0724503396
## PctEmpTrans            0.016798878    -0.31787858  1.000000000 -0.0660692864
## PctEmpTrade           -0.178440799    -0.07245034 -0.066069286  1.0000000000
## Mean_UnempRate         0.304522494     0.17355352  0.004587568  0.0435852070
## Mean_NumCivLaborforce -0.015831510     0.04200475 -0.003598186  0.0003704084
## Mean_NumEmployed      -0.015848110     0.04196836 -0.003656477  0.0003752474
## Mean_NumUnemployed    -0.015511391     0.04265666 -0.002507890  0.0002798501
##                       Mean_UnempRate Mean_NumCivLaborforce Mean_NumEmployed
## PctEmpAgriculture       -0.256652852         -0.0300069678    -0.0300129400
## PctEmpConstruction      -0.043109957         -0.0159455538    -0.0159227940
## PctEmpFIRE              -0.205238541          0.0520997254     0.0521702304
## PctEmpInformation       -0.098587793          0.0396093680     0.0395732401
## PctEmpManufacturing     -0.027792636         -0.0151606593    -0.0151185312
## PctEmpMining             0.011935858         -0.0147087110    -0.0147096555
## PctEmpGovt               0.304522494         -0.0158315101    -0.0158481098
## PctEmpServices           0.173553524          0.0420047545     0.0419683627
## PctEmpTrans              0.004587568         -0.0035981860    -0.0036564766
## PctEmpTrade              0.043585207          0.0003704084     0.0003752474
## Mean_UnempRate           1.000000000          0.0039521860     0.0036953135
## Mean_NumCivLaborforce    0.003952186          1.0000000000     0.9999982284
## Mean_NumEmployed         0.003695313          0.9999982284     1.0000000000
## Mean_NumUnemployed       0.008743974          0.9993826630     0.9993147614
##                       Mean_NumUnemployed
## PctEmpAgriculture          -0.0298759836
## PctEmpConstruction         -0.0163599808
## PctEmpFIRE                  0.0507499101
## PctEmpInformation           0.0402579096
## PctEmpManufacturing        -0.0159370913
## PctEmpMining               -0.0146815167
## PctEmpGovt                 -0.0155113910
## PctEmpServices              0.0426566638
## PctEmpTrans                -0.0025078904
## PctEmpTrade                 0.0002798501
## Mean_UnempRate              0.0087439738
## Mean_NumCivLaborforce       0.9993826630
## Mean_NumEmployed            0.9993147614
## Mean_NumUnemployed          1.0000000000

corrplot(cor_matrix, method = "circle")

#Repeated with variables that seemed to have a relationship
numeric_cols <- df_correlationplot_explored %>% select(-State) 
cor_matrix <- cor(numeric_cols, use = "complete.obs")
print(cor_matrix)

##                   PctEmpAgriculture Mean_UnempRate   PctEmpGovt
## PctEmpAgriculture       1.000000000     -0.2566529 -0.009336788
## Mean_UnempRate         -0.256652852      1.0000000  0.304522494
## PctEmpGovt             -0.009336788      0.3045225  1.000000000

corrplot(cor_matrix, method = "circle")

ggplot(df_summarized_by_state, aes(x = Mean_UnempRate, y = Mean_PctEmpGovt)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Average Unemployment Rate vs % Employed in Government", x = "Average Unemployment Rate", y = "% Employed in Government") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

#summarized across states to clean up visuals

ggplot(df_summarized_by_state, aes(x = Mean_UnempRate, y = Mean_PctEmpAgriculture)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Average Unemployment Rate vs % Employed in Agriculture", x = "Average Unemployment Rate", y = "% Employment in Agriculture") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

ggplot(df_correlationplot_explored, aes(x = Mean_UnempRate)) +
  geom_histogram(binwidth = 0.5, fill = "maroon1", color = "black") + 
  labs(title = "Distribution of Unemployment Rates", x = "Average Unemployment Rate", y = "Count") +
  theme_minimal()

## Warning: Removed 4 rows containing non-finite values (`stat_bin()`).

ggplot(df_summarized_by_state, aes(x = Mean_PctEmpGovt)) +
  geom_histogram(binwidth = 0.5, fill = "lightblue", color = "black") +
  labs(title = "Density Plot of % Employed in Government", x = "% Employed in Government", y = "Density") +
  theme_minimal()

# correlation tests
test_unemp_agriculture <- cor.test(df_summarized_by_state$Mean_UnempRate, df_summarized_by_state$Mean_PctEmpAgriculture, use = "complete.obs")
test_unemp_govt <- cor.test(df_summarized_by_state$Mean_UnempRate, df_summarized_by_state$Mean_PctEmpGovt, use = "complete.obs")


print(test_unemp_agriculture)

## 
##  Pearson's product-moment correlation
## 
## data:  df_summarized_by_state$Mean_UnempRate and df_summarized_by_state$Mean_PctEmpAgriculture
## t = -3.2813, df = 51, p-value = 0.001868
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.6180591 -0.1659478
## sample estimates:
##        cor 
## -0.4175141

print(test_unemp_govt)

## 
##  Pearson's product-moment correlation
## 
## data:  df_summarized_by_state$Mean_UnempRate and df_summarized_by_state$Mean_PctEmpGovt
## t = 4.0356, df = 51, p-value = 0.0001829
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2556863 0.6728043
## sample estimates:
##       cor 
## 0.4919793

#machine learning practice
df_clean <- df_correlationplot_explored %>%
  select(PctEmpAgriculture, PctEmpGovt, Mean_UnempRate) %>%
  drop_na()

str(df_clean)

## tibble [3,273 × 3] (S3: tbl_df/tbl/data.frame)
##  $ PctEmpAgriculture: num [1:3273] 1.2 1 0.58 1.53 5.48 ...
##  $ PctEmpGovt       : num [1:3273] 4.7 5.44 9.13 5.13 7.14 ...
##  $ Mean_UnempRate   : num [1:3273] 5.1 4.3 3.74 3.98 5.72 4.5 3.48 4.7 5.82 5.06 ...

summary(df_clean)

##  PctEmpAgriculture   PctEmpGovt     Mean_UnempRate  
##  Min.   : 0.000    Min.   : 0.000   Min.   : 1.760  
##  1st Qu.: 1.063    1st Qu.: 3.563   1st Qu.: 3.780  
##  Median : 2.527    Median : 4.839   Median : 4.600  
##  Mean   : 4.744    Mean   : 5.707   Mean   : 4.956  
##  3rd Qu.: 5.819    3rd Qu.: 6.970   3rd Qu.: 5.700  
##  Max.   :52.209    Max.   :31.126   Max.   :19.920

set.seed(8675309) 
split <- 0.75
rows <- nrow(df_clean)

train.entries <- sample(rows, rows*split)
model.train <- df_clean[train.entries, ]
model.valid <- df_clean[-train.entries, ]

lm_model <- lm(Mean_UnempRate ~ PctEmpAgriculture + PctEmpGovt, data = model.train)

summary(lm_model)

## 
## Call:
## lm(formula = Mean_UnempRate ~ PctEmpAgriculture + PctEmpGovt, 
##     data = model.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3343 -1.0360 -0.2714  0.6916 14.5191 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.377534   0.073743   59.36   <2e-16 ***
## PctEmpAgriculture -0.083558   0.005627  -14.85   <2e-16 ***
## PctEmpGovt         0.163899   0.010439   15.70   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.682 on 2451 degrees of freedom
## Multiple R-squared:  0.1607, Adjusted R-squared:   0.16 
## F-statistic: 234.6 on 2 and 2451 DF,  p-value: < 2.2e-16

model.valid <- model.valid %>%
  mutate(
    predicted_unemp = predict(lm_model, newdata = model.valid),
    residual = Mean_UnempRate - predicted_unemp,
    square_error = residual^2
  )

mse <- mean(model.valid$square_error, na.rm = TRUE)
rmse <- sqrt(mse)

print(paste("MSE: ", mse))

## [1] "MSE:  3.51066424488354"

print(paste("RMSE: ", rmse))

## [1] "RMSE:  1.87367666497812"

model.train <- model.train %>%
  mutate(
    yhat = predict(lm_model, newdata = model.train),
    residual = Mean_UnempRate - yhat
  )

ggplot(model.valid) +
  geom_point(aes(x = predicted_unemp, y = residual)) +
  geom_hline(aes(yintercept = 0), linetype = "dashed", color = "red") +
  xlab("Predicted Unemployment Rate") +
  ylab("Residual") +
  theme_minimal() +
  labs(title = "Residual Plot for Validation Model")

ggplot(model.valid) +
  geom_histogram(aes(x = residual), bins = 50, color = "black", fill = "olivedrab") +
  xlab("Residual (Unemployment Rate)") +
  theme_minimal() +
  labs(title = "Distribution of Model Residuals")

#source for colors used: https://www.nceas.ucsb.edu/sites/default/files/2020-04/colorPaletteCheatsheet.pdf

sd_train <- sd(model.train$residual, na.rm = TRUE)
sd_valid <- sd(model.valid$residual, na.rm = TRUE)

print(paste("SD Train: ", sd_train))

## [1] "SD Train:  1.68128567196878"

print(paste("SD Valid: ", sd_valid))

## [1] "SD Valid:  1.86829334999983"