Load Libraries, Import Data, and Correlation
Load Libraries
library(readxl)
library(caret)
library(rattle)
library(party)
library(ggpubr)
Import Data
## Classes 'tbl_df', 'tbl' and 'data.frame': 102 obs. of 24 variables:
## $ ...1 : chr "1" "2" "3" "4" ...
## $ location : chr "Alabama" "Alabama" "Alaska" "Alaska" ...
## $ date : POSIXct, format: "2020-03-29" "2020-04-05" ...
## $ doc.id : chr "2020-03-29_US_Alabama_Mobility_Report_en.pdf" "2020-04-05_US_Alabama_Mobility_Report_en.pdf" "2020-03-29_US_Alaska_Mobility_Report_en.pdf" "2020-04-05_US_Alaska_Mobility_Report_en.pdf" ...
## $ retail : num -41 -50 -48 -41 -43 -40 -36 -29 -50 -53 ...
## $ grocery : num -13 -18 -27 -17 -19 -17 -3 -7 -24 -27 ...
## $ national.park : num 19 -1 18 29 -23 -7 25 81 -38 -61 ...
## $ transport : num -30 -40 -55 -52 -47 -41 -33 -22 -54 -59 ...
## $ work : num -32 -36 -33 -32 -37 -33 -29 -27 -39 -42 ...
## $ residence : num 9 12 10 9 11 10 8 7 15 16 ...
## $ text : chr "covid-19 community mobility report alabama march 29, 2020 mobility google prepared report public health officia"| __truncated__ "covid-19 community mobility report alabama april 5, 2020 mobility google prepared report public health official"| __truncated__ "covid-19 community mobility report alaska march 29, 2020 mobility google prepared report public health official"| __truncated__ "covid-19 community mobility report alaska april 5, 2020 mobility google prepared report public health officials"| __truncated__ ...
## $ smokingrate : num 20.9 20.9 21 21 15.6 15.6 22.3 22.3 11.3 11.3 ...
## $ suiciderate : num 16.5 16.5 24.6 24.6 19.2 19.2 18.3 18.3 10.9 10.9 ...
## $ overdoserate : num 16.6 16.6 14.6 14.6 23.8 23.8 15.7 15.7 12.8 12.8 ...
## $ teenpregnancyrate: num 25.2 25.2 19.3 19.3 20.1 20.1 30.4 30.4 13.6 13.6 ...
## $ averagecrimerate : num 1741 1741 2186 2186 1711 ...
## $ percentcollege : num 0.249 0.249 0.292 0.292 0.289 ...
## $ povertyrate : num 15.6 15.6 12.6 12.6 13.2 13.2 15.4 15.4 12 12 ...
## $ ginicoefficient : num 47.8 47.8 41.8 41.8 46.9 ...
## $ population : num 4903185 4903185 731545 731545 7278717 ...
## $ infectionrate : num 0.000259 0.000259 0.000201 0.000201 0.00022 ...
## $ repubgov : num 59.6 59.6 51.5 51.5 56 56 65.3 65.3 38.1 38.1 ...
## $ under25 : num 0.33 0.33 0.36 0.36 0.33 0.33 0.34 0.34 0.33 0.33 ...
## $ percentfemale : num 0.52 0.52 0.49 0.49 0.51 0.51 0.51 0.51 0.51 0.51 ...
Remove Unnecessary Variables
dataset = data[,-c(1,2,3,4,6,7,8,10,11,21)]
str(dataset)
## Classes 'tbl_df', 'tbl' and 'data.frame': 102 obs. of 14 variables:
## $ retail : num -41 -50 -48 -41 -43 -40 -36 -29 -50 -53 ...
## $ work : num -32 -36 -33 -32 -37 -33 -29 -27 -39 -42 ...
## $ smokingrate : num 20.9 20.9 21 21 15.6 15.6 22.3 22.3 11.3 11.3 ...
## $ suiciderate : num 16.5 16.5 24.6 24.6 19.2 19.2 18.3 18.3 10.9 10.9 ...
## $ overdoserate : num 16.6 16.6 14.6 14.6 23.8 23.8 15.7 15.7 12.8 12.8 ...
## $ teenpregnancyrate: num 25.2 25.2 19.3 19.3 20.1 20.1 30.4 30.4 13.6 13.6 ...
## $ averagecrimerate : num 1741 1741 2186 2186 1711 ...
## $ percentcollege : num 0.249 0.249 0.292 0.292 0.289 ...
## $ povertyrate : num 15.6 15.6 12.6 12.6 13.2 13.2 15.4 15.4 12 12 ...
## $ ginicoefficient : num 47.8 47.8 41.8 41.8 46.9 ...
## $ population : num 4903185 4903185 731545 731545 7278717 ...
## $ repubgov : num 59.6 59.6 51.5 51.5 56 56 65.3 65.3 38.1 38.1 ...
## $ under25 : num 0.33 0.33 0.36 0.36 0.33 0.33 0.34 0.34 0.33 0.33 ...
## $ percentfemale : num 0.52 0.52 0.49 0.49 0.51 0.51 0.51 0.51 0.51 0.51 ...
Check for Missing or NA Values
## retail work smokingrate suiciderate
## 0 0 0 2
## overdoserate teenpregnancyrate averagecrimerate percentcollege
## 2 2 0 0
## povertyrate ginicoefficient population repubgov
## 0 0 0 2
## under25 percentfemale
## 0 0
## retail work smokingrate suiciderate
## Min. :-66.0 Min. :-55.00 Min. : 8.90 Min. : 8.30
## 1st Qu.:-50.0 1st Qu.:-39.25 1st Qu.:14.90 1st Qu.:13.70
## Median :-46.0 Median :-37.00 Median :17.15 Median :15.70
## Mean :-46.3 Mean :-36.85 Mean :17.33 Mean :16.45
## 3rd Qu.:-41.0 3rd Qu.:-34.00 3rd Qu.:19.30 3rd Qu.:19.30
## Max. :-29.0 Max. :-24.00 Max. :26.00 Max. :25.20
## overdoserate teenpregnancyrate averagecrimerate percentcollege
## Min. : 6.90 Min. : 7.2 Min. : 790.2 Min. :0.2026
## 1st Qu.:13.20 1st Qu.:13.3 1st Qu.:1064.1 1st Qu.:0.2712
## Median :21.20 Median :16.7 Median :1427.5 Median :0.3001
## Mean :21.77 Mean :17.7 Mean :1377.2 Mean :0.3065
## 3rd Qu.:27.50 3rd Qu.:21.6 3rd Qu.:1608.7 3rd Qu.:0.3327
## Max. :51.50 Max. :30.4 Max. :2362.6 Max. :0.4291
## povertyrate ginicoefficient population repubgov
## Min. : 6.60 Min. :41.80 Min. : 578759 Min. :33.70
## 1st Qu.:10.00 1st Qu.:45.05 1st Qu.: 1792147 1st Qu.:43.00
## Median :11.30 Median :46.37 Median : 4558234 Median :48.95
## Mean :11.75 Mean :46.35 Mean : 6550675 Mean :49.83
## 3rd Qu.:13.20 3rd Qu.:47.76 3rd Qu.: 7614893 3rd Qu.:55.40
## Max. :19.80 Max. :51.29 Max. :39512223 Max. :76.70
## under25 percentfemale
## Min. :0.260 Min. :0.4900
## 1st Qu.:0.310 1st Qu.:0.5000
## Median :0.325 Median :0.5100
## Mean :0.324 Mean :0.5094
## 3rd Qu.:0.340 3rd Qu.:0.5200
## Max. :0.420 Max. :0.5200
Scatterplot Matrix

Train Models
Modeling Parameters
control <- trainControl(method="cv", number=10)
metric <- c("RMSE")
set.seed(7)
Train Models
set.seed(7)
fit.OLS <- train(retail~., data = dataset, method="lm", metric=metric, trControl=control) #OLS
set.seed(7)
fit.rf <- train(retail~., data = dataset, method="rf", metric=metric, trControl=control, #random forest
importance=T)
Summary Results
##
## Call:
## summary.resamples(object = results)
##
## Models: OLS, rf
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## OLS 2.057056 3.083154 3.348474 3.621262 4.161366 6.043144 0
## rf 2.253922 2.789651 3.268588 3.432457 3.792297 5.600750 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## OLS 2.759985 3.714630 4.254349 4.306376 4.649491 6.629104 0
## rf 2.455043 3.669378 4.024434 4.143926 4.731242 6.150467 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## OLS 0.3070336 0.7056952 0.7173016 0.6949383 0.7395073 0.8503637 0
## rf 0.2605568 0.6657810 0.7793879 0.7280297 0.8754823 0.9469632 0
Summary Plots


Regression Results
Summary Results
## RMSE Rsquared MAE Resample
## Min. :2.760 Min. :0.3070 Min. :2.057 Length:10
## 1st Qu.:3.715 1st Qu.:0.7057 1st Qu.:3.083 Class :character
## Median :4.254 Median :0.7173 Median :3.348 Mode :character
## Mean :4.306 Mean :0.6949 Mean :3.621
## 3rd Qu.:4.649 3rd Qu.:0.7395 3rd Qu.:4.161
## Max. :6.629 Max. :0.8504 Max. :6.043
Mean of Different Resamples
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 4.306376 0.6949383 3.621262 1.047673 0.1497762 1.089288
Resamples
## RMSE Rsquared MAE Resample
## 1 4.218185 0.7409528 3.100920 Fold01
## 2 4.290513 0.6356206 3.563540 Fold02
## 3 4.710732 0.7094835 3.935748 Fold03
## 4 3.727232 0.7044325 2.802650 Fold04
## 5 2.759985 0.8503637 2.057056 Fold05
## 6 3.710429 0.8317223 3.133408 Fold06
## 7 6.629104 0.3070336 6.043144 Fold07
## 8 3.500395 0.7184417 3.077232 Fold08
## 9 5.051415 0.7161614 4.262348 Fold09
## 10 4.465766 0.7351705 4.236572 Fold10
Predictions Results
## RMSE Rsquared MAE
## 3.7275878 0.7389647 3.0454837
Variable Importance
## lm variable importance
##
## Overall
## work 100.000
## teenpregnancyrate 54.262
## suiciderate 34.522
## under25 32.051
## percentfemale 31.482
## smokingrate 28.484
## overdoserate 18.873
## ginicoefficient 14.695
## povertyrate 10.682
## averagecrimerate 10.245
## population 5.919
## percentcollege 1.488
## repubgov 0.000

Random Forest Results
Summary Results
## RMSE Rsquared MAE Resample
## Min. :2.455 Min. :0.2606 Min. :2.254 Length:10
## 1st Qu.:3.669 1st Qu.:0.6658 1st Qu.:2.790 Class :character
## Median :4.024 Median :0.7794 Median :3.269 Mode :character
## Mean :4.144 Mean :0.7280 Mean :3.432
## 3rd Qu.:4.731 3rd Qu.:0.8755 3rd Qu.:3.792
## Max. :6.150 Max. :0.9470 Max. :5.601
Mean of Different Resamples
## mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 2 4.265927 0.7040235 3.526943 1.079101 0.2239624 1.0548249
## 2 7 4.190522 0.7212923 3.419777 1.016802 0.2115427 1.0131826
## 3 13 4.143926 0.7280297 3.432457 1.013233 0.2114289 0.9732064
Resamples
## RMSE Rsquared MAE Resample
## 1 3.889006 0.7354434 3.244446 Fold01
## 2 4.159861 0.6513040 3.292729 Fold02
## 3 2.455043 0.8857760 2.253922 Fold05
## 4 4.954285 0.5097339 3.868182 Fold04
## 5 4.212389 0.9133743 3.564644 Fold03
## 6 3.657874 0.8233324 2.600430 Fold06
## 7 3.703891 0.9469632 2.766329 Fold09
## 8 3.352254 0.7092118 2.859617 Fold08
## 9 6.150467 0.2605568 5.600750 Fold07
## 10 4.904193 0.8446013 4.273525 Fold10
Predictions
## RMSE Rsquared MAE
## 1.9674408 0.9364127 1.6171312
Variable Importance
## rf variable importance
##
## Overall
## work 100.0000
## teenpregnancyrate 63.6355
## repubgov 30.2639
## under25 23.6882
## averagecrimerate 22.1085
## percentcollege 16.1098
## ginicoefficient 14.7731
## suiciderate 11.9816
## population 10.4762
## percentfemale 6.8072
## overdoserate 1.9645
## smokingrate 0.1308
## povertyrate 0.0000

Regression with Significant Variables - Results
Train on Significant Variables
fit.OLS.r <- train(retail~teenpregnancyrate+work+under25, data = dataset, method="lm", metric=metric, trControl=control, #random forest
importance=T)
Summary Results
## RMSE Rsquared MAE Resample
## Min. :3.237 Min. :0.4946 Min. :2.654 Length:10
## 1st Qu.:3.618 1st Qu.:0.5675 1st Qu.:3.143 Class :character
## Median :4.507 Median :0.6590 Median :3.562 Mode :character
## Mean :4.316 Mean :0.6651 Mean :3.626
## 3rd Qu.:4.958 3rd Qu.:0.7747 3rd Qu.:4.145
## Max. :5.322 Max. :0.8579 Max. :4.763
Mean of Different Resamples
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 4.316096 0.665056 3.626224 0.7516771 0.1280543 0.7210391
Resamples
## RMSE Rsquared MAE Resample
## 1 4.516073 0.6232474 3.601424 Fold01
## 2 4.522921 0.6946748 3.588591 Fold02
## 3 3.872631 0.8578961 3.480913 Fold03
## 4 5.321531 0.5879240 4.525185 Fold04
## 5 3.237280 0.7799301 2.756772 Fold05
## 6 5.102548 0.5056997 4.326789 Fold06
## 7 4.498702 0.4946173 3.534965 Fold07
## 8 3.533148 0.7694509 3.030183 Fold08
## 9 3.452458 0.7763943 2.654230 Fold09
## 10 5.103664 0.5607251 4.763192 Fold10
Predictions
## RMSE Rsquared MAE
## 4.1226146 0.6807073 3.4337670
Variable Importance
## lm variable importance
##
## Overall
## work 100.00
## teenpregnancyrate 69.36
## under25 0.00

Training on the Whole Dataset
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -51.0122 -12.3741 0.5566 1.5460 14.4767 56.0830
Random Forest With Significant Variables - Results
Train on Significant Variables
fit.rf.r <- train(retail~teenpregnancyrate+work+under25, data = dataset, method="rf", metric=metric, trControl=control, #random forest
importance=T)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
Summary Results
## RMSE Rsquared MAE Resample
## Min. :2.897 Min. :0.4522 Min. :2.510 Length:10
## 1st Qu.:4.166 1st Qu.:0.5238 1st Qu.:3.167 Class :character
## Median :4.294 Median :0.6600 Median :3.631 Mode :character
## Mean :4.182 Mean :0.6650 Mean :3.479
## 3rd Qu.:4.304 3rd Qu.:0.8201 3rd Qu.:3.800
## Max. :4.959 Max. :0.8540 Max. :4.340
Mean of Different Resamples
## mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 2 4.182451 0.6650483 3.479180 0.5198412 0.1577431 0.5340736
## 2 3 4.204876 0.6603976 3.497218 0.5402368 0.1591097 0.5303850
Resamples
## RMSE Rsquared MAE Resample
## 1 4.181225 0.8346070 3.747845 Fold01
## 2 4.959311 0.4522175 4.340016 Fold02
## 3 4.460262 0.8540235 3.837726 Fold03
## 4 4.160431 0.8349138 2.905509 Fold04
## 5 3.972986 0.5035472 3.224397 Fold05
## 6 4.292958 0.7290909 3.817092 Fold06
## 7 2.896686 0.7766837 2.509815 Fold07
## 8 4.294924 0.4896968 3.734741 Fold08
## 9 4.300764 0.5909832 3.526961 Fold09
## 10 4.304965 0.5847198 3.147704 Fold10
Predictions
## RMSE Rsquared MAE
## 2.2621690 0.9100286 1.8678796
Variable Importance
## rf variable importance
##
## Overall
## work 100.0
## teenpregnancyrate 86.3
## under25 0.0

Forest Plot
