Load Libraries, Import Data, and Correlation

Load Libraries

library(readxl)
library(caret)
library(rattle)
library(party)
library(ggpubr)

Import Data

## Classes 'tbl_df', 'tbl' and 'data.frame':    102 obs. of  24 variables:
##  $ ...1             : chr  "1" "2" "3" "4" ...
##  $ location         : chr  "Alabama" "Alabama" "Alaska" "Alaska" ...
##  $ date             : POSIXct, format: "2020-03-29" "2020-04-05" ...
##  $ doc.id           : chr  "2020-03-29_US_Alabama_Mobility_Report_en.pdf" "2020-04-05_US_Alabama_Mobility_Report_en.pdf" "2020-03-29_US_Alaska_Mobility_Report_en.pdf" "2020-04-05_US_Alaska_Mobility_Report_en.pdf" ...
##  $ retail           : num  -41 -50 -48 -41 -43 -40 -36 -29 -50 -53 ...
##  $ grocery          : num  -13 -18 -27 -17 -19 -17 -3 -7 -24 -27 ...
##  $ national.park    : num  19 -1 18 29 -23 -7 25 81 -38 -61 ...
##  $ transport        : num  -30 -40 -55 -52 -47 -41 -33 -22 -54 -59 ...
##  $ work             : num  -32 -36 -33 -32 -37 -33 -29 -27 -39 -42 ...
##  $ residence        : num  9 12 10 9 11 10 8 7 15 16 ...
##  $ text             : chr  "covid-19 community mobility report alabama march 29, 2020 mobility google prepared report public health officia"| __truncated__ "covid-19 community mobility report alabama april 5, 2020 mobility google prepared report public health official"| __truncated__ "covid-19 community mobility report alaska march 29, 2020 mobility google prepared report public health official"| __truncated__ "covid-19 community mobility report alaska april 5, 2020 mobility google prepared report public health officials"| __truncated__ ...
##  $ smokingrate      : num  20.9 20.9 21 21 15.6 15.6 22.3 22.3 11.3 11.3 ...
##  $ suiciderate      : num  16.5 16.5 24.6 24.6 19.2 19.2 18.3 18.3 10.9 10.9 ...
##  $ overdoserate     : num  16.6 16.6 14.6 14.6 23.8 23.8 15.7 15.7 12.8 12.8 ...
##  $ teenpregnancyrate: num  25.2 25.2 19.3 19.3 20.1 20.1 30.4 30.4 13.6 13.6 ...
##  $ averagecrimerate : num  1741 1741 2186 2186 1711 ...
##  $ percentcollege   : num  0.249 0.249 0.292 0.292 0.289 ...
##  $ povertyrate      : num  15.6 15.6 12.6 12.6 13.2 13.2 15.4 15.4 12 12 ...
##  $ ginicoefficient  : num  47.8 47.8 41.8 41.8 46.9 ...
##  $ population       : num  4903185 4903185 731545 731545 7278717 ...
##  $ infectionrate    : num  0.000259 0.000259 0.000201 0.000201 0.00022 ...
##  $ repubgov         : num  59.6 59.6 51.5 51.5 56 56 65.3 65.3 38.1 38.1 ...
##  $ under25          : num  0.33 0.33 0.36 0.36 0.33 0.33 0.34 0.34 0.33 0.33 ...
##  $ percentfemale    : num  0.52 0.52 0.49 0.49 0.51 0.51 0.51 0.51 0.51 0.51 ...

Remove Unnecessary Variables

dataset = data[,-c(1,2,3,4,6,7,8,10,11,21)]
str(dataset)
## Classes 'tbl_df', 'tbl' and 'data.frame':    102 obs. of  14 variables:
##  $ retail           : num  -41 -50 -48 -41 -43 -40 -36 -29 -50 -53 ...
##  $ work             : num  -32 -36 -33 -32 -37 -33 -29 -27 -39 -42 ...
##  $ smokingrate      : num  20.9 20.9 21 21 15.6 15.6 22.3 22.3 11.3 11.3 ...
##  $ suiciderate      : num  16.5 16.5 24.6 24.6 19.2 19.2 18.3 18.3 10.9 10.9 ...
##  $ overdoserate     : num  16.6 16.6 14.6 14.6 23.8 23.8 15.7 15.7 12.8 12.8 ...
##  $ teenpregnancyrate: num  25.2 25.2 19.3 19.3 20.1 20.1 30.4 30.4 13.6 13.6 ...
##  $ averagecrimerate : num  1741 1741 2186 2186 1711 ...
##  $ percentcollege   : num  0.249 0.249 0.292 0.292 0.289 ...
##  $ povertyrate      : num  15.6 15.6 12.6 12.6 13.2 13.2 15.4 15.4 12 12 ...
##  $ ginicoefficient  : num  47.8 47.8 41.8 41.8 46.9 ...
##  $ population       : num  4903185 4903185 731545 731545 7278717 ...
##  $ repubgov         : num  59.6 59.6 51.5 51.5 56 56 65.3 65.3 38.1 38.1 ...
##  $ under25          : num  0.33 0.33 0.36 0.36 0.33 0.33 0.34 0.34 0.33 0.33 ...
##  $ percentfemale    : num  0.52 0.52 0.49 0.49 0.51 0.51 0.51 0.51 0.51 0.51 ...

Check for Missing or NA Values

##            retail              work       smokingrate       suiciderate 
##                 0                 0                 0                 2 
##      overdoserate teenpregnancyrate  averagecrimerate    percentcollege 
##                 2                 2                 0                 0 
##       povertyrate   ginicoefficient        population          repubgov 
##                 0                 0                 0                 2 
##           under25     percentfemale 
##                 0                 0
##      retail           work         smokingrate     suiciderate   
##  Min.   :-66.0   Min.   :-55.00   Min.   : 8.90   Min.   : 8.30  
##  1st Qu.:-50.0   1st Qu.:-39.25   1st Qu.:14.90   1st Qu.:13.70  
##  Median :-46.0   Median :-37.00   Median :17.15   Median :15.70  
##  Mean   :-46.3   Mean   :-36.85   Mean   :17.33   Mean   :16.45  
##  3rd Qu.:-41.0   3rd Qu.:-34.00   3rd Qu.:19.30   3rd Qu.:19.30  
##  Max.   :-29.0   Max.   :-24.00   Max.   :26.00   Max.   :25.20  
##   overdoserate   teenpregnancyrate averagecrimerate percentcollege  
##  Min.   : 6.90   Min.   : 7.2      Min.   : 790.2   Min.   :0.2026  
##  1st Qu.:13.20   1st Qu.:13.3      1st Qu.:1064.1   1st Qu.:0.2712  
##  Median :21.20   Median :16.7      Median :1427.5   Median :0.3001  
##  Mean   :21.77   Mean   :17.7      Mean   :1377.2   Mean   :0.3065  
##  3rd Qu.:27.50   3rd Qu.:21.6      3rd Qu.:1608.7   3rd Qu.:0.3327  
##  Max.   :51.50   Max.   :30.4      Max.   :2362.6   Max.   :0.4291  
##   povertyrate    ginicoefficient   population          repubgov    
##  Min.   : 6.60   Min.   :41.80   Min.   :  578759   Min.   :33.70  
##  1st Qu.:10.00   1st Qu.:45.05   1st Qu.: 1792147   1st Qu.:43.00  
##  Median :11.30   Median :46.37   Median : 4558234   Median :48.95  
##  Mean   :11.75   Mean   :46.35   Mean   : 6550675   Mean   :49.83  
##  3rd Qu.:13.20   3rd Qu.:47.76   3rd Qu.: 7614893   3rd Qu.:55.40  
##  Max.   :19.80   Max.   :51.29   Max.   :39512223   Max.   :76.70  
##     under25      percentfemale   
##  Min.   :0.260   Min.   :0.4900  
##  1st Qu.:0.310   1st Qu.:0.5000  
##  Median :0.325   Median :0.5100  
##  Mean   :0.324   Mean   :0.5094  
##  3rd Qu.:0.340   3rd Qu.:0.5200  
##  Max.   :0.420   Max.   :0.5200

Scatterplot Matrix

Train Models

Modeling Parameters

control <- trainControl(method="cv", number=10)
metric <- c("RMSE")
set.seed(7)

Train Models

  • Regression
  • Random Forest
set.seed(7)
fit.OLS <- train(retail~., data = dataset, method="lm", metric=metric, trControl=control) #OLS
set.seed(7)
fit.rf <- train(retail~., data = dataset, method="rf", metric=metric, trControl=control, #random forest 
               importance=T)

Summary Results

## 
## Call:
## summary.resamples(object = results)
## 
## Models: OLS, rf 
## Number of resamples: 10 
## 
## MAE 
##         Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## OLS 2.057056 3.083154 3.348474 3.621262 4.161366 6.043144    0
## rf  2.253922 2.789651 3.268588 3.432457 3.792297 5.600750    0
## 
## RMSE 
##         Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## OLS 2.759985 3.714630 4.254349 4.306376 4.649491 6.629104    0
## rf  2.455043 3.669378 4.024434 4.143926 4.731242 6.150467    0
## 
## Rsquared 
##          Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## OLS 0.3070336 0.7056952 0.7173016 0.6949383 0.7395073 0.8503637    0
## rf  0.2605568 0.6657810 0.7793879 0.7280297 0.8754823 0.9469632    0

Summary Plots

Regression Results

Summary Results

##       RMSE          Rsquared           MAE          Resample        
##  Min.   :2.760   Min.   :0.3070   Min.   :2.057   Length:10         
##  1st Qu.:3.715   1st Qu.:0.7057   1st Qu.:3.083   Class :character  
##  Median :4.254   Median :0.7173   Median :3.348   Mode  :character  
##  Mean   :4.306   Mean   :0.6949   Mean   :3.621                     
##  3rd Qu.:4.649   3rd Qu.:0.7395   3rd Qu.:4.161                     
##  Max.   :6.629   Max.   :0.8504   Max.   :6.043

Mean of Different Resamples

##   intercept     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 1      TRUE 4.306376 0.6949383 3.621262 1.047673  0.1497762 1.089288

Resamples

##        RMSE  Rsquared      MAE Resample
## 1  4.218185 0.7409528 3.100920   Fold01
## 2  4.290513 0.6356206 3.563540   Fold02
## 3  4.710732 0.7094835 3.935748   Fold03
## 4  3.727232 0.7044325 2.802650   Fold04
## 5  2.759985 0.8503637 2.057056   Fold05
## 6  3.710429 0.8317223 3.133408   Fold06
## 7  6.629104 0.3070336 6.043144   Fold07
## 8  3.500395 0.7184417 3.077232   Fold08
## 9  5.051415 0.7161614 4.262348   Fold09
## 10 4.465766 0.7351705 4.236572   Fold10

Predictions Results

##      RMSE  Rsquared       MAE 
## 3.7275878 0.7389647 3.0454837

Variable Importance

## lm variable importance
## 
##                   Overall
## work              100.000
## teenpregnancyrate  54.262
## suiciderate        34.522
## under25            32.051
## percentfemale      31.482
## smokingrate        28.484
## overdoserate       18.873
## ginicoefficient    14.695
## povertyrate        10.682
## averagecrimerate   10.245
## population          5.919
## percentcollege      1.488
## repubgov            0.000

Random Forest Results

Summary Results

##       RMSE          Rsquared           MAE          Resample        
##  Min.   :2.455   Min.   :0.2606   Min.   :2.254   Length:10         
##  1st Qu.:3.669   1st Qu.:0.6658   1st Qu.:2.790   Class :character  
##  Median :4.024   Median :0.7794   Median :3.269   Mode  :character  
##  Mean   :4.144   Mean   :0.7280   Mean   :3.432                     
##  3rd Qu.:4.731   3rd Qu.:0.8755   3rd Qu.:3.792                     
##  Max.   :6.150   Max.   :0.9470   Max.   :5.601

Mean of Different Resamples

##   mtry     RMSE  Rsquared      MAE   RMSESD RsquaredSD     MAESD
## 1    2 4.265927 0.7040235 3.526943 1.079101  0.2239624 1.0548249
## 2    7 4.190522 0.7212923 3.419777 1.016802  0.2115427 1.0131826
## 3   13 4.143926 0.7280297 3.432457 1.013233  0.2114289 0.9732064

Resamples

##        RMSE  Rsquared      MAE Resample
## 1  3.889006 0.7354434 3.244446   Fold01
## 2  4.159861 0.6513040 3.292729   Fold02
## 3  2.455043 0.8857760 2.253922   Fold05
## 4  4.954285 0.5097339 3.868182   Fold04
## 5  4.212389 0.9133743 3.564644   Fold03
## 6  3.657874 0.8233324 2.600430   Fold06
## 7  3.703891 0.9469632 2.766329   Fold09
## 8  3.352254 0.7092118 2.859617   Fold08
## 9  6.150467 0.2605568 5.600750   Fold07
## 10 4.904193 0.8446013 4.273525   Fold10

Predictions

##      RMSE  Rsquared       MAE 
## 1.9674408 0.9364127 1.6171312

Variable Importance

## rf variable importance
## 
##                    Overall
## work              100.0000
## teenpregnancyrate  63.6355
## repubgov           30.2639
## under25            23.6882
## averagecrimerate   22.1085
## percentcollege     16.1098
## ginicoefficient    14.7731
## suiciderate        11.9816
## population         10.4762
## percentfemale       6.8072
## overdoserate        1.9645
## smokingrate         0.1308
## povertyrate         0.0000

Regression with Significant Variables - Results

Train on Significant Variables

fit.OLS.r <- train(retail~teenpregnancyrate+work+under25, data = dataset, method="lm", metric=metric, trControl=control, #random forest 
               importance=T)

Summary Results

##       RMSE          Rsquared           MAE          Resample        
##  Min.   :3.237   Min.   :0.4946   Min.   :2.654   Length:10         
##  1st Qu.:3.618   1st Qu.:0.5675   1st Qu.:3.143   Class :character  
##  Median :4.507   Median :0.6590   Median :3.562   Mode  :character  
##  Mean   :4.316   Mean   :0.6651   Mean   :3.626                     
##  3rd Qu.:4.958   3rd Qu.:0.7747   3rd Qu.:4.145                     
##  Max.   :5.322   Max.   :0.8579   Max.   :4.763

Mean of Different Resamples

##   intercept     RMSE Rsquared      MAE    RMSESD RsquaredSD     MAESD
## 1      TRUE 4.316096 0.665056 3.626224 0.7516771  0.1280543 0.7210391

Resamples

##        RMSE  Rsquared      MAE Resample
## 1  4.516073 0.6232474 3.601424   Fold01
## 2  4.522921 0.6946748 3.588591   Fold02
## 3  3.872631 0.8578961 3.480913   Fold03
## 4  5.321531 0.5879240 4.525185   Fold04
## 5  3.237280 0.7799301 2.756772   Fold05
## 6  5.102548 0.5056997 4.326789   Fold06
## 7  4.498702 0.4946173 3.534965   Fold07
## 8  3.533148 0.7694509 3.030183   Fold08
## 9  3.452458 0.7763943 2.654230   Fold09
## 10 5.103664 0.5607251 4.763192   Fold10

Predictions

##      RMSE  Rsquared       MAE 
## 4.1226146 0.6807073 3.4337670

Variable Importance

## lm variable importance
## 
##                   Overall
## work               100.00
## teenpregnancyrate   69.36
## under25              0.00

Training on the Whole Dataset

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -51.0122 -12.3741   0.5566   1.5460  14.4767  56.0830

Plot

Random Forest With Significant Variables - Results

Train on Significant Variables

fit.rf.r <- train(retail~teenpregnancyrate+work+under25, data = dataset, method="rf", metric=metric, trControl=control, #random forest 
               importance=T)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .

Summary Results

##       RMSE          Rsquared           MAE          Resample        
##  Min.   :2.897   Min.   :0.4522   Min.   :2.510   Length:10         
##  1st Qu.:4.166   1st Qu.:0.5238   1st Qu.:3.167   Class :character  
##  Median :4.294   Median :0.6600   Median :3.631   Mode  :character  
##  Mean   :4.182   Mean   :0.6650   Mean   :3.479                     
##  3rd Qu.:4.304   3rd Qu.:0.8201   3rd Qu.:3.800                     
##  Max.   :4.959   Max.   :0.8540   Max.   :4.340

Mean of Different Resamples

##   mtry     RMSE  Rsquared      MAE    RMSESD RsquaredSD     MAESD
## 1    2 4.182451 0.6650483 3.479180 0.5198412  0.1577431 0.5340736
## 2    3 4.204876 0.6603976 3.497218 0.5402368  0.1591097 0.5303850

Resamples

##        RMSE  Rsquared      MAE Resample
## 1  4.181225 0.8346070 3.747845   Fold01
## 2  4.959311 0.4522175 4.340016   Fold02
## 3  4.460262 0.8540235 3.837726   Fold03
## 4  4.160431 0.8349138 2.905509   Fold04
## 5  3.972986 0.5035472 3.224397   Fold05
## 6  4.292958 0.7290909 3.817092   Fold06
## 7  2.896686 0.7766837 2.509815   Fold07
## 8  4.294924 0.4896968 3.734741   Fold08
## 9  4.300764 0.5909832 3.526961   Fold09
## 10 4.304965 0.5847198 3.147704   Fold10

Predictions

##      RMSE  Rsquared       MAE 
## 2.2621690 0.9100286 1.8678796

Variable Importance

## rf variable importance
## 
##                   Overall
## work                100.0
## teenpregnancyrate    86.3
## under25               0.0

Forest Plot

Project Summary Plots

Scatterplot Matrix

Training Summary Plots

VI Comparison Plot